Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
Randy.Dunlapc59ede72006-01-11 12:17:46 -080030#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070031#include <linux/completion.h>
32#include <linux/kernel_stat.h>
Ingo Molnar9a11b49a2006-07-03 00:24:33 -070033#include <linux/debug_locks.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/security.h>
35#include <linux/notifier.h>
36#include <linux/profile.h>
Nigel Cunningham7dfb7102006-12-06 20:34:23 -080037#include <linux/freezer.h>
akpm@osdl.org198e2f12006-01-12 01:05:30 -080038#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039#include <linux/blkdev.h>
40#include <linux/delay.h>
41#include <linux/smp.h>
42#include <linux/threads.h>
43#include <linux/timer.h>
44#include <linux/rcupdate.h>
45#include <linux/cpu.h>
46#include <linux/cpuset.h>
47#include <linux/percpu.h>
48#include <linux/kthread.h>
49#include <linux/seq_file.h>
50#include <linux/syscalls.h>
51#include <linux/times.h>
Jay Lan8f0ab512006-09-30 23:28:59 -070052#include <linux/tsacct_kern.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080053#include <linux/kprobes.h>
Shailabh Nagar0ff92242006-07-14 00:24:37 -070054#include <linux/delayacct.h>
Eric Dumazet5517d862007-05-08 00:32:57 -070055#include <linux/reciprocal_div.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070056
Eric Dumazet5517d862007-05-08 00:32:57 -070057#include <asm/tlb.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070058#include <asm/unistd.h>
59
60/*
Alexey Dobriyanb035b6d2007-02-10 01:45:10 -080061 * Scheduler clock - returns current time in nanosec units.
 62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
64 */
65unsigned long long __attribute__((weak)) sched_clock(void)
66{
67 return (unsigned long long)jiffies * (1000000000 / HZ);
68}
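/*
 * Illustrative note (not from the original source), assuming a common
 * HZ == 1000 configuration: each jiffy is 1,000,000 ns, so this fallback
 * clock only advances in 1 ms steps -- e.g. jiffies == 5000 yields
 * 5,000,000,000 ns. Architectures with a finer time source are expected
 * to override this weak function for better resolution.
 */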
69
70/*
Linus Torvalds1da177e2005-04-16 15:20:36 -070071 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
73 * and back.
74 */
75#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
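/*
 * Worked example (illustrative), assuming the usual MAX_RT_PRIO == 100
 * and MAX_PRIO == 140:
 *
 *   NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120, NICE_TO_PRIO(19) == 139
 *   PRIO_TO_NICE(120) == 0, so TASK_NICE(p) is 0 for a task whose
 *   static_prio is 120.
 */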
78
79/*
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
83 */
84#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
87
88/*
89 * Some helpers for converting nanosecond timing to jiffy resolution
90 */
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93
Ingo Molnar6aa645e2007-07-09 18:51:58 +020094#define NICE_0_LOAD SCHED_LOAD_SCALE
95#define NICE_0_SHIFT SCHED_LOAD_SHIFT
96
Linus Torvalds1da177e2005-04-16 15:20:36 -070097/*
98 * These are the 'tuning knobs' of the scheduler:
99 *
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
103 */
104#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105#define DEF_TIMESLICE (100 * HZ / 1000)
106#define ON_RUNQUEUE_WEIGHT 30
107#define CHILD_PENALTY 95
108#define PARENT_PENALTY 100
109#define EXIT_WEIGHT 3
110#define PRIO_BONUS_RATIO 25
111#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
112#define INTERACTIVE_DELTA 2
113#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
114#define STARVATION_LIMIT (MAX_SLEEP_AVG)
115#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
116
117/*
118 * If a task is 'interactive' then we reinsert it in the active
119 * array after it has expired its current timeslice. (it will not
 120 * continue to run immediately, it will still round-robin with
121 * other interactive tasks.)
122 *
123 * This part scales the interactivity limit depending on niceness.
124 *
125 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
126 * Here are a few examples of different nice levels:
127 *
128 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
129 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
130 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
131 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
132 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
133 *
134 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
135 * priority range a task can explore, a value of '1' means the
136 * task is rated interactive.)
137 *
 138 * I.e. nice +19 tasks can never get 'interactive' enough to be
 139 * reinserted into the active array, and only heavily CPU-hogging
 140 * nice -20 tasks will be expired. Default nice 0 tasks are somewhere
 141 * in between: it takes some effort for them to become interactive,
 142 * but it's not too hard.
143 */
144
145#define CURRENT_BONUS(p) \
146 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
147 MAX_SLEEP_AVG)
148
149#define GRANULARITY (10 * HZ / 1000 ? : 1)
150
151#ifdef CONFIG_SMP
152#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
154 num_online_cpus())
155#else
156#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
157 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
158#endif
159
160#define SCALE(v1,v1_max,v2_max) \
161 (v1) * (v2_max) / (v1_max)
162
163#define DELTA(p) \
Martin Andersson013d3862006-03-27 01:15:18 -0800164 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
165 INTERACTIVE_DELTA)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700166
167#define TASK_INTERACTIVE(p) \
168 ((p)->prio <= (p)->static_prio - DELTA(p))
169
170#define INTERACTIVE_SLEEP(p) \
171 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
172 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
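/*
 * Worked example (illustrative, not from the original source), assuming
 * HZ == 1000, so DEF_TIMESLICE == 100 jiffies, MAX_BONUS == 10 and
 * MAX_SLEEP_AVG == 1000 jiffies:
 *
 *   - a nice 0 task (static_prio 120) has DELTA(p) == 2, so
 *     TASK_INTERACTIVE(p) becomes true once its dynamic prio drops to 118;
 *     under the old interactivity estimator that corresponds to
 *     CURRENT_BONUS(p) reaching about 7, i.e. a sleep_avg of roughly 700 ms;
 *   - a nice +19 task has DELTA(p) == 6, giving a threshold of 133, but the
 *     -5 ... +5 dynamic range described above bottoms out at 134, so it can
 *     never qualify -- matching the TASK_INTERACTIVE(19) row in the table.
 */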
173
174#define TASK_PREEMPTS_CURR(p, rq) \
Andrew Mortond5f9f942007-05-08 20:27:06 -0700175 ((p)->prio < (rq)->curr->prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700176
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177#define SCALE_PRIO(x, prio) \
Peter Williams2dd73a42006-06-27 02:54:34 -0700178 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700179
Peter Williams2dd73a42006-06-27 02:54:34 -0700180static unsigned int static_prio_timeslice(int static_prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181{
Peter Williams2dd73a42006-06-27 02:54:34 -0700182 if (static_prio < NICE_TO_PRIO(0))
183 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700184 else
Peter Williams2dd73a42006-06-27 02:54:34 -0700185 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186}
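/*
 * Worked example (illustrative), assuming HZ == 1000 (DEF_TIMESLICE == 100
 * jiffies, MIN_TIMESLICE == 5 jiffies) and MAX_PRIO == 140:
 *
 *   static_prio_timeslice(NICE_TO_PRIO(-20)) == 400 * 40 / 20 == 800 jiffies
 *   static_prio_timeslice(NICE_TO_PRIO(  0)) == 100 * 20 / 20 == 100 jiffies
 *   static_prio_timeslice(NICE_TO_PRIO( 19)) == max(100 * 1 / 20, 5) == 5
 *
 * which matches the [800ms ... 100ms ... 5ms] range described below.
 */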
Peter Williams2dd73a42006-06-27 02:54:34 -0700187
Eric Dumazet5517d862007-05-08 00:32:57 -0700188#ifdef CONFIG_SMP
189/*
190 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
191 * Since cpu_power is a 'constant', we can use a reciprocal divide.
192 */
193static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
194{
195 return reciprocal_divide(load, sg->reciprocal_cpu_power);
196}
197
198/*
199 * Each time a sched group cpu_power is changed,
200 * we must compute its reciprocal value
201 */
202static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203{
204 sg->__cpu_power += val;
205 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
206}
207#endif
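/*
 * Illustrative sketch (not from the original source): reciprocal_value()
 * stores roughly 2^32 / __cpu_power, and reciprocal_divide(load, r) computes
 * approximately (load * r) >> 32. E.g. with __cpu_power == 1024 the stored
 * reciprocal is about 4194304, so sg_div_cpu_power(sg, 2048) yields
 * (2048 * 4194304) >> 32 == 2, i.e. 2048 / 1024, without a runtime divide.
 */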
208
Borislav Petkov91fcdd42006-10-19 23:28:29 -0700209/*
210 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
211 * to time slice values: [800ms ... 100ms ... 5ms]
212 *
213 * The higher a thread's priority, the bigger timeslices
214 * it gets during one round of execution. But even the lowest
215 * priority thread gets MIN_TIMESLICE worth of execution time.
216 */
217
Ingo Molnar36c8b582006-07-03 00:25:41 -0700218static inline unsigned int task_timeslice(struct task_struct *p)
Peter Williams2dd73a42006-06-27 02:54:34 -0700219{
220 return static_prio_timeslice(p->static_prio);
221}
222
Ingo Molnare05606d2007-07-09 18:51:59 +0200223static inline int rt_policy(int policy)
224{
225 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
226 return 1;
227 return 0;
228}
229
230static inline int task_has_rt_policy(struct task_struct *p)
231{
232 return rt_policy(p->policy);
233}
234
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235/*
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200236 * This is the priority-queue data structure of the RT scheduling class:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 */
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200238struct rt_prio_array {
239 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
240 struct list_head queue[MAX_RT_PRIO];
241};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700242
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200243struct load_stat {
244 struct load_weight load;
245 u64 load_update_start, load_update_last;
246 unsigned long delta_fair, delta_exec, delta_stat;
247};
248
249/* CFS-related fields in a runqueue */
250struct cfs_rq {
251 struct load_weight load;
252 unsigned long nr_running;
253
254 s64 fair_clock;
255 u64 exec_clock;
256 s64 wait_runtime;
257 u64 sleeper_bonus;
258 unsigned long wait_runtime_overruns, wait_runtime_underruns;
259
260 struct rb_root tasks_timeline;
261 struct rb_node *rb_leftmost;
262 struct rb_node *rb_load_balance_curr;
263#ifdef CONFIG_FAIR_GROUP_SCHED
264 /* 'curr' points to currently running entity on this cfs_rq.
 265 * It is set to NULL otherwise (i.e. when none are currently running).
266 */
267 struct sched_entity *curr;
268 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
269
270 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
271 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
272 * (like users, containers etc.)
273 *
274 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
275 * list is used during load balance.
276 */
277 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
278#endif
279};
280
281/* Real-Time classes' related field in a runqueue: */
282struct rt_rq {
283 struct rt_prio_array active;
284 int rt_load_balance_idx;
285 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
286};
287
288/*
289 * The prio-array type of the old scheduler:
290 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291struct prio_array {
292 unsigned int nr_active;
Steven Rostedtd4448862006-06-27 02:54:29 -0700293 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700294 struct list_head queue[MAX_PRIO];
295};
296
297/*
298 * This is the main, per-CPU runqueue data structure.
299 *
 300 * Locking rule: code that wants to lock multiple runqueues (such as
 301 * the load balancing or the thread migration code) must acquire the
 302 * locks in ascending &runqueue order.
303 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700304struct rq {
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200305 spinlock_t lock; /* runqueue lock */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306
307 /*
308 * nr_running and cpu_load should be in the same cacheline because
309 * remote CPUs use both these fields when doing load calculation.
310 */
311 unsigned long nr_running;
Peter Williams2dd73a42006-06-27 02:54:34 -0700312 unsigned long raw_weighted_load;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200313 #define CPU_LOAD_IDX_MAX 5
314 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
Siddha, Suresh Bbdecea32007-05-08 00:32:48 -0700315 unsigned char idle_at_tick;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -0700316#ifdef CONFIG_NO_HZ
317 unsigned char in_nohz_recently;
318#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200319 struct load_stat ls; /* capture load from *all* tasks on this cpu */
320 unsigned long nr_load_updates;
321 u64 nr_switches;
322
323 struct cfs_rq cfs;
324#ifdef CONFIG_FAIR_GROUP_SCHED
325 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200327 struct rt_rq rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328
329 /*
330 * This is part of a global counter where only the total sum
331 * over all CPUs matters. A task can increase this counter on
332 * one CPU and if it got migrated afterwards it may decrease
333 * it on another CPU. Always updated under the runqueue lock:
334 */
335 unsigned long nr_uninterruptible;
336
337 unsigned long expired_timestamp;
Mike Galbraithb18ec802006-12-10 02:20:31 -0800338 unsigned long long most_recent_timestamp;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200339
Ingo Molnar36c8b582006-07-03 00:25:41 -0700340 struct task_struct *curr, *idle;
Christoph Lameterc9819f42006-12-10 02:20:25 -0800341 unsigned long next_balance;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700342 struct mm_struct *prev_mm;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200343
Ingo Molnar70b97a72006-07-03 00:25:42 -0700344 struct prio_array *active, *expired, arrays[2];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700345 int best_expired_prio;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200346
347 u64 clock, prev_clock_raw;
348 s64 clock_max_delta;
349
350 unsigned int clock_warps, clock_overflows;
351 unsigned int clock_unstable_events;
352
353 struct sched_class *load_balance_class;
354
Linus Torvalds1da177e2005-04-16 15:20:36 -0700355 atomic_t nr_iowait;
356
357#ifdef CONFIG_SMP
358 struct sched_domain *sd;
359
360 /* For active balancing */
361 int active_balance;
362 int push_cpu;
Christoph Lameter0a2966b2006-09-25 23:30:51 -0700363 int cpu; /* cpu of this runqueue */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700364
Ingo Molnar36c8b582006-07-03 00:25:41 -0700365 struct task_struct *migration_thread;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366 struct list_head migration_queue;
367#endif
368
369#ifdef CONFIG_SCHEDSTATS
370 /* latency stats */
371 struct sched_info rq_sched_info;
372
373 /* sys_sched_yield() stats */
374 unsigned long yld_exp_empty;
375 unsigned long yld_act_empty;
376 unsigned long yld_both_empty;
377 unsigned long yld_cnt;
378
379 /* schedule() stats */
380 unsigned long sched_switch;
381 unsigned long sched_cnt;
382 unsigned long sched_goidle;
383
384 /* try_to_wake_up() stats */
385 unsigned long ttwu_cnt;
386 unsigned long ttwu_local;
387#endif
Ingo Molnarfcb99372006-07-03 00:25:10 -0700388 struct lock_class_key rq_lock_key;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389};
390
Siddha, Suresh Bc3396622007-05-08 00:33:09 -0700391static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
Gautham R Shenoy5be93612007-05-09 02:34:04 -0700392static DEFINE_MUTEX(sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393
Ingo Molnardd41f592007-07-09 18:51:59 +0200394static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
395{
396 rq->curr->sched_class->check_preempt_curr(rq, p);
397}
398
Christoph Lameter0a2966b2006-09-25 23:30:51 -0700399static inline int cpu_of(struct rq *rq)
400{
401#ifdef CONFIG_SMP
402 return rq->cpu;
403#else
404 return 0;
405#endif
406}
407
Nick Piggin674311d2005-06-25 14:57:27 -0700408/*
Ingo Molnar20d315d2007-07-09 18:51:58 +0200409 * Per-runqueue clock, as fine-grained as the platform can give us:
410 */
411static unsigned long long __rq_clock(struct rq *rq)
412{
413 u64 prev_raw = rq->prev_clock_raw;
414 u64 now = sched_clock();
415 s64 delta = now - prev_raw;
416 u64 clock = rq->clock;
417
418 /*
419 * Protect against sched_clock() occasionally going backwards:
420 */
421 if (unlikely(delta < 0)) {
422 clock++;
423 rq->clock_warps++;
424 } else {
425 /*
426 * Catch too large forward jumps too:
427 */
428 if (unlikely(delta > 2*TICK_NSEC)) {
429 clock++;
430 rq->clock_overflows++;
431 } else {
432 if (unlikely(delta > rq->clock_max_delta))
433 rq->clock_max_delta = delta;
434 clock += delta;
435 }
436 }
437
438 rq->prev_clock_raw = now;
439 rq->clock = clock;
440
441 return clock;
442}
443
444static inline unsigned long long rq_clock(struct rq *rq)
445{
446 int this_cpu = smp_processor_id();
447
448 if (this_cpu == cpu_of(rq))
449 return __rq_clock(rq);
450
451 return rq->clock;
452}
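/*
 * Illustrative note (not from the original source), assuming HZ == 1000 so
 * TICK_NSEC is about 1,000,000 ns: if sched_clock() steps backwards, the
 * per-rq clock still advances by 1 ns and clock_warps is bumped; a forward
 * jump larger than ~2 ms is likewise clamped to 1 ns and counted in
 * clock_overflows. The result is a monotonic rq->clock that tracks
 * sched_clock() whenever its deltas look sane.
 */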
453
454/*
Nick Piggin674311d2005-06-25 14:57:27 -0700455 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -0700456 * See detach_destroy_domains: synchronize_sched for details.
Nick Piggin674311d2005-06-25 14:57:27 -0700457 *
458 * The domain tree of any CPU may only be accessed from within
459 * preempt-disabled sections.
460 */
Ingo Molnar48f24c42006-07-03 00:25:40 -0700461#define for_each_domain(cpu, __sd) \
462 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700463
464#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
465#define this_rq() (&__get_cpu_var(runqueues))
466#define task_rq(p) cpu_rq(task_cpu(p))
467#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
468
Ingo Molnar138a8ae2007-07-09 18:51:58 +0200469#ifdef CONFIG_FAIR_GROUP_SCHED
470/* Change a task's ->cfs_rq if it moves across CPUs */
471static inline void set_task_cfs_rq(struct task_struct *p)
472{
473 p->se.cfs_rq = &task_rq(p)->cfs;
474}
475#else
476static inline void set_task_cfs_rq(struct task_struct *p)
477{
478}
479#endif
480
Linus Torvalds1da177e2005-04-16 15:20:36 -0700481#ifndef prepare_arch_switch
Nick Piggin4866cde2005-06-25 14:57:23 -0700482# define prepare_arch_switch(next) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483#endif
Nick Piggin4866cde2005-06-25 14:57:23 -0700484#ifndef finish_arch_switch
485# define finish_arch_switch(prev) do { } while (0)
486#endif
487
488#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar70b97a72006-07-03 00:25:42 -0700489static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700490{
491 return rq->curr == p;
492}
493
Ingo Molnar70b97a72006-07-03 00:25:42 -0700494static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -0700495{
496}
497
Ingo Molnar70b97a72006-07-03 00:25:42 -0700498static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -0700499{
Ingo Molnarda04c032005-09-13 11:17:59 +0200500#ifdef CONFIG_DEBUG_SPINLOCK
501 /* this is a valid case when another task releases the spinlock */
502 rq->lock.owner = current;
503#endif
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700504 /*
505 * If we are tracking spinlock dependencies then we have to
506 * fix up the runqueue lock - which gets 'carried over' from
507 * prev into current:
508 */
509 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
510
Nick Piggin4866cde2005-06-25 14:57:23 -0700511 spin_unlock_irq(&rq->lock);
512}
513
514#else /* __ARCH_WANT_UNLOCKED_CTXSW */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700515static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700516{
517#ifdef CONFIG_SMP
518 return p->oncpu;
519#else
520 return rq->curr == p;
521#endif
522}
523
Ingo Molnar70b97a72006-07-03 00:25:42 -0700524static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -0700525{
526#ifdef CONFIG_SMP
527 /*
528 * We can optimise this out completely for !SMP, because the
529 * SMP rebalancing from interrupt is the only thing that cares
530 * here.
531 */
532 next->oncpu = 1;
533#endif
534#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
535 spin_unlock_irq(&rq->lock);
536#else
537 spin_unlock(&rq->lock);
538#endif
539}
540
Ingo Molnar70b97a72006-07-03 00:25:42 -0700541static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -0700542{
543#ifdef CONFIG_SMP
544 /*
545 * After ->oncpu is cleared, the task can be moved to a different CPU.
546 * We must ensure this doesn't happen until the switch is completely
547 * finished.
548 */
549 smp_wmb();
550 prev->oncpu = 0;
551#endif
552#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
553 local_irq_enable();
554#endif
555}
556#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557
558/*
Ingo Molnarb29739f2006-06-27 02:54:51 -0700559 * __task_rq_lock - lock the runqueue a given task resides on.
 560 * Must be called with interrupts disabled.
561 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700562static inline struct rq *__task_rq_lock(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700563 __acquires(rq->lock)
564{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700565 struct rq *rq;
Ingo Molnarb29739f2006-06-27 02:54:51 -0700566
567repeat_lock_task:
568 rq = task_rq(p);
569 spin_lock(&rq->lock);
570 if (unlikely(rq != task_rq(p))) {
571 spin_unlock(&rq->lock);
572 goto repeat_lock_task;
573 }
574 return rq;
575}
576
577/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700578 * task_rq_lock - lock the runqueue a given task resides on and disable
579 * interrupts. Note the ordering: we can safely lookup the task_rq without
580 * explicitly disabling preemption.
581 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700582static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700583 __acquires(rq->lock)
584{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700585 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700586
587repeat_lock_task:
588 local_irq_save(*flags);
589 rq = task_rq(p);
590 spin_lock(&rq->lock);
591 if (unlikely(rq != task_rq(p))) {
592 spin_unlock_irqrestore(&rq->lock, *flags);
593 goto repeat_lock_task;
594 }
595 return rq;
596}
597
Ingo Molnar70b97a72006-07-03 00:25:42 -0700598static inline void __task_rq_unlock(struct rq *rq)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700599 __releases(rq->lock)
600{
601 spin_unlock(&rq->lock);
602}
603
Ingo Molnar70b97a72006-07-03 00:25:42 -0700604static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605 __releases(rq->lock)
606{
607 spin_unlock_irqrestore(&rq->lock, *flags);
608}
609
Linus Torvalds1da177e2005-04-16 15:20:36 -0700610/*
Robert P. J. Daycc2a73b2006-12-10 02:20:00 -0800611 * this_rq_lock - lock this runqueue and disable interrupts.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700613static inline struct rq *this_rq_lock(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700614 __acquires(rq->lock)
615{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700616 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700617
618 local_irq_disable();
619 rq = this_rq();
620 spin_lock(&rq->lock);
621
622 return rq;
623}
624
Ingo Molnarc24d20d2007-07-09 18:51:59 +0200625/*
Ingo Molnar1b9f19c2007-07-09 18:51:59 +0200626 * CPU frequency is/was unstable - start anew by resetting prev_clock_raw:
627 */
628void sched_clock_unstable_event(void)
629{
630 unsigned long flags;
631 struct rq *rq;
632
633 rq = task_rq_lock(current, &flags);
634 rq->prev_clock_raw = sched_clock();
635 rq->clock_unstable_events++;
636 task_rq_unlock(rq, &flags);
637}
638
639/*
Ingo Molnarc24d20d2007-07-09 18:51:59 +0200640 * resched_task - mark a task 'to be rescheduled now'.
641 *
642 * On UP this means the setting of the need_resched flag, on SMP it
643 * might also involve a cross-CPU call to trigger the scheduler on
644 * the target CPU.
645 */
646#ifdef CONFIG_SMP
647
648#ifndef tsk_is_polling
649#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
650#endif
651
652static void resched_task(struct task_struct *p)
653{
654 int cpu;
655
656 assert_spin_locked(&task_rq(p)->lock);
657
658 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
659 return;
660
661 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
662
663 cpu = task_cpu(p);
664 if (cpu == smp_processor_id())
665 return;
666
667 /* NEED_RESCHED must be visible before we test polling */
668 smp_mb();
669 if (!tsk_is_polling(p))
670 smp_send_reschedule(cpu);
671}
672
673static void resched_cpu(int cpu)
674{
675 struct rq *rq = cpu_rq(cpu);
676 unsigned long flags;
677
678 if (!spin_trylock_irqsave(&rq->lock, flags))
679 return;
680 resched_task(cpu_curr(cpu));
681 spin_unlock_irqrestore(&rq->lock, flags);
682}
683#else
684static inline void resched_task(struct task_struct *p)
685{
686 assert_spin_locked(&task_rq(p)->lock);
687 set_tsk_need_resched(p);
688}
689#endif
690
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200691static u64 div64_likely32(u64 dividend, unsigned long divisor)
 692{
 693#if BITS_PER_LONG == 32
 694 if (likely(dividend <= 0xffffffffULL))
 695 return (u32)dividend / divisor;
 696 do_div(dividend, divisor);
 697
 698 return dividend;
 699#else
 700 return dividend / divisor;
 701#endif
 702}
703
704#if BITS_PER_LONG == 32
705# define WMULT_CONST (~0UL)
706#else
707# define WMULT_CONST (1UL << 32)
708#endif
709
710#define WMULT_SHIFT 32
711
712static inline unsigned long
713calc_delta_mine(unsigned long delta_exec, unsigned long weight,
714 struct load_weight *lw)
715{
716 u64 tmp;
717
718 if (unlikely(!lw->inv_weight))
719 lw->inv_weight = WMULT_CONST / lw->weight;
720
721 tmp = (u64)delta_exec * weight;
722 /*
723 * Check whether we'd overflow the 64-bit multiplication:
724 */
725 if (unlikely(tmp > WMULT_CONST)) {
726 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
727 >> (WMULT_SHIFT/2);
728 } else {
729 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
730 }
731
732 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
733}
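/*
 * Worked example (illustrative numbers): calc_delta_mine() approximates
 * delta_exec * weight / lw->weight using the cached inverse. With
 * delta_exec == 2,000,000 ns, weight == NICE_0_LOAD (1024) and
 * lw->weight == 2048, inv_weight is ~2^32 / 2048 == 2097152, so the result
 * is (2,000,000 * 1024 * 2097152) >> 32 == 1,000,000 ns -- half the wall
 * time, since the queue is twice as heavy as a single nice-0 task (subject
 * to the sysctl_sched_runtime_limit cap).
 */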
734
735static inline unsigned long
736calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
737{
738 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
739}
740
741static void update_load_add(struct load_weight *lw, unsigned long inc)
742{
743 lw->weight += inc;
744 lw->inv_weight = 0;
745}
746
747static void update_load_sub(struct load_weight *lw, unsigned long dec)
748{
749 lw->weight -= dec;
750 lw->inv_weight = 0;
751}
752
753static void __update_curr_load(struct rq *rq, struct load_stat *ls)
754{
755 if (rq->curr != rq->idle && ls->load.weight) {
756 ls->delta_exec += ls->delta_stat;
757 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
758 ls->delta_stat = 0;
759 }
760}
761
762/*
763 * Update delta_exec, delta_fair fields for rq.
764 *
765 * delta_fair clock advances at a rate inversely proportional to
766 * total load (rq->ls.load.weight) on the runqueue, while
767 * delta_exec advances at the same rate as wall-clock (provided
768 * cpu is not idle).
769 *
 770 * delta_exec / delta_fair is a measure of the (smoothed) load on this
 771 * runqueue over any given interval. This (smoothed) load is used
772 * during load balance.
773 *
774 * This function is called /before/ updating rq->ls.load
775 * and when switching tasks.
776 */
777static void update_curr_load(struct rq *rq, u64 now)
778{
779 struct load_stat *ls = &rq->ls;
780 u64 start;
781
782 start = ls->load_update_start;
783 ls->load_update_start = now;
784 ls->delta_stat += now - start;
785 /*
786 * Stagger updates to ls->delta_fair. Very frequent updates
787 * can be expensive.
788 */
789 if (ls->delta_stat >= sysctl_sched_stat_granularity)
790 __update_curr_load(rq, ls);
791}
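/*
 * Illustrative note (not from the original source): because delta_fair is
 * scaled by calc_delta_fair(), a runqueue carrying twice NICE_0_LOAD sees
 * delta_fair advance at half the rate of delta_exec over the same window,
 * so the ratio delta_exec / delta_fair is ~2, i.e. the smoothed load in
 * units of nice-0 tasks that the comment above refers to.
 */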
792
Linus Torvalds1da177e2005-04-16 15:20:36 -0700793/*
Peter Williams2dd73a42006-06-27 02:54:34 -0700794 * To aid in avoiding the subversion of "niceness" due to uneven distribution
795 * of tasks with abnormal "nice" values across CPUs the contribution that
796 * each task makes to its run queue's load is weighted according to its
797 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
798 * scaled version of the new time slice allocation that they receive on time
799 * slice expiry etc.
800 */
801
802/*
803 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
804 * If static_prio_timeslice() is ever changed to break this assumption then
805 * this code will need modification
806 */
807#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
Ingo Molnardd41f592007-07-09 18:51:59 +0200808#define load_weight(lp) \
Peter Williams2dd73a42006-06-27 02:54:34 -0700809 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
810#define PRIO_TO_LOAD_WEIGHT(prio) \
Ingo Molnardd41f592007-07-09 18:51:59 +0200811 load_weight(static_prio_timeslice(prio))
Peter Williams2dd73a42006-06-27 02:54:34 -0700812#define RTPRIO_TO_LOAD_WEIGHT(rp) \
Ingo Molnardd41f592007-07-09 18:51:59 +0200813 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
814
815#define WEIGHT_IDLEPRIO 2
816#define WMULT_IDLEPRIO (1 << 31)
817
818/*
819 * Nice levels are multiplicative, with a gentle 10% change for every
820 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
821 * nice 1, it will get ~10% less CPU time than another CPU-bound task
822 * that remained on nice 0.
823 *
824 * The "10% effect" is relative and cumulative: from _any_ nice level,
825 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
826 * it's +10% CPU usage.
827 */
828static const int prio_to_weight[40] = {
829/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
830/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
831/* 0 */ NICE_0_LOAD /* 1024 */,
832/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
833/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
834};
835
836static const u32 prio_to_wmult[40] = {
837 48356, 60446, 75558, 94446, 118058, 147573,
838 184467, 230589, 288233, 360285, 450347,
839 562979, 703746, 879575, 1099582, 1374389,
 840 1717986, 2147483, 2684354, 3355443, 4194304,
 841 5244160, 6557201, 8196502, 10250518, 12782640,
842 16025997, 19976592, 24970740, 31350126, 39045157,
843 49367440, 61356675, 76695844, 95443717, 119304647,
844 148102320, 186737708, 238609294, 286331153,
845};
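/*
 * Illustrative note (not from the original source): consecutive entries of
 * prio_to_weight differ by roughly a factor of 1.25 (e.g. 1024 vs 819 for
 * nice 0 vs nice 1), which is where the ~10% CPU-share step per nice level
 * comes from. prio_to_wmult caches ~2^32 / prio_to_weight for each level
 * (e.g. 2^32 / 1024 == 4194304, 2^32 / 2500 ~= 1717986) so that
 * calc_delta_mine() can shift instead of divide.
 */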
Peter Williams2dd73a42006-06-27 02:54:34 -0700846
Ingo Molnar36c8b582006-07-03 00:25:41 -0700847static inline void
Ingo Molnardd41f592007-07-09 18:51:59 +0200848inc_load(struct rq *rq, const struct task_struct *p, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700849{
Ingo Molnardd41f592007-07-09 18:51:59 +0200850 update_curr_load(rq, now);
851 update_load_add(&rq->ls.load, p->se.load.weight);
Peter Williams2dd73a42006-06-27 02:54:34 -0700852}
853
Ingo Molnar36c8b582006-07-03 00:25:41 -0700854static inline void
Ingo Molnardd41f592007-07-09 18:51:59 +0200855dec_load(struct rq *rq, const struct task_struct *p, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700856{
Ingo Molnardd41f592007-07-09 18:51:59 +0200857 update_curr_load(rq, now);
858 update_load_sub(&rq->ls.load, p->se.load.weight);
Peter Williams2dd73a42006-06-27 02:54:34 -0700859}
860
Ingo Molnardd41f592007-07-09 18:51:59 +0200861static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700862{
863 rq->nr_running++;
Ingo Molnardd41f592007-07-09 18:51:59 +0200864 inc_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -0700865}
866
Ingo Molnardd41f592007-07-09 18:51:59 +0200867static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700868{
869 rq->nr_running--;
Ingo Molnardd41f592007-07-09 18:51:59 +0200870 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -0700871}
872
Ingo Molnardd41f592007-07-09 18:51:59 +0200873static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
874
875/*
876 * runqueue iterator, to support SMP load-balancing between different
877 * scheduling classes, without having to expose their internal data
878 * structures to the load-balancing proper:
879 */
880struct rq_iterator {
881 void *arg;
882 struct task_struct *(*start)(void *);
883 struct task_struct *(*next)(void *);
884};
885
886static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
887 unsigned long max_nr_move, unsigned long max_load_move,
888 struct sched_domain *sd, enum cpu_idle_type idle,
889 int *all_pinned, unsigned long *load_moved,
890 int this_best_prio, int best_prio, int best_prio_seen,
891 struct rq_iterator *iterator);
892
893#include "sched_stats.h"
894#include "sched_rt.c"
895#include "sched_fair.c"
896#include "sched_idletask.c"
897#ifdef CONFIG_SCHED_DEBUG
898# include "sched_debug.c"
899#endif
900
901#define sched_class_highest (&rt_sched_class)
902
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200903static void set_load_weight(struct task_struct *p)
904{
Ingo Molnardd41f592007-07-09 18:51:59 +0200905 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
906 p->se.wait_runtime = 0;
907
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200908 if (task_has_rt_policy(p)) {
Ingo Molnardd41f592007-07-09 18:51:59 +0200909 p->se.load.weight = prio_to_weight[0] * 2;
910 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
911 return;
912 }
913
914 /*
915 * SCHED_IDLE tasks get minimal weight:
916 */
917 if (p->policy == SCHED_IDLE) {
918 p->se.load.weight = WEIGHT_IDLEPRIO;
919 p->se.load.inv_weight = WMULT_IDLEPRIO;
920 return;
921 }
922
923 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
924 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200925}
926
Ingo Molnardd41f592007-07-09 18:51:59 +0200927static void
928enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200929{
930 sched_info_queued(p);
Ingo Molnardd41f592007-07-09 18:51:59 +0200931 p->sched_class->enqueue_task(rq, p, wakeup, now);
932 p->se.on_rq = 1;
933}
934
935static void
936dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
937{
938 p->sched_class->dequeue_task(rq, p, sleep, now);
939 p->se.on_rq = 0;
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200940}
941
942/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200943 * __normal_prio - return the priority that is based on the static prio
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200944 */
Ingo Molnar14531182007-07-09 18:51:59 +0200945static inline int __normal_prio(struct task_struct *p)
946{
Ingo Molnardd41f592007-07-09 18:51:59 +0200947 return p->static_prio;
Ingo Molnar14531182007-07-09 18:51:59 +0200948}
949
950/*
Ingo Molnarb29739f2006-06-27 02:54:51 -0700951 * Calculate the expected normal priority: i.e. priority
952 * without taking RT-inheritance into account. Might be
953 * boosted by interactivity modifiers. Changes upon fork,
954 * setprio syscalls, and whenever the interactivity
955 * estimator recalculates.
956 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700957static inline int normal_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700958{
959 int prio;
960
Ingo Molnare05606d2007-07-09 18:51:59 +0200961 if (task_has_rt_policy(p))
Ingo Molnarb29739f2006-06-27 02:54:51 -0700962 prio = MAX_RT_PRIO-1 - p->rt_priority;
963 else
964 prio = __normal_prio(p);
965 return prio;
966}
967
968/*
969 * Calculate the current priority, i.e. the priority
970 * taken into account by the scheduler. This value might
971 * be boosted by RT tasks, or might be boosted by
972 * interactivity modifiers. Will be RT if the task got
973 * RT-boosted. If not then it returns p->normal_prio.
974 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700975static int effective_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700976{
977 p->normal_prio = normal_prio(p);
978 /*
979 * If we are RT tasks or we were boosted to RT priority,
980 * keep the priority unchanged. Otherwise, update priority
981 * to the normal priority:
982 */
983 if (!rt_prio(p->prio))
984 return p->normal_prio;
985 return p->prio;
986}
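/*
 * Worked example (illustrative), assuming MAX_RT_PRIO == 100: a SCHED_FIFO
 * task with rt_priority 50 gets normal_prio == 99 - 50 == 49, while a
 * SCHED_NORMAL nice-0 task gets normal_prio == 120. If that nice-0 task is
 * currently PI-boosted into the RT range (say prio 49), effective_prio()
 * leaves the boosted value alone; otherwise it returns the recomputed 120.
 */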
987
988/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200989 * activate_task - move a task to the runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700990 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200991static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700992{
Ingo Molnardd41f592007-07-09 18:51:59 +0200993 u64 now = rq_clock(rq);
Con Kolivasd425b272006-03-31 02:31:29 -0800994
Ingo Molnardd41f592007-07-09 18:51:59 +0200995 if (p->state == TASK_UNINTERRUPTIBLE)
996 rq->nr_uninterruptible--;
997
998 enqueue_task(rq, p, wakeup, now);
999 inc_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001000}
1001
1002/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001003 * activate_idle_task - move idle task to the _front_ of runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001004 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001005static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001006{
Ingo Molnardd41f592007-07-09 18:51:59 +02001007 u64 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008
Ingo Molnardd41f592007-07-09 18:51:59 +02001009 if (p->state == TASK_UNINTERRUPTIBLE)
1010 rq->nr_uninterruptible--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001011
Ingo Molnardd41f592007-07-09 18:51:59 +02001012 enqueue_task(rq, p, 0, now);
1013 inc_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001014}
1015
1016/*
1017 * deactivate_task - remove a task from the runqueue.
1018 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001019static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001020{
Ingo Molnardd41f592007-07-09 18:51:59 +02001021 u64 now = rq_clock(rq);
1022
1023 if (p->state == TASK_UNINTERRUPTIBLE)
1024 rq->nr_uninterruptible++;
1025
1026 dequeue_task(rq, p, sleep, now);
1027 dec_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001028}
1029
Linus Torvalds1da177e2005-04-16 15:20:36 -07001030/**
1031 * task_curr - is this task currently executing on a CPU?
1032 * @p: the task in question.
1033 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001034inline int task_curr(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001035{
1036 return cpu_curr(task_cpu(p)) == p;
1037}
1038
Peter Williams2dd73a42006-06-27 02:54:34 -07001039/* Used instead of source_load when we know the type == 0 */
1040unsigned long weighted_cpuload(const int cpu)
1041{
Ingo Molnardd41f592007-07-09 18:51:59 +02001042 return cpu_rq(cpu)->ls.load.weight;
1043}
1044
1045static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1046{
1047#ifdef CONFIG_SMP
1048 task_thread_info(p)->cpu = cpu;
1049 set_task_cfs_rq(p);
1050#endif
Peter Williams2dd73a42006-06-27 02:54:34 -07001051}
1052
Linus Torvalds1da177e2005-04-16 15:20:36 -07001053#ifdef CONFIG_SMP
Ingo Molnarc65cc872007-07-09 18:51:58 +02001054
Ingo Molnardd41f592007-07-09 18:51:59 +02001055void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Ingo Molnarc65cc872007-07-09 18:51:58 +02001056{
Ingo Molnardd41f592007-07-09 18:51:59 +02001057 int old_cpu = task_cpu(p);
1058 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1059 u64 clock_offset, fair_clock_offset;
1060
1061 clock_offset = old_rq->clock - new_rq->clock;
1062 fair_clock_offset = old_rq->cfs.fair_clock -
1063 new_rq->cfs.fair_clock;
1064 if (p->se.wait_start)
1065 p->se.wait_start -= clock_offset;
1066 if (p->se.wait_start_fair)
1067 p->se.wait_start_fair -= fair_clock_offset;
1068 if (p->se.sleep_start)
1069 p->se.sleep_start -= clock_offset;
1070 if (p->se.block_start)
1071 p->se.block_start -= clock_offset;
1072 if (p->se.sleep_start_fair)
1073 p->se.sleep_start_fair -= fair_clock_offset;
1074
1075 __set_task_cpu(p, new_cpu);
Ingo Molnarc65cc872007-07-09 18:51:58 +02001076}
1077
Ingo Molnar70b97a72006-07-03 00:25:42 -07001078struct migration_req {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001079 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001080
Ingo Molnar36c8b582006-07-03 00:25:41 -07001081 struct task_struct *task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001082 int dest_cpu;
1083
Linus Torvalds1da177e2005-04-16 15:20:36 -07001084 struct completion done;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001085};
Linus Torvalds1da177e2005-04-16 15:20:36 -07001086
1087/*
1088 * The task's runqueue lock must be held.
 1089 * Returns true if you have to wait for the migration thread.
1090 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001091static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07001092migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001093{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001094 struct rq *rq = task_rq(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095
1096 /*
1097 * If the task is not on a runqueue (and not running), then
1098 * it is sufficient to simply update the task's cpu field.
1099 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001100 if (!p->se.on_rq && !task_running(rq, p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001101 set_task_cpu(p, dest_cpu);
1102 return 0;
1103 }
1104
1105 init_completion(&req->done);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001106 req->task = p;
1107 req->dest_cpu = dest_cpu;
1108 list_add(&req->list, &rq->migration_queue);
Ingo Molnar48f24c42006-07-03 00:25:40 -07001109
Linus Torvalds1da177e2005-04-16 15:20:36 -07001110 return 1;
1111}
1112
1113/*
1114 * wait_task_inactive - wait for a thread to unschedule.
1115 *
1116 * The caller must ensure that the task *will* unschedule sometime soon,
1117 * else this function might spin for a *long* time. This function can't
1118 * be called with interrupts off, or it may introduce deadlock with
1119 * smp_call_function() if an IPI is sent by the same process we are
1120 * waiting to become inactive.
1121 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001122void wait_task_inactive(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001123{
1124 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001125 int running, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001126 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127
1128repeat:
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001129 /*
1130 * We do the initial early heuristics without holding
1131 * any task-queue locks at all. We'll only try to get
1132 * the runqueue lock when things look like they will
1133 * work out!
1134 */
1135 rq = task_rq(p);
1136
1137 /*
1138 * If the task is actively running on another CPU
1139 * still, just relax and busy-wait without holding
1140 * any locks.
1141 *
1142 * NOTE! Since we don't hold any locks, it's not
1143 * even sure that "rq" stays as the right runqueue!
1144 * But we don't care, since "task_running()" will
1145 * return false if the runqueue has changed and p
1146 * is actually now running somewhere else!
1147 */
1148 while (task_running(rq, p))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001149 cpu_relax();
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001150
1151 /*
1152 * Ok, time to look more closely! We need the rq
1153 * lock now, to be *sure*. If we're wrong, we'll
1154 * just go back and repeat.
1155 */
1156 rq = task_rq_lock(p, &flags);
1157 running = task_running(rq, p);
Ingo Molnardd41f592007-07-09 18:51:59 +02001158 on_rq = p->se.on_rq;
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001159 task_rq_unlock(rq, &flags);
1160
1161 /*
1162 * Was it really running after all now that we
1163 * checked with the proper locks actually held?
1164 *
1165 * Oops. Go back and try again..
1166 */
1167 if (unlikely(running)) {
1168 cpu_relax();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001169 goto repeat;
1170 }
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001171
1172 /*
1173 * It's not enough that it's not actively running,
1174 * it must be off the runqueue _entirely_, and not
1175 * preempted!
1176 *
 1177 * So if it was still runnable (but just not actively
1178 * running right now), it's preempted, and we should
1179 * yield - it could be a while.
1180 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001181 if (unlikely(on_rq)) {
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001182 yield();
1183 goto repeat;
1184 }
1185
1186 /*
1187 * Ahh, all good. It wasn't running, and it wasn't
1188 * runnable, which means that it will never become
1189 * running in the future either. We're all done!
1190 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001191}
1192
1193/***
1194 * kick_process - kick a running thread to enter/exit the kernel
1195 * @p: the to-be-kicked thread
1196 *
1197 * Cause a process which is running on another CPU to enter
1198 * kernel-mode, without any delay. (to get signals handled.)
1199 *
 1200 * NOTE: this function doesn't have to take the runqueue lock,
1201 * because all it wants to ensure is that the remote task enters
1202 * the kernel. If the IPI races and the task has been migrated
1203 * to another CPU then no harm is done and the purpose has been
1204 * achieved as well.
1205 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001206void kick_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001207{
1208 int cpu;
1209
1210 preempt_disable();
1211 cpu = task_cpu(p);
1212 if ((cpu != smp_processor_id()) && task_curr(p))
1213 smp_send_reschedule(cpu);
1214 preempt_enable();
1215}
1216
1217/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001218 * Return a low guess at the load of a migration-source cpu weighted
1219 * according to the scheduling class and "nice" value.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001220 *
1221 * We want to under-estimate the load of migration sources, to
1222 * balance conservatively.
1223 */
Con Kolivasb9104722005-11-08 21:38:55 -08001224static inline unsigned long source_load(int cpu, int type)
1225{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001226 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001227 unsigned long total = weighted_cpuload(cpu);
Nick Piggina2000572006-02-10 01:51:02 -08001228
Peter Williams2dd73a42006-06-27 02:54:34 -07001229 if (type == 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02001230 return total;
Peter Williams2dd73a42006-06-27 02:54:34 -07001231
Ingo Molnardd41f592007-07-09 18:51:59 +02001232 return min(rq->cpu_load[type-1], total);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001233}
1234
1235/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001236 * Return a high guess at the load of a migration-target cpu weighted
1237 * according to the scheduling class and "nice" value.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001238 */
Con Kolivasb9104722005-11-08 21:38:55 -08001239static inline unsigned long target_load(int cpu, int type)
1240{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001241 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001242 unsigned long total = weighted_cpuload(cpu);
Nick Piggina2000572006-02-10 01:51:02 -08001243
Peter Williams2dd73a42006-06-27 02:54:34 -07001244 if (type == 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02001245 return total;
Peter Williams2dd73a42006-06-27 02:54:34 -07001246
Ingo Molnardd41f592007-07-09 18:51:59 +02001247 return max(rq->cpu_load[type-1], total);
Peter Williams2dd73a42006-06-27 02:54:34 -07001248}
1249
1250/*
1251 * Return the average load per task on the cpu's run queue
1252 */
1253static inline unsigned long cpu_avg_load_per_task(int cpu)
1254{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001255 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001256 unsigned long total = weighted_cpuload(cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001257 unsigned long n = rq->nr_running;
1258
Ingo Molnardd41f592007-07-09 18:51:59 +02001259 return n ? total / n : SCHED_LOAD_SCALE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001260}
1261
Nick Piggin147cbb42005-06-25 14:57:19 -07001262/*
1263 * find_idlest_group finds and returns the least busy CPU group within the
1264 * domain.
1265 */
1266static struct sched_group *
1267find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1268{
1269 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1270 unsigned long min_load = ULONG_MAX, this_load = 0;
1271 int load_idx = sd->forkexec_idx;
1272 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1273
1274 do {
1275 unsigned long load, avg_load;
1276 int local_group;
1277 int i;
1278
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001279 /* Skip over this group if it has no CPUs allowed */
1280 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1281 goto nextgroup;
1282
Nick Piggin147cbb42005-06-25 14:57:19 -07001283 local_group = cpu_isset(this_cpu, group->cpumask);
Nick Piggin147cbb42005-06-25 14:57:19 -07001284
1285 /* Tally up the load of all CPUs in the group */
1286 avg_load = 0;
1287
1288 for_each_cpu_mask(i, group->cpumask) {
1289 /* Bias balancing toward cpus of our domain */
1290 if (local_group)
1291 load = source_load(i, load_idx);
1292 else
1293 load = target_load(i, load_idx);
1294
1295 avg_load += load;
1296 }
1297
1298 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07001299 avg_load = sg_div_cpu_power(group,
1300 avg_load * SCHED_LOAD_SCALE);
Nick Piggin147cbb42005-06-25 14:57:19 -07001301
1302 if (local_group) {
1303 this_load = avg_load;
1304 this = group;
1305 } else if (avg_load < min_load) {
1306 min_load = avg_load;
1307 idlest = group;
1308 }
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001309nextgroup:
Nick Piggin147cbb42005-06-25 14:57:19 -07001310 group = group->next;
1311 } while (group != sd->groups);
1312
1313 if (!idlest || 100*this_load < imbalance*min_load)
1314 return NULL;
1315 return idlest;
1316}
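/*
 * Illustrative note (not from the original source): with a typical
 * sd->imbalance_pct of 125, imbalance works out to 112, so the
 * "100*this_load < imbalance*min_load" check keeps the task local unless
 * the idlest remote group is loaded at least ~11% more lightly than the
 * local group.
 */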
1317
1318/*
Satoru Takeuchi0feaece2006-10-03 01:14:10 -07001319 * find_idlest_cpu - find the idlest cpu among the cpus in group.
Nick Piggin147cbb42005-06-25 14:57:19 -07001320 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07001321static int
1322find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
Nick Piggin147cbb42005-06-25 14:57:19 -07001323{
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001324 cpumask_t tmp;
Nick Piggin147cbb42005-06-25 14:57:19 -07001325 unsigned long load, min_load = ULONG_MAX;
1326 int idlest = -1;
1327 int i;
1328
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001329 /* Traverse only the allowed CPUs */
1330 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1331
1332 for_each_cpu_mask(i, tmp) {
Peter Williams2dd73a42006-06-27 02:54:34 -07001333 load = weighted_cpuload(i);
Nick Piggin147cbb42005-06-25 14:57:19 -07001334
1335 if (load < min_load || (load == min_load && i == this_cpu)) {
1336 min_load = load;
1337 idlest = i;
1338 }
1339 }
1340
1341 return idlest;
1342}
1343
Nick Piggin476d1392005-06-25 14:57:29 -07001344/*
1345 * sched_balance_self: balance the current task (running on cpu) in domains
1346 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1347 * SD_BALANCE_EXEC.
1348 *
 1349 * Balance, i.e. select the least loaded group.
1350 *
1351 * Returns the target CPU number, or the same CPU if no balancing is needed.
1352 *
1353 * preempt must be disabled.
1354 */
1355static int sched_balance_self(int cpu, int flag)
1356{
1357 struct task_struct *t = current;
1358 struct sched_domain *tmp, *sd = NULL;
Nick Piggin147cbb42005-06-25 14:57:19 -07001359
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001360 for_each_domain(cpu, tmp) {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07001361 /*
1362 * If power savings logic is enabled for a domain, stop there.
1363 */
1364 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1365 break;
Nick Piggin476d1392005-06-25 14:57:29 -07001366 if (tmp->flags & flag)
1367 sd = tmp;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001368 }
Nick Piggin476d1392005-06-25 14:57:29 -07001369
1370 while (sd) {
1371 cpumask_t span;
1372 struct sched_group *group;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001373 int new_cpu, weight;
1374
1375 if (!(sd->flags & flag)) {
1376 sd = sd->child;
1377 continue;
1378 }
Nick Piggin476d1392005-06-25 14:57:29 -07001379
1380 span = sd->span;
1381 group = find_idlest_group(sd, t, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001382 if (!group) {
1383 sd = sd->child;
1384 continue;
1385 }
Nick Piggin476d1392005-06-25 14:57:29 -07001386
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001387 new_cpu = find_idlest_cpu(group, t, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001388 if (new_cpu == -1 || new_cpu == cpu) {
1389 /* Now try balancing at a lower domain level of cpu */
1390 sd = sd->child;
1391 continue;
1392 }
Nick Piggin476d1392005-06-25 14:57:29 -07001393
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001394 /* Now try balancing at a lower domain level of new_cpu */
Nick Piggin476d1392005-06-25 14:57:29 -07001395 cpu = new_cpu;
Nick Piggin476d1392005-06-25 14:57:29 -07001396 sd = NULL;
1397 weight = cpus_weight(span);
1398 for_each_domain(cpu, tmp) {
1399 if (weight <= cpus_weight(tmp->span))
1400 break;
1401 if (tmp->flags & flag)
1402 sd = tmp;
1403 }
1404 /* while loop will break here if sd == NULL */
1405 }
1406
1407 return cpu;
1408}
1409
1410#endif /* CONFIG_SMP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411
1412/*
1413 * wake_idle() will wake a task on an idle cpu if task->cpu is
1414 * not idle and an idle cpu is available. The span of cpus to
1415 * search starts with cpus closest then further out as needed,
1416 * so we always favor a closer, idle cpu.
1417 *
1418 * Returns the CPU we should wake onto.
1419 */
1420#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
Ingo Molnar36c8b582006-07-03 00:25:41 -07001421static int wake_idle(int cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422{
1423 cpumask_t tmp;
1424 struct sched_domain *sd;
1425 int i;
1426
Siddha, Suresh B49531982007-05-08 00:33:01 -07001427 /*
1428 * If it is idle, then it is the best cpu to run this task.
1429 *
 1430 * This cpu is also the best if it already has more than one task.
 1431 * Siblings must also be busy (in most cases) as they didn't already
 1432 * pick up the extra load from this cpu, and hence we need not check
 1433 * sibling runqueue info. This avoids the checks and the cache-miss
 1434 * penalties associated with that.
1435 */
1436 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437 return cpu;
1438
1439 for_each_domain(cpu, sd) {
1440 if (sd->flags & SD_WAKE_IDLE) {
Nick Piggine0f364f2005-06-25 14:57:06 -07001441 cpus_and(tmp, sd->span, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442 for_each_cpu_mask(i, tmp) {
1443 if (idle_cpu(i))
1444 return i;
1445 }
1446 }
Nick Piggine0f364f2005-06-25 14:57:06 -07001447 else
1448 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449 }
1450 return cpu;
1451}
1452#else
Ingo Molnar36c8b582006-07-03 00:25:41 -07001453static inline int wake_idle(int cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454{
1455 return cpu;
1456}
1457#endif
1458
1459/***
1460 * try_to_wake_up - wake up a thread
1461 * @p: the to-be-woken-up thread
1462 * @state: the mask of task states that can be woken
1463 * @sync: do a synchronous wakeup?
1464 *
1465 * Put it on the run-queue if it's not already there. The "current"
1466 * thread is always on the run-queue (except when the actual
1467 * re-schedule is in progress), and as such you're allowed to do
1468 * the simpler "current->state = TASK_RUNNING" to mark yourself
1469 * runnable without the overhead of this.
1470 *
1471 * returns failure only if the task is already active.
1472 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001473static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001474{
1475 int cpu, this_cpu, success = 0;
1476 unsigned long flags;
1477 long old_state;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001478 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479#ifdef CONFIG_SMP
Nick Piggin78979862005-06-25 14:57:13 -07001480 struct sched_domain *sd, *this_sd = NULL;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001481 unsigned long load, this_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482 int new_cpu;
1483#endif
1484
1485 rq = task_rq_lock(p, &flags);
1486 old_state = p->state;
1487 if (!(old_state & state))
1488 goto out;
1489
Ingo Molnardd41f592007-07-09 18:51:59 +02001490 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491 goto out_running;
1492
1493 cpu = task_cpu(p);
1494 this_cpu = smp_processor_id();
1495
1496#ifdef CONFIG_SMP
1497 if (unlikely(task_running(rq, p)))
1498 goto out_activate;
1499
Nick Piggin78979862005-06-25 14:57:13 -07001500 new_cpu = cpu;
1501
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 schedstat_inc(rq, ttwu_cnt);
1503 if (cpu == this_cpu) {
1504 schedstat_inc(rq, ttwu_local);
Nick Piggin78979862005-06-25 14:57:13 -07001505 goto out_set_cpu;
1506 }
1507
1508 for_each_domain(this_cpu, sd) {
1509 if (cpu_isset(cpu, sd->span)) {
1510 schedstat_inc(sd, ttwu_wake_remote);
1511 this_sd = sd;
1512 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001513 }
1514 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001515
Nick Piggin78979862005-06-25 14:57:13 -07001516 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001517 goto out_set_cpu;
1518
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519 /*
Nick Piggin78979862005-06-25 14:57:13 -07001520 * Check for affine wakeup and passive balancing possibilities.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001521 */
Nick Piggin78979862005-06-25 14:57:13 -07001522 if (this_sd) {
1523 int idx = this_sd->wake_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001524 unsigned int imbalance;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525
Nick Piggina3f21bc2005-06-25 14:57:15 -07001526 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1527
Nick Piggin78979862005-06-25 14:57:13 -07001528 load = source_load(cpu, idx);
1529 this_load = target_load(this_cpu, idx);
1530
Nick Piggin78979862005-06-25 14:57:13 -07001531 new_cpu = this_cpu; /* Wake to this CPU if we can */
1532
Nick Piggina3f21bc2005-06-25 14:57:15 -07001533 if (this_sd->flags & SD_WAKE_AFFINE) {
1534 unsigned long tl = this_load;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08001535 unsigned long tl_per_task;
1536
1537 tl_per_task = cpu_avg_load_per_task(this_cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001538
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 /*
Nick Piggina3f21bc2005-06-25 14:57:15 -07001540			 * If this is a sync wakeup, subtract the (maximum possible)
1541 * effect of the currently running task from the load
1542 * of the current CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 */
Nick Piggina3f21bc2005-06-25 14:57:15 -07001544 if (sync)
Ingo Molnardd41f592007-07-09 18:51:59 +02001545 tl -= current->se.load.weight;
Nick Piggina3f21bc2005-06-25 14:57:15 -07001546
1547 if ((tl <= load &&
Peter Williams2dd73a42006-06-27 02:54:34 -07001548 tl + target_load(cpu, idx) <= tl_per_task) ||
Ingo Molnardd41f592007-07-09 18:51:59 +02001549 100*(tl + p->se.load.weight) <= imbalance*load) {
Nick Piggina3f21bc2005-06-25 14:57:15 -07001550 /*
1551 * This domain has SD_WAKE_AFFINE and
1552 * p is cache cold in this domain, and
1553 * there is no bad imbalance.
1554 */
1555 schedstat_inc(this_sd, ttwu_move_affine);
1556 goto out_set_cpu;
1557 }
1558 }
1559
1560 /*
1561 * Start passive balancing when half the imbalance_pct
1562 * limit is reached.
1563 */
1564 if (this_sd->flags & SD_WAKE_BALANCE) {
1565 if (imbalance*this_load <= 100*load) {
1566 schedstat_inc(this_sd, ttwu_move_balance);
1567 goto out_set_cpu;
1568 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001569 }
1570 }
1571
1572 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1573out_set_cpu:
1574 new_cpu = wake_idle(new_cpu, p);
1575 if (new_cpu != cpu) {
1576 set_task_cpu(p, new_cpu);
1577 task_rq_unlock(rq, &flags);
1578 /* might preempt at this point */
1579 rq = task_rq_lock(p, &flags);
1580 old_state = p->state;
1581 if (!(old_state & state))
1582 goto out;
Ingo Molnardd41f592007-07-09 18:51:59 +02001583 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 goto out_running;
1585
1586 this_cpu = smp_processor_id();
1587 cpu = task_cpu(p);
1588 }
1589
1590out_activate:
1591#endif /* CONFIG_SMP */
Ingo Molnardd41f592007-07-09 18:51:59 +02001592 activate_task(rq, p, 1);
Ingo Molnard79fc0f2005-09-10 00:26:12 -07001593 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594 * Sync wakeups (i.e. those types of wakeups where the waker
1595 * has indicated that it will leave the CPU in short order)
 1596	 * don't trigger a preemption if the woken-up task will run on
 1597	 * this cpu. (In this case the 'I will reschedule' promise of
 1598	 * the waker guarantees that the freshly woken-up task is going
 1599	 * to be considered on this CPU.)
1600 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001601 if (!sync || cpu != this_cpu)
1602 check_preempt_curr(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603 success = 1;
1604
1605out_running:
1606 p->state = TASK_RUNNING;
1607out:
1608 task_rq_unlock(rq, &flags);
1609
1610 return success;
1611}
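/*
 * Illustrative sketch (not part of the scheduler): the affine-wakeup test
 * used above, reduced to its arithmetic with made-up numbers. The helper
 * name is hypothetical and the imbalance_pct of 125 is only an assumed,
 * typical domain default.
 */
static inline int wake_affine_example(void)
{
	unsigned long imbalance = 100 + (125 - 100) / 2;	/* 112 */
	unsigned long load = 2048;	/* source_load() of the task's old CPU */
	unsigned long tl = 1024;	/* target_load() of the waking CPU */
	unsigned long weight = 1024;	/* p->se.load.weight of a nice-0 task */

	/* 100 * (1024 + 1024) = 204800 <= 112 * 2048 = 229376: wake affine */
	return 100 * (tl + weight) <= imbalance * load;
}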
1612
Ingo Molnar36c8b582006-07-03 00:25:41 -07001613int fastcall wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614{
1615 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1616 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1617}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618EXPORT_SYMBOL(wake_up_process);
1619
Ingo Molnar36c8b582006-07-03 00:25:41 -07001620int fastcall wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001621{
1622 return try_to_wake_up(p, state, 0);
1623}
1624
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625/*
1626 * Perform scheduler related setup for a newly forked process p.
1627 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02001628 *
1629 * __sched_fork() is basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001630 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001631static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632{
Ingo Molnardd41f592007-07-09 18:51:59 +02001633 p->se.wait_start_fair = 0;
1634 p->se.wait_start = 0;
1635 p->se.exec_start = 0;
1636 p->se.sum_exec_runtime = 0;
1637 p->se.delta_exec = 0;
1638 p->se.delta_fair_run = 0;
1639 p->se.delta_fair_sleep = 0;
1640 p->se.wait_runtime = 0;
1641 p->se.sum_wait_runtime = 0;
1642 p->se.sum_sleep_runtime = 0;
1643 p->se.sleep_start = 0;
1644 p->se.sleep_start_fair = 0;
1645 p->se.block_start = 0;
1646 p->se.sleep_max = 0;
1647 p->se.block_max = 0;
1648 p->se.exec_max = 0;
1649 p->se.wait_max = 0;
1650 p->se.wait_runtime_overruns = 0;
1651 p->se.wait_runtime_underruns = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001652
Ingo Molnardd41f592007-07-09 18:51:59 +02001653 INIT_LIST_HEAD(&p->run_list);
1654 p->se.on_rq = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001655
Linus Torvalds1da177e2005-04-16 15:20:36 -07001656 /*
1657 * We mark the process as running here, but have not actually
1658 * inserted it onto the runqueue yet. This guarantees that
1659 * nobody will actually run it, and a signal or other external
1660 * event cannot wake it up and insert it on the runqueue either.
1661 */
1662 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02001663}
1664
1665/*
1666 * fork()/clone()-time setup:
1667 */
1668void sched_fork(struct task_struct *p, int clone_flags)
1669{
1670 int cpu = get_cpu();
1671
1672 __sched_fork(p);
1673
1674#ifdef CONFIG_SMP
1675 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1676#endif
1677 __set_task_cpu(p, cpu);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001678
1679 /*
1680 * Make sure we do not leak PI boosting priority to the child:
1681 */
1682 p->prio = current->normal_prio;
1683
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001684#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02001685 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001686 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687#endif
Chen, Kenneth Wd6077cb2006-02-14 13:53:10 -08001688#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
Nick Piggin4866cde2005-06-25 14:57:23 -07001689 p->oncpu = 0;
1690#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001691#ifdef CONFIG_PREEMPT
Nick Piggin4866cde2005-06-25 14:57:23 -07001692 /* Want to start with kernel preemption disabled. */
Al Viroa1261f52005-11-13 16:06:55 -08001693 task_thread_info(p)->preempt_count = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001694#endif
Nick Piggin476d1392005-06-25 14:57:29 -07001695 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696}
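/*
 * Illustrative sketch (not kernel code) of why sched_fork() above copies
 * normal_prio rather than prio. The helper and the numeric priorities are
 * hypothetical: a nice-0 parent (normal_prio 120) temporarily boosted to a
 * real-time priority (prio 50) by rt-mutex priority inheritance.
 */
static void pi_fork_example(struct task_struct *parent,
			    struct task_struct *child)
{
	/* parent->prio == 50 (boosted), parent->normal_prio == 120 */
	child->prio = parent->normal_prio;	/* child starts at 120, unboosted */
}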
1697
1698/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001699 * After fork, the child runs first (the default). If set to 0, the
 1700 * parent will (try to) run first.
1701 */
1702unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1703
1704/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001705 * wake_up_new_task - wake up a newly created task for the first time.
1706 *
1707 * This function will do some initial scheduler statistics housekeeping
1708 * that must be done for every newly created context, then puts the task
1709 * on the runqueue and wakes it.
1710 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001711void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001712{
1713 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001714 struct rq *rq;
1715 int this_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716
1717 rq = task_rq_lock(p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 BUG_ON(p->state != TASK_RUNNING);
Ingo Molnardd41f592007-07-09 18:51:59 +02001719 this_cpu = smp_processor_id(); /* parent's CPU */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720
1721 p->prio = effective_prio(p);
1722
Ingo Molnardd41f592007-07-09 18:51:59 +02001723 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1724 task_cpu(p) != this_cpu || !current->se.on_rq) {
1725 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001727 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02001728 * Let the scheduling class do new task startup
1729 * management (if any):
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001731 p->sched_class->task_new(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001733 check_preempt_curr(rq, p);
1734 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735}
1736
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737/**
Nick Piggin4866cde2005-06-25 14:57:23 -07001738 * prepare_task_switch - prepare to switch tasks
1739 * @rq: the runqueue preparing to switch
1740 * @next: the task we are going to switch to.
1741 *
1742 * This is called with the rq lock held and interrupts off. It must
1743 * be paired with a subsequent finish_task_switch after the context
1744 * switch.
1745 *
1746 * prepare_task_switch sets up locking and calls architecture specific
1747 * hooks.
1748 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001749static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001750{
1751 prepare_lock_switch(rq, next);
1752 prepare_arch_switch(next);
1753}
1754
1755/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04001757 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07001758 * @prev: the thread we just switched away from.
1759 *
Nick Piggin4866cde2005-06-25 14:57:23 -07001760 * finish_task_switch must be called after the context switch, paired
1761 * with a prepare_task_switch call before the context switch.
1762 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1763 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764 *
1765 * Note that we may have delayed dropping an mm in context_switch(). If
1766 * so, we finish that here outside of the runqueue lock. (Doing it
1767 * with the lock held can cause deadlocks; see schedule() for
1768 * details.)
1769 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001770static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 __releases(rq->lock)
1772{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001773 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001774 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775
1776 rq->prev_mm = NULL;
1777
1778 /*
 1779	 * A task struct has one reference for its use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001780 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001781 * schedule one last time. The schedule call will never return, and
1782 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001783 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07001784 * still held, otherwise prev could be scheduled on another cpu, die
1785 * there before we look at prev->state, and then the reference would
1786 * be dropped twice.
1787 * Manfred Spraul <manfred@colorfullife.com>
1788 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001789 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07001790 finish_arch_switch(prev);
1791 finish_lock_switch(rq, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001792 if (mm)
1793 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001794 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08001795 /*
1796 * Remove function-return probe instances associated with this
1797 * task and put them back on the free list.
1798 */
1799 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08001801 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802}
1803
1804/**
1805 * schedule_tail - first thing a freshly forked thread must call.
1806 * @prev: the thread we just switched away from.
1807 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001808asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001809 __releases(rq->lock)
1810{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001811 struct rq *rq = this_rq();
1812
Nick Piggin4866cde2005-06-25 14:57:23 -07001813 finish_task_switch(rq, prev);
1814#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1815 /* In this case, finish_task_switch does not reenable preemption */
1816 preempt_enable();
1817#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818 if (current->set_child_tid)
1819 put_user(current->pid, current->set_child_tid);
1820}
1821
1822/*
1823 * context_switch - switch to the new MM and the new
1824 * thread's register state.
1825 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001826static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07001827context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07001828 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001829{
Ingo Molnardd41f592007-07-09 18:51:59 +02001830 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831
Ingo Molnardd41f592007-07-09 18:51:59 +02001832 prepare_task_switch(rq, next);
1833 mm = next->mm;
1834 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01001835 /*
1836 * For paravirt, this is coupled with an exit in switch_to to
1837 * combine the page table reload and the switch backend into
1838 * one hypercall.
1839 */
1840 arch_enter_lazy_cpu_mode();
1841
Ingo Molnardd41f592007-07-09 18:51:59 +02001842 if (unlikely(!mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843 next->active_mm = oldmm;
1844 atomic_inc(&oldmm->mm_count);
1845 enter_lazy_tlb(oldmm, next);
1846 } else
1847 switch_mm(oldmm, mm, next);
1848
Ingo Molnardd41f592007-07-09 18:51:59 +02001849 if (unlikely(!prev->mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001850 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851 rq->prev_mm = oldmm;
1852 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001853 /*
 1854	 * The runqueue lock will be released by the next
1855 * task (which is an invalid locking op but in the case
1856 * of the scheduler it's an obvious special-case), so we
1857 * do an early lockdep release here:
1858 */
1859#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001860 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001861#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001862
1863 /* Here we just switch the register state and the stack. */
1864 switch_to(prev, next, prev);
1865
Ingo Molnardd41f592007-07-09 18:51:59 +02001866 barrier();
1867 /*
1868 * this_rq must be evaluated again because prev may have moved
1869 * CPUs since it called schedule(), thus the 'rq' on its stack
1870 * frame will be invalid.
1871 */
1872 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873}
1874
1875/*
1876 * nr_running, nr_uninterruptible and nr_context_switches:
1877 *
1878 * externally visible scheduler statistics: current number of runnable
1879 * threads, current number of uninterruptible-sleeping threads, total
1880 * number of context switches performed since bootup.
1881 */
1882unsigned long nr_running(void)
1883{
1884 unsigned long i, sum = 0;
1885
1886 for_each_online_cpu(i)
1887 sum += cpu_rq(i)->nr_running;
1888
1889 return sum;
1890}
1891
1892unsigned long nr_uninterruptible(void)
1893{
1894 unsigned long i, sum = 0;
1895
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001896 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897 sum += cpu_rq(i)->nr_uninterruptible;
1898
1899 /*
1900 * Since we read the counters lockless, it might be slightly
1901 * inaccurate. Do not allow it to go below zero though:
1902 */
1903 if (unlikely((long)sum < 0))
1904 sum = 0;
1905
1906 return sum;
1907}
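/*
 * Illustrative sketch (not kernel code; the helper name is hypothetical).
 * It isolates the clamp used above: because the per-CPU counters are summed
 * without locking, the total can transiently dip below zero when a task is
 * accounted on one CPU and un-accounted on another between the two reads.
 */
static unsigned long clamp_racy_sum(long racy_sum)
{
	/* e.g. racy_sum == -1 after such a race; report 0 instead */
	if (unlikely(racy_sum < 0))
		return 0;
	return (unsigned long)racy_sum;
}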
1908
1909unsigned long long nr_context_switches(void)
1910{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07001911 int i;
1912 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001914 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 sum += cpu_rq(i)->nr_switches;
1916
1917 return sum;
1918}
1919
1920unsigned long nr_iowait(void)
1921{
1922 unsigned long i, sum = 0;
1923
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001924 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001925 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1926
1927 return sum;
1928}
1929
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08001930unsigned long nr_active(void)
1931{
1932 unsigned long i, running = 0, uninterruptible = 0;
1933
1934 for_each_online_cpu(i) {
1935 running += cpu_rq(i)->nr_running;
1936 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1937 }
1938
1939 if (unlikely((long)uninterruptible < 0))
1940 uninterruptible = 0;
1941
1942 return running + uninterruptible;
1943}
1944
Linus Torvalds1da177e2005-04-16 15:20:36 -07001945/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001946 * Update rq->cpu_load[] statistics. This function is usually called every
1947 * scheduler tick (TICK_NSEC).
Ingo Molnar48f24c42006-07-03 00:25:40 -07001948 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001949static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07001950{
Ingo Molnardd41f592007-07-09 18:51:59 +02001951 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1952 unsigned long total_load = this_rq->ls.load.weight;
1953 unsigned long this_load = total_load;
1954 struct load_stat *ls = &this_rq->ls;
1955 u64 now = __rq_clock(this_rq);
1956 int i, scale;
1957
1958 this_rq->nr_load_updates++;
1959 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1960 goto do_avg;
1961
1962 /* Update delta_fair/delta_exec fields first */
1963 update_curr_load(this_rq, now);
1964
1965 fair_delta64 = ls->delta_fair + 1;
1966 ls->delta_fair = 0;
1967
1968 exec_delta64 = ls->delta_exec + 1;
1969 ls->delta_exec = 0;
1970
1971 sample_interval64 = now - ls->load_update_last;
1972 ls->load_update_last = now;
1973
1974 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1975 sample_interval64 = TICK_NSEC;
1976
1977 if (exec_delta64 > sample_interval64)
1978 exec_delta64 = sample_interval64;
1979
1980 idle_delta64 = sample_interval64 - exec_delta64;
1981
1982 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1983 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1984
1985 this_load = (unsigned long)tmp64;
1986
1987do_avg:
1988
1989 /* Update our load: */
1990 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1991 unsigned long old_load, new_load;
1992
1993 /* scale is effectively 1 << i now, and >> i divides by scale */
1994
1995 old_load = this_rq->cpu_load[i];
1996 new_load = this_load;
1997
1998 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1999 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002000}
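/*
 * Illustrative sketch (not kernel code; the helper is hypothetical and the
 * load value of 2048 is made up). It reduces the cpu_load[] update above to
 * its core: each index is an exponential moving average of the instantaneous
 * load, with higher indices decaying more slowly.
 */
static void cpu_load_decay_example(unsigned long cpu_load[CPU_LOAD_IDX_MAX])
{
	unsigned long this_load = 2048;
	int i;

	for (i = 0; i < CPU_LOAD_IDX_MAX; i++) {
		unsigned long scale = 1UL << i;

		/* index 0 tracks this_load exactly; index i keeps (scale-1)/scale of the old value */
		cpu_load[i] = (cpu_load[i] * (scale - 1) + this_load) >> i;
	}
}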
2001
Ingo Molnardd41f592007-07-09 18:51:59 +02002002#ifdef CONFIG_SMP
2003
Ingo Molnar48f24c42006-07-03 00:25:40 -07002004/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002005 * double_rq_lock - safely lock two runqueues
2006 *
2007 * Note this does not disable interrupts like task_rq_lock,
2008 * you need to do so manually before calling.
2009 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002010static void double_rq_lock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002011 __acquires(rq1->lock)
2012 __acquires(rq2->lock)
2013{
Kirill Korotaev054b9102006-12-10 02:20:11 -08002014 BUG_ON(!irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07002015 if (rq1 == rq2) {
2016 spin_lock(&rq1->lock);
2017 __acquire(rq2->lock); /* Fake it out ;) */
2018 } else {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002019 if (rq1 < rq2) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002020 spin_lock(&rq1->lock);
2021 spin_lock(&rq2->lock);
2022 } else {
2023 spin_lock(&rq2->lock);
2024 spin_lock(&rq1->lock);
2025 }
2026 }
2027}
2028
2029/*
2030 * double_rq_unlock - safely unlock two runqueues
2031 *
2032 * Note this does not restore interrupts like task_rq_unlock,
2033 * you need to do so manually after calling.
2034 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002035static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036 __releases(rq1->lock)
2037 __releases(rq2->lock)
2038{
2039 spin_unlock(&rq1->lock);
2040 if (rq1 != rq2)
2041 spin_unlock(&rq2->lock);
2042 else
2043 __release(rq2->lock);
2044}
2045
2046/*
2047 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2048 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002049static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050 __releases(this_rq->lock)
2051 __acquires(busiest->lock)
2052 __acquires(this_rq->lock)
2053{
Kirill Korotaev054b9102006-12-10 02:20:11 -08002054 if (unlikely(!irqs_disabled())) {
 2055		/* printk() doesn't work well under rq->lock */
2056 spin_unlock(&this_rq->lock);
2057 BUG_ON(1);
2058 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002059 if (unlikely(!spin_trylock(&busiest->lock))) {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002060 if (busiest < this_rq) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002061 spin_unlock(&this_rq->lock);
2062 spin_lock(&busiest->lock);
2063 spin_lock(&this_rq->lock);
2064 } else
2065 spin_lock(&busiest->lock);
2066 }
2067}
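/*
 * Illustrative sketch (not kernel code; the helper name is hypothetical).
 * Both helpers above rely on the same rule to avoid ABBA deadlock between
 * two runqueue locks: whenever both must be held, the lock of the
 * lower-addressed runqueue is taken first, so any two CPUs contending for
 * the same pair agree on the order.
 */
static void ordered_double_lock_example(struct rq *a, struct rq *b)
{
	if (a == b) {
		spin_lock(&a->lock);		/* one runqueue: one lock */
	} else if (a < b) {
		spin_lock(&a->lock);		/* lower address first ... */
		spin_lock(&b->lock);
	} else {
		spin_lock(&b->lock);
		spin_lock(&a->lock);		/* ... higher address second */
	}
}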
2068
2069/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070 * If dest_cpu is allowed for this process, migrate the task to it.
2071 * This is accomplished by forcing the cpu_allowed mask to only
 2072 * allow dest_cpu, which will force the task onto dest_cpu. Then
2073 * the cpu_allowed mask is restored.
2074 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002075static void sched_migrate_task(struct task_struct *p, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002077 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002079 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002080
2081 rq = task_rq_lock(p, &flags);
2082 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2083 || unlikely(cpu_is_offline(dest_cpu)))
2084 goto out;
2085
2086 /* force the process onto the specified CPU */
2087 if (migrate_task(p, dest_cpu, &req)) {
2088 /* Need to wait for migration thread (might exit: take ref). */
2089 struct task_struct *mt = rq->migration_thread;
Ingo Molnar36c8b582006-07-03 00:25:41 -07002090
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091 get_task_struct(mt);
2092 task_rq_unlock(rq, &flags);
2093 wake_up_process(mt);
2094 put_task_struct(mt);
2095 wait_for_completion(&req.done);
Ingo Molnar36c8b582006-07-03 00:25:41 -07002096
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097 return;
2098 }
2099out:
2100 task_rq_unlock(rq, &flags);
2101}
2102
2103/*
Nick Piggin476d1392005-06-25 14:57:29 -07002104 * sched_exec - execve() is a valuable balancing opportunity, because at
2105 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002106 */
2107void sched_exec(void)
2108{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109 int new_cpu, this_cpu = get_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002110 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002111 put_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002112 if (new_cpu != this_cpu)
2113 sched_migrate_task(current, new_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002114}
2115
2116/*
2117 * pull_task - move a task from a remote runqueue to the local runqueue.
2118 * Both runqueues must be locked.
2119 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002120static void pull_task(struct rq *src_rq, struct task_struct *p,
2121 struct rq *this_rq, int this_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002122{
Ingo Molnardd41f592007-07-09 18:51:59 +02002123 deactivate_task(src_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002124 set_task_cpu(p, this_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002125 activate_task(this_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126 /*
 2127	 * Note that idle threads have a prio of MAX_PRIO, so this test
 2128	 * is always true for them.
2129 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002130 check_preempt_curr(this_rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131}
2132
2133/*
2134 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2135 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08002136static
Ingo Molnar70b97a72006-07-03 00:25:42 -07002137int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002138 struct sched_domain *sd, enum cpu_idle_type idle,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07002139 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140{
2141 /*
 2142	 * We do not migrate tasks that:
 2143	 * 1) are running (obviously), or
2144 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2145 * 3) are cache-hot on their current CPU.
2146 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002147 if (!cpu_isset(this_cpu, p->cpus_allowed))
2148 return 0;
Nick Piggin81026792005-06-25 14:57:07 -07002149 *all_pinned = 0;
2150
2151 if (task_running(rq, p))
2152 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153
2154 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02002155 * Aggressive migration if too many balance attempts have failed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002157 if (sd->nr_balance_failed > sd->cache_nice_tries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 return 1;
2159
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160 return 1;
2161}
2162
Ingo Molnardd41f592007-07-09 18:51:59 +02002163static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2164 unsigned long max_nr_move, unsigned long max_load_move,
2165 struct sched_domain *sd, enum cpu_idle_type idle,
2166 int *all_pinned, unsigned long *load_moved,
2167 int this_best_prio, int best_prio, int best_prio_seen,
2168 struct rq_iterator *iterator)
2169{
2170 int pulled = 0, pinned = 0, skip_for_load;
2171 struct task_struct *p;
2172 long rem_load_move = max_load_move;
2173
2174 if (max_nr_move == 0 || max_load_move == 0)
2175 goto out;
2176
2177 pinned = 1;
2178
2179 /*
2180 * Start the load-balancing iterator:
2181 */
2182 p = iterator->start(iterator->arg);
2183next:
2184 if (!p)
2185 goto out;
2186 /*
 2187	 * To help distribute high priority tasks across CPUs, we don't
 2188	 * skip a task if it will be the highest priority task (i.e. smallest
 2189	 * prio value) on its new queue, regardless of its load weight.
2190 */
2191 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2192 SCHED_LOAD_SCALE_FUZZ;
2193 if (skip_for_load && p->prio < this_best_prio)
2194 skip_for_load = !best_prio_seen && p->prio == best_prio;
2195 if (skip_for_load ||
2196 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2197
2198 best_prio_seen |= p->prio == best_prio;
2199 p = iterator->next(iterator->arg);
2200 goto next;
2201 }
2202
2203 pull_task(busiest, p, this_rq, this_cpu);
2204 pulled++;
2205 rem_load_move -= p->se.load.weight;
2206
2207 /*
2208 * We only want to steal up to the prescribed number of tasks
2209 * and the prescribed amount of weighted load.
2210 */
2211 if (pulled < max_nr_move && rem_load_move > 0) {
2212 if (p->prio < this_best_prio)
2213 this_best_prio = p->prio;
2214 p = iterator->next(iterator->arg);
2215 goto next;
2216 }
2217out:
2218 /*
2219 * Right now, this is the only place pull_task() is called,
2220 * so we can safely collect pull_task() stats here rather than
2221 * inside pull_task().
2222 */
2223 schedstat_add(sd, lb_gained[idle], pulled);
2224
2225 if (all_pinned)
2226 *all_pinned = pinned;
2227 *load_moved = max_load_move - rem_load_move;
2228 return pulled;
2229}
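/*
 * Illustrative sketch (not kernel code; the helper name is hypothetical).
 * It isolates the skip_for_load test above: unless the task would become the
 * highest-priority task on the destination, it is skipped once its weight
 * exceeds twice the remaining load to move plus the fuzz margin, i.e. heavy
 * tasks are skipped first as rem_load_move shrinks.
 */
static int would_skip_for_load(struct task_struct *p, long rem_load_move)
{
	/* a nice-0 task (weight 1024) is skipped once rem_load_move + fuzz < 512 */
	return (p->se.load.weight >> 1) > rem_load_move + SCHED_LOAD_SCALE_FUZZ;
}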
Ingo Molnar48f24c42006-07-03 00:25:40 -07002230
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231/*
Peter Williams2dd73a42006-06-27 02:54:34 -07002232 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2233 * load from busiest to this_rq, as part of a balancing operation within
2234 * "domain". Returns the number of tasks moved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235 *
2236 * Called with both runqueues locked.
2237 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002238static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002239 unsigned long max_nr_move, unsigned long max_load_move,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002240 struct sched_domain *sd, enum cpu_idle_type idle,
Peter Williams2dd73a42006-06-27 02:54:34 -07002241 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242{
Ingo Molnardd41f592007-07-09 18:51:59 +02002243 struct sched_class *class = sched_class_highest;
2244 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2245 long rem_load_move = max_load_move;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246
Ingo Molnardd41f592007-07-09 18:51:59 +02002247 do {
2248 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2249 max_nr_move, (unsigned long)rem_load_move,
2250 sd, idle, all_pinned, &load_moved);
2251 total_nr_moved += nr_moved;
2252 max_nr_move -= nr_moved;
2253 rem_load_move -= load_moved;
2254 class = class->next;
2255 } while (class && max_nr_move && rem_load_move > 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256
Ingo Molnardd41f592007-07-09 18:51:59 +02002257 return total_nr_moved;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002258}
2259
2260/*
2261 * find_busiest_group finds and returns the busiest CPU group within the
Ingo Molnar48f24c42006-07-03 00:25:40 -07002262 * domain. It calculates and returns the amount of weighted load which
2263 * should be moved to restore balance via the imbalance parameter.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 */
2265static struct sched_group *
2266find_busiest_group(struct sched_domain *sd, int this_cpu,
Ingo Molnardd41f592007-07-09 18:51:59 +02002267 unsigned long *imbalance, enum cpu_idle_type idle,
2268 int *sd_idle, cpumask_t *cpus, int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002269{
2270 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2271 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002272 unsigned long max_pull;
Peter Williams2dd73a42006-06-27 02:54:34 -07002273 unsigned long busiest_load_per_task, busiest_nr_running;
2274 unsigned long this_load_per_task, this_nr_running;
Nick Piggin78979862005-06-25 14:57:13 -07002275 int load_idx;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002276#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2277 int power_savings_balance = 1;
2278 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2279 unsigned long min_nr_running = ULONG_MAX;
2280 struct sched_group *group_min = NULL, *group_leader = NULL;
2281#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002282
2283 max_load = this_load = total_load = total_pwr = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002284 busiest_load_per_task = busiest_nr_running = 0;
2285 this_load_per_task = this_nr_running = 0;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002286 if (idle == CPU_NOT_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002287 load_idx = sd->busy_idx;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002288 else if (idle == CPU_NEWLY_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002289 load_idx = sd->newidle_idx;
2290 else
2291 load_idx = sd->idle_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292
2293 do {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002294 unsigned long load, group_capacity;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002295 int local_group;
2296 int i;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002297 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002298 unsigned long sum_nr_running, sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002299
2300 local_group = cpu_isset(this_cpu, group->cpumask);
2301
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002302 if (local_group)
2303 balance_cpu = first_cpu(group->cpumask);
2304
Linus Torvalds1da177e2005-04-16 15:20:36 -07002305 /* Tally up the load of all CPUs in the group */
Peter Williams2dd73a42006-06-27 02:54:34 -07002306 sum_weighted_load = sum_nr_running = avg_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002307
2308 for_each_cpu_mask(i, group->cpumask) {
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002309 struct rq *rq;
2310
2311 if (!cpu_isset(i, *cpus))
2312 continue;
2313
2314 rq = cpu_rq(i);
Peter Williams2dd73a42006-06-27 02:54:34 -07002315
Nick Piggin5969fe02005-09-10 00:26:19 -07002316 if (*sd_idle && !idle_cpu(i))
2317 *sd_idle = 0;
2318
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319 /* Bias balancing toward cpus of our domain */
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002320 if (local_group) {
2321 if (idle_cpu(i) && !first_idle_cpu) {
2322 first_idle_cpu = 1;
2323 balance_cpu = i;
2324 }
2325
Nick Piggina2000572006-02-10 01:51:02 -08002326 load = target_load(i, load_idx);
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002327 } else
Nick Piggina2000572006-02-10 01:51:02 -08002328 load = source_load(i, load_idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002329
2330 avg_load += load;
Peter Williams2dd73a42006-06-27 02:54:34 -07002331 sum_nr_running += rq->nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002332 sum_weighted_load += weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333 }
2334
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002335 /*
 2336		 * The first idle cpu or the first cpu (busiest) in this sched group
 2337		 * is eligible for doing load balancing at this domain and the
 2338		 * domains above it.
2339 */
2340 if (local_group && balance_cpu != this_cpu && balance) {
2341 *balance = 0;
2342 goto ret;
2343 }
2344
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345 total_load += avg_load;
Eric Dumazet5517d862007-05-08 00:32:57 -07002346 total_pwr += group->__cpu_power;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347
2348 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07002349 avg_load = sg_div_cpu_power(group,
2350 avg_load * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002351
Eric Dumazet5517d862007-05-08 00:32:57 -07002352 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002353
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354 if (local_group) {
2355 this_load = avg_load;
2356 this = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002357 this_nr_running = sum_nr_running;
2358 this_load_per_task = sum_weighted_load;
2359 } else if (avg_load > max_load &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002360 sum_nr_running > group_capacity) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002361 max_load = avg_load;
2362 busiest = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002363 busiest_nr_running = sum_nr_running;
2364 busiest_load_per_task = sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002366
2367#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2368 /*
2369 * Busy processors will not participate in power savings
2370 * balance.
2371 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002372 if (idle == CPU_NOT_IDLE ||
2373 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2374 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002375
2376 /*
 2377		 * If the local group is idle or completely loaded,
 2378		 * there is no need to do power savings balance at this domain.
2379 */
2380 if (local_group && (this_nr_running >= group_capacity ||
2381 !this_nr_running))
2382 power_savings_balance = 0;
2383
Ingo Molnardd41f592007-07-09 18:51:59 +02002384 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002385 * If a group is already running at full capacity or idle,
2386 * don't include that group in power savings calculations
Ingo Molnardd41f592007-07-09 18:51:59 +02002387 */
2388 if (!power_savings_balance || sum_nr_running >= group_capacity
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002389 || !sum_nr_running)
Ingo Molnardd41f592007-07-09 18:51:59 +02002390 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002391
Ingo Molnardd41f592007-07-09 18:51:59 +02002392 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002393 * Calculate the group which has the least non-idle load.
Ingo Molnardd41f592007-07-09 18:51:59 +02002394		 * This is the group from which we need to pick up the load
 2395		 * for saving power.
2396 */
2397 if ((sum_nr_running < min_nr_running) ||
2398 (sum_nr_running == min_nr_running &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002399 first_cpu(group->cpumask) <
2400 first_cpu(group_min->cpumask))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002401 group_min = group;
2402 min_nr_running = sum_nr_running;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002403 min_load_per_task = sum_weighted_load /
2404 sum_nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002405 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002406
Ingo Molnardd41f592007-07-09 18:51:59 +02002407 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002408		 * Calculate the group which is nearly at its
Ingo Molnardd41f592007-07-09 18:51:59 +02002409		 * capacity but still has some space to pick up some load
 2410		 * from another group and save more power.
2411 */
2412 if (sum_nr_running <= group_capacity - 1) {
2413 if (sum_nr_running > leader_nr_running ||
2414 (sum_nr_running == leader_nr_running &&
2415 first_cpu(group->cpumask) >
2416 first_cpu(group_leader->cpumask))) {
2417 group_leader = group;
2418 leader_nr_running = sum_nr_running;
2419 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002420 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002421group_next:
2422#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002423 group = group->next;
2424 } while (group != sd->groups);
2425
Peter Williams2dd73a42006-06-27 02:54:34 -07002426 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427 goto out_balanced;
2428
2429 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2430
2431 if (this_load >= avg_load ||
2432 100*max_load <= sd->imbalance_pct*this_load)
2433 goto out_balanced;
2434
Peter Williams2dd73a42006-06-27 02:54:34 -07002435 busiest_load_per_task /= busiest_nr_running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436 /*
2437 * We're trying to get all the cpus to the average_load, so we don't
2438 * want to push ourselves above the average load, nor do we wish to
2439 * reduce the max loaded cpu below the average load, as either of these
2440 * actions would just result in more rebalancing later, and ping-pong
2441 * tasks around. Thus we look for the minimum possible imbalance.
2442 * Negative imbalances (*we* are more loaded than anyone else) will
2443 * be counted as no imbalance for these purposes -- we can't fix that
2444 * by pulling tasks to us. Be careful of negative numbers as they'll
2445 * appear as very large values with unsigned longs.
2446 */
Peter Williams2dd73a42006-06-27 02:54:34 -07002447 if (max_load <= busiest_load_per_task)
2448 goto out_balanced;
2449
2450 /*
2451 * In the presence of smp nice balancing, certain scenarios can have
 2452	 * max load less than avg load (as we skip the groups at or below
 2453	 * their cpu_power while calculating max_load).
2454 */
2455 if (max_load < avg_load) {
2456 *imbalance = 0;
2457 goto small_imbalance;
2458 }
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002459
2460 /* Don't want to pull so many tasks that a group would go idle */
Peter Williams2dd73a42006-06-27 02:54:34 -07002461 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002462
Linus Torvalds1da177e2005-04-16 15:20:36 -07002463 /* How much load to actually move to equalise the imbalance */
Eric Dumazet5517d862007-05-08 00:32:57 -07002464 *imbalance = min(max_pull * busiest->__cpu_power,
2465 (avg_load - this_load) * this->__cpu_power)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002466 / SCHED_LOAD_SCALE;
2467
Peter Williams2dd73a42006-06-27 02:54:34 -07002468 /*
 2469	 * If *imbalance is less than the average load per runnable task,
 2470	 * there is no guarantee that any tasks will be moved, so we may
 2471	 * need to bump its value to force at least one task to be
 2472	 * moved.
2473 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002474 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07002475 unsigned long tmp, pwr_now, pwr_move;
Peter Williams2dd73a42006-06-27 02:54:34 -07002476 unsigned int imbn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477
Peter Williams2dd73a42006-06-27 02:54:34 -07002478small_imbalance:
2479 pwr_move = pwr_now = 0;
2480 imbn = 2;
2481 if (this_nr_running) {
2482 this_load_per_task /= this_nr_running;
2483 if (busiest_load_per_task > this_load_per_task)
2484 imbn = 1;
2485 } else
2486 this_load_per_task = SCHED_LOAD_SCALE;
2487
Ingo Molnardd41f592007-07-09 18:51:59 +02002488 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2489 busiest_load_per_task * imbn) {
Peter Williams2dd73a42006-06-27 02:54:34 -07002490 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491 return busiest;
2492 }
2493
2494 /*
2495 * OK, we don't have enough imbalance to justify moving tasks,
2496 * however we may be able to increase total CPU power used by
2497 * moving them.
2498 */
2499
Eric Dumazet5517d862007-05-08 00:32:57 -07002500 pwr_now += busiest->__cpu_power *
2501 min(busiest_load_per_task, max_load);
2502 pwr_now += this->__cpu_power *
2503 min(this_load_per_task, this_load);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 pwr_now /= SCHED_LOAD_SCALE;
2505
2506 /* Amount of load we'd subtract */
Eric Dumazet5517d862007-05-08 00:32:57 -07002507 tmp = sg_div_cpu_power(busiest,
2508 busiest_load_per_task * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509 if (max_load > tmp)
Eric Dumazet5517d862007-05-08 00:32:57 -07002510 pwr_move += busiest->__cpu_power *
Peter Williams2dd73a42006-06-27 02:54:34 -07002511 min(busiest_load_per_task, max_load - tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002512
2513 /* Amount of load we'd add */
Eric Dumazet5517d862007-05-08 00:32:57 -07002514 if (max_load * busiest->__cpu_power <
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08002515 busiest_load_per_task * SCHED_LOAD_SCALE)
Eric Dumazet5517d862007-05-08 00:32:57 -07002516 tmp = sg_div_cpu_power(this,
2517 max_load * busiest->__cpu_power);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518 else
Eric Dumazet5517d862007-05-08 00:32:57 -07002519 tmp = sg_div_cpu_power(this,
2520 busiest_load_per_task * SCHED_LOAD_SCALE);
2521 pwr_move += this->__cpu_power *
2522 min(this_load_per_task, this_load + tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523 pwr_move /= SCHED_LOAD_SCALE;
2524
2525 /* Move if we gain throughput */
2526 if (pwr_move <= pwr_now)
2527 goto out_balanced;
2528
Peter Williams2dd73a42006-06-27 02:54:34 -07002529 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002530 }
2531
Linus Torvalds1da177e2005-04-16 15:20:36 -07002532 return busiest;
2533
2534out_balanced:
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002535#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002536 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002537 goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002538
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002539 if (this == group_leader && group_leader != group_min) {
2540 *imbalance = min_load_per_task;
2541 return group_min;
2542 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002543#endif
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002544ret:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002545 *imbalance = 0;
2546 return NULL;
2547}
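/*
 * Illustrative sketch (not kernel code) of the main imbalance computation in
 * find_busiest_group() above. The helper name and all numbers are
 * hypothetical, and both groups are assumed to be single CPUs with
 * __cpu_power == SCHED_LOAD_SCALE.
 */
static unsigned long imbalance_example(void)
{
	unsigned long this_load = 1000, max_load = 2000, avg_load = 1500;
	unsigned long busiest_load_per_task = 1024;	/* one nice-0 task */
	unsigned long power = SCHED_LOAD_SCALE;		/* both groups */
	unsigned long max_pull;

	/* max_pull = min(2000 - 1500, 2000 - 1024) = 500 */
	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

	/* min(500 * 1024, (1500 - 1000) * 1024) / 1024 = 500 units of load */
	return min(max_pull * power, (avg_load - this_load) * power)
							/ SCHED_LOAD_SCALE;
}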
2548
2549/*
2550 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2551 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002552static struct rq *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002553find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002554 unsigned long imbalance, cpumask_t *cpus)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002555{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002556 struct rq *busiest = NULL, *rq;
Peter Williams2dd73a42006-06-27 02:54:34 -07002557 unsigned long max_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 int i;
2559
2560 for_each_cpu_mask(i, group->cpumask) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002561 unsigned long wl;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002562
2563 if (!cpu_isset(i, *cpus))
2564 continue;
2565
Ingo Molnar48f24c42006-07-03 00:25:40 -07002566 rq = cpu_rq(i);
Ingo Molnardd41f592007-07-09 18:51:59 +02002567 wl = weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568
Ingo Molnardd41f592007-07-09 18:51:59 +02002569 if (rq->nr_running == 1 && wl > imbalance)
Peter Williams2dd73a42006-06-27 02:54:34 -07002570 continue;
2571
Ingo Molnardd41f592007-07-09 18:51:59 +02002572 if (wl > max_load) {
2573 max_load = wl;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002574 busiest = rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002575 }
2576 }
2577
2578 return busiest;
2579}
2580
2581/*
Nick Piggin77391d72005-06-25 14:57:30 -07002582 * Max backoff if we encounter pinned tasks. A pretty arbitrary value,
 2583 * but it works so long as it is large enough.
2584 */
2585#define MAX_PINNED_INTERVAL 512
2586
Ingo Molnar48f24c42006-07-03 00:25:40 -07002587static inline unsigned long minus_1_or_zero(unsigned long n)
2588{
2589 return n > 0 ? n - 1 : 0;
2590}
2591
Nick Piggin77391d72005-06-25 14:57:30 -07002592/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2594 * tasks if there is an imbalance.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002596static int load_balance(int this_cpu, struct rq *this_rq,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002597 struct sched_domain *sd, enum cpu_idle_type idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002598 int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002599{
Ingo Molnar48f24c42006-07-03 00:25:40 -07002600 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002601 struct sched_group *group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602 unsigned long imbalance;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002603 struct rq *busiest;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002604 cpumask_t cpus = CPU_MASK_ALL;
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002605 unsigned long flags;
Nick Piggin5969fe02005-09-10 00:26:19 -07002606
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002607 /*
 2608	 * When power savings policy is enabled for the parent domain, an idle
 2609	 * sibling can pick up load irrespective of the busy siblings. In this case,
Ingo Molnardd41f592007-07-09 18:51:59 +02002610 * let the state of idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002611 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002612 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002613 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002614 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002615 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002616
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617 schedstat_inc(sd, lb_cnt[idle]);
2618
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002619redo:
2620 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002621 &cpus, balance);
2622
Chen, Kenneth W06066712006-12-10 02:20:35 -08002623 if (*balance == 0)
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002624 goto out_balanced;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002625
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626 if (!group) {
2627 schedstat_inc(sd, lb_nobusyg[idle]);
2628 goto out_balanced;
2629 }
2630
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002631 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002632 if (!busiest) {
2633 schedstat_inc(sd, lb_nobusyq[idle]);
2634 goto out_balanced;
2635 }
2636
Nick Piggindb935db2005-06-25 14:57:11 -07002637 BUG_ON(busiest == this_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638
2639 schedstat_add(sd, lb_imbalance[idle], imbalance);
2640
2641 nr_moved = 0;
2642 if (busiest->nr_running > 1) {
2643 /*
2644 * Attempt to move tasks. If find_busiest_group has found
2645 * an imbalance but busiest->nr_running <= 1, the group is
2646 * still unbalanced. nr_moved simply stays zero, so it is
2647 * correctly treated as an imbalance.
2648 */
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002649 local_irq_save(flags);
Nick Piggine17224b2005-09-10 00:26:18 -07002650 double_rq_lock(this_rq, busiest);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002651 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002652 minus_1_or_zero(busiest->nr_running),
2653 imbalance, sd, idle, &all_pinned);
Nick Piggine17224b2005-09-10 00:26:18 -07002654 double_rq_unlock(this_rq, busiest);
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002655 local_irq_restore(flags);
Nick Piggin81026792005-06-25 14:57:07 -07002656
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002657 /*
2658 * some other cpu did the load balance for us.
2659 */
2660 if (nr_moved && this_cpu != smp_processor_id())
2661 resched_cpu(this_cpu);
2662
Nick Piggin81026792005-06-25 14:57:07 -07002663 /* All tasks on this runqueue were pinned by CPU affinity */
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002664 if (unlikely(all_pinned)) {
2665 cpu_clear(cpu_of(busiest), cpus);
2666 if (!cpus_empty(cpus))
2667 goto redo;
Nick Piggin81026792005-06-25 14:57:07 -07002668 goto out_balanced;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002669 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002670 }
Nick Piggin81026792005-06-25 14:57:07 -07002671
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672 if (!nr_moved) {
2673 schedstat_inc(sd, lb_failed[idle]);
2674 sd->nr_balance_failed++;
2675
2676 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002678 spin_lock_irqsave(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002679
 2680			/* don't kick the migration_thread if the curr
 2681			 * task on the busiest cpu can't be moved to this_cpu
2682 */
2683 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002684 spin_unlock_irqrestore(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002685 all_pinned = 1;
2686 goto out_one_pinned;
2687 }
2688
Linus Torvalds1da177e2005-04-16 15:20:36 -07002689 if (!busiest->active_balance) {
2690 busiest->active_balance = 1;
2691 busiest->push_cpu = this_cpu;
Nick Piggin81026792005-06-25 14:57:07 -07002692 active_balance = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002693 }
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002694 spin_unlock_irqrestore(&busiest->lock, flags);
Nick Piggin81026792005-06-25 14:57:07 -07002695 if (active_balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002696 wake_up_process(busiest->migration_thread);
2697
2698 /*
2699 * We've kicked active balancing, reset the failure
2700 * counter.
2701 */
Nick Piggin39507452005-06-25 14:57:09 -07002702 sd->nr_balance_failed = sd->cache_nice_tries+1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002703 }
Nick Piggin81026792005-06-25 14:57:07 -07002704 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07002705 sd->nr_balance_failed = 0;
2706
Nick Piggin81026792005-06-25 14:57:07 -07002707 if (likely(!active_balance)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002708 /* We were unbalanced, so reset the balancing interval */
2709 sd->balance_interval = sd->min_interval;
Nick Piggin81026792005-06-25 14:57:07 -07002710 } else {
2711 /*
2712 * If we've begun active balancing, start to back off. This
2713 * case may not be covered by the all_pinned logic if there
2714 * is only 1 task on the busy runqueue (because we don't call
2715 * move_tasks).
2716 */
2717 if (sd->balance_interval < sd->max_interval)
2718 sd->balance_interval *= 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002719 }
2720
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002721 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002723 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002724 return nr_moved;
2725
2726out_balanced:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002727 schedstat_inc(sd, lb_balanced[idle]);
2728
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002729 sd->nr_balance_failed = 0;
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002730
2731out_one_pinned:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002732 /* tune up the balancing interval */
Nick Piggin77391d72005-06-25 14:57:30 -07002733 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2734 (sd->balance_interval < sd->max_interval))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002735 sd->balance_interval *= 2;
2736
Ingo Molnar48f24c42006-07-03 00:25:40 -07002737 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002738 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002739 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002740 return 0;
2741}
2742
2743/*
2744 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2745 * tasks if there is an imbalance.
2746 *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002747 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002748 * this_rq is locked.
2749 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07002750static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07002751load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002752{
2753 struct sched_group *group;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002754 struct rq *busiest = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002755 unsigned long imbalance;
2756 int nr_moved = 0;
Nick Piggin5969fe02005-09-10 00:26:19 -07002757 int sd_idle = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002758 cpumask_t cpus = CPU_MASK_ALL;
Nick Piggin5969fe02005-09-10 00:26:19 -07002759
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002760 /*
2761 * When power savings policy is enabled for the parent domain, idle
2762 * sibling can pick up load irrespective of busy siblings. In this case,
2763 * let the state of idle sibling percolate up as IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002764 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002765 */
2766 if (sd->flags & SD_SHARE_CPUPOWER &&
2767 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002768 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002769
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002770 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002771redo:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002772 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002773 &sd_idle, &cpus, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002774 if (!group) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002775 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002776 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777 }
2778
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002779 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002780 &cpus);
Nick Piggindb935db2005-06-25 14:57:11 -07002781 if (!busiest) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002782 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002783 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002784 }
2785
Nick Piggindb935db2005-06-25 14:57:11 -07002786 BUG_ON(busiest == this_rq);
2787
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002788 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002789
2790 nr_moved = 0;
2791 if (busiest->nr_running > 1) {
2792 /* Attempt to move tasks */
2793 double_lock_balance(this_rq, busiest);
2794 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002795 minus_1_or_zero(busiest->nr_running),
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002796 imbalance, sd, CPU_NEWLY_IDLE, NULL);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002797 spin_unlock(&busiest->lock);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002798
2799 if (!nr_moved) {
2800 cpu_clear(cpu_of(busiest), cpus);
2801 if (!cpus_empty(cpus))
2802 goto redo;
2803 }
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002804 }
2805
Nick Piggin5969fe02005-09-10 00:26:19 -07002806 if (!nr_moved) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002807 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002808 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2809 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002810 return -1;
2811 } else
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002812 sd->nr_balance_failed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814 return nr_moved;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002815
2816out_balanced:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002817 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
Ingo Molnar48f24c42006-07-03 00:25:40 -07002818 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002819 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002820 return -1;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002821 sd->nr_balance_failed = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002822
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002823 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824}
2825
2826/*
2827 * idle_balance is called by schedule() if this_cpu is about to become
2828 * idle. Attempts to pull tasks from other CPUs.
2829 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002830static void idle_balance(int this_cpu, struct rq *this_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002831{
2832 struct sched_domain *sd;
Ingo Molnardd41f592007-07-09 18:51:59 +02002833 int pulled_task = -1;
2834 unsigned long next_balance = jiffies + HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835
2836 for_each_domain(this_cpu, sd) {
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002837 unsigned long interval;
2838
2839 if (!(sd->flags & SD_LOAD_BALANCE))
2840 continue;
2841
2842 if (sd->flags & SD_BALANCE_NEWIDLE)
Ingo Molnar48f24c42006-07-03 00:25:40 -07002843 /* If we've pulled tasks over stop searching: */
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002844 pulled_task = load_balance_newidle(this_cpu,
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002845 this_rq, sd);
2846
2847 interval = msecs_to_jiffies(sd->balance_interval);
2848 if (time_after(next_balance, sd->last_balance + interval))
2849 next_balance = sd->last_balance + interval;
2850 if (pulled_task)
2851 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002852 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002853 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002854 /*
2855 * We are going idle. next_balance may be set based on
2856 * a busy processor. So reset next_balance.
2857 */
2858 this_rq->next_balance = next_balance;
Ingo Molnardd41f592007-07-09 18:51:59 +02002859 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002860}
2861
2862/*
2863 * active_load_balance is run by migration threads. It pushes running tasks
2864 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2865 * running on each physical CPU where possible, and avoids physical /
2866 * logical imbalances.
2867 *
2868 * Called with busiest_rq locked.
2869 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002870static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871{
Nick Piggin39507452005-06-25 14:57:09 -07002872 int target_cpu = busiest_rq->push_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002873 struct sched_domain *sd;
2874 struct rq *target_rq;
Nick Piggin39507452005-06-25 14:57:09 -07002875
Ingo Molnar48f24c42006-07-03 00:25:40 -07002876 /* Is there any task to move? */
Nick Piggin39507452005-06-25 14:57:09 -07002877 if (busiest_rq->nr_running <= 1)
Nick Piggin39507452005-06-25 14:57:09 -07002878 return;
2879
2880 target_rq = cpu_rq(target_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002881
2882 /*
Nick Piggin39507452005-06-25 14:57:09 -07002883	 * This condition is "impossible"; if it occurs
2884 * we need to fix it. Originally reported by
2885 * Bjorn Helgaas on a 128-cpu setup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002886 */
Nick Piggin39507452005-06-25 14:57:09 -07002887 BUG_ON(busiest_rq == target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002888
Nick Piggin39507452005-06-25 14:57:09 -07002889 /* move a task from busiest_rq to target_rq */
2890 double_lock_balance(busiest_rq, target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002891
Nick Piggin39507452005-06-25 14:57:09 -07002892 /* Search for an sd spanning us and the target CPU. */
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002893 for_each_domain(target_cpu, sd) {
Nick Piggin39507452005-06-25 14:57:09 -07002894 if ((sd->flags & SD_LOAD_BALANCE) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07002895 cpu_isset(busiest_cpu, sd->span))
Nick Piggin39507452005-06-25 14:57:09 -07002896 break;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002897 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002898
Ingo Molnar48f24c42006-07-03 00:25:40 -07002899 if (likely(sd)) {
2900 schedstat_inc(sd, alb_cnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002901
Ingo Molnar48f24c42006-07-03 00:25:40 -07002902 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002903 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002904 NULL))
2905 schedstat_inc(sd, alb_pushed);
2906 else
2907 schedstat_inc(sd, alb_failed);
2908 }
Nick Piggin39507452005-06-25 14:57:09 -07002909 spin_unlock(&target_rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002910}
2911
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002912#ifdef CONFIG_NO_HZ
2913static struct {
2914 atomic_t load_balancer;
2915 cpumask_t cpu_mask;
2916} nohz ____cacheline_aligned = {
2917 .load_balancer = ATOMIC_INIT(-1),
2918 .cpu_mask = CPU_MASK_NONE,
2919};
2920
Christoph Lameter7835b982006-12-10 02:20:22 -08002921/*
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002922 * This routine will try to nominate the ilb (idle load balancing)
2923 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2924 * load balancing on behalf of all those cpus. If all the cpus in the system
2925 * go into this tickless mode, then there will be no ilb owner (as there is
2926 * no need for one) and all the cpus will sleep till the next wakeup event
2927 * arrives...
Christoph Lameter7835b982006-12-10 02:20:22 -08002928 *
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002929 * For the ilb owner, the tick is not stopped, and that tick is used
 2930 * for the idle load balancing. The ilb owner still remains part of
 2931 * nohz.cpu_mask.
 2932 *
 2933 * While stopping the tick, this cpu becomes the ilb owner if there is
 2934 * no other owner, and it stays the owner until it becomes busy or until
 2935 * all cpus in the system stop their ticks, at which point no ilb owner
 2936 * is needed.
 2937 *
 2938 * When the ilb owner becomes busy, it nominates another owner during
 2939 * the next busy scheduler_tick().
2940 */
2941int select_nohz_load_balancer(int stop_tick)
2942{
2943 int cpu = smp_processor_id();
2944
2945 if (stop_tick) {
2946 cpu_set(cpu, nohz.cpu_mask);
2947 cpu_rq(cpu)->in_nohz_recently = 1;
2948
2949 /*
2950 * If we are going offline and still the leader, give up!
2951 */
2952 if (cpu_is_offline(cpu) &&
2953 atomic_read(&nohz.load_balancer) == cpu) {
2954 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2955 BUG();
2956 return 0;
2957 }
2958
2959 /* time for ilb owner also to sleep */
2960 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2961 if (atomic_read(&nohz.load_balancer) == cpu)
2962 atomic_set(&nohz.load_balancer, -1);
2963 return 0;
2964 }
2965
2966 if (atomic_read(&nohz.load_balancer) == -1) {
2967 /* make me the ilb owner */
2968 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2969 return 1;
2970 } else if (atomic_read(&nohz.load_balancer) == cpu)
2971 return 1;
2972 } else {
2973 if (!cpu_isset(cpu, nohz.cpu_mask))
2974 return 0;
2975
2976 cpu_clear(cpu, nohz.cpu_mask);
2977
2978 if (atomic_read(&nohz.load_balancer) == cpu)
2979 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2980 BUG();
2981 }
2982 return 0;
2983}
2984#endif
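/*
 * Editor's sketch (not part of sched.c): the ilb nomination above boils
 * down to a lock-free claim/release of a single atomic slot, with -1
 * meaning "no owner".  The names ilb_owner, claim_ilb and release_ilb
 * below are invented for illustration only.
 */
#if 0
static atomic_t ilb_owner = ATOMIC_INIT(-1);

static int claim_ilb(int cpu)
{
	/* succeed if the slot was free, or if we already own it */
	if (atomic_cmpxchg(&ilb_owner, -1, cpu) == -1)
		return 1;
	return atomic_read(&ilb_owner) == cpu;
}

static void release_ilb(int cpu)
{
	/* only the current owner clears the slot */
	atomic_cmpxchg(&ilb_owner, cpu, -1);
}
#endif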
2985
2986static DEFINE_SPINLOCK(balancing);
2987
2988/*
Christoph Lameter7835b982006-12-10 02:20:22 -08002989 * It checks each scheduling domain to see if it is due to be balanced,
2990 * and initiates a balancing operation if so.
2991 *
2992 * Balancing parameters are set up in arch_init_sched_domains.
2993 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002994static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
Christoph Lameter7835b982006-12-10 02:20:22 -08002995{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002996 int balance = 1;
2997 struct rq *rq = cpu_rq(cpu);
Christoph Lameter7835b982006-12-10 02:20:22 -08002998 unsigned long interval;
2999 struct sched_domain *sd;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003000 /* Earliest time when we have to do rebalance again */
Christoph Lameterc9819f42006-12-10 02:20:25 -08003001 unsigned long next_balance = jiffies + 60*HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003002
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003003 for_each_domain(cpu, sd) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003004 if (!(sd->flags & SD_LOAD_BALANCE))
3005 continue;
3006
3007 interval = sd->balance_interval;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02003008 if (idle != CPU_IDLE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003009 interval *= sd->busy_factor;
3010
3011 /* scale ms to jiffies */
3012 interval = msecs_to_jiffies(interval);
3013 if (unlikely(!interval))
3014 interval = 1;
Ingo Molnardd41f592007-07-09 18:51:59 +02003015 if (interval > HZ*NR_CPUS/10)
3016 interval = HZ*NR_CPUS/10;
3017
Linus Torvalds1da177e2005-04-16 15:20:36 -07003018
Christoph Lameter08c183f2006-12-10 02:20:29 -08003019 if (sd->flags & SD_SERIALIZE) {
3020 if (!spin_trylock(&balancing))
3021 goto out;
3022 }
3023
Christoph Lameterc9819f42006-12-10 02:20:25 -08003024 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003025 if (load_balance(cpu, rq, sd, idle, &balance)) {
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07003026 /*
3027 * We've pulled tasks over so either we're no
Nick Piggin5969fe02005-09-10 00:26:19 -07003028 * longer idle, or one of our SMT siblings is
3029 * not idle.
3030 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02003031 idle = CPU_NOT_IDLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003032 }
Christoph Lameter1bd77f22006-12-10 02:20:27 -08003033 sd->last_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034 }
Christoph Lameter08c183f2006-12-10 02:20:29 -08003035 if (sd->flags & SD_SERIALIZE)
3036 spin_unlock(&balancing);
3037out:
Christoph Lameterc9819f42006-12-10 02:20:25 -08003038 if (time_after(next_balance, sd->last_balance + interval))
3039 next_balance = sd->last_balance + interval;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08003040
3041 /*
3042 * Stop the load balance at this level. There is another
3043 * CPU in our sched group which is doing load balancing more
3044 * actively.
3045 */
3046 if (!balance)
3047 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003048 }
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003049 rq->next_balance = next_balance;
3050}
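/*
 * Editor's note, with purely illustrative numbers: if a domain had
 * sd->balance_interval = 64 (ms) and sd->busy_factor = 32, then on a
 * non-idle CPU the effective interval above becomes 64 * 32 = 2048 ms,
 * i.e. msecs_to_jiffies(2048) = 512 jiffies at HZ=250, while an idle CPU
 * keeps the 64 ms interval.  Either value is then clamped to at most
 * HZ*NR_CPUS/10 jiffies, so busy CPUs rebalance much less often than
 * idle ones.
 */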
3051
3052/*
3053 * run_rebalance_domains is triggered when needed from the scheduler tick.
 3054 * In the CONFIG_NO_HZ case, the idle load balance owner does the
 3055 * rebalancing for all the cpus whose scheduler ticks are stopped.
3056 */
3057static void run_rebalance_domains(struct softirq_action *h)
3058{
Ingo Molnardd41f592007-07-09 18:51:59 +02003059 int this_cpu = smp_processor_id();
3060 struct rq *this_rq = cpu_rq(this_cpu);
3061 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3062 CPU_IDLE : CPU_NOT_IDLE;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003063
Ingo Molnardd41f592007-07-09 18:51:59 +02003064 rebalance_domains(this_cpu, idle);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003065
3066#ifdef CONFIG_NO_HZ
3067 /*
3068 * If this cpu is the owner for idle load balancing, then do the
3069 * balancing on behalf of the other idle cpus whose ticks are
3070 * stopped.
3071 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003072 if (this_rq->idle_at_tick &&
3073 atomic_read(&nohz.load_balancer) == this_cpu) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003074 cpumask_t cpus = nohz.cpu_mask;
3075 struct rq *rq;
3076 int balance_cpu;
3077
Ingo Molnardd41f592007-07-09 18:51:59 +02003078 cpu_clear(this_cpu, cpus);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003079 for_each_cpu_mask(balance_cpu, cpus) {
3080 /*
3081 * If this cpu gets work to do, stop the load balancing
3082 * work being done for other cpus. Next load
3083 * balancing owner will pick it up.
3084 */
3085 if (need_resched())
3086 break;
3087
Ingo Molnardd41f592007-07-09 18:51:59 +02003088			rebalance_domains(balance_cpu, CPU_IDLE);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003089
3090 rq = cpu_rq(balance_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003091 if (time_after(this_rq->next_balance, rq->next_balance))
3092 this_rq->next_balance = rq->next_balance;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003093 }
3094 }
3095#endif
3096}
3097
3098/*
3099 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3100 *
3101 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3102 * idle load balancing owner or decide to stop the periodic load balancing,
3103 * if the whole system is idle.
3104 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003105static inline void trigger_load_balance(struct rq *rq, int cpu)
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003106{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003107#ifdef CONFIG_NO_HZ
3108 /*
3109 * If we were in the nohz mode recently and busy at the current
 3110 * scheduler tick, then check if we need to nominate a new idle
3111 * load balancer.
3112 */
3113 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3114 rq->in_nohz_recently = 0;
3115
3116 if (atomic_read(&nohz.load_balancer) == cpu) {
3117 cpu_clear(cpu, nohz.cpu_mask);
3118 atomic_set(&nohz.load_balancer, -1);
3119 }
3120
3121 if (atomic_read(&nohz.load_balancer) == -1) {
3122 /*
3123 * simple selection for now: Nominate the
3124 * first cpu in the nohz list to be the next
3125 * ilb owner.
3126 *
3127 * TBD: Traverse the sched domains and nominate
3128 * the nearest cpu in the nohz.cpu_mask.
3129 */
3130 int ilb = first_cpu(nohz.cpu_mask);
3131
3132 if (ilb != NR_CPUS)
3133 resched_cpu(ilb);
3134 }
3135 }
3136
3137 /*
3138 * If this cpu is idle and doing idle load balancing for all the
3139 * cpus with ticks stopped, is it time for that to stop?
3140 */
3141 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3142 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3143 resched_cpu(cpu);
3144 return;
3145 }
3146
3147 /*
3148 * If this cpu is idle and the idle load balancing is done by
 3149 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3150 */
3151 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3152 cpu_isset(cpu, nohz.cpu_mask))
3153 return;
3154#endif
3155 if (time_after_eq(jiffies, rq->next_balance))
3156 raise_softirq(SCHED_SOFTIRQ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003157}
Ingo Molnardd41f592007-07-09 18:51:59 +02003158
3159#else /* CONFIG_SMP */
3160
Linus Torvalds1da177e2005-04-16 15:20:36 -07003161/*
3162 * on UP we do not need to balance between CPUs:
3163 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07003164static inline void idle_balance(int cpu, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003165{
3166}
Ingo Molnardd41f592007-07-09 18:51:59 +02003167
3168/* Avoid "used but not defined" warning on UP */
3169static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3170 unsigned long max_nr_move, unsigned long max_load_move,
3171 struct sched_domain *sd, enum cpu_idle_type idle,
3172 int *all_pinned, unsigned long *load_moved,
3173 int this_best_prio, int best_prio, int best_prio_seen,
3174 struct rq_iterator *iterator)
3175{
3176 *load_moved = 0;
3177
3178 return 0;
3179}
3180
Linus Torvalds1da177e2005-04-16 15:20:36 -07003181#endif
3182
Linus Torvalds1da177e2005-04-16 15:20:36 -07003183DEFINE_PER_CPU(struct kernel_stat, kstat);
3184
3185EXPORT_PER_CPU_SYMBOL(kstat);
3186
3187/*
Ingo Molnar41b86e92007-07-09 18:51:58 +02003188 * Return p->se.sum_exec_runtime plus any more ns on the sched_clock
3189 * that have not yet been banked in case the task is currently running.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003190 */
Ingo Molnar41b86e92007-07-09 18:51:58 +02003191unsigned long long task_sched_runtime(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003192{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003193 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003194 u64 ns, delta_exec;
3195 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003196
Ingo Molnar41b86e92007-07-09 18:51:58 +02003197 rq = task_rq_lock(p, &flags);
3198 ns = p->se.sum_exec_runtime;
3199 if (rq->curr == p) {
3200 delta_exec = rq_clock(rq) - p->se.exec_start;
3201 if ((s64)delta_exec > 0)
3202 ns += delta_exec;
3203 }
3204 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003205
Linus Torvalds1da177e2005-04-16 15:20:36 -07003206 return ns;
3207}
3208
3209/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003210 * Account user cpu time to a process.
3211 * @p: the process that the cpu time gets accounted to
3213 * @cputime: the cpu time spent in user space since the last update
3214 */
3215void account_user_time(struct task_struct *p, cputime_t cputime)
3216{
3217 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3218 cputime64_t tmp;
3219
3220 p->utime = cputime_add(p->utime, cputime);
3221
3222 /* Add user time to cpustat. */
3223 tmp = cputime_to_cputime64(cputime);
3224 if (TASK_NICE(p) > 0)
3225 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3226 else
3227 cpustat->user = cputime64_add(cpustat->user, tmp);
3228}
3229
3230/*
3231 * Account system cpu time to a process.
3232 * @p: the process that the cpu time gets accounted to
3233 * @hardirq_offset: the offset to subtract from hardirq_count()
3234 * @cputime: the cpu time spent in kernel space since the last update
3235 */
3236void account_system_time(struct task_struct *p, int hardirq_offset,
3237 cputime_t cputime)
3238{
3239 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003240 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003241 cputime64_t tmp;
3242
3243 p->stime = cputime_add(p->stime, cputime);
3244
3245 /* Add system time to cpustat. */
3246 tmp = cputime_to_cputime64(cputime);
3247 if (hardirq_count() - hardirq_offset)
3248 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3249 else if (softirq_count())
3250 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3251 else if (p != rq->idle)
3252 cpustat->system = cputime64_add(cpustat->system, tmp);
3253 else if (atomic_read(&rq->nr_iowait) > 0)
3254 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3255 else
3256 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3257 /* Account for system time used */
3258 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003259}
3260
3261/*
3262 * Account for involuntary wait time.
3263 * @p: the process from which the cpu time has been stolen
3264 * @steal: the cpu time spent in involuntary wait
3265 */
3266void account_steal_time(struct task_struct *p, cputime_t steal)
3267{
3268 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3269 cputime64_t tmp = cputime_to_cputime64(steal);
Ingo Molnar70b97a72006-07-03 00:25:42 -07003270 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003271
3272 if (p == rq->idle) {
3273 p->stime = cputime_add(p->stime, steal);
3274 if (atomic_read(&rq->nr_iowait) > 0)
3275 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3276 else
3277 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3278 } else
3279 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3280}
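/*
 * Editor's sketch (illustrative only): the accounting helpers above are
 * normally driven once per tick from the timer code (cf.
 * update_process_times()).  This is a simplified restatement, not the
 * real caller.
 */
#if 0
static void example_tick_accounting(struct task_struct *p, int user_tick)
{
	cputime_t one_tick = jiffies_to_cputime(1);

	if (user_tick)
		account_user_time(p, one_tick);
	else
		account_system_time(p, HARDIRQ_OFFSET, one_tick);
}
#endif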
3281
Christoph Lameter7835b982006-12-10 02:20:22 -08003282/*
3283 * This function gets called by the timer code, with HZ frequency.
3284 * We call it with interrupts disabled.
3285 *
3286 * It also gets called by the fork code, when changing the parent's
3287 * timeslices.
3288 */
3289void scheduler_tick(void)
3290{
Christoph Lameter7835b982006-12-10 02:20:22 -08003291 int cpu = smp_processor_id();
3292 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003293 struct task_struct *curr = rq->curr;
Christoph Lameter7835b982006-12-10 02:20:22 -08003294
Ingo Molnardd41f592007-07-09 18:51:59 +02003295 spin_lock(&rq->lock);
3296 if (curr != rq->idle) /* FIXME: needed? */
3297 curr->sched_class->task_tick(rq, curr);
3298 update_cpu_load(rq);
3299 spin_unlock(&rq->lock);
3300
Christoph Lametere418e1c2006-12-10 02:20:23 -08003301#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003302 rq->idle_at_tick = idle_cpu(cpu);
3303 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003304#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003305}
3306
Linus Torvalds1da177e2005-04-16 15:20:36 -07003307#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3308
3309void fastcall add_preempt_count(int val)
3310{
3311 /*
3312 * Underflow?
3313 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003314 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3315 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316 preempt_count() += val;
3317 /*
3318 * Spinlock count overflowing soon?
3319 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003320 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3321 PREEMPT_MASK - 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003322}
3323EXPORT_SYMBOL(add_preempt_count);
3324
3325void fastcall sub_preempt_count(int val)
3326{
3327 /*
3328 * Underflow?
3329 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003330 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3331 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003332 /*
3333 * Is the spinlock portion underflowing?
3334 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003335 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3336 !(preempt_count() & PREEMPT_MASK)))
3337 return;
3338
Linus Torvalds1da177e2005-04-16 15:20:36 -07003339 preempt_count() -= val;
3340}
3341EXPORT_SYMBOL(sub_preempt_count);
3342
3343#endif
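/*
 * Editor's sketch (illustrative only): with CONFIG_DEBUG_PREEMPT the
 * preempt_disable()/preempt_enable() pair funnels through the
 * add_preempt_count()/sub_preempt_count() checks above.  A typical
 * critical section looks like this; example_critical_section() is an
 * invented name.
 */
#if 0
static void example_critical_section(void)
{
	preempt_disable();	/* preempt_count() goes up by one */
	/* ... access per-CPU data without fear of migration ... */
	preempt_enable();	/* count drops back; may reschedule */
}
#endif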
3344
3345/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003346 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003347 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003348static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003349{
Ingo Molnardd41f592007-07-09 18:51:59 +02003350 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3351 prev->comm, preempt_count(), prev->pid);
3352 debug_show_held_locks(prev);
3353 if (irqs_disabled())
3354 print_irqtrace_events(prev);
3355 dump_stack();
3356}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003357
Ingo Molnardd41f592007-07-09 18:51:59 +02003358/*
3359 * Various schedule()-time debugging checks and statistics:
3360 */
3361static inline void schedule_debug(struct task_struct *prev)
3362{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003363 /*
3364 * Test if we are atomic. Since do_exit() needs to call into
3365 * schedule() atomically, we ignore that path for now.
3366 * Otherwise, whine if we are scheduling when we should not be.
3367 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003368 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3369 __schedule_bug(prev);
3370
Linus Torvalds1da177e2005-04-16 15:20:36 -07003371 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3372
Ingo Molnardd41f592007-07-09 18:51:59 +02003373 schedstat_inc(this_rq(), sched_cnt);
3374}
3375
3376/*
3377 * Pick up the highest-prio task:
3378 */
3379static inline struct task_struct *
3380pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3381{
3382 struct sched_class *class;
3383 struct task_struct *p;
3384
3385 /*
3386 * Optimization: we know that if all tasks are in
3387 * the fair class we can call that function directly:
3388 */
3389 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3390 p = fair_sched_class.pick_next_task(rq, now);
3391 if (likely(p))
3392 return p;
3393 }
3394
3395 class = sched_class_highest;
3396 for ( ; ; ) {
3397 p = class->pick_next_task(rq, now);
3398 if (p)
3399 return p;
3400 /*
3401 * Will never be NULL as the idle class always
3402 * returns a non-NULL p:
3403 */
3404 class = class->next;
3405 }
3406}
3407
3408/*
3409 * schedule() is the main scheduler function.
3410 */
3411asmlinkage void __sched schedule(void)
3412{
3413 struct task_struct *prev, *next;
3414 long *switch_count;
3415 struct rq *rq;
3416 u64 now;
3417 int cpu;
3418
Linus Torvalds1da177e2005-04-16 15:20:36 -07003419need_resched:
3420 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02003421 cpu = smp_processor_id();
3422 rq = cpu_rq(cpu);
3423 rcu_qsctr_inc(cpu);
3424 prev = rq->curr;
3425 switch_count = &prev->nivcsw;
3426
Linus Torvalds1da177e2005-04-16 15:20:36 -07003427 release_kernel_lock(prev);
3428need_resched_nonpreemptible:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003429
Ingo Molnardd41f592007-07-09 18:51:59 +02003430 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003431
3432 spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003433 clear_tsk_need_resched(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003434
Ingo Molnardd41f592007-07-09 18:51:59 +02003435 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3436 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3437 unlikely(signal_pending(prev)))) {
3438 prev->state = TASK_RUNNING;
3439 } else {
3440 deactivate_task(rq, prev, 1);
3441 }
3442 switch_count = &prev->nvcsw;
3443 }
3444
3445 if (unlikely(!rq->nr_running))
3446 idle_balance(cpu, rq);
3447
3448 now = __rq_clock(rq);
3449 prev->sched_class->put_prev_task(rq, prev, now);
3450 next = pick_next_task(rq, prev, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003451
3452 sched_info_switch(prev, next);
Ingo Molnardd41f592007-07-09 18:51:59 +02003453
Linus Torvalds1da177e2005-04-16 15:20:36 -07003454 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003455 rq->nr_switches++;
3456 rq->curr = next;
3457 ++*switch_count;
3458
Ingo Molnardd41f592007-07-09 18:51:59 +02003459 context_switch(rq, prev, next); /* unlocks the rq */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003460 } else
3461 spin_unlock_irq(&rq->lock);
3462
Ingo Molnardd41f592007-07-09 18:51:59 +02003463 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3464 cpu = smp_processor_id();
3465 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003466 goto need_resched_nonpreemptible;
Ingo Molnardd41f592007-07-09 18:51:59 +02003467 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003468 preempt_enable_no_resched();
3469 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3470 goto need_resched;
3471}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003472EXPORT_SYMBOL(schedule);
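/*
 * Editor's sketch (illustrative only) of the canonical "set task state,
 * then call schedule()" sleep loop that the deactivation logic in
 * schedule() above relies on.  Real code usually reaches this pattern
 * through the wait_event*() helpers; the wait-queue head and condition
 * pointer below are invented for the example.
 */
#if 0
static int example_wait_for(wait_queue_head_t *wq, int *cond)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
		if (*cond)
			break;
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		schedule();	/* deactivated until someone wakes us */
	}
	finish_wait(wq, &wait);
	return ret;
}
#endif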
3473
3474#ifdef CONFIG_PREEMPT
3475/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003476 * this is the entry point to schedule() from in-kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003477 * off of preempt_enable(). Kernel preemptions off of return-from-interrupt
 3478 * are handled by preempt_schedule_irq() below, which calls schedule() directly.
3479 */
3480asmlinkage void __sched preempt_schedule(void)
3481{
3482 struct thread_info *ti = current_thread_info();
3483#ifdef CONFIG_PREEMPT_BKL
3484 struct task_struct *task = current;
3485 int saved_lock_depth;
3486#endif
3487 /*
3488 * If there is a non-zero preempt_count or interrupts are disabled,
3489 * we do not want to preempt the current task. Just return..
3490 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07003491 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003492 return;
3493
3494need_resched:
3495 add_preempt_count(PREEMPT_ACTIVE);
3496 /*
3497 * We keep the big kernel semaphore locked, but we
 3498 * clear ->lock_depth so that schedule() doesn't
3499 * auto-release the semaphore:
3500 */
3501#ifdef CONFIG_PREEMPT_BKL
3502 saved_lock_depth = task->lock_depth;
3503 task->lock_depth = -1;
3504#endif
3505 schedule();
3506#ifdef CONFIG_PREEMPT_BKL
3507 task->lock_depth = saved_lock_depth;
3508#endif
3509 sub_preempt_count(PREEMPT_ACTIVE);
3510
3511 /* we could miss a preemption opportunity between schedule and now */
3512 barrier();
3513 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3514 goto need_resched;
3515}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003516EXPORT_SYMBOL(preempt_schedule);
3517
3518/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003519 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003520 * off of irq context.
 3521 * Note that this is called and returns with irqs disabled. This
 3522 * protects us against recursive calls from irq context.
3523 */
3524asmlinkage void __sched preempt_schedule_irq(void)
3525{
3526 struct thread_info *ti = current_thread_info();
3527#ifdef CONFIG_PREEMPT_BKL
3528 struct task_struct *task = current;
3529 int saved_lock_depth;
3530#endif
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003531 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003532 BUG_ON(ti->preempt_count || !irqs_disabled());
3533
3534need_resched:
3535 add_preempt_count(PREEMPT_ACTIVE);
3536 /*
3537 * We keep the big kernel semaphore locked, but we
 3538 * clear ->lock_depth so that schedule() doesn't
3539 * auto-release the semaphore:
3540 */
3541#ifdef CONFIG_PREEMPT_BKL
3542 saved_lock_depth = task->lock_depth;
3543 task->lock_depth = -1;
3544#endif
3545 local_irq_enable();
3546 schedule();
3547 local_irq_disable();
3548#ifdef CONFIG_PREEMPT_BKL
3549 task->lock_depth = saved_lock_depth;
3550#endif
3551 sub_preempt_count(PREEMPT_ACTIVE);
3552
3553 /* we could miss a preemption opportunity between schedule and now */
3554 barrier();
3555 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3556 goto need_resched;
3557}
3558
3559#endif /* CONFIG_PREEMPT */
3560
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003561int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3562 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003563{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003564 return try_to_wake_up(curr->private, mode, sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003565}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003566EXPORT_SYMBOL(default_wake_function);
3567
3568/*
3569 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3570 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3571 * number) then we wake all the non-exclusive tasks and one exclusive task.
3572 *
3573 * There are circumstances in which we can try to wake a task which has already
3574 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3575 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3576 */
3577static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3578 int nr_exclusive, int sync, void *key)
3579{
3580 struct list_head *tmp, *next;
3581
3582 list_for_each_safe(tmp, next, &q->task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07003583 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3584 unsigned flags = curr->flags;
3585
Linus Torvalds1da177e2005-04-16 15:20:36 -07003586 if (curr->func(curr, mode, sync, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07003587 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003588 break;
3589 }
3590}
3591
3592/**
3593 * __wake_up - wake up threads blocked on a waitqueue.
3594 * @q: the waitqueue
3595 * @mode: which threads
3596 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07003597 * @key: is directly passed to the wakeup function
Linus Torvalds1da177e2005-04-16 15:20:36 -07003598 */
3599void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003600 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601{
3602 unsigned long flags;
3603
3604 spin_lock_irqsave(&q->lock, flags);
3605 __wake_up_common(q, mode, nr_exclusive, 0, key);
3606 spin_unlock_irqrestore(&q->lock, flags);
3607}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003608EXPORT_SYMBOL(__wake_up);
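/*
 * Editor's sketch (illustrative only) of the waker side that ends up in
 * __wake_up() above.  Most callers use the wake_up()/wake_up_interruptible()
 * wrappers, which pass the TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE mode
 * (as complete() does below) and nr_exclusive == 1.  The names here are
 * invented for the example.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_ready;

static void example_waker(void)
{
	example_ready = 1;	/* publish the condition first ... */
	wake_up(&example_wq);	/* ... then wake any sleepers on the queue */
}
#endif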
3609
3610/*
3611 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3612 */
3613void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3614{
3615 __wake_up_common(q, mode, 1, 0, NULL);
3616}
3617
3618/**
Martin Waitz67be2dd2005-05-01 08:59:26 -07003619 * __wake_up_sync - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003620 * @q: the waitqueue
3621 * @mode: which threads
3622 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3623 *
 3624 * The sync wakeup differs in that the waker knows that it will schedule
3625 * away soon, so while the target thread will be woken up, it will not
3626 * be migrated to another CPU - ie. the two threads are 'synchronized'
3627 * with each other. This can prevent needless bouncing between CPUs.
3628 *
3629 * On UP it can prevent extra preemption.
3630 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003631void fastcall
3632__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003633{
3634 unsigned long flags;
3635 int sync = 1;
3636
3637 if (unlikely(!q))
3638 return;
3639
3640 if (unlikely(!nr_exclusive))
3641 sync = 0;
3642
3643 spin_lock_irqsave(&q->lock, flags);
3644 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3645 spin_unlock_irqrestore(&q->lock, flags);
3646}
3647EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3648
3649void fastcall complete(struct completion *x)
3650{
3651 unsigned long flags;
3652
3653 spin_lock_irqsave(&x->wait.lock, flags);
3654 x->done++;
3655 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3656 1, 0, NULL);
3657 spin_unlock_irqrestore(&x->wait.lock, flags);
3658}
3659EXPORT_SYMBOL(complete);
3660
3661void fastcall complete_all(struct completion *x)
3662{
3663 unsigned long flags;
3664
3665 spin_lock_irqsave(&x->wait.lock, flags);
3666 x->done += UINT_MAX/2;
3667 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3668 0, 0, NULL);
3669 spin_unlock_irqrestore(&x->wait.lock, flags);
3670}
3671EXPORT_SYMBOL(complete_all);
3672
3673void fastcall __sched wait_for_completion(struct completion *x)
3674{
3675 might_sleep();
Ingo Molnar48f24c42006-07-03 00:25:40 -07003676
Linus Torvalds1da177e2005-04-16 15:20:36 -07003677 spin_lock_irq(&x->wait.lock);
3678 if (!x->done) {
3679 DECLARE_WAITQUEUE(wait, current);
3680
3681 wait.flags |= WQ_FLAG_EXCLUSIVE;
3682 __add_wait_queue_tail(&x->wait, &wait);
3683 do {
3684 __set_current_state(TASK_UNINTERRUPTIBLE);
3685 spin_unlock_irq(&x->wait.lock);
3686 schedule();
3687 spin_lock_irq(&x->wait.lock);
3688 } while (!x->done);
3689 __remove_wait_queue(&x->wait, &wait);
3690 }
3691 x->done--;
3692 spin_unlock_irq(&x->wait.lock);
3693}
3694EXPORT_SYMBOL(wait_for_completion);
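/*
 * Editor's sketch (illustrative only) of the usual completion split:
 * one side waits, the other side signals, possibly from interrupt
 * context.  example_done, example_signal and example_wait are invented
 * names.
 */
#if 0
static DECLARE_COMPLETION(example_done);

static void example_signal(void)
{
	complete(&example_done);	/* e.g. from an irq handler */
}

static void example_wait(void)
{
	wait_for_completion(&example_done);	/* sleeps TASK_UNINTERRUPTIBLE */
}
#endif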
3695
3696unsigned long fastcall __sched
3697wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3698{
3699 might_sleep();
3700
3701 spin_lock_irq(&x->wait.lock);
3702 if (!x->done) {
3703 DECLARE_WAITQUEUE(wait, current);
3704
3705 wait.flags |= WQ_FLAG_EXCLUSIVE;
3706 __add_wait_queue_tail(&x->wait, &wait);
3707 do {
3708 __set_current_state(TASK_UNINTERRUPTIBLE);
3709 spin_unlock_irq(&x->wait.lock);
3710 timeout = schedule_timeout(timeout);
3711 spin_lock_irq(&x->wait.lock);
3712 if (!timeout) {
3713 __remove_wait_queue(&x->wait, &wait);
3714 goto out;
3715 }
3716 } while (!x->done);
3717 __remove_wait_queue(&x->wait, &wait);
3718 }
3719 x->done--;
3720out:
3721 spin_unlock_irq(&x->wait.lock);
3722 return timeout;
3723}
3724EXPORT_SYMBOL(wait_for_completion_timeout);
3725
3726int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3727{
3728 int ret = 0;
3729
3730 might_sleep();
3731
3732 spin_lock_irq(&x->wait.lock);
3733 if (!x->done) {
3734 DECLARE_WAITQUEUE(wait, current);
3735
3736 wait.flags |= WQ_FLAG_EXCLUSIVE;
3737 __add_wait_queue_tail(&x->wait, &wait);
3738 do {
3739 if (signal_pending(current)) {
3740 ret = -ERESTARTSYS;
3741 __remove_wait_queue(&x->wait, &wait);
3742 goto out;
3743 }
3744 __set_current_state(TASK_INTERRUPTIBLE);
3745 spin_unlock_irq(&x->wait.lock);
3746 schedule();
3747 spin_lock_irq(&x->wait.lock);
3748 } while (!x->done);
3749 __remove_wait_queue(&x->wait, &wait);
3750 }
3751 x->done--;
3752out:
3753 spin_unlock_irq(&x->wait.lock);
3754
3755 return ret;
3756}
3757EXPORT_SYMBOL(wait_for_completion_interruptible);
3758
3759unsigned long fastcall __sched
3760wait_for_completion_interruptible_timeout(struct completion *x,
3761 unsigned long timeout)
3762{
3763 might_sleep();
3764
3765 spin_lock_irq(&x->wait.lock);
3766 if (!x->done) {
3767 DECLARE_WAITQUEUE(wait, current);
3768
3769 wait.flags |= WQ_FLAG_EXCLUSIVE;
3770 __add_wait_queue_tail(&x->wait, &wait);
3771 do {
3772 if (signal_pending(current)) {
3773 timeout = -ERESTARTSYS;
3774 __remove_wait_queue(&x->wait, &wait);
3775 goto out;
3776 }
3777 __set_current_state(TASK_INTERRUPTIBLE);
3778 spin_unlock_irq(&x->wait.lock);
3779 timeout = schedule_timeout(timeout);
3780 spin_lock_irq(&x->wait.lock);
3781 if (!timeout) {
3782 __remove_wait_queue(&x->wait, &wait);
3783 goto out;
3784 }
3785 } while (!x->done);
3786 __remove_wait_queue(&x->wait, &wait);
3787 }
3788 x->done--;
3789out:
3790 spin_unlock_irq(&x->wait.lock);
3791 return timeout;
3792}
3793EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
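/*
 * Editor's sketch (illustrative only) of how a caller might decode the
 * return value of the interruptible/timeout variant above: a negative
 * value (seen as a long) means a signal arrived, zero means the timeout
 * expired, and a positive value is the number of jiffies left.  The
 * function name and the HZ timeout are invented for the example.
 */
#if 0
static int example_wait_with_timeout(struct completion *done)
{
	long left = wait_for_completion_interruptible_timeout(done, HZ);

	if (left < 0)
		return left;		/* -ERESTARTSYS: interrupted */
	if (left == 0)
		return -ETIMEDOUT;	/* timed out */
	return 0;			/* completed with 'left' jiffies to spare */
}
#endif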
3794
3795
3796#define SLEEP_ON_VAR \
3797 unsigned long flags; \
3798 wait_queue_t wait; \
3799 init_waitqueue_entry(&wait, current);
3800
3801#define SLEEP_ON_HEAD \
3802 spin_lock_irqsave(&q->lock,flags); \
3803 __add_wait_queue(q, &wait); \
3804 spin_unlock(&q->lock);
3805
3806#define SLEEP_ON_TAIL \
3807 spin_lock_irq(&q->lock); \
3808 __remove_wait_queue(q, &wait); \
3809 spin_unlock_irqrestore(&q->lock, flags);
3810
3811void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3812{
3813 SLEEP_ON_VAR
3814
3815 current->state = TASK_INTERRUPTIBLE;
3816
3817 SLEEP_ON_HEAD
3818 schedule();
3819 SLEEP_ON_TAIL
3820}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003821EXPORT_SYMBOL(interruptible_sleep_on);
3822
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003823long fastcall __sched
3824interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003825{
3826 SLEEP_ON_VAR
3827
3828 current->state = TASK_INTERRUPTIBLE;
3829
3830 SLEEP_ON_HEAD
3831 timeout = schedule_timeout(timeout);
3832 SLEEP_ON_TAIL
3833
3834 return timeout;
3835}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003836EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3837
3838void fastcall __sched sleep_on(wait_queue_head_t *q)
3839{
3840 SLEEP_ON_VAR
3841
3842 current->state = TASK_UNINTERRUPTIBLE;
3843
3844 SLEEP_ON_HEAD
3845 schedule();
3846 SLEEP_ON_TAIL
3847}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003848EXPORT_SYMBOL(sleep_on);
3849
3850long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3851{
3852 SLEEP_ON_VAR
3853
3854 current->state = TASK_UNINTERRUPTIBLE;
3855
3856 SLEEP_ON_HEAD
3857 timeout = schedule_timeout(timeout);
3858 SLEEP_ON_TAIL
3859
3860 return timeout;
3861}
3862
3863EXPORT_SYMBOL(sleep_on_timeout);
3864
Ingo Molnarb29739f2006-06-27 02:54:51 -07003865#ifdef CONFIG_RT_MUTEXES
3866
3867/*
3868 * rt_mutex_setprio - set the current priority of a task
3869 * @p: task
3870 * @prio: prio value (kernel-internal form)
3871 *
3872 * This function changes the 'effective' priority of a task. It does
3873 * not touch ->normal_prio like __setscheduler().
3874 *
3875 * Used by the rt_mutex code to implement priority inheritance logic.
3876 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003877void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003878{
3879 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003880 int oldprio, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003881 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003882 u64 now;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003883
3884 BUG_ON(prio < 0 || prio > MAX_PRIO);
3885
3886 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003887 now = rq_clock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003888
Andrew Mortond5f9f942007-05-08 20:27:06 -07003889 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02003890 on_rq = p->se.on_rq;
3891 if (on_rq)
3892 dequeue_task(rq, p, 0, now);
3893
3894 if (rt_prio(prio))
3895 p->sched_class = &rt_sched_class;
3896 else
3897 p->sched_class = &fair_sched_class;
3898
Ingo Molnarb29739f2006-06-27 02:54:51 -07003899 p->prio = prio;
3900
Ingo Molnardd41f592007-07-09 18:51:59 +02003901 if (on_rq) {
3902 enqueue_task(rq, p, 0, now);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003903 /*
3904 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07003905 * our priority decreased, or if we are not currently running on
3906 * this runqueue and our priority is higher than the current's
Ingo Molnarb29739f2006-06-27 02:54:51 -07003907 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003908 if (task_running(rq, p)) {
3909 if (p->prio > oldprio)
3910 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02003911 } else {
3912 check_preempt_curr(rq, p);
3913 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07003914 }
3915 task_rq_unlock(rq, &flags);
3916}
3917
3918#endif
3919
Ingo Molnar36c8b582006-07-03 00:25:41 -07003920void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003921{
Ingo Molnardd41f592007-07-09 18:51:59 +02003922 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003923 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003924 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003925 u64 now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003926
3927 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3928 return;
3929 /*
3930 * We have to be careful, if called from sys_setpriority(),
3931 * the task might be in the middle of scheduling on another CPU.
3932 */
3933 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003934 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003935 /*
3936 * The RT priorities are set via sched_setscheduler(), but we still
3937 * allow the 'normal' nice value to be set - but as expected
 3938 * it won't have any effect on scheduling until the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02003939 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003940 */
Ingo Molnare05606d2007-07-09 18:51:59 +02003941 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003942 p->static_prio = NICE_TO_PRIO(nice);
3943 goto out_unlock;
3944 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003945 on_rq = p->se.on_rq;
3946 if (on_rq) {
3947 dequeue_task(rq, p, 0, now);
3948 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -07003949 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003950
Linus Torvalds1da177e2005-04-16 15:20:36 -07003951 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07003952 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003953 old_prio = p->prio;
3954 p->prio = effective_prio(p);
3955 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003956
Ingo Molnardd41f592007-07-09 18:51:59 +02003957 if (on_rq) {
3958 enqueue_task(rq, p, 0, now);
3959 inc_load(rq, p, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003960 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07003961 * If the task increased its priority or is running and
3962 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003963 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003964 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003965 resched_task(rq->curr);
3966 }
3967out_unlock:
3968 task_rq_unlock(rq, &flags);
3969}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003970EXPORT_SYMBOL(set_user_nice);
3971
Matt Mackalle43379f2005-05-01 08:59:00 -07003972/*
3973 * can_nice - check if a task can reduce its nice value
3974 * @p: task
3975 * @nice: nice value
3976 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003977int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07003978{
Matt Mackall024f4742005-08-18 11:24:19 -07003979 /* convert nice value [19,-20] to rlimit style value [1,40] */
3980 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003981
Matt Mackalle43379f2005-05-01 08:59:00 -07003982 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3983 capable(CAP_SYS_NICE));
3984}
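/*
 * Editor's note, worked example with illustrative numbers: asking for
 * nice = -10 gives nice_rlim = 20 - (-10) = 30 above, so the request is
 * allowed only if the RLIMIT_NICE current limit is at least 30 or the
 * caller has CAP_SYS_NICE.  A higher (nicer) target value yields a
 * smaller nice_rlim and is therefore easier to satisfy.
 */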
3985
Linus Torvalds1da177e2005-04-16 15:20:36 -07003986#ifdef __ARCH_WANT_SYS_NICE
3987
3988/*
3989 * sys_nice - change the priority of the current process.
3990 * @increment: priority increment
3991 *
3992 * sys_setpriority is a more generic, but much slower function that
3993 * does similar things.
3994 */
3995asmlinkage long sys_nice(int increment)
3996{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003997 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003998
3999 /*
4000 * Setpriority might change our priority at the same moment.
4001 * We don't have to worry. Conceptually one call occurs first
4002 * and we have a single winner.
4003 */
Matt Mackalle43379f2005-05-01 08:59:00 -07004004 if (increment < -40)
4005 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004006 if (increment > 40)
4007 increment = 40;
4008
4009 nice = PRIO_TO_NICE(current->static_prio) + increment;
4010 if (nice < -20)
4011 nice = -20;
4012 if (nice > 19)
4013 nice = 19;
4014
Matt Mackalle43379f2005-05-01 08:59:00 -07004015 if (increment < 0 && !can_nice(current, nice))
4016 return -EPERM;
4017
Linus Torvalds1da177e2005-04-16 15:20:36 -07004018 retval = security_task_setnice(current, nice);
4019 if (retval)
4020 return retval;
4021
4022 set_user_nice(current, nice);
4023 return 0;
4024}
4025
4026#endif
4027
4028/**
4029 * task_prio - return the priority value of a given task.
4030 * @p: the task in question.
4031 *
4032 * This is the priority value as seen by users in /proc.
4033 * RT tasks are offset by -200. Normal tasks are centered
4034 * around 0, value goes from -16 to +15.
4035 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004036int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004037{
4038 return p->prio - MAX_RT_PRIO;
4039}
4040
4041/**
4042 * task_nice - return the nice value of a given task.
4043 * @p: the task in question.
4044 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004045int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004046{
4047 return TASK_NICE(p);
4048}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004049EXPORT_SYMBOL_GPL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004050
4051/**
4052 * idle_cpu - is a given cpu idle currently?
4053 * @cpu: the processor in question.
4054 */
4055int idle_cpu(int cpu)
4056{
4057 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4058}
4059
Linus Torvalds1da177e2005-04-16 15:20:36 -07004060/**
4061 * idle_task - return the idle task for a given cpu.
4062 * @cpu: the processor in question.
4063 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004064struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004065{
4066 return cpu_rq(cpu)->idle;
4067}
4068
4069/**
4070 * find_process_by_pid - find a process with a matching PID value.
4071 * @pid: the pid in question.
4072 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004073static inline struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004074{
4075 return pid ? find_task_by_pid(pid) : current;
4076}
4077
4078/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02004079static void
4080__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004081{
Ingo Molnardd41f592007-07-09 18:51:59 +02004082 BUG_ON(p->se.on_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004083
Linus Torvalds1da177e2005-04-16 15:20:36 -07004084 p->policy = policy;
Ingo Molnardd41f592007-07-09 18:51:59 +02004085 switch (p->policy) {
4086 case SCHED_NORMAL:
4087 case SCHED_BATCH:
4088 case SCHED_IDLE:
4089 p->sched_class = &fair_sched_class;
4090 break;
4091 case SCHED_FIFO:
4092 case SCHED_RR:
4093 p->sched_class = &rt_sched_class;
4094 break;
4095 }
4096
Linus Torvalds1da177e2005-04-16 15:20:36 -07004097 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004098 p->normal_prio = normal_prio(p);
4099 /* we are holding p->pi_lock already */
4100 p->prio = rt_mutex_getprio(p);
Peter Williams2dd73a42006-06-27 02:54:34 -07004101 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004102}
4103
4104/**
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004105 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004106 * @p: the task in question.
4107 * @policy: new policy.
4108 * @param: structure containing the new RT priority.
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004109 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004110 * NOTE that the task may be already dead.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004111 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004112int sched_setscheduler(struct task_struct *p, int policy,
4113 struct sched_param *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004114{
Ingo Molnardd41f592007-07-09 18:51:59 +02004115 int retval, oldprio, oldpolicy = -1, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004116 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004117 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004118
Steven Rostedt66e53932006-06-27 02:54:44 -07004119 /* may grab non-irq protected spin_locks */
4120 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004121recheck:
4122 /* double check policy once rq lock held */
4123 if (policy < 0)
4124 policy = oldpolicy = p->policy;
4125 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
Ingo Molnardd41f592007-07-09 18:51:59 +02004126 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4127 policy != SCHED_IDLE)
Ingo Molnarb0a94992006-01-14 13:20:41 -08004128 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004129 /*
4130 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004131 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4132 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004133 */
4134 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004135 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04004136 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004137 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02004138 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004139 return -EINVAL;
4140
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004141 /*
4142 * Allow unprivileged RT tasks to decrease priority:
4143 */
4144 if (!capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02004145 if (rt_policy(policy)) {
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004146 unsigned long rlim_rtprio;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004147
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004148 if (!lock_task_sighand(p, &flags))
4149 return -ESRCH;
4150 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4151 unlock_task_sighand(p, &flags);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004152
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004153 /* can't set/change the rt policy */
4154 if (policy != p->policy && !rlim_rtprio)
4155 return -EPERM;
4156
4157 /* can't increase priority */
4158 if (param->sched_priority > p->rt_priority &&
4159 param->sched_priority > rlim_rtprio)
4160 return -EPERM;
4161 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004162 /*
 4163 * Like positive nice levels, don't allow tasks to
4164 * move out of SCHED_IDLE either:
4165 */
4166 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4167 return -EPERM;
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004168
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004169 /* can't change other user's priorities */
4170 if ((current->euid != p->euid) &&
4171 (current->euid != p->uid))
4172 return -EPERM;
4173 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004174
4175 retval = security_task_setscheduler(p, policy, param);
4176 if (retval)
4177 return retval;
4178 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004179 * make sure no PI-waiters arrive (or leave) while we are
4180 * changing the priority of the task:
4181 */
4182 spin_lock_irqsave(&p->pi_lock, flags);
4183 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07004184 * To be able to change p->policy safely, the appropriate
4185 * runqueue lock must be held.
4186 */
Ingo Molnarb29739f2006-06-27 02:54:51 -07004187 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004188 /* recheck policy now with rq lock held */
4189 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4190 policy = oldpolicy = -1;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004191 __task_rq_unlock(rq);
4192 spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004193 goto recheck;
4194 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004195 on_rq = p->se.on_rq;
4196 if (on_rq)
4197 deactivate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004198 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02004199 __setscheduler(rq, p, policy, param->sched_priority);
4200 if (on_rq) {
4201 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004202 /*
4203 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07004204 * our priority decreased, or if we are not currently running on
4205 * this runqueue and our priority is higher than the current's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004206 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004207 if (task_running(rq, p)) {
4208 if (p->prio > oldprio)
4209 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02004210 } else {
4211 check_preempt_curr(rq, p);
4212 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004213 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07004214 __task_rq_unlock(rq);
4215 spin_unlock_irqrestore(&p->pi_lock, flags);
4216
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004217 rt_mutex_adjust_pi(p);
4218
Linus Torvalds1da177e2005-04-16 15:20:36 -07004219 return 0;
4220}
4221EXPORT_SYMBOL_GPL(sched_setscheduler);
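/*
 * Illustrative sketch only (not part of the scheduler): an in-kernel user
 * such as a driver kthread could switch itself to SCHED_FIFO with this
 * exported helper. The priority chosen below is an arbitrary example value.
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	if (sched_setscheduler(current, SCHED_FIFO, &param))
 *		printk(KERN_WARNING "could not switch to SCHED_FIFO\n");
 *
 * Most kernel threads should stay SCHED_NORMAL; real-time policy is only
 * appropriate for genuinely latency-critical work.
 */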
4222
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004223static int
4224do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004225{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004226 struct sched_param lparam;
4227 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004228 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004229
4230 if (!param || pid < 0)
4231 return -EINVAL;
4232 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4233 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004234
4235 rcu_read_lock();
4236 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004237 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004238 if (p != NULL)
4239 retval = sched_setscheduler(p, policy, &lparam);
4240 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004241
Linus Torvalds1da177e2005-04-16 15:20:36 -07004242 return retval;
4243}
4244
4245/**
4246 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4247 * @pid: the pid in question.
4248 * @policy: new policy.
4249 * @param: structure containing the new RT priority.
4250 */
4251asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4252 struct sched_param __user *param)
4253{
Jason Baronc21761f2006-01-18 17:43:03 -08004254 /* negative values for policy are not valid */
4255 if (policy < 0)
4256 return -EINVAL;
4257
Linus Torvalds1da177e2005-04-16 15:20:36 -07004258 return do_sched_setscheduler(pid, policy, param);
4259}
4260
4261/**
4262 * sys_sched_setparam - set/change the RT priority of a thread
4263 * @pid: the pid in question.
4264 * @param: structure containing the new RT priority.
4265 */
4266asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4267{
4268 return do_sched_setscheduler(pid, -1, param);
4269}
4270
4271/**
4272 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4273 * @pid: the pid in question.
4274 */
4275asmlinkage long sys_sched_getscheduler(pid_t pid)
4276{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004277 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004278 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004279
4280 if (pid < 0)
4281 goto out_nounlock;
4282
4283 retval = -ESRCH;
4284 read_lock(&tasklist_lock);
4285 p = find_process_by_pid(pid);
4286 if (p) {
4287 retval = security_task_getscheduler(p);
4288 if (!retval)
4289 retval = p->policy;
4290 }
4291 read_unlock(&tasklist_lock);
4292
4293out_nounlock:
4294 return retval;
4295}
4296
4297/**
 4298 * sys_sched_getparam - get the RT priority of a thread
4299 * @pid: the pid in question.
4300 * @param: structure containing the RT priority.
4301 */
4302asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4303{
4304 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004305 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004306 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004307
4308 if (!param || pid < 0)
4309 goto out_nounlock;
4310
4311 read_lock(&tasklist_lock);
4312 p = find_process_by_pid(pid);
4313 retval = -ESRCH;
4314 if (!p)
4315 goto out_unlock;
4316
4317 retval = security_task_getscheduler(p);
4318 if (retval)
4319 goto out_unlock;
4320
4321 lp.sched_priority = p->rt_priority;
4322 read_unlock(&tasklist_lock);
4323
4324 /*
4325 * This one might sleep, we cannot do it with a spinlock held ...
4326 */
4327 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4328
4329out_nounlock:
4330 return retval;
4331
4332out_unlock:
4333 read_unlock(&tasklist_lock);
4334 return retval;
4335}
4336
4337long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4338{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004339 cpumask_t cpus_allowed;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004340 struct task_struct *p;
4341 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004342
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004343 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004344 read_lock(&tasklist_lock);
4345
4346 p = find_process_by_pid(pid);
4347 if (!p) {
4348 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004349 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004350 return -ESRCH;
4351 }
4352
4353 /*
4354 * It is not safe to call set_cpus_allowed with the
4355 * tasklist_lock held. We will bump the task_struct's
4356 * usage count and then drop tasklist_lock.
4357 */
4358 get_task_struct(p);
4359 read_unlock(&tasklist_lock);
4360
4361 retval = -EPERM;
4362 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4363 !capable(CAP_SYS_NICE))
4364 goto out_unlock;
4365
David Quigleye7834f82006-06-23 02:03:59 -07004366 retval = security_task_setscheduler(p, 0, NULL);
4367 if (retval)
4368 goto out_unlock;
4369
Linus Torvalds1da177e2005-04-16 15:20:36 -07004370 cpus_allowed = cpuset_cpus_allowed(p);
4371 cpus_and(new_mask, new_mask, cpus_allowed);
4372 retval = set_cpus_allowed(p, new_mask);
4373
4374out_unlock:
4375 put_task_struct(p);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004376 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004377 return retval;
4378}
4379
4380static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4381 cpumask_t *new_mask)
4382{
4383 if (len < sizeof(cpumask_t)) {
4384 memset(new_mask, 0, sizeof(cpumask_t));
4385 } else if (len > sizeof(cpumask_t)) {
4386 len = sizeof(cpumask_t);
4387 }
4388 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4389}
4390
4391/**
4392 * sys_sched_setaffinity - set the cpu affinity of a process
4393 * @pid: pid of the process
4394 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4395 * @user_mask_ptr: user-space pointer to the new cpu mask
4396 */
4397asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4398 unsigned long __user *user_mask_ptr)
4399{
4400 cpumask_t new_mask;
4401 int retval;
4402
4403 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4404 if (retval)
4405 return retval;
4406
4407 return sched_setaffinity(pid, new_mask);
4408}
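/*
 * For illustration only: from user space this syscall is normally reached
 * through the C library wrapper, roughly like the sketch below (glibc's
 * cpu_set_t interface, not kernel code).
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(2, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set) == -1)
 *		perror("sched_setaffinity");
 *
 * A pid of 0 means "the calling thread".
 */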
4409
4410/*
 4411 * Represents all CPUs present in the system.
 4412 * In systems capable of hotplug, this map could dynamically grow
 4413 * as new CPUs are detected in the system via any platform-specific
 4414 * method, such as ACPI.
4415 */
4416
Andi Kleen4cef0c62006-01-11 22:44:57 +01004417cpumask_t cpu_present_map __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004418EXPORT_SYMBOL(cpu_present_map);
4419
4420#ifndef CONFIG_SMP
Andi Kleen4cef0c62006-01-11 22:44:57 +01004421cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004422EXPORT_SYMBOL(cpu_online_map);
4423
Andi Kleen4cef0c62006-01-11 22:44:57 +01004424cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004425EXPORT_SYMBOL(cpu_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004426#endif
4427
4428long sched_getaffinity(pid_t pid, cpumask_t *mask)
4429{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004430 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004431 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004432
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004433 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004434 read_lock(&tasklist_lock);
4435
4436 retval = -ESRCH;
4437 p = find_process_by_pid(pid);
4438 if (!p)
4439 goto out_unlock;
4440
David Quigleye7834f82006-06-23 02:03:59 -07004441 retval = security_task_getscheduler(p);
4442 if (retval)
4443 goto out_unlock;
4444
Jack Steiner2f7016d2006-02-01 03:05:18 -08004445 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004446
4447out_unlock:
4448 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004449 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004450 if (retval)
4451 return retval;
4452
4453 return 0;
4454}
4455
4456/**
4457 * sys_sched_getaffinity - get the cpu affinity of a process
4458 * @pid: pid of the process
4459 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4460 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4461 */
4462asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4463 unsigned long __user *user_mask_ptr)
4464{
4465 int ret;
4466 cpumask_t mask;
4467
4468 if (len < sizeof(cpumask_t))
4469 return -EINVAL;
4470
4471 ret = sched_getaffinity(pid, &mask);
4472 if (ret < 0)
4473 return ret;
4474
4475 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4476 return -EFAULT;
4477
4478 return sizeof(cpumask_t);
4479}
4480
4481/**
4482 * sys_sched_yield - yield the current processor to other threads.
4483 *
Ingo Molnardd41f592007-07-09 18:51:59 +02004484 * This function yields the current CPU to other tasks. If there are no
4485 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004486 */
4487asmlinkage long sys_sched_yield(void)
4488{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004489 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004490
4491 schedstat_inc(rq, yld_cnt);
Ingo Molnardd41f592007-07-09 18:51:59 +02004492 if (unlikely(rq->nr_running == 1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004493 schedstat_inc(rq, yld_act_empty);
Ingo Molnardd41f592007-07-09 18:51:59 +02004494 else
4495 current->sched_class->yield_task(rq, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004496
4497 /*
4498 * Since we are going to call schedule() anyway, there's
4499 * no need to preempt or enable interrupts:
4500 */
4501 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004502 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004503 _raw_spin_unlock(&rq->lock);
4504 preempt_enable_no_resched();
4505
4506 schedule();
4507
4508 return 0;
4509}
4510
Andrew Mortone7b38402006-06-30 01:56:00 -07004511static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004512{
Ingo Molnar8e0a43d2006-06-23 02:05:23 -07004513#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4514 __might_sleep(__FILE__, __LINE__);
4515#endif
Ingo Molnar5bbcfd92005-07-07 17:57:04 -07004516 /*
4517 * The BKS might be reacquired before we have dropped
4518 * PREEMPT_ACTIVE, which could trigger a second
4519 * cond_resched() call.
4520 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004521 do {
4522 add_preempt_count(PREEMPT_ACTIVE);
4523 schedule();
4524 sub_preempt_count(PREEMPT_ACTIVE);
4525 } while (need_resched());
4526}
4527
4528int __sched cond_resched(void)
4529{
Ingo Molnar94142322006-12-29 16:48:13 -08004530 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4531 system_state == SYSTEM_RUNNING) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004532 __cond_resched();
4533 return 1;
4534 }
4535 return 0;
4536}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004537EXPORT_SYMBOL(cond_resched);
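/*
 * Typical usage sketch (illustrative only): long loops in process context
 * sprinkle cond_resched() so that, on non-preemptible kernels, other tasks
 * still get a chance to run. process_item() and nr_items are hypothetical.
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched();
 *	}
 */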
4538
4539/*
4540 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4541 * call schedule, and on return reacquire the lock.
4542 *
4543 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4544 * operations here to prevent schedule() from being called twice (once via
4545 * spin_unlock(), once by hand).
4546 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004547int cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004548{
Jan Kara6df3cec2005-06-13 15:52:32 -07004549 int ret = 0;
4550
Linus Torvalds1da177e2005-04-16 15:20:36 -07004551 if (need_lockbreak(lock)) {
4552 spin_unlock(lock);
4553 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07004554 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004555 spin_lock(lock);
4556 }
Ingo Molnar94142322006-12-29 16:48:13 -08004557 if (need_resched() && system_state == SYSTEM_RUNNING) {
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004558 spin_release(&lock->dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004559 _raw_spin_unlock(lock);
4560 preempt_enable_no_resched();
4561 __cond_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07004562 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004563 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004564 }
Jan Kara6df3cec2005-06-13 15:52:32 -07004565 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004566}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004567EXPORT_SYMBOL(cond_resched_lock);
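/*
 * Usage sketch (illustrative only; my_lock and the helpers are made up):
 * a long scan under a spinlock can give up the lock and the CPU at safe
 * points without open-coding the unlock/schedule/lock dance.
 *
 *	spin_lock(&my_lock);
 *	while (more_work(&queue)) {
 *		handle_one(&queue);
 *		cond_resched_lock(&my_lock);
 *	}
 *	spin_unlock(&my_lock);
 *
 * cond_resched_lock() returns 1 if the lock was dropped and retaken, so
 * callers that cache state derived under the lock must revalidate it.
 */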
4568
4569int __sched cond_resched_softirq(void)
4570{
4571 BUG_ON(!in_softirq());
4572
Ingo Molnar94142322006-12-29 16:48:13 -08004573 if (need_resched() && system_state == SYSTEM_RUNNING) {
Thomas Gleixner98d82562007-05-23 13:58:18 -07004574 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004575 __cond_resched();
4576 local_bh_disable();
4577 return 1;
4578 }
4579 return 0;
4580}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004581EXPORT_SYMBOL(cond_resched_softirq);
4582
Linus Torvalds1da177e2005-04-16 15:20:36 -07004583/**
4584 * yield - yield the current processor to other threads.
4585 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004586 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07004587 * thread runnable and calls sys_sched_yield().
4588 */
4589void __sched yield(void)
4590{
4591 set_current_state(TASK_RUNNING);
4592 sys_sched_yield();
4593}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004594EXPORT_SYMBOL(yield);
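/*
 * Illustrative sketch only: a crude polling loop could yield between
 * checks (resource_ready() is a hypothetical predicate). In practice a
 * wait queue or completion is almost always the better design, since
 * yield() gives no guarantee about when this task runs again.
 *
 *	while (!resource_ready())
 *		yield();
 */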
4595
4596/*
4597 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4598 * that process accounting knows that this is a task in IO wait state.
4599 *
4600 * But don't do that if it is a deliberate, throttling IO wait (this task
4601 * has set its backing_dev_info: the queue against which it should throttle)
4602 */
4603void __sched io_schedule(void)
4604{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004605 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004606
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004607 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004608 atomic_inc(&rq->nr_iowait);
4609 schedule();
4610 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004611 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004612}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004613EXPORT_SYMBOL(io_schedule);
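/*
 * Usage sketch (illustrative only; io_wq and io_done are hypothetical):
 * callers block exactly as with schedule(), but the sleep is accounted
 * as iowait. The usual prepare_to_wait()/finish_wait() pattern applies.
 *
 *	DEFINE_WAIT(wait);
 *
 *	prepare_to_wait(&io_wq, &wait, TASK_UNINTERRUPTIBLE);
 *	if (!io_done)
 *		io_schedule();
 *	finish_wait(&io_wq, &wait);
 */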
4614
4615long __sched io_schedule_timeout(long timeout)
4616{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004617 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004618 long ret;
4619
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004620 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004621 atomic_inc(&rq->nr_iowait);
4622 ret = schedule_timeout(timeout);
4623 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004624 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004625 return ret;
4626}
4627
4628/**
4629 * sys_sched_get_priority_max - return maximum RT priority.
4630 * @policy: scheduling class.
4631 *
4632 * this syscall returns the maximum rt_priority that can be used
4633 * by a given scheduling class.
4634 */
4635asmlinkage long sys_sched_get_priority_max(int policy)
4636{
4637 int ret = -EINVAL;
4638
4639 switch (policy) {
4640 case SCHED_FIFO:
4641 case SCHED_RR:
4642 ret = MAX_USER_RT_PRIO-1;
4643 break;
4644 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004645 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004646 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004647 ret = 0;
4648 break;
4649 }
4650 return ret;
4651}
4652
4653/**
4654 * sys_sched_get_priority_min - return minimum RT priority.
4655 * @policy: scheduling class.
4656 *
4657 * this syscall returns the minimum rt_priority that can be used
4658 * by a given scheduling class.
4659 */
4660asmlinkage long sys_sched_get_priority_min(int policy)
4661{
4662 int ret = -EINVAL;
4663
4664 switch (policy) {
4665 case SCHED_FIFO:
4666 case SCHED_RR:
4667 ret = 1;
4668 break;
4669 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004670 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004671 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004672 ret = 0;
4673 }
4674 return ret;
4675}
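/*
 * Worked example (values assume the default MAX_USER_RT_PRIO of 100):
 * sched_get_priority_max() reports 99 for SCHED_FIFO and SCHED_RR and
 * sched_get_priority_min() reports 1, while both report 0 for
 * SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE, since those policies do not
 * use rt_priority at all.
 */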
4676
4677/**
4678 * sys_sched_rr_get_interval - return the default timeslice of a process.
4679 * @pid: pid of the process.
4680 * @interval: userspace pointer to the timeslice value.
4681 *
4682 * this syscall writes the default timeslice value of a given process
4683 * into the user-space timespec buffer. A value of '0' means infinity.
4684 */
4685asmlinkage
4686long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4687{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004688 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004689 int retval = -EINVAL;
4690 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004691
4692 if (pid < 0)
4693 goto out_nounlock;
4694
4695 retval = -ESRCH;
4696 read_lock(&tasklist_lock);
4697 p = find_process_by_pid(pid);
4698 if (!p)
4699 goto out_unlock;
4700
4701 retval = security_task_getscheduler(p);
4702 if (retval)
4703 goto out_unlock;
4704
Peter Williamsb78709c2006-06-26 16:58:00 +10004705 jiffies_to_timespec(p->policy == SCHED_FIFO ?
Ingo Molnardd41f592007-07-09 18:51:59 +02004706 0 : static_prio_timeslice(p->static_prio), &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004707 read_unlock(&tasklist_lock);
4708 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4709out_nounlock:
4710 return retval;
4711out_unlock:
4712 read_unlock(&tasklist_lock);
4713 return retval;
4714}
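/*
 * For illustration only, from user space (glibc wrapper, not kernel code):
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("RR timeslice: %ld.%09ld s\n",
 *			(long)ts.tv_sec, ts.tv_nsec);
 *
 * For a SCHED_FIFO task the reported timeslice is 0, meaning infinity.
 */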
4715
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004716static const char stat_nam[] = "RSDTtZX";
Ingo Molnar36c8b582006-07-03 00:25:41 -07004717
4718static void show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004719{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004720 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004721 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004722
Linus Torvalds1da177e2005-04-16 15:20:36 -07004723 state = p->state ? __ffs(p->state) + 1 : 0;
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004724 printk("%-13.13s %c", p->comm,
4725 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07004726#if (BITS_PER_LONG == 32)
4727 if (state == TASK_RUNNING)
4728 printk(" running ");
4729 else
4730 printk(" %08lX ", thread_saved_pc(p));
4731#else
4732 if (state == TASK_RUNNING)
4733 printk(" running task ");
4734 else
4735 printk(" %016lx ", thread_saved_pc(p));
4736#endif
4737#ifdef CONFIG_DEBUG_STACK_USAGE
4738 {
Al Viro10ebffd2005-11-13 16:06:56 -08004739 unsigned long *n = end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004740 while (!*n)
4741 n++;
Al Viro10ebffd2005-11-13 16:06:56 -08004742 free = (unsigned long)n - (unsigned long)end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004743 }
4744#endif
Ingo Molnar35f6f752007-04-06 21:18:06 +02004745 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004746 if (!p->mm)
4747 printk(" (L-TLB)\n");
4748 else
4749 printk(" (NOTLB)\n");
4750
4751 if (state != TASK_RUNNING)
4752 show_stack(p, NULL);
4753}
4754
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004755void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004756{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004757 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004758
4759#if (BITS_PER_LONG == 32)
4760 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004761 " free sibling\n");
4762 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004763#else
4764 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004765 " free sibling\n");
4766 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004767#endif
4768 read_lock(&tasklist_lock);
4769 do_each_thread(g, p) {
4770 /*
 4771 * reset the NMI-timeout, listing all tasks on a slow
 4772 * console might take a lot of time:
4773 */
4774 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07004775 if (!state_filter || (p->state & state_filter))
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004776 show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004777 } while_each_thread(g, p);
4778
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07004779 touch_all_softlockup_watchdogs();
4780
Ingo Molnardd41f592007-07-09 18:51:59 +02004781#ifdef CONFIG_SCHED_DEBUG
4782 sysrq_sched_debug_show();
4783#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004784 read_unlock(&tasklist_lock);
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004785 /*
4786 * Only show locks if all tasks are dumped:
4787 */
4788 if (state_filter == -1)
4789 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004790}
4791
Ingo Molnar1df21052007-07-09 18:51:58 +02004792void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4793{
Ingo Molnardd41f592007-07-09 18:51:59 +02004794 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02004795}
4796
Ingo Molnarf340c0d2005-06-28 16:40:42 +02004797/**
4798 * init_idle - set up an idle thread for a given CPU
4799 * @idle: task in question
4800 * @cpu: cpu the idle task belongs to
4801 *
4802 * NOTE: this function does not set the idle thread's NEED_RESCHED
4803 * flag, to make booting more robust.
4804 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07004805void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004806{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004807 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004808 unsigned long flags;
4809
Ingo Molnardd41f592007-07-09 18:51:59 +02004810 __sched_fork(idle);
4811 idle->se.exec_start = sched_clock();
4812
Ingo Molnarb29739f2006-06-27 02:54:51 -07004813 idle->prio = idle->normal_prio = MAX_PRIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004814 idle->cpus_allowed = cpumask_of_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004815 __set_task_cpu(idle, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004816
4817 spin_lock_irqsave(&rq->lock, flags);
4818 rq->curr = rq->idle = idle;
Nick Piggin4866cde2005-06-25 14:57:23 -07004819#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4820 idle->oncpu = 1;
4821#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004822 spin_unlock_irqrestore(&rq->lock, flags);
4823
4824 /* Set the preempt count _outside_ the spinlocks! */
4825#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
Al Viroa1261f52005-11-13 16:06:55 -08004826 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004827#else
Al Viroa1261f52005-11-13 16:06:55 -08004828 task_thread_info(idle)->preempt_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004829#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02004830 /*
4831 * The idle tasks have their own, simple scheduling class:
4832 */
4833 idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004834}
4835
4836/*
4837 * In a system that switches off the HZ timer nohz_cpu_mask
4838 * indicates which cpus entered this state. This is used
4839 * in the rcu update to wait only for active cpus. For system
4840 * which do not switch off the HZ timer nohz_cpu_mask should
4841 * always be CPU_MASK_NONE.
4842 */
4843cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4844
Ingo Molnardd41f592007-07-09 18:51:59 +02004845/*
4846 * Increase the granularity value when there are more CPUs,
4847 * because with more CPUs the 'effective latency' as visible
4848 * to users decreases. But the relationship is not linear,
4849 * so pick a second-best guess by going with the log2 of the
4850 * number of CPUs.
4851 *
4852 * This idea comes from the SD scheduler of Con Kolivas:
4853 */
4854static inline void sched_init_granularity(void)
4855{
4856 unsigned int factor = 1 + ilog2(num_online_cpus());
4857 const unsigned long gran_limit = 10000000;
4858
4859 sysctl_sched_granularity *= factor;
4860 if (sysctl_sched_granularity > gran_limit)
4861 sysctl_sched_granularity = gran_limit;
4862
4863 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4864 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4865}
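/*
 * Worked example of the scaling above: with 8 CPUs online,
 * ilog2(8) == 3, so factor == 4 and the granularity is multiplied by 4
 * (but never beyond the 10 ms gran_limit); the runtime limit and wakeup
 * granularity are then derived from that scaled value.
 */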
4866
Linus Torvalds1da177e2005-04-16 15:20:36 -07004867#ifdef CONFIG_SMP
4868/*
4869 * This is how migration works:
4870 *
Ingo Molnar70b97a72006-07-03 00:25:42 -07004871 * 1) we queue a struct migration_req structure in the source CPU's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004872 * runqueue and wake up that CPU's migration thread.
 4873 * 2) we wait on the request's completion => thread blocks.
4874 * 3) migration thread wakes up (implicitly it forces the migrated
4875 * thread off the CPU)
4876 * 4) it gets the migration request and checks whether the migrated
4877 * task is still in the wrong runqueue.
4878 * 5) if it's in the wrong runqueue then the migration thread removes
4879 * it and puts it into the right queue.
 4880 * 6) migration thread signals the request's completion.
4881 * 7) we wake up and the migration is done.
4882 */
4883
4884/*
4885 * Change a given task's CPU affinity. Migrate the thread to a
4886 * proper CPU and schedule it away if the CPU it's executing on
4887 * is removed from the allowed bitmask.
4888 *
4889 * NOTE: the caller must have a valid reference to the task, the
4890 * task must not exit() & deallocate itself prematurely. The
4891 * call is not atomic; no spinlocks may be held.
4892 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004893int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004894{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004895 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004896 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004897 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07004898 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004899
4900 rq = task_rq_lock(p, &flags);
4901 if (!cpus_intersects(new_mask, cpu_online_map)) {
4902 ret = -EINVAL;
4903 goto out;
4904 }
4905
4906 p->cpus_allowed = new_mask;
4907 /* Can the task run on the task's current CPU? If so, we're done */
4908 if (cpu_isset(task_cpu(p), new_mask))
4909 goto out;
4910
4911 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4912 /* Need help from migration thread: drop lock and wait. */
4913 task_rq_unlock(rq, &flags);
4914 wake_up_process(rq->migration_thread);
4915 wait_for_completion(&req.done);
4916 tlb_migrate_finish(p->mm);
4917 return 0;
4918 }
4919out:
4920 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004921
Linus Torvalds1da177e2005-04-16 15:20:36 -07004922 return ret;
4923}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004924EXPORT_SYMBOL_GPL(set_cpus_allowed);
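/*
 * Usage sketch (illustrative only): pin a task to CPU 2, falling back to
 * all online CPUs if that fails. The fallback policy here is made up.
 *
 *	if (set_cpus_allowed(p, cpumask_of_cpu(2)))
 *		set_cpus_allowed(p, CPU_MASK_ALL);
 *
 * The caller must hold a reference on 'p' and must not hold any spinlocks,
 * as documented above.
 */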
4925
4926/*
4927 * Move (not current) task off this cpu, onto dest cpu. We're doing
4928 * this because either it can't run here any more (set_cpus_allowed()
4929 * away from this CPU, or CPU going down), or because we're
4930 * attempting to rebalance this task on exec (sched_exec).
4931 *
4932 * So we race with normal scheduler movements, but that's OK, as long
4933 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07004934 *
4935 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004936 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07004937static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004938{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004939 struct rq *rq_dest, *rq_src;
Ingo Molnardd41f592007-07-09 18:51:59 +02004940 int ret = 0, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004941
4942 if (unlikely(cpu_is_offline(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004943 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004944
4945 rq_src = cpu_rq(src_cpu);
4946 rq_dest = cpu_rq(dest_cpu);
4947
4948 double_rq_lock(rq_src, rq_dest);
4949 /* Already moved. */
4950 if (task_cpu(p) != src_cpu)
4951 goto out;
4952 /* Affinity changed (again). */
4953 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4954 goto out;
4955
Ingo Molnardd41f592007-07-09 18:51:59 +02004956 on_rq = p->se.on_rq;
4957 if (on_rq)
4958 deactivate_task(rq_src, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004959 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004960 if (on_rq) {
4961 activate_task(rq_dest, p, 0);
4962 check_preempt_curr(rq_dest, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004963 }
Kirill Korotaevefc30812006-06-27 02:54:32 -07004964 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004965out:
4966 double_rq_unlock(rq_src, rq_dest);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004967 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004968}
4969
4970/*
4971 * migration_thread - this is a highprio system thread that performs
4972 * thread migration by bumping thread off CPU then 'pushing' onto
4973 * another runqueue.
4974 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004975static int migration_thread(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004976{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977 int cpu = (long)data;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004978 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979
4980 rq = cpu_rq(cpu);
4981 BUG_ON(rq->migration_thread != current);
4982
4983 set_current_state(TASK_INTERRUPTIBLE);
4984 while (!kthread_should_stop()) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07004985 struct migration_req *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004986 struct list_head *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004987
Christoph Lameter3e1d1d22005-06-24 23:13:50 -07004988 try_to_freeze();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004989
4990 spin_lock_irq(&rq->lock);
4991
4992 if (cpu_is_offline(cpu)) {
4993 spin_unlock_irq(&rq->lock);
4994 goto wait_to_die;
4995 }
4996
4997 if (rq->active_balance) {
4998 active_load_balance(rq, cpu);
4999 rq->active_balance = 0;
5000 }
5001
5002 head = &rq->migration_queue;
5003
5004 if (list_empty(head)) {
5005 spin_unlock_irq(&rq->lock);
5006 schedule();
5007 set_current_state(TASK_INTERRUPTIBLE);
5008 continue;
5009 }
Ingo Molnar70b97a72006-07-03 00:25:42 -07005010 req = list_entry(head->next, struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005011 list_del_init(head->next);
5012
Nick Piggin674311d2005-06-25 14:57:27 -07005013 spin_unlock(&rq->lock);
5014 __migrate_task(req->task, cpu, req->dest_cpu);
5015 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005016
5017 complete(&req->done);
5018 }
5019 __set_current_state(TASK_RUNNING);
5020 return 0;
5021
5022wait_to_die:
5023 /* Wait for kthread_stop */
5024 set_current_state(TASK_INTERRUPTIBLE);
5025 while (!kthread_should_stop()) {
5026 schedule();
5027 set_current_state(TASK_INTERRUPTIBLE);
5028 }
5029 __set_current_state(TASK_RUNNING);
5030 return 0;
5031}
5032
5033#ifdef CONFIG_HOTPLUG_CPU
Kirill Korotaev054b9102006-12-10 02:20:11 -08005034/*
 5035 * Figure out where a task on a dead CPU should go; use force if necessary.
5036 * NOTE: interrupts should be disabled by the caller
5037 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005038static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005039{
Kirill Korotaevefc30812006-06-27 02:54:32 -07005040 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005041 cpumask_t mask;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005042 struct rq *rq;
5043 int dest_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005044
Kirill Korotaevefc30812006-06-27 02:54:32 -07005045restart:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046 /* On same node? */
5047 mask = node_to_cpumask(cpu_to_node(dead_cpu));
Ingo Molnar48f24c42006-07-03 00:25:40 -07005048 cpus_and(mask, mask, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005049 dest_cpu = any_online_cpu(mask);
5050
5051 /* On any allowed CPU? */
5052 if (dest_cpu == NR_CPUS)
Ingo Molnar48f24c42006-07-03 00:25:40 -07005053 dest_cpu = any_online_cpu(p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005054
5055 /* No more Mr. Nice Guy. */
5056 if (dest_cpu == NR_CPUS) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07005057 rq = task_rq_lock(p, &flags);
5058 cpus_setall(p->cpus_allowed);
5059 dest_cpu = any_online_cpu(p->cpus_allowed);
Kirill Korotaevefc30812006-06-27 02:54:32 -07005060 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005061
5062 /*
5063 * Don't tell them about moving exiting tasks or
5064 * kernel threads (both mm NULL), since they never
 5065 * leave the kernel.
5066 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005067 if (p->mm && printk_ratelimit())
Linus Torvalds1da177e2005-04-16 15:20:36 -07005068 printk(KERN_INFO "process %d (%s) no "
5069 "longer affine to cpu%d\n",
Ingo Molnar48f24c42006-07-03 00:25:40 -07005070 p->pid, p->comm, dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005071 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07005072 if (!__migrate_task(p, dead_cpu, dest_cpu))
Kirill Korotaevefc30812006-06-27 02:54:32 -07005073 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005074}
5075
5076/*
5077 * While a dead CPU has no uninterruptible tasks queued at this point,
5078 * it might still have a nonzero ->nr_uninterruptible counter, because
 5079 * for performance reasons the counter is not strictly tracking tasks to
5080 * their home CPUs. So we just add the counter to another CPU's counter,
5081 * to keep the global sum constant after CPU-down:
5082 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07005083static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005084{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005085 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005086 unsigned long flags;
5087
5088 local_irq_save(flags);
5089 double_rq_lock(rq_src, rq_dest);
5090 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5091 rq_src->nr_uninterruptible = 0;
5092 double_rq_unlock(rq_src, rq_dest);
5093 local_irq_restore(flags);
5094}
5095
5096/* Run through task list and migrate tasks from the dead cpu. */
5097static void migrate_live_tasks(int src_cpu)
5098{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005099 struct task_struct *p, *t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005100
5101 write_lock_irq(&tasklist_lock);
5102
Ingo Molnar48f24c42006-07-03 00:25:40 -07005103 do_each_thread(t, p) {
5104 if (p == current)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005105 continue;
5106
Ingo Molnar48f24c42006-07-03 00:25:40 -07005107 if (task_cpu(p) == src_cpu)
5108 move_task_off_dead_cpu(src_cpu, p);
5109 } while_each_thread(t, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005110
5111 write_unlock_irq(&tasklist_lock);
5112}
5113
Ingo Molnardd41f592007-07-09 18:51:59 +02005114/*
5115 * Schedules idle task to be the next runnable task on current CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005116 * It does so by boosting its priority to highest possible and adding it to
Ingo Molnar48f24c42006-07-03 00:25:40 -07005117 * the _front_ of the runqueue. Used by CPU offline code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005118 */
5119void sched_idle_next(void)
5120{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005121 int this_cpu = smp_processor_id();
Ingo Molnar70b97a72006-07-03 00:25:42 -07005122 struct rq *rq = cpu_rq(this_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005123 struct task_struct *p = rq->idle;
5124 unsigned long flags;
5125
5126 /* cpu has to be offline */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005127 BUG_ON(cpu_online(this_cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005128
Ingo Molnar48f24c42006-07-03 00:25:40 -07005129 /*
 5130 * Strictly not necessary since the rest of the CPUs are stopped by now
5131 * and interrupts disabled on the current cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005132 */
5133 spin_lock_irqsave(&rq->lock, flags);
5134
Ingo Molnardd41f592007-07-09 18:51:59 +02005135 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005136
5137 /* Add idle task to the _front_ of its priority queue: */
Ingo Molnardd41f592007-07-09 18:51:59 +02005138 activate_idle_task(p, rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005139
5140 spin_unlock_irqrestore(&rq->lock, flags);
5141}
5142
Ingo Molnar48f24c42006-07-03 00:25:40 -07005143/*
5144 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005145 * offline.
5146 */
5147void idle_task_exit(void)
5148{
5149 struct mm_struct *mm = current->active_mm;
5150
5151 BUG_ON(cpu_online(smp_processor_id()));
5152
5153 if (mm != &init_mm)
5154 switch_mm(mm, &init_mm, current);
5155 mmdrop(mm);
5156}
5157
Kirill Korotaev054b9102006-12-10 02:20:11 -08005158/* called under rq->lock with disabled interrupts */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005159static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005160{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005161 struct rq *rq = cpu_rq(dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005162
5163 /* Must be exiting, otherwise would be on tasklist. */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005164 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005165
5166 /* Cannot have done final schedule yet: would have vanished. */
Oleg Nesterovc394cc92006-09-29 02:01:11 -07005167 BUG_ON(p->state == TASK_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005168
Ingo Molnar48f24c42006-07-03 00:25:40 -07005169 get_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005170
5171 /*
5172 * Drop lock around migration; if someone else moves it,
5173 * that's OK. No task can be added to this CPU, so iteration is
5174 * fine.
Kirill Korotaev054b9102006-12-10 02:20:11 -08005175 * NOTE: interrupts should be left disabled --dev@
Linus Torvalds1da177e2005-04-16 15:20:36 -07005176 */
Kirill Korotaev054b9102006-12-10 02:20:11 -08005177 spin_unlock(&rq->lock);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005178 move_task_off_dead_cpu(dead_cpu, p);
Kirill Korotaev054b9102006-12-10 02:20:11 -08005179 spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005180
Ingo Molnar48f24c42006-07-03 00:25:40 -07005181 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005182}
5183
5184/* release_task() removes task from tasklist, so we won't find dead tasks. */
5185static void migrate_dead_tasks(unsigned int dead_cpu)
5186{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005187 struct rq *rq = cpu_rq(dead_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02005188 struct task_struct *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005189
Ingo Molnardd41f592007-07-09 18:51:59 +02005190 for ( ; ; ) {
5191 if (!rq->nr_running)
5192 break;
5193 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5194 if (!next)
5195 break;
5196 migrate_dead(dead_cpu, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005197 }
5198}
5199#endif /* CONFIG_HOTPLUG_CPU */
5200
5201/*
5202 * migration_call - callback that gets triggered when a CPU is added.
5203 * Here we can start up the necessary migration thread for the new CPU.
5204 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005205static int __cpuinit
5206migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005207{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005208 struct task_struct *p;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005209 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005210 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005211 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005212
5213 switch (action) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005214 case CPU_LOCK_ACQUIRE:
5215 mutex_lock(&sched_hotcpu_mutex);
5216 break;
5217
Linus Torvalds1da177e2005-04-16 15:20:36 -07005218 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005219 case CPU_UP_PREPARE_FROZEN:
Ingo Molnardd41f592007-07-09 18:51:59 +02005220 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005221 if (IS_ERR(p))
5222 return NOTIFY_BAD;
5223 p->flags |= PF_NOFREEZE;
5224 kthread_bind(p, cpu);
5225 /* Must be high prio: stop_machine expects to yield to it. */
5226 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005227 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005228 task_rq_unlock(rq, &flags);
5229 cpu_rq(cpu)->migration_thread = p;
5230 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005231
Linus Torvalds1da177e2005-04-16 15:20:36 -07005232 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005233 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005234 /* Strictly unnecessary, as the first user will wake it. */
5235 wake_up_process(cpu_rq(cpu)->migration_thread);
5236 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005237
Linus Torvalds1da177e2005-04-16 15:20:36 -07005238#ifdef CONFIG_HOTPLUG_CPU
5239 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005240 case CPU_UP_CANCELED_FROZEN:
Heiko Carstensfc75cdf2006-06-25 05:49:10 -07005241 if (!cpu_rq(cpu)->migration_thread)
5242 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005243 /* Unbind it from offline cpu so it can run. Fall thru. */
Heiko Carstensa4c4af72005-11-07 00:58:38 -08005244 kthread_bind(cpu_rq(cpu)->migration_thread,
5245 any_online_cpu(cpu_online_map));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005246 kthread_stop(cpu_rq(cpu)->migration_thread);
5247 cpu_rq(cpu)->migration_thread = NULL;
5248 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005249
Linus Torvalds1da177e2005-04-16 15:20:36 -07005250 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005251 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005252 migrate_live_tasks(cpu);
5253 rq = cpu_rq(cpu);
5254 kthread_stop(rq->migration_thread);
5255 rq->migration_thread = NULL;
5256 /* Idle task back to normal (off runqueue, low prio) */
5257 rq = task_rq_lock(rq->idle, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005258 deactivate_task(rq, rq->idle, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005259 rq->idle->static_prio = MAX_PRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02005260 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5261 rq->idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005262 migrate_dead_tasks(cpu);
5263 task_rq_unlock(rq, &flags);
5264 migrate_nr_uninterruptible(rq);
5265 BUG_ON(rq->nr_running != 0);
5266
5267 /* No need to migrate the tasks: it was best-effort if
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005268 * they didn't take sched_hotcpu_mutex. Just wake up
Linus Torvalds1da177e2005-04-16 15:20:36 -07005269 * the requestors. */
5270 spin_lock_irq(&rq->lock);
5271 while (!list_empty(&rq->migration_queue)) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07005272 struct migration_req *req;
5273
Linus Torvalds1da177e2005-04-16 15:20:36 -07005274 req = list_entry(rq->migration_queue.next,
Ingo Molnar70b97a72006-07-03 00:25:42 -07005275 struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005276 list_del_init(&req->list);
5277 complete(&req->done);
5278 }
5279 spin_unlock_irq(&rq->lock);
5280 break;
5281#endif
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005282 case CPU_LOCK_RELEASE:
5283 mutex_unlock(&sched_hotcpu_mutex);
5284 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005285 }
5286 return NOTIFY_OK;
5287}
5288
5289/* Register at highest priority so that task migration (migrate_all_tasks)
5290 * happens before everything else.
5291 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07005292static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005293 .notifier_call = migration_call,
5294 .priority = 10
5295};
5296
5297int __init migration_init(void)
5298{
5299 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07005300 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005301
5302 /* Start one for the boot CPU: */
Akinobu Mita07dccf32006-09-29 02:00:22 -07005303 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5304 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005305 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5306 register_cpu_notifier(&migration_notifier);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005307
Linus Torvalds1da177e2005-04-16 15:20:36 -07005308 return 0;
5309}
5310#endif
5311
5312#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07005313
5314/* Number of possible processor ids */
5315int nr_cpu_ids __read_mostly = NR_CPUS;
5316EXPORT_SYMBOL(nr_cpu_ids);
5317
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005318#undef SCHED_DOMAIN_DEBUG
Linus Torvalds1da177e2005-04-16 15:20:36 -07005319#ifdef SCHED_DOMAIN_DEBUG
5320static void sched_domain_debug(struct sched_domain *sd, int cpu)
5321{
5322 int level = 0;
5323
Nick Piggin41c7ce92005-06-25 14:57:24 -07005324 if (!sd) {
5325 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5326 return;
5327 }
5328
Linus Torvalds1da177e2005-04-16 15:20:36 -07005329 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5330
5331 do {
5332 int i;
5333 char str[NR_CPUS];
5334 struct sched_group *group = sd->groups;
5335 cpumask_t groupmask;
5336
5337 cpumask_scnprintf(str, NR_CPUS, sd->span);
5338 cpus_clear(groupmask);
5339
5340 printk(KERN_DEBUG);
5341 for (i = 0; i < level + 1; i++)
5342 printk(" ");
5343 printk("domain %d: ", level);
5344
5345 if (!(sd->flags & SD_LOAD_BALANCE)) {
5346 printk("does not load-balance\n");
5347 if (sd->parent)
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005348 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5349 " has parent");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005350 break;
5351 }
5352
5353 printk("span %s\n", str);
5354
5355 if (!cpu_isset(cpu, sd->span))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005356 printk(KERN_ERR "ERROR: domain->span does not contain "
5357 "CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005358 if (!cpu_isset(cpu, group->cpumask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005359 printk(KERN_ERR "ERROR: domain->groups does not contain"
5360 " CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005361
5362 printk(KERN_DEBUG);
5363 for (i = 0; i < level + 2; i++)
5364 printk(" ");
5365 printk("groups:");
5366 do {
5367 if (!group) {
5368 printk("\n");
5369 printk(KERN_ERR "ERROR: group is NULL\n");
5370 break;
5371 }
5372
Eric Dumazet5517d862007-05-08 00:32:57 -07005373 if (!group->__cpu_power) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005374 printk("\n");
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005375 printk(KERN_ERR "ERROR: domain->cpu_power not "
5376 "set\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005377 }
5378
5379 if (!cpus_weight(group->cpumask)) {
5380 printk("\n");
5381 printk(KERN_ERR "ERROR: empty group\n");
5382 }
5383
5384 if (cpus_intersects(groupmask, group->cpumask)) {
5385 printk("\n");
5386 printk(KERN_ERR "ERROR: repeated CPUs\n");
5387 }
5388
5389 cpus_or(groupmask, groupmask, group->cpumask);
5390
5391 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5392 printk(" %s", str);
5393
5394 group = group->next;
5395 } while (group != sd->groups);
5396 printk("\n");
5397
5398 if (!cpus_equal(sd->span, groupmask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005399 printk(KERN_ERR "ERROR: groups don't span "
5400 "domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005401
5402 level++;
5403 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005404 if (!sd)
5405 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005406
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005407 if (!cpus_subset(groupmask, sd->span))
5408 printk(KERN_ERR "ERROR: parent span is not a superset "
5409 "of domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005410
5411 } while (sd);
5412}
5413#else
Ingo Molnar48f24c42006-07-03 00:25:40 -07005414# define sched_domain_debug(sd, cpu) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005415#endif
5416
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005417static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005418{
5419 if (cpus_weight(sd->span) == 1)
5420 return 1;
5421
5422 /* Following flags need at least 2 groups */
5423 if (sd->flags & (SD_LOAD_BALANCE |
5424 SD_BALANCE_NEWIDLE |
5425 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005426 SD_BALANCE_EXEC |
5427 SD_SHARE_CPUPOWER |
5428 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005429 if (sd->groups != sd->groups->next)
5430 return 0;
5431 }
5432
5433 /* Following flags don't use groups */
5434 if (sd->flags & (SD_WAKE_IDLE |
5435 SD_WAKE_AFFINE |
5436 SD_WAKE_BALANCE))
5437 return 0;
5438
5439 return 1;
5440}
5441
Ingo Molnar48f24c42006-07-03 00:25:40 -07005442static int
5443sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005444{
5445 unsigned long cflags = sd->flags, pflags = parent->flags;
5446
5447 if (sd_degenerate(parent))
5448 return 1;
5449
5450 if (!cpus_equal(sd->span, parent->span))
5451 return 0;
5452
5453 /* Does parent contain flags not in child? */
5454 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5455 if (cflags & SD_WAKE_AFFINE)
5456 pflags &= ~SD_WAKE_BALANCE;
5457 /* Flags needing groups don't count if only 1 group in parent */
5458 if (parent->groups == parent->groups->next) {
5459 pflags &= ~(SD_LOAD_BALANCE |
5460 SD_BALANCE_NEWIDLE |
5461 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005462 SD_BALANCE_EXEC |
5463 SD_SHARE_CPUPOWER |
5464 SD_SHARE_PKG_RESOURCES);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005465 }
5466 if (~cflags & pflags)
5467 return 0;
5468
5469 return 1;
5470}
5471
Linus Torvalds1da177e2005-04-16 15:20:36 -07005472/*
5473 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5474 * hold the hotplug lock.
5475 */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005476static void cpu_attach_domain(struct sched_domain *sd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005477{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005478 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005479 struct sched_domain *tmp;
5480
5481 /* Remove the sched domains which do not contribute to scheduling. */
5482 for (tmp = sd; tmp; tmp = tmp->parent) {
5483 struct sched_domain *parent = tmp->parent;
5484 if (!parent)
5485 break;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005486 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005487 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005488 if (parent->parent)
5489 parent->parent->child = tmp;
5490 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07005491 }
5492
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005493 if (sd && sd_degenerate(sd)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005494 sd = sd->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005495 if (sd)
5496 sd->child = NULL;
5497 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005498
5499 sched_domain_debug(sd, cpu);
5500
Nick Piggin674311d2005-06-25 14:57:27 -07005501 rcu_assign_pointer(rq->sd, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005502}
5503
5504/* cpus with isolated domains */
Tim Chen67af63a2006-12-22 01:07:50 -08005505static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005506
5507/* Setup the mask of cpus configured for isolated domains */
5508static int __init isolated_cpu_setup(char *str)
5509{
5510 int ints[NR_CPUS], i;
5511
5512 str = get_options(str, ARRAY_SIZE(ints), ints);
5513 cpus_clear(cpu_isolated_map);
5514 for (i = 1; i <= ints[0]; i++)
5515 if (ints[i] < NR_CPUS)
5516 cpu_set(ints[i], cpu_isolated_map);
5517 return 1;
5518}
5519
5520__setup ("isolcpus=", isolated_cpu_setup);
5521
5522/*
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005523 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
 5524 * to a function which identifies what group (along with sched group) a CPU
 5525 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5526 * (due to the fact that we keep track of groups covered with a cpumask_t).
Linus Torvalds1da177e2005-04-16 15:20:36 -07005527 *
5528 * init_sched_build_groups will build a circular linked list of the groups
5529 * covered by the given span, and will set each group's ->cpumask correctly,
5530 * and ->cpu_power to 0.
5531 */
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005532static void
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005533init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5534 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5535 struct sched_group **sg))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005536{
5537 struct sched_group *first = NULL, *last = NULL;
5538 cpumask_t covered = CPU_MASK_NONE;
5539 int i;
5540
5541 for_each_cpu_mask(i, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005542 struct sched_group *sg;
5543 int group = group_fn(i, cpu_map, &sg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005544 int j;
5545
5546 if (cpu_isset(i, covered))
5547 continue;
5548
5549 sg->cpumask = CPU_MASK_NONE;
Eric Dumazet5517d862007-05-08 00:32:57 -07005550 sg->__cpu_power = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005551
5552 for_each_cpu_mask(j, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005553 if (group_fn(j, cpu_map, NULL) != group)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005554 continue;
5555
5556 cpu_set(j, covered);
5557 cpu_set(j, sg->cpumask);
5558 }
5559 if (!first)
5560 first = sg;
5561 if (last)
5562 last->next = sg;
5563 last = sg;
5564 }
5565 last->next = first;
5566}
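/*
 * A minimal sketch of the result (illustrative only: a one-word cpumask and
 * a made-up toy_group_fn() stand in for the real types and group function).
 * It shows the circular list init_sched_build_groups() produces, which
 * callers such as init_numa_sched_groups_power() walk by following ->next
 * until they arrive back at the first group.
 */
#if 0	/* not compiled -- illustration only */
struct toy_group {
	unsigned long cpumask;		/* bit i set => CPU i is in the group */
	struct toy_group *next;
};

/* Toy grouping: pack CPU pairs (0,1), (2,3), ... into one group each. */
static int toy_group_fn(int cpu) { return cpu / 2; }

static void toy_build_groups(struct toy_group *groups, int ncpus)
{
	struct toy_group *first = NULL, *last = NULL;
	unsigned long covered = 0;
	int i, j;

	for (i = 0; i < ncpus; i++) {
		struct toy_group *sg = &groups[toy_group_fn(i)];

		if (covered & (1UL << i))
			continue;

		sg->cpumask = 0;
		for (j = 0; j < ncpus; j++) {
			if (toy_group_fn(j) != toy_group_fn(i))
				continue;
			covered |= 1UL << j;
			sg->cpumask |= 1UL << j;
		}
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	if (last)
		last->next = first;	/* close the ring */
}
#endif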
5567
John Hawkes9c1cfda2005-09-06 15:18:14 -07005568#define SD_NODES_PER_DOMAIN 16
Linus Torvalds1da177e2005-04-16 15:20:36 -07005569
John Hawkes9c1cfda2005-09-06 15:18:14 -07005570#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08005571
John Hawkes9c1cfda2005-09-06 15:18:14 -07005572/**
5573 * find_next_best_node - find the next node to include in a sched_domain
5574 * @node: node whose sched_domain we're building
5575 * @used_nodes: nodes already in the sched_domain
5576 *
5577 * Find the next node to include in a given scheduling domain. Simply
5578 * finds the closest node not already in the @used_nodes map.
5579 *
5580 * Should use nodemask_t.
5581 */
5582static int find_next_best_node(int node, unsigned long *used_nodes)
5583{
5584 int i, n, val, min_val, best_node = 0;
5585
5586 min_val = INT_MAX;
5587
5588 for (i = 0; i < MAX_NUMNODES; i++) {
5589 /* Start at @node */
5590 n = (node + i) % MAX_NUMNODES;
5591
5592 if (!nr_cpus_node(n))
5593 continue;
5594
5595 /* Skip already used nodes */
5596 if (test_bit(n, used_nodes))
5597 continue;
5598
5599 /* Simple min distance search */
5600 val = node_distance(node, n);
5601
5602 if (val < min_val) {
5603 min_val = val;
5604 best_node = n;
5605 }
5606 }
5607
5608 set_bit(best_node, used_nodes);
5609 return best_node;
5610}
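/*
 * Worked example with made-up distances: on a 4-node machine where
 * node_distance(0, {1, 2, 3}) is {20, 40, 30}, successive calls for node 0
 * (with node 0 already marked used by the caller) return node 1, then
 * node 3, then node 2 -- i.e. nodes join the span strictly in order of
 * increasing distance.
 */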
5611
5612/**
5613 * sched_domain_node_span - get a cpumask for a node's sched_domain
5614 * @node: node whose cpumask we're constructing
5615 * @size: number of nodes to include in this span
5616 *
5617 * Given a node, construct a good cpumask for its sched_domain to span. It
5618 * should be one that prevents unnecessary balancing, but also spreads tasks
5619 * out optimally.
5620 */
5621static cpumask_t sched_domain_node_span(int node)
5622{
John Hawkes9c1cfda2005-09-06 15:18:14 -07005623 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005624 cpumask_t span, nodemask;
5625 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005626
5627 cpus_clear(span);
5628 bitmap_zero(used_nodes, MAX_NUMNODES);
5629
5630 nodemask = node_to_cpumask(node);
5631 cpus_or(span, span, nodemask);
5632 set_bit(node, used_nodes);
5633
5634 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5635 int next_node = find_next_best_node(node, used_nodes);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005636
John Hawkes9c1cfda2005-09-06 15:18:14 -07005637 nodemask = node_to_cpumask(next_node);
5638 cpus_or(span, span, nodemask);
5639 }
5640
5641 return span;
5642}
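/*
 * Note that the span is capped: the node's own CPUs plus those of the
 * (SD_NODES_PER_DOMAIN - 1) nearest other nodes. On very large machines a
 * node-level domain therefore does not cover the whole system; the
 * "allnodes" domain set up in build_sched_domains() covers the rest.
 */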
5643#endif
5644
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005645int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005646
John Hawkes9c1cfda2005-09-06 15:18:14 -07005647/*
Ingo Molnar48f24c42006-07-03 00:25:40 -07005648 * SMT sched-domains:
John Hawkes9c1cfda2005-09-06 15:18:14 -07005649 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005650#ifdef CONFIG_SCHED_SMT
5651static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005652static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005653
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005654static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5655 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005656{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005657 if (sg)
5658 *sg = &per_cpu(sched_group_cpus, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005659 return cpu;
5660}
5661#endif
5662
Ingo Molnar48f24c42006-07-03 00:25:40 -07005663/*
5664 * multi-core sched-domains:
5665 */
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005666#ifdef CONFIG_SCHED_MC
5667static DEFINE_PER_CPU(struct sched_domain, core_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005668static DEFINE_PER_CPU(struct sched_group, sched_group_core);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005669#endif
5670
5671#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005672static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5673 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005674{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005675 int group;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005676 cpumask_t mask = cpu_sibling_map[cpu];
5677 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005678 group = first_cpu(mask);
5679 if (sg)
5680 *sg = &per_cpu(sched_group_core, group);
5681 return group;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005682}
5683#elif defined(CONFIG_SCHED_MC)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005684static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5685 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005686{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005687 if (sg)
5688 *sg = &per_cpu(sched_group_core, cpu);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005689 return cpu;
5690}
5691#endif
5692
Linus Torvalds1da177e2005-04-16 15:20:36 -07005693static DEFINE_PER_CPU(struct sched_domain, phys_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005694static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005695
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005696static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5697 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005698{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005699 int group;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005700#ifdef CONFIG_SCHED_MC
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005701 cpumask_t mask = cpu_coregroup_map(cpu);
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005702 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005703 group = first_cpu(mask);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005704#elif defined(CONFIG_SCHED_SMT)
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005705 cpumask_t mask = cpu_sibling_map[cpu];
5706 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005707 group = first_cpu(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005708#else
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005709 group = cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005710#endif
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005711 if (sg)
5712 *sg = &per_cpu(sched_group_phys, group);
5713 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005714}
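/*
 * The cpu_to_*_group() helpers above all follow the same pattern: pick a
 * representative CPU as the group number (the first CPU of the sibling or
 * core set intersected with *cpu_map, or simply the CPU itself at the
 * lowest level), and, if the caller passed a non-NULL sg pointer, also
 * return the per-cpu sched_group that represents that group.
 */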
5715
5716#ifdef CONFIG_NUMA
John Hawkes9c1cfda2005-09-06 15:18:14 -07005717/*
      5718 * init_sched_build_groups() can't handle what we want to do with node
      5719 * groups, so we roll our own here. Each node gets its own list of groups,
      5720 * which is allocated dynamically.
5721 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005722static DEFINE_PER_CPU(struct sched_domain, node_domains);
John Hawkesd1b55132005-09-06 15:18:14 -07005723static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
John Hawkes9c1cfda2005-09-06 15:18:14 -07005724
5725static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005726static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005727
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005728static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5729 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005730{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005731 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5732 int group;
5733
5734 cpus_and(nodemask, nodemask, *cpu_map);
5735 group = first_cpu(nodemask);
5736
5737 if (sg)
5738 *sg = &per_cpu(sched_group_allnodes, group);
5739 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005740}
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005741
Siddha, Suresh B08069032006-03-27 01:15:23 -08005742static void init_numa_sched_groups_power(struct sched_group *group_head)
5743{
5744 struct sched_group *sg = group_head;
5745 int j;
5746
5747 if (!sg)
5748 return;
5749next_sg:
5750 for_each_cpu_mask(j, sg->cpumask) {
5751 struct sched_domain *sd;
5752
5753 sd = &per_cpu(phys_domains, j);
5754 if (j != first_cpu(sd->groups->cpumask)) {
5755 /*
5756 * Only add "power" once for each
5757 * physical package.
5758 */
5759 continue;
5760 }
5761
Eric Dumazet5517d862007-05-08 00:32:57 -07005762 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
Siddha, Suresh B08069032006-03-27 01:15:23 -08005763 }
5764 sg = sg->next;
5765 if (sg != group_head)
5766 goto next_sg;
5767}
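/*
 * The walk above relies on the circular ->next list: starting from
 * group_head it visits every node-level group exactly once, and for each
 * group adds the cpu_power of the physical-domain group of every package
 * it spans, counting each package only once (via the first_cpu() check).
 */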
Linus Torvalds1da177e2005-04-16 15:20:36 -07005768#endif
5769
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005770#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005771/* Free memory allocated for various sched_group structures */
5772static void free_sched_groups(const cpumask_t *cpu_map)
5773{
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005774 int cpu, i;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005775
5776 for_each_cpu_mask(cpu, *cpu_map) {
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005777 struct sched_group **sched_group_nodes
5778 = sched_group_nodes_bycpu[cpu];
5779
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005780 if (!sched_group_nodes)
5781 continue;
5782
5783 for (i = 0; i < MAX_NUMNODES; i++) {
5784 cpumask_t nodemask = node_to_cpumask(i);
5785 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5786
5787 cpus_and(nodemask, nodemask, *cpu_map);
5788 if (cpus_empty(nodemask))
5789 continue;
5790
5791 if (sg == NULL)
5792 continue;
5793 sg = sg->next;
5794next_sg:
5795 oldsg = sg;
5796 sg = sg->next;
5797 kfree(oldsg);
5798 if (oldsg != sched_group_nodes[i])
5799 goto next_sg;
5800 }
5801 kfree(sched_group_nodes);
5802 sched_group_nodes_bycpu[cpu] = NULL;
5803 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005804}
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005805#else
5806static void free_sched_groups(const cpumask_t *cpu_map)
5807{
5808}
5809#endif
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005810
Linus Torvalds1da177e2005-04-16 15:20:36 -07005811/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005812 * Initialize sched groups cpu_power.
5813 *
      5814 * cpu_power indicates the capacity of a sched group, which is used while
      5815 * distributing the load between different sched groups in a sched domain.
      5816 * Typically cpu_power for all the groups in a sched domain will be the same
      5817 * unless there are asymmetries in the topology. If there are asymmetries, the
      5818 * group having more cpu_power will pick up more load compared to the group having
5819 * less cpu_power.
5820 *
5821 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5822 * the maximum number of tasks a group can handle in the presence of other idle
5823 * or lightly loaded groups in the same sched domain.
5824 */
5825static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5826{
5827 struct sched_domain *child;
5828 struct sched_group *group;
5829
5830 WARN_ON(!sd || !sd->groups);
5831
5832 if (cpu != first_cpu(sd->groups->cpumask))
5833 return;
5834
5835 child = sd->child;
5836
Eric Dumazet5517d862007-05-08 00:32:57 -07005837 sd->groups->__cpu_power = 0;
5838
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005839 /*
      5840 * For the performance policy, if the groups in the child domain share
      5841 * resources (for example cores sharing some portions of the cache hierarchy,
      5842 * or SMT), then set this domain's group's cpu_power such that each group
5843 * can handle only one task, when there are other idle groups in the
5844 * same sched domain.
5845 */
5846 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5847 (child->flags &
5848 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
Eric Dumazet5517d862007-05-08 00:32:57 -07005849 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005850 return;
5851 }
5852
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005853 /*
      5854 * Add the cpu_power of each child group to this group's cpu_power.
5855 */
5856 group = child->groups;
5857 do {
Eric Dumazet5517d862007-05-08 00:32:57 -07005858 sg_inc_cpu_power(sd->groups, group->__cpu_power);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005859 group = group->next;
5860 } while (group != child->groups);
5861}
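/*
 * Rough worked example (symbolic; SCHED_LOAD_SCALE is simply treated as
 * "one full CPU's worth" here): with the default performance policy, a
 * domain whose child groups share CPU power -- e.g. two SMT siblings --
 * gets exactly SCHED_LOAD_SCALE for its group, i.e. it is treated as able
 * to run one task at full speed; otherwise the group's power is the sum of
 * the child groups' powers.
 */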
5862
5863/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005864 * Build sched domains for a given set of cpus and attach the sched domains
5865 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07005866 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005867static int build_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005868{
5869 int i;
John Hawkesd1b55132005-09-06 15:18:14 -07005870#ifdef CONFIG_NUMA
5871 struct sched_group **sched_group_nodes = NULL;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005872 int sd_allnodes = 0;
John Hawkesd1b55132005-09-06 15:18:14 -07005873
5874 /*
5875 * Allocate the per-node list of sched groups
5876 */
Ingo Molnardd41f592007-07-09 18:51:59 +02005877 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
Srivatsa Vaddagirid3a5aa92006-06-27 02:54:39 -07005878 GFP_KERNEL);
John Hawkesd1b55132005-09-06 15:18:14 -07005879 if (!sched_group_nodes) {
5880 printk(KERN_WARNING "Can not alloc sched group node list\n");
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005881 return -ENOMEM;
John Hawkesd1b55132005-09-06 15:18:14 -07005882 }
5883 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5884#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005885
5886 /*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005887 * Set up domains for cpus specified by the cpu_map.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005888 */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005889 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005890 struct sched_domain *sd = NULL, *p;
5891 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5892
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005893 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005894
5895#ifdef CONFIG_NUMA
Ingo Molnardd41f592007-07-09 18:51:59 +02005896 if (cpus_weight(*cpu_map) >
5897 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07005898 sd = &per_cpu(allnodes_domains, i);
5899 *sd = SD_ALLNODES_INIT;
5900 sd->span = *cpu_map;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005901 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005902 p = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005903 sd_allnodes = 1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005904 } else
5905 p = NULL;
5906
Linus Torvalds1da177e2005-04-16 15:20:36 -07005907 sd = &per_cpu(node_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005908 *sd = SD_NODE_INIT;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005909 sd->span = sched_domain_node_span(cpu_to_node(i));
5910 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005911 if (p)
5912 p->child = sd;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005913 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005914#endif
5915
5916 p = sd;
5917 sd = &per_cpu(phys_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005918 *sd = SD_CPU_INIT;
5919 sd->span = nodemask;
5920 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005921 if (p)
5922 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005923 cpu_to_phys_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005924
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005925#ifdef CONFIG_SCHED_MC
5926 p = sd;
5927 sd = &per_cpu(core_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005928 *sd = SD_MC_INIT;
5929 sd->span = cpu_coregroup_map(i);
5930 cpus_and(sd->span, sd->span, *cpu_map);
5931 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005932 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005933 cpu_to_core_group(i, cpu_map, &sd->groups);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005934#endif
5935
Linus Torvalds1da177e2005-04-16 15:20:36 -07005936#ifdef CONFIG_SCHED_SMT
5937 p = sd;
5938 sd = &per_cpu(cpu_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005939 *sd = SD_SIBLING_INIT;
5940 sd->span = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005941 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005942 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005943 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005944 cpu_to_cpu_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005945#endif
5946 }
5947
5948#ifdef CONFIG_SCHED_SMT
5949 /* Set up CPU (sibling) groups */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005950 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005951 cpumask_t this_sibling_map = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005952 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005953 if (i != first_cpu(this_sibling_map))
5954 continue;
5955
Ingo Molnardd41f592007-07-09 18:51:59 +02005956 init_sched_build_groups(this_sibling_map, cpu_map,
5957 &cpu_to_cpu_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005958 }
5959#endif
5960
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005961#ifdef CONFIG_SCHED_MC
5962 /* Set up multi-core groups */
5963 for_each_cpu_mask(i, *cpu_map) {
5964 cpumask_t this_core_map = cpu_coregroup_map(i);
5965 cpus_and(this_core_map, this_core_map, *cpu_map);
5966 if (i != first_cpu(this_core_map))
5967 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02005968 init_sched_build_groups(this_core_map, cpu_map,
5969 &cpu_to_core_group);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005970 }
5971#endif
5972
Linus Torvalds1da177e2005-04-16 15:20:36 -07005973 /* Set up physical groups */
5974 for (i = 0; i < MAX_NUMNODES; i++) {
5975 cpumask_t nodemask = node_to_cpumask(i);
5976
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005977 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005978 if (cpus_empty(nodemask))
5979 continue;
5980
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005981 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005982 }
5983
5984#ifdef CONFIG_NUMA
5985 /* Set up node groups */
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005986 if (sd_allnodes)
Ingo Molnardd41f592007-07-09 18:51:59 +02005987 init_sched_build_groups(*cpu_map, cpu_map,
5988 &cpu_to_allnodes_group);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005989
5990 for (i = 0; i < MAX_NUMNODES; i++) {
5991 /* Set up node groups */
5992 struct sched_group *sg, *prev;
5993 cpumask_t nodemask = node_to_cpumask(i);
5994 cpumask_t domainspan;
5995 cpumask_t covered = CPU_MASK_NONE;
5996 int j;
5997
5998 cpus_and(nodemask, nodemask, *cpu_map);
John Hawkesd1b55132005-09-06 15:18:14 -07005999 if (cpus_empty(nodemask)) {
6000 sched_group_nodes[i] = NULL;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006001 continue;
John Hawkesd1b55132005-09-06 15:18:14 -07006002 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006003
6004 domainspan = sched_domain_node_span(i);
6005 cpus_and(domainspan, domainspan, *cpu_map);
6006
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07006007 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006008 if (!sg) {
6009 printk(KERN_WARNING "Can not alloc domain group for "
6010 "node %d\n", i);
6011 goto error;
6012 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006013 sched_group_nodes[i] = sg;
6014 for_each_cpu_mask(j, nodemask) {
6015 struct sched_domain *sd;
6016 sd = &per_cpu(node_domains, j);
6017 sd->groups = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006018 }
Eric Dumazet5517d862007-05-08 00:32:57 -07006019 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006020 sg->cpumask = nodemask;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006021 sg->next = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006022 cpus_or(covered, covered, nodemask);
6023 prev = sg;
6024
6025 for (j = 0; j < MAX_NUMNODES; j++) {
6026 cpumask_t tmp, notcovered;
6027 int n = (i + j) % MAX_NUMNODES;
6028
6029 cpus_complement(notcovered, covered);
6030 cpus_and(tmp, notcovered, *cpu_map);
6031 cpus_and(tmp, tmp, domainspan);
6032 if (cpus_empty(tmp))
6033 break;
6034
6035 nodemask = node_to_cpumask(n);
6036 cpus_and(tmp, tmp, nodemask);
6037 if (cpus_empty(tmp))
6038 continue;
6039
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07006040 sg = kmalloc_node(sizeof(struct sched_group),
6041 GFP_KERNEL, i);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006042 if (!sg) {
6043 printk(KERN_WARNING
6044 "Can not alloc domain group for node %d\n", j);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006045 goto error;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006046 }
Eric Dumazet5517d862007-05-08 00:32:57 -07006047 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006048 sg->cpumask = tmp;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006049 sg->next = prev->next;
John Hawkes9c1cfda2005-09-06 15:18:14 -07006050 cpus_or(covered, covered, tmp);
6051 prev->next = sg;
6052 prev = sg;
6053 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006054 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006055#endif
6056
6057 /* Calculate CPU power for physical packages and nodes */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006058#ifdef CONFIG_SCHED_SMT
6059 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006060 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6061
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006062 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006063 }
6064#endif
6065#ifdef CONFIG_SCHED_MC
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006066 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006067 struct sched_domain *sd = &per_cpu(core_domains, i);
6068
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006069 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006070 }
6071#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006072
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006073 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006074 struct sched_domain *sd = &per_cpu(phys_domains, i);
6075
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006076 init_sched_groups_power(i, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006077 }
6078
John Hawkes9c1cfda2005-09-06 15:18:14 -07006079#ifdef CONFIG_NUMA
Siddha, Suresh B08069032006-03-27 01:15:23 -08006080 for (i = 0; i < MAX_NUMNODES; i++)
6081 init_numa_sched_groups_power(sched_group_nodes[i]);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006082
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006083 if (sd_allnodes) {
6084 struct sched_group *sg;
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006085
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006086 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006087 init_numa_sched_groups_power(sg);
6088 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006089#endif
6090
Linus Torvalds1da177e2005-04-16 15:20:36 -07006091 /* Attach the domains */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006092 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006093 struct sched_domain *sd;
6094#ifdef CONFIG_SCHED_SMT
6095 sd = &per_cpu(cpu_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006096#elif defined(CONFIG_SCHED_MC)
6097 sd = &per_cpu(core_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006098#else
6099 sd = &per_cpu(phys_domains, i);
6100#endif
6101 cpu_attach_domain(sd, i);
6102 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006103
6104 return 0;
6105
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006106#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006107error:
6108 free_sched_groups(cpu_map);
6109 return -ENOMEM;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006110#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006111}
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006112/*
6113 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6114 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006115static int arch_init_sched_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006116{
6117 cpumask_t cpu_default_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006118 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006119
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006120 /*
      6121 * Set up the mask of cpus without special case scheduling requirements.
6122 * For now this just excludes isolated cpus, but could be used to
6123 * exclude other special cases in the future.
6124 */
6125 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6126
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006127 err = build_sched_domains(&cpu_default_map);
6128
6129 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006130}
6131
6132static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006133{
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006134 free_sched_groups(cpu_map);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006135}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006136
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006137/*
      6138 * Detach sched domains from a group of cpus specified in cpu_map.
      6139 * These cpus will now be attached to the NULL domain.
6140 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08006141static void detach_destroy_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006142{
6143 int i;
6144
6145 for_each_cpu_mask(i, *cpu_map)
6146 cpu_attach_domain(NULL, i);
6147 synchronize_sched();
6148 arch_destroy_sched_domains(cpu_map);
6149}
6150
6151/*
6152 * Partition sched domains as specified by the cpumasks below.
6153 * This attaches all cpus from the cpumasks to the NULL domain,
      6154 * waits for an RCU quiescent period, recalculates the sched
      6155 * domain information and then attaches them back to the
      6156 * correct sched domains.
      6157 * Call with the hotplug lock held.
6158 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006159int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006160{
6161 cpumask_t change_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006162 int err = 0;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006163
6164 cpus_and(*partition1, *partition1, cpu_online_map);
6165 cpus_and(*partition2, *partition2, cpu_online_map);
6166 cpus_or(change_map, *partition1, *partition2);
6167
6168 /* Detach sched domains from all of the affected cpus */
6169 detach_destroy_domains(&change_map);
6170 if (!cpus_empty(*partition1))
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006171 err = build_sched_domains(partition1);
6172 if (!err && !cpus_empty(*partition2))
6173 err = build_sched_domains(partition2);
6174
6175 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006176}
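/*
 * Hypothetical caller sketch (illustrative only; example_split_domains()
 * does not exist in the tree): a subsystem that wants two independently
 * balanced CPU sets, say CPUs 0-1 and CPUs 2-3, could rebuild the
 * hierarchy like this.
 */
#if 0	/* not compiled -- illustration only */
static void example_split_domains(void)
{
	cpumask_t part1 = CPU_MASK_NONE, part2 = CPU_MASK_NONE;

	cpu_set(0, part1);
	cpu_set(1, part1);
	cpu_set(2, part2);
	cpu_set(3, part2);

	/* Caller must hold the hotplug lock. */
	partition_sched_domains(&part1, &part2);
}
#endif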
6177
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006178#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6179int arch_reinit_sched_domains(void)
6180{
6181 int err;
6182
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006183 mutex_lock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006184 detach_destroy_domains(&cpu_online_map);
6185 err = arch_init_sched_domains(&cpu_online_map);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006186 mutex_unlock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006187
6188 return err;
6189}
6190
6191static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6192{
6193 int ret;
6194
6195 if (buf[0] != '0' && buf[0] != '1')
6196 return -EINVAL;
6197
6198 if (smt)
6199 sched_smt_power_savings = (buf[0] == '1');
6200 else
6201 sched_mc_power_savings = (buf[0] == '1');
6202
6203 ret = arch_reinit_sched_domains();
6204
6205 return ret ? ret : count;
6206}
6207
6208int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6209{
6210 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006211
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006212#ifdef CONFIG_SCHED_SMT
6213 if (smt_capable())
6214 err = sysfs_create_file(&cls->kset.kobj,
6215 &attr_sched_smt_power_savings.attr);
6216#endif
6217#ifdef CONFIG_SCHED_MC
6218 if (!err && mc_capable())
6219 err = sysfs_create_file(&cls->kset.kobj,
6220 &attr_sched_mc_power_savings.attr);
6221#endif
6222 return err;
6223}
6224#endif
6225
6226#ifdef CONFIG_SCHED_MC
6227static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6228{
6229 return sprintf(page, "%u\n", sched_mc_power_savings);
6230}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006231static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6232 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006233{
6234 return sched_power_savings_store(buf, count, 0);
6235}
6236SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6237 sched_mc_power_savings_store);
6238#endif
6239
6240#ifdef CONFIG_SCHED_SMT
6241static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6242{
6243 return sprintf(page, "%u\n", sched_smt_power_savings);
6244}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006245static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6246 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006247{
6248 return sched_power_savings_store(buf, count, 1);
6249}
6250SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6251 sched_smt_power_savings_store);
6252#endif
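/*
 * Userspace interface (illustrative; the exact sysfs path depends on how
 * the architecture registers the cpu sysdev class, but it is typically
 * /sys/devices/system/cpu/sched_mc_power_savings and
 * /sys/devices/system/cpu/sched_smt_power_savings): writing '0' or '1' to
 * either file ends up in sched_power_savings_store(), which flips the
 * corresponding policy flag and calls arch_reinit_sched_domains() to tear
 * down and rebuild the whole domain hierarchy.
 */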
6253
Linus Torvalds1da177e2005-04-16 15:20:36 -07006254/*
6255 * Force a reinitialization of the sched domains hierarchy. The domains
6256 * and groups cannot be updated in place without racing with the balancing
Nick Piggin41c7ce92005-06-25 14:57:24 -07006257 * code, so we temporarily attach all running cpus to the NULL domain
Linus Torvalds1da177e2005-04-16 15:20:36 -07006258 * which will prevent rebalancing while the sched domains are recalculated.
6259 */
6260static int update_sched_domains(struct notifier_block *nfb,
6261 unsigned long action, void *hcpu)
6262{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006263 switch (action) {
6264 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006265 case CPU_UP_PREPARE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006266 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006267 case CPU_DOWN_PREPARE_FROZEN:
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006268 detach_destroy_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006269 return NOTIFY_OK;
6270
6271 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006272 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006273 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006274 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006275 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006276 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006277 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006278 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006279 /*
6280 * Fall through and re-initialise the domains.
6281 */
6282 break;
6283 default:
6284 return NOTIFY_DONE;
6285 }
6286
6287 /* The hotplug lock is already held by cpu_up/cpu_down */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006288 arch_init_sched_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006289
6290 return NOTIFY_OK;
6291}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006292
6293void __init sched_init_smp(void)
6294{
Nick Piggin5c1e1762006-10-03 01:14:04 -07006295 cpumask_t non_isolated_cpus;
6296
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006297 mutex_lock(&sched_hotcpu_mutex);
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006298 arch_init_sched_domains(&cpu_online_map);
Nathan Lynche5e56732007-01-10 23:15:28 -08006299 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006300 if (cpus_empty(non_isolated_cpus))
6301 cpu_set(smp_processor_id(), non_isolated_cpus);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006302 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006303 /* XXX: Theoretical race here - CPU may be hotplugged now */
6304 hotcpu_notifier(update_sched_domains, 0);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006305
6306 /* Move init over to a non-isolated CPU */
6307 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6308 BUG();
Ingo Molnardd41f592007-07-09 18:51:59 +02006309 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006310}
6311#else
6312void __init sched_init_smp(void)
6313{
Ingo Molnardd41f592007-07-09 18:51:59 +02006314 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006315}
6316#endif /* CONFIG_SMP */
6317
6318int in_sched_functions(unsigned long addr)
6319{
6320 /* Linker adds these: start and end of __sched functions */
6321 extern char __sched_text_start[], __sched_text_end[];
Ingo Molnar48f24c42006-07-03 00:25:40 -07006322
Linus Torvalds1da177e2005-04-16 15:20:36 -07006323 return in_lock_functions(addr) ||
6324 (addr >= (unsigned long)__sched_text_start
6325 && addr < (unsigned long)__sched_text_end);
6326}
6327
Ingo Molnardd41f592007-07-09 18:51:59 +02006328static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6329{
6330 cfs_rq->tasks_timeline = RB_ROOT;
6331 cfs_rq->fair_clock = 1;
6332#ifdef CONFIG_FAIR_GROUP_SCHED
6333 cfs_rq->rq = rq;
6334#endif
6335}
6336
Linus Torvalds1da177e2005-04-16 15:20:36 -07006337void __init sched_init(void)
6338{
Ingo Molnardd41f592007-07-09 18:51:59 +02006339 u64 now = sched_clock();
Christoph Lameter476f3532007-05-06 14:48:58 -07006340 int highest_cpu = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006341 int i, j;
6342
6343 /*
6344 * Link up the scheduling class hierarchy:
6345 */
6346 rt_sched_class.next = &fair_sched_class;
6347 fair_sched_class.next = &idle_sched_class;
6348 idle_sched_class.next = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006349
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08006350 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006351 struct rt_prio_array *array;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006352 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006353
6354 rq = cpu_rq(i);
6355 spin_lock_init(&rq->lock);
Ingo Molnarfcb99372006-07-03 00:25:10 -07006356 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
Nick Piggin78979862005-06-25 14:57:13 -07006357 rq->nr_running = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006358 rq->clock = 1;
6359 init_cfs_rq(&rq->cfs, rq);
6360#ifdef CONFIG_FAIR_GROUP_SCHED
6361 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6362 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6363#endif
6364 rq->ls.load_update_last = now;
6365 rq->ls.load_update_start = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006366
Ingo Molnardd41f592007-07-09 18:51:59 +02006367 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6368 rq->cpu_load[j] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006369#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07006370 rq->sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006371 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006372 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006373 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07006374 rq->cpu = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006375 rq->migration_thread = NULL;
6376 INIT_LIST_HEAD(&rq->migration_queue);
6377#endif
6378 atomic_set(&rq->nr_iowait, 0);
6379
Ingo Molnardd41f592007-07-09 18:51:59 +02006380 array = &rq->rt.active;
6381 for (j = 0; j < MAX_RT_PRIO; j++) {
6382 INIT_LIST_HEAD(array->queue + j);
6383 __clear_bit(j, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006384 }
Christoph Lameter476f3532007-05-06 14:48:58 -07006385 highest_cpu = i;
Ingo Molnardd41f592007-07-09 18:51:59 +02006386 /* delimiter for bitsearch: */
6387 __set_bit(MAX_RT_PRIO, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006388 }
6389
Peter Williams2dd73a42006-06-27 02:54:34 -07006390 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006391
Christoph Lameterc9819f42006-12-10 02:20:25 -08006392#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006393 nr_cpu_ids = highest_cpu + 1;
Christoph Lameterc9819f42006-12-10 02:20:25 -08006394 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6395#endif
6396
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006397#ifdef CONFIG_RT_MUTEXES
6398 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6399#endif
6400
Linus Torvalds1da177e2005-04-16 15:20:36 -07006401 /*
6402 * The boot idle thread does lazy MMU switching as well:
6403 */
6404 atomic_inc(&init_mm.mm_count);
6405 enter_lazy_tlb(&init_mm, current);
6406
6407 /*
6408 * Make us the idle thread. Technically, schedule() should not be
      6409 * called from this thread; however, somewhere below it might be,
6410 * but because we are the idle thread, we just pick up running again
6411 * when this runqueue becomes "idle".
6412 */
6413 init_idle(current, smp_processor_id());
Ingo Molnardd41f592007-07-09 18:51:59 +02006414 /*
6415 * During early bootup we pretend to be a normal task:
6416 */
6417 current->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006418}
6419
6420#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6421void __might_sleep(char *file, int line)
6422{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006423#ifdef in_atomic
Linus Torvalds1da177e2005-04-16 15:20:36 -07006424 static unsigned long prev_jiffy; /* ratelimiting */
6425
6426 if ((in_atomic() || irqs_disabled()) &&
6427 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6428 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6429 return;
6430 prev_jiffy = jiffies;
Ingo Molnar91368d72006-03-23 03:00:54 -08006431 printk(KERN_ERR "BUG: sleeping function called from invalid"
Linus Torvalds1da177e2005-04-16 15:20:36 -07006432 " context at %s:%d\n", file, line);
6433 printk("in_atomic():%d, irqs_disabled():%d\n",
6434 in_atomic(), irqs_disabled());
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08006435 debug_show_held_locks(current);
Ingo Molnar3117df02006-12-13 00:34:43 -08006436 if (irqs_disabled())
6437 print_irqtrace_events(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006438 dump_stack();
6439 }
6440#endif
6441}
6442EXPORT_SYMBOL(__might_sleep);
6443#endif
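/*
 * Typical use (illustrative): sleeping primitives call might_sleep() at
 * their entry points, so with CONFIG_DEBUG_SPINLOCK_SLEEP enabled a call
 * from atomic context (spinlock held or IRQs disabled) produces the
 * rate-limited "BUG: sleeping function called from invalid context"
 * report above rather than misbehaving silently later.
 */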
6444
6445#ifdef CONFIG_MAGIC_SYSRQ
6446void normalize_rt_tasks(void)
6447{
Ingo Molnara0f98a12007-06-17 18:37:45 +02006448 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006449 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006450 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02006451 int on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006452
6453 read_lock_irq(&tasklist_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006454 do_each_thread(g, p) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006455 p->se.fair_key = 0;
6456 p->se.wait_runtime = 0;
6457 p->se.wait_start_fair = 0;
6458 p->se.wait_start = 0;
6459 p->se.exec_start = 0;
6460 p->se.sleep_start = 0;
6461 p->se.sleep_start_fair = 0;
6462 p->se.block_start = 0;
6463 task_rq(p)->cfs.fair_clock = 0;
6464 task_rq(p)->clock = 0;
6465
6466 if (!rt_task(p)) {
6467 /*
6468 * Renice negative nice level userspace
6469 * tasks back to 0:
6470 */
6471 if (TASK_NICE(p) < 0 && p->mm)
6472 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006473 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02006474 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006475
Ingo Molnarb29739f2006-06-27 02:54:51 -07006476 spin_lock_irqsave(&p->pi_lock, flags);
6477 rq = __task_rq_lock(p);
Ingo Molnardd41f592007-07-09 18:51:59 +02006478#ifdef CONFIG_SMP
6479 /*
6480 * Do not touch the migration thread:
6481 */
6482 if (p == rq->migration_thread)
6483 goto out_unlock;
6484#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006485
Ingo Molnardd41f592007-07-09 18:51:59 +02006486 on_rq = p->se.on_rq;
6487 if (on_rq)
6488 deactivate_task(task_rq(p), p, 0);
6489 __setscheduler(rq, p, SCHED_NORMAL, 0);
6490 if (on_rq) {
6491 activate_task(task_rq(p), p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006492 resched_task(rq->curr);
6493 }
Ingo Molnardd41f592007-07-09 18:51:59 +02006494#ifdef CONFIG_SMP
6495 out_unlock:
6496#endif
Ingo Molnarb29739f2006-06-27 02:54:51 -07006497 __task_rq_unlock(rq);
6498 spin_unlock_irqrestore(&p->pi_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006499 } while_each_thread(g, p);
6500
Linus Torvalds1da177e2005-04-16 15:20:36 -07006501 read_unlock_irq(&tasklist_lock);
6502}
6503
6504#endif /* CONFIG_MAGIC_SYSRQ */
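/*
 * Note (assumption about the hook-up, not visible in this file):
 * normalize_rt_tasks() is intended to be triggered from the magic SysRq
 * 'n' key, giving an emergency way to demote runaway real-time tasks back
 * to SCHED_NORMAL from the console.
 */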
Linus Torvalds1df5c102005-09-12 07:59:21 -07006505
6506#ifdef CONFIG_IA64
6507/*
6508 * These functions are only useful for the IA64 MCA handling.
6509 *
6510 * They can only be called when the whole system has been
6511 * stopped - every CPU needs to be quiescent, and no scheduling
6512 * activity can take place. Using them for anything else would
6513 * be a serious bug, and as a result, they aren't even visible
6514 * under any other configuration.
6515 */
6516
6517/**
6518 * curr_task - return the current task for a given cpu.
6519 * @cpu: the processor in question.
6520 *
6521 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6522 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006523struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006524{
6525 return cpu_curr(cpu);
6526}
6527
6528/**
6529 * set_curr_task - set the current task for a given cpu.
6530 * @cpu: the processor in question.
6531 * @p: the task pointer to set.
6532 *
6533 * Description: This function must only be used when non-maskable interrupts
6534 * are serviced on a separate stack. It allows the architecture to switch the
6535 * notion of the current task on a cpu in a non-blocking manner. This function
      6536 * must be called with all CPUs synchronized and interrupts disabled; the
      6537 * caller must save the original value of the current task (see
6538 * curr_task() above) and restore that value before reenabling interrupts and
6539 * re-starting the system.
6540 *
6541 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6542 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006543void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006544{
6545 cpu_curr(cpu) = p;
6546}
6547
6548#endif