blob: eeb1b46cf707016cf3fa466e895bc8677f591616
Thomas Gleixner0793a612008-12-04 20:12:29 +01001/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
 7 * For licensing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
Ingo Molnar04289bb2008-12-11 08:38:42 +010013#include <linux/file.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010014#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
Ingo Molnaraa9c4c02008-12-17 14:10:57 +010021#include <linux/kernel_stat.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010022#include <linux/perf_counter.h>
Paul Mackerras23a185c2009-02-09 22:42:47 +110023#include <linux/mm.h>
24#include <linux/vmstat.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010025
26/*
27 * Each CPU has a list of per CPU counters:
28 */
29DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
30
Ingo Molnar088e2852008-12-14 20:21:00 +010031int perf_max_counters __read_mostly = 1;
Thomas Gleixner0793a612008-12-04 20:12:29 +010032static int perf_reserved_percpu __read_mostly;
33static int perf_overcommit __read_mostly = 1;
34
35/*
36 * Mutex for (sysadmin-configurable) counter reservations:
37 */
38static DEFINE_MUTEX(perf_resource_mutex);
39
40/*
41 * Architecture provided APIs - weak aliases:
42 */
Ingo Molnar5c92d122008-12-11 13:21:10 +010043extern __weak const struct hw_perf_counter_ops *
Ingo Molnar621a01e2008-12-11 12:46:46 +010044hw_perf_counter_init(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +010045{
Paul Mackerrasff6f0542009-01-09 16:19:25 +110046 return NULL;
Thomas Gleixner0793a612008-12-04 20:12:29 +010047}
48
Ingo Molnar01b28382008-12-11 13:45:51 +010049u64 __weak hw_perf_save_disable(void) { return 0; }
Yinghai Lu01ea1cc2008-12-26 21:05:06 -080050void __weak hw_perf_restore(u64 ctrl) { barrier(); }
Paul Mackerras01d02872009-01-14 13:44:19 +110051void __weak hw_perf_counter_setup(int cpu) { barrier(); }
Paul Mackerras3cbed422009-01-09 16:43:42 +110052int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
53 struct perf_cpu_context *cpuctx,
54 struct perf_counter_context *ctx, int cpu)
55{
56 return 0;
57}
Thomas Gleixner0793a612008-12-04 20:12:29 +010058
Paul Mackerras4eb96fc2009-01-09 17:24:34 +110059void __weak perf_counter_print_debug(void) { }
60
Ingo Molnar04289bb2008-12-11 08:38:42 +010061static void
62list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
63{
64 struct perf_counter *group_leader = counter->group_leader;
65
66 /*
67 * Depending on whether it is a standalone or sibling counter,
68 * add it straight to the context's counter list, or to the group
69 * leader's sibling list:
70 */
71 if (counter->group_leader == counter)
72 list_add_tail(&counter->list_entry, &ctx->counter_list);
73 else
74 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
75}
76
77static void
78list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
79{
80 struct perf_counter *sibling, *tmp;
81
82 list_del_init(&counter->list_entry);
83
Ingo Molnar04289bb2008-12-11 08:38:42 +010084 /*
85 * If this was a group counter with sibling counters then
86 * upgrade the siblings to singleton counters by adding them
87 * to the context list directly:
88 */
89 list_for_each_entry_safe(sibling, tmp,
90 &counter->sibling_list, list_entry) {
91
Peter Zijlstra75564232009-03-13 12:21:29 +010092 list_move_tail(&sibling->list_entry, &ctx->counter_list);
Ingo Molnar04289bb2008-12-11 08:38:42 +010093 sibling->group_leader = sibling;
94 }
95}
96
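/*
 * Take one counter off the PMU: mark it INACTIVE, call its
 * hw_ops->disable() method and drop the context's and CPU's active
 * counts. Clearing cpuctx->exclusive lets other groups go on again
 * once an exclusive counter (or the last hardware counter) goes off.
 */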
Paul Mackerras3b6f9e52009-01-14 21:00:30 +110097static void
98counter_sched_out(struct perf_counter *counter,
99 struct perf_cpu_context *cpuctx,
100 struct perf_counter_context *ctx)
101{
102 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
103 return;
104
105 counter->state = PERF_COUNTER_STATE_INACTIVE;
106 counter->hw_ops->disable(counter);
107 counter->oncpu = -1;
108
109 if (!is_software_counter(counter))
110 cpuctx->active_oncpu--;
111 ctx->nr_active--;
112 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
113 cpuctx->exclusive = 0;
114}
115
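/*
 * Take a whole group off the PMU: the group leader first, then each
 * sibling. An exclusive group also releases cpuctx->exclusive.
 */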
Paul Mackerrasd859e292009-01-17 18:10:22 +1100116static void
117group_sched_out(struct perf_counter *group_counter,
118 struct perf_cpu_context *cpuctx,
119 struct perf_counter_context *ctx)
120{
121 struct perf_counter *counter;
122
123 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
124 return;
125
126 counter_sched_out(group_counter, cpuctx, ctx);
127
128 /*
129 * Schedule out siblings (if any):
130 */
131 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
132 counter_sched_out(counter, cpuctx, ctx);
133
134 if (group_counter->hw_event.exclusive)
135 cpuctx->exclusive = 0;
136}
137
Thomas Gleixner0793a612008-12-04 20:12:29 +0100138/*
139 * Cross CPU call to remove a performance counter
140 *
141 * We disable the counter on the hardware level first. After that we
142 * remove it from the context list.
143 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100144static void __perf_counter_remove_from_context(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100145{
146 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
147 struct perf_counter *counter = info;
148 struct perf_counter_context *ctx = counter->ctx;
Ingo Molnar9b51f662008-12-12 13:49:45 +0100149 unsigned long flags;
Ingo Molnar5c92d122008-12-11 13:21:10 +0100150 u64 perf_flags;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100151
152 /*
153 * If this is a task context, we need to check whether it is
154 * the current task context of this cpu. If not it has been
155 * scheduled out before the smp call arrived.
156 */
157 if (ctx->task && cpuctx->task_ctx != ctx)
158 return;
159
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100160 curr_rq_lock_irq_save(&flags);
161 spin_lock(&ctx->lock);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100162
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100163 counter_sched_out(counter, cpuctx, ctx);
164
165 counter->task = NULL;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100166 ctx->nr_counters--;
167
168 /*
169 * Protect the list operation against NMI by disabling the
170 * counters on a global level. NOP for non NMI based counters.
171 */
Ingo Molnar01b28382008-12-11 13:45:51 +0100172 perf_flags = hw_perf_save_disable();
Ingo Molnar04289bb2008-12-11 08:38:42 +0100173 list_del_counter(counter, ctx);
Ingo Molnar01b28382008-12-11 13:45:51 +0100174 hw_perf_restore(perf_flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100175
176 if (!ctx->task) {
177 /*
178 * Allow more per task counters with respect to the
179 * reservation:
180 */
181 cpuctx->max_pertask =
182 min(perf_max_counters - ctx->nr_counters,
183 perf_max_counters - perf_reserved_percpu);
184 }
185
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100186 spin_unlock(&ctx->lock);
187 curr_rq_unlock_irq_restore(&flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100188}
189
190
191/*
192 * Remove the counter from a task's (or a CPU's) list of counters.
193 *
Paul Mackerrasd859e292009-01-17 18:10:22 +1100194 * Must be called with counter->mutex and ctx->mutex held.
Thomas Gleixner0793a612008-12-04 20:12:29 +0100195 *
196 * CPU counters are removed with a smp call. For task counters we only
197 * call when the task is on a CPU.
198 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100199static void perf_counter_remove_from_context(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100200{
201 struct perf_counter_context *ctx = counter->ctx;
202 struct task_struct *task = ctx->task;
203
204 if (!task) {
205 /*
206 * Per cpu counters are removed via an smp call and
 207 * the removal is always successful.
208 */
209 smp_call_function_single(counter->cpu,
Ingo Molnar04289bb2008-12-11 08:38:42 +0100210 __perf_counter_remove_from_context,
Thomas Gleixner0793a612008-12-04 20:12:29 +0100211 counter, 1);
212 return;
213 }
214
215retry:
Ingo Molnar04289bb2008-12-11 08:38:42 +0100216 task_oncpu_function_call(task, __perf_counter_remove_from_context,
Thomas Gleixner0793a612008-12-04 20:12:29 +0100217 counter);
218
219 spin_lock_irq(&ctx->lock);
220 /*
221 * If the context is active we need to retry the smp call.
222 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100223 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100224 spin_unlock_irq(&ctx->lock);
225 goto retry;
226 }
227
228 /*
 229 * The lock prevents this context from being scheduled in, so we
Ingo Molnar04289bb2008-12-11 08:38:42 +0100230 * can remove the counter safely if the call above did not
Thomas Gleixner0793a612008-12-04 20:12:29 +0100231 * succeed.
232 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100233 if (!list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100234 ctx->nr_counters--;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100235 list_del_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100236 counter->task = NULL;
237 }
238 spin_unlock_irq(&ctx->lock);
239}
240
Paul Mackerrasd859e292009-01-17 18:10:22 +1100241/*
242 * Cross CPU call to disable a performance counter
243 */
244static void __perf_counter_disable(void *info)
245{
246 struct perf_counter *counter = info;
247 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
248 struct perf_counter_context *ctx = counter->ctx;
249 unsigned long flags;
250
251 /*
252 * If this is a per-task counter, need to check whether this
253 * counter's task is the current task on this cpu.
254 */
255 if (ctx->task && cpuctx->task_ctx != ctx)
256 return;
257
258 curr_rq_lock_irq_save(&flags);
259 spin_lock(&ctx->lock);
260
261 /*
262 * If the counter is on, turn it off.
263 * If it is in error state, leave it in error state.
264 */
265 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
266 if (counter == counter->group_leader)
267 group_sched_out(counter, cpuctx, ctx);
268 else
269 counter_sched_out(counter, cpuctx, ctx);
270 counter->state = PERF_COUNTER_STATE_OFF;
271 }
272
273 spin_unlock(&ctx->lock);
274 curr_rq_unlock_irq_restore(&flags);
275}
276
277/*
278 * Disable a counter.
279 */
280static void perf_counter_disable(struct perf_counter *counter)
281{
282 struct perf_counter_context *ctx = counter->ctx;
283 struct task_struct *task = ctx->task;
284
285 if (!task) {
286 /*
287 * Disable the counter on the cpu that it's on
288 */
289 smp_call_function_single(counter->cpu, __perf_counter_disable,
290 counter, 1);
291 return;
292 }
293
294 retry:
295 task_oncpu_function_call(task, __perf_counter_disable, counter);
296
297 spin_lock_irq(&ctx->lock);
298 /*
299 * If the counter is still active, we need to retry the cross-call.
300 */
301 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
302 spin_unlock_irq(&ctx->lock);
303 goto retry;
304 }
305
306 /*
307 * Since we have the lock this context can't be scheduled
308 * in, so we can change the state safely.
309 */
310 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
311 counter->state = PERF_COUNTER_STATE_OFF;
312
313 spin_unlock_irq(&ctx->lock);
314}
315
316/*
317 * Disable a counter and all its children.
318 */
319static void perf_counter_disable_family(struct perf_counter *counter)
320{
321 struct perf_counter *child;
322
323 perf_counter_disable(counter);
324
325 /*
326 * Lock the mutex to protect the list of children
327 */
328 mutex_lock(&counter->mutex);
329 list_for_each_entry(child, &counter->child_list, child_list)
330 perf_counter_disable(child);
331 mutex_unlock(&counter->mutex);
332}
333
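/*
 * Try to put one counter on the PMU. Returns 0 on success, or -EAGAIN
 * if the hw_ops->enable() method refuses it, in which case the counter
 * stays INACTIVE.
 */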
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100334static int
335counter_sched_in(struct perf_counter *counter,
336 struct perf_cpu_context *cpuctx,
337 struct perf_counter_context *ctx,
338 int cpu)
339{
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100340 if (counter->state <= PERF_COUNTER_STATE_OFF)
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100341 return 0;
342
343 counter->state = PERF_COUNTER_STATE_ACTIVE;
344 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
345 /*
346 * The new state must be visible before we turn it on in the hardware:
347 */
348 smp_wmb();
349
350 if (counter->hw_ops->enable(counter)) {
351 counter->state = PERF_COUNTER_STATE_INACTIVE;
352 counter->oncpu = -1;
353 return -EAGAIN;
354 }
355
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100356 if (!is_software_counter(counter))
357 cpuctx->active_oncpu++;
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100358 ctx->nr_active++;
359
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100360 if (counter->hw_event.exclusive)
361 cpuctx->exclusive = 1;
362
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100363 return 0;
364}
365
Thomas Gleixner0793a612008-12-04 20:12:29 +0100366/*
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100367 * Return 1 for a group consisting entirely of software counters,
368 * 0 if the group contains any hardware counters.
369 */
370static int is_software_only_group(struct perf_counter *leader)
371{
372 struct perf_counter *counter;
373
374 if (!is_software_counter(leader))
375 return 0;
376 list_for_each_entry(counter, &leader->sibling_list, list_entry)
377 if (!is_software_counter(counter))
378 return 0;
379 return 1;
380}
381
382/*
383 * Work out whether we can put this counter group on the CPU now.
384 */
385static int group_can_go_on(struct perf_counter *counter,
386 struct perf_cpu_context *cpuctx,
387 int can_add_hw)
388{
389 /*
390 * Groups consisting entirely of software counters can always go on.
391 */
392 if (is_software_only_group(counter))
393 return 1;
394 /*
395 * If an exclusive group is already on, no other hardware
396 * counters can go on.
397 */
398 if (cpuctx->exclusive)
399 return 0;
400 /*
401 * If this group is exclusive and there are already
402 * counters on the CPU, it can't go on.
403 */
404 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
405 return 0;
406 /*
407 * Otherwise, try to add it if all previous groups were able
408 * to go on.
409 */
410 return can_add_hw;
411}
412
413/*
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100414 * Cross CPU call to install and enable a performance counter
Thomas Gleixner0793a612008-12-04 20:12:29 +0100415 */
416static void __perf_install_in_context(void *info)
417{
418 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
419 struct perf_counter *counter = info;
420 struct perf_counter_context *ctx = counter->ctx;
Paul Mackerrasd859e292009-01-17 18:10:22 +1100421 struct perf_counter *leader = counter->group_leader;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100422 int cpu = smp_processor_id();
Ingo Molnar9b51f662008-12-12 13:49:45 +0100423 unsigned long flags;
Ingo Molnar5c92d122008-12-11 13:21:10 +0100424 u64 perf_flags;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100425 int err;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100426
427 /*
428 * If this is a task context, we need to check whether it is
429 * the current task context of this cpu. If not it has been
430 * scheduled out before the smp call arrived.
431 */
432 if (ctx->task && cpuctx->task_ctx != ctx)
433 return;
434
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100435 curr_rq_lock_irq_save(&flags);
436 spin_lock(&ctx->lock);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100437
438 /*
439 * Protect the list operation against NMI by disabling the
440 * counters on a global level. NOP for non NMI based counters.
441 */
Ingo Molnar01b28382008-12-11 13:45:51 +0100442 perf_flags = hw_perf_save_disable();
Thomas Gleixner0793a612008-12-04 20:12:29 +0100443
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100444 list_add_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100445 ctx->nr_counters++;
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100446 counter->prev_state = PERF_COUNTER_STATE_OFF;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100447
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100448 /*
Paul Mackerrasd859e292009-01-17 18:10:22 +1100449 * Don't put the counter on if it is disabled or if
450 * it is in a group and the group isn't on.
451 */
452 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
453 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
454 goto unlock;
455
456 /*
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100457 * An exclusive counter can't go on if there are already active
458 * hardware counters, and no hardware counter can go on if there
459 * is already an exclusive counter on.
460 */
Paul Mackerrasd859e292009-01-17 18:10:22 +1100461 if (!group_can_go_on(counter, cpuctx, 1))
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100462 err = -EEXIST;
463 else
464 err = counter_sched_in(counter, cpuctx, ctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100465
Paul Mackerrasd859e292009-01-17 18:10:22 +1100466 if (err) {
467 /*
468 * This counter couldn't go on. If it is in a group
469 * then we have to pull the whole group off.
470 * If the counter group is pinned then put it in error state.
471 */
472 if (leader != counter)
473 group_sched_out(leader, cpuctx, ctx);
474 if (leader->hw_event.pinned)
475 leader->state = PERF_COUNTER_STATE_ERROR;
476 }
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100477
478 if (!err && !ctx->task && cpuctx->max_pertask)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100479 cpuctx->max_pertask--;
480
Paul Mackerrasd859e292009-01-17 18:10:22 +1100481 unlock:
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100482 hw_perf_restore(perf_flags);
483
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100484 spin_unlock(&ctx->lock);
485 curr_rq_unlock_irq_restore(&flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100486}
487
488/*
489 * Attach a performance counter to a context
490 *
491 * First we add the counter to the list with the hardware enable bit
492 * in counter->hw_config cleared.
493 *
494 * If the counter is attached to a task which is on a CPU we use a smp
495 * call to enable it in the task context. The task might have been
496 * scheduled away, but we check this in the smp call again.
Paul Mackerrasd859e292009-01-17 18:10:22 +1100497 *
498 * Must be called with ctx->mutex held.
Thomas Gleixner0793a612008-12-04 20:12:29 +0100499 */
500static void
501perf_install_in_context(struct perf_counter_context *ctx,
502 struct perf_counter *counter,
503 int cpu)
504{
505 struct task_struct *task = ctx->task;
506
Thomas Gleixner0793a612008-12-04 20:12:29 +0100507 if (!task) {
508 /*
509 * Per cpu counters are installed via an smp call and
 510 * the install is always successful.
511 */
512 smp_call_function_single(cpu, __perf_install_in_context,
513 counter, 1);
514 return;
515 }
516
517 counter->task = task;
518retry:
519 task_oncpu_function_call(task, __perf_install_in_context,
520 counter);
521
522 spin_lock_irq(&ctx->lock);
523 /*
Thomas Gleixner0793a612008-12-04 20:12:29 +0100524 * we need to retry the smp call.
525 */
Paul Mackerrasd859e292009-01-17 18:10:22 +1100526 if (ctx->is_active && list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100527 spin_unlock_irq(&ctx->lock);
528 goto retry;
529 }
530
531 /*
 532 * The lock prevents this context from being scheduled in, so we
 533 * can add the counter safely if the call above did not
534 * succeed.
535 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100536 if (list_empty(&counter->list_entry)) {
537 list_add_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100538 ctx->nr_counters++;
539 }
540 spin_unlock_irq(&ctx->lock);
541}
542
Paul Mackerrasd859e292009-01-17 18:10:22 +1100543/*
544 * Cross CPU call to enable a performance counter
545 */
546static void __perf_counter_enable(void *info)
Ingo Molnar04289bb2008-12-11 08:38:42 +0100547{
Paul Mackerrasd859e292009-01-17 18:10:22 +1100548 struct perf_counter *counter = info;
549 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
550 struct perf_counter_context *ctx = counter->ctx;
551 struct perf_counter *leader = counter->group_leader;
552 unsigned long flags;
553 int err;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100554
555 /*
Paul Mackerrasd859e292009-01-17 18:10:22 +1100556 * If this is a per-task counter, need to check whether this
557 * counter's task is the current task on this cpu.
Ingo Molnar04289bb2008-12-11 08:38:42 +0100558 */
Paul Mackerrasd859e292009-01-17 18:10:22 +1100559 if (ctx->task && cpuctx->task_ctx != ctx)
560 return;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100561
Paul Mackerrasd859e292009-01-17 18:10:22 +1100562 curr_rq_lock_irq_save(&flags);
563 spin_lock(&ctx->lock);
564
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100565 counter->prev_state = counter->state;
Paul Mackerrasd859e292009-01-17 18:10:22 +1100566 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
567 goto unlock;
568 counter->state = PERF_COUNTER_STATE_INACTIVE;
569
570 /*
571 * If the counter is in a group and isn't the group leader,
572 * then don't put it on unless the group is on.
573 */
574 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
575 goto unlock;
576
577 if (!group_can_go_on(counter, cpuctx, 1))
578 err = -EEXIST;
579 else
580 err = counter_sched_in(counter, cpuctx, ctx,
581 smp_processor_id());
582
583 if (err) {
584 /*
585 * If this counter can't go on and it's part of a
586 * group, then the whole group has to come off.
587 */
588 if (leader != counter)
589 group_sched_out(leader, cpuctx, ctx);
590 if (leader->hw_event.pinned)
591 leader->state = PERF_COUNTER_STATE_ERROR;
592 }
593
594 unlock:
595 spin_unlock(&ctx->lock);
596 curr_rq_unlock_irq_restore(&flags);
597}
598
599/*
600 * Enable a counter.
601 */
602static void perf_counter_enable(struct perf_counter *counter)
603{
604 struct perf_counter_context *ctx = counter->ctx;
605 struct task_struct *task = ctx->task;
606
607 if (!task) {
608 /*
609 * Enable the counter on the cpu that it's on
610 */
611 smp_call_function_single(counter->cpu, __perf_counter_enable,
612 counter, 1);
613 return;
614 }
615
616 spin_lock_irq(&ctx->lock);
617 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
618 goto out;
619
620 /*
621 * If the counter is in error state, clear that first.
622 * That way, if we see the counter in error state below, we
623 * know that it has gone back into error state, as distinct
624 * from the task having been scheduled away before the
625 * cross-call arrived.
626 */
627 if (counter->state == PERF_COUNTER_STATE_ERROR)
628 counter->state = PERF_COUNTER_STATE_OFF;
629
630 retry:
631 spin_unlock_irq(&ctx->lock);
632 task_oncpu_function_call(task, __perf_counter_enable, counter);
633
634 spin_lock_irq(&ctx->lock);
635
636 /*
637 * If the context is active and the counter is still off,
638 * we need to retry the cross-call.
639 */
640 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
641 goto retry;
642
643 /*
644 * Since we have the lock this context can't be scheduled
645 * in, so we can change the state safely.
646 */
647 if (counter->state == PERF_COUNTER_STATE_OFF)
648 counter->state = PERF_COUNTER_STATE_INACTIVE;
649 out:
650 spin_unlock_irq(&ctx->lock);
651}
652
653/*
654 * Enable a counter and all its children.
655 */
656static void perf_counter_enable_family(struct perf_counter *counter)
657{
658 struct perf_counter *child;
659
660 perf_counter_enable(counter);
661
662 /*
663 * Lock the mutex to protect the list of children
664 */
665 mutex_lock(&counter->mutex);
666 list_for_each_entry(child, &counter->child_list, child_list)
667 perf_counter_enable(child);
668 mutex_unlock(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100669}
670
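/*
 * Mark a context inactive and take all of its counters off the PMU,
 * with the PMU globally disabled around the list walk.
 */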
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100671void __perf_counter_sched_out(struct perf_counter_context *ctx,
672 struct perf_cpu_context *cpuctx)
673{
674 struct perf_counter *counter;
Paul Mackerras3cbed422009-01-09 16:43:42 +1100675 u64 flags;
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100676
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100677 spin_lock(&ctx->lock);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100678 ctx->is_active = 0;
679 if (likely(!ctx->nr_counters))
680 goto out;
681
Paul Mackerras3cbed422009-01-09 16:43:42 +1100682 flags = hw_perf_save_disable();
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100683 if (ctx->nr_active) {
684 list_for_each_entry(counter, &ctx->counter_list, list_entry)
685 group_sched_out(counter, cpuctx, ctx);
686 }
Paul Mackerras3cbed422009-01-09 16:43:42 +1100687 hw_perf_restore(flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100688 out:
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100689 spin_unlock(&ctx->lock);
690}
691
Thomas Gleixner0793a612008-12-04 20:12:29 +0100692/*
693 * Called from scheduler to remove the counters of the current task,
694 * with interrupts disabled.
695 *
696 * We stop each counter and update the counter value in counter->count.
697 *
Ingo Molnar76715812008-12-17 14:20:28 +0100698 * This does not protect us against NMI, but disable()
Thomas Gleixner0793a612008-12-04 20:12:29 +0100699 * sets the disabled bit in the control field of counter _before_
700 * accessing the counter control register. If a NMI hits, then it will
701 * not restart the counter.
702 */
703void perf_counter_task_sched_out(struct task_struct *task, int cpu)
704{
705 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
706 struct perf_counter_context *ctx = &task->perf_counter_ctx;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100707
708 if (likely(!cpuctx->task_ctx))
709 return;
710
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100711 __perf_counter_sched_out(ctx, cpuctx);
712
Thomas Gleixner0793a612008-12-04 20:12:29 +0100713 cpuctx->task_ctx = NULL;
714}
715
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100716static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
Ingo Molnar04289bb2008-12-11 08:38:42 +0100717{
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100718 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100719}
720
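/*
 * Put a complete group on the PMU, leader first and then each sibling.
 * Groups go on as a unit: if any member fails to schedule in, the
 * members that did go on are taken off again and -EAGAIN is returned.
 */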
Ingo Molnar79958882008-12-17 08:54:56 +0100721static int
Ingo Molnar04289bb2008-12-11 08:38:42 +0100722group_sched_in(struct perf_counter *group_counter,
723 struct perf_cpu_context *cpuctx,
724 struct perf_counter_context *ctx,
725 int cpu)
726{
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100727 struct perf_counter *counter, *partial_group;
Paul Mackerras3cbed422009-01-09 16:43:42 +1100728 int ret;
729
730 if (group_counter->state == PERF_COUNTER_STATE_OFF)
731 return 0;
732
733 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
734 if (ret)
735 return ret < 0 ? ret : 0;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100736
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100737 group_counter->prev_state = group_counter->state;
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100738 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
739 return -EAGAIN;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100740
741 /*
742 * Schedule in siblings as one group (if any):
743 */
Ingo Molnar79958882008-12-17 08:54:56 +0100744 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100745 counter->prev_state = counter->state;
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100746 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
747 partial_group = counter;
748 goto group_error;
749 }
Ingo Molnar79958882008-12-17 08:54:56 +0100750 }
751
Paul Mackerras3cbed422009-01-09 16:43:42 +1100752 return 0;
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100753
754group_error:
755 /*
756 * Groups can be scheduled in as one unit only, so undo any
757 * partial group before returning:
758 */
759 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
760 if (counter == partial_group)
761 break;
762 counter_sched_out(counter, cpuctx, ctx);
763 }
764 counter_sched_out(group_counter, cpuctx, ctx);
765
766 return -EAGAIN;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100767}
768
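/*
 * Schedule a context's counters onto the PMU. Pinned groups get first
 * chance (and are put into the ERROR state if they cannot go on); the
 * remaining groups are added for as long as the hardware has room.
 */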
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100769static void
770__perf_counter_sched_in(struct perf_counter_context *ctx,
771 struct perf_cpu_context *cpuctx, int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100772{
Thomas Gleixner0793a612008-12-04 20:12:29 +0100773 struct perf_counter *counter;
Paul Mackerras3cbed422009-01-09 16:43:42 +1100774 u64 flags;
Paul Mackerrasdd0e6ba2009-01-12 15:11:00 +1100775 int can_add_hw = 1;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100776
Thomas Gleixner0793a612008-12-04 20:12:29 +0100777 spin_lock(&ctx->lock);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100778 ctx->is_active = 1;
779 if (likely(!ctx->nr_counters))
780 goto out;
781
Paul Mackerras3cbed422009-01-09 16:43:42 +1100782 flags = hw_perf_save_disable();
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100783
784 /*
785 * First go through the list and put on any pinned groups
786 * in order to give them the best chance of going on.
787 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100788 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100789 if (counter->state <= PERF_COUNTER_STATE_OFF ||
790 !counter->hw_event.pinned)
791 continue;
792 if (counter->cpu != -1 && counter->cpu != cpu)
793 continue;
794
795 if (group_can_go_on(counter, cpuctx, 1))
796 group_sched_in(counter, cpuctx, ctx, cpu);
797
798 /*
799 * If this pinned group hasn't been scheduled,
800 * put it in error state.
801 */
802 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
803 counter->state = PERF_COUNTER_STATE_ERROR;
804 }
805
806 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
807 /*
808 * Ignore counters in OFF or ERROR state, and
809 * ignore pinned counters since we did them already.
810 */
811 if (counter->state <= PERF_COUNTER_STATE_OFF ||
812 counter->hw_event.pinned)
813 continue;
814
Ingo Molnar04289bb2008-12-11 08:38:42 +0100815 /*
816 * Listen to the 'cpu' scheduling filter constraint
817 * of counters:
818 */
Thomas Gleixner0793a612008-12-04 20:12:29 +0100819 if (counter->cpu != -1 && counter->cpu != cpu)
820 continue;
821
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100822 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
Paul Mackerrasdd0e6ba2009-01-12 15:11:00 +1100823 if (group_sched_in(counter, cpuctx, ctx, cpu))
824 can_add_hw = 0;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100825 }
Thomas Gleixner0793a612008-12-04 20:12:29 +0100826 }
Paul Mackerras3cbed422009-01-09 16:43:42 +1100827 hw_perf_restore(flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100828 out:
Thomas Gleixner0793a612008-12-04 20:12:29 +0100829 spin_unlock(&ctx->lock);
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100830}
Ingo Molnar04289bb2008-12-11 08:38:42 +0100831
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100832/*
833 * Called from scheduler to add the counters of the current task
834 * with interrupts disabled.
835 *
836 * We restore the counter value and then enable it.
837 *
838 * This does not protect us against NMI, but enable()
839 * sets the enabled bit in the control field of counter _before_
840 * accessing the counter control register. If a NMI hits, then it will
841 * keep the counter running.
842 */
843void perf_counter_task_sched_in(struct task_struct *task, int cpu)
844{
845 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
846 struct perf_counter_context *ctx = &task->perf_counter_ctx;
847
848 __perf_counter_sched_in(ctx, cpuctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100849 cpuctx->task_ctx = ctx;
850}
851
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100852static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
853{
854 struct perf_counter_context *ctx = &cpuctx->ctx;
855
856 __perf_counter_sched_in(ctx, cpuctx, cpu);
857}
858
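/*
 * Turn off every counter in the current task's context. Counters that
 * are already in the ERROR state are left there so a later enable can
 * tell the difference.
 */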
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100859int perf_counter_task_disable(void)
860{
861 struct task_struct *curr = current;
862 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
863 struct perf_counter *counter;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100864 unsigned long flags;
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100865 u64 perf_flags;
866 int cpu;
867
868 if (likely(!ctx->nr_counters))
869 return 0;
870
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100871 curr_rq_lock_irq_save(&flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100872 cpu = smp_processor_id();
873
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100874 /* force the update of the task clock: */
875 __task_delta_exec(curr, 1);
876
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100877 perf_counter_task_sched_out(curr, cpu);
878
879 spin_lock(&ctx->lock);
880
881 /*
882 * Disable all the counters:
883 */
884 perf_flags = hw_perf_save_disable();
885
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100886 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
887 if (counter->state != PERF_COUNTER_STATE_ERROR)
888 counter->state = PERF_COUNTER_STATE_OFF;
889 }
Ingo Molnar9b51f662008-12-12 13:49:45 +0100890
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100891 hw_perf_restore(perf_flags);
892
893 spin_unlock(&ctx->lock);
894
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100895 curr_rq_unlock_irq_restore(&flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100896
897 return 0;
898}
899
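/*
 * Flip the current task's disabled counters back to INACTIVE and
 * reschedule the whole context onto the PMU.
 */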
900int perf_counter_task_enable(void)
901{
902 struct task_struct *curr = current;
903 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
904 struct perf_counter *counter;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100905 unsigned long flags;
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100906 u64 perf_flags;
907 int cpu;
908
909 if (likely(!ctx->nr_counters))
910 return 0;
911
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100912 curr_rq_lock_irq_save(&flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100913 cpu = smp_processor_id();
914
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100915 /* force the update of the task clock: */
916 __task_delta_exec(curr, 1);
917
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100918 perf_counter_task_sched_out(curr, cpu);
919
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100920 spin_lock(&ctx->lock);
921
922 /*
 923 * Disable the PMU while we switch the counters back on:
924 */
925 perf_flags = hw_perf_save_disable();
926
927 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100928 if (counter->state > PERF_COUNTER_STATE_OFF)
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100929 continue;
Ingo Molnar6a930702008-12-11 15:17:03 +0100930 counter->state = PERF_COUNTER_STATE_INACTIVE;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100931 counter->hw_event.disabled = 0;
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100932 }
933 hw_perf_restore(perf_flags);
934
935 spin_unlock(&ctx->lock);
936
937 perf_counter_task_sched_in(curr, cpu);
938
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100939 curr_rq_unlock_irq_restore(&flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100940
941 return 0;
942}
943
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100944/*
945 * Round-robin a context's counters:
946 */
947static void rotate_ctx(struct perf_counter_context *ctx)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100948{
Thomas Gleixner0793a612008-12-04 20:12:29 +0100949 struct perf_counter *counter;
Ingo Molnar5c92d122008-12-11 13:21:10 +0100950 u64 perf_flags;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100951
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100952 if (!ctx->nr_counters)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100953 return;
954
Thomas Gleixner0793a612008-12-04 20:12:29 +0100955 spin_lock(&ctx->lock);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100956 /*
Ingo Molnar04289bb2008-12-11 08:38:42 +0100957 * Rotate the first entry last (works just fine for group counters too):
Thomas Gleixner0793a612008-12-04 20:12:29 +0100958 */
Ingo Molnar01b28382008-12-11 13:45:51 +0100959 perf_flags = hw_perf_save_disable();
Ingo Molnar04289bb2008-12-11 08:38:42 +0100960 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Peter Zijlstra75564232009-03-13 12:21:29 +0100961 list_move_tail(&counter->list_entry, &ctx->counter_list);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100962 break;
963 }
Ingo Molnar01b28382008-12-11 13:45:51 +0100964 hw_perf_restore(perf_flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100965
966 spin_unlock(&ctx->lock);
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100967}
Thomas Gleixner0793a612008-12-04 20:12:29 +0100968
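/*
 * Called from the scheduler tick: schedule the task's (and optionally
 * the CPU's) counters out, rotate the lists so every group gets its
 * share of PMU time, then schedule everything back in.
 */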
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100969void perf_counter_task_tick(struct task_struct *curr, int cpu)
970{
971 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
972 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
973 const int rotate_percpu = 0;
974
975 if (rotate_percpu)
976 perf_counter_cpu_sched_out(cpuctx);
977 perf_counter_task_sched_out(curr, cpu);
978
979 if (rotate_percpu)
980 rotate_ctx(&cpuctx->ctx);
981 rotate_ctx(ctx);
982
983 if (rotate_percpu)
984 perf_counter_cpu_sched_in(cpuctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100985 perf_counter_task_sched_in(curr, cpu);
986}
987
988/*
Thomas Gleixner0793a612008-12-04 20:12:29 +0100989 * Cross CPU call to read the hardware counter
990 */
Ingo Molnar76715812008-12-17 14:20:28 +0100991static void __read(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100992{
Ingo Molnar621a01e2008-12-11 12:46:46 +0100993 struct perf_counter *counter = info;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100994 unsigned long flags;
Ingo Molnar621a01e2008-12-11 12:46:46 +0100995
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100996 curr_rq_lock_irq_save(&flags);
Ingo Molnar76715812008-12-17 14:20:28 +0100997 counter->hw_ops->read(counter);
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100998 curr_rq_unlock_irq_restore(&flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100999}
1000
Ingo Molnar04289bb2008-12-11 08:38:42 +01001001static u64 perf_counter_read(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001002{
1003 /*
1004 * If counter is enabled and currently active on a CPU, update the
1005 * value in the counter structure:
1006 */
Ingo Molnar6a930702008-12-11 15:17:03 +01001007 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
Thomas Gleixner0793a612008-12-04 20:12:29 +01001008 smp_call_function_single(counter->oncpu,
Ingo Molnar76715812008-12-17 14:20:28 +01001009 __read, counter, 1);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001010 }
1011
Ingo Molnaree060942008-12-13 09:00:03 +01001012 return atomic64_read(&counter->count);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001013}
1014
1015/*
1016 * Cross CPU call to switch performance data pointers
1017 */
1018static void __perf_switch_irq_data(void *info)
1019{
1020 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1021 struct perf_counter *counter = info;
1022 struct perf_counter_context *ctx = counter->ctx;
1023 struct perf_data *oldirqdata = counter->irqdata;
1024
1025 /*
1026 * If this is a task context, we need to check whether it is
1027 * the current task context of this cpu. If not it has been
1028 * scheduled out before the smp call arrived.
1029 */
1030 if (ctx->task) {
1031 if (cpuctx->task_ctx != ctx)
1032 return;
1033 spin_lock(&ctx->lock);
1034 }
1035
 1036 /* Change the pointer in an NMI-safe way */
1037 atomic_long_set((atomic_long_t *)&counter->irqdata,
1038 (unsigned long) counter->usrdata);
1039 counter->usrdata = oldirqdata;
1040
1041 if (ctx->task)
1042 spin_unlock(&ctx->lock);
1043}
1044
1045static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1046{
1047 struct perf_counter_context *ctx = counter->ctx;
1048 struct perf_data *oldirqdata = counter->irqdata;
1049 struct task_struct *task = ctx->task;
1050
1051 if (!task) {
1052 smp_call_function_single(counter->cpu,
1053 __perf_switch_irq_data,
1054 counter, 1);
1055 return counter->usrdata;
1056 }
1057
1058retry:
1059 spin_lock_irq(&ctx->lock);
Ingo Molnar6a930702008-12-11 15:17:03 +01001060 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
Thomas Gleixner0793a612008-12-04 20:12:29 +01001061 counter->irqdata = counter->usrdata;
1062 counter->usrdata = oldirqdata;
1063 spin_unlock_irq(&ctx->lock);
1064 return oldirqdata;
1065 }
1066 spin_unlock_irq(&ctx->lock);
1067 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1068 /* Might have failed, because task was scheduled out */
1069 if (counter->irqdata == oldirqdata)
1070 goto retry;
1071
1072 return counter->usrdata;
1073}
1074
1075static void put_context(struct perf_counter_context *ctx)
1076{
1077 if (ctx->task)
1078 put_task_struct(ctx->task);
1079}
1080
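/*
 * Resolve a (pid, cpu) pair to a counter context: either the per-CPU
 * context (cpu != -1, root only) or the context of the given task, on
 * which a reference is taken. Returns an ERR_PTR() on failure.
 */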
1081static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1082{
1083 struct perf_cpu_context *cpuctx;
1084 struct perf_counter_context *ctx;
1085 struct task_struct *task;
1086
1087 /*
1088 * If cpu is not a wildcard then this is a percpu counter:
1089 */
1090 if (cpu != -1) {
1091 /* Must be root to operate on a CPU counter: */
1092 if (!capable(CAP_SYS_ADMIN))
1093 return ERR_PTR(-EACCES);
1094
1095 if (cpu < 0 || cpu > num_possible_cpus())
1096 return ERR_PTR(-EINVAL);
1097
1098 /*
 1099 * We could be clever and allow attaching a counter to an
1100 * offline CPU and activate it when the CPU comes up, but
1101 * that's for later.
1102 */
1103 if (!cpu_isset(cpu, cpu_online_map))
1104 return ERR_PTR(-ENODEV);
1105
1106 cpuctx = &per_cpu(perf_cpu_context, cpu);
1107 ctx = &cpuctx->ctx;
1108
Thomas Gleixner0793a612008-12-04 20:12:29 +01001109 return ctx;
1110 }
1111
1112 rcu_read_lock();
1113 if (!pid)
1114 task = current;
1115 else
1116 task = find_task_by_vpid(pid);
1117 if (task)
1118 get_task_struct(task);
1119 rcu_read_unlock();
1120
1121 if (!task)
1122 return ERR_PTR(-ESRCH);
1123
1124 ctx = &task->perf_counter_ctx;
1125 ctx->task = task;
1126
1127 /* Reuse ptrace permission checks for now. */
1128 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1129 put_context(ctx);
1130 return ERR_PTR(-EACCES);
1131 }
1132
1133 return ctx;
1134}
1135
1136/*
1137 * Called when the last reference to the file is gone.
1138 */
1139static int perf_release(struct inode *inode, struct file *file)
1140{
1141 struct perf_counter *counter = file->private_data;
1142 struct perf_counter_context *ctx = counter->ctx;
1143
1144 file->private_data = NULL;
1145
Paul Mackerrasd859e292009-01-17 18:10:22 +11001146 mutex_lock(&ctx->mutex);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001147 mutex_lock(&counter->mutex);
1148
Ingo Molnar04289bb2008-12-11 08:38:42 +01001149 perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001150
1151 mutex_unlock(&counter->mutex);
Paul Mackerrasd859e292009-01-17 18:10:22 +11001152 mutex_unlock(&ctx->mutex);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001153
1154 kfree(counter);
Mike Galbraith5af75912009-02-11 10:53:37 +01001155 put_context(ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001156
1157 return 0;
1158}
1159
1160/*
1161 * Read the performance counter - simple non blocking version for now
1162 */
1163static ssize_t
1164perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1165{
1166 u64 cntval;
1167
1168 if (count != sizeof(cntval))
1169 return -EINVAL;
1170
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001171 /*
1172 * Return end-of-file for a read on a counter that is in
1173 * error state (i.e. because it was pinned but it couldn't be
1174 * scheduled on to the CPU at some point).
1175 */
1176 if (counter->state == PERF_COUNTER_STATE_ERROR)
1177 return 0;
1178
Thomas Gleixner0793a612008-12-04 20:12:29 +01001179 mutex_lock(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01001180 cntval = perf_counter_read(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001181 mutex_unlock(&counter->mutex);
1182
1183 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1184}
1185
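/*
 * Copy up to 'count' bytes of pending sample data to the user buffer
 * and advance the buffer's read index.
 */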
1186static ssize_t
1187perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1188{
1189 if (!usrdata->len)
1190 return 0;
1191
1192 count = min(count, (size_t)usrdata->len);
1193 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1194 return -EFAULT;
1195
1196 /* Adjust the counters */
1197 usrdata->len -= count;
1198 if (!usrdata->len)
1199 usrdata->rd_idx = 0;
1200 else
1201 usrdata->rd_idx += count;
1202
1203 return count;
1204}
1205
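/*
 * Read IRQ-time sample data: unless O_NONBLOCK was given, wait until
 * enough data is buffered, then drain the user-side buffer and, if
 * needed, switch and drain the IRQ-side buffer as well.
 */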
1206static ssize_t
1207perf_read_irq_data(struct perf_counter *counter,
1208 char __user *buf,
1209 size_t count,
1210 int nonblocking)
1211{
1212 struct perf_data *irqdata, *usrdata;
1213 DECLARE_WAITQUEUE(wait, current);
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001214 ssize_t res, res2;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001215
1216 irqdata = counter->irqdata;
1217 usrdata = counter->usrdata;
1218
1219 if (usrdata->len + irqdata->len >= count)
1220 goto read_pending;
1221
1222 if (nonblocking)
1223 return -EAGAIN;
1224
1225 spin_lock_irq(&counter->waitq.lock);
1226 __add_wait_queue(&counter->waitq, &wait);
1227 for (;;) {
1228 set_current_state(TASK_INTERRUPTIBLE);
1229 if (usrdata->len + irqdata->len >= count)
1230 break;
1231
1232 if (signal_pending(current))
1233 break;
1234
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001235 if (counter->state == PERF_COUNTER_STATE_ERROR)
1236 break;
1237
Thomas Gleixner0793a612008-12-04 20:12:29 +01001238 spin_unlock_irq(&counter->waitq.lock);
1239 schedule();
1240 spin_lock_irq(&counter->waitq.lock);
1241 }
1242 __remove_wait_queue(&counter->waitq, &wait);
1243 __set_current_state(TASK_RUNNING);
1244 spin_unlock_irq(&counter->waitq.lock);
1245
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001246 if (usrdata->len + irqdata->len < count &&
1247 counter->state != PERF_COUNTER_STATE_ERROR)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001248 return -ERESTARTSYS;
1249read_pending:
1250 mutex_lock(&counter->mutex);
1251
1252 /* Drain pending data first: */
1253 res = perf_copy_usrdata(usrdata, buf, count);
1254 if (res < 0 || res == count)
1255 goto out;
1256
1257 /* Switch irq buffer: */
1258 usrdata = perf_switch_irq_data(counter);
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001259 res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1260 if (res2 < 0) {
Thomas Gleixner0793a612008-12-04 20:12:29 +01001261 if (!res)
1262 res = -EFAULT;
1263 } else {
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001264 res += res2;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001265 }
1266out:
1267 mutex_unlock(&counter->mutex);
1268
1269 return res;
1270}
1271
1272static ssize_t
1273perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1274{
1275 struct perf_counter *counter = file->private_data;
1276
Ingo Molnar9f66a382008-12-10 12:33:23 +01001277 switch (counter->hw_event.record_type) {
Thomas Gleixner0793a612008-12-04 20:12:29 +01001278 case PERF_RECORD_SIMPLE:
1279 return perf_read_hw(counter, buf, count);
1280
1281 case PERF_RECORD_IRQ:
1282 case PERF_RECORD_GROUP:
1283 return perf_read_irq_data(counter, buf, count,
1284 file->f_flags & O_NONBLOCK);
1285 }
1286 return -EINVAL;
1287}
1288
1289static unsigned int perf_poll(struct file *file, poll_table *wait)
1290{
1291 struct perf_counter *counter = file->private_data;
1292 unsigned int events = 0;
1293 unsigned long flags;
1294
1295 poll_wait(file, &counter->waitq, wait);
1296
1297 spin_lock_irqsave(&counter->waitq.lock, flags);
1298 if (counter->usrdata->len || counter->irqdata->len)
1299 events |= POLLIN;
1300 spin_unlock_irqrestore(&counter->waitq.lock, flags);
1301
1302 return events;
1303}
1304
Paul Mackerrasd859e292009-01-17 18:10:22 +11001305static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1306{
1307 struct perf_counter *counter = file->private_data;
1308 int err = 0;
1309
1310 switch (cmd) {
1311 case PERF_COUNTER_IOC_ENABLE:
1312 perf_counter_enable_family(counter);
1313 break;
1314 case PERF_COUNTER_IOC_DISABLE:
1315 perf_counter_disable_family(counter);
1316 break;
1317 default:
1318 err = -ENOTTY;
1319 }
1320 return err;
1321}
1322
Thomas Gleixner0793a612008-12-04 20:12:29 +01001323static const struct file_operations perf_fops = {
1324 .release = perf_release,
1325 .read = perf_read,
1326 .poll = perf_poll,
Paul Mackerrasd859e292009-01-17 18:10:22 +11001327 .unlocked_ioctl = perf_ioctl,
1328 .compat_ioctl = perf_ioctl,
Thomas Gleixner0793a612008-12-04 20:12:29 +01001329};
1330
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001331/*
1332 * Generic software counter infrastructure
1333 */
1334
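/*
 * Fold the events accumulated in hw.count since the last update into
 * counter->count. The cmpxchg loop makes the update safe against a
 * concurrent NMI.
 */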
1335static void perf_swcounter_update(struct perf_counter *counter)
1336{
1337 struct hw_perf_counter *hwc = &counter->hw;
1338 u64 prev, now;
1339 s64 delta;
1340
1341again:
1342 prev = atomic64_read(&hwc->prev_count);
1343 now = atomic64_read(&hwc->count);
1344 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
1345 goto again;
1346
1347 delta = now - prev;
1348
1349 atomic64_add(delta, &counter->count);
1350 atomic64_sub(delta, &hwc->period_left);
1351}
1352
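/*
 * Re-arm a software counter so that the next sample triggers after
 * hw.irq_period further events: the counter starts at -left and fires
 * when it crosses zero.
 */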
1353static void perf_swcounter_set_period(struct perf_counter *counter)
1354{
1355 struct hw_perf_counter *hwc = &counter->hw;
1356 s64 left = atomic64_read(&hwc->period_left);
1357 s64 period = hwc->irq_period;
1358
1359 if (unlikely(left <= -period)) {
1360 left = period;
1361 atomic64_set(&hwc->period_left, left);
1362 }
1363
1364 if (unlikely(left <= 0)) {
1365 left += period;
1366 atomic64_add(period, &hwc->period_left);
1367 }
1368
1369 atomic64_set(&hwc->prev_count, -left);
1370 atomic64_set(&hwc->count, -left);
1371}
1372
1373static void perf_swcounter_save_and_restart(struct perf_counter *counter)
1374{
1375 perf_swcounter_update(counter);
1376 perf_swcounter_set_period(counter);
1377}
1378
1379static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
1380{
1381 struct perf_data *irqdata = counter->irqdata;
1382
1383 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
1384 irqdata->overrun++;
1385 } else {
1386 u64 *p = (u64 *) &irqdata->data[irqdata->len];
1387
1388 *p = data;
1389 irqdata->len += sizeof(u64);
1390 }
1391}
1392
1393static void perf_swcounter_handle_group(struct perf_counter *sibling)
1394{
1395 struct perf_counter *counter, *group_leader = sibling->group_leader;
1396
1397 list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
1398 perf_swcounter_update(counter);
1399 perf_swcounter_store_irq(sibling, counter->hw_event.type);
1400 perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
1401 }
1402}
1403
1404static void perf_swcounter_interrupt(struct perf_counter *counter,
1405 int nmi, struct pt_regs *regs)
1406{
1407 perf_swcounter_save_and_restart(counter);
1408
1409 switch (counter->hw_event.record_type) {
1410 case PERF_RECORD_SIMPLE:
1411 break;
1412
1413 case PERF_RECORD_IRQ:
1414 perf_swcounter_store_irq(counter, instruction_pointer(regs));
1415 break;
1416
1417 case PERF_RECORD_GROUP:
1418 perf_swcounter_handle_group(counter);
1419 break;
1420 }
1421
1422 if (nmi) {
1423 counter->wakeup_pending = 1;
1424 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
1425 } else
1426 wake_up(&counter->waitq);
1427}
1428
1429static int perf_swcounter_match(struct perf_counter *counter,
1430 enum hw_event_types event,
1431 struct pt_regs *regs)
1432{
1433 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1434 return 0;
1435
1436 if (counter->hw_event.raw)
1437 return 0;
1438
1439 if (counter->hw_event.type != event)
1440 return 0;
1441
1442 if (counter->hw_event.exclude_user && user_mode(regs))
1443 return 0;
1444
1445 if (counter->hw_event.exclude_kernel && !user_mode(regs))
1446 return 0;
1447
1448 return 1;
1449}
1450
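/*
 * Deliver a software event to every matching counter in one context,
 * raising the counter's sample "interrupt" when its period expires.
 */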
1451static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
1452 enum hw_event_types event, u64 nr,
1453 int nmi, struct pt_regs *regs)
1454{
1455 struct perf_counter *counter;
1456 unsigned long flags;
1457 int neg;
1458
1459 if (list_empty(&ctx->counter_list))
1460 return;
1461
1462 spin_lock_irqsave(&ctx->lock, flags);
1463
1464 /*
1465 * XXX: make counter_list RCU safe
1466 */
1467 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1468 if (perf_swcounter_match(counter, event, regs)) {
1469 neg = atomic64_add_negative(nr, &counter->hw.count);
1470 if (counter->hw.irq_period && !neg)
1471 perf_swcounter_interrupt(counter, nmi, regs);
1472 }
1473 }
1474
1475 spin_unlock_irqrestore(&ctx->lock, flags);
1476}
1477
1478void perf_swcounter_event(enum hw_event_types event, u64 nr,
1479 int nmi, struct pt_regs *regs)
1480{
1481 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
1482
1483 perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
1484 if (cpuctx->task_ctx)
1485 perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
1486
1487 put_cpu_var(perf_cpu_context);
1488}
1489
1490static void perf_swcounter_read(struct perf_counter *counter)
1491{
1492 perf_swcounter_update(counter);
1493}
1494
1495static int perf_swcounter_enable(struct perf_counter *counter)
1496{
1497 perf_swcounter_set_period(counter);
1498 return 0;
1499}
1500
1501static void perf_swcounter_disable(struct perf_counter *counter)
1502{
1503 perf_swcounter_update(counter);
1504}
1505
1506/*
1507 * Software counter: cpu wall time clock
1508 */
1509
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001510static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
Ingo Molnar5c92d122008-12-11 13:21:10 +01001511{
Paul Mackerras9abf8a02009-01-09 16:26:43 +11001512 int cpu = raw_smp_processor_id();
1513
1514 atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001515 return 0;
Ingo Molnar5c92d122008-12-11 13:21:10 +01001516}
1517
Paul Mackerras9abf8a02009-01-09 16:26:43 +11001518static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1519{
1520 int cpu = raw_smp_processor_id();
1521 s64 prev;
1522 u64 now;
1523
1524 now = cpu_clock(cpu);
1525 prev = atomic64_read(&counter->hw.prev_count);
1526 atomic64_set(&counter->hw.prev_count, now);
1527 atomic64_add(now - prev, &counter->count);
1528}
1529
Ingo Molnar5c92d122008-12-11 13:21:10 +01001530static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1531{
Paul Mackerras9abf8a02009-01-09 16:26:43 +11001532 cpu_clock_perf_counter_update(counter);
Ingo Molnar5c92d122008-12-11 13:21:10 +01001533}
1534
1535static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1536{
Paul Mackerras9abf8a02009-01-09 16:26:43 +11001537 cpu_clock_perf_counter_update(counter);
Ingo Molnar5c92d122008-12-11 13:21:10 +01001538}
1539
1540static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
Ingo Molnar76715812008-12-17 14:20:28 +01001541 .enable = cpu_clock_perf_counter_enable,
1542 .disable = cpu_clock_perf_counter_disable,
1543 .read = cpu_clock_perf_counter_read,
Ingo Molnar5c92d122008-12-11 13:21:10 +01001544};
1545
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001546/*
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001547 * Software counter: task time clock
1548 */
1549
1550/*
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001551 * Called from within the scheduler:
1552 */
1553static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
Ingo Molnarbae43c92008-12-11 14:03:20 +01001554{
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001555 struct task_struct *curr = counter->task;
1556 u64 delta;
1557
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001558 delta = __task_delta_exec(curr, update);
1559
1560 return curr->se.sum_exec_runtime + delta;
1561}
1562
1563static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1564{
1565 u64 prev;
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001566 s64 delta;
Ingo Molnarbae43c92008-12-11 14:03:20 +01001567
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001568 prev = atomic64_read(&counter->hw.prev_count);
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001569
1570 atomic64_set(&counter->hw.prev_count, now);
1571
1572 delta = now - prev;
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001573
1574 atomic64_add(delta, &counter->count);
Ingo Molnarbae43c92008-12-11 14:03:20 +01001575}
1576
1577static void task_clock_perf_counter_read(struct perf_counter *counter)
1578{
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001579 u64 now = task_clock_perf_counter_val(counter, 1);
1580
1581 task_clock_perf_counter_update(counter, now);
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001582}
1583
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001584static int task_clock_perf_counter_enable(struct perf_counter *counter)
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001585{
Paul Mackerrasc07c99b2009-02-13 22:10:34 +11001586 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1587 atomic64_set(&counter->hw.prev_count,
1588 task_clock_perf_counter_val(counter, 0));
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001589
1590 return 0;
Ingo Molnar8cb391e2008-12-14 12:22:31 +01001591}
1592
1593static void task_clock_perf_counter_disable(struct perf_counter *counter)
1594{
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001595 u64 now = task_clock_perf_counter_val(counter, 0);
1596
1597 task_clock_perf_counter_update(counter, now);
Ingo Molnarbae43c92008-12-11 14:03:20 +01001598}
1599
1600static const struct hw_perf_counter_ops perf_ops_task_clock = {
Ingo Molnar76715812008-12-17 14:20:28 +01001601 .enable = task_clock_perf_counter_enable,
1602 .disable = task_clock_perf_counter_disable,
1603 .read = task_clock_perf_counter_read,
Ingo Molnarbae43c92008-12-11 14:03:20 +01001604};
1605
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001606/*
1607 * Software counter: page faults
1608 */
1609
Paul Mackerras23a185c2009-02-09 22:42:47 +11001610#ifdef CONFIG_VM_EVENT_COUNTERS
1611#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT]
1612#else
1613#define cpu_page_faults() 0
1614#endif
Ingo Molnare06c61a2008-12-14 14:44:31 +01001615
Paul Mackerras23a185c2009-02-09 22:42:47 +11001616static u64 get_page_faults(struct perf_counter *counter)
1617{
1618 struct task_struct *curr = counter->ctx->task;
1619
1620 if (curr)
1621 return curr->maj_flt + curr->min_flt;
1622 return cpu_page_faults();
Ingo Molnare06c61a2008-12-14 14:44:31 +01001623}
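
/*
 * Note: a per-task counter (ctx->task != NULL) reads the task's own
 * major+minor fault totals, while a per-cpu counter falls back to the
 * CPU-wide PGFAULT vm-event.  On kernels built without
 * CONFIG_VM_EVENT_COUNTERS the cpu_page_faults() macro above evaluates
 * to 0, so a per-cpu page-fault counter simply stays at zero there.
 */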
1624
1625static void page_faults_perf_counter_update(struct perf_counter *counter)
1626{
1627 u64 prev, now;
1628 s64 delta;
1629
1630 prev = atomic64_read(&counter->hw.prev_count);
Paul Mackerras23a185c2009-02-09 22:42:47 +11001631 now = get_page_faults(counter);
Ingo Molnare06c61a2008-12-14 14:44:31 +01001632
1633 atomic64_set(&counter->hw.prev_count, now);
1634
1635 delta = now - prev;
Ingo Molnare06c61a2008-12-14 14:44:31 +01001636
1637 atomic64_add(delta, &counter->count);
1638}
1639
1640static void page_faults_perf_counter_read(struct perf_counter *counter)
1641{
1642 page_faults_perf_counter_update(counter);
1643}
1644
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001645static int page_faults_perf_counter_enable(struct perf_counter *counter)
Ingo Molnare06c61a2008-12-14 14:44:31 +01001646{
Paul Mackerrasc07c99b2009-02-13 22:10:34 +11001647 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1648 atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001649 return 0;
Ingo Molnare06c61a2008-12-14 14:44:31 +01001650}
1651
1652static void page_faults_perf_counter_disable(struct perf_counter *counter)
1653{
1654 page_faults_perf_counter_update(counter);
1655}
1656
1657static const struct hw_perf_counter_ops perf_ops_page_faults = {
Ingo Molnar76715812008-12-17 14:20:28 +01001658 .enable = page_faults_perf_counter_enable,
1659 .disable = page_faults_perf_counter_disable,
1660 .read = page_faults_perf_counter_read,
Ingo Molnare06c61a2008-12-14 14:44:31 +01001661};
1662
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001663/*
1664 * Software counter: context switches
1665 */
1666
Paul Mackerras23a185c2009-02-09 22:42:47 +11001667static u64 get_context_switches(struct perf_counter *counter)
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001668{
Paul Mackerras23a185c2009-02-09 22:42:47 +11001669 struct task_struct *curr = counter->ctx->task;
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001670
Paul Mackerras23a185c2009-02-09 22:42:47 +11001671 if (curr)
1672 return curr->nvcsw + curr->nivcsw;
1673 return cpu_nr_switches(smp_processor_id());
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001674}
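
/*
 * Note: same per-task vs per-cpu split as the page-fault counter above:
 * nvcsw/nivcsw cover the task's voluntary and involuntary switches,
 * while cpu_nr_switches() (a scheduler export, not defined here) gives
 * the CPU-wide total.  The cpu-migration counter below is structured
 * the same way.
 */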
1675
1676static void context_switches_perf_counter_update(struct perf_counter *counter)
1677{
1678 u64 prev, now;
1679 s64 delta;
1680
1681 prev = atomic64_read(&counter->hw.prev_count);
Paul Mackerras23a185c2009-02-09 22:42:47 +11001682 now = get_context_switches(counter);
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001683
1684 atomic64_set(&counter->hw.prev_count, now);
1685
1686 delta = now - prev;
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001687
1688 atomic64_add(delta, &counter->count);
1689}
1690
1691static void context_switches_perf_counter_read(struct perf_counter *counter)
1692{
1693 context_switches_perf_counter_update(counter);
1694}
1695
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001696static int context_switches_perf_counter_enable(struct perf_counter *counter)
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001697{
Paul Mackerrasc07c99b2009-02-13 22:10:34 +11001698 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1699 atomic64_set(&counter->hw.prev_count,
1700 get_context_switches(counter));
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001701 return 0;
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001702}
1703
1704static void context_switches_perf_counter_disable(struct perf_counter *counter)
1705{
1706 context_switches_perf_counter_update(counter);
1707}
1708
1709static const struct hw_perf_counter_ops perf_ops_context_switches = {
Ingo Molnar76715812008-12-17 14:20:28 +01001710 .enable = context_switches_perf_counter_enable,
1711 .disable = context_switches_perf_counter_disable,
1712 .read = context_switches_perf_counter_read,
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001713};
1714
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001715/*
1716 * Software counter: cpu migrations
1717 */
1718
Paul Mackerras23a185c2009-02-09 22:42:47 +11001719static inline u64 get_cpu_migrations(struct perf_counter *counter)
Ingo Molnar6c594c22008-12-14 12:34:15 +01001720{
Paul Mackerras23a185c2009-02-09 22:42:47 +11001721 struct task_struct *curr = counter->ctx->task;
1722
1723 if (curr)
1724 return curr->se.nr_migrations;
1725 return cpu_nr_migrations(smp_processor_id());
Ingo Molnar6c594c22008-12-14 12:34:15 +01001726}
1727
1728static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1729{
1730 u64 prev, now;
1731 s64 delta;
1732
1733 prev = atomic64_read(&counter->hw.prev_count);
Paul Mackerras23a185c2009-02-09 22:42:47 +11001734 now = get_cpu_migrations(counter);
Ingo Molnar6c594c22008-12-14 12:34:15 +01001735
1736 atomic64_set(&counter->hw.prev_count, now);
1737
1738 delta = now - prev;
Ingo Molnar6c594c22008-12-14 12:34:15 +01001739
1740 atomic64_add(delta, &counter->count);
1741}
1742
1743static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1744{
1745 cpu_migrations_perf_counter_update(counter);
1746}
1747
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001748static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
Ingo Molnar6c594c22008-12-14 12:34:15 +01001749{
Paul Mackerrasc07c99b2009-02-13 22:10:34 +11001750 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1751 atomic64_set(&counter->hw.prev_count,
1752 get_cpu_migrations(counter));
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01001753 return 0;
Ingo Molnar6c594c22008-12-14 12:34:15 +01001754}
1755
1756static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1757{
1758 cpu_migrations_perf_counter_update(counter);
1759}
1760
1761static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
Ingo Molnar76715812008-12-17 14:20:28 +01001762 .enable = cpu_migrations_perf_counter_enable,
1763 .disable = cpu_migrations_perf_counter_disable,
1764 .read = cpu_migrations_perf_counter_read,
Ingo Molnar6c594c22008-12-14 12:34:15 +01001765};
1766
Ingo Molnar5c92d122008-12-11 13:21:10 +01001767static const struct hw_perf_counter_ops *
1768sw_perf_counter_init(struct perf_counter *counter)
1769{
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001770 struct perf_counter_hw_event *hw_event = &counter->hw_event;
Ingo Molnar5c92d122008-12-11 13:21:10 +01001771 const struct hw_perf_counter_ops *hw_ops = NULL;
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001772 struct hw_perf_counter *hwc = &counter->hw;
Ingo Molnar5c92d122008-12-11 13:21:10 +01001773
Paul Mackerras0475f9e2009-02-11 14:35:35 +11001774 /*
1775 * Software counters (currently) can't in general distinguish
1776 * between user, kernel and hypervisor events.
1777 * However, context switches and cpu migrations are considered
1778 * to be kernel events, and page faults are never hypervisor
1779 * events.
1780 */
Ingo Molnar5c92d122008-12-11 13:21:10 +01001781 switch (counter->hw_event.type) {
1782 case PERF_COUNT_CPU_CLOCK:
Paul Mackerras0475f9e2009-02-11 14:35:35 +11001783 if (!(counter->hw_event.exclude_user ||
1784 counter->hw_event.exclude_kernel ||
1785 counter->hw_event.exclude_hv))
1786 hw_ops = &perf_ops_cpu_clock;
Ingo Molnar5c92d122008-12-11 13:21:10 +01001787 break;
Ingo Molnarbae43c92008-12-11 14:03:20 +01001788 case PERF_COUNT_TASK_CLOCK:
Paul Mackerras0475f9e2009-02-11 14:35:35 +11001789 if (counter->hw_event.exclude_user ||
1790 counter->hw_event.exclude_kernel ||
1791 counter->hw_event.exclude_hv)
1792 break;
Paul Mackerras23a185c2009-02-09 22:42:47 +11001793 /*
1794 * If the user instantiates this as a per-cpu counter,
1795 * use the cpu_clock counter instead.
1796 */
1797 if (counter->ctx->task)
1798 hw_ops = &perf_ops_task_clock;
1799 else
1800 hw_ops = &perf_ops_cpu_clock;
Ingo Molnarbae43c92008-12-11 14:03:20 +01001801 break;
Ingo Molnare06c61a2008-12-14 14:44:31 +01001802 case PERF_COUNT_PAGE_FAULTS:
Paul Mackerras0475f9e2009-02-11 14:35:35 +11001803 if (!(counter->hw_event.exclude_user ||
1804 counter->hw_event.exclude_kernel))
1805 hw_ops = &perf_ops_page_faults;
Ingo Molnare06c61a2008-12-14 14:44:31 +01001806 break;
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001807 case PERF_COUNT_CONTEXT_SWITCHES:
Paul Mackerras0475f9e2009-02-11 14:35:35 +11001808 if (!counter->hw_event.exclude_kernel)
1809 hw_ops = &perf_ops_context_switches;
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01001810 break;
Ingo Molnar6c594c22008-12-14 12:34:15 +01001811 case PERF_COUNT_CPU_MIGRATIONS:
Paul Mackerras0475f9e2009-02-11 14:35:35 +11001812 if (!counter->hw_event.exclude_kernel)
1813 hw_ops = &perf_ops_cpu_migrations;
Ingo Molnar6c594c22008-12-14 12:34:15 +01001814 break;
Ingo Molnar5c92d122008-12-11 13:21:10 +01001815 default:
1816 break;
1817 }
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001818
1819 if (hw_ops)
1820 hwc->irq_period = hw_event->irq_period;
1821
Ingo Molnar5c92d122008-12-11 13:21:10 +01001822 return hw_ops;
1823}
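
/*
 * Note: when the requested exclude_* bits cannot be honoured,
 * sw_perf_counter_init() leaves hw_ops == NULL.  For example, a
 * PERF_COUNT_CONTEXT_SWITCHES request with exclude_kernel set gets no
 * ops here, so perf_counter_alloc() below fails and
 * sys_perf_counter_open() ends up returning -EINVAL.  When ops are
 * found, the requested irq_period is copied into the hw counter state.
 */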
1824
Thomas Gleixner0793a612008-12-04 20:12:29 +01001825/*
1826 * Allocate and initialize a counter structure
1827 */
1828static struct perf_counter *
Ingo Molnar04289bb2008-12-11 08:38:42 +01001829perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1830 int cpu,
Paul Mackerras23a185c2009-02-09 22:42:47 +11001831 struct perf_counter_context *ctx,
Ingo Molnar9b51f662008-12-12 13:49:45 +01001832 struct perf_counter *group_leader,
1833 gfp_t gfpflags)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001834{
Ingo Molnar5c92d122008-12-11 13:21:10 +01001835 const struct hw_perf_counter_ops *hw_ops;
Ingo Molnar621a01e2008-12-11 12:46:46 +01001836 struct perf_counter *counter;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001837
Ingo Molnar9b51f662008-12-12 13:49:45 +01001838 counter = kzalloc(sizeof(*counter), gfpflags);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001839 if (!counter)
1840 return NULL;
1841
Ingo Molnar04289bb2008-12-11 08:38:42 +01001842 /*
1843 * Single counters are their own group leaders, with an
1844 * empty sibling list:
1845 */
1846 if (!group_leader)
1847 group_leader = counter;
1848
Thomas Gleixner0793a612008-12-04 20:12:29 +01001849 mutex_init(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01001850 INIT_LIST_HEAD(&counter->list_entry);
1851 INIT_LIST_HEAD(&counter->sibling_list);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001852 init_waitqueue_head(&counter->waitq);
1853
Paul Mackerrasd859e292009-01-17 18:10:22 +11001854 INIT_LIST_HEAD(&counter->child_list);
1855
Ingo Molnar9f66a382008-12-10 12:33:23 +01001856 counter->irqdata = &counter->data[0];
1857 counter->usrdata = &counter->data[1];
1858 counter->cpu = cpu;
1859 counter->hw_event = *hw_event;
1860 counter->wakeup_pending = 0;
Ingo Molnar04289bb2008-12-11 08:38:42 +01001861 counter->group_leader = group_leader;
Ingo Molnar621a01e2008-12-11 12:46:46 +01001862 counter->hw_ops = NULL;
Paul Mackerras23a185c2009-02-09 22:42:47 +11001863 counter->ctx = ctx;
Ingo Molnar621a01e2008-12-11 12:46:46 +01001864
Ingo Molnar235c7fc2008-12-21 14:43:25 +01001865 counter->state = PERF_COUNTER_STATE_INACTIVE;
Ingo Molnara86ed502008-12-17 00:43:10 +01001866 if (hw_event->disabled)
1867 counter->state = PERF_COUNTER_STATE_OFF;
1868
Ingo Molnar5c92d122008-12-11 13:21:10 +01001869 hw_ops = NULL;
1870 if (!hw_event->raw && hw_event->type < 0)
1871 hw_ops = sw_perf_counter_init(counter);
Paul Mackerras23a185c2009-02-09 22:42:47 +11001872 else
Ingo Molnar5c92d122008-12-11 13:21:10 +01001873 hw_ops = hw_perf_counter_init(counter);
Ingo Molnar5c92d122008-12-11 13:21:10 +01001874
Ingo Molnar621a01e2008-12-11 12:46:46 +01001875 if (!hw_ops) {
1876 kfree(counter);
1877 return NULL;
1878 }
1879 counter->hw_ops = hw_ops;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001880
1881 return counter;
1882}
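
/*
 * Note: the "!hw_event->raw && hw_event->type < 0" test above routes
 * requests to sw_perf_counter_init(); in this version of the ABI the
 * generic software event types are evidently encoded as negative
 * values, while raw events and non-negative types go to the
 * architecture's hw_perf_counter_init() weak hook.  The counter starts
 * out INACTIVE, or OFF when hw_event->disabled was set.
 */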
1883
1884/**
Paul Mackerras2743a5b2009-03-04 20:36:51 +11001885 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
Ingo Molnar9f66a382008-12-10 12:33:23 +01001886 *
1887 * @hw_event_uptr: event type attributes for monitoring/sampling
Thomas Gleixner0793a612008-12-04 20:12:29 +01001888 * @pid: target pid
Ingo Molnar9f66a382008-12-10 12:33:23 +01001889 * @cpu: target cpu
1890 * @group_fd: group leader counter fd
 * @flags: currently unused, must be zero (reserved for future use)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001891 */
Paul Mackerras2743a5b2009-03-04 20:36:51 +11001892SYSCALL_DEFINE5(perf_counter_open,
Paul Mackerrasf3dfd262009-02-26 22:43:46 +11001893 const struct perf_counter_hw_event __user *, hw_event_uptr,
Paul Mackerras2743a5b2009-03-04 20:36:51 +11001894 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001895{
Ingo Molnar04289bb2008-12-11 08:38:42 +01001896 struct perf_counter *counter, *group_leader;
Ingo Molnar9f66a382008-12-10 12:33:23 +01001897 struct perf_counter_hw_event hw_event;
Ingo Molnar04289bb2008-12-11 08:38:42 +01001898 struct perf_counter_context *ctx;
Ingo Molnar9b51f662008-12-12 13:49:45 +01001899 struct file *counter_file = NULL;
Ingo Molnar04289bb2008-12-11 08:38:42 +01001900 struct file *group_file = NULL;
1901 int fput_needed = 0;
Ingo Molnar9b51f662008-12-12 13:49:45 +01001902 int fput_needed2 = 0;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001903 int ret;
1904
Paul Mackerras2743a5b2009-03-04 20:36:51 +11001905 /* for future expandability... */
1906 if (flags)
1907 return -EINVAL;
1908
Ingo Molnar9f66a382008-12-10 12:33:23 +01001909 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
Thomas Gleixnereab656a2008-12-08 19:26:59 +01001910 return -EFAULT;
1911
Ingo Molnar04289bb2008-12-11 08:38:42 +01001912 /*
Ingo Molnarccff2862008-12-11 11:26:29 +01001913 * Get the target context (task or percpu):
1914 */
1915 ctx = find_get_context(pid, cpu);
1916 if (IS_ERR(ctx))
1917 return PTR_ERR(ctx);
1918
1919 /*
1920 * Look up the group leader (we will attach this counter to it):
Ingo Molnar04289bb2008-12-11 08:38:42 +01001921 */
1922 group_leader = NULL;
1923 if (group_fd != -1) {
1924 ret = -EINVAL;
1925 group_file = fget_light(group_fd, &fput_needed);
1926 if (!group_file)
Ingo Molnarccff2862008-12-11 11:26:29 +01001927 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +01001928 if (group_file->f_op != &perf_fops)
Ingo Molnarccff2862008-12-11 11:26:29 +01001929 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +01001930
1931 group_leader = group_file->private_data;
1932 /*
Ingo Molnarccff2862008-12-11 11:26:29 +01001933 * Do not allow a recursive hierarchy (the requested group
1934 * leader must not itself be a sibling in another group):
Ingo Molnar04289bb2008-12-11 08:38:42 +01001935 */
Ingo Molnarccff2862008-12-11 11:26:29 +01001936 if (group_leader->group_leader != group_leader)
1937 goto err_put_context;
1938 /*
1939 * Do not allow to attach to a group in a different
1940 * task or CPU context:
1941 */
1942 if (group_leader->ctx != ctx)
1943 goto err_put_context;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001944 /*
1945 * Only a group leader can be exclusive or pinned
1946 */
1947 if (hw_event.exclusive || hw_event.pinned)
1948 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +01001949 }
1950
Ingo Molnar5c92d122008-12-11 13:21:10 +01001951 ret = -EINVAL;
Paul Mackerras23a185c2009-02-09 22:42:47 +11001952 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
1953 GFP_KERNEL);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001954 if (!counter)
1955 goto err_put_context;
1956
Thomas Gleixner0793a612008-12-04 20:12:29 +01001957 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1958 if (ret < 0)
Ingo Molnar9b51f662008-12-12 13:49:45 +01001959 goto err_free_put_context;
1960
1961 counter_file = fget_light(ret, &fput_needed2);
1962 if (!counter_file)
1963 goto err_free_put_context;
1964
1965 counter->filp = counter_file;
Paul Mackerrasd859e292009-01-17 18:10:22 +11001966 mutex_lock(&ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01001967 perf_install_in_context(ctx, counter, cpu);
Paul Mackerrasd859e292009-01-17 18:10:22 +11001968 mutex_unlock(&ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01001969
1970 fput_light(counter_file, fput_needed2);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001971
Ingo Molnar04289bb2008-12-11 08:38:42 +01001972out_fput:
1973 fput_light(group_file, fput_needed);
1974
Thomas Gleixner0793a612008-12-04 20:12:29 +01001975 return ret;
1976
Ingo Molnar9b51f662008-12-12 13:49:45 +01001977err_free_put_context:
Thomas Gleixner0793a612008-12-04 20:12:29 +01001978 kfree(counter);
1979
1980err_put_context:
1981 put_context(ctx);
1982
Ingo Molnar04289bb2008-12-11 08:38:42 +01001983 goto out_fput;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001984}
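
/*
 * Illustrative sketch of a minimal user-space caller (assumptions: the
 * architecture defines __NR_perf_counter_open, and <linux/perf_counter.h>
 * exposes struct perf_counter_hw_event and PERF_COUNT_TASK_CLOCK as used
 * above - adjust to the actual headers on your system):
 *
 *	#include <unistd.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <linux/perf_counter.h>
 *
 *	int open_task_clock_counter(pid_t pid)
 *	{
 *		struct perf_counter_hw_event hw_event;
 *
 *		memset(&hw_event, 0, sizeof(hw_event));
 *		hw_event.type = PERF_COUNT_TASK_CLOCK;
 *
 *		// pid selects the task, cpu == -1, no group, flags == 0
 *		return syscall(__NR_perf_counter_open, &hw_event,
 *			       pid, -1, -1, 0UL);
 *	}
 *
 * The returned fd is the one installed by anon_inode_getfd() above, and
 * it is also what a later call would pass as group_fd to attach a
 * sibling counter to the same group.
 */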
1985
Ingo Molnar9b51f662008-12-12 13:49:45 +01001986/*
1987 * Initialize the perf_counter context in a task_struct:
1988 */
1989static void
1990__perf_counter_init_context(struct perf_counter_context *ctx,
1991 struct task_struct *task)
1992{
1993 memset(ctx, 0, sizeof(*ctx));
1994 spin_lock_init(&ctx->lock);
Paul Mackerrasd859e292009-01-17 18:10:22 +11001995 mutex_init(&ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01001996 INIT_LIST_HEAD(&ctx->counter_list);
1997 ctx->task = task;
1998}
1999
2000/*
2001 * inherit a counter from parent task to child task:
2002 */
Paul Mackerrasd859e292009-01-17 18:10:22 +11002003static struct perf_counter *
Ingo Molnar9b51f662008-12-12 13:49:45 +01002004inherit_counter(struct perf_counter *parent_counter,
2005 struct task_struct *parent,
2006 struct perf_counter_context *parent_ctx,
2007 struct task_struct *child,
Paul Mackerrasd859e292009-01-17 18:10:22 +11002008 struct perf_counter *group_leader,
Ingo Molnar9b51f662008-12-12 13:49:45 +01002009 struct perf_counter_context *child_ctx)
2010{
2011 struct perf_counter *child_counter;
2012
Paul Mackerrasd859e292009-01-17 18:10:22 +11002013 /*
2014 * Instead of creating recursive hierarchies of counters,
2015 * we link inherited counters back to the original parent,
2016 * which is guaranteed to have a filp, which we use as the
2017 * reference count:
2018 */
2019 if (parent_counter->parent)
2020 parent_counter = parent_counter->parent;
2021
Ingo Molnar9b51f662008-12-12 13:49:45 +01002022 child_counter = perf_counter_alloc(&parent_counter->hw_event,
Paul Mackerras23a185c2009-02-09 22:42:47 +11002023 parent_counter->cpu, child_ctx,
2024 group_leader, GFP_KERNEL);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002025 if (!child_counter)
Paul Mackerrasd859e292009-01-17 18:10:22 +11002026 return NULL;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002027
2028 /*
2029 * Link it up in the child's context:
2030 */
Ingo Molnar9b51f662008-12-12 13:49:45 +01002031 child_counter->task = child;
2032 list_add_counter(child_counter, child_ctx);
2033 child_ctx->nr_counters++;
2034
2035 child_counter->parent = parent_counter;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002036 /*
2037 * inherit into child's child as well:
2038 */
2039 child_counter->hw_event.inherit = 1;
2040
2041 /*
2042 * Get a reference to the parent filp - we will fput it
2043 * when the child counter exits. This is safe to do because
2044 * we are in the parent and we know that the filp still
2045 * exists and has a nonzero count:
2046 */
2047 atomic_long_inc(&parent_counter->filp->f_count);
2048
Paul Mackerrasd859e292009-01-17 18:10:22 +11002049 /*
2050 * Link this into the parent counter's child list
2051 */
2052 mutex_lock(&parent_counter->mutex);
2053 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2054
2055 /*
2056 * Make the child state follow the state of the parent counter,
2057 * not its hw_event.disabled bit. We hold the parent's mutex,
2058 * so we won't race with perf_counter_{en,dis}able_family.
2059 */
2060 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2061 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2062 else
2063 child_counter->state = PERF_COUNTER_STATE_OFF;
2064
2065 mutex_unlock(&parent_counter->mutex);
2066
2067 return child_counter;
2068}
2069
2070static int inherit_group(struct perf_counter *parent_counter,
2071 struct task_struct *parent,
2072 struct perf_counter_context *parent_ctx,
2073 struct task_struct *child,
2074 struct perf_counter_context *child_ctx)
2075{
2076 struct perf_counter *leader;
2077 struct perf_counter *sub;
2078
2079 leader = inherit_counter(parent_counter, parent, parent_ctx,
2080 child, NULL, child_ctx);
2081 if (!leader)
2082 return -ENOMEM;
2083 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2084 if (!inherit_counter(sub, parent, parent_ctx,
2085 child, leader, child_ctx))
2086 return -ENOMEM;
2087 }
Ingo Molnar9b51f662008-12-12 13:49:45 +01002088 return 0;
2089}
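
/*
 * Note: a group is inherited leader-first; each sibling of the parent
 * group is then cloned with the new leader as its group_leader.  If an
 * allocation fails the function returns -ENOMEM and the caller stops
 * inheriting further counters; whatever was already linked into the
 * child context is presumably cleaned up later through the normal
 * perf_counter_exit_task() path.
 */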
2090
Paul Mackerrasd859e292009-01-17 18:10:22 +11002091static void sync_child_counter(struct perf_counter *child_counter,
2092 struct perf_counter *parent_counter)
2093{
2094 u64 parent_val, child_val;
2095
2096 parent_val = atomic64_read(&parent_counter->count);
2097 child_val = atomic64_read(&child_counter->count);
2098
2099 /*
2100 * Add back the child's count to the parent's count:
2101 */
2102 atomic64_add(child_val, &parent_counter->count);
2103
2104 /*
2105 * Remove this counter from the parent's list
2106 */
2107 mutex_lock(&parent_counter->mutex);
2108 list_del_init(&child_counter->child_list);
2109 mutex_unlock(&parent_counter->mutex);
2110
2111 /*
2112 * Release the parent counter, if this was the last
2113 * reference to it.
2114 */
2115 fput(parent_counter->filp);
2116}
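
/*
 * Note: the fput() above drops the reference that inherit_counter()
 * took on the parent's filp via
 * atomic_long_inc(&parent_counter->filp->f_count), so the parent
 * counter can finally be released once every inherited child has
 * folded its count back in.
 */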
2117
Ingo Molnar9b51f662008-12-12 13:49:45 +01002118static void
2119__perf_counter_exit_task(struct task_struct *child,
2120 struct perf_counter *child_counter,
2121 struct perf_counter_context *child_ctx)
2122{
2123 struct perf_counter *parent_counter;
Paul Mackerrasd859e292009-01-17 18:10:22 +11002124 struct perf_counter *sub, *tmp;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002125
2126 /*
Ingo Molnar235c7fc2008-12-21 14:43:25 +01002127 * If we do not self-reap then we have to wait for the
2128 * child task to unschedule (it will happen for sure),
2129 * so that its counter is at its final count. (This
2130 * condition triggers rarely - child tasks usually get
2131 * off their CPU before the parent has a chance to
2132 * get this far into the reaping action)
Ingo Molnar9b51f662008-12-12 13:49:45 +01002133 */
Ingo Molnar235c7fc2008-12-21 14:43:25 +01002134 if (child != current) {
2135 wait_task_inactive(child, 0);
2136 list_del_init(&child_counter->list_entry);
2137 } else {
Ingo Molnar0cc0c022008-12-14 23:20:36 +01002138 struct perf_cpu_context *cpuctx;
Ingo Molnar235c7fc2008-12-21 14:43:25 +01002139 unsigned long flags;
2140 u64 perf_flags;
2141
2142 /*
2143 * Disable and unlink this counter.
2144 *
2145 * Be careful about zapping the list - IRQ/NMI context
2146 * could still be processing it:
2147 */
2148 curr_rq_lock_irq_save(&flags);
2149 perf_flags = hw_perf_save_disable();
Ingo Molnar0cc0c022008-12-14 23:20:36 +01002150
2151 cpuctx = &__get_cpu_var(perf_cpu_context);
2152
Paul Mackerrasd859e292009-01-17 18:10:22 +11002153 group_sched_out(child_counter, cpuctx, child_ctx);
Ingo Molnar0cc0c022008-12-14 23:20:36 +01002154
Ingo Molnar235c7fc2008-12-21 14:43:25 +01002155 list_del_init(&child_counter->list_entry);
2156
2157 child_ctx->nr_counters--;
2158
2159 hw_perf_restore(perf_flags);
2160 curr_rq_unlock_irq_restore(&flags);
Ingo Molnar0cc0c022008-12-14 23:20:36 +01002161 }
2162
Ingo Molnar9b51f662008-12-12 13:49:45 +01002163 parent_counter = child_counter->parent;
2164 /*
2165 * It can happen that the parent exits first, and that its counters
2166 * are still around only because of the child's reference. Those
2167 * counters need to be zapped here - otherwise they would linger.
2168 */
Paul Mackerrasd859e292009-01-17 18:10:22 +11002169 if (parent_counter) {
2170 sync_child_counter(child_counter, parent_counter);
2171 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2172 list_entry) {
Paul Mackerras4bcf3492009-02-11 13:53:19 +01002173 if (sub->parent) {
Paul Mackerrasd859e292009-01-17 18:10:22 +11002174 sync_child_counter(sub, sub->parent);
Paul Mackerras4bcf3492009-02-11 13:53:19 +01002175 kfree(sub);
2176 }
Paul Mackerrasd859e292009-01-17 18:10:22 +11002177 }
Mike Galbraith65d37082009-01-29 14:06:52 +01002178 kfree(child_counter);
Paul Mackerras4bcf3492009-02-11 13:53:19 +01002179 }
Ingo Molnar9b51f662008-12-12 13:49:45 +01002180}
2181
2182/*
Paul Mackerrasd859e292009-01-17 18:10:22 +11002183 * When a child task exits, feed back counter values to parent counters.
Ingo Molnar9b51f662008-12-12 13:49:45 +01002184 *
Paul Mackerrasd859e292009-01-17 18:10:22 +11002185 * Note: we may be running in child context, but the PID is not hashed
Ingo Molnar9b51f662008-12-12 13:49:45 +01002186 * anymore so new counters will not be added.
2187 */
2188void perf_counter_exit_task(struct task_struct *child)
2189{
2190 struct perf_counter *child_counter, *tmp;
2191 struct perf_counter_context *child_ctx;
2192
2193 child_ctx = &child->perf_counter_ctx;
2194
2195 if (likely(!child_ctx->nr_counters))
2196 return;
2197
2198 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2199 list_entry)
2200 __perf_counter_exit_task(child, child_counter, child_ctx);
2201}
2202
2203/*
2204 * Initialize the perf_counter context in task_struct
2205 */
2206void perf_counter_init_task(struct task_struct *child)
2207{
2208 struct perf_counter_context *child_ctx, *parent_ctx;
Paul Mackerrasd859e292009-01-17 18:10:22 +11002209 struct perf_counter *counter;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002210 struct task_struct *parent = current;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002211
2212 child_ctx = &child->perf_counter_ctx;
2213 parent_ctx = &parent->perf_counter_ctx;
2214
2215 __perf_counter_init_context(child_ctx, child);
2216
2217 /*
2218 * This is executed from the parent task context, so inherit
2219 * counters that have been marked for cloning:
2220 */
2221
2222 if (likely(!parent_ctx->nr_counters))
2223 return;
2224
2225 /*
2226 * Lock the parent list. No need to lock the child - not PID
2227 * hashed yet and not running, so nobody can access it.
2228 */
Paul Mackerrasd859e292009-01-17 18:10:22 +11002229 mutex_lock(&parent_ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002230
2231 /*
2232 * We dont have to disable NMIs - we are only looking at
2233 * the list, not manipulating it:
2234 */
2235 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
Paul Mackerrasd859e292009-01-17 18:10:22 +11002236 if (!counter->hw_event.inherit)
Ingo Molnar9b51f662008-12-12 13:49:45 +01002237 continue;
2238
Paul Mackerrasd859e292009-01-17 18:10:22 +11002239 if (inherit_group(counter, parent,
Ingo Molnar9b51f662008-12-12 13:49:45 +01002240 parent_ctx, child, child_ctx))
2241 break;
2242 }
2243
Paul Mackerrasd859e292009-01-17 18:10:22 +11002244 mutex_unlock(&parent_ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002245}
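
/*
 * Note: this is the fork-time half of counter inheritance - it runs in
 * the parent's context and clones every counter that has
 * hw_event.inherit set into the new child's context.  The exit-time
 * half, perf_counter_exit_task() above, later feeds the child's
 * accumulated values back through sync_child_counter().
 */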
2246
Ingo Molnar04289bb2008-12-11 08:38:42 +01002247static void __cpuinit perf_counter_init_cpu(int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +01002248{
Ingo Molnar04289bb2008-12-11 08:38:42 +01002249 struct perf_cpu_context *cpuctx;
Thomas Gleixner0793a612008-12-04 20:12:29 +01002250
Ingo Molnar04289bb2008-12-11 08:38:42 +01002251 cpuctx = &per_cpu(perf_cpu_context, cpu);
2252 __perf_counter_init_context(&cpuctx->ctx, NULL);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002253
2254 mutex_lock(&perf_resource_mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01002255 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
Thomas Gleixner0793a612008-12-04 20:12:29 +01002256 mutex_unlock(&perf_resource_mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01002257
Paul Mackerras01d02872009-01-14 13:44:19 +11002258 hw_perf_counter_setup(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002259}
2260
2261#ifdef CONFIG_HOTPLUG_CPU
Ingo Molnar04289bb2008-12-11 08:38:42 +01002262static void __perf_counter_exit_cpu(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +01002263{
2264 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2265 struct perf_counter_context *ctx = &cpuctx->ctx;
2266 struct perf_counter *counter, *tmp;
2267
Ingo Molnar04289bb2008-12-11 08:38:42 +01002268 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2269 __perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002270}
Ingo Molnar04289bb2008-12-11 08:38:42 +01002271static void perf_counter_exit_cpu(int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +01002272{
Paul Mackerrasd859e292009-01-17 18:10:22 +11002273 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2274 struct perf_counter_context *ctx = &cpuctx->ctx;
2275
2276 mutex_lock(&ctx->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01002277 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
Paul Mackerrasd859e292009-01-17 18:10:22 +11002278 mutex_unlock(&ctx->mutex);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002279}
2280#else
Ingo Molnar04289bb2008-12-11 08:38:42 +01002281static inline void perf_counter_exit_cpu(int cpu) { }
Thomas Gleixner0793a612008-12-04 20:12:29 +01002282#endif
2283
2284static int __cpuinit
2285perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2286{
2287 unsigned int cpu = (long)hcpu;
2288
2289 switch (action) {
2290
2291 case CPU_UP_PREPARE:
2292 case CPU_UP_PREPARE_FROZEN:
Ingo Molnar04289bb2008-12-11 08:38:42 +01002293 perf_counter_init_cpu(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002294 break;
2295
2296 case CPU_DOWN_PREPARE:
2297 case CPU_DOWN_PREPARE_FROZEN:
Ingo Molnar04289bb2008-12-11 08:38:42 +01002298 perf_counter_exit_cpu(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002299 break;
2300
2301 default:
2302 break;
2303 }
2304
2305 return NOTIFY_OK;
2306}
2307
2308static struct notifier_block __cpuinitdata perf_cpu_nb = {
2309 .notifier_call = perf_cpu_notify,
2310};
2311
2312static int __init perf_counter_init(void)
2313{
2314 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2315 (void *)(long)smp_processor_id());
2316 register_cpu_notifier(&perf_cpu_nb);
2317
2318 return 0;
2319}
2320early_initcall(perf_counter_init);
2321
2322static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2323{
2324 return sprintf(buf, "%d\n", perf_reserved_percpu);
2325}
2326
2327static ssize_t
2328perf_set_reserve_percpu(struct sysdev_class *class,
2329 const char *buf,
2330 size_t count)
2331{
2332 struct perf_cpu_context *cpuctx;
2333 unsigned long val;
2334 int err, cpu, mpt;
2335
2336 err = strict_strtoul(buf, 10, &val);
2337 if (err)
2338 return err;
2339 if (val > perf_max_counters)
2340 return -EINVAL;
2341
2342 mutex_lock(&perf_resource_mutex);
2343 perf_reserved_percpu = val;
2344 for_each_online_cpu(cpu) {
2345 cpuctx = &per_cpu(perf_cpu_context, cpu);
2346 spin_lock_irq(&cpuctx->ctx.lock);
2347 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2348 perf_max_counters - perf_reserved_percpu);
2349 cpuctx->max_pertask = mpt;
2350 spin_unlock_irq(&cpuctx->ctx.lock);
2351 }
2352 mutex_unlock(&perf_resource_mutex);
2353
2354 return count;
2355}
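
/*
 * Worked example: with perf_max_counters == 8, writing 2 to
 * reserve_percpu gives each CPU
 *
 *	max_pertask = min(8 - nr_counters, 8 - 2);
 *
 * so a CPU that already has 3 counters installed is left with
 * min(5, 6) == 5 slots for per-task counters.
 */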
2356
2357static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2358{
2359 return sprintf(buf, "%d\n", perf_overcommit);
2360}
2361
2362static ssize_t
2363perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2364{
2365 unsigned long val;
2366 int err;
2367
2368 err = strict_strtoul(buf, 10, &val);
2369 if (err)
2370 return err;
2371 if (val > 1)
2372 return -EINVAL;
2373
2374 mutex_lock(&perf_resource_mutex);
2375 perf_overcommit = val;
2376 mutex_unlock(&perf_resource_mutex);
2377
2378 return count;
2379}
2380
2381static SYSDEV_CLASS_ATTR(
2382 reserve_percpu,
2383 0644,
2384 perf_show_reserve_percpu,
2385 perf_set_reserve_percpu
2386 );
2387
2388static SYSDEV_CLASS_ATTR(
2389 overcommit,
2390 0644,
2391 perf_show_overcommit,
2392 perf_set_overcommit
2393 );
2394
2395static struct attribute *perfclass_attrs[] = {
2396 &attr_reserve_percpu.attr,
2397 &attr_overcommit.attr,
2398 NULL
2399};
2400
2401static struct attribute_group perfclass_attr_group = {
2402 .attrs = perfclass_attrs,
2403 .name = "perf_counters",
2404};
2405
2406static int __init perf_counter_sysfs_init(void)
2407{
2408 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2409 &perfclass_attr_group);
2410}
2411device_initcall(perf_counter_sysfs_init);
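
/*
 * Usage note: the attribute group above is registered under the cpu
 * sysdev class with the name "perf_counters", so on a typical system
 * the two knobs should show up as
 *
 *	/sys/devices/system/cpu/perf_counters/reserve_percpu
 *	/sys/devices/system/cpu/perf_counters/overcommit
 *
 * e.g. "echo 2 > .../reserve_percpu".  reserve_percpu may not exceed
 * perf_max_counters and overcommit only accepts 0 or 1, as enforced by
 * the store handlers above.
 */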