Thomas Gleixner0793a612008-12-04 20:12:29 +01001/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
Peter Zijlstra7b732a72009-03-23 18:22:10 +01007 *
8 * For licensing details see kernel-base/COPYING
Thomas Gleixner0793a612008-12-04 20:12:29 +01009 */
10
11#include <linux/fs.h>
Peter Zijlstrab9cacc72009-03-25 12:30:22 +010012#include <linux/mm.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010013#include <linux/cpu.h>
14#include <linux/smp.h>
Ingo Molnar04289bb2008-12-11 08:38:42 +010015#include <linux/file.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010016#include <linux/poll.h>
17#include <linux/sysfs.h>
18#include <linux/ptrace.h>
19#include <linux/percpu.h>
Peter Zijlstrab9cacc72009-03-25 12:30:22 +010020#include <linux/vmstat.h>
21#include <linux/hardirq.h>
22#include <linux/rculist.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010023#include <linux/uaccess.h>
24#include <linux/syscalls.h>
25#include <linux/anon_inodes.h>
Ingo Molnaraa9c4c02008-12-17 14:10:57 +010026#include <linux/kernel_stat.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010027#include <linux/perf_counter.h>
Peter Zijlstra0a4a9392009-03-30 19:07:05 +020028#include <linux/dcache.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010029
Tim Blechmann4e193bd2009-03-14 14:29:25 +010030#include <asm/irq_regs.h>
31
Thomas Gleixner0793a612008-12-04 20:12:29 +010032/*
33 * Each CPU has a list of per CPU counters:
34 */
35DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
36
Ingo Molnar088e2852008-12-14 20:21:00 +010037int perf_max_counters __read_mostly = 1;
Thomas Gleixner0793a612008-12-04 20:12:29 +010038static int perf_reserved_percpu __read_mostly;
39static int perf_overcommit __read_mostly = 1;
40
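/*
 * Global reference counts of counters that requested mmap, munmap and
 * comm event tracking; the matching atomic_dec()s are in free_counter().
 */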
Peter Zijlstra9ee318a2009-04-09 10:53:44 +020041static atomic_t nr_mmap_tracking __read_mostly;
42static atomic_t nr_munmap_tracking __read_mostly;
43static atomic_t nr_comm_tracking __read_mostly;
44
Thomas Gleixner0793a612008-12-04 20:12:29 +010045/*
46 * Mutex for (sysadmin-configurable) counter reservations:
47 */
48static DEFINE_MUTEX(perf_resource_mutex);
49
50/*
51 * Architecture provided APIs - weak aliases:
52 */
Ingo Molnar5c92d122008-12-11 13:21:10 +010053extern __weak const struct hw_perf_counter_ops *
Ingo Molnar621a01e2008-12-11 12:46:46 +010054hw_perf_counter_init(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +010055{
Paul Mackerrasff6f0542009-01-09 16:19:25 +110056 return NULL;
Thomas Gleixner0793a612008-12-04 20:12:29 +010057}
58
Ingo Molnar01b28382008-12-11 13:45:51 +010059u64 __weak hw_perf_save_disable(void) { return 0; }
Yinghai Lu01ea1cc2008-12-26 21:05:06 -080060void __weak hw_perf_restore(u64 ctrl) { barrier(); }
Paul Mackerras01d02872009-01-14 13:44:19 +110061void __weak hw_perf_counter_setup(int cpu) { barrier(); }
Paul Mackerras3cbed422009-01-09 16:43:42 +110062int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
63 struct perf_cpu_context *cpuctx,
64 struct perf_counter_context *ctx, int cpu)
65{
66 return 0;
67}
Thomas Gleixner0793a612008-12-04 20:12:29 +010068
Paul Mackerras4eb96fc2009-01-09 17:24:34 +110069void __weak perf_counter_print_debug(void) { }
70
Ingo Molnar04289bb2008-12-11 08:38:42 +010071static void
72list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
73{
74 struct perf_counter *group_leader = counter->group_leader;
75
76 /*
77 * Depending on whether it is a standalone or sibling counter,
78 * add it straight to the context's counter list, or to the group
79 * leader's sibling list:
80 */
81 if (counter->group_leader == counter)
82 list_add_tail(&counter->list_entry, &ctx->counter_list);
Peter Zijlstra5c148192009-03-25 12:30:23 +010083 else {
Ingo Molnar04289bb2008-12-11 08:38:42 +010084 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
Peter Zijlstra5c148192009-03-25 12:30:23 +010085 group_leader->nr_siblings++;
86 }
Peter Zijlstra592903c2009-03-13 12:21:36 +010087
88 list_add_rcu(&counter->event_entry, &ctx->event_list);
Ingo Molnar04289bb2008-12-11 08:38:42 +010089}
90
91static void
92list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
93{
94 struct perf_counter *sibling, *tmp;
95
96 list_del_init(&counter->list_entry);
Peter Zijlstra592903c2009-03-13 12:21:36 +010097 list_del_rcu(&counter->event_entry);
Ingo Molnar04289bb2008-12-11 08:38:42 +010098
Peter Zijlstra5c148192009-03-25 12:30:23 +010099 if (counter->group_leader != counter)
100 counter->group_leader->nr_siblings--;
101
Ingo Molnar04289bb2008-12-11 08:38:42 +0100102 /*
103 * If this was a group counter with sibling counters then
104 * upgrade the siblings to singleton counters by adding them
105 * to the context list directly:
106 */
107 list_for_each_entry_safe(sibling, tmp,
108 &counter->sibling_list, list_entry) {
109
Peter Zijlstra75564232009-03-13 12:21:29 +0100110 list_move_tail(&sibling->list_entry, &ctx->counter_list);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100111 sibling->group_leader = sibling;
112 }
113}
114
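/*
 * Take an ACTIVE counter off the PMU: mark it INACTIVE, record the stop
 * timestamp, call the hw disable method and update the active-counter
 * bookkeeping in the cpu and counter contexts.
 */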
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100115static void
116counter_sched_out(struct perf_counter *counter,
117 struct perf_cpu_context *cpuctx,
118 struct perf_counter_context *ctx)
119{
120 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
121 return;
122
123 counter->state = PERF_COUNTER_STATE_INACTIVE;
Peter Zijlstra4af49982009-04-06 11:45:10 +0200124 counter->tstamp_stopped = ctx->time;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100125 counter->hw_ops->disable(counter);
126 counter->oncpu = -1;
127
128 if (!is_software_counter(counter))
129 cpuctx->active_oncpu--;
130 ctx->nr_active--;
131 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
132 cpuctx->exclusive = 0;
133}
134
Paul Mackerrasd859e292009-01-17 18:10:22 +1100135static void
136group_sched_out(struct perf_counter *group_counter,
137 struct perf_cpu_context *cpuctx,
138 struct perf_counter_context *ctx)
139{
140 struct perf_counter *counter;
141
142 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
143 return;
144
145 counter_sched_out(group_counter, cpuctx, ctx);
146
147 /*
148 * Schedule out siblings (if any):
149 */
150 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
151 counter_sched_out(counter, cpuctx, ctx);
152
153 if (group_counter->hw_event.exclusive)
154 cpuctx->exclusive = 0;
155}
156
Thomas Gleixner0793a612008-12-04 20:12:29 +0100157/*
158 * Cross CPU call to remove a performance counter
159 *
160 * We disable the counter on the hardware level first. After that we
161 * remove it from the context list.
162 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100163static void __perf_counter_remove_from_context(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100164{
165 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
166 struct perf_counter *counter = info;
167 struct perf_counter_context *ctx = counter->ctx;
Ingo Molnar9b51f662008-12-12 13:49:45 +0100168 unsigned long flags;
Ingo Molnar5c92d122008-12-11 13:21:10 +0100169 u64 perf_flags;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100170
171 /*
172 * If this is a task context, we need to check whether it is
173 * the current task context of this cpu. If not it has been
174 * scheduled out before the smp call arrived.
175 */
176 if (ctx->task && cpuctx->task_ctx != ctx)
177 return;
178
Peter Zijlstra849691a2009-04-06 11:45:12 +0200179 spin_lock_irqsave(&ctx->lock, flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100180
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100181 counter_sched_out(counter, cpuctx, ctx);
182
183 counter->task = NULL;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100184 ctx->nr_counters--;
185
186 /*
187 * Protect the list operation against NMI by disabling the
	188 * counters on a global level. NOP for non-NMI based counters.
189 */
Ingo Molnar01b28382008-12-11 13:45:51 +0100190 perf_flags = hw_perf_save_disable();
Ingo Molnar04289bb2008-12-11 08:38:42 +0100191 list_del_counter(counter, ctx);
Ingo Molnar01b28382008-12-11 13:45:51 +0100192 hw_perf_restore(perf_flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100193
194 if (!ctx->task) {
195 /*
196 * Allow more per task counters with respect to the
197 * reservation:
198 */
199 cpuctx->max_pertask =
200 min(perf_max_counters - ctx->nr_counters,
201 perf_max_counters - perf_reserved_percpu);
202 }
203
Peter Zijlstra849691a2009-04-06 11:45:12 +0200204 spin_unlock_irqrestore(&ctx->lock, flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100205}
206
207
208/*
209 * Remove the counter from a task's (or a CPU's) list of counters.
210 *
Paul Mackerrasd859e292009-01-17 18:10:22 +1100211 * Must be called with counter->mutex and ctx->mutex held.
Thomas Gleixner0793a612008-12-04 20:12:29 +0100212 *
213 * CPU counters are removed with a smp call. For task counters we only
214 * call when the task is on a CPU.
215 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100216static void perf_counter_remove_from_context(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100217{
218 struct perf_counter_context *ctx = counter->ctx;
219 struct task_struct *task = ctx->task;
220
221 if (!task) {
222 /*
223 * Per cpu counters are removed via an smp call and
	224 * the removal is always successful.
225 */
226 smp_call_function_single(counter->cpu,
Ingo Molnar04289bb2008-12-11 08:38:42 +0100227 __perf_counter_remove_from_context,
Thomas Gleixner0793a612008-12-04 20:12:29 +0100228 counter, 1);
229 return;
230 }
231
232retry:
Ingo Molnar04289bb2008-12-11 08:38:42 +0100233 task_oncpu_function_call(task, __perf_counter_remove_from_context,
Thomas Gleixner0793a612008-12-04 20:12:29 +0100234 counter);
235
236 spin_lock_irq(&ctx->lock);
237 /*
238 * If the context is active we need to retry the smp call.
239 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100240 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100241 spin_unlock_irq(&ctx->lock);
242 goto retry;
243 }
244
245 /*
	246 * The lock prevents this context from being scheduled in, so we
Ingo Molnar04289bb2008-12-11 08:38:42 +0100247 * can remove the counter safely if the call above did not
Thomas Gleixner0793a612008-12-04 20:12:29 +0100248 * succeed.
249 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100250 if (!list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100251 ctx->nr_counters--;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100252 list_del_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100253 counter->task = NULL;
254 }
255 spin_unlock_irq(&ctx->lock);
256}
257
Peter Zijlstra4af49982009-04-06 11:45:10 +0200258static inline u64 perf_clock(void)
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100259{
Peter Zijlstra4af49982009-04-06 11:45:10 +0200260 return cpu_clock(smp_processor_id());
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100261}
262
263/*
264 * Update the record of the current time in a context.
265 */
Peter Zijlstra4af49982009-04-06 11:45:10 +0200266static void update_context_time(struct perf_counter_context *ctx)
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100267{
Peter Zijlstra4af49982009-04-06 11:45:10 +0200268 u64 now = perf_clock();
269
270 ctx->time += now - ctx->timestamp;
271 ctx->timestamp = now;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100272}
273
274/*
275 * Update the total_time_enabled and total_time_running fields for a counter.
276 */
277static void update_counter_times(struct perf_counter *counter)
278{
279 struct perf_counter_context *ctx = counter->ctx;
280 u64 run_end;
281
Peter Zijlstra4af49982009-04-06 11:45:10 +0200282 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
283 return;
284
285 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
286
287 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
288 run_end = counter->tstamp_stopped;
289 else
290 run_end = ctx->time;
291
292 counter->total_time_running = run_end - counter->tstamp_running;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100293}
294
295/*
296 * Update total_time_enabled and total_time_running for all counters in a group.
297 */
298static void update_group_times(struct perf_counter *leader)
299{
300 struct perf_counter *counter;
301
302 update_counter_times(leader);
303 list_for_each_entry(counter, &leader->sibling_list, list_entry)
304 update_counter_times(counter);
305}
306
307/*
Paul Mackerrasd859e292009-01-17 18:10:22 +1100308 * Cross CPU call to disable a performance counter
309 */
310static void __perf_counter_disable(void *info)
311{
312 struct perf_counter *counter = info;
313 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
314 struct perf_counter_context *ctx = counter->ctx;
315 unsigned long flags;
316
317 /*
318 * If this is a per-task counter, need to check whether this
319 * counter's task is the current task on this cpu.
320 */
321 if (ctx->task && cpuctx->task_ctx != ctx)
322 return;
323
Peter Zijlstra849691a2009-04-06 11:45:12 +0200324 spin_lock_irqsave(&ctx->lock, flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100325
326 /*
327 * If the counter is on, turn it off.
328 * If it is in error state, leave it in error state.
329 */
330 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
Peter Zijlstra4af49982009-04-06 11:45:10 +0200331 update_context_time(ctx);
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100332 update_counter_times(counter);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100333 if (counter == counter->group_leader)
334 group_sched_out(counter, cpuctx, ctx);
335 else
336 counter_sched_out(counter, cpuctx, ctx);
337 counter->state = PERF_COUNTER_STATE_OFF;
338 }
339
Peter Zijlstra849691a2009-04-06 11:45:12 +0200340 spin_unlock_irqrestore(&ctx->lock, flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100341}
342
343/*
344 * Disable a counter.
345 */
346static void perf_counter_disable(struct perf_counter *counter)
347{
348 struct perf_counter_context *ctx = counter->ctx;
349 struct task_struct *task = ctx->task;
350
351 if (!task) {
352 /*
353 * Disable the counter on the cpu that it's on
354 */
355 smp_call_function_single(counter->cpu, __perf_counter_disable,
356 counter, 1);
357 return;
358 }
359
360 retry:
361 task_oncpu_function_call(task, __perf_counter_disable, counter);
362
363 spin_lock_irq(&ctx->lock);
364 /*
365 * If the counter is still active, we need to retry the cross-call.
366 */
367 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
368 spin_unlock_irq(&ctx->lock);
369 goto retry;
370 }
371
372 /*
373 * Since we have the lock this context can't be scheduled
374 * in, so we can change the state safely.
375 */
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100376 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
377 update_counter_times(counter);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100378 counter->state = PERF_COUNTER_STATE_OFF;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100379 }
Paul Mackerrasd859e292009-01-17 18:10:22 +1100380
381 spin_unlock_irq(&ctx->lock);
382}
383
384/*
385 * Disable a counter and all its children.
386 */
387static void perf_counter_disable_family(struct perf_counter *counter)
388{
389 struct perf_counter *child;
390
391 perf_counter_disable(counter);
392
393 /*
394 * Lock the mutex to protect the list of children
395 */
396 mutex_lock(&counter->mutex);
397 list_for_each_entry(child, &counter->child_list, child_list)
398 perf_counter_disable(child);
399 mutex_unlock(&counter->mutex);
400}
401
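/*
 * Put a single counter on the PMU.  Returns 0 on success, or -EAGAIN and
 * leaves the counter INACTIVE if the hw enable method refuses it.
 */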
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100402static int
403counter_sched_in(struct perf_counter *counter,
404 struct perf_cpu_context *cpuctx,
405 struct perf_counter_context *ctx,
406 int cpu)
407{
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100408 if (counter->state <= PERF_COUNTER_STATE_OFF)
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100409 return 0;
410
411 counter->state = PERF_COUNTER_STATE_ACTIVE;
412 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
413 /*
414 * The new state must be visible before we turn it on in the hardware:
415 */
416 smp_wmb();
417
418 if (counter->hw_ops->enable(counter)) {
419 counter->state = PERF_COUNTER_STATE_INACTIVE;
420 counter->oncpu = -1;
421 return -EAGAIN;
422 }
423
Peter Zijlstra4af49982009-04-06 11:45:10 +0200424 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100425
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100426 if (!is_software_counter(counter))
427 cpuctx->active_oncpu++;
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100428 ctx->nr_active++;
429
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100430 if (counter->hw_event.exclusive)
431 cpuctx->exclusive = 1;
432
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100433 return 0;
434}
435
Thomas Gleixner0793a612008-12-04 20:12:29 +0100436/*
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100437 * Return 1 for a group consisting entirely of software counters,
438 * 0 if the group contains any hardware counters.
439 */
440static int is_software_only_group(struct perf_counter *leader)
441{
442 struct perf_counter *counter;
443
444 if (!is_software_counter(leader))
445 return 0;
Peter Zijlstra5c148192009-03-25 12:30:23 +0100446
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100447 list_for_each_entry(counter, &leader->sibling_list, list_entry)
448 if (!is_software_counter(counter))
449 return 0;
Peter Zijlstra5c148192009-03-25 12:30:23 +0100450
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100451 return 1;
452}
453
454/*
455 * Work out whether we can put this counter group on the CPU now.
456 */
457static int group_can_go_on(struct perf_counter *counter,
458 struct perf_cpu_context *cpuctx,
459 int can_add_hw)
460{
461 /*
462 * Groups consisting entirely of software counters can always go on.
463 */
464 if (is_software_only_group(counter))
465 return 1;
466 /*
467 * If an exclusive group is already on, no other hardware
468 * counters can go on.
469 */
470 if (cpuctx->exclusive)
471 return 0;
472 /*
473 * If this group is exclusive and there are already
474 * counters on the CPU, it can't go on.
475 */
476 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
477 return 0;
478 /*
479 * Otherwise, try to add it if all previous groups were able
480 * to go on.
481 */
482 return can_add_hw;
483}
484
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100485static void add_counter_to_ctx(struct perf_counter *counter,
486 struct perf_counter_context *ctx)
487{
488 list_add_counter(counter, ctx);
489 ctx->nr_counters++;
490 counter->prev_state = PERF_COUNTER_STATE_OFF;
Peter Zijlstra4af49982009-04-06 11:45:10 +0200491 counter->tstamp_enabled = ctx->time;
492 counter->tstamp_running = ctx->time;
493 counter->tstamp_stopped = ctx->time;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100494}
495
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100496/*
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100497 * Cross CPU call to install and enable a performance counter
Thomas Gleixner0793a612008-12-04 20:12:29 +0100498 */
499static void __perf_install_in_context(void *info)
500{
501 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
502 struct perf_counter *counter = info;
503 struct perf_counter_context *ctx = counter->ctx;
Paul Mackerrasd859e292009-01-17 18:10:22 +1100504 struct perf_counter *leader = counter->group_leader;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100505 int cpu = smp_processor_id();
Ingo Molnar9b51f662008-12-12 13:49:45 +0100506 unsigned long flags;
Ingo Molnar5c92d122008-12-11 13:21:10 +0100507 u64 perf_flags;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100508 int err;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100509
510 /*
511 * If this is a task context, we need to check whether it is
512 * the current task context of this cpu. If not it has been
513 * scheduled out before the smp call arrived.
514 */
515 if (ctx->task && cpuctx->task_ctx != ctx)
516 return;
517
Peter Zijlstra849691a2009-04-06 11:45:12 +0200518 spin_lock_irqsave(&ctx->lock, flags);
Peter Zijlstra4af49982009-04-06 11:45:10 +0200519 update_context_time(ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100520
521 /*
522 * Protect the list operation against NMI by disabling the
	523 * counters on a global level. NOP for non-NMI based counters.
524 */
Ingo Molnar01b28382008-12-11 13:45:51 +0100525 perf_flags = hw_perf_save_disable();
Thomas Gleixner0793a612008-12-04 20:12:29 +0100526
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100527 add_counter_to_ctx(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100528
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100529 /*
Paul Mackerrasd859e292009-01-17 18:10:22 +1100530 * Don't put the counter on if it is disabled or if
531 * it is in a group and the group isn't on.
532 */
533 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
534 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
535 goto unlock;
536
537 /*
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100538 * An exclusive counter can't go on if there are already active
539 * hardware counters, and no hardware counter can go on if there
540 * is already an exclusive counter on.
541 */
Paul Mackerrasd859e292009-01-17 18:10:22 +1100542 if (!group_can_go_on(counter, cpuctx, 1))
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100543 err = -EEXIST;
544 else
545 err = counter_sched_in(counter, cpuctx, ctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100546
Paul Mackerrasd859e292009-01-17 18:10:22 +1100547 if (err) {
548 /*
549 * This counter couldn't go on. If it is in a group
550 * then we have to pull the whole group off.
551 * If the counter group is pinned then put it in error state.
552 */
553 if (leader != counter)
554 group_sched_out(leader, cpuctx, ctx);
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100555 if (leader->hw_event.pinned) {
556 update_group_times(leader);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100557 leader->state = PERF_COUNTER_STATE_ERROR;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100558 }
Paul Mackerrasd859e292009-01-17 18:10:22 +1100559 }
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100560
561 if (!err && !ctx->task && cpuctx->max_pertask)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100562 cpuctx->max_pertask--;
563
Paul Mackerrasd859e292009-01-17 18:10:22 +1100564 unlock:
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100565 hw_perf_restore(perf_flags);
566
Peter Zijlstra849691a2009-04-06 11:45:12 +0200567 spin_unlock_irqrestore(&ctx->lock, flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100568}
569
570/*
571 * Attach a performance counter to a context
572 *
573 * First we add the counter to the list with the hardware enable bit
574 * in counter->hw_config cleared.
575 *
576 * If the counter is attached to a task which is on a CPU we use a smp
577 * call to enable it in the task context. The task might have been
578 * scheduled away, but we check this in the smp call again.
Paul Mackerrasd859e292009-01-17 18:10:22 +1100579 *
580 * Must be called with ctx->mutex held.
Thomas Gleixner0793a612008-12-04 20:12:29 +0100581 */
582static void
583perf_install_in_context(struct perf_counter_context *ctx,
584 struct perf_counter *counter,
585 int cpu)
586{
587 struct task_struct *task = ctx->task;
588
Thomas Gleixner0793a612008-12-04 20:12:29 +0100589 if (!task) {
590 /*
591 * Per cpu counters are installed via an smp call and
	592 * the install is always successful.
593 */
594 smp_call_function_single(cpu, __perf_install_in_context,
595 counter, 1);
596 return;
597 }
598
599 counter->task = task;
600retry:
601 task_oncpu_function_call(task, __perf_install_in_context,
602 counter);
603
604 spin_lock_irq(&ctx->lock);
605 /*
Thomas Gleixner0793a612008-12-04 20:12:29 +0100606 * If the context is active we need to retry the smp call.
607 */
Paul Mackerrasd859e292009-01-17 18:10:22 +1100608 if (ctx->is_active && list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100609 spin_unlock_irq(&ctx->lock);
610 goto retry;
611 }
612
613 /*
	614 * The lock prevents this context from being scheduled in, so we
	615 * can add the counter safely if the call above did not
616 * succeed.
617 */
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100618 if (list_empty(&counter->list_entry))
619 add_counter_to_ctx(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100620 spin_unlock_irq(&ctx->lock);
621}
622
Paul Mackerrasd859e292009-01-17 18:10:22 +1100623/*
624 * Cross CPU call to enable a performance counter
625 */
626static void __perf_counter_enable(void *info)
Ingo Molnar04289bb2008-12-11 08:38:42 +0100627{
Paul Mackerrasd859e292009-01-17 18:10:22 +1100628 struct perf_counter *counter = info;
629 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
630 struct perf_counter_context *ctx = counter->ctx;
631 struct perf_counter *leader = counter->group_leader;
632 unsigned long flags;
633 int err;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100634
635 /*
Paul Mackerrasd859e292009-01-17 18:10:22 +1100636 * If this is a per-task counter, need to check whether this
637 * counter's task is the current task on this cpu.
Ingo Molnar04289bb2008-12-11 08:38:42 +0100638 */
Paul Mackerrasd859e292009-01-17 18:10:22 +1100639 if (ctx->task && cpuctx->task_ctx != ctx)
640 return;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100641
Peter Zijlstra849691a2009-04-06 11:45:12 +0200642 spin_lock_irqsave(&ctx->lock, flags);
Peter Zijlstra4af49982009-04-06 11:45:10 +0200643 update_context_time(ctx);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100644
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100645 counter->prev_state = counter->state;
Paul Mackerrasd859e292009-01-17 18:10:22 +1100646 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
647 goto unlock;
648 counter->state = PERF_COUNTER_STATE_INACTIVE;
Peter Zijlstra4af49982009-04-06 11:45:10 +0200649 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
Paul Mackerrasd859e292009-01-17 18:10:22 +1100650
651 /*
652 * If the counter is in a group and isn't the group leader,
653 * then don't put it on unless the group is on.
654 */
655 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
656 goto unlock;
657
658 if (!group_can_go_on(counter, cpuctx, 1))
659 err = -EEXIST;
660 else
661 err = counter_sched_in(counter, cpuctx, ctx,
662 smp_processor_id());
663
664 if (err) {
665 /*
666 * If this counter can't go on and it's part of a
667 * group, then the whole group has to come off.
668 */
669 if (leader != counter)
670 group_sched_out(leader, cpuctx, ctx);
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100671 if (leader->hw_event.pinned) {
672 update_group_times(leader);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100673 leader->state = PERF_COUNTER_STATE_ERROR;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100674 }
Paul Mackerrasd859e292009-01-17 18:10:22 +1100675 }
676
677 unlock:
Peter Zijlstra849691a2009-04-06 11:45:12 +0200678 spin_unlock_irqrestore(&ctx->lock, flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100679}
680
681/*
682 * Enable a counter.
683 */
684static void perf_counter_enable(struct perf_counter *counter)
685{
686 struct perf_counter_context *ctx = counter->ctx;
687 struct task_struct *task = ctx->task;
688
689 if (!task) {
690 /*
691 * Enable the counter on the cpu that it's on
692 */
693 smp_call_function_single(counter->cpu, __perf_counter_enable,
694 counter, 1);
695 return;
696 }
697
698 spin_lock_irq(&ctx->lock);
699 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
700 goto out;
701
702 /*
703 * If the counter is in error state, clear that first.
704 * That way, if we see the counter in error state below, we
705 * know that it has gone back into error state, as distinct
706 * from the task having been scheduled away before the
707 * cross-call arrived.
708 */
709 if (counter->state == PERF_COUNTER_STATE_ERROR)
710 counter->state = PERF_COUNTER_STATE_OFF;
711
712 retry:
713 spin_unlock_irq(&ctx->lock);
714 task_oncpu_function_call(task, __perf_counter_enable, counter);
715
716 spin_lock_irq(&ctx->lock);
717
718 /*
719 * If the context is active and the counter is still off,
720 * we need to retry the cross-call.
721 */
722 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
723 goto retry;
724
725 /*
726 * Since we have the lock this context can't be scheduled
727 * in, so we can change the state safely.
728 */
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100729 if (counter->state == PERF_COUNTER_STATE_OFF) {
Paul Mackerrasd859e292009-01-17 18:10:22 +1100730 counter->state = PERF_COUNTER_STATE_INACTIVE;
Peter Zijlstra4af49982009-04-06 11:45:10 +0200731 counter->tstamp_enabled =
732 ctx->time - counter->total_time_enabled;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100733 }
Paul Mackerrasd859e292009-01-17 18:10:22 +1100734 out:
735 spin_unlock_irq(&ctx->lock);
736}
737
Peter Zijlstra79f14642009-04-06 11:45:07 +0200738static void perf_counter_refresh(struct perf_counter *counter, int refresh)
739{
740 atomic_add(refresh, &counter->event_limit);
741 perf_counter_enable(counter);
742}
743
Paul Mackerrasd859e292009-01-17 18:10:22 +1100744/*
745 * Enable a counter and all its children.
746 */
747static void perf_counter_enable_family(struct perf_counter *counter)
748{
749 struct perf_counter *child;
750
751 perf_counter_enable(counter);
752
753 /*
754 * Lock the mutex to protect the list of children
755 */
756 mutex_lock(&counter->mutex);
757 list_for_each_entry(child, &counter->child_list, child_list)
758 perf_counter_enable(child);
759 mutex_unlock(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100760}
761
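/*
 * Deactivate every counter group in a context.  Callers run with
 * interrupts disabled (ctx->lock is taken without irqsave here), and the
 * hardware counters are kept globally disabled around the list walk.
 */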
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100762void __perf_counter_sched_out(struct perf_counter_context *ctx,
763 struct perf_cpu_context *cpuctx)
764{
765 struct perf_counter *counter;
Paul Mackerras3cbed422009-01-09 16:43:42 +1100766 u64 flags;
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100767
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100768 spin_lock(&ctx->lock);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100769 ctx->is_active = 0;
770 if (likely(!ctx->nr_counters))
771 goto out;
Peter Zijlstra4af49982009-04-06 11:45:10 +0200772 update_context_time(ctx);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100773
Paul Mackerras3cbed422009-01-09 16:43:42 +1100774 flags = hw_perf_save_disable();
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100775 if (ctx->nr_active) {
776 list_for_each_entry(counter, &ctx->counter_list, list_entry)
777 group_sched_out(counter, cpuctx, ctx);
778 }
Paul Mackerras3cbed422009-01-09 16:43:42 +1100779 hw_perf_restore(flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100780 out:
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100781 spin_unlock(&ctx->lock);
782}
783
Thomas Gleixner0793a612008-12-04 20:12:29 +0100784/*
785 * Called from scheduler to remove the counters of the current task,
786 * with interrupts disabled.
787 *
788 * We stop each counter and update the counter value in counter->count.
789 *
Ingo Molnar76715812008-12-17 14:20:28 +0100790 * This does not protect us against NMI, but disable()
Thomas Gleixner0793a612008-12-04 20:12:29 +0100791 * sets the disabled bit in the control field of counter _before_
	792 * accessing the counter control register. If an NMI hits, then it will
793 * not restart the counter.
794 */
795void perf_counter_task_sched_out(struct task_struct *task, int cpu)
796{
797 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
798 struct perf_counter_context *ctx = &task->perf_counter_ctx;
Peter Zijlstra4a0deca2009-03-19 20:26:12 +0100799 struct pt_regs *regs;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100800
801 if (likely(!cpuctx->task_ctx))
802 return;
803
Peter Zijlstrabce379b2009-04-06 11:45:13 +0200804 update_context_time(ctx);
805
Peter Zijlstra4a0deca2009-03-19 20:26:12 +0100806 regs = task_pt_regs(task);
Peter Zijlstra78f13e92009-04-08 15:01:33 +0200807 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100808 __perf_counter_sched_out(ctx, cpuctx);
809
Thomas Gleixner0793a612008-12-04 20:12:29 +0100810 cpuctx->task_ctx = NULL;
811}
812
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100813static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
Ingo Molnar04289bb2008-12-11 08:38:42 +0100814{
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100815 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100816}
817
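/*
 * Schedule in a counter group as a single unit: either every member
 * (leader and siblings) goes on, or any partially scheduled members are
 * taken off again and -EAGAIN is returned.
 */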
Ingo Molnar79958882008-12-17 08:54:56 +0100818static int
Ingo Molnar04289bb2008-12-11 08:38:42 +0100819group_sched_in(struct perf_counter *group_counter,
820 struct perf_cpu_context *cpuctx,
821 struct perf_counter_context *ctx,
822 int cpu)
823{
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100824 struct perf_counter *counter, *partial_group;
Paul Mackerras3cbed422009-01-09 16:43:42 +1100825 int ret;
826
827 if (group_counter->state == PERF_COUNTER_STATE_OFF)
828 return 0;
829
830 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
831 if (ret)
832 return ret < 0 ? ret : 0;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100833
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100834 group_counter->prev_state = group_counter->state;
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100835 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
836 return -EAGAIN;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100837
838 /*
839 * Schedule in siblings as one group (if any):
840 */
Ingo Molnar79958882008-12-17 08:54:56 +0100841 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
Paul Mackerrasc07c99b2009-02-13 22:10:34 +1100842 counter->prev_state = counter->state;
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100843 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
844 partial_group = counter;
845 goto group_error;
846 }
Ingo Molnar79958882008-12-17 08:54:56 +0100847 }
848
Paul Mackerras3cbed422009-01-09 16:43:42 +1100849 return 0;
Ingo Molnar95cdd2e2008-12-21 13:50:42 +0100850
851group_error:
852 /*
853 * Groups can be scheduled in as one unit only, so undo any
854 * partial group before returning:
855 */
856 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
857 if (counter == partial_group)
858 break;
859 counter_sched_out(counter, cpuctx, ctx);
860 }
861 counter_sched_out(group_counter, cpuctx, ctx);
862
863 return -EAGAIN;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100864}
865
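/*
 * Schedule in a context's counters: first give pinned groups their best
 * chance (marking any that still don't fit as ERROR), then fit as many
 * of the remaining groups as the hardware allows.
 */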
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100866static void
867__perf_counter_sched_in(struct perf_counter_context *ctx,
868 struct perf_cpu_context *cpuctx, int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100869{
Thomas Gleixner0793a612008-12-04 20:12:29 +0100870 struct perf_counter *counter;
Paul Mackerras3cbed422009-01-09 16:43:42 +1100871 u64 flags;
Paul Mackerrasdd0e6ba2009-01-12 15:11:00 +1100872 int can_add_hw = 1;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100873
Thomas Gleixner0793a612008-12-04 20:12:29 +0100874 spin_lock(&ctx->lock);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100875 ctx->is_active = 1;
876 if (likely(!ctx->nr_counters))
877 goto out;
878
Peter Zijlstra4af49982009-04-06 11:45:10 +0200879 ctx->timestamp = perf_clock();
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100880
Paul Mackerras3cbed422009-01-09 16:43:42 +1100881 flags = hw_perf_save_disable();
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100882
883 /*
884 * First go through the list and put on any pinned groups
885 * in order to give them the best chance of going on.
886 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100887 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100888 if (counter->state <= PERF_COUNTER_STATE_OFF ||
889 !counter->hw_event.pinned)
890 continue;
891 if (counter->cpu != -1 && counter->cpu != cpu)
892 continue;
893
894 if (group_can_go_on(counter, cpuctx, 1))
895 group_sched_in(counter, cpuctx, ctx, cpu);
896
897 /*
898 * If this pinned group hasn't been scheduled,
899 * put it in error state.
900 */
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100901 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
902 update_group_times(counter);
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100903 counter->state = PERF_COUNTER_STATE_ERROR;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100904 }
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100905 }
906
907 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
908 /*
909 * Ignore counters in OFF or ERROR state, and
910 * ignore pinned counters since we did them already.
911 */
912 if (counter->state <= PERF_COUNTER_STATE_OFF ||
913 counter->hw_event.pinned)
914 continue;
915
Ingo Molnar04289bb2008-12-11 08:38:42 +0100916 /*
917 * Listen to the 'cpu' scheduling filter constraint
918 * of counters:
919 */
Thomas Gleixner0793a612008-12-04 20:12:29 +0100920 if (counter->cpu != -1 && counter->cpu != cpu)
921 continue;
922
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100923 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
Paul Mackerrasdd0e6ba2009-01-12 15:11:00 +1100924 if (group_sched_in(counter, cpuctx, ctx, cpu))
925 can_add_hw = 0;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100926 }
Thomas Gleixner0793a612008-12-04 20:12:29 +0100927 }
Paul Mackerras3cbed422009-01-09 16:43:42 +1100928 hw_perf_restore(flags);
Paul Mackerrasd859e292009-01-17 18:10:22 +1100929 out:
Thomas Gleixner0793a612008-12-04 20:12:29 +0100930 spin_unlock(&ctx->lock);
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100931}
Ingo Molnar04289bb2008-12-11 08:38:42 +0100932
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100933/*
934 * Called from scheduler to add the counters of the current task
935 * with interrupts disabled.
936 *
937 * We restore the counter value and then enable it.
938 *
939 * This does not protect us against NMI, but enable()
940 * sets the enabled bit in the control field of counter _before_
	941 * accessing the counter control register. If an NMI hits, then it will
942 * keep the counter running.
943 */
944void perf_counter_task_sched_in(struct task_struct *task, int cpu)
945{
946 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
947 struct perf_counter_context *ctx = &task->perf_counter_ctx;
948
949 __perf_counter_sched_in(ctx, cpuctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100950 cpuctx->task_ctx = ctx;
951}
952
Ingo Molnar235c7fc2008-12-21 14:43:25 +0100953static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
954{
955 struct perf_counter_context *ctx = &cpuctx->ctx;
956
957 __perf_counter_sched_in(ctx, cpuctx, cpu);
958}
959
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100960int perf_counter_task_disable(void)
961{
962 struct task_struct *curr = current;
963 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
964 struct perf_counter *counter;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +0100965 unsigned long flags;
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100966 u64 perf_flags;
967 int cpu;
968
969 if (likely(!ctx->nr_counters))
970 return 0;
971
Peter Zijlstra849691a2009-04-06 11:45:12 +0200972 local_irq_save(flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100973 cpu = smp_processor_id();
974
975 perf_counter_task_sched_out(curr, cpu);
976
977 spin_lock(&ctx->lock);
978
979 /*
980 * Disable all the counters:
981 */
982 perf_flags = hw_perf_save_disable();
983
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100984 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100985 if (counter->state != PERF_COUNTER_STATE_ERROR) {
986 update_group_times(counter);
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100987 counter->state = PERF_COUNTER_STATE_OFF;
Paul Mackerras53cfbf52009-03-25 22:46:58 +1100988 }
Paul Mackerras3b6f9e52009-01-14 21:00:30 +1100989 }
Ingo Molnar9b51f662008-12-12 13:49:45 +0100990
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100991 hw_perf_restore(perf_flags);
992
Peter Zijlstra849691a2009-04-06 11:45:12 +0200993 spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +0100994
995 return 0;
996}
997
998int perf_counter_task_enable(void)
999{
1000 struct task_struct *curr = current;
1001 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1002 struct perf_counter *counter;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001003 unsigned long flags;
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +01001004 u64 perf_flags;
1005 int cpu;
1006
1007 if (likely(!ctx->nr_counters))
1008 return 0;
1009
Peter Zijlstra849691a2009-04-06 11:45:12 +02001010 local_irq_save(flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +01001011 cpu = smp_processor_id();
1012
Ingo Molnar235c7fc2008-12-21 14:43:25 +01001013 perf_counter_task_sched_out(curr, cpu);
1014
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +01001015 spin_lock(&ctx->lock);
1016
1017 /*
	1018 * Enable all the counters:
1019 */
1020 perf_flags = hw_perf_save_disable();
1021
1022 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001023 if (counter->state > PERF_COUNTER_STATE_OFF)
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +01001024 continue;
Ingo Molnar6a930702008-12-11 15:17:03 +01001025 counter->state = PERF_COUNTER_STATE_INACTIVE;
Peter Zijlstra4af49982009-04-06 11:45:10 +02001026 counter->tstamp_enabled =
1027 ctx->time - counter->total_time_enabled;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001028 counter->hw_event.disabled = 0;
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +01001029 }
1030 hw_perf_restore(perf_flags);
1031
1032 spin_unlock(&ctx->lock);
1033
1034 perf_counter_task_sched_in(curr, cpu);
1035
Peter Zijlstra849691a2009-04-06 11:45:12 +02001036 local_irq_restore(flags);
Ingo Molnar1d1c7dd2008-12-11 14:59:31 +01001037
1038 return 0;
1039}
1040
Ingo Molnar235c7fc2008-12-21 14:43:25 +01001041/*
1042 * Round-robin a context's counters:
1043 */
1044static void rotate_ctx(struct perf_counter_context *ctx)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001045{
Thomas Gleixner0793a612008-12-04 20:12:29 +01001046 struct perf_counter *counter;
Ingo Molnar5c92d122008-12-11 13:21:10 +01001047 u64 perf_flags;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001048
Ingo Molnar235c7fc2008-12-21 14:43:25 +01001049 if (!ctx->nr_counters)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001050 return;
1051
Thomas Gleixner0793a612008-12-04 20:12:29 +01001052 spin_lock(&ctx->lock);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001053 /*
Ingo Molnar04289bb2008-12-11 08:38:42 +01001054 * Rotate the first entry last (works just fine for group counters too):
Thomas Gleixner0793a612008-12-04 20:12:29 +01001055 */
Ingo Molnar01b28382008-12-11 13:45:51 +01001056 perf_flags = hw_perf_save_disable();
Ingo Molnar04289bb2008-12-11 08:38:42 +01001057 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Peter Zijlstra75564232009-03-13 12:21:29 +01001058 list_move_tail(&counter->list_entry, &ctx->counter_list);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001059 break;
1060 }
Ingo Molnar01b28382008-12-11 13:45:51 +01001061 hw_perf_restore(perf_flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001062
1063 spin_unlock(&ctx->lock);
Ingo Molnar235c7fc2008-12-21 14:43:25 +01001064}
Thomas Gleixner0793a612008-12-04 20:12:29 +01001065
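/*
 * Called from the scheduler tick: schedule the current task's counters
 * out, rotate the context list(s) so a different group comes first next
 * time, and schedule everything back in.
 */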
Ingo Molnar235c7fc2008-12-21 14:43:25 +01001066void perf_counter_task_tick(struct task_struct *curr, int cpu)
1067{
1068 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1069 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1070 const int rotate_percpu = 0;
1071
1072 if (rotate_percpu)
1073 perf_counter_cpu_sched_out(cpuctx);
1074 perf_counter_task_sched_out(curr, cpu);
1075
1076 if (rotate_percpu)
1077 rotate_ctx(&cpuctx->ctx);
1078 rotate_ctx(ctx);
1079
1080 if (rotate_percpu)
1081 perf_counter_cpu_sched_in(cpuctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001082 perf_counter_task_sched_in(curr, cpu);
1083}
1084
1085/*
Thomas Gleixner0793a612008-12-04 20:12:29 +01001086 * Cross CPU call to read the hardware counter
1087 */
Ingo Molnar76715812008-12-17 14:20:28 +01001088static void __read(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001089{
Ingo Molnar621a01e2008-12-11 12:46:46 +01001090 struct perf_counter *counter = info;
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001091 struct perf_counter_context *ctx = counter->ctx;
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01001092 unsigned long flags;
Ingo Molnar621a01e2008-12-11 12:46:46 +01001093
Peter Zijlstra849691a2009-04-06 11:45:12 +02001094 local_irq_save(flags);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001095 if (ctx->is_active)
Peter Zijlstra4af49982009-04-06 11:45:10 +02001096 update_context_time(ctx);
Ingo Molnar76715812008-12-17 14:20:28 +01001097 counter->hw_ops->read(counter);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001098 update_counter_times(counter);
Peter Zijlstra849691a2009-04-06 11:45:12 +02001099 local_irq_restore(flags);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001100}
1101
Ingo Molnar04289bb2008-12-11 08:38:42 +01001102static u64 perf_counter_read(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +01001103{
1104 /*
1105 * If counter is enabled and currently active on a CPU, update the
1106 * value in the counter structure:
1107 */
Ingo Molnar6a930702008-12-11 15:17:03 +01001108 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
Thomas Gleixner0793a612008-12-04 20:12:29 +01001109 smp_call_function_single(counter->oncpu,
Ingo Molnar76715812008-12-17 14:20:28 +01001110 __read, counter, 1);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001111 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1112 update_counter_times(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001113 }
1114
Ingo Molnaree060942008-12-13 09:00:03 +01001115 return atomic64_read(&counter->count);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001116}
1117
Thomas Gleixner0793a612008-12-04 20:12:29 +01001118static void put_context(struct perf_counter_context *ctx)
1119{
1120 if (ctx->task)
1121 put_task_struct(ctx->task);
1122}
1123
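/*
 * Look up the context to attach a new counter to.  cpu != -1 selects the
 * per-cpu context (root only, and the cpu must be online); otherwise the
 * per-task context of @pid (0 means current) is used, subject to a
 * ptrace-style permission check.  The task reference taken here is
 * dropped by put_context().
 */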
1124static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1125{
1126 struct perf_cpu_context *cpuctx;
1127 struct perf_counter_context *ctx;
1128 struct task_struct *task;
1129
1130 /*
1131 * If cpu is not a wildcard then this is a percpu counter:
1132 */
1133 if (cpu != -1) {
1134 /* Must be root to operate on a CPU counter: */
1135 if (!capable(CAP_SYS_ADMIN))
1136 return ERR_PTR(-EACCES);
1137
1138 if (cpu < 0 || cpu > num_possible_cpus())
1139 return ERR_PTR(-EINVAL);
1140
1141 /*
	1142 * We could be clever and allow attaching a counter to an
1143 * offline CPU and activate it when the CPU comes up, but
1144 * that's for later.
1145 */
1146 if (!cpu_isset(cpu, cpu_online_map))
1147 return ERR_PTR(-ENODEV);
1148
1149 cpuctx = &per_cpu(perf_cpu_context, cpu);
1150 ctx = &cpuctx->ctx;
1151
Thomas Gleixner0793a612008-12-04 20:12:29 +01001152 return ctx;
1153 }
1154
1155 rcu_read_lock();
1156 if (!pid)
1157 task = current;
1158 else
1159 task = find_task_by_vpid(pid);
1160 if (task)
1161 get_task_struct(task);
1162 rcu_read_unlock();
1163
1164 if (!task)
1165 return ERR_PTR(-ESRCH);
1166
1167 ctx = &task->perf_counter_ctx;
1168 ctx->task = task;
1169
1170 /* Reuse ptrace permission checks for now. */
1171 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1172 put_context(ctx);
1173 return ERR_PTR(-EACCES);
1174 }
1175
1176 return ctx;
1177}
1178
Peter Zijlstra592903c2009-03-13 12:21:36 +01001179static void free_counter_rcu(struct rcu_head *head)
1180{
1181 struct perf_counter *counter;
1182
1183 counter = container_of(head, struct perf_counter, rcu_head);
1184 kfree(counter);
1185}
1186
Peter Zijlstra925d5192009-03-30 19:07:02 +02001187static void perf_pending_sync(struct perf_counter *counter);
1188
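/*
 * Final teardown of a counter: flush pending work, drop the
 * mmap/munmap/comm tracking references, run the destroy callback and
 * free the structure via RCU so lockless event_list walkers stay safe.
 */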
Peter Zijlstraf1600952009-03-19 20:26:16 +01001189static void free_counter(struct perf_counter *counter)
1190{
Peter Zijlstra925d5192009-03-30 19:07:02 +02001191 perf_pending_sync(counter);
1192
Peter Zijlstra9ee318a2009-04-09 10:53:44 +02001193 if (counter->hw_event.mmap)
1194 atomic_dec(&nr_mmap_tracking);
1195 if (counter->hw_event.munmap)
1196 atomic_dec(&nr_munmap_tracking);
1197 if (counter->hw_event.comm)
1198 atomic_dec(&nr_comm_tracking);
1199
Peter Zijlstrae077df42009-03-19 20:26:17 +01001200 if (counter->destroy)
1201 counter->destroy(counter);
1202
Peter Zijlstraf1600952009-03-19 20:26:16 +01001203 call_rcu(&counter->rcu_head, free_counter_rcu);
1204}
1205
Thomas Gleixner0793a612008-12-04 20:12:29 +01001206/*
1207 * Called when the last reference to the file is gone.
1208 */
1209static int perf_release(struct inode *inode, struct file *file)
1210{
1211 struct perf_counter *counter = file->private_data;
1212 struct perf_counter_context *ctx = counter->ctx;
1213
1214 file->private_data = NULL;
1215
Paul Mackerrasd859e292009-01-17 18:10:22 +11001216 mutex_lock(&ctx->mutex);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001217 mutex_lock(&counter->mutex);
1218
Ingo Molnar04289bb2008-12-11 08:38:42 +01001219 perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001220
1221 mutex_unlock(&counter->mutex);
Paul Mackerrasd859e292009-01-17 18:10:22 +11001222 mutex_unlock(&ctx->mutex);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001223
Peter Zijlstraf1600952009-03-19 20:26:16 +01001224 free_counter(counter);
Mike Galbraith5af75912009-02-11 10:53:37 +01001225 put_context(ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001226
1227 return 0;
1228}
1229
1230/*
1231 * Read the performance counter - simple non blocking version for now
1232 */
1233static ssize_t
1234perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1235{
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001236 u64 values[3];
1237 int n;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001238
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11001239 /*
1240 * Return end-of-file for a read on a counter that is in
1241 * error state (i.e. because it was pinned but it couldn't be
1242 * scheduled on to the CPU at some point).
1243 */
1244 if (counter->state == PERF_COUNTER_STATE_ERROR)
1245 return 0;
1246
Thomas Gleixner0793a612008-12-04 20:12:29 +01001247 mutex_lock(&counter->mutex);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001248 values[0] = perf_counter_read(counter);
1249 n = 1;
1250 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1251 values[n++] = counter->total_time_enabled +
1252 atomic64_read(&counter->child_total_time_enabled);
1253 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1254 values[n++] = counter->total_time_running +
1255 atomic64_read(&counter->child_total_time_running);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001256 mutex_unlock(&counter->mutex);
1257
Paul Mackerras53cfbf52009-03-25 22:46:58 +11001258 if (count < n * sizeof(u64))
1259 return -EINVAL;
1260 count = n * sizeof(u64);
1261
1262 if (copy_to_user(buf, values, count))
1263 return -EFAULT;
1264
1265 return count;
Thomas Gleixner0793a612008-12-04 20:12:29 +01001266}
1267
1268static ssize_t
Thomas Gleixner0793a612008-12-04 20:12:29 +01001269perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1270{
1271 struct perf_counter *counter = file->private_data;
1272
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001273 return perf_read_hw(counter, buf, count);
Thomas Gleixner0793a612008-12-04 20:12:29 +01001274}
1275
1276static unsigned int perf_poll(struct file *file, poll_table *wait)
1277{
1278 struct perf_counter *counter = file->private_data;
Peter Zijlstrac7138f32009-03-24 13:18:16 +01001279 struct perf_mmap_data *data;
1280 unsigned int events;
1281
1282 rcu_read_lock();
1283 data = rcu_dereference(counter->data);
1284 if (data)
1285 events = atomic_xchg(&data->wakeup, 0);
1286 else
	1287 events = POLLHUP;
1288 rcu_read_unlock();
Thomas Gleixner0793a612008-12-04 20:12:29 +01001289
1290 poll_wait(file, &counter->waitq, wait);
1291
Thomas Gleixner0793a612008-12-04 20:12:29 +01001292 return events;
1293}
1294
Paul Mackerrasd859e292009-01-17 18:10:22 +11001295static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1296{
1297 struct perf_counter *counter = file->private_data;
1298 int err = 0;
1299
1300 switch (cmd) {
1301 case PERF_COUNTER_IOC_ENABLE:
1302 perf_counter_enable_family(counter);
1303 break;
1304 case PERF_COUNTER_IOC_DISABLE:
1305 perf_counter_disable_family(counter);
1306 break;
Peter Zijlstra79f14642009-04-06 11:45:07 +02001307 case PERF_COUNTER_IOC_REFRESH:
1308 perf_counter_refresh(counter, arg);
1309 break;
Paul Mackerrasd859e292009-01-17 18:10:22 +11001310 default:
1311 err = -ENOTTY;
1312 }
1313 return err;
1314}
1315
Peter Zijlstra38ff6672009-03-30 19:07:03 +02001316/*
1317 * Callers need to ensure there can be no nesting of this function, otherwise
1318 * the seqlock logic goes bad. We can not serialize this because the arch
1319 * code calls this from NMI context.
1320 */
1321void perf_counter_update_userpage(struct perf_counter *counter)
Paul Mackerras37d81822009-03-23 18:22:08 +01001322{
Peter Zijlstra38ff6672009-03-30 19:07:03 +02001323 struct perf_mmap_data *data;
1324 struct perf_counter_mmap_page *userpg;
1325
1326 rcu_read_lock();
1327 data = rcu_dereference(counter->data);
1328 if (!data)
1329 goto unlock;
1330
1331 userpg = data->user_page;
Paul Mackerras37d81822009-03-23 18:22:08 +01001332
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001333 /*
	1334 * Disable preemption so as not to let the corresponding user-space
1335 * spin too long if we get preempted.
1336 */
1337 preempt_disable();
Paul Mackerras37d81822009-03-23 18:22:08 +01001338 ++userpg->lock;
Peter Zijlstra92f22a32009-04-02 11:12:04 +02001339 barrier();
Paul Mackerras37d81822009-03-23 18:22:08 +01001340 userpg->index = counter->hw.idx;
1341 userpg->offset = atomic64_read(&counter->count);
1342 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1343 userpg->offset -= atomic64_read(&counter->hw.prev_count);
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001344
Peter Zijlstra92f22a32009-04-02 11:12:04 +02001345 barrier();
Paul Mackerras37d81822009-03-23 18:22:08 +01001346 ++userpg->lock;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001347 preempt_enable();
Peter Zijlstra38ff6672009-03-30 19:07:03 +02001348unlock:
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001349 rcu_read_unlock();
Paul Mackerras37d81822009-03-23 18:22:08 +01001350}
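
/*
 * A minimal sketch (not ABI documentation) of how a user-space reader is
 * expected to pair with the update above, using the seqlock-style ->lock
 * field; an odd value means an update is in progress:
 *
 *	do {
 *		seq = userpg->lock;
 *		barrier();
 *		index  = userpg->index;
 *		offset = userpg->offset;
 *		barrier();
 *	} while (userpg->lock != seq || (seq & 1));
 */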
1351
1352static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1353{
1354 struct perf_counter *counter = vma->vm_file->private_data;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001355 struct perf_mmap_data *data;
1356 int ret = VM_FAULT_SIGBUS;
Paul Mackerras37d81822009-03-23 18:22:08 +01001357
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001358 rcu_read_lock();
1359 data = rcu_dereference(counter->data);
1360 if (!data)
1361 goto unlock;
Paul Mackerras37d81822009-03-23 18:22:08 +01001362
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001363 if (vmf->pgoff == 0) {
1364 vmf->page = virt_to_page(data->user_page);
1365 } else {
1366 int nr = vmf->pgoff - 1;
1367
	1368 if ((unsigned)nr >= data->nr_pages)
1369 goto unlock;
1370
1371 vmf->page = virt_to_page(data->data_pages[nr]);
1372 }
Paul Mackerras37d81822009-03-23 18:22:08 +01001373 get_page(vmf->page);
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001374 ret = 0;
1375unlock:
1376 rcu_read_unlock();
1377
1378 return ret;
1379}
1380
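/*
 * Allocate the mmap buffer: one zeroed control page (user_page) plus
 * nr_pages zeroed data pages, published to readers with
 * rcu_assign_pointer().
 */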
1381static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1382{
1383 struct perf_mmap_data *data;
1384 unsigned long size;
1385 int i;
1386
1387 WARN_ON(atomic_read(&counter->mmap_count));
1388
1389 size = sizeof(struct perf_mmap_data);
1390 size += nr_pages * sizeof(void *);
1391
1392 data = kzalloc(size, GFP_KERNEL);
1393 if (!data)
1394 goto fail;
1395
1396 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1397 if (!data->user_page)
1398 goto fail_user_page;
1399
1400 for (i = 0; i < nr_pages; i++) {
1401 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1402 if (!data->data_pages[i])
1403 goto fail_data_pages;
1404 }
1405
1406 data->nr_pages = nr_pages;
1407
1408 rcu_assign_pointer(counter->data, data);
1409
Paul Mackerras37d81822009-03-23 18:22:08 +01001410 return 0;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001411
1412fail_data_pages:
1413 for (i--; i >= 0; i--)
1414 free_page((unsigned long)data->data_pages[i]);
1415
1416 free_page((unsigned long)data->user_page);
1417
1418fail_user_page:
1419 kfree(data);
1420
1421fail:
1422 return -ENOMEM;
1423}
1424
1425static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1426{
1427 struct perf_mmap_data *data = container_of(rcu_head,
1428 struct perf_mmap_data, rcu_head);
1429 int i;
1430
1431 free_page((unsigned long)data->user_page);
1432 for (i = 0; i < data->nr_pages; i++)
1433 free_page((unsigned long)data->data_pages[i]);
1434 kfree(data);
1435}
1436
1437static void perf_mmap_data_free(struct perf_counter *counter)
1438{
1439 struct perf_mmap_data *data = counter->data;
1440
1441 WARN_ON(atomic_read(&counter->mmap_count));
1442
1443 rcu_assign_pointer(counter->data, NULL);
1444 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1445}
1446
1447static void perf_mmap_open(struct vm_area_struct *vma)
1448{
1449 struct perf_counter *counter = vma->vm_file->private_data;
1450
1451 atomic_inc(&counter->mmap_count);
1452}
1453
1454static void perf_mmap_close(struct vm_area_struct *vma)
1455{
1456 struct perf_counter *counter = vma->vm_file->private_data;
1457
1458 if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1459 &counter->mmap_mutex)) {
Peter Zijlstraebb3c4c2009-04-06 11:45:05 +02001460 vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001461 perf_mmap_data_free(counter);
1462 mutex_unlock(&counter->mmap_mutex);
1463 }
Paul Mackerras37d81822009-03-23 18:22:08 +01001464}
1465
1466static struct vm_operations_struct perf_mmap_vmops = {
Peter Zijlstraebb3c4c2009-04-06 11:45:05 +02001467 .open = perf_mmap_open,
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001468 .close = perf_mmap_close,
Paul Mackerras37d81822009-03-23 18:22:08 +01001469 .fault = perf_mmap_fault,
1470};
1471
1472static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1473{
1474 struct perf_counter *counter = file->private_data;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001475 unsigned long vma_size;
1476 unsigned long nr_pages;
1477 unsigned long locked, lock_limit;
1478 int ret = 0;
Paul Mackerras37d81822009-03-23 18:22:08 +01001479
1480 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1481 return -EINVAL;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001482
1483 vma_size = vma->vm_end - vma->vm_start;
1484 nr_pages = (vma_size / PAGE_SIZE) - 1;
1485
Peter Zijlstra7730d862009-03-25 12:48:31 +01001486 /*
1487	 * If we have data pages, ensure their count is a power of two, so we
1488	 * can use bitmasks instead of modulo.
1489 */
1490 if (nr_pages != 0 && !is_power_of_2(nr_pages))
Paul Mackerras37d81822009-03-23 18:22:08 +01001491 return -EINVAL;
1492
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001493 if (vma_size != PAGE_SIZE * (1 + nr_pages))
Paul Mackerras37d81822009-03-23 18:22:08 +01001494 return -EINVAL;
1495
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001496 if (vma->vm_pgoff != 0)
1497 return -EINVAL;
Paul Mackerras37d81822009-03-23 18:22:08 +01001498
Peter Zijlstraebb3c4c2009-04-06 11:45:05 +02001499 mutex_lock(&counter->mmap_mutex);
1500 if (atomic_inc_not_zero(&counter->mmap_count)) {
1501 if (nr_pages != counter->data->nr_pages)
1502 ret = -EINVAL;
1503 goto unlock;
1504 }
1505
1506 locked = vma->vm_mm->locked_vm;
1507 locked += nr_pages + 1;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001508
1509 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1510 lock_limit >>= PAGE_SHIFT;
1511
Peter Zijlstraebb3c4c2009-04-06 11:45:05 +02001512 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1513 ret = -EPERM;
1514 goto unlock;
1515 }
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001516
1517 WARN_ON(counter->data);
1518 ret = perf_mmap_data_alloc(counter, nr_pages);
Peter Zijlstraebb3c4c2009-04-06 11:45:05 +02001519 if (ret)
1520 goto unlock;
1521
1522 atomic_set(&counter->mmap_count, 1);
1523 vma->vm_mm->locked_vm += nr_pages + 1;
1524unlock:
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001525 mutex_unlock(&counter->mmap_mutex);
Paul Mackerras37d81822009-03-23 18:22:08 +01001526
1527 vma->vm_flags &= ~VM_MAYWRITE;
1528 vma->vm_flags |= VM_RESERVED;
1529 vma->vm_ops = &perf_mmap_vmops;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001530
1531 return ret;
Paul Mackerras37d81822009-03-23 18:22:08 +01001532}
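
/*
 * Usage sketch (illustrative, not part of this file): user-space is
 * expected to map the counter fd read-only and shared, with one control
 * page plus a power-of-two number of data pages at offset 0, matching
 * the checks above.  The names below are placeholders:
 *
 *	nr_pages = 8;				// 0 or 2^n
 *	len      = (1 + nr_pages) * page_size;
 *	base     = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 *
 * Requesting PROT_WRITE trips the VM_WRITE check, and any length that
 * is not (1 + 2^n) pages, or a non-zero offset, gets -EINVAL.
 */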
1533
Peter Zijlstra3c446b3d2009-04-06 11:45:01 +02001534static int perf_fasync(int fd, struct file *filp, int on)
1535{
1536 struct perf_counter *counter = filp->private_data;
1537 struct inode *inode = filp->f_path.dentry->d_inode;
1538 int retval;
1539
1540 mutex_lock(&inode->i_mutex);
1541 retval = fasync_helper(fd, filp, on, &counter->fasync);
1542 mutex_unlock(&inode->i_mutex);
1543
1544 if (retval < 0)
1545 return retval;
1546
1547 return 0;
1548}
1549
Thomas Gleixner0793a612008-12-04 20:12:29 +01001550static const struct file_operations perf_fops = {
1551 .release = perf_release,
1552 .read = perf_read,
1553 .poll = perf_poll,
Paul Mackerrasd859e292009-01-17 18:10:22 +11001554 .unlocked_ioctl = perf_ioctl,
1555 .compat_ioctl = perf_ioctl,
Paul Mackerras37d81822009-03-23 18:22:08 +01001556 .mmap = perf_mmap,
Peter Zijlstra3c446b3d2009-04-06 11:45:01 +02001557 .fasync = perf_fasync,
Thomas Gleixner0793a612008-12-04 20:12:29 +01001558};
1559
Peter Zijlstra15dbf272009-03-13 12:21:32 +01001560/*
Peter Zijlstra925d5192009-03-30 19:07:02 +02001561 * Perf counter wakeup
1562 *
1563 * If there's data, ensure we set the poll() state and publish everything
1564 * to user-space before waking everybody up.
1565 */
1566
1567void perf_counter_wakeup(struct perf_counter *counter)
1568{
1569 struct perf_mmap_data *data;
1570
1571 rcu_read_lock();
1572 data = rcu_dereference(counter->data);
1573 if (data) {
Peter Zijlstra3c446b3d2009-04-06 11:45:01 +02001574 atomic_set(&data->wakeup, POLL_IN);
Peter Zijlstra38ff6672009-03-30 19:07:03 +02001575 /*
1576 * Ensure all data writes are issued before updating the
1577 * user-space data head information. The matching rmb()
1578 * will be in userspace after reading this value.
1579 */
1580 smp_wmb();
1581 data->user_page->data_head = atomic_read(&data->head);
Peter Zijlstra925d5192009-03-30 19:07:02 +02001582 }
1583 rcu_read_unlock();
1584
1585 wake_up_all(&counter->waitq);
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02001586
1587 if (counter->pending_kill) {
1588 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1589 counter->pending_kill = 0;
1590 }
Peter Zijlstra925d5192009-03-30 19:07:02 +02001591}
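
/*
 * Reader-side sketch (user-space, illustrative only): the smp_wmb()
 * above pairs with a read barrier issued by the consumer after it loads
 * ->data_head and before it reads the newly published records.  'base',
 * 'head' and rmb() stand in for whatever the tooling side uses:
 *
 *	struct perf_counter_mmap_page *pc = base;	// page 0 of the mmap
 *
 *	head = pc->data_head;
 *	rmb();				// pairs with smp_wmb() above
 *	// ... consume records up to 'head' from the data pages ...
 */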
1592
1593/*
1594 * Pending wakeups
1595 *
1596 * Handle the case where we need to wake up from NMI (or rq->lock) context.
1597 *
1598 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1599 * single linked list and use cmpxchg() to add entries lockless.
1600 */
1601
Peter Zijlstra79f14642009-04-06 11:45:07 +02001602static void perf_pending_counter(struct perf_pending_entry *entry)
1603{
1604 struct perf_counter *counter = container_of(entry,
1605 struct perf_counter, pending);
1606
1607 if (counter->pending_disable) {
1608 counter->pending_disable = 0;
1609 perf_counter_disable(counter);
1610 }
1611
1612 if (counter->pending_wakeup) {
1613 counter->pending_wakeup = 0;
1614 perf_counter_wakeup(counter);
1615 }
1616}
1617
Peter Zijlstra671dec52009-04-06 11:45:02 +02001618#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
Peter Zijlstra925d5192009-03-30 19:07:02 +02001619
Peter Zijlstra671dec52009-04-06 11:45:02 +02001620static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
Peter Zijlstra925d5192009-03-30 19:07:02 +02001621 PENDING_TAIL,
1622};
1623
Peter Zijlstra671dec52009-04-06 11:45:02 +02001624static void perf_pending_queue(struct perf_pending_entry *entry,
1625 void (*func)(struct perf_pending_entry *))
Peter Zijlstra925d5192009-03-30 19:07:02 +02001626{
Peter Zijlstra671dec52009-04-06 11:45:02 +02001627 struct perf_pending_entry **head;
Peter Zijlstra925d5192009-03-30 19:07:02 +02001628
Peter Zijlstra671dec52009-04-06 11:45:02 +02001629 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
Peter Zijlstra925d5192009-03-30 19:07:02 +02001630 return;
1631
Peter Zijlstra671dec52009-04-06 11:45:02 +02001632 entry->func = func;
1633
1634 head = &get_cpu_var(perf_pending_head);
Peter Zijlstra925d5192009-03-30 19:07:02 +02001635
1636 do {
Peter Zijlstra671dec52009-04-06 11:45:02 +02001637 entry->next = *head;
1638 } while (cmpxchg(head, entry->next, entry) != entry->next);
Peter Zijlstra925d5192009-03-30 19:07:02 +02001639
1640 set_perf_counter_pending();
1641
Peter Zijlstra671dec52009-04-06 11:45:02 +02001642 put_cpu_var(perf_pending_head);
Peter Zijlstra925d5192009-03-30 19:07:02 +02001643}
1644
1645static int __perf_pending_run(void)
1646{
Peter Zijlstra671dec52009-04-06 11:45:02 +02001647 struct perf_pending_entry *list;
Peter Zijlstra925d5192009-03-30 19:07:02 +02001648 int nr = 0;
1649
Peter Zijlstra671dec52009-04-06 11:45:02 +02001650 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
Peter Zijlstra925d5192009-03-30 19:07:02 +02001651 while (list != PENDING_TAIL) {
Peter Zijlstra671dec52009-04-06 11:45:02 +02001652 void (*func)(struct perf_pending_entry *);
1653 struct perf_pending_entry *entry = list;
Peter Zijlstra925d5192009-03-30 19:07:02 +02001654
1655 list = list->next;
1656
Peter Zijlstra671dec52009-04-06 11:45:02 +02001657 func = entry->func;
1658 entry->next = NULL;
Peter Zijlstra925d5192009-03-30 19:07:02 +02001659 /*
1660 * Ensure we observe the unqueue before we issue the wakeup,
1661 * so that we won't be waiting forever.
1662 * -- see perf_not_pending().
1663 */
1664 smp_wmb();
1665
Peter Zijlstra671dec52009-04-06 11:45:02 +02001666 func(entry);
Peter Zijlstra925d5192009-03-30 19:07:02 +02001667 nr++;
1668 }
1669
1670 return nr;
1671}
1672
1673static inline int perf_not_pending(struct perf_counter *counter)
1674{
1675 /*
1676	 * If we flush on whatever CPU we run on, there is a chance we don't
1677 * need to wait.
1678 */
1679 get_cpu();
1680 __perf_pending_run();
1681 put_cpu();
1682
1683 /*
1684 * Ensure we see the proper queue state before going to sleep
1685 * so that we do not miss the wakeup. -- see perf_pending_handle()
1686 */
1687 smp_rmb();
Peter Zijlstra671dec52009-04-06 11:45:02 +02001688 return counter->pending.next == NULL;
Peter Zijlstra925d5192009-03-30 19:07:02 +02001689}
1690
1691static void perf_pending_sync(struct perf_counter *counter)
1692{
1693 wait_event(counter->waitq, perf_not_pending(counter));
1694}
1695
1696void perf_counter_do_pending(void)
1697{
1698 __perf_pending_run();
1699}
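
/*
 * Note (sketch, per-arch details vary): set_perf_counter_pending()
 * raises an arch-specific flag or self-interrupt, and the architecture
 * is expected to call perf_counter_do_pending() soon afterwards from a
 * context where taking locks is safe again, along the lines of:
 *
 *	if (test_perf_counter_pending()) {
 *		clear_perf_counter_pending();
 *		perf_counter_do_pending();
 *	}
 *
 * test_perf_counter_pending()/clear_perf_counter_pending() are assumed
 * arch helpers here, not defined in this file.
 */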
1700
1701/*
Peter Zijlstra394ee072009-03-30 19:07:14 +02001702 * Callchain support -- arch specific
1703 */
1704
Peter Zijlstra9c03d882009-04-06 11:45:00 +02001705__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
Peter Zijlstra394ee072009-03-30 19:07:14 +02001706{
1707 return NULL;
1708}
1709
1710/*
Peter Zijlstra0322cd62009-03-19 20:26:19 +01001711 * Output
1712 */
1713
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001714struct perf_output_handle {
1715 struct perf_counter *counter;
1716 struct perf_mmap_data *data;
1717 unsigned int offset;
Peter Zijlstra63e35b22009-03-25 12:30:24 +01001718 unsigned int head;
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001719 int wakeup;
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001720 int nmi;
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02001721 int overflow;
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001722};
1723
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001724static inline void __perf_output_wakeup(struct perf_output_handle *handle)
1725{
Peter Zijlstra671dec52009-04-06 11:45:02 +02001726 if (handle->nmi) {
Peter Zijlstra79f14642009-04-06 11:45:07 +02001727 handle->counter->pending_wakeup = 1;
Peter Zijlstra671dec52009-04-06 11:45:02 +02001728 perf_pending_queue(&handle->counter->pending,
Peter Zijlstra79f14642009-04-06 11:45:07 +02001729 perf_pending_counter);
Peter Zijlstra671dec52009-04-06 11:45:02 +02001730 } else
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001731 perf_counter_wakeup(handle->counter);
1732}
1733
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001734static int perf_output_begin(struct perf_output_handle *handle,
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001735 struct perf_counter *counter, unsigned int size,
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02001736 int nmi, int overflow)
Peter Zijlstra0322cd62009-03-19 20:26:19 +01001737{
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001738 struct perf_mmap_data *data;
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001739 unsigned int offset, head;
Peter Zijlstra0322cd62009-03-19 20:26:19 +01001740
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001741 rcu_read_lock();
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001742 data = rcu_dereference(counter->data);
1743 if (!data)
1744 goto out;
Peter Zijlstra0322cd62009-03-19 20:26:19 +01001745
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02001746 handle->counter = counter;
1747 handle->nmi = nmi;
1748 handle->overflow = overflow;
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001749
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001750 if (!data->nr_pages)
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001751 goto fail;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001752
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001753 do {
1754 offset = head = atomic_read(&data->head);
Peter Zijlstrac7138f32009-03-24 13:18:16 +01001755 head += size;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001756 } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1757
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001758 handle->data = data;
1759 handle->offset = offset;
Peter Zijlstra63e35b22009-03-25 12:30:24 +01001760 handle->head = head;
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001761 handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001762
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001763 return 0;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001764
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001765fail:
1766 __perf_output_wakeup(handle);
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001767out:
1768 rcu_read_unlock();
1769
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001770 return -ENOSPC;
1771}
1772
1773static void perf_output_copy(struct perf_output_handle *handle,
1774 void *buf, unsigned int len)
1775{
1776 unsigned int pages_mask;
1777 unsigned int offset;
1778 unsigned int size;
1779 void **pages;
1780
1781 offset = handle->offset;
1782 pages_mask = handle->data->nr_pages - 1;
1783 pages = handle->data->data_pages;
1784
1785 do {
1786 unsigned int page_offset;
1787 int nr;
1788
1789 nr = (offset >> PAGE_SHIFT) & pages_mask;
1790 page_offset = offset & (PAGE_SIZE - 1);
1791 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1792
1793 memcpy(pages[nr] + page_offset, buf, size);
1794
1795 len -= size;
1796 buf += size;
1797 offset += size;
1798 } while (len);
1799
1800 handle->offset = offset;
Peter Zijlstra63e35b22009-03-25 12:30:24 +01001801
1802 WARN_ON_ONCE(handle->offset > handle->head);
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001803}
1804
Peter Zijlstra5c148192009-03-25 12:30:23 +01001805#define perf_output_put(handle, x) \
1806 perf_output_copy((handle), &(x), sizeof(x))
1807
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001808static void perf_output_end(struct perf_output_handle *handle)
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001809{
Peter Zijlstrac4578102009-04-02 11:12:01 +02001810 int wakeup_events = handle->counter->hw_event.wakeup_events;
1811
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02001812 if (handle->overflow && wakeup_events) {
Peter Zijlstrac4578102009-04-02 11:12:01 +02001813 int events = atomic_inc_return(&handle->data->events);
1814 if (events >= wakeup_events) {
1815 atomic_sub(wakeup_events, &handle->data->events);
1816 __perf_output_wakeup(handle);
1817 }
1818 } else if (handle->wakeup)
Peter Zijlstra78d613e2009-03-30 19:07:11 +02001819 __perf_output_wakeup(handle);
Peter Zijlstrab9cacc72009-03-25 12:30:22 +01001820 rcu_read_unlock();
1821}
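
/*
 * Typical producer pattern, as used by perf_counter_output() and the
 * mmap/comm event code below (sketch only): size the record, reserve
 * buffer space, emit the fields, then finish:
 *
 *	struct perf_output_handle handle;
 *
 *	if (perf_output_begin(&handle, counter, size, nmi, overflow))
 *		return;
 *	perf_output_put(&handle, header);
 *	// ... further perf_output_put()/perf_output_copy() calls ...
 *	perf_output_end(&handle);
 *
 * perf_output_begin() returns non-zero (-ENOSPC) when no buffer is
 * mapped, in which case perf_output_end() must not be called.
 */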
1822
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02001823static void perf_counter_output(struct perf_counter *counter,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02001824 int nmi, struct pt_regs *regs, u64 addr)
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001825{
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001826 int ret;
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001827 u64 record_type = counter->hw_event.record_type;
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001828 struct perf_output_handle handle;
1829 struct perf_event_header header;
1830 u64 ip;
Peter Zijlstra5c148192009-03-25 12:30:23 +01001831 struct {
Peter Zijlstraea5d20c2009-03-25 12:30:25 +01001832 u32 pid, tid;
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001833 } tid_entry;
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001834 struct {
1835 u64 event;
1836 u64 counter;
1837 } group_entry;
Peter Zijlstra394ee072009-03-30 19:07:14 +02001838 struct perf_callchain_entry *callchain = NULL;
1839 int callchain_size = 0;
Peter Zijlstra339f7c92009-04-06 11:45:06 +02001840 u64 time;
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001841
Peter Zijlstra6b6e5482009-04-08 15:01:27 +02001842 header.type = 0;
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001843 header.size = sizeof(header);
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001844
Peter Zijlstra6b6e5482009-04-08 15:01:27 +02001845 header.misc = PERF_EVENT_MISC_OVERFLOW;
1846 header.misc |= user_mode(regs) ?
Peter Zijlstra6fab0192009-04-08 15:01:26 +02001847 PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
1848
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001849 if (record_type & PERF_RECORD_IP) {
1850 ip = instruction_pointer(regs);
Peter Zijlstra6b6e5482009-04-08 15:01:27 +02001851 header.type |= PERF_RECORD_IP;
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001852 header.size += sizeof(ip);
1853 }
Peter Zijlstraea5d20c2009-03-25 12:30:25 +01001854
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001855 if (record_type & PERF_RECORD_TID) {
Peter Zijlstraea5d20c2009-03-25 12:30:25 +01001856 /* namespace issues */
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001857 tid_entry.pid = current->group_leader->pid;
1858 tid_entry.tid = current->pid;
Peter Zijlstraea5d20c2009-03-25 12:30:25 +01001859
Peter Zijlstra6b6e5482009-04-08 15:01:27 +02001860 header.type |= PERF_RECORD_TID;
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001861 header.size += sizeof(tid_entry);
1862 }
Peter Zijlstraea5d20c2009-03-25 12:30:25 +01001863
Peter Zijlstra4d855452009-04-08 15:01:32 +02001864 if (record_type & PERF_RECORD_TIME) {
1865 /*
1866 * Maybe do better on x86 and provide cpu_clock_nmi()
1867 */
1868 time = sched_clock();
1869
1870 header.type |= PERF_RECORD_TIME;
1871 header.size += sizeof(u64);
1872 }
1873
Peter Zijlstra78f13e92009-04-08 15:01:33 +02001874 if (record_type & PERF_RECORD_ADDR) {
1875 header.type |= PERF_RECORD_ADDR;
1876 header.size += sizeof(u64);
1877 }
1878
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001879 if (record_type & PERF_RECORD_GROUP) {
Peter Zijlstra6b6e5482009-04-08 15:01:27 +02001880 header.type |= PERF_RECORD_GROUP;
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001881 header.size += sizeof(u64) +
1882 counter->nr_siblings * sizeof(group_entry);
1883 }
1884
1885 if (record_type & PERF_RECORD_CALLCHAIN) {
Peter Zijlstra394ee072009-03-30 19:07:14 +02001886 callchain = perf_callchain(regs);
1887
1888 if (callchain) {
Peter Zijlstra9c03d882009-04-06 11:45:00 +02001889 callchain_size = (1 + callchain->nr) * sizeof(u64);
Peter Zijlstra394ee072009-03-30 19:07:14 +02001890
Peter Zijlstra6b6e5482009-04-08 15:01:27 +02001891 header.type |= PERF_RECORD_CALLCHAIN;
Peter Zijlstra394ee072009-03-30 19:07:14 +02001892 header.size += callchain_size;
1893 }
1894 }
1895
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02001896 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001897 if (ret)
1898 return;
Peter Zijlstraea5d20c2009-03-25 12:30:25 +01001899
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001900 perf_output_put(&handle, header);
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001901
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001902 if (record_type & PERF_RECORD_IP)
1903 perf_output_put(&handle, ip);
1904
1905 if (record_type & PERF_RECORD_TID)
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001906 perf_output_put(&handle, tid_entry);
1907
Peter Zijlstra4d855452009-04-08 15:01:32 +02001908 if (record_type & PERF_RECORD_TIME)
1909 perf_output_put(&handle, time);
1910
Peter Zijlstra78f13e92009-04-08 15:01:33 +02001911 if (record_type & PERF_RECORD_ADDR)
1912 perf_output_put(&handle, addr);
1913
Peter Zijlstra8a057d82009-04-02 11:11:59 +02001914 if (record_type & PERF_RECORD_GROUP) {
1915 struct perf_counter *leader, *sub;
1916 u64 nr = counter->nr_siblings;
1917
1918 perf_output_put(&handle, nr);
1919
1920 leader = counter->group_leader;
1921 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1922 if (sub != counter)
1923 sub->hw_ops->read(sub);
1924
1925 group_entry.event = sub->hw_event.config;
1926 group_entry.counter = atomic64_read(&sub->count);
1927
1928 perf_output_put(&handle, group_entry);
1929 }
1930 }
1931
Peter Zijlstra394ee072009-03-30 19:07:14 +02001932 if (callchain)
1933 perf_output_copy(&handle, callchain, callchain_size);
1934
Peter Zijlstra5ed00412009-03-30 19:07:12 +02001935 perf_output_end(&handle);
Peter Zijlstra7b732a72009-03-23 18:22:10 +01001936}
1937
Peter Zijlstra0322cd62009-03-19 20:26:19 +01001938/*
Peter Zijlstra8d1b2d92009-04-08 15:01:30 +02001939 * comm tracking
1940 */
1941
1942struct perf_comm_event {
1943 struct task_struct *task;
1944 char *comm;
1945 int comm_size;
1946
1947 struct {
1948 struct perf_event_header header;
1949
1950 u32 pid;
1951 u32 tid;
1952 } event;
1953};
1954
1955static void perf_counter_comm_output(struct perf_counter *counter,
1956 struct perf_comm_event *comm_event)
1957{
1958 struct perf_output_handle handle;
1959 int size = comm_event->event.header.size;
1960 int ret = perf_output_begin(&handle, counter, size, 0, 0);
1961
1962 if (ret)
1963 return;
1964
1965 perf_output_put(&handle, comm_event->event);
1966 perf_output_copy(&handle, comm_event->comm,
1967 comm_event->comm_size);
1968 perf_output_end(&handle);
1969}
1970
1971static int perf_counter_comm_match(struct perf_counter *counter,
1972 struct perf_comm_event *comm_event)
1973{
1974 if (counter->hw_event.comm &&
1975 comm_event->event.header.type == PERF_EVENT_COMM)
1976 return 1;
1977
1978 return 0;
1979}
1980
1981static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
1982 struct perf_comm_event *comm_event)
1983{
1984 struct perf_counter *counter;
1985
1986 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1987 return;
1988
1989 rcu_read_lock();
1990 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1991 if (perf_counter_comm_match(counter, comm_event))
1992 perf_counter_comm_output(counter, comm_event);
1993 }
1994 rcu_read_unlock();
1995}
1996
1997static void perf_counter_comm_event(struct perf_comm_event *comm_event)
1998{
1999 struct perf_cpu_context *cpuctx;
2000 unsigned int size;
2001 char *comm = comm_event->task->comm;
2002
Ingo Molnar888fcee2009-04-09 09:48:22 +02002003 size = ALIGN(strlen(comm)+1, sizeof(u64));
Peter Zijlstra8d1b2d92009-04-08 15:01:30 +02002004
2005 comm_event->comm = comm;
2006 comm_event->comm_size = size;
2007
2008 comm_event->event.header.size = sizeof(comm_event->event) + size;
2009
2010 cpuctx = &get_cpu_var(perf_cpu_context);
2011 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2012 put_cpu_var(perf_cpu_context);
2013
2014 perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
2015}
2016
2017void perf_counter_comm(struct task_struct *task)
2018{
Peter Zijlstra9ee318a2009-04-09 10:53:44 +02002019 struct perf_comm_event comm_event;
2020
2021 if (!atomic_read(&nr_comm_tracking))
2022 return;
2023
2024 comm_event = (struct perf_comm_event){
Peter Zijlstra8d1b2d92009-04-08 15:01:30 +02002025 .task = task,
2026 .event = {
2027 .header = { .type = PERF_EVENT_COMM, },
2028 .pid = task->group_leader->pid,
2029 .tid = task->pid,
2030 },
2031 };
2032
2033 perf_counter_comm_event(&comm_event);
2034}
2035
2036/*
Peter Zijlstra0a4a9392009-03-30 19:07:05 +02002037 * mmap tracking
2038 */
2039
2040struct perf_mmap_event {
2041 struct file *file;
2042 char *file_name;
2043 int file_size;
2044
2045 struct {
2046 struct perf_event_header header;
2047
2048 u32 pid;
2049 u32 tid;
2050 u64 start;
2051 u64 len;
2052 u64 pgoff;
2053 } event;
2054};
2055
2056static void perf_counter_mmap_output(struct perf_counter *counter,
2057 struct perf_mmap_event *mmap_event)
2058{
2059 struct perf_output_handle handle;
2060 int size = mmap_event->event.header.size;
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02002061 int ret = perf_output_begin(&handle, counter, size, 0, 0);
Peter Zijlstra0a4a9392009-03-30 19:07:05 +02002062
2063 if (ret)
2064 return;
2065
2066 perf_output_put(&handle, mmap_event->event);
2067 perf_output_copy(&handle, mmap_event->file_name,
2068 mmap_event->file_size);
Peter Zijlstra78d613e2009-03-30 19:07:11 +02002069 perf_output_end(&handle);
Peter Zijlstra0a4a9392009-03-30 19:07:05 +02002070}
2071
2072static int perf_counter_mmap_match(struct perf_counter *counter,
2073 struct perf_mmap_event *mmap_event)
2074{
2075 if (counter->hw_event.mmap &&
2076 mmap_event->event.header.type == PERF_EVENT_MMAP)
2077 return 1;
2078
2079 if (counter->hw_event.munmap &&
2080 mmap_event->event.header.type == PERF_EVENT_MUNMAP)
2081 return 1;
2082
2083 return 0;
2084}
2085
2086static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2087 struct perf_mmap_event *mmap_event)
2088{
2089 struct perf_counter *counter;
2090
2091 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2092 return;
2093
2094 rcu_read_lock();
2095 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2096 if (perf_counter_mmap_match(counter, mmap_event))
2097 perf_counter_mmap_output(counter, mmap_event);
2098 }
2099 rcu_read_unlock();
2100}
2101
2102static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2103{
2104 struct perf_cpu_context *cpuctx;
2105 struct file *file = mmap_event->file;
2106 unsigned int size;
2107 char tmp[16];
2108 char *buf = NULL;
2109 char *name;
2110
2111 if (file) {
2112 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2113 if (!buf) {
2114 name = strncpy(tmp, "//enomem", sizeof(tmp));
2115 goto got_name;
2116 }
2117 name = dentry_path(file->f_dentry, buf, PATH_MAX);
2118 if (IS_ERR(name)) {
2119 name = strncpy(tmp, "//toolong", sizeof(tmp));
2120 goto got_name;
2121 }
2122 } else {
2123 name = strncpy(tmp, "//anon", sizeof(tmp));
2124 goto got_name;
2125 }
2126
2127got_name:
Ingo Molnar888fcee2009-04-09 09:48:22 +02002128 size = ALIGN(strlen(name)+1, sizeof(u64));
Peter Zijlstra0a4a9392009-03-30 19:07:05 +02002129
2130 mmap_event->file_name = name;
2131 mmap_event->file_size = size;
2132
2133 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2134
2135 cpuctx = &get_cpu_var(perf_cpu_context);
2136 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2137 put_cpu_var(perf_cpu_context);
2138
2139 perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2140
2141 kfree(buf);
2142}
2143
2144void perf_counter_mmap(unsigned long addr, unsigned long len,
2145 unsigned long pgoff, struct file *file)
2146{
Peter Zijlstra9ee318a2009-04-09 10:53:44 +02002147 struct perf_mmap_event mmap_event;
2148
2149 if (!atomic_read(&nr_mmap_tracking))
2150 return;
2151
2152 mmap_event = (struct perf_mmap_event){
Peter Zijlstra0a4a9392009-03-30 19:07:05 +02002153 .file = file,
2154 .event = {
2155 .header = { .type = PERF_EVENT_MMAP, },
2156 .pid = current->group_leader->pid,
2157 .tid = current->pid,
2158 .start = addr,
2159 .len = len,
2160 .pgoff = pgoff,
2161 },
2162 };
2163
2164 perf_counter_mmap_event(&mmap_event);
2165}
2166
2167void perf_counter_munmap(unsigned long addr, unsigned long len,
2168 unsigned long pgoff, struct file *file)
2169{
Peter Zijlstra9ee318a2009-04-09 10:53:44 +02002170 struct perf_mmap_event mmap_event;
2171
2172 if (!atomic_read(&nr_munmap_tracking))
2173 return;
2174
2175 mmap_event = (struct perf_mmap_event){
Peter Zijlstra0a4a9392009-03-30 19:07:05 +02002176 .file = file,
2177 .event = {
2178 .header = { .type = PERF_EVENT_MUNMAP, },
2179 .pid = current->group_leader->pid,
2180 .tid = current->pid,
2181 .start = addr,
2182 .len = len,
2183 .pgoff = pgoff,
2184 },
2185 };
2186
2187 perf_counter_mmap_event(&mmap_event);
2188}
2189
2190/*
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002191 * Generic counter overflow handling.
2192 */
2193
2194int perf_counter_overflow(struct perf_counter *counter,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002195 int nmi, struct pt_regs *regs, u64 addr)
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002196{
Peter Zijlstra79f14642009-04-06 11:45:07 +02002197 int events = atomic_read(&counter->event_limit);
2198 int ret = 0;
2199
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02002200 counter->pending_kill = POLL_IN;
Peter Zijlstra79f14642009-04-06 11:45:07 +02002201 if (events && atomic_dec_and_test(&counter->event_limit)) {
2202 ret = 1;
Peter Zijlstra4c9e2542009-04-06 11:45:09 +02002203 counter->pending_kill = POLL_HUP;
Peter Zijlstra79f14642009-04-06 11:45:07 +02002204 if (nmi) {
2205 counter->pending_disable = 1;
2206 perf_pending_queue(&counter->pending,
2207 perf_pending_counter);
2208 } else
2209 perf_counter_disable(counter);
2210 }
2211
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002212 perf_counter_output(counter, nmi, regs, addr);
Peter Zijlstra79f14642009-04-06 11:45:07 +02002213 return ret;
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002214}
2215
2216/*
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002217 * Generic software counter infrastructure
2218 */
2219
2220static void perf_swcounter_update(struct perf_counter *counter)
2221{
2222 struct hw_perf_counter *hwc = &counter->hw;
2223 u64 prev, now;
2224 s64 delta;
2225
2226again:
2227 prev = atomic64_read(&hwc->prev_count);
2228 now = atomic64_read(&hwc->count);
2229 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2230 goto again;
2231
2232 delta = now - prev;
2233
2234 atomic64_add(delta, &counter->count);
2235 atomic64_sub(delta, &hwc->period_left);
2236}
2237
2238static void perf_swcounter_set_period(struct perf_counter *counter)
2239{
2240 struct hw_perf_counter *hwc = &counter->hw;
2241 s64 left = atomic64_read(&hwc->period_left);
2242 s64 period = hwc->irq_period;
2243
2244 if (unlikely(left <= -period)) {
2245 left = period;
2246 atomic64_set(&hwc->period_left, left);
2247 }
2248
2249 if (unlikely(left <= 0)) {
2250 left += period;
2251 atomic64_add(period, &hwc->period_left);
2252 }
2253
2254 atomic64_set(&hwc->prev_count, -left);
2255 atomic64_set(&hwc->count, -left);
2256}
2257
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002258static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2259{
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002260 enum hrtimer_restart ret = HRTIMER_RESTART;
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002261 struct perf_counter *counter;
2262 struct pt_regs *regs;
2263
2264 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2265 counter->hw_ops->read(counter);
2266
2267 regs = get_irq_regs();
2268 /*
2269 * In case we exclude kernel IPs or are somehow not in interrupt
2270 * context, provide the next best thing, the user IP.
2271 */
2272 if ((counter->hw_event.exclude_kernel || !regs) &&
2273 !counter->hw_event.exclude_user)
2274 regs = task_pt_regs(current);
2275
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002276 if (regs) {
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002277 if (perf_counter_overflow(counter, 0, regs, 0))
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002278 ret = HRTIMER_NORESTART;
2279 }
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002280
2281 hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2282
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002283 return ret;
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002284}
2285
2286static void perf_swcounter_overflow(struct perf_counter *counter,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002287 int nmi, struct pt_regs *regs, u64 addr)
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002288{
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002289 perf_swcounter_update(counter);
2290 perf_swcounter_set_period(counter);
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002291 if (perf_counter_overflow(counter, nmi, regs, addr))
Peter Zijlstraf6c7d5f2009-04-06 11:45:04 +02002292 /* soft-disable the counter */
2293 ;
2294
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002295}
2296
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002297static int perf_swcounter_match(struct perf_counter *counter,
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002298 enum perf_event_types type,
2299 u32 event, struct pt_regs *regs)
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002300{
2301 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2302 return 0;
2303
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002304 if (perf_event_raw(&counter->hw_event))
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002305 return 0;
2306
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002307 if (perf_event_type(&counter->hw_event) != type)
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002308 return 0;
2309
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002310 if (perf_event_id(&counter->hw_event) != event)
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002311 return 0;
2312
2313 if (counter->hw_event.exclude_user && user_mode(regs))
2314 return 0;
2315
2316 if (counter->hw_event.exclude_kernel && !user_mode(regs))
2317 return 0;
2318
2319 return 1;
2320}
2321
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002322static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002323 int nmi, struct pt_regs *regs, u64 addr)
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002324{
2325 int neg = atomic64_add_negative(nr, &counter->hw.count);
2326 if (counter->hw.irq_period && !neg)
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002327 perf_swcounter_overflow(counter, nmi, regs, addr);
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002328}
2329
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002330static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002331 enum perf_event_types type, u32 event,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002332 u64 nr, int nmi, struct pt_regs *regs,
2333 u64 addr)
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002334{
2335 struct perf_counter *counter;
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002336
Peter Zijlstra01ef09d2009-03-19 20:26:11 +01002337 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002338 return;
2339
Peter Zijlstra592903c2009-03-13 12:21:36 +01002340 rcu_read_lock();
2341 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002342 if (perf_swcounter_match(counter, type, event, regs))
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002343 perf_swcounter_add(counter, nr, nmi, regs, addr);
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002344 }
Peter Zijlstra592903c2009-03-13 12:21:36 +01002345 rcu_read_unlock();
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002346}
2347
Peter Zijlstra96f6d442009-03-23 18:22:07 +01002348static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2349{
2350 if (in_nmi())
2351 return &cpuctx->recursion[3];
2352
2353 if (in_irq())
2354 return &cpuctx->recursion[2];
2355
2356 if (in_softirq())
2357 return &cpuctx->recursion[1];
2358
2359 return &cpuctx->recursion[0];
2360}
2361
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002362static void __perf_swcounter_event(enum perf_event_types type, u32 event,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002363 u64 nr, int nmi, struct pt_regs *regs,
2364 u64 addr)
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002365{
2366 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
Peter Zijlstra96f6d442009-03-23 18:22:07 +01002367 int *recursion = perf_swcounter_recursion_context(cpuctx);
2368
2369 if (*recursion)
2370 goto out;
2371
2372 (*recursion)++;
2373 barrier();
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002374
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002375 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
2376 nr, nmi, regs, addr);
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002377 if (cpuctx->task_ctx) {
2378 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002379 nr, nmi, regs, addr);
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002380 }
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002381
Peter Zijlstra96f6d442009-03-23 18:22:07 +01002382 barrier();
2383 (*recursion)--;
2384
2385out:
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002386 put_cpu_var(perf_cpu_context);
2387}
2388
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002389void
2390perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002391{
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002392 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002393}
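
/*
 * In-kernel usage sketch (illustrative; the real call sites live in
 * arch and core code, not in this file): a software event source calls
 *
 *	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, addr);
 *
 * with nmi set when invoked from NMI context and addr 0 when the event
 * has no meaningful data address.
 */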
2394
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002395static void perf_swcounter_read(struct perf_counter *counter)
2396{
2397 perf_swcounter_update(counter);
2398}
2399
2400static int perf_swcounter_enable(struct perf_counter *counter)
2401{
2402 perf_swcounter_set_period(counter);
2403 return 0;
2404}
2405
2406static void perf_swcounter_disable(struct perf_counter *counter)
2407{
2408 perf_swcounter_update(counter);
2409}
2410
Peter Zijlstraac17dc82009-03-13 12:21:34 +01002411static const struct hw_perf_counter_ops perf_ops_generic = {
2412 .enable = perf_swcounter_enable,
2413 .disable = perf_swcounter_disable,
2414 .read = perf_swcounter_read,
2415};
2416
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002417/*
2418 * Software counter: cpu wall time clock
2419 */
2420
Paul Mackerras9abf8a02009-01-09 16:26:43 +11002421static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2422{
2423 int cpu = raw_smp_processor_id();
2424 s64 prev;
2425 u64 now;
2426
2427 now = cpu_clock(cpu);
2428 prev = atomic64_read(&counter->hw.prev_count);
2429 atomic64_set(&counter->hw.prev_count, now);
2430 atomic64_add(now - prev, &counter->count);
2431}
2432
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002433static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2434{
2435 struct hw_perf_counter *hwc = &counter->hw;
2436 int cpu = raw_smp_processor_id();
2437
2438 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
Peter Zijlstra039fc912009-03-13 16:43:47 +01002439 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2440 hwc->hrtimer.function = perf_swcounter_hrtimer;
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002441 if (hwc->irq_period) {
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002442 __hrtimer_start_range_ns(&hwc->hrtimer,
2443 ns_to_ktime(hwc->irq_period), 0,
2444 HRTIMER_MODE_REL, 0);
2445 }
2446
2447 return 0;
2448}
2449
Ingo Molnar5c92d122008-12-11 13:21:10 +01002450static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2451{
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002452 hrtimer_cancel(&counter->hw.hrtimer);
Paul Mackerras9abf8a02009-01-09 16:26:43 +11002453 cpu_clock_perf_counter_update(counter);
Ingo Molnar5c92d122008-12-11 13:21:10 +01002454}
2455
2456static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2457{
Paul Mackerras9abf8a02009-01-09 16:26:43 +11002458 cpu_clock_perf_counter_update(counter);
Ingo Molnar5c92d122008-12-11 13:21:10 +01002459}
2460
2461static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
Ingo Molnar76715812008-12-17 14:20:28 +01002462 .enable = cpu_clock_perf_counter_enable,
2463 .disable = cpu_clock_perf_counter_disable,
2464 .read = cpu_clock_perf_counter_read,
Ingo Molnar5c92d122008-12-11 13:21:10 +01002465};
2466
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01002467/*
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002468 * Software counter: task time clock
2469 */
2470
Peter Zijlstrae30e08f2009-04-08 15:01:25 +02002471static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
Ingo Molnarbae43c92008-12-11 14:03:20 +01002472{
Peter Zijlstrae30e08f2009-04-08 15:01:25 +02002473 u64 prev;
Ingo Molnar8cb391e2008-12-14 12:22:31 +01002474 s64 delta;
Ingo Molnarbae43c92008-12-11 14:03:20 +01002475
Peter Zijlstraa39d6f22009-04-06 11:45:11 +02002476 prev = atomic64_xchg(&counter->hw.prev_count, now);
Ingo Molnar8cb391e2008-12-14 12:22:31 +01002477 delta = now - prev;
Ingo Molnar8cb391e2008-12-14 12:22:31 +01002478 atomic64_add(delta, &counter->count);
Ingo Molnarbae43c92008-12-11 14:03:20 +01002479}
2480
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01002481static int task_clock_perf_counter_enable(struct perf_counter *counter)
Ingo Molnar8cb391e2008-12-14 12:22:31 +01002482{
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002483 struct hw_perf_counter *hwc = &counter->hw;
Peter Zijlstraa39d6f22009-04-06 11:45:11 +02002484 u64 now;
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002485
Peter Zijlstraa39d6f22009-04-06 11:45:11 +02002486 now = counter->ctx->time;
2487
2488 atomic64_set(&hwc->prev_count, now);
Peter Zijlstra039fc912009-03-13 16:43:47 +01002489 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2490 hwc->hrtimer.function = perf_swcounter_hrtimer;
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002491 if (hwc->irq_period) {
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002492 __hrtimer_start_range_ns(&hwc->hrtimer,
2493 ns_to_ktime(hwc->irq_period), 0,
2494 HRTIMER_MODE_REL, 0);
2495 }
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01002496
2497 return 0;
Ingo Molnar8cb391e2008-12-14 12:22:31 +01002498}
2499
2500static void task_clock_perf_counter_disable(struct perf_counter *counter)
2501{
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002502 hrtimer_cancel(&counter->hw.hrtimer);
Peter Zijlstrae30e08f2009-04-08 15:01:25 +02002503 task_clock_perf_counter_update(counter, counter->ctx->time);
2504
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002505}
Ingo Molnaraa9c4c02008-12-17 14:10:57 +01002506
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002507static void task_clock_perf_counter_read(struct perf_counter *counter)
2508{
Peter Zijlstrae30e08f2009-04-08 15:01:25 +02002509 u64 time;
2510
2511 if (!in_nmi()) {
2512 update_context_time(counter->ctx);
2513 time = counter->ctx->time;
2514 } else {
2515 u64 now = perf_clock();
2516 u64 delta = now - counter->ctx->timestamp;
2517 time = counter->ctx->time + delta;
2518 }
2519
2520 task_clock_perf_counter_update(counter, time);
Ingo Molnarbae43c92008-12-11 14:03:20 +01002521}
2522
2523static const struct hw_perf_counter_ops perf_ops_task_clock = {
Ingo Molnar76715812008-12-17 14:20:28 +01002524 .enable = task_clock_perf_counter_enable,
2525 .disable = task_clock_perf_counter_disable,
2526 .read = task_clock_perf_counter_read,
Ingo Molnarbae43c92008-12-11 14:03:20 +01002527};
2528
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002529/*
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002530 * Software counter: cpu migrations
2531 */
2532
Paul Mackerras23a185c2009-02-09 22:42:47 +11002533static inline u64 get_cpu_migrations(struct perf_counter *counter)
Ingo Molnar6c594c22008-12-14 12:34:15 +01002534{
Paul Mackerras23a185c2009-02-09 22:42:47 +11002535 struct task_struct *curr = counter->ctx->task;
2536
2537 if (curr)
2538 return curr->se.nr_migrations;
2539 return cpu_nr_migrations(smp_processor_id());
Ingo Molnar6c594c22008-12-14 12:34:15 +01002540}
2541
2542static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2543{
2544 u64 prev, now;
2545 s64 delta;
2546
2547 prev = atomic64_read(&counter->hw.prev_count);
Paul Mackerras23a185c2009-02-09 22:42:47 +11002548 now = get_cpu_migrations(counter);
Ingo Molnar6c594c22008-12-14 12:34:15 +01002549
2550 atomic64_set(&counter->hw.prev_count, now);
2551
2552 delta = now - prev;
Ingo Molnar6c594c22008-12-14 12:34:15 +01002553
2554 atomic64_add(delta, &counter->count);
2555}
2556
2557static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2558{
2559 cpu_migrations_perf_counter_update(counter);
2560}
2561
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01002562static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
Ingo Molnar6c594c22008-12-14 12:34:15 +01002563{
Paul Mackerrasc07c99b2009-02-13 22:10:34 +11002564 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2565 atomic64_set(&counter->hw.prev_count,
2566 get_cpu_migrations(counter));
Ingo Molnar95cdd2e2008-12-21 13:50:42 +01002567 return 0;
Ingo Molnar6c594c22008-12-14 12:34:15 +01002568}
2569
2570static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2571{
2572 cpu_migrations_perf_counter_update(counter);
2573}
2574
2575static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
Ingo Molnar76715812008-12-17 14:20:28 +01002576 .enable = cpu_migrations_perf_counter_enable,
2577 .disable = cpu_migrations_perf_counter_disable,
2578 .read = cpu_migrations_perf_counter_read,
Ingo Molnar6c594c22008-12-14 12:34:15 +01002579};
2580
Peter Zijlstrae077df42009-03-19 20:26:17 +01002581#ifdef CONFIG_EVENT_PROFILE
2582void perf_tpcounter_event(int event_id)
2583{
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002584 struct pt_regs *regs = get_irq_regs();
2585
2586 if (!regs)
2587 regs = task_pt_regs(current);
2588
Peter Zijlstra78f13e92009-04-08 15:01:33 +02002589 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
Peter Zijlstrae077df42009-03-19 20:26:17 +01002590}
2591
2592extern int ftrace_profile_enable(int);
2593extern void ftrace_profile_disable(int);
2594
2595static void tp_perf_counter_destroy(struct perf_counter *counter)
2596{
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002597 ftrace_profile_disable(perf_event_id(&counter->hw_event));
Peter Zijlstrae077df42009-03-19 20:26:17 +01002598}
2599
2600static const struct hw_perf_counter_ops *
2601tp_perf_counter_init(struct perf_counter *counter)
2602{
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002603 int event_id = perf_event_id(&counter->hw_event);
Peter Zijlstrae077df42009-03-19 20:26:17 +01002604 int ret;
2605
2606 ret = ftrace_profile_enable(event_id);
2607 if (ret)
2608 return NULL;
2609
2610 counter->destroy = tp_perf_counter_destroy;
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002611 counter->hw.irq_period = counter->hw_event.irq_period;
Peter Zijlstrae077df42009-03-19 20:26:17 +01002612
2613 return &perf_ops_generic;
2614}
2615#else
2616static const struct hw_perf_counter_ops *
2617tp_perf_counter_init(struct perf_counter *counter)
2618{
2619 return NULL;
2620}
2621#endif
2622
Ingo Molnar5c92d122008-12-11 13:21:10 +01002623static const struct hw_perf_counter_ops *
2624sw_perf_counter_init(struct perf_counter *counter)
2625{
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002626 struct perf_counter_hw_event *hw_event = &counter->hw_event;
Ingo Molnar5c92d122008-12-11 13:21:10 +01002627 const struct hw_perf_counter_ops *hw_ops = NULL;
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002628 struct hw_perf_counter *hwc = &counter->hw;
Ingo Molnar5c92d122008-12-11 13:21:10 +01002629
Paul Mackerras0475f9e2009-02-11 14:35:35 +11002630 /*
2631 * Software counters (currently) can't in general distinguish
2632 * between user, kernel and hypervisor events.
2633 * However, context switches and cpu migrations are considered
2634 * to be kernel events, and page faults are never hypervisor
2635 * events.
2636 */
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002637 switch (perf_event_id(&counter->hw_event)) {
Ingo Molnar5c92d122008-12-11 13:21:10 +01002638 case PERF_COUNT_CPU_CLOCK:
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002639 hw_ops = &perf_ops_cpu_clock;
2640
2641 if (hw_event->irq_period && hw_event->irq_period < 10000)
2642 hw_event->irq_period = 10000;
Ingo Molnar5c92d122008-12-11 13:21:10 +01002643 break;
Ingo Molnarbae43c92008-12-11 14:03:20 +01002644 case PERF_COUNT_TASK_CLOCK:
Paul Mackerras23a185c2009-02-09 22:42:47 +11002645 /*
2646 * If the user instantiates this as a per-cpu counter,
2647 * use the cpu_clock counter instead.
2648 */
2649 if (counter->ctx->task)
2650 hw_ops = &perf_ops_task_clock;
2651 else
2652 hw_ops = &perf_ops_cpu_clock;
Peter Zijlstrad6d020e2009-03-13 12:21:35 +01002653
2654 if (hw_event->irq_period && hw_event->irq_period < 10000)
2655 hw_event->irq_period = 10000;
Ingo Molnarbae43c92008-12-11 14:03:20 +01002656 break;
Ingo Molnare06c61a2008-12-14 14:44:31 +01002657 case PERF_COUNT_PAGE_FAULTS:
Peter Zijlstraac17dc82009-03-13 12:21:34 +01002658 case PERF_COUNT_PAGE_FAULTS_MIN:
2659 case PERF_COUNT_PAGE_FAULTS_MAJ:
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01002660 case PERF_COUNT_CONTEXT_SWITCHES:
Peter Zijlstra4a0deca2009-03-19 20:26:12 +01002661 hw_ops = &perf_ops_generic;
Ingo Molnar5d6a27d2008-12-14 12:28:33 +01002662 break;
Ingo Molnar6c594c22008-12-14 12:34:15 +01002663 case PERF_COUNT_CPU_MIGRATIONS:
Paul Mackerras0475f9e2009-02-11 14:35:35 +11002664 if (!counter->hw_event.exclude_kernel)
2665 hw_ops = &perf_ops_cpu_migrations;
Ingo Molnar6c594c22008-12-14 12:34:15 +01002666 break;
Ingo Molnar5c92d122008-12-11 13:21:10 +01002667 }
Peter Zijlstra15dbf272009-03-13 12:21:32 +01002668
2669 if (hw_ops)
2670 hwc->irq_period = hw_event->irq_period;
2671
Ingo Molnar5c92d122008-12-11 13:21:10 +01002672 return hw_ops;
2673}
2674
Thomas Gleixner0793a612008-12-04 20:12:29 +01002675/*
2676 * Allocate and initialize a counter structure
2677 */
2678static struct perf_counter *
Ingo Molnar04289bb2008-12-11 08:38:42 +01002679perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2680 int cpu,
Paul Mackerras23a185c2009-02-09 22:42:47 +11002681 struct perf_counter_context *ctx,
Ingo Molnar9b51f662008-12-12 13:49:45 +01002682 struct perf_counter *group_leader,
2683 gfp_t gfpflags)
Thomas Gleixner0793a612008-12-04 20:12:29 +01002684{
Ingo Molnar5c92d122008-12-11 13:21:10 +01002685 const struct hw_perf_counter_ops *hw_ops;
Ingo Molnar621a01e2008-12-11 12:46:46 +01002686 struct perf_counter *counter;
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002687 long err;
Thomas Gleixner0793a612008-12-04 20:12:29 +01002688
Ingo Molnar9b51f662008-12-12 13:49:45 +01002689 counter = kzalloc(sizeof(*counter), gfpflags);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002690 if (!counter)
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002691 return ERR_PTR(-ENOMEM);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002692
Ingo Molnar04289bb2008-12-11 08:38:42 +01002693 /*
2694 * Single counters are their own group leaders, with an
2695 * empty sibling list:
2696 */
2697 if (!group_leader)
2698 group_leader = counter;
2699
Thomas Gleixner0793a612008-12-04 20:12:29 +01002700 mutex_init(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01002701 INIT_LIST_HEAD(&counter->list_entry);
Peter Zijlstra592903c2009-03-13 12:21:36 +01002702 INIT_LIST_HEAD(&counter->event_entry);
Ingo Molnar04289bb2008-12-11 08:38:42 +01002703 INIT_LIST_HEAD(&counter->sibling_list);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002704 init_waitqueue_head(&counter->waitq);
2705
Peter Zijlstra7b732a72009-03-23 18:22:10 +01002706 mutex_init(&counter->mmap_mutex);
2707
Paul Mackerrasd859e292009-01-17 18:10:22 +11002708 INIT_LIST_HEAD(&counter->child_list);
2709
Ingo Molnar9f66a382008-12-10 12:33:23 +01002710 counter->cpu = cpu;
2711 counter->hw_event = *hw_event;
Ingo Molnar04289bb2008-12-11 08:38:42 +01002712 counter->group_leader = group_leader;
Ingo Molnar621a01e2008-12-11 12:46:46 +01002713 counter->hw_ops = NULL;
Paul Mackerras23a185c2009-02-09 22:42:47 +11002714 counter->ctx = ctx;
Ingo Molnar621a01e2008-12-11 12:46:46 +01002715
Ingo Molnar235c7fc2008-12-21 14:43:25 +01002716 counter->state = PERF_COUNTER_STATE_INACTIVE;
Ingo Molnara86ed502008-12-17 00:43:10 +01002717 if (hw_event->disabled)
2718 counter->state = PERF_COUNTER_STATE_OFF;
2719
Ingo Molnar5c92d122008-12-11 13:21:10 +01002720 hw_ops = NULL;
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002721
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002722 if (perf_event_raw(hw_event)) {
Ingo Molnar5c92d122008-12-11 13:21:10 +01002723 hw_ops = hw_perf_counter_init(counter);
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002724 goto done;
2725 }
2726
2727 switch (perf_event_type(hw_event)) {
Peter Zijlstrab8e83512009-03-19 20:26:18 +01002728 case PERF_TYPE_HARDWARE:
2729 hw_ops = hw_perf_counter_init(counter);
2730 break;
2731
2732 case PERF_TYPE_SOFTWARE:
2733 hw_ops = sw_perf_counter_init(counter);
2734 break;
2735
2736 case PERF_TYPE_TRACEPOINT:
2737 hw_ops = tp_perf_counter_init(counter);
2738 break;
2739 }
Peter Zijlstraf4a2deb2009-03-23 18:22:06 +01002740done:
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002741 err = 0;
2742 if (!hw_ops)
2743 err = -EINVAL;
2744 else if (IS_ERR(hw_ops))
2745 err = PTR_ERR(hw_ops);
2746
2747 if (err) {
2748 kfree(counter);
2749 return ERR_PTR(err);
2750 }
2751
Ingo Molnar621a01e2008-12-11 12:46:46 +01002752 counter->hw_ops = hw_ops;
Thomas Gleixner0793a612008-12-04 20:12:29 +01002753
Peter Zijlstra9ee318a2009-04-09 10:53:44 +02002754 if (counter->hw_event.mmap)
2755 atomic_inc(&nr_mmap_tracking);
2756 if (counter->hw_event.munmap)
2757 atomic_inc(&nr_munmap_tracking);
2758 if (counter->hw_event.comm)
2759 atomic_inc(&nr_comm_tracking);
2760
Thomas Gleixner0793a612008-12-04 20:12:29 +01002761 return counter;
2762}
2763
2764/**
Paul Mackerras2743a5b2009-03-04 20:36:51 +11002765 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
Ingo Molnar9f66a382008-12-10 12:33:23 +01002766 *
2767 * @hw_event_uptr: event type attributes for monitoring/sampling
Thomas Gleixner0793a612008-12-04 20:12:29 +01002768 * @pid: target pid
Ingo Molnar9f66a382008-12-10 12:33:23 +01002769 * @cpu: target cpu
2770 * @group_fd: group leader counter fd
Thomas Gleixner0793a612008-12-04 20:12:29 +01002771 */
Paul Mackerras2743a5b2009-03-04 20:36:51 +11002772SYSCALL_DEFINE5(perf_counter_open,
Paul Mackerrasf3dfd262009-02-26 22:43:46 +11002773 const struct perf_counter_hw_event __user *, hw_event_uptr,
Paul Mackerras2743a5b2009-03-04 20:36:51 +11002774 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
Thomas Gleixner0793a612008-12-04 20:12:29 +01002775{
Ingo Molnar04289bb2008-12-11 08:38:42 +01002776 struct perf_counter *counter, *group_leader;
Ingo Molnar9f66a382008-12-10 12:33:23 +01002777 struct perf_counter_hw_event hw_event;
Ingo Molnar04289bb2008-12-11 08:38:42 +01002778 struct perf_counter_context *ctx;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002779 struct file *counter_file = NULL;
Ingo Molnar04289bb2008-12-11 08:38:42 +01002780 struct file *group_file = NULL;
2781 int fput_needed = 0;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002782 int fput_needed2 = 0;
Thomas Gleixner0793a612008-12-04 20:12:29 +01002783 int ret;
2784
Paul Mackerras2743a5b2009-03-04 20:36:51 +11002785 /* for future expandability... */
2786 if (flags)
2787 return -EINVAL;
2788
Ingo Molnar9f66a382008-12-10 12:33:23 +01002789 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
Thomas Gleixnereab656a2008-12-08 19:26:59 +01002790 return -EFAULT;
2791
Ingo Molnar04289bb2008-12-11 08:38:42 +01002792 /*
Ingo Molnarccff2862008-12-11 11:26:29 +01002793 * Get the target context (task or percpu):
2794 */
2795 ctx = find_get_context(pid, cpu);
2796 if (IS_ERR(ctx))
2797 return PTR_ERR(ctx);
2798
2799 /*
2800 * Look up the group leader (we will attach this counter to it):
Ingo Molnar04289bb2008-12-11 08:38:42 +01002801 */
2802 group_leader = NULL;
2803 if (group_fd != -1) {
2804 ret = -EINVAL;
2805 group_file = fget_light(group_fd, &fput_needed);
2806 if (!group_file)
Ingo Molnarccff2862008-12-11 11:26:29 +01002807 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +01002808 if (group_file->f_op != &perf_fops)
Ingo Molnarccff2862008-12-11 11:26:29 +01002809 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +01002810
2811 group_leader = group_file->private_data;
2812 /*
Ingo Molnarccff2862008-12-11 11:26:29 +01002813 * Do not allow a recursive hierarchy (this new sibling
2814 * becoming part of another group-sibling):
Ingo Molnar04289bb2008-12-11 08:38:42 +01002815 */
Ingo Molnarccff2862008-12-11 11:26:29 +01002816 if (group_leader->group_leader != group_leader)
2817 goto err_put_context;
2818 /*
2819 * Do not allow to attach to a group in a different
2820 * task or CPU context:
2821 */
2822 if (group_leader->ctx != ctx)
2823 goto err_put_context;
Paul Mackerras3b6f9e52009-01-14 21:00:30 +11002824 /*
2825 * Only a group leader can be exclusive or pinned
2826 */
2827 if (hw_event.exclusive || hw_event.pinned)
2828 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +01002829 }
2830
Paul Mackerras23a185c2009-02-09 22:42:47 +11002831 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2832 GFP_KERNEL);
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002833 ret = PTR_ERR(counter);
2834 if (IS_ERR(counter))
Thomas Gleixner0793a612008-12-04 20:12:29 +01002835 goto err_put_context;
2836
Thomas Gleixner0793a612008-12-04 20:12:29 +01002837 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2838 if (ret < 0)
Ingo Molnar9b51f662008-12-12 13:49:45 +01002839 goto err_free_put_context;
2840
2841 counter_file = fget_light(ret, &fput_needed2);
2842 if (!counter_file)
2843 goto err_free_put_context;
2844
2845 counter->filp = counter_file;
Paul Mackerrasd859e292009-01-17 18:10:22 +11002846 mutex_lock(&ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002847 perf_install_in_context(ctx, counter, cpu);
Paul Mackerrasd859e292009-01-17 18:10:22 +11002848 mutex_unlock(&ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002849
2850 fput_light(counter_file, fput_needed2);
Thomas Gleixner0793a612008-12-04 20:12:29 +01002851
Ingo Molnar04289bb2008-12-11 08:38:42 +01002852out_fput:
2853 fput_light(group_file, fput_needed);
2854
Thomas Gleixner0793a612008-12-04 20:12:29 +01002855 return ret;
2856
Ingo Molnar9b51f662008-12-12 13:49:45 +01002857err_free_put_context:
Thomas Gleixner0793a612008-12-04 20:12:29 +01002858 kfree(counter);
2859
2860err_put_context:
2861 put_context(ctx);
2862
Ingo Molnar04289bb2008-12-11 08:38:42 +01002863 goto out_fput;
Thomas Gleixner0793a612008-12-04 20:12:29 +01002864}
2865
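/*
 * A minimal userspace sketch of driving sys_perf_counter_open() above:
 * open one counter on the current task and read its value through the
 * returned fd.  This is a separate, illustrative user program, not part
 * of this file; the perf_counter_open() wrapper, the
 * __NR_perf_counter_open number and the assumption that a plain read()
 * returns a single u64 are assumptions.  Only the argument order
 * (hw_event, pid, cpu, group_fd, flags) and the hw_event bits used in
 * this file (disabled, inherit, pinned, exclusive) come from the code
 * above; the event-selection fields are simply left zero here.  Passing
 * the fd of an existing counter as group_fd (instead of -1) would
 * attach the new counter to that leader's group, as validated above.
 */
#if 0	/* illustrative userspace example, not built with the kernel */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int perf_counter_open(struct perf_counter_hw_event *hw_event,
			     pid_t pid, int cpu, int group_fd,
			     unsigned long flags)
{
	/* assumed: __NR_perf_counter_open is defined for this architecture */
	return syscall(__NR_perf_counter_open, hw_event, pid, cpu,
		       group_fd, flags);
}

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.disabled = 0;	/* count from the moment the fd exists */

	/* pid == 0: current task, cpu == -1: any CPU, no group leader */
	fd = perf_counter_open(&hw_event, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... workload under measurement ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("count: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}
#endif
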
Ingo Molnar9b51f662008-12-12 13:49:45 +01002866/*
2867 * Initialize the perf_counter context in a task_struct:
2868 */
2869static void
2870__perf_counter_init_context(struct perf_counter_context *ctx,
2871 struct task_struct *task)
2872{
2873 memset(ctx, 0, sizeof(*ctx));
2874 spin_lock_init(&ctx->lock);
Paul Mackerrasd859e292009-01-17 18:10:22 +11002875 mutex_init(&ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002876 INIT_LIST_HEAD(&ctx->counter_list);
Peter Zijlstra592903c2009-03-13 12:21:36 +01002877 INIT_LIST_HEAD(&ctx->event_list);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002878 ctx->task = task;
2879}
2880
2881/*
2882 * inherit a counter from parent task to child task:
2883 */
Paul Mackerrasd859e292009-01-17 18:10:22 +11002884static struct perf_counter *
Ingo Molnar9b51f662008-12-12 13:49:45 +01002885inherit_counter(struct perf_counter *parent_counter,
2886 struct task_struct *parent,
2887 struct perf_counter_context *parent_ctx,
2888 struct task_struct *child,
Paul Mackerrasd859e292009-01-17 18:10:22 +11002889 struct perf_counter *group_leader,
Ingo Molnar9b51f662008-12-12 13:49:45 +01002890 struct perf_counter_context *child_ctx)
2891{
2892 struct perf_counter *child_counter;
2893
Paul Mackerrasd859e292009-01-17 18:10:22 +11002894 /*
2895 * Instead of creating recursive hierarchies of counters,
2896 * we link inherited counters back to the original parent,
2897	 * which is guaranteed to have a filp that we use as the
2898	 * reference count:
2899 */
2900 if (parent_counter->parent)
2901 parent_counter = parent_counter->parent;
2902
Ingo Molnar9b51f662008-12-12 13:49:45 +01002903 child_counter = perf_counter_alloc(&parent_counter->hw_event,
Paul Mackerras23a185c2009-02-09 22:42:47 +11002904 parent_counter->cpu, child_ctx,
2905 group_leader, GFP_KERNEL);
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002906 if (IS_ERR(child_counter))
2907 return child_counter;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002908
2909 /*
2910 * Link it up in the child's context:
2911 */
Ingo Molnar9b51f662008-12-12 13:49:45 +01002912 child_counter->task = child;
Paul Mackerras53cfbf52009-03-25 22:46:58 +11002913 add_counter_to_ctx(child_counter, child_ctx);
Ingo Molnar9b51f662008-12-12 13:49:45 +01002914
2915 child_counter->parent = parent_counter;
Ingo Molnar9b51f662008-12-12 13:49:45 +01002916 /*
2917 * inherit into child's child as well:
2918 */
2919 child_counter->hw_event.inherit = 1;
2920
2921 /*
2922 * Get a reference to the parent filp - we will fput it
2923 * when the child counter exits. This is safe to do because
2924 * we are in the parent and we know that the filp still
2925 * exists and has a nonzero count:
2926 */
2927 atomic_long_inc(&parent_counter->filp->f_count);
2928
Paul Mackerrasd859e292009-01-17 18:10:22 +11002929 /*
2930 * Link this into the parent counter's child list
2931 */
2932 mutex_lock(&parent_counter->mutex);
2933 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2934
2935 /*
2936 * Make the child state follow the state of the parent counter,
2937 * not its hw_event.disabled bit. We hold the parent's mutex,
2938 * so we won't race with perf_counter_{en,dis}able_family.
2939 */
2940 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2941 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2942 else
2943 child_counter->state = PERF_COUNTER_STATE_OFF;
2944
2945 mutex_unlock(&parent_counter->mutex);
2946
2947 return child_counter;
2948}
2949
2950static int inherit_group(struct perf_counter *parent_counter,
2951 struct task_struct *parent,
2952 struct perf_counter_context *parent_ctx,
2953 struct task_struct *child,
2954 struct perf_counter_context *child_ctx)
2955{
2956 struct perf_counter *leader;
2957 struct perf_counter *sub;
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002958 struct perf_counter *child_ctr;
Paul Mackerrasd859e292009-01-17 18:10:22 +11002959
2960 leader = inherit_counter(parent_counter, parent, parent_ctx,
2961 child, NULL, child_ctx);
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002962 if (IS_ERR(leader))
2963 return PTR_ERR(leader);
Paul Mackerrasd859e292009-01-17 18:10:22 +11002964 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
Paul Mackerrasd5d2bc0d2009-03-30 19:07:08 +02002965 child_ctr = inherit_counter(sub, parent, parent_ctx,
2966 child, leader, child_ctx);
2967 if (IS_ERR(child_ctr))
2968 return PTR_ERR(child_ctr);
Paul Mackerrasd859e292009-01-17 18:10:22 +11002969 }
Ingo Molnar9b51f662008-12-12 13:49:45 +01002970 return 0;
2971}
2972
Paul Mackerrasd859e292009-01-17 18:10:22 +11002973static void sync_child_counter(struct perf_counter *child_counter,
2974 struct perf_counter *parent_counter)
2975{
2976 u64 parent_val, child_val;
2977
2978 parent_val = atomic64_read(&parent_counter->count);
2979 child_val = atomic64_read(&child_counter->count);
2980
2981 /*
2982 * Add back the child's count to the parent's count:
2983 */
2984 atomic64_add(child_val, &parent_counter->count);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11002985 atomic64_add(child_counter->total_time_enabled,
2986 &parent_counter->child_total_time_enabled);
2987 atomic64_add(child_counter->total_time_running,
2988 &parent_counter->child_total_time_running);
Paul Mackerrasd859e292009-01-17 18:10:22 +11002989
2990 /*
2991 * Remove this counter from the parent's list
2992 */
2993 mutex_lock(&parent_counter->mutex);
2994 list_del_init(&child_counter->child_list);
2995 mutex_unlock(&parent_counter->mutex);
2996
2997 /*
2998 * Release the parent counter, if this was the last
2999 * reference to it.
3000 */
3001 fput(parent_counter->filp);
3002}
3003
Ingo Molnar9b51f662008-12-12 13:49:45 +01003004static void
3005__perf_counter_exit_task(struct task_struct *child,
3006 struct perf_counter *child_counter,
3007 struct perf_counter_context *child_ctx)
3008{
3009 struct perf_counter *parent_counter;
Paul Mackerrasd859e292009-01-17 18:10:22 +11003010 struct perf_counter *sub, *tmp;
Ingo Molnar9b51f662008-12-12 13:49:45 +01003011
3012 /*
Ingo Molnar235c7fc2008-12-21 14:43:25 +01003013 * If we do not self-reap then we have to wait for the
3014	 * child task to unschedule (which is guaranteed to happen),
3015 * so that its counter is at its final count. (This
3016 * condition triggers rarely - child tasks usually get
3017 * off their CPU before the parent has a chance to
3018 * get this far into the reaping action)
Ingo Molnar9b51f662008-12-12 13:49:45 +01003019 */
Ingo Molnar235c7fc2008-12-21 14:43:25 +01003020 if (child != current) {
3021 wait_task_inactive(child, 0);
3022 list_del_init(&child_counter->list_entry);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11003023 update_counter_times(child_counter);
Ingo Molnar235c7fc2008-12-21 14:43:25 +01003024 } else {
Ingo Molnar0cc0c022008-12-14 23:20:36 +01003025 struct perf_cpu_context *cpuctx;
Ingo Molnar235c7fc2008-12-21 14:43:25 +01003026 unsigned long flags;
3027 u64 perf_flags;
3028
3029 /*
3030 * Disable and unlink this counter.
3031 *
3032 * Be careful about zapping the list - IRQ/NMI context
3033 * could still be processing it:
3034 */
Peter Zijlstra849691a2009-04-06 11:45:12 +02003035 local_irq_save(flags);
Ingo Molnar235c7fc2008-12-21 14:43:25 +01003036 perf_flags = hw_perf_save_disable();
Ingo Molnar0cc0c022008-12-14 23:20:36 +01003037
3038 cpuctx = &__get_cpu_var(perf_cpu_context);
3039
Paul Mackerrasd859e292009-01-17 18:10:22 +11003040 group_sched_out(child_counter, cpuctx, child_ctx);
Paul Mackerras53cfbf52009-03-25 22:46:58 +11003041 update_counter_times(child_counter);
Ingo Molnar0cc0c022008-12-14 23:20:36 +01003042
Ingo Molnar235c7fc2008-12-21 14:43:25 +01003043 list_del_init(&child_counter->list_entry);
3044
3045 child_ctx->nr_counters--;
3046
3047 hw_perf_restore(perf_flags);
Peter Zijlstra849691a2009-04-06 11:45:12 +02003048 local_irq_restore(flags);
Ingo Molnar0cc0c022008-12-14 23:20:36 +01003049 }
3050
Ingo Molnar9b51f662008-12-12 13:49:45 +01003051 parent_counter = child_counter->parent;
3052 /*
3053	 * It can happen that the parent exits first, and its counters
3054	 * are then kept alive only by the child's reference. Those
3055	 * counters must be zapped here, otherwise they would linger.
3056 */
Paul Mackerrasd859e292009-01-17 18:10:22 +11003057 if (parent_counter) {
3058 sync_child_counter(child_counter, parent_counter);
3059 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
3060 list_entry) {
Paul Mackerras4bcf3492009-02-11 13:53:19 +01003061 if (sub->parent) {
Paul Mackerrasd859e292009-01-17 18:10:22 +11003062 sync_child_counter(sub, sub->parent);
Peter Zijlstraf1600952009-03-19 20:26:16 +01003063 free_counter(sub);
Paul Mackerras4bcf3492009-02-11 13:53:19 +01003064 }
Paul Mackerrasd859e292009-01-17 18:10:22 +11003065 }
Peter Zijlstraf1600952009-03-19 20:26:16 +01003066 free_counter(child_counter);
Paul Mackerras4bcf3492009-02-11 13:53:19 +01003067 }
Ingo Molnar9b51f662008-12-12 13:49:45 +01003068}
3069
3070/*
Paul Mackerrasd859e292009-01-17 18:10:22 +11003071 * When a child task exits, feed back counter values to parent counters.
Ingo Molnar9b51f662008-12-12 13:49:45 +01003072 *
Paul Mackerrasd859e292009-01-17 18:10:22 +11003073 * Note: we may be running in child context, but the PID is not hashed
Ingo Molnar9b51f662008-12-12 13:49:45 +01003074 * anymore so new counters will not be added.
3075 */
3076void perf_counter_exit_task(struct task_struct *child)
3077{
3078 struct perf_counter *child_counter, *tmp;
3079 struct perf_counter_context *child_ctx;
3080
3081 child_ctx = &child->perf_counter_ctx;
3082
3083 if (likely(!child_ctx->nr_counters))
3084 return;
3085
3086 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3087 list_entry)
3088 __perf_counter_exit_task(child, child_counter, child_ctx);
3089}
3090
3091/*
3092 * Initialize the perf_counter context in task_struct
3093 */
3094void perf_counter_init_task(struct task_struct *child)
3095{
3096 struct perf_counter_context *child_ctx, *parent_ctx;
Paul Mackerrasd859e292009-01-17 18:10:22 +11003097 struct perf_counter *counter;
Ingo Molnar9b51f662008-12-12 13:49:45 +01003098 struct task_struct *parent = current;
Ingo Molnar9b51f662008-12-12 13:49:45 +01003099
3100 child_ctx = &child->perf_counter_ctx;
3101 parent_ctx = &parent->perf_counter_ctx;
3102
3103 __perf_counter_init_context(child_ctx, child);
3104
3105 /*
3106 * This is executed from the parent task context, so inherit
3107 * counters that have been marked for cloning:
3108 */
3109
3110 if (likely(!parent_ctx->nr_counters))
3111 return;
3112
3113 /*
3114 * Lock the parent list. No need to lock the child - not PID
3115 * hashed yet and not running, so nobody can access it.
3116 */
Paul Mackerrasd859e292009-01-17 18:10:22 +11003117 mutex_lock(&parent_ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01003118
3119 /*
3120	 * We don't have to disable NMIs - we are only looking at
3121 * the list, not manipulating it:
3122 */
3123 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
Paul Mackerrasd859e292009-01-17 18:10:22 +11003124 if (!counter->hw_event.inherit)
Ingo Molnar9b51f662008-12-12 13:49:45 +01003125 continue;
3126
Paul Mackerrasd859e292009-01-17 18:10:22 +11003127 if (inherit_group(counter, parent,
Ingo Molnar9b51f662008-12-12 13:49:45 +01003128 parent_ctx, child, child_ctx))
3129 break;
3130 }
3131
Paul Mackerrasd859e292009-01-17 18:10:22 +11003132 mutex_unlock(&parent_ctx->mutex);
Ingo Molnar9b51f662008-12-12 13:49:45 +01003133}
3134
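/*
 * A minimal userspace sketch of the inherit path implemented above: a
 * counter opened with hw_event.inherit = 1 on the current task is
 * cloned into each child at fork time (perf_counter_init_task) and its
 * counts are folded back into the parent counter when the child exits
 * (perf_counter_exit_task -> sync_child_counter), so a single read()
 * in the parent sees the combined value.  As before, this is a
 * separate, illustrative user program; __NR_perf_counter_open and the
 * single-u64 read() format are assumptions.
 */
#if 0	/* illustrative userspace example, not built with the kernel */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count;
	pid_t pid;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.inherit = 1;	/* clone this counter into children */

	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	pid = fork();
	if (pid == 0) {
		/* child: its inherited counter is synced back at exit */
		/* ... child workload ... */
		_exit(0);
	}
	waitpid(pid, NULL, 0);

	/* parent's own count plus the exited child's, per sync_child_counter() */
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("combined count: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}
#endif
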
Ingo Molnar04289bb2008-12-11 08:38:42 +01003135static void __cpuinit perf_counter_init_cpu(int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +01003136{
Ingo Molnar04289bb2008-12-11 08:38:42 +01003137 struct perf_cpu_context *cpuctx;
Thomas Gleixner0793a612008-12-04 20:12:29 +01003138
Ingo Molnar04289bb2008-12-11 08:38:42 +01003139 cpuctx = &per_cpu(perf_cpu_context, cpu);
3140 __perf_counter_init_context(&cpuctx->ctx, NULL);
Thomas Gleixner0793a612008-12-04 20:12:29 +01003141
3142 mutex_lock(&perf_resource_mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01003143 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
Thomas Gleixner0793a612008-12-04 20:12:29 +01003144 mutex_unlock(&perf_resource_mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01003145
Paul Mackerras01d02872009-01-14 13:44:19 +11003146 hw_perf_counter_setup(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01003147}
3148
3149#ifdef CONFIG_HOTPLUG_CPU
Ingo Molnar04289bb2008-12-11 08:38:42 +01003150static void __perf_counter_exit_cpu(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +01003151{
3152 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3153 struct perf_counter_context *ctx = &cpuctx->ctx;
3154 struct perf_counter *counter, *tmp;
3155
Ingo Molnar04289bb2008-12-11 08:38:42 +01003156 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3157 __perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +01003158}
Ingo Molnar04289bb2008-12-11 08:38:42 +01003159static void perf_counter_exit_cpu(int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +01003160{
Paul Mackerrasd859e292009-01-17 18:10:22 +11003161 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3162 struct perf_counter_context *ctx = &cpuctx->ctx;
3163
3164 mutex_lock(&ctx->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +01003165 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
Paul Mackerrasd859e292009-01-17 18:10:22 +11003166 mutex_unlock(&ctx->mutex);
Thomas Gleixner0793a612008-12-04 20:12:29 +01003167}
3168#else
Ingo Molnar04289bb2008-12-11 08:38:42 +01003169static inline void perf_counter_exit_cpu(int cpu) { }
Thomas Gleixner0793a612008-12-04 20:12:29 +01003170#endif
3171
3172static int __cpuinit
3173perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3174{
3175 unsigned int cpu = (long)hcpu;
3176
3177 switch (action) {
3178
3179 case CPU_UP_PREPARE:
3180 case CPU_UP_PREPARE_FROZEN:
Ingo Molnar04289bb2008-12-11 08:38:42 +01003181 perf_counter_init_cpu(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01003182 break;
3183
3184 case CPU_DOWN_PREPARE:
3185 case CPU_DOWN_PREPARE_FROZEN:
Ingo Molnar04289bb2008-12-11 08:38:42 +01003186 perf_counter_exit_cpu(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +01003187 break;
3188
3189 default:
3190 break;
3191 }
3192
3193 return NOTIFY_OK;
3194}
3195
3196static struct notifier_block __cpuinitdata perf_cpu_nb = {
3197 .notifier_call = perf_cpu_notify,
3198};
3199
3200static int __init perf_counter_init(void)
3201{
3202 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3203 (void *)(long)smp_processor_id());
3204 register_cpu_notifier(&perf_cpu_nb);
3205
3206 return 0;
3207}
3208early_initcall(perf_counter_init);
3209
3210static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3211{
3212 return sprintf(buf, "%d\n", perf_reserved_percpu);
3213}
3214
3215static ssize_t
3216perf_set_reserve_percpu(struct sysdev_class *class,
3217 const char *buf,
3218 size_t count)
3219{
3220 struct perf_cpu_context *cpuctx;
3221 unsigned long val;
3222 int err, cpu, mpt;
3223
3224 err = strict_strtoul(buf, 10, &val);
3225 if (err)
3226 return err;
3227 if (val > perf_max_counters)
3228 return -EINVAL;
3229
3230 mutex_lock(&perf_resource_mutex);
3231 perf_reserved_percpu = val;
3232 for_each_online_cpu(cpu) {
3233 cpuctx = &per_cpu(perf_cpu_context, cpu);
3234 spin_lock_irq(&cpuctx->ctx.lock);
3235 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3236 perf_max_counters - perf_reserved_percpu);
3237 cpuctx->max_pertask = mpt;
3238 spin_unlock_irq(&cpuctx->ctx.lock);
3239 }
3240 mutex_unlock(&perf_resource_mutex);
3241
3242 return count;
3243}
3244
3245static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3246{
3247 return sprintf(buf, "%d\n", perf_overcommit);
3248}
3249
3250static ssize_t
3251perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3252{
3253 unsigned long val;
3254 int err;
3255
3256 err = strict_strtoul(buf, 10, &val);
3257 if (err)
3258 return err;
3259 if (val > 1)
3260 return -EINVAL;
3261
3262 mutex_lock(&perf_resource_mutex);
3263 perf_overcommit = val;
3264 mutex_unlock(&perf_resource_mutex);
3265
3266 return count;
3267}
3268
3269static SYSDEV_CLASS_ATTR(
3270 reserve_percpu,
3271 0644,
3272 perf_show_reserve_percpu,
3273 perf_set_reserve_percpu
3274 );
3275
3276static SYSDEV_CLASS_ATTR(
3277 overcommit,
3278 0644,
3279 perf_show_overcommit,
3280 perf_set_overcommit
3281 );
3282
3283static struct attribute *perfclass_attrs[] = {
3284 &attr_reserve_percpu.attr,
3285 &attr_overcommit.attr,
3286 NULL
3287};
3288
3289static struct attribute_group perfclass_attr_group = {
3290 .attrs = perfclass_attrs,
3291 .name = "perf_counters",
3292};
3293
3294static int __init perf_counter_sysfs_init(void)
3295{
3296 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3297 &perfclass_attr_group);
3298}
3299device_initcall(perf_counter_sysfs_init);
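
/*
 * The two attributes registered above are plain decimal sysfs knobs:
 * reserve_percpu sets aside that many counters on each CPU (shrinking
 * max_pertask as computed in perf_set_reserve_percpu()), and
 * overcommit accepts only 0 or 1.  A minimal sketch of setting them
 * from user space follows; it is a separate, illustrative program, and
 * the /sys/devices/system/cpu/perf_counters/ path is only the expected
 * location of the attribute group created on the cpu sysdev class.
 */
#if 0	/* illustrative userspace example, not built with the kernel */
#include <stdio.h>

/* hypothetical helper: write one decimal value to a perf_counters knob */
static int write_knob(const char *name, int val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/perf_counters/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	/* reserve two counters per CPU and disallow overcommit */
	if (write_knob("reserve_percpu", 2) || write_knob("overcommit", 0))
		perror("perf_counters sysfs");
	return 0;
}
#endif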