/*
 * Performance counter core code
 *
 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/perf_counter.h>

/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);

/*
 * Architecture provided APIs - weak aliases:
 */

int __weak hw_perf_counter_init(struct perf_counter *counter)
{
	return -EINVAL;
}

void __weak hw_perf_counter_enable(struct perf_counter *counter) { }
void __weak hw_perf_counter_disable(struct perf_counter *counter) { }
void __weak hw_perf_counter_read(struct perf_counter *counter) { }
void __weak hw_perf_disable_all(void) { }
void __weak hw_perf_enable_all(void) { }
void __weak hw_perf_counter_setup(void) { }

#if BITS_PER_LONG == 64

/*
 * Read the cached counter value in counter->count, safe against
 * cross CPU / NMI modifications. 64 bit version - no complications.
 */
static inline u64 perf_counter_read_safe(struct perf_counter *counter)
{
	return (u64) atomic64_read(&counter->count);
}

#else

/*
 * Read the cached counter value in counter->count, safe against
 * cross CPU / NMI modifications. 32 bit version.
 */
static u64 perf_counter_read_safe(struct perf_counter *counter)
{
	u32 cntl, cnth;

	local_irq_disable();
	do {
		cnth = atomic_read(&counter->count32[1]);
		cntl = atomic_read(&counter->count32[0]);
	} while (cnth != atomic_read(&counter->count32[1]));

	local_irq_enable();

	return cntl | ((u64) cnth) << 32;
}

#endif

static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *group_leader = counter->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling counter,
	 * add it straight to the context's counter list, or to the group
	 * leader's sibling list:
	 */
	if (counter->group_leader == counter)
		list_add_tail(&counter->list_entry, &ctx->counter_list);
	else
		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
}

static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *sibling, *tmp;

	list_del_init(&counter->list_entry);

	if (list_empty(&counter->sibling_list))
		return;

	/*
	 * If this was a group counter with sibling counters then
	 * upgrade the siblings to singleton counters by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp,
				 &counter->sibling_list, list_entry) {

		list_del_init(&sibling->list_entry);
		list_add_tail(&sibling->list_entry, &ctx->counter_list);
		WARN_ON_ONCE(!sibling->group_leader);
		WARN_ON_ONCE(sibling->group_leader == sibling);
		sibling->group_leader = sibling;
	}
}

/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	if (counter->active) {
		hw_perf_counter_disable(counter);
		counter->active = 0;
		ctx->nr_active--;
		cpuctx->active_oncpu--;
		counter->task = NULL;
	}
	ctx->nr_counters--;

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	hw_perf_disable_all();
	list_del_counter(counter, ctx);
	hw_perf_enable_all();

	if (!ctx->task) {
		/*
		 * Allow more per task counters with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_counters - ctx->nr_counters,
			    perf_max_counters - perf_reserved_percpu);
	}

	spin_unlock(&ctx->lock);
}

/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(counter->cpu,
					 __perf_counter_remove_from_context,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_remove_from_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the counter safely if the call above did not succeed.
	 */
	if (!list_empty(&counter->list_entry)) {
		ctx->nr_counters--;
		list_del_counter(counter, ctx);
		counter->task = NULL;
	}
	spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	int cpu = smp_processor_id();

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	hw_perf_disable_all();
	list_add_counter(counter, ctx);
	hw_perf_enable_all();

	ctx->nr_counters++;

	if (cpuctx->active_oncpu < perf_max_counters) {
		hw_perf_counter_enable(counter);
		counter->active = 1;
		counter->oncpu = cpu;
		ctx->nr_active++;
		cpuctx->active_oncpu++;
	}

	if (!ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

	spin_unlock(&ctx->lock);
}

/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
			struct perf_counter *counter,
			int cpu)
{
	struct task_struct *task = ctx->task;

	counter->ctx = ctx;
	if (!task) {
		/*
		 * Per cpu counters are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 counter, 1);
		return;
	}

	counter->task = task;
retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the counter has not been added
	 * we need to retry the smp call.
	 */
	if (ctx->nr_active && list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the counter safely if the call above did not succeed.
	 */
	if (list_empty(&counter->list_entry)) {
		list_add_counter(counter, ctx);
		ctx->nr_counters++;
	}
	spin_unlock_irq(&ctx->lock);
}

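/*
 * Deactivate a single counter: disable it in hardware, mark it
 * inactive and update the active-counter bookkeeping:
 */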
static void
counter_sched_out(struct perf_counter *counter,
		  struct perf_cpu_context *cpuctx,
		  struct perf_counter_context *ctx)
{
	if (!counter->active)
		return;

	hw_perf_counter_disable(counter);
	counter->active = 0;
	counter->oncpu = -1;

	cpuctx->active_oncpu--;
	ctx->nr_active--;
}

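/*
 * Schedule out a counter group: the group leader first, then all
 * of its siblings:
 */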
static void
group_sched_out(struct perf_counter *group_counter,
		struct perf_cpu_context *cpuctx,
		struct perf_counter_context *ctx)
{
	struct perf_counter *counter;

	counter_sched_out(group_counter, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_out(counter, cpuctx, ctx);
}

/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but hw_perf_counter_disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!cpuctx->task_ctx))
		return;

	spin_lock(&ctx->lock);
	if (ctx->nr_active) {
		list_for_each_entry(counter, &ctx->counter_list, list_entry)
			group_sched_out(counter, cpuctx, ctx);
	}
	spin_unlock(&ctx->lock);
	cpuctx->task_ctx = NULL;
}

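/*
 * Activate a single counter: enable it in hardware, record the CPU it
 * runs on and update the active-counter bookkeeping:
 */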
static void
counter_sched_in(struct perf_counter *counter,
		 struct perf_cpu_context *cpuctx,
		 struct perf_counter_context *ctx,
		 int cpu)
{
	if (!counter->active)
		return;

	hw_perf_counter_enable(counter);
	counter->active = 1;
	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */

	cpuctx->active_oncpu++;
	ctx->nr_active++;
}

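/*
 * Schedule in a counter group: the group leader first, then all
 * of its siblings:
 */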
static void
group_sched_in(struct perf_counter *group_counter,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx,
	       int cpu)
{
	struct perf_counter *counter;

	counter_sched_in(group_counter, cpuctx, ctx, cpu);

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_in(counter, cpuctx, ctx, cpu);
}

/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but hw_perf_counter_enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!ctx->nr_counters))
		return;

	spin_lock(&ctx->lock);
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (ctx->nr_active == cpuctx->max_pertask)
			break;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of counters:
		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		group_sched_in(counter, cpuctx, ctx, cpu);
	}
	spin_unlock(&ctx->lock);

	cpuctx->task_ctx = ctx;
}

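/*
 * Called from the scheduler tick. Reschedule the task's counters and
 * rotate the first counter to the end of the list, so that counters
 * beyond the per task limit also get their turn on the PMU over time:
 */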
void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!ctx->nr_counters))
		return;

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Rotate the first entry last (works just fine for group counters too):
	 */
	hw_perf_disable_all();
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		list_del(&counter->list_entry);
		list_add_tail(&counter->list_entry, &ctx->counter_list);
		break;
	}
	hw_perf_enable_all();

	spin_unlock(&ctx->lock);

	perf_counter_task_sched_in(curr, cpu);
}

/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	spin_lock_init(&ctx->lock);
	INIT_LIST_HEAD(&ctx->counter_list);
	ctx->nr_counters = 0;
	ctx->task = task;
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *task)
{
	__perf_counter_init_context(&task->perf_counter_ctx, task);
}

/*
 * Cross CPU call to read the hardware counter
 */
static void __hw_perf_counter_read(void *info)
{
	hw_perf_counter_read(info);
}

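/*
 * Read the current value of a counter. If the counter is active on a
 * CPU, make that CPU update the cached value first:
 */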
static u64 perf_counter_read(struct perf_counter *counter)
{
	/*
	 * If counter is enabled and currently active on a CPU, update the
	 * value in the counter structure:
	 */
	if (counter->active) {
		smp_call_function_single(counter->oncpu,
					 __hw_perf_counter_read, counter, 1);
	}

	return perf_counter_read_safe(counter);
}

/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task) {
		if (cpuctx->task_ctx != ctx)
			return;
		spin_lock(&ctx->lock);
	}

	/* Change the pointer NMI safe */
	atomic_long_set((atomic_long_t *)&counter->irqdata,
			(unsigned long) counter->usrdata);
	counter->usrdata = oldirqdata;

	if (ctx->task)
		spin_unlock(&ctx->lock);
}

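/*
 * Swap the irqdata and usrdata buffers of a counter, so that data
 * collected in IRQ context can be drained by a reader:
 */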
static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;
	struct task_struct *task = ctx->task;

	if (!task) {
		smp_call_function_single(counter->cpu,
					 __perf_switch_irq_data,
					 counter, 1);
		return counter->usrdata;
	}

retry:
	spin_lock_irq(&ctx->lock);
	if (!counter->active) {
		counter->irqdata = counter->usrdata;
		counter->usrdata = oldirqdata;
		spin_unlock_irq(&ctx->lock);
		return oldirqdata;
	}
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
	/* Might have failed, because task was scheduled out */
	if (counter->irqdata == oldirqdata)
		goto retry;

	return counter->usrdata;
}

static void put_context(struct perf_counter_context *ctx)
{
	if (ctx->task)
		put_task_struct(ctx->task);
}

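/*
 * Find the counter context for a pid/cpu pair: the per CPU context if
 * a CPU is specified (root only), otherwise the context of the target
 * task. A task reference is taken and later dropped via put_context():
 */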
static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct task_struct *task;

	/*
	 * If cpu is not a wildcard then this is a percpu counter:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU counter: */
		if (!capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow attaching a counter to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		WARN_ON_ONCE(ctx->task);
		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	ctx = &task->perf_counter_ctx;
	ctx->task = task;

	/* Reuse ptrace permission checks for now. */
	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
		put_context(ctx);
		return ERR_PTR(-EACCES);
	}

	return ctx;
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct perf_counter *counter = file->private_data;
	struct perf_counter_context *ctx = counter->ctx;

	file->private_data = NULL;

	mutex_lock(&counter->mutex);

	perf_counter_remove_from_context(counter);
	put_context(ctx);

	mutex_unlock(&counter->mutex);

	kfree(counter);

	return 0;
}

/*
 * Read the performance counter - simple non blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
	u64 cntval;

	if (count != sizeof(cntval))
		return -EINVAL;

	mutex_lock(&counter->mutex);
	cntval = perf_counter_read(counter);
	mutex_unlock(&counter->mutex);

	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}

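/*
 * Copy as much pending data as fits into the user buffer and advance
 * the read index accordingly:
 */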
static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
	if (!usrdata->len)
		return 0;

	count = min(count, (size_t)usrdata->len);
	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
		return -EFAULT;

	/* Adjust the counters */
	usrdata->len -= count;
	if (!usrdata->len)
		usrdata->rd_idx = 0;
	else
		usrdata->rd_idx += count;

	return count;
}

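/*
 * Read sampled data: wait (unless nonblocking) until enough data is
 * pending, then drain the user buffer and, if necessary, the switched
 * IRQ buffer as well:
 */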
static ssize_t
perf_read_irq_data(struct perf_counter *counter,
		   char __user *buf,
		   size_t count,
		   int nonblocking)
{
	struct perf_data *irqdata, *usrdata;
	DECLARE_WAITQUEUE(wait, current);
	ssize_t res;

	irqdata = counter->irqdata;
	usrdata = counter->usrdata;

	if (usrdata->len + irqdata->len >= count)
		goto read_pending;

	if (nonblocking)
		return -EAGAIN;

	spin_lock_irq(&counter->waitq.lock);
	__add_wait_queue(&counter->waitq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (usrdata->len + irqdata->len >= count)
			break;

		if (signal_pending(current))
			break;

		spin_unlock_irq(&counter->waitq.lock);
		schedule();
		spin_lock_irq(&counter->waitq.lock);
	}
	__remove_wait_queue(&counter->waitq, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&counter->waitq.lock);

	if (usrdata->len + irqdata->len < count)
		return -ERESTARTSYS;
read_pending:
	mutex_lock(&counter->mutex);

	/* Drain pending data first: */
	res = perf_copy_usrdata(usrdata, buf, count);
	if (res < 0 || res == count)
		goto out;

	/* Switch irq buffer: */
	usrdata = perf_switch_irq_data(counter);
	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
		if (!res)
			res = -EFAULT;
	} else {
		res = count;
	}
out:
	mutex_unlock(&counter->mutex);

	return res;
}

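/*
 * Dispatch a read to the plain counter read or to the sampled-data
 * read path, depending on the record type of the counter:
 */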
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_counter *counter = file->private_data;

	switch (counter->hw_event.record_type) {
	case PERF_RECORD_SIMPLE:
		return perf_read_hw(counter, buf, count);

	case PERF_RECORD_IRQ:
	case PERF_RECORD_GROUP:
		return perf_read_irq_data(counter, buf, count,
					  file->f_flags & O_NONBLOCK);
	}
	return -EINVAL;
}

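/*
 * Poll support: report POLLIN when either data buffer has pending data:
 */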
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_counter *counter = file->private_data;
	unsigned int events = 0;
	unsigned long flags;

	poll_wait(file, &counter->waitq, wait);

	spin_lock_irqsave(&counter->waitq.lock, flags);
	if (counter->usrdata->len || counter->irqdata->len)
		events |= POLLIN;
	spin_unlock_irqrestore(&counter->waitq.lock, flags);

	return events;
}

static const struct file_operations perf_fops = {
	.release	= perf_release,
	.read		= perf_read,
	.poll		= perf_poll,
};

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter *group_leader)
{
	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);

	if (!counter)
		return NULL;

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	counter->irqdata	= &counter->data[0];
	counter->usrdata	= &counter->data[1];
	counter->cpu		= cpu;
	counter->hw_event	= *hw_event;
	counter->wakeup_pending	= 0;
	counter->group_leader	= group_leader;

	return counter;
}

/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
asmlinkage int sys_perf_counter_open(
	struct perf_counter_hw_event *hw_event_uptr __user,
	pid_t pid,
	int cpu,
	int group_fd)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int ret;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Look up the group leader:
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto out_fput;
		if (group_file->f_op != &perf_fops)
			goto out_fput;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy:
		 */
		if (group_leader->group_leader)
			goto out_fput;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	ret = -ENOMEM;
	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
	if (!counter)
		goto err_put_context;

	ret = hw_perf_counter_init(counter);
	if (ret)
		goto err_free_put_context;

	perf_install_in_context(ctx, counter, cpu);

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_remove_free_put_context;

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_remove_free_put_context:
	mutex_lock(&counter->mutex);
	perf_counter_remove_from_context(counter);
	mutex_unlock(&counter->mutex);

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}

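/*
 * Set up the per CPU context and counter reservation for a CPU that is
 * coming online:
 */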
static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup();
}

#ifdef CONFIG_HOTPLUG_CPU
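/*
 * Remove all counters of the per CPU context when a CPU goes down;
 * __perf_counter_exit_cpu() runs on that CPU via an smp call:
 */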
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}

static void perf_counter_exit_cpu(int cpu)
{
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call	= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);

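/*
 * sysfs attributes for the per CPU counter reservation and the
 * overcommit setting:
 */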
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs		= perfclass_attrs,
	.name		= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);