blob: 278209c547a870dd12a43997dc3c5915ca084665 [file] [log] [blame]
/*
 * Performance counter core code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
Ingo Molnar04289bb2008-12-11 08:38:42 +010013#include <linux/file.h>
Thomas Gleixner0793a612008-12-04 20:12:29 +010014#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
21#include <linux/perf_counter.h>
22
23/*
24 * Each CPU has a list of per CPU counters:
25 */
26DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
27
28int perf_max_counters __read_mostly;
29static int perf_reserved_percpu __read_mostly;
30static int perf_overcommit __read_mostly = 1;
31
32/*
33 * Mutex for (sysadmin-configurable) counter reservations:
34 */
35static DEFINE_MUTEX(perf_resource_mutex);
36
37/*
38 * Architecture provided APIs - weak aliases:
39 */
40
Thomas Gleixnerdfa7c892008-12-08 19:35:37 +010041int __weak hw_perf_counter_init(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +010042{
43 return -EINVAL;
44}
45
46void __weak hw_perf_counter_enable(struct perf_counter *counter) { }
47void __weak hw_perf_counter_disable(struct perf_counter *counter) { }
48void __weak hw_perf_counter_read(struct perf_counter *counter) { }
49void __weak hw_perf_disable_all(void) { }
50void __weak hw_perf_enable_all(void) { }
51void __weak hw_perf_counter_setup(void) { }
52
53#if BITS_PER_LONG == 64
54
55/*
56 * Read the cached counter in counter safe against cross CPU / NMI
57 * modifications. 64 bit version - no complications.
58 */
Ingo Molnar04289bb2008-12-11 08:38:42 +010059static inline u64 perf_counter_read_safe(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +010060{
61 return (u64) atomic64_read(&counter->count);
62}
63
64#else
65
66/*
67 * Read the cached counter in counter safe against cross CPU / NMI
68 * modifications. 32 bit version.
69 */
Ingo Molnar04289bb2008-12-11 08:38:42 +010070static u64 perf_counter_read_safe(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +010071{
72 u32 cntl, cnth;
73
74 local_irq_disable();
75 do {
76 cnth = atomic_read(&counter->count32[1]);
77 cntl = atomic_read(&counter->count32[0]);
78 } while (cnth != atomic_read(&counter->count32[1]));
79
80 local_irq_enable();
81
82 return cntl | ((u64) cnth) << 32;
83}
84
85#endif
86
Ingo Molnar04289bb2008-12-11 08:38:42 +010087static void
88list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
89{
90 struct perf_counter *group_leader = counter->group_leader;
91
92 /*
93 * Depending on whether it is a standalone or sibling counter,
94 * add it straight to the context's counter list, or to the group
95 * leader's sibling list:
96 */
97 if (counter->group_leader == counter)
98 list_add_tail(&counter->list_entry, &ctx->counter_list);
99 else
100 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
101}
102
103static void
104list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
105{
106 struct perf_counter *sibling, *tmp;
107
108 list_del_init(&counter->list_entry);
109
Ingo Molnar04289bb2008-12-11 08:38:42 +0100110 /*
111 * If this was a group counter with sibling counters then
112 * upgrade the siblings to singleton counters by adding them
113 * to the context list directly:
114 */
115 list_for_each_entry_safe(sibling, tmp,
116 &counter->sibling_list, list_entry) {
117
118 list_del_init(&sibling->list_entry);
119 list_add_tail(&sibling->list_entry, &ctx->counter_list);
120 WARN_ON_ONCE(!sibling->group_leader);
121 WARN_ON_ONCE(sibling->group_leader == sibling);
122 sibling->group_leader = sibling;
123 }
124}
125
Thomas Gleixner0793a612008-12-04 20:12:29 +0100126/*
127 * Cross CPU call to remove a performance counter
128 *
129 * We disable the counter on the hardware level first. After that we
130 * remove it from the context list.
131 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100132static void __perf_counter_remove_from_context(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100133{
134 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
135 struct perf_counter *counter = info;
136 struct perf_counter_context *ctx = counter->ctx;
137
138 /*
139 * If this is a task context, we need to check whether it is
140 * the current task context of this cpu. If not it has been
141 * scheduled out before the smp call arrived.
142 */
143 if (ctx->task && cpuctx->task_ctx != ctx)
144 return;
145
146 spin_lock(&ctx->lock);
147
148 if (counter->active) {
149 hw_perf_counter_disable(counter);
150 counter->active = 0;
151 ctx->nr_active--;
152 cpuctx->active_oncpu--;
153 counter->task = NULL;
154 }
155 ctx->nr_counters--;
156
157 /*
158 * Protect the list operation against NMI by disabling the
159 * counters on a global level. NOP for non NMI based counters.
160 */
161 hw_perf_disable_all();
Ingo Molnar04289bb2008-12-11 08:38:42 +0100162 list_del_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100163 hw_perf_enable_all();
164
165 if (!ctx->task) {
166 /*
167 * Allow more per task counters with respect to the
168 * reservation:
169 */
170 cpuctx->max_pertask =
171 min(perf_max_counters - ctx->nr_counters,
172 perf_max_counters - perf_reserved_percpu);
173 }
174
175 spin_unlock(&ctx->lock);
176}
177
178
179/*
180 * Remove the counter from a task's (or a CPU's) list of counters.
181 *
182 * Must be called with counter->mutex held.
183 *
184 * CPU counters are removed with a smp call. For task counters we only
185 * call when the task is on a CPU.
186 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100187static void perf_counter_remove_from_context(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100188{
189 struct perf_counter_context *ctx = counter->ctx;
190 struct task_struct *task = ctx->task;
191
192 if (!task) {
193 /*
194 * Per cpu counters are removed via an smp call and
195 * the removal is always sucessful.
196 */
197 smp_call_function_single(counter->cpu,
Ingo Molnar04289bb2008-12-11 08:38:42 +0100198 __perf_counter_remove_from_context,
Thomas Gleixner0793a612008-12-04 20:12:29 +0100199 counter, 1);
200 return;
201 }
202
203retry:
Ingo Molnar04289bb2008-12-11 08:38:42 +0100204 task_oncpu_function_call(task, __perf_counter_remove_from_context,
Thomas Gleixner0793a612008-12-04 20:12:29 +0100205 counter);
206
207 spin_lock_irq(&ctx->lock);
208 /*
209 * If the context is active we need to retry the smp call.
210 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100211 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100212 spin_unlock_irq(&ctx->lock);
213 goto retry;
214 }
215
216 /*
217 * The lock prevents that this context is scheduled in so we
Ingo Molnar04289bb2008-12-11 08:38:42 +0100218 * can remove the counter safely, if the call above did not
Thomas Gleixner0793a612008-12-04 20:12:29 +0100219 * succeed.
220 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100221 if (!list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100222 ctx->nr_counters--;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100223 list_del_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100224 counter->task = NULL;
225 }
226 spin_unlock_irq(&ctx->lock);
227}
228
229/*
230 * Cross CPU call to install and enable a preformance counter
231 */
232static void __perf_install_in_context(void *info)
233{
234 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
235 struct perf_counter *counter = info;
236 struct perf_counter_context *ctx = counter->ctx;
237 int cpu = smp_processor_id();
238
239 /*
240 * If this is a task context, we need to check whether it is
241 * the current task context of this cpu. If not it has been
242 * scheduled out before the smp call arrived.
243 */
244 if (ctx->task && cpuctx->task_ctx != ctx)
245 return;
246
247 spin_lock(&ctx->lock);
248
249 /*
250 * Protect the list operation against NMI by disabling the
251 * counters on a global level. NOP for non NMI based counters.
252 */
253 hw_perf_disable_all();
Ingo Molnar04289bb2008-12-11 08:38:42 +0100254 list_add_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100255 hw_perf_enable_all();
256
257 ctx->nr_counters++;
258
259 if (cpuctx->active_oncpu < perf_max_counters) {
260 hw_perf_counter_enable(counter);
261 counter->active = 1;
262 counter->oncpu = cpu;
263 ctx->nr_active++;
264 cpuctx->active_oncpu++;
265 }
266
267 if (!ctx->task && cpuctx->max_pertask)
268 cpuctx->max_pertask--;
269
270 spin_unlock(&ctx->lock);
271}
272
273/*
274 * Attach a performance counter to a context
275 *
276 * First we add the counter to the list with the hardware enable bit
277 * in counter->hw_config cleared.
278 *
279 * If the counter is attached to a task which is on a CPU we use a smp
280 * call to enable it in the task context. The task might have been
281 * scheduled away, but we check this in the smp call again.
282 */
283static void
284perf_install_in_context(struct perf_counter_context *ctx,
285 struct perf_counter *counter,
286 int cpu)
287{
288 struct task_struct *task = ctx->task;
289
290 counter->ctx = ctx;
291 if (!task) {
292 /*
293 * Per cpu counters are installed via an smp call and
294 * the install is always sucessful.
295 */
296 smp_call_function_single(cpu, __perf_install_in_context,
297 counter, 1);
298 return;
299 }
300
301 counter->task = task;
302retry:
303 task_oncpu_function_call(task, __perf_install_in_context,
304 counter);
305
306 spin_lock_irq(&ctx->lock);
307 /*
308 * If the context is active and the counter has not been added
309 * we need to retry the smp call.
310 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100311 if (ctx->nr_active && list_empty(&counter->list_entry)) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100312 spin_unlock_irq(&ctx->lock);
313 goto retry;
314 }
315
316 /*
317 * The lock prevents that this context is scheduled in so we
318 * can add the counter safely, if it the call above did not
319 * succeed.
320 */
Ingo Molnar04289bb2008-12-11 08:38:42 +0100321 if (list_empty(&counter->list_entry)) {
322 list_add_counter(counter, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100323 ctx->nr_counters++;
324 }
325 spin_unlock_irq(&ctx->lock);
326}
327
Ingo Molnar04289bb2008-12-11 08:38:42 +0100328static void
329counter_sched_out(struct perf_counter *counter,
330 struct perf_cpu_context *cpuctx,
331 struct perf_counter_context *ctx)
332{
333 if (!counter->active)
334 return;
335
336 hw_perf_counter_disable(counter);
337 counter->active = 0;
338 counter->oncpu = -1;
339
340 cpuctx->active_oncpu--;
341 ctx->nr_active--;
342}
343
344static void
345group_sched_out(struct perf_counter *group_counter,
346 struct perf_cpu_context *cpuctx,
347 struct perf_counter_context *ctx)
348{
349 struct perf_counter *counter;
350
351 counter_sched_out(group_counter, cpuctx, ctx);
352
353 /*
354 * Schedule out siblings (if any):
355 */
356 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
357 counter_sched_out(counter, cpuctx, ctx);
358}
359
Thomas Gleixner0793a612008-12-04 20:12:29 +0100360/*
361 * Called from scheduler to remove the counters of the current task,
362 * with interrupts disabled.
363 *
364 * We stop each counter and update the counter value in counter->count.
365 *
366 * This does not protect us against NMI, but hw_perf_counter_disable()
367 * sets the disabled bit in the control field of counter _before_
368 * accessing the counter control register. If a NMI hits, then it will
369 * not restart the counter.
370 */
371void perf_counter_task_sched_out(struct task_struct *task, int cpu)
372{
373 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
374 struct perf_counter_context *ctx = &task->perf_counter_ctx;
375 struct perf_counter *counter;
376
377 if (likely(!cpuctx->task_ctx))
378 return;
379
380 spin_lock(&ctx->lock);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100381 if (ctx->nr_active) {
382 list_for_each_entry(counter, &ctx->counter_list, list_entry)
383 group_sched_out(counter, cpuctx, ctx);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100384 }
385 spin_unlock(&ctx->lock);
386 cpuctx->task_ctx = NULL;
387}
388
Ingo Molnar04289bb2008-12-11 08:38:42 +0100389static void
390counter_sched_in(struct perf_counter *counter,
391 struct perf_cpu_context *cpuctx,
392 struct perf_counter_context *ctx,
393 int cpu)
394{
Ingo Molnar04289bb2008-12-11 08:38:42 +0100395 hw_perf_counter_enable(counter);
396 counter->active = 1;
397 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
398
399 cpuctx->active_oncpu++;
400 ctx->nr_active++;
401}
402
403static void
404group_sched_in(struct perf_counter *group_counter,
405 struct perf_cpu_context *cpuctx,
406 struct perf_counter_context *ctx,
407 int cpu)
408{
409 struct perf_counter *counter;
410
411 counter_sched_in(group_counter, cpuctx, ctx, cpu);
412
413 /*
414 * Schedule in siblings as one group (if any):
415 */
416 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
417 counter_sched_in(counter, cpuctx, ctx, cpu);
418}
419
Thomas Gleixner0793a612008-12-04 20:12:29 +0100420/*
421 * Called from scheduler to add the counters of the current task
422 * with interrupts disabled.
423 *
424 * We restore the counter value and then enable it.
425 *
426 * This does not protect us against NMI, but hw_perf_counter_enable()
427 * sets the enabled bit in the control field of counter _before_
428 * accessing the counter control register. If a NMI hits, then it will
429 * keep the counter running.
430 */
431void perf_counter_task_sched_in(struct task_struct *task, int cpu)
432{
433 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
434 struct perf_counter_context *ctx = &task->perf_counter_ctx;
435 struct perf_counter *counter;
436
437 if (likely(!ctx->nr_counters))
438 return;
439
440 spin_lock(&ctx->lock);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100441 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100442 if (ctx->nr_active == cpuctx->max_pertask)
443 break;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100444
445 /*
446 * Listen to the 'cpu' scheduling filter constraint
447 * of counters:
448 */
Thomas Gleixner0793a612008-12-04 20:12:29 +0100449 if (counter->cpu != -1 && counter->cpu != cpu)
450 continue;
451
Ingo Molnar04289bb2008-12-11 08:38:42 +0100452 group_sched_in(counter, cpuctx, ctx, cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100453 }
454 spin_unlock(&ctx->lock);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100455
Thomas Gleixner0793a612008-12-04 20:12:29 +0100456 cpuctx->task_ctx = ctx;
457}
458
459void perf_counter_task_tick(struct task_struct *curr, int cpu)
460{
461 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
462 struct perf_counter *counter;
463
464 if (likely(!ctx->nr_counters))
465 return;
466
467 perf_counter_task_sched_out(curr, cpu);
468
469 spin_lock(&ctx->lock);
470
471 /*
Ingo Molnar04289bb2008-12-11 08:38:42 +0100472 * Rotate the first entry last (works just fine for group counters too):
Thomas Gleixner0793a612008-12-04 20:12:29 +0100473 */
474 hw_perf_disable_all();
Ingo Molnar04289bb2008-12-11 08:38:42 +0100475 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
476 list_del(&counter->list_entry);
477 list_add_tail(&counter->list_entry, &ctx->counter_list);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100478 break;
479 }
480 hw_perf_enable_all();
481
482 spin_unlock(&ctx->lock);
483
484 perf_counter_task_sched_in(curr, cpu);
485}
486
487/*
Ingo Molnar04289bb2008-12-11 08:38:42 +0100488 * Initialize the perf_counter context in a task_struct:
489 */
490static void
491__perf_counter_init_context(struct perf_counter_context *ctx,
492 struct task_struct *task)
493{
494 spin_lock_init(&ctx->lock);
495 INIT_LIST_HEAD(&ctx->counter_list);
496 ctx->nr_counters = 0;
497 ctx->task = task;
498}
499/*
Thomas Gleixner0793a612008-12-04 20:12:29 +0100500 * Initialize the perf_counter context in task_struct
501 */
502void perf_counter_init_task(struct task_struct *task)
503{
Ingo Molnar04289bb2008-12-11 08:38:42 +0100504 __perf_counter_init_context(&task->perf_counter_ctx, task);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100505}
506
/*
 * Cross CPU call to read the hardware counter; @info is the
 * struct perf_counter to read.
 */
static void __hw_perf_counter_read(void *info)
{
	struct perf_counter *counter = info;

	hw_perf_counter_read(counter);
}
514
Ingo Molnar04289bb2008-12-11 08:38:42 +0100515static u64 perf_counter_read(struct perf_counter *counter)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100516{
517 /*
518 * If counter is enabled and currently active on a CPU, update the
519 * value in the counter structure:
520 */
521 if (counter->active) {
522 smp_call_function_single(counter->oncpu,
523 __hw_perf_counter_read, counter, 1);
524 }
525
Ingo Molnar04289bb2008-12-11 08:38:42 +0100526 return perf_counter_read_safe(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100527}
528
529/*
530 * Cross CPU call to switch performance data pointers
531 */
532static void __perf_switch_irq_data(void *info)
533{
534 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
535 struct perf_counter *counter = info;
536 struct perf_counter_context *ctx = counter->ctx;
537 struct perf_data *oldirqdata = counter->irqdata;
538
539 /*
540 * If this is a task context, we need to check whether it is
541 * the current task context of this cpu. If not it has been
542 * scheduled out before the smp call arrived.
543 */
544 if (ctx->task) {
545 if (cpuctx->task_ctx != ctx)
546 return;
547 spin_lock(&ctx->lock);
548 }
549
550 /* Change the pointer NMI safe */
551 atomic_long_set((atomic_long_t *)&counter->irqdata,
552 (unsigned long) counter->usrdata);
553 counter->usrdata = oldirqdata;
554
555 if (ctx->task)
556 spin_unlock(&ctx->lock);
557}
558
559static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
560{
561 struct perf_counter_context *ctx = counter->ctx;
562 struct perf_data *oldirqdata = counter->irqdata;
563 struct task_struct *task = ctx->task;
564
565 if (!task) {
566 smp_call_function_single(counter->cpu,
567 __perf_switch_irq_data,
568 counter, 1);
569 return counter->usrdata;
570 }
571
572retry:
573 spin_lock_irq(&ctx->lock);
574 if (!counter->active) {
575 counter->irqdata = counter->usrdata;
576 counter->usrdata = oldirqdata;
577 spin_unlock_irq(&ctx->lock);
578 return oldirqdata;
579 }
580 spin_unlock_irq(&ctx->lock);
581 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
582 /* Might have failed, because task was scheduled out */
583 if (counter->irqdata == oldirqdata)
584 goto retry;
585
586 return counter->usrdata;
587}
588
589static void put_context(struct perf_counter_context *ctx)
590{
591 if (ctx->task)
592 put_task_struct(ctx->task);
593}
594
595static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
596{
597 struct perf_cpu_context *cpuctx;
598 struct perf_counter_context *ctx;
599 struct task_struct *task;
600
601 /*
602 * If cpu is not a wildcard then this is a percpu counter:
603 */
604 if (cpu != -1) {
605 /* Must be root to operate on a CPU counter: */
606 if (!capable(CAP_SYS_ADMIN))
607 return ERR_PTR(-EACCES);
608
609 if (cpu < 0 || cpu > num_possible_cpus())
610 return ERR_PTR(-EINVAL);
611
612 /*
613 * We could be clever and allow to attach a counter to an
614 * offline CPU and activate it when the CPU comes up, but
615 * that's for later.
616 */
617 if (!cpu_isset(cpu, cpu_online_map))
618 return ERR_PTR(-ENODEV);
619
620 cpuctx = &per_cpu(perf_cpu_context, cpu);
621 ctx = &cpuctx->ctx;
622
623 WARN_ON_ONCE(ctx->task);
624 return ctx;
625 }
626
627 rcu_read_lock();
628 if (!pid)
629 task = current;
630 else
631 task = find_task_by_vpid(pid);
632 if (task)
633 get_task_struct(task);
634 rcu_read_unlock();
635
636 if (!task)
637 return ERR_PTR(-ESRCH);
638
639 ctx = &task->perf_counter_ctx;
640 ctx->task = task;
641
642 /* Reuse ptrace permission checks for now. */
643 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
644 put_context(ctx);
645 return ERR_PTR(-EACCES);
646 }
647
648 return ctx;
649}
650
651/*
652 * Called when the last reference to the file is gone.
653 */
654static int perf_release(struct inode *inode, struct file *file)
655{
656 struct perf_counter *counter = file->private_data;
657 struct perf_counter_context *ctx = counter->ctx;
658
659 file->private_data = NULL;
660
661 mutex_lock(&counter->mutex);
662
Ingo Molnar04289bb2008-12-11 08:38:42 +0100663 perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100664 put_context(ctx);
665
666 mutex_unlock(&counter->mutex);
667
668 kfree(counter);
669
670 return 0;
671}
672
673/*
674 * Read the performance counter - simple non blocking version for now
675 */
676static ssize_t
677perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
678{
679 u64 cntval;
680
681 if (count != sizeof(cntval))
682 return -EINVAL;
683
684 mutex_lock(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100685 cntval = perf_counter_read(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100686 mutex_unlock(&counter->mutex);
687
688 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
689}
690
691static ssize_t
692perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
693{
694 if (!usrdata->len)
695 return 0;
696
697 count = min(count, (size_t)usrdata->len);
698 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
699 return -EFAULT;
700
701 /* Adjust the counters */
702 usrdata->len -= count;
703 if (!usrdata->len)
704 usrdata->rd_idx = 0;
705 else
706 usrdata->rd_idx += count;
707
708 return count;
709}
710
711static ssize_t
712perf_read_irq_data(struct perf_counter *counter,
713 char __user *buf,
714 size_t count,
715 int nonblocking)
716{
717 struct perf_data *irqdata, *usrdata;
718 DECLARE_WAITQUEUE(wait, current);
719 ssize_t res;
720
721 irqdata = counter->irqdata;
722 usrdata = counter->usrdata;
723
724 if (usrdata->len + irqdata->len >= count)
725 goto read_pending;
726
727 if (nonblocking)
728 return -EAGAIN;
729
730 spin_lock_irq(&counter->waitq.lock);
731 __add_wait_queue(&counter->waitq, &wait);
732 for (;;) {
733 set_current_state(TASK_INTERRUPTIBLE);
734 if (usrdata->len + irqdata->len >= count)
735 break;
736
737 if (signal_pending(current))
738 break;
739
740 spin_unlock_irq(&counter->waitq.lock);
741 schedule();
742 spin_lock_irq(&counter->waitq.lock);
743 }
744 __remove_wait_queue(&counter->waitq, &wait);
745 __set_current_state(TASK_RUNNING);
746 spin_unlock_irq(&counter->waitq.lock);
747
748 if (usrdata->len + irqdata->len < count)
749 return -ERESTARTSYS;
750read_pending:
751 mutex_lock(&counter->mutex);
752
753 /* Drain pending data first: */
754 res = perf_copy_usrdata(usrdata, buf, count);
755 if (res < 0 || res == count)
756 goto out;
757
758 /* Switch irq buffer: */
759 usrdata = perf_switch_irq_data(counter);
760 if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
761 if (!res)
762 res = -EFAULT;
763 } else {
764 res = count;
765 }
766out:
767 mutex_unlock(&counter->mutex);
768
769 return res;
770}
771
772static ssize_t
773perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
774{
775 struct perf_counter *counter = file->private_data;
776
Ingo Molnar9f66a382008-12-10 12:33:23 +0100777 switch (counter->hw_event.record_type) {
Thomas Gleixner0793a612008-12-04 20:12:29 +0100778 case PERF_RECORD_SIMPLE:
779 return perf_read_hw(counter, buf, count);
780
781 case PERF_RECORD_IRQ:
782 case PERF_RECORD_GROUP:
783 return perf_read_irq_data(counter, buf, count,
784 file->f_flags & O_NONBLOCK);
785 }
786 return -EINVAL;
787}
788
789static unsigned int perf_poll(struct file *file, poll_table *wait)
790{
791 struct perf_counter *counter = file->private_data;
792 unsigned int events = 0;
793 unsigned long flags;
794
795 poll_wait(file, &counter->waitq, wait);
796
797 spin_lock_irqsave(&counter->waitq.lock, flags);
798 if (counter->usrdata->len || counter->irqdata->len)
799 events |= POLLIN;
800 spin_unlock_irqrestore(&counter->waitq.lock, flags);
801
802 return events;
803}
804
805static const struct file_operations perf_fops = {
806 .release = perf_release,
807 .read = perf_read,
808 .poll = perf_poll,
809};
810
811/*
812 * Allocate and initialize a counter structure
813 */
814static struct perf_counter *
Ingo Molnar04289bb2008-12-11 08:38:42 +0100815perf_counter_alloc(struct perf_counter_hw_event *hw_event,
816 int cpu,
817 struct perf_counter *group_leader)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100818{
819 struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
820
821 if (!counter)
822 return NULL;
823
Ingo Molnar04289bb2008-12-11 08:38:42 +0100824 /*
825 * Single counters are their own group leaders, with an
826 * empty sibling list:
827 */
828 if (!group_leader)
829 group_leader = counter;
830
Thomas Gleixner0793a612008-12-04 20:12:29 +0100831 mutex_init(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100832 INIT_LIST_HEAD(&counter->list_entry);
833 INIT_LIST_HEAD(&counter->sibling_list);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100834 init_waitqueue_head(&counter->waitq);
835
Ingo Molnar9f66a382008-12-10 12:33:23 +0100836 counter->irqdata = &counter->data[0];
837 counter->usrdata = &counter->data[1];
838 counter->cpu = cpu;
839 counter->hw_event = *hw_event;
840 counter->wakeup_pending = 0;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100841 counter->group_leader = group_leader;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100842
843 return counter;
844}
845
846/**
Ingo Molnar9f66a382008-12-10 12:33:23 +0100847 * sys_perf_task_open - open a performance counter, associate it to a task/cpu
848 *
849 * @hw_event_uptr: event type attributes for monitoring/sampling
Thomas Gleixner0793a612008-12-04 20:12:29 +0100850 * @pid: target pid
Ingo Molnar9f66a382008-12-10 12:33:23 +0100851 * @cpu: target cpu
852 * @group_fd: group leader counter fd
Thomas Gleixner0793a612008-12-04 20:12:29 +0100853 */
Ingo Molnar9f66a382008-12-10 12:33:23 +0100854asmlinkage int sys_perf_counter_open(
855
856 struct perf_counter_hw_event *hw_event_uptr __user,
857 pid_t pid,
858 int cpu,
859 int group_fd)
860
Thomas Gleixner0793a612008-12-04 20:12:29 +0100861{
Ingo Molnar04289bb2008-12-11 08:38:42 +0100862 struct perf_counter *counter, *group_leader;
Ingo Molnar9f66a382008-12-10 12:33:23 +0100863 struct perf_counter_hw_event hw_event;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100864 struct perf_counter_context *ctx;
865 struct file *group_file = NULL;
866 int fput_needed = 0;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100867 int ret;
868
Ingo Molnar9f66a382008-12-10 12:33:23 +0100869 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
Thomas Gleixnereab656a2008-12-08 19:26:59 +0100870 return -EFAULT;
871
Ingo Molnar04289bb2008-12-11 08:38:42 +0100872 /*
Ingo Molnarccff2862008-12-11 11:26:29 +0100873 * Get the target context (task or percpu):
874 */
875 ctx = find_get_context(pid, cpu);
876 if (IS_ERR(ctx))
877 return PTR_ERR(ctx);
878
879 /*
880 * Look up the group leader (we will attach this counter to it):
Ingo Molnar04289bb2008-12-11 08:38:42 +0100881 */
882 group_leader = NULL;
883 if (group_fd != -1) {
884 ret = -EINVAL;
885 group_file = fget_light(group_fd, &fput_needed);
886 if (!group_file)
Ingo Molnarccff2862008-12-11 11:26:29 +0100887 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100888 if (group_file->f_op != &perf_fops)
Ingo Molnarccff2862008-12-11 11:26:29 +0100889 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100890
891 group_leader = group_file->private_data;
892 /*
Ingo Molnarccff2862008-12-11 11:26:29 +0100893 * Do not allow a recursive hierarchy (this new sibling
894 * becoming part of another group-sibling):
Ingo Molnar04289bb2008-12-11 08:38:42 +0100895 */
Ingo Molnarccff2862008-12-11 11:26:29 +0100896 if (group_leader->group_leader != group_leader)
897 goto err_put_context;
898 /*
899 * Do not allow to attach to a group in a different
900 * task or CPU context:
901 */
902 if (group_leader->ctx != ctx)
903 goto err_put_context;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100904 }
905
Thomas Gleixner0793a612008-12-04 20:12:29 +0100906 ret = -ENOMEM;
Ingo Molnar04289bb2008-12-11 08:38:42 +0100907 counter = perf_counter_alloc(&hw_event, cpu, group_leader);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100908 if (!counter)
909 goto err_put_context;
910
Thomas Gleixnerdfa7c892008-12-08 19:35:37 +0100911 ret = hw_perf_counter_init(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100912 if (ret)
913 goto err_free_put_context;
914
915 perf_install_in_context(ctx, counter, cpu);
916
917 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
918 if (ret < 0)
919 goto err_remove_free_put_context;
920
Ingo Molnar04289bb2008-12-11 08:38:42 +0100921out_fput:
922 fput_light(group_file, fput_needed);
923
Thomas Gleixner0793a612008-12-04 20:12:29 +0100924 return ret;
925
926err_remove_free_put_context:
927 mutex_lock(&counter->mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100928 perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100929 mutex_unlock(&counter->mutex);
930
931err_free_put_context:
932 kfree(counter);
933
934err_put_context:
935 put_context(ctx);
936
Ingo Molnar04289bb2008-12-11 08:38:42 +0100937 goto out_fput;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100938}
939
Ingo Molnar04289bb2008-12-11 08:38:42 +0100940static void __cpuinit perf_counter_init_cpu(int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100941{
Ingo Molnar04289bb2008-12-11 08:38:42 +0100942 struct perf_cpu_context *cpuctx;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100943
Ingo Molnar04289bb2008-12-11 08:38:42 +0100944 cpuctx = &per_cpu(perf_cpu_context, cpu);
945 __perf_counter_init_context(&cpuctx->ctx, NULL);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100946
947 mutex_lock(&perf_resource_mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100948 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
Thomas Gleixner0793a612008-12-04 20:12:29 +0100949 mutex_unlock(&perf_resource_mutex);
Ingo Molnar04289bb2008-12-11 08:38:42 +0100950
Thomas Gleixner0793a612008-12-04 20:12:29 +0100951 hw_perf_counter_setup();
952}
953
954#ifdef CONFIG_HOTPLUG_CPU
Ingo Molnar04289bb2008-12-11 08:38:42 +0100955static void __perf_counter_exit_cpu(void *info)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100956{
957 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
958 struct perf_counter_context *ctx = &cpuctx->ctx;
959 struct perf_counter *counter, *tmp;
960
Ingo Molnar04289bb2008-12-11 08:38:42 +0100961 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
962 __perf_counter_remove_from_context(counter);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100963
964}
Ingo Molnar04289bb2008-12-11 08:38:42 +0100965static void perf_counter_exit_cpu(int cpu)
Thomas Gleixner0793a612008-12-04 20:12:29 +0100966{
Ingo Molnar04289bb2008-12-11 08:38:42 +0100967 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100968}
969#else
/* Without CPU hotplug there is nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
Thomas Gleixner0793a612008-12-04 20:12:29 +0100971#endif
972
973static int __cpuinit
974perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
975{
976 unsigned int cpu = (long)hcpu;
977
978 switch (action) {
979
980 case CPU_UP_PREPARE:
981 case CPU_UP_PREPARE_FROZEN:
Ingo Molnar04289bb2008-12-11 08:38:42 +0100982 perf_counter_init_cpu(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100983 break;
984
985 case CPU_DOWN_PREPARE:
986 case CPU_DOWN_PREPARE_FROZEN:
Ingo Molnar04289bb2008-12-11 08:38:42 +0100987 perf_counter_exit_cpu(cpu);
Thomas Gleixner0793a612008-12-04 20:12:29 +0100988 break;
989
990 default:
991 break;
992 }
993
994 return NOTIFY_OK;
995}
996
997static struct notifier_block __cpuinitdata perf_cpu_nb = {
998 .notifier_call = perf_cpu_notify,
999};
1000
1001static int __init perf_counter_init(void)
1002{
1003 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
1004 (void *)(long)smp_processor_id());
1005 register_cpu_notifier(&perf_cpu_nb);
1006
1007 return 0;
1008}
1009early_initcall(perf_counter_init);
1010
1011static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
1012{
1013 return sprintf(buf, "%d\n", perf_reserved_percpu);
1014}
1015
1016static ssize_t
1017perf_set_reserve_percpu(struct sysdev_class *class,
1018 const char *buf,
1019 size_t count)
1020{
1021 struct perf_cpu_context *cpuctx;
1022 unsigned long val;
1023 int err, cpu, mpt;
1024
1025 err = strict_strtoul(buf, 10, &val);
1026 if (err)
1027 return err;
1028 if (val > perf_max_counters)
1029 return -EINVAL;
1030
1031 mutex_lock(&perf_resource_mutex);
1032 perf_reserved_percpu = val;
1033 for_each_online_cpu(cpu) {
1034 cpuctx = &per_cpu(perf_cpu_context, cpu);
1035 spin_lock_irq(&cpuctx->ctx.lock);
1036 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
1037 perf_max_counters - perf_reserved_percpu);
1038 cpuctx->max_pertask = mpt;
1039 spin_unlock_irq(&cpuctx->ctx.lock);
1040 }
1041 mutex_unlock(&perf_resource_mutex);
1042
1043 return count;
1044}
1045
1046static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
1047{
1048 return sprintf(buf, "%d\n", perf_overcommit);
1049}
1050
1051static ssize_t
1052perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
1053{
1054 unsigned long val;
1055 int err;
1056
1057 err = strict_strtoul(buf, 10, &val);
1058 if (err)
1059 return err;
1060 if (val > 1)
1061 return -EINVAL;
1062
1063 mutex_lock(&perf_resource_mutex);
1064 perf_overcommit = val;
1065 mutex_unlock(&perf_resource_mutex);
1066
1067 return count;
1068}
1069
1070static SYSDEV_CLASS_ATTR(
1071 reserve_percpu,
1072 0644,
1073 perf_show_reserve_percpu,
1074 perf_set_reserve_percpu
1075 );
1076
1077static SYSDEV_CLASS_ATTR(
1078 overcommit,
1079 0644,
1080 perf_show_overcommit,
1081 perf_set_overcommit
1082 );
1083
1084static struct attribute *perfclass_attrs[] = {
1085 &attr_reserve_percpu.attr,
1086 &attr_overcommit.attr,
1087 NULL
1088};
1089
1090static struct attribute_group perfclass_attr_group = {
1091 .attrs = perfclass_attrs,
1092 .name = "perf_counters",
1093};
1094
1095static int __init perf_counter_sysfs_init(void)
1096{
1097 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
1098 &perfclass_attr_group);
1099}
1100device_initcall(perf_counter_sysfs_init);
1101