/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>

#include <asm/irq_regs.h>

/*
 * Each CPU has a list of per CPU events:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }

int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx, int cpu)
{
	return 0;
}

void __weak perf_event_print_debug(void)	{ }

static DEFINE_PER_CPU(int, perf_disable_count);

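/*
 * PMU disable/enable nesting: perf_disable() bumps a per-CPU count and
 * disables the PMU; perf_enable() re-enables it only once the count
 * drops back to zero, so nested disable/enable pairs are safe.
 */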
void __perf_disable(void)
{
	__get_cpu_var(perf_disable_count)++;
}

bool __perf_enable(void)
{
	return !--__get_cpu_var(perf_disable_count);
}

void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

void perf_enable(void)
{
	if (__perf_enable())
		hw_perf_enable();
}
135
136static void get_ctx(struct perf_event_context *ctx)
137{
138 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
139}
140
141static void free_ctx(struct rcu_head *head)
142{
143 struct perf_event_context *ctx;
144
145 ctx = container_of(head, struct perf_event_context, rcu_head);
146 kfree(ctx);
147}
148
149static void put_ctx(struct perf_event_context *ctx)
150{
151 if (atomic_dec_and_test(&ctx->refcount)) {
152 if (ctx->parent_ctx)
153 put_ctx(ctx->parent_ctx);
154 if (ctx->task)
155 put_task_struct(ctx->task);
156 call_rcu(&ctx->rcu_head, free_ctx);
157 }
158}
159
160static void unclone_ctx(struct perf_event_context *ctx)
161{
162 if (ctx->parent_ctx) {
163 put_ctx(ctx->parent_ctx);
164 ctx->parent_ctx = NULL;
165 }
166}
167
168/*
169 * If we inherit events we want to return the parent event id
170 * to userspace.
171 */
172static u64 primary_event_id(struct perf_event *event)
173{
174 u64 id = event->id;
175
176 if (event->parent)
177 id = event->parent->id;
178
179 return id;
180}
181
/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
187static struct perf_event_context *
188perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189{
190 struct perf_event_context *ctx;
191
192 rcu_read_lock();
193 retry:
194 ctx = rcu_dereference(task->perf_event_ctxp);
195 if (ctx) {
196 /*
197 * If this context is a clone of another, it might
198 * get swapped for another underneath us by
199 * perf_event_task_sched_out, though the
200 * rcu_read_lock() protects us from any context
201 * getting freed. Lock the context and check if it
202 * got swapped before we could get the lock, and retry
203 * if so. If we locked the right context, then it
204 * can't get swapped on us any more.
205 */
206 spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry;
210 }
211
212 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL;
215 }
216 }
217 rcu_read_unlock();
218 return ctx;
219}
220
221/*
222 * Get the context for a task and increment its pin_count so it
223 * can't get swapped to another task. This also increments its
224 * reference count so that the context can't get freed.
225 */
226static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
227{
228 struct perf_event_context *ctx;
229 unsigned long flags;
230
231 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) {
233 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags);
235 }
236 return ctx;
237}
238
239static void perf_unpin_context(struct perf_event_context *ctx)
240{
241 unsigned long flags;
242
243 spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx);
247}
248
/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
253static void
254list_add_event(struct perf_event *event, struct perf_event_context *ctx)
255{
256 struct perf_event *group_leader = event->group_leader;
257
258 /*
259 * Depending on whether it is a standalone or sibling event,
260 * add it straight to the context's event list, or to the group
261 * leader's sibling list:
262 */
263 if (group_leader == event)
264 list_add_tail(&event->group_entry, &ctx->group_list);
265 else {
266 list_add_tail(&event->group_entry, &group_leader->sibling_list);
267 group_leader->nr_siblings++;
268 }
269
270 list_add_rcu(&event->event_entry, &ctx->event_list);
271 ctx->nr_events++;
272 if (event->attr.inherit_stat)
273 ctx->nr_stat++;
274}
275
/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
280static void
281list_del_event(struct perf_event *event, struct perf_event_context *ctx)
282{
283 struct perf_event *sibling, *tmp;
284
285 if (list_empty(&event->group_entry))
286 return;
287 ctx->nr_events--;
288 if (event->attr.inherit_stat)
289 ctx->nr_stat--;
290
291 list_del_init(&event->group_entry);
292 list_del_rcu(&event->event_entry);
293
294 if (event->group_leader != event)
295 event->group_leader->nr_siblings--;
296
297 /*
298 * If this was a group event with sibling events then
299 * upgrade the siblings to singleton events by adding them
300 * to the context list directly:
301 */
302 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
303
304 list_move_tail(&sibling->group_entry, &ctx->group_list);
305 sibling->group_leader = sibling;
306 }
307}
308
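/*
 * Take an event off the PMU and mark it inactive (or OFF if a lazy
 * disable was pending), updating the timestamps and the context's
 * active counts.
 */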
309static void
310event_sched_out(struct perf_event *event,
311 struct perf_cpu_context *cpuctx,
312 struct perf_event_context *ctx)
313{
314 if (event->state != PERF_EVENT_STATE_ACTIVE)
315 return;
316
317 event->state = PERF_EVENT_STATE_INACTIVE;
318 if (event->pending_disable) {
319 event->pending_disable = 0;
320 event->state = PERF_EVENT_STATE_OFF;
321 }
322 event->tstamp_stopped = ctx->time;
323 event->pmu->disable(event);
324 event->oncpu = -1;
325
326 if (!is_software_event(event))
327 cpuctx->active_oncpu--;
328 ctx->nr_active--;
329 if (event->attr.exclusive || !cpuctx->active_oncpu)
330 cpuctx->exclusive = 0;
331}
332
333static void
334group_sched_out(struct perf_event *group_event,
335 struct perf_cpu_context *cpuctx,
336 struct perf_event_context *ctx)
337{
338 struct perf_event *event;
339
340 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
341 return;
342
343 event_sched_out(group_event, cpuctx, ctx);
344
345 /*
346 * Schedule out siblings (if any):
347 */
348 list_for_each_entry(event, &group_event->sibling_list, group_entry)
349 event_sched_out(event, cpuctx, ctx);
350
351 if (group_event->attr.exclusive)
352 cpuctx->exclusive = 0;
353}
354
355/*
356 * Cross CPU call to remove a performance event
357 *
358 * We disable the event on the hardware level first. After that we
359 * remove it from the context list.
360 */
361static void __perf_event_remove_from_context(void *info)
362{
363 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
364 struct perf_event *event = info;
365 struct perf_event_context *ctx = event->ctx;
366
367 /*
368 * If this is a task context, we need to check whether it is
369 * the current task context of this cpu. If not it has been
370 * scheduled out before the smp call arrived.
371 */
372 if (ctx->task && cpuctx->task_ctx != ctx)
373 return;
374
375 spin_lock(&ctx->lock);
376 /*
377 * Protect the list operation against NMI by disabling the
378 * events on a global level.
379 */
380 perf_disable();
381
382 event_sched_out(event, cpuctx, ctx);
383
384 list_del_event(event, ctx);
385
386 if (!ctx->task) {
387 /*
388 * Allow more per task events with respect to the
389 * reservation:
390 */
391 cpuctx->max_pertask =
392 min(perf_max_events - ctx->nr_events,
393 perf_max_events - perf_reserved_percpu);
394 }
395
396 perf_enable();
397 spin_unlock(&ctx->lock);
398}
399
400
401/*
402 * Remove the event from a task's (or a CPU's) list of events.
403 *
404 * Must be called with ctx->mutex held.
405 *
406 * CPU events are removed with a smp call. For task events we only
407 * call when the task is on a CPU.
408 *
409 * If event->ctx is a cloned context, callers must make sure that
410 * every task struct that event->ctx->task could possibly point to
411 * remains valid. This is OK when called from perf_release since
412 * that only calls us on the top-level context, which can't be a clone.
413 * When called from perf_event_exit_task, it's OK because the
414 * context has been detached from its task.
415 */
416static void perf_event_remove_from_context(struct perf_event *event)
417{
418 struct perf_event_context *ctx = event->ctx;
419 struct task_struct *task = ctx->task;
420
421 if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
426 smp_call_function_single(event->cpu,
427 __perf_event_remove_from_context,
428 event, 1);
429 return;
430 }
431
432retry:
433 task_oncpu_function_call(task, __perf_event_remove_from_context,
434 event);
435
436 spin_lock_irq(&ctx->lock);
437 /*
438 * If the context is active we need to retry the smp call.
439 */
440 if (ctx->nr_active && !list_empty(&event->group_entry)) {
441 spin_unlock_irq(&ctx->lock);
442 goto retry;
443 }
444
	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the event safely if the call above did not succeed.
	 */
	if (!list_empty(&event->group_entry))
		list_del_event(event, ctx);
	spin_unlock_irq(&ctx->lock);
}
454
455static inline u64 perf_clock(void)
456{
457 return cpu_clock(smp_processor_id());
458}
459
460/*
461 * Update the record of the current time in a context.
462 */
463static void update_context_time(struct perf_event_context *ctx)
464{
465 u64 now = perf_clock();
466
467 ctx->time += now - ctx->timestamp;
468 ctx->timestamp = now;
469}
470
/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
474static void update_event_times(struct perf_event *event)
475{
476 struct perf_event_context *ctx = event->ctx;
477 u64 run_end;
478
479 if (event->state < PERF_EVENT_STATE_INACTIVE ||
480 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
481 return;
482
483 event->total_time_enabled = ctx->time - event->tstamp_enabled;
484
485 if (event->state == PERF_EVENT_STATE_INACTIVE)
486 run_end = event->tstamp_stopped;
487 else
488 run_end = ctx->time;
489
490 event->total_time_running = run_end - event->tstamp_running;
491}
492
493/*
494 * Update total_time_enabled and total_time_running for all events in a group.
495 */
496static void update_group_times(struct perf_event *leader)
497{
498 struct perf_event *event;
499
500 update_event_times(leader);
501 list_for_each_entry(event, &leader->sibling_list, group_entry)
502 update_event_times(event);
503}
504
505/*
506 * Cross CPU call to disable a performance event
507 */
508static void __perf_event_disable(void *info)
509{
510 struct perf_event *event = info;
511 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
512 struct perf_event_context *ctx = event->ctx;
513
514 /*
515 * If this is a per-task event, need to check whether this
516 * event's task is the current task on this cpu.
517 */
518 if (ctx->task && cpuctx->task_ctx != ctx)
519 return;
520
521 spin_lock(&ctx->lock);
522
523 /*
524 * If the event is on, turn it off.
525 * If it is in error state, leave it in error state.
526 */
527 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
528 update_context_time(ctx);
529 update_group_times(event);
530 if (event == event->group_leader)
531 group_sched_out(event, cpuctx, ctx);
532 else
533 event_sched_out(event, cpuctx, ctx);
534 event->state = PERF_EVENT_STATE_OFF;
535 }
536
537 spin_unlock(&ctx->lock);
538}
539
/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid. This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
553static void perf_event_disable(struct perf_event *event)
554{
555 struct perf_event_context *ctx = event->ctx;
556 struct task_struct *task = ctx->task;
557
558 if (!task) {
559 /*
560 * Disable the event on the cpu that it's on
561 */
562 smp_call_function_single(event->cpu, __perf_event_disable,
563 event, 1);
564 return;
565 }
566
567 retry:
568 task_oncpu_function_call(task, __perf_event_disable, event);
569
570 spin_lock_irq(&ctx->lock);
571 /*
572 * If the event is still active, we need to retry the cross-call.
573 */
574 if (event->state == PERF_EVENT_STATE_ACTIVE) {
575 spin_unlock_irq(&ctx->lock);
576 goto retry;
577 }
578
579 /*
580 * Since we have the lock this context can't be scheduled
581 * in, so we can change the state safely.
582 */
583 if (event->state == PERF_EVENT_STATE_INACTIVE) {
584 update_group_times(event);
585 event->state = PERF_EVENT_STATE_OFF;
586 }
587
588 spin_unlock_irq(&ctx->lock);
589}
590
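/*
 * Schedule a single event onto the PMU: mark it active, call the
 * pmu->enable() hook and update the running timestamps and the
 * context's active counts. Returns -EAGAIN if the PMU refuses it.
 */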
591static int
592event_sched_in(struct perf_event *event,
593 struct perf_cpu_context *cpuctx,
594 struct perf_event_context *ctx,
595 int cpu)
596{
597 if (event->state <= PERF_EVENT_STATE_OFF)
598 return 0;
599
600 event->state = PERF_EVENT_STATE_ACTIVE;
601 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
602 /*
603 * The new state must be visible before we turn it on in the hardware:
604 */
605 smp_wmb();
606
607 if (event->pmu->enable(event)) {
608 event->state = PERF_EVENT_STATE_INACTIVE;
609 event->oncpu = -1;
610 return -EAGAIN;
611 }
612
613 event->tstamp_running += ctx->time - event->tstamp_stopped;
614
615 if (!is_software_event(event))
616 cpuctx->active_oncpu++;
617 ctx->nr_active++;
618
619 if (event->attr.exclusive)
620 cpuctx->exclusive = 1;
621
622 return 0;
623}
624
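/*
 * Schedule a whole group onto the PMU as one unit; if any sibling
 * fails to go on, undo the partial group and return -EAGAIN.
 */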
625static int
626group_sched_in(struct perf_event *group_event,
627 struct perf_cpu_context *cpuctx,
628 struct perf_event_context *ctx,
629 int cpu)
630{
631 struct perf_event *event, *partial_group;
632 int ret;
633
634 if (group_event->state == PERF_EVENT_STATE_OFF)
635 return 0;
636
637 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
638 if (ret)
639 return ret < 0 ? ret : 0;
640
641 if (event_sched_in(group_event, cpuctx, ctx, cpu))
642 return -EAGAIN;
643
644 /*
645 * Schedule in siblings as one group (if any):
646 */
647 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
648 if (event_sched_in(event, cpuctx, ctx, cpu)) {
649 partial_group = event;
650 goto group_error;
651 }
652 }
653
654 return 0;
655
656group_error:
657 /*
658 * Groups can be scheduled in as one unit only, so undo any
659 * partial group before returning:
660 */
661 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
662 if (event == partial_group)
663 break;
664 event_sched_out(event, cpuctx, ctx);
665 }
666 event_sched_out(group_event, cpuctx, ctx);
667
668 return -EAGAIN;
669}
670
671/*
672 * Return 1 for a group consisting entirely of software events,
673 * 0 if the group contains any hardware events.
674 */
675static int is_software_only_group(struct perf_event *leader)
676{
677 struct perf_event *event;
678
679 if (!is_software_event(leader))
680 return 0;
681
682 list_for_each_entry(event, &leader->sibling_list, group_entry)
683 if (!is_software_event(event))
684 return 0;
685
686 return 1;
687}
688
689/*
690 * Work out whether we can put this event group on the CPU now.
691 */
692static int group_can_go_on(struct perf_event *event,
693 struct perf_cpu_context *cpuctx,
694 int can_add_hw)
695{
696 /*
697 * Groups consisting entirely of software events can always go on.
698 */
699 if (is_software_only_group(event))
700 return 1;
701 /*
702 * If an exclusive group is already on, no other hardware
703 * events can go on.
704 */
705 if (cpuctx->exclusive)
706 return 0;
707 /*
708 * If this group is exclusive and there are already
709 * events on the CPU, it can't go on.
710 */
711 if (event->attr.exclusive && cpuctx->active_oncpu)
712 return 0;
713 /*
714 * Otherwise, try to add it if all previous groups were able
715 * to go on.
716 */
717 return can_add_hw;
718}
719
720static void add_event_to_ctx(struct perf_event *event,
721 struct perf_event_context *ctx)
722{
723 list_add_event(event, ctx);
724 event->tstamp_enabled = ctx->time;
725 event->tstamp_running = ctx->time;
726 event->tstamp_stopped = ctx->time;
727}
728
729/*
730 * Cross CPU call to install and enable a performance event
731 *
732 * Must be called with ctx->mutex held
733 */
734static void __perf_install_in_context(void *info)
735{
736 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
737 struct perf_event *event = info;
738 struct perf_event_context *ctx = event->ctx;
739 struct perf_event *leader = event->group_leader;
740 int cpu = smp_processor_id();
741 int err;
742
743 /*
744 * If this is a task context, we need to check whether it is
745 * the current task context of this cpu. If not it has been
746 * scheduled out before the smp call arrived.
747 * Or possibly this is the right context but it isn't
748 * on this cpu because it had no events.
749 */
750 if (ctx->task && cpuctx->task_ctx != ctx) {
751 if (cpuctx->task_ctx || ctx->task != current)
752 return;
753 cpuctx->task_ctx = ctx;
754 }
755
756 spin_lock(&ctx->lock);
757 ctx->is_active = 1;
758 update_context_time(ctx);
759
760 /*
761 * Protect the list operation against NMI by disabling the
762 * events on a global level. NOP for non NMI based events.
763 */
764 perf_disable();
765
766 add_event_to_ctx(event, ctx);
767
768 /*
769 * Don't put the event on if it is disabled or if
770 * it is in a group and the group isn't on.
771 */
772 if (event->state != PERF_EVENT_STATE_INACTIVE ||
773 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
774 goto unlock;
775
776 /*
777 * An exclusive event can't go on if there are already active
778 * hardware events, and no hardware event can go on if there
779 * is already an exclusive event on.
780 */
781 if (!group_can_go_on(event, cpuctx, 1))
782 err = -EEXIST;
783 else
784 err = event_sched_in(event, cpuctx, ctx, cpu);
785
786 if (err) {
787 /*
788 * This event couldn't go on. If it is in a group
789 * then we have to pull the whole group off.
790 * If the event group is pinned then put it in error state.
791 */
792 if (leader != event)
793 group_sched_out(leader, cpuctx, ctx);
794 if (leader->attr.pinned) {
795 update_group_times(leader);
796 leader->state = PERF_EVENT_STATE_ERROR;
797 }
798 }
799
800 if (!err && !ctx->task && cpuctx->max_pertask)
801 cpuctx->max_pertask--;
802
803 unlock:
804 perf_enable();
805
806 spin_unlock(&ctx->lock);
807}
808
809/*
810 * Attach a performance event to a context
811 *
812 * First we add the event to the list with the hardware enable bit
813 * in event->hw_config cleared.
814 *
815 * If the event is attached to a task which is on a CPU we use a smp
816 * call to enable it in the task context. The task might have been
817 * scheduled away, but we check this in the smp call again.
818 *
819 * Must be called with ctx->mutex held.
820 */
821static void
822perf_install_in_context(struct perf_event_context *ctx,
823 struct perf_event *event,
824 int cpu)
825{
826 struct task_struct *task = ctx->task;
827
828 if (!task) {
		/*
		 * Per cpu events are installed via an smp call and
		 * the install is always successful.
		 */
833 smp_call_function_single(cpu, __perf_install_in_context,
834 event, 1);
835 return;
836 }
837
838retry:
839 task_oncpu_function_call(task, __perf_install_in_context,
840 event);
841
842 spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the event has not been attached,
	 * we need to retry the smp call.
	 */
846 if (ctx->is_active && list_empty(&event->group_entry)) {
847 spin_unlock_irq(&ctx->lock);
848 goto retry;
849 }
850
	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the event safely if the call above did not succeed.
	 */
856 if (list_empty(&event->group_entry))
857 add_event_to_ctx(event, ctx);
858 spin_unlock_irq(&ctx->lock);
859}
860
/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
869static void __perf_event_mark_enabled(struct perf_event *event,
870 struct perf_event_context *ctx)
871{
872 struct perf_event *sub;
873
874 event->state = PERF_EVENT_STATE_INACTIVE;
875 event->tstamp_enabled = ctx->time - event->total_time_enabled;
876 list_for_each_entry(sub, &event->sibling_list, group_entry)
877 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
878 sub->tstamp_enabled =
879 ctx->time - sub->total_time_enabled;
880}
881
882/*
883 * Cross CPU call to enable a performance event
884 */
885static void __perf_event_enable(void *info)
886{
887 struct perf_event *event = info;
888 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
889 struct perf_event_context *ctx = event->ctx;
890 struct perf_event *leader = event->group_leader;
891 int err;
892
893 /*
894 * If this is a per-task event, need to check whether this
895 * event's task is the current task on this cpu.
896 */
897 if (ctx->task && cpuctx->task_ctx != ctx) {
898 if (cpuctx->task_ctx || ctx->task != current)
899 return;
900 cpuctx->task_ctx = ctx;
901 }
902
903 spin_lock(&ctx->lock);
904 ctx->is_active = 1;
905 update_context_time(ctx);
906
907 if (event->state >= PERF_EVENT_STATE_INACTIVE)
908 goto unlock;
909 __perf_event_mark_enabled(event, ctx);
910
911 /*
912 * If the event is in a group and isn't the group leader,
913 * then don't put it on unless the group is on.
914 */
915 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
916 goto unlock;
917
918 if (!group_can_go_on(event, cpuctx, 1)) {
919 err = -EEXIST;
920 } else {
921 perf_disable();
922 if (event == leader)
923 err = group_sched_in(event, cpuctx, ctx,
924 smp_processor_id());
925 else
926 err = event_sched_in(event, cpuctx, ctx,
927 smp_processor_id());
928 perf_enable();
929 }
930
931 if (err) {
932 /*
933 * If this event can't go on and it's part of a
934 * group, then the whole group has to come off.
935 */
936 if (leader != event)
937 group_sched_out(leader, cpuctx, ctx);
938 if (leader->attr.pinned) {
939 update_group_times(leader);
940 leader->state = PERF_EVENT_STATE_ERROR;
941 }
942 }
943
944 unlock:
945 spin_unlock(&ctx->lock);
946}
947
/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid. This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
957static void perf_event_enable(struct perf_event *event)
958{
959 struct perf_event_context *ctx = event->ctx;
960 struct task_struct *task = ctx->task;
961
962 if (!task) {
963 /*
964 * Enable the event on the cpu that it's on
965 */
966 smp_call_function_single(event->cpu, __perf_event_enable,
967 event, 1);
968 return;
969 }
970
971 spin_lock_irq(&ctx->lock);
972 if (event->state >= PERF_EVENT_STATE_INACTIVE)
973 goto out;
974
975 /*
976 * If the event is in error state, clear that first.
977 * That way, if we see the event in error state below, we
978 * know that it has gone back into error state, as distinct
979 * from the task having been scheduled away before the
980 * cross-call arrived.
981 */
982 if (event->state == PERF_EVENT_STATE_ERROR)
983 event->state = PERF_EVENT_STATE_OFF;
984
985 retry:
986 spin_unlock_irq(&ctx->lock);
987 task_oncpu_function_call(task, __perf_event_enable, event);
988
989 spin_lock_irq(&ctx->lock);
990
991 /*
992 * If the context is active and the event is still off,
993 * we need to retry the cross-call.
994 */
995 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
996 goto retry;
997
998 /*
999 * Since we have the lock this context can't be scheduled
1000 * in, so we can change the state safely.
1001 */
1002 if (event->state == PERF_EVENT_STATE_OFF)
1003 __perf_event_mark_enabled(event, ctx);
1004
1005 out:
1006 spin_unlock_irq(&ctx->lock);
1007}
1008
1009static int perf_event_refresh(struct perf_event *event, int refresh)
1010{
1011 /*
1012 * not supported on inherited events
1013 */
1014 if (event->attr.inherit)
1015 return -EINVAL;
1016
1017 atomic_add(refresh, &event->event_limit);
1018 perf_event_enable(event);
1019
1020 return 0;
1021}
1022
1023void __perf_event_sched_out(struct perf_event_context *ctx,
1024 struct perf_cpu_context *cpuctx)
1025{
1026 struct perf_event *event;
1027
1028 spin_lock(&ctx->lock);
1029 ctx->is_active = 0;
1030 if (likely(!ctx->nr_events))
1031 goto out;
1032 update_context_time(ctx);
1033
1034 perf_disable();
	if (ctx->nr_active) {
		list_for_each_entry(event, &ctx->group_list, group_entry)
			group_sched_out(event, cpuctx, ctx);
	}
	perf_enable();
 out:
	spin_unlock(&ctx->lock);
}
1043
1044/*
1045 * Test whether two contexts are equivalent, i.e. whether they
1046 * have both been cloned from the same version of the same context
1047 * and they both have the same number of enabled events.
1048 * If the number of enabled events is the same, then the set
1049 * of enabled events should be the same, because these are both
1050 * inherited contexts, therefore we can't access individual events
1051 * in them directly with an fd; we can only enable/disable all
1052 * events via prctl, or enable/disable all events in a family
1053 * via ioctl, which will have the same effect on both contexts.
1054 */
1055static int context_equiv(struct perf_event_context *ctx1,
1056 struct perf_event_context *ctx2)
1057{
1058 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1059 && ctx1->parent_gen == ctx2->parent_gen
1060 && !ctx1->pin_count && !ctx2->pin_count;
1061}
1062
static void __perf_event_sync_stat(struct perf_event *event,
				   struct perf_event *next_event)
1065{
1066 u64 value;
1067
1068 if (!event->attr.inherit_stat)
1069 return;
1070
1071 /*
1072 * Update the event value, we cannot use perf_event_read()
1073 * because we're in the middle of a context switch and have IRQs
1074 * disabled, which upsets smp_call_function_single(), however
1075 * we know the event must be on the current CPU, therefore we
1076 * don't need to use it.
1077 */
1078 switch (event->state) {
1079 case PERF_EVENT_STATE_ACTIVE:
		event->pmu->read(event);
		/* fall-through */

1083 case PERF_EVENT_STATE_INACTIVE:
1084 update_event_times(event);
1085 break;
1086
1087 default:
1088 break;
1089 }
1090
1091 /*
1092 * In order to keep per-task stats reliable we need to flip the event
1093 * values when we flip the contexts.
1094 */
1095 value = atomic64_read(&next_event->count);
1096 value = atomic64_xchg(&event->count, value);
1097 atomic64_set(&next_event->count, value);
1098
1099 swap(event->total_time_enabled, next_event->total_time_enabled);
1100 swap(event->total_time_running, next_event->total_time_running);
1101
1102 /*
1103 * Since we swizzled the values, update the user visible data too.
1104 */
1105 perf_event_update_userpage(event);
1106 perf_event_update_userpage(next_event);
1107}
1108
1109#define list_next_entry(pos, member) \
1110 list_entry(pos->member.next, typeof(*pos), member)
1111
1112static void perf_event_sync_stat(struct perf_event_context *ctx,
1113 struct perf_event_context *next_ctx)
1114{
1115 struct perf_event *event, *next_event;
1116
1117 if (!ctx->nr_stat)
1118 return;
1119
	update_context_time(ctx);

	event = list_first_entry(&ctx->event_list,
				 struct perf_event, event_entry);
1124
1125 next_event = list_first_entry(&next_ctx->event_list,
1126 struct perf_event, event_entry);
1127
1128 while (&event->event_entry != &ctx->event_list &&
1129 &next_event->event_entry != &next_ctx->event_list) {
1130
1131 __perf_event_sync_stat(event, next_event);
1132
1133 event = list_next_entry(event, event_entry);
1134 next_event = list_next_entry(next_event, event_entry);
1135 }
1136}
1137
/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * not restart the event.
 */
1149void perf_event_task_sched_out(struct task_struct *task,
1150 struct task_struct *next, int cpu)
1151{
1152 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1153 struct perf_event_context *ctx = task->perf_event_ctxp;
1154 struct perf_event_context *next_ctx;
1155 struct perf_event_context *parent;
1156 struct pt_regs *regs;
1157 int do_switch = 1;
1158
1159 regs = task_pt_regs(task);
1160 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1161
1162 if (likely(!ctx || !cpuctx->task_ctx))
1163 return;
1164
	rcu_read_lock();
1166 parent = rcu_dereference(ctx->parent_ctx);
1167 next_ctx = next->perf_event_ctxp;
1168 if (parent && next_ctx &&
1169 rcu_dereference(next_ctx->parent_ctx) == parent) {
1170 /*
1171 * Looks like the two contexts are clones, so we might be
1172 * able to optimize the context switch. We lock both
1173 * contexts and check that they are clones under the
1174 * lock (including re-checking that neither has been
1175 * uncloned in the meantime). It doesn't matter which
1176 * order we take the locks because no other cpu could
1177 * be trying to lock both of these tasks.
1178 */
1179 spin_lock(&ctx->lock);
1180 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1181 if (context_equiv(ctx, next_ctx)) {
1182 /*
1183 * XXX do we need a memory barrier of sorts
1184 * wrt to rcu_dereference() of perf_event_ctxp
1185 */
1186 task->perf_event_ctxp = next_ctx;
1187 next->perf_event_ctxp = ctx;
1188 ctx->task = next;
1189 next_ctx->task = task;
1190 do_switch = 0;
1191
1192 perf_event_sync_stat(ctx, next_ctx);
1193 }
1194 spin_unlock(&next_ctx->lock);
1195 spin_unlock(&ctx->lock);
1196 }
1197 rcu_read_unlock();
1198
1199 if (do_switch) {
1200 __perf_event_sched_out(ctx, cpuctx);
1201 cpuctx->task_ctx = NULL;
1202 }
1203}
1204
1205/*
1206 * Called with IRQs disabled
1207 */
1208static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1209{
1210 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1211
1212 if (!cpuctx->task_ctx)
1213 return;
1214
1215 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1216 return;
1217
1218 __perf_event_sched_out(ctx, cpuctx);
1219 cpuctx->task_ctx = NULL;
1220}
1221
1222/*
1223 * Called with IRQs disabled
1224 */
1225static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1226{
1227 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1228}
1229
1230static void
1231__perf_event_sched_in(struct perf_event_context *ctx,
1232 struct perf_cpu_context *cpuctx, int cpu)
1233{
1234 struct perf_event *event;
1235 int can_add_hw = 1;
1236
1237 spin_lock(&ctx->lock);
1238 ctx->is_active = 1;
1239 if (likely(!ctx->nr_events))
1240 goto out;
1241
1242 ctx->timestamp = perf_clock();
1243
1244 perf_disable();
1245
1246 /*
1247 * First go through the list and put on any pinned groups
1248 * in order to give them the best chance of going on.
1249 */
1250 list_for_each_entry(event, &ctx->group_list, group_entry) {
1251 if (event->state <= PERF_EVENT_STATE_OFF ||
1252 !event->attr.pinned)
1253 continue;
1254 if (event->cpu != -1 && event->cpu != cpu)
1255 continue;
1256
		if (group_can_go_on(event, cpuctx, 1))
			group_sched_in(event, cpuctx, ctx, cpu);

1260 /*
1261 * If this pinned group hasn't been scheduled,
1262 * put it in error state.
1263 */
1264 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1265 update_group_times(event);
1266 event->state = PERF_EVENT_STATE_ERROR;
1267 }
1268 }
1269
1270 list_for_each_entry(event, &ctx->group_list, group_entry) {
1271 /*
1272 * Ignore events in OFF or ERROR state, and
1273 * ignore pinned events since we did them already.
1274 */
1275 if (event->state <= PERF_EVENT_STATE_OFF ||
1276 event->attr.pinned)
1277 continue;
1278
1279 /*
1280 * Listen to the 'cpu' scheduling filter constraint
1281 * of events:
1282 */
1283 if (event->cpu != -1 && event->cpu != cpu)
1284 continue;
1285
		if (group_can_go_on(event, cpuctx, can_add_hw))
			if (group_sched_in(event, cpuctx, ctx, cpu))
				can_add_hw = 0;
	}
1290 perf_enable();
1291 out:
1292 spin_unlock(&ctx->lock);
1293}
1294
/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
 * keep the event running.
 */
1306void perf_event_task_sched_in(struct task_struct *task, int cpu)
1307{
1308 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1309 struct perf_event_context *ctx = task->perf_event_ctxp;
1310
1311 if (likely(!ctx))
1312 return;
1313 if (cpuctx->task_ctx == ctx)
1314 return;
1315 __perf_event_sched_in(ctx, cpuctx, cpu);
1316 cpuctx->task_ctx = ctx;
1317}
1318
1319static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1320{
1321 struct perf_event_context *ctx = &cpuctx->ctx;
1322
1323 __perf_event_sched_in(ctx, cpuctx, cpu);
1324}
1325
1326#define MAX_INTERRUPTS (~0ULL)
1327
1328static void perf_log_throttle(struct perf_event *event, int enable);
1329
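/*
 * Adjust the sample period so that the observed event rate moves
 * towards attr.sample_freq, applying a simple low-pass filter to the
 * period delta.
 */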
1330static void perf_adjust_period(struct perf_event *event, u64 events)
1331{
1332 struct hw_perf_event *hwc = &event->hw;
1333 u64 period, sample_period;
1334 s64 delta;
1335
1336 events *= hwc->sample_period;
1337 period = div64_u64(events, event->attr.sample_freq);
1338
1339 delta = (s64)(period - hwc->sample_period);
1340 delta = (delta + 7) / 8; /* low pass filter */
1341
1342 sample_period = hwc->sample_period + delta;
1343
1344 if (!sample_period)
1345 sample_period = 1;
1346
1347 hwc->sample_period = sample_period;
1348}
1349
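/*
 * Called from the timer tick: unthrottle events and, for freq-based
 * events, re-adjust their sample period based on the interrupts seen
 * during the last tick.
 */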
1350static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1351{
1352 struct perf_event *event;
1353 struct hw_perf_event *hwc;
1354 u64 interrupts, freq;
1355
1356 spin_lock(&ctx->lock);
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (event->state != PERF_EVENT_STATE_ACTIVE)
1359 continue;
1360
1361 hwc = &event->hw;
1362
1363 interrupts = hwc->interrupts;
1364 hwc->interrupts = 0;
1365
1366 /*
1367 * unthrottle events on the tick
1368 */
1369 if (interrupts == MAX_INTERRUPTS) {
1370 perf_log_throttle(event, 1);
1371 event->pmu->unthrottle(event);
1372 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1373 }
1374
1375 if (!event->attr.freq || !event->attr.sample_freq)
1376 continue;
1377
1378 /*
1379 * if the specified freq < HZ then we need to skip ticks
1380 */
1381 if (event->attr.sample_freq < HZ) {
1382 freq = event->attr.sample_freq;
1383
1384 hwc->freq_count += freq;
1385 hwc->freq_interrupts += interrupts;
1386
1387 if (hwc->freq_count < HZ)
1388 continue;
1389
1390 interrupts = hwc->freq_interrupts;
1391 hwc->freq_interrupts = 0;
1392 hwc->freq_count -= HZ;
1393 } else
1394 freq = HZ;
1395
1396 perf_adjust_period(event, freq * interrupts);
1397
1398 /*
1399 * In order to avoid being stalled by an (accidental) huge
1400 * sample period, force reset the sample period if we didn't
1401 * get any events in this freq period.
1402 */
1403 if (!interrupts) {
1404 perf_disable();
1405 event->pmu->disable(event);
1406 atomic64_set(&hwc->period_left, 0);
1407 event->pmu->enable(event);
1408 perf_enable();
1409 }
1410 }
1411 spin_unlock(&ctx->lock);
1412}
1413
1414/*
1415 * Round-robin a context's events:
1416 */
1417static void rotate_ctx(struct perf_event_context *ctx)
1418{
1419 struct perf_event *event;
1420
1421 if (!ctx->nr_events)
1422 return;
1423
1424 spin_lock(&ctx->lock);
1425 /*
1426 * Rotate the first entry last (works just fine for group events too):
1427 */
1428 perf_disable();
1429 list_for_each_entry(event, &ctx->group_list, group_entry) {
1430 list_move_tail(&event->group_entry, &ctx->group_list);
1431 break;
1432 }
1433 perf_enable();
1434
1435 spin_unlock(&ctx->lock);
1436}
1437
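/*
 * Per-tick housekeeping: adjust frequencies, then rotate the CPU and
 * task contexts by scheduling everything out, moving the first group
 * to the tail of its list and scheduling back in.
 */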
1438void perf_event_task_tick(struct task_struct *curr, int cpu)
1439{
1440 struct perf_cpu_context *cpuctx;
1441 struct perf_event_context *ctx;
1442
1443 if (!atomic_read(&nr_events))
1444 return;
1445
1446 cpuctx = &per_cpu(perf_cpu_context, cpu);
1447 ctx = curr->perf_event_ctxp;
1448
1449 perf_ctx_adjust_freq(&cpuctx->ctx);
1450 if (ctx)
1451 perf_ctx_adjust_freq(ctx);
1452
1453 perf_event_cpu_sched_out(cpuctx);
1454 if (ctx)
1455 __perf_event_task_sched_out(ctx);
1456
1457 rotate_ctx(&cpuctx->ctx);
1458 if (ctx)
1459 rotate_ctx(ctx);
1460
1461 perf_event_cpu_sched_in(cpuctx, cpu);
1462 if (ctx)
1463 perf_event_task_sched_in(curr, cpu);
1464}
1465
1466/*
1467 * Enable all of a task's events that have been marked enable-on-exec.
1468 * This expects task == current.
1469 */
1470static void perf_event_enable_on_exec(struct task_struct *task)
1471{
1472 struct perf_event_context *ctx;
1473 struct perf_event *event;
1474 unsigned long flags;
1475 int enabled = 0;
1476
1477 local_irq_save(flags);
1478 ctx = task->perf_event_ctxp;
1479 if (!ctx || !ctx->nr_events)
1480 goto out;
1481
1482 __perf_event_task_sched_out(ctx);
1483
1484 spin_lock(&ctx->lock);
1485
1486 list_for_each_entry(event, &ctx->group_list, group_entry) {
1487 if (!event->attr.enable_on_exec)
1488 continue;
1489 event->attr.enable_on_exec = 0;
1490 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1491 continue;
1492 __perf_event_mark_enabled(event, ctx);
1493 enabled = 1;
1494 }
1495
1496 /*
1497 * Unclone this context if we enabled any event.
1498 */
1499 if (enabled)
1500 unclone_ctx(ctx);
1501
1502 spin_unlock(&ctx->lock);
1503
1504 perf_event_task_sched_in(task, smp_processor_id());
1505 out:
1506 local_irq_restore(flags);
1507}
1508
1509/*
1510 * Cross CPU call to read the hardware event
1511 */
static void __perf_event_read(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived. In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);
	update_context_time(ctx);
	update_event_times(event);
	spin_unlock(&ctx->lock);

	event->pmu->read(event);
}
1535
1536static u64 perf_event_read(struct perf_event *event)
1537{
1538 /*
1539 * If event is enabled and currently active on a CPU, update the
1540 * value in the event structure:
1541 */
1542 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1543 smp_call_function_single(event->oncpu,
1544 __perf_event_read, event, 1);
1545 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
		struct perf_event_context *ctx = event->ctx;
		unsigned long flags;

		spin_lock_irqsave(&ctx->lock, flags);
		update_context_time(ctx);
		update_event_times(event);
		spin_unlock_irqrestore(&ctx->lock, flags);
	}
1554
1555 return atomic64_read(&event->count);
1556}
1557
1558/*
1559 * Initialize the perf_event context in a task_struct:
1560 */
1561static void
1562__perf_event_init_context(struct perf_event_context *ctx,
1563 struct task_struct *task)
1564{
1565 memset(ctx, 0, sizeof(*ctx));
1566 spin_lock_init(&ctx->lock);
1567 mutex_init(&ctx->mutex);
1568 INIT_LIST_HEAD(&ctx->group_list);
1569 INIT_LIST_HEAD(&ctx->event_list);
1570 atomic_set(&ctx->refcount, 1);
1571 ctx->task = task;
1572}
1573
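/*
 * Find (or allocate) the event context for the given pid/cpu pair and
 * take a reference on it: cpu != -1 selects the per-CPU context,
 * otherwise the task context of @pid (0 means current).
 */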
1574static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1575{
1576 struct perf_event_context *ctx;
1577 struct perf_cpu_context *cpuctx;
1578 struct task_struct *task;
1579 unsigned long flags;
1580 int err;
1581
1582 /*
1583 * If cpu is not a wildcard then this is a percpu event:
1584 */
1585 if (cpu != -1) {
1586 /* Must be root to operate on a CPU event: */
1587 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1588 return ERR_PTR(-EACCES);
1589
1590 if (cpu < 0 || cpu > num_possible_cpus())
1591 return ERR_PTR(-EINVAL);
1592
		/*
		 * We could be clever and allow attaching an event to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
1598 if (!cpu_isset(cpu, cpu_online_map))
1599 return ERR_PTR(-ENODEV);
1600
1601 cpuctx = &per_cpu(perf_cpu_context, cpu);
1602 ctx = &cpuctx->ctx;
1603 get_ctx(ctx);
1604
1605 return ctx;
1606 }
1607
1608 rcu_read_lock();
1609 if (!pid)
1610 task = current;
1611 else
1612 task = find_task_by_vpid(pid);
1613 if (task)
1614 get_task_struct(task);
1615 rcu_read_unlock();
1616
1617 if (!task)
1618 return ERR_PTR(-ESRCH);
1619
1620 /*
1621 * Can't attach events to a dying task.
1622 */
1623 err = -ESRCH;
1624 if (task->flags & PF_EXITING)
1625 goto errout;
1626
1627 /* Reuse ptrace permission checks for now. */
1628 err = -EACCES;
1629 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1630 goto errout;
1631
1632 retry:
1633 ctx = perf_lock_task_context(task, &flags);
1634 if (ctx) {
1635 unclone_ctx(ctx);
1636 spin_unlock_irqrestore(&ctx->lock, flags);
1637 }
1638
1639 if (!ctx) {
1640 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1641 err = -ENOMEM;
1642 if (!ctx)
1643 goto errout;
1644 __perf_event_init_context(ctx, task);
1645 get_ctx(ctx);
1646 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1647 /*
1648 * We raced with some other task; use
1649 * the context they set.
1650 */
1651 kfree(ctx);
1652 goto retry;
1653 }
1654 get_task_struct(task);
1655 }
1656
1657 put_task_struct(task);
1658 return ctx;
1659
1660 errout:
1661 put_task_struct(task);
1662 return ERR_PTR(err);
1663}
1664
static void perf_event_free_filter(struct perf_event *event);

static void free_event_rcu(struct rcu_head *head)
1668{
1669 struct perf_event *event;
1670
1671 event = container_of(head, struct perf_event, rcu_head);
1672 if (event->ns)
1673 put_pid_ns(event->ns);
	perf_event_free_filter(event);
	kfree(event);
}
1677
1678static void perf_pending_sync(struct perf_event *event);
1679
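/*
 * Release everything an event holds: pending work, the global event
 * counters, the output buffer reference, pmu-specific state and the
 * context reference; the event itself is freed via RCU.
 */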
1680static void free_event(struct perf_event *event)
1681{
1682 perf_pending_sync(event);
1683
1684 if (!event->parent) {
1685 atomic_dec(&nr_events);
1686 if (event->attr.mmap)
1687 atomic_dec(&nr_mmap_events);
1688 if (event->attr.comm)
1689 atomic_dec(&nr_comm_events);
1690 if (event->attr.task)
1691 atomic_dec(&nr_task_events);
1692 }
1693
1694 if (event->output) {
1695 fput(event->output->filp);
1696 event->output = NULL;
1697 }
1698
1699 if (event->destroy)
1700 event->destroy(event);
1701
1702 put_ctx(event->ctx);
1703 call_rcu(&event->rcu_head, free_event_rcu);
1704}
1705
int perf_event_release_kernel(struct perf_event *event)
1707{
1708 struct perf_event_context *ctx = event->ctx;
1709
1710 WARN_ON_ONCE(ctx->parent_ctx);
1711 mutex_lock(&ctx->mutex);
1712 perf_event_remove_from_context(event);
1713 mutex_unlock(&ctx->mutex);
1714
1715 mutex_lock(&event->owner->perf_event_mutex);
1716 list_del_init(&event->owner_entry);
1717 mutex_unlock(&event->owner->perf_event_mutex);
1718 put_task_struct(event->owner);
1719
1720 free_event(event);
1721
1722 return 0;
1723}
1724EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1725
/*
 * Called when the last reference to the file is gone.
 */
1729static int perf_release(struct inode *inode, struct file *file)
1730{
1731 struct perf_event *event = file->private_data;
1732
1733 file->private_data = NULL;
1734
1735 return perf_event_release_kernel(event);
1736}
1737
static int perf_event_read_size(struct perf_event *event)
1739{
1740 int entry = sizeof(u64); /* value */
1741 int size = 0;
1742 int nr = 1;
1743
1744 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1745 size += sizeof(u64);
1746
1747 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1748 size += sizeof(u64);
1749
1750 if (event->attr.read_format & PERF_FORMAT_ID)
1751 entry += sizeof(u64);
1752
1753 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1754 nr += event->group_leader->nr_siblings;
1755 size += sizeof(u64);
1756 }
1757
1758 size += entry * nr;
1759
1760 return size;
1761}
1762
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
	struct perf_event *child;
	u64 total = 0;

	*enabled = 0;
	*running = 0;

	mutex_lock(&event->child_mutex);
	total += perf_event_read(event);
	*enabled += event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
	*running += event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	list_for_each_entry(child, &event->child_list, child_list) {
		total += perf_event_read(child);
		*enabled += child->total_time_enabled;
		*running += child->total_time_running;
	}
	mutex_unlock(&event->child_mutex);

	return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

static int perf_event_read_group(struct perf_event *event,
				 u64 read_format, char __user *buf)
{
	struct perf_event *leader = event->group_leader, *sub;
	int n = 0, size = 0, ret = -EFAULT;
	struct perf_event_context *ctx = leader->ctx;
	u64 values[5];
	u64 count, enabled, running;

	mutex_lock(&ctx->mutex);
	count = perf_event_read_value(leader, &enabled, &running);

	values[n++] = 1 + leader->nr_siblings;
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
	values[n++] = count;
	if (read_format & PERF_FORMAT_ID)
		values[n++] = primary_event_id(leader);

	size = n * sizeof(u64);

	if (copy_to_user(buf, values, size))
		goto unlock;

	ret = size;

	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		n = 0;

		values[n++] = perf_event_read_value(sub, &enabled, &running);
		if (read_format & PERF_FORMAT_ID)
			values[n++] = primary_event_id(sub);

		size = n * sizeof(u64);

		if (copy_to_user(buf + size, values, size)) {
			ret = -EFAULT;
			goto unlock;
		}

		ret += size;
	}
unlock:
	mutex_unlock(&ctx->mutex);

	return ret;
}
1838
1839static int perf_event_read_one(struct perf_event *event,
1840 u64 read_format, char __user *buf)
1841{
	u64 enabled, running;
	u64 values[4];
	int n = 0;

	values[n++] = perf_event_read_value(event, &enabled, &running);
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		values[n++] = enabled;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		values[n++] = running;
	if (read_format & PERF_FORMAT_ID)
1852 values[n++] = primary_event_id(event);
1853
1854 if (copy_to_user(buf, values, n * sizeof(u64)))
1855 return -EFAULT;
1856
1857 return n * sizeof(u64);
1858}
1859
1860/*
1861 * Read the performance event - simple non blocking version for now
1862 */
1863static ssize_t
1864perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1865{
1866 u64 read_format = event->attr.read_format;
1867 int ret;
1868
	/*
	 * Return end-of-file for a read on an event that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
1874 if (event->state == PERF_EVENT_STATE_ERROR)
1875 return 0;
1876
1877 if (count < perf_event_read_size(event))
1878 return -ENOSPC;
1879
1880 WARN_ON_ONCE(event->ctx->parent_ctx);
	if (read_format & PERF_FORMAT_GROUP)
		ret = perf_event_read_group(event, read_format, buf);
	else
		ret = perf_event_read_one(event, read_format, buf);

	return ret;
1887}
1888
1889static ssize_t
1890perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1891{
1892 struct perf_event *event = file->private_data;
1893
1894 return perf_read_hw(event, buf, count);
1895}
1896
1897static unsigned int perf_poll(struct file *file, poll_table *wait)
1898{
1899 struct perf_event *event = file->private_data;
1900 struct perf_mmap_data *data;
1901 unsigned int events = POLL_HUP;
1902
1903 rcu_read_lock();
1904 data = rcu_dereference(event->data);
1905 if (data)
1906 events = atomic_xchg(&data->poll, 0);
1907 rcu_read_unlock();
1908
1909 poll_wait(file, &event->waitq, wait);
1910
1911 return events;
1912}
1913
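/*
 * Reset an event's count to zero after folding in its latest value,
 * and make the change visible through the mmap()ed user page.
 */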
1914static void perf_event_reset(struct perf_event *event)
1915{
1916 (void)perf_event_read(event);
1917 atomic64_set(&event->count, 0);
1918 perf_event_update_userpage(event);
1919}
1920
1921/*
1922 * Holding the top-level event's child_mutex means that any
1923 * descendant process that has inherited this event will block
1924 * in sync_child_event if it goes to exit, thus satisfying the
1925 * task existence requirements of perf_event_enable/disable.
1926 */
1927static void perf_event_for_each_child(struct perf_event *event,
1928 void (*func)(struct perf_event *))
1929{
1930 struct perf_event *child;
1931
1932 WARN_ON_ONCE(event->ctx->parent_ctx);
1933 mutex_lock(&event->child_mutex);
1934 func(event);
1935 list_for_each_entry(child, &event->child_list, child_list)
1936 func(child);
1937 mutex_unlock(&event->child_mutex);
1938}
1939
1940static void perf_event_for_each(struct perf_event *event,
1941 void (*func)(struct perf_event *))
1942{
1943 struct perf_event_context *ctx = event->ctx;
1944 struct perf_event *sibling;
1945
1946 WARN_ON_ONCE(ctx->parent_ctx);
1947 mutex_lock(&ctx->mutex);
1948 event = event->group_leader;
1949
1950 perf_event_for_each_child(event, func);
1951 func(event);
1952 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1953 perf_event_for_each_child(event, func);
1954 mutex_unlock(&ctx->mutex);
1955}
1956
1957static int perf_event_period(struct perf_event *event, u64 __user *arg)
1958{
1959 struct perf_event_context *ctx = event->ctx;
1960 unsigned long size;
1961 int ret = 0;
1962 u64 value;
1963
1964 if (!event->attr.sample_period)
1965 return -EINVAL;
1966
1967 size = copy_from_user(&value, arg, sizeof(value));
1968 if (size != sizeof(value))
1969 return -EFAULT;
1970
1971 if (!value)
1972 return -EINVAL;
1973
1974 spin_lock_irq(&ctx->lock);
1975 if (event->attr.freq) {
1976 if (value > sysctl_perf_event_sample_rate) {
1977 ret = -EINVAL;
1978 goto unlock;
1979 }
1980
1981 event->attr.sample_freq = value;
1982 } else {
1983 event->attr.sample_period = value;
1984 event->hw.sample_period = value;
1985 }
1986unlock:
1987 spin_unlock_irq(&ctx->lock);
1988
1989 return ret;
1990}
1991
Li Zefan6fb29152009-10-15 11:21:42 +08001992static int perf_event_set_output(struct perf_event *event, int output_fd);
1993static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001994
1995static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1996{
1997 struct perf_event *event = file->private_data;
1998 void (*func)(struct perf_event *);
1999 u32 flags = arg;
2000
2001 switch (cmd) {
2002 case PERF_EVENT_IOC_ENABLE:
2003 func = perf_event_enable;
2004 break;
2005 case PERF_EVENT_IOC_DISABLE:
2006 func = perf_event_disable;
2007 break;
2008 case PERF_EVENT_IOC_RESET:
2009 func = perf_event_reset;
2010 break;
2011
2012 case PERF_EVENT_IOC_REFRESH:
2013 return perf_event_refresh(event, arg);
2014
2015 case PERF_EVENT_IOC_PERIOD:
2016 return perf_event_period(event, (u64 __user *)arg);
2017
2018 case PERF_EVENT_IOC_SET_OUTPUT:
2019 return perf_event_set_output(event, arg);
2020
Li Zefan6fb29152009-10-15 11:21:42 +08002021 case PERF_EVENT_IOC_SET_FILTER:
2022 return perf_event_set_filter(event, (void __user *)arg);
2023
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002024 default:
2025 return -ENOTTY;
2026 }
2027
2028 if (flags & PERF_IOC_FLAG_GROUP)
2029 perf_event_for_each(event, func);
2030 else
2031 perf_event_for_each_child(event, func);
2032
2033 return 0;
2034}
2035
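/*
 * A minimal user-space sketch of the ioctls handled above, assuming
 * 'event_fd' comes from perf_event_open() (illustrative only):
 *
 *	ioctl(event_fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0);
 *
 *	u64 period = 4096;
 *	ioctl(event_fd, PERF_EVENT_IOC_PERIOD, &period);
 *
 *	ioctl(event_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 */
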
2036int perf_event_task_enable(void)
2037{
2038 struct perf_event *event;
2039
2040 mutex_lock(&current->perf_event_mutex);
2041 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2042 perf_event_for_each_child(event, perf_event_enable);
2043 mutex_unlock(&current->perf_event_mutex);
2044
2045 return 0;
2046}
2047
2048int perf_event_task_disable(void)
2049{
2050 struct perf_event *event;
2051
2052 mutex_lock(&current->perf_event_mutex);
2053 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2054 perf_event_for_each_child(event, perf_event_disable);
2055 mutex_unlock(&current->perf_event_mutex);
2056
2057 return 0;
2058}
2059
2060#ifndef PERF_EVENT_INDEX_OFFSET
2061# define PERF_EVENT_INDEX_OFFSET 0
2062#endif
2063
2064static int perf_event_index(struct perf_event *event)
2065{
2066 if (event->state != PERF_EVENT_STATE_ACTIVE)
2067 return 0;
2068
2069 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2070}
2071
2072/*
2073 * Callers need to ensure there can be no nesting of this function, otherwise
2074 * the seqlock logic goes bad. We cannot serialize this because the arch
2075 * code calls this from NMI context.
2076 */
2077void perf_event_update_userpage(struct perf_event *event)
2078{
2079 struct perf_event_mmap_page *userpg;
2080 struct perf_mmap_data *data;
2081
2082 rcu_read_lock();
2083 data = rcu_dereference(event->data);
2084 if (!data)
2085 goto unlock;
2086
2087 userpg = data->user_page;
2088
2089 /*
2090 * Disable preemption so as to not let the corresponding user-space
2091 * spin too long if we get preempted.
2092 */
2093 preempt_disable();
2094 ++userpg->lock;
2095 barrier();
2096 userpg->index = perf_event_index(event);
2097 userpg->offset = atomic64_read(&event->count);
2098 if (event->state == PERF_EVENT_STATE_ACTIVE)
2099 userpg->offset -= atomic64_read(&event->hw.prev_count);
2100
2101 userpg->time_enabled = event->total_time_enabled +
2102 atomic64_read(&event->child_total_time_enabled);
2103
2104 userpg->time_running = event->total_time_running +
2105 atomic64_read(&event->child_total_time_running);
2106
2107 barrier();
2108 ++userpg->lock;
2109 preempt_enable();
2110unlock:
2111 rcu_read_unlock();
2112}
2113
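/*
 * A sketch of the matching user-space read side of the ++userpg->lock
 * sequence above, along the lines documented for
 * struct perf_event_mmap_page; 'pc' is the mmap()ed control page and
 * read_counter_raw() stands in for an arch-specific counter read
 * (e.g. RDPMC) -- both names are only illustrative:
 *
 *	u32 seq, idx;
 *	u64 count;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *
 *		idx = pc->index;
 *		count = pc->offset;
 *		if (idx)
 *			count += read_counter_raw(idx - 1);
 *
 *		barrier();
 *	} while (pc->lock != seq);
 */
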
Peter Zijlstra906010b2009-09-21 16:08:49 +02002114static unsigned long perf_data_size(struct perf_mmap_data *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002115{
Peter Zijlstra906010b2009-09-21 16:08:49 +02002116 return data->nr_pages << (PAGE_SHIFT + data->data_order);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002117}
2118
Peter Zijlstra906010b2009-09-21 16:08:49 +02002119#ifndef CONFIG_PERF_USE_VMALLOC
2120
2121/*
2122 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2123 */
2124
2125static struct page *
2126perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2127{
2128 if (pgoff > data->nr_pages)
2129 return NULL;
2130
2131 if (pgoff == 0)
2132 return virt_to_page(data->user_page);
2133
2134 return virt_to_page(data->data_pages[pgoff - 1]);
2135}
2136
2137static struct perf_mmap_data *
2138perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002139{
2140 struct perf_mmap_data *data;
2141 unsigned long size;
2142 int i;
2143
2144 WARN_ON(atomic_read(&event->mmap_count));
2145
2146 size = sizeof(struct perf_mmap_data);
2147 size += nr_pages * sizeof(void *);
2148
2149 data = kzalloc(size, GFP_KERNEL);
2150 if (!data)
2151 goto fail;
2152
2153 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2154 if (!data->user_page)
2155 goto fail_user_page;
2156
2157 for (i = 0; i < nr_pages; i++) {
2158 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2159 if (!data->data_pages[i])
2160 goto fail_data_pages;
2161 }
2162
Peter Zijlstra906010b2009-09-21 16:08:49 +02002163 data->data_order = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002164 data->nr_pages = nr_pages;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002165
Peter Zijlstra906010b2009-09-21 16:08:49 +02002166 return data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002167
2168fail_data_pages:
2169 for (i--; i >= 0; i--)
2170 free_page((unsigned long)data->data_pages[i]);
2171
2172 free_page((unsigned long)data->user_page);
2173
2174fail_user_page:
2175 kfree(data);
2176
2177fail:
Peter Zijlstra906010b2009-09-21 16:08:49 +02002178 return NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002179}
2180
2181static void perf_mmap_free_page(unsigned long addr)
2182{
2183 struct page *page = virt_to_page((void *)addr);
2184
2185 page->mapping = NULL;
2186 __free_page(page);
2187}
2188
Peter Zijlstra906010b2009-09-21 16:08:49 +02002189static void perf_mmap_data_free(struct perf_mmap_data *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002190{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002191 int i;
2192
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002193 perf_mmap_free_page((unsigned long)data->user_page);
2194 for (i = 0; i < data->nr_pages; i++)
2195 perf_mmap_free_page((unsigned long)data->data_pages[i]);
Peter Zijlstra906010b2009-09-21 16:08:49 +02002196}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002197
Peter Zijlstra906010b2009-09-21 16:08:49 +02002198#else
2199
2200/*
2201 * Back perf_mmap() with vmalloc memory.
2202 *
2203 * Required for architectures that have d-cache aliasing issues.
2204 */
2205
2206static struct page *
2207perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2208{
2209 if (pgoff > (1UL << data->data_order))
2210 return NULL;
2211
2212 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2213}
2214
2215static void perf_mmap_unmark_page(void *addr)
2216{
2217 struct page *page = vmalloc_to_page(addr);
2218
2219 page->mapping = NULL;
2220}
2221
2222static void perf_mmap_data_free_work(struct work_struct *work)
2223{
2224 struct perf_mmap_data *data;
2225 void *base;
2226 int i, nr;
2227
2228 data = container_of(work, struct perf_mmap_data, work);
2229 nr = 1 << data->data_order;
2230
2231 base = data->user_page;
2232 for (i = 0; i < nr + 1; i++)
2233 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2234
2235 vfree(base);
2236}
2237
2238static void perf_mmap_data_free(struct perf_mmap_data *data)
2239{
2240 schedule_work(&data->work);
2241}
2242
2243static struct perf_mmap_data *
2244perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2245{
2246 struct perf_mmap_data *data;
2247 unsigned long size;
2248 void *all_buf;
2249
2250 WARN_ON(atomic_read(&event->mmap_count));
2251
2252 size = sizeof(struct perf_mmap_data);
2253 size += sizeof(void *);
2254
2255 data = kzalloc(size, GFP_KERNEL);
2256 if (!data)
2257 goto fail;
2258
2259 INIT_WORK(&data->work, perf_mmap_data_free_work);
2260
2261 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2262 if (!all_buf)
2263 goto fail_all_buf;
2264
2265 data->user_page = all_buf;
2266 data->data_pages[0] = all_buf + PAGE_SIZE;
2267 data->data_order = ilog2(nr_pages);
2268 data->nr_pages = 1;
2269
2270 return data;
2271
2272fail_all_buf:
2273 kfree(data);
2274
2275fail:
2276 return NULL;
2277}
2278
2279#endif
2280
2281static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2282{
2283 struct perf_event *event = vma->vm_file->private_data;
2284 struct perf_mmap_data *data;
2285 int ret = VM_FAULT_SIGBUS;
2286
2287 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2288 if (vmf->pgoff == 0)
2289 ret = 0;
2290 return ret;
2291 }
2292
2293 rcu_read_lock();
2294 data = rcu_dereference(event->data);
2295 if (!data)
2296 goto unlock;
2297
2298 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2299 goto unlock;
2300
2301 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2302 if (!vmf->page)
2303 goto unlock;
2304
2305 get_page(vmf->page);
2306 vmf->page->mapping = vma->vm_file->f_mapping;
2307 vmf->page->index = vmf->pgoff;
2308
2309 ret = 0;
2310unlock:
2311 rcu_read_unlock();
2312
2313 return ret;
2314}
2315
2316static void
2317perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2318{
2319 long max_size = perf_data_size(data);
2320
2321 atomic_set(&data->lock, -1);
2322
2323 if (event->attr.watermark) {
2324 data->watermark = min_t(long, max_size,
2325 event->attr.wakeup_watermark);
2326 }
2327
2328 if (!data->watermark)
Stephane Eranian8904b182009-11-20 22:19:57 +01002329 data->watermark = max_size / 2;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002330
2331
2332 rcu_assign_pointer(event->data, data);
2333}
2334
2335static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2336{
2337 struct perf_mmap_data *data;
2338
2339 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2340 perf_mmap_data_free(data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002341 kfree(data);
2342}
2343
Peter Zijlstra906010b2009-09-21 16:08:49 +02002344static void perf_mmap_data_release(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002345{
2346 struct perf_mmap_data *data = event->data;
2347
2348 WARN_ON(atomic_read(&event->mmap_count));
2349
2350 rcu_assign_pointer(event->data, NULL);
Peter Zijlstra906010b2009-09-21 16:08:49 +02002351 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002352}
2353
2354static void perf_mmap_open(struct vm_area_struct *vma)
2355{
2356 struct perf_event *event = vma->vm_file->private_data;
2357
2358 atomic_inc(&event->mmap_count);
2359}
2360
2361static void perf_mmap_close(struct vm_area_struct *vma)
2362{
2363 struct perf_event *event = vma->vm_file->private_data;
2364
2365 WARN_ON_ONCE(event->ctx->parent_ctx);
2366 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Peter Zijlstra906010b2009-09-21 16:08:49 +02002367 unsigned long size = perf_data_size(event->data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002368 struct user_struct *user = current_user();
2369
Peter Zijlstra906010b2009-09-21 16:08:49 +02002370 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002371 vma->vm_mm->locked_vm -= event->data->nr_locked;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002372 perf_mmap_data_release(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002373 mutex_unlock(&event->mmap_mutex);
2374 }
2375}
2376
Alexey Dobriyanf0f37e22009-09-27 22:29:37 +04002377static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002378 .open = perf_mmap_open,
2379 .close = perf_mmap_close,
2380 .fault = perf_mmap_fault,
2381 .page_mkwrite = perf_mmap_fault,
2382};
2383
2384static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2385{
2386 struct perf_event *event = file->private_data;
2387 unsigned long user_locked, user_lock_limit;
2388 struct user_struct *user = current_user();
2389 unsigned long locked, lock_limit;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002390 struct perf_mmap_data *data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002391 unsigned long vma_size;
2392 unsigned long nr_pages;
2393 long user_extra, extra;
2394 int ret = 0;
2395
2396 if (!(vma->vm_flags & VM_SHARED))
2397 return -EINVAL;
2398
2399 vma_size = vma->vm_end - vma->vm_start;
2400 nr_pages = (vma_size / PAGE_SIZE) - 1;
2401
2402 /*
2403 * If we have data pages ensure they're a power-of-two number, so we
2404 * can do bitmasks instead of modulo.
2405 */
2406 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2407 return -EINVAL;
2408
2409 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2410 return -EINVAL;
2411
2412 if (vma->vm_pgoff != 0)
2413 return -EINVAL;
2414
2415 WARN_ON_ONCE(event->ctx->parent_ctx);
2416 mutex_lock(&event->mmap_mutex);
2417 if (event->output) {
2418 ret = -EINVAL;
2419 goto unlock;
2420 }
2421
2422 if (atomic_inc_not_zero(&event->mmap_count)) {
2423 if (nr_pages != event->data->nr_pages)
2424 ret = -EINVAL;
2425 goto unlock;
2426 }
2427
2428 user_extra = nr_pages + 1;
2429 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2430
2431 /*
2432 * Increase the limit linearly with more CPUs:
2433 */
2434 user_lock_limit *= num_online_cpus();
2435
2436 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2437
2438 extra = 0;
2439 if (user_locked > user_lock_limit)
2440 extra = user_locked - user_lock_limit;
2441
2442 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2443 lock_limit >>= PAGE_SHIFT;
2444 locked = vma->vm_mm->locked_vm + extra;
2445
2446 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2447 !capable(CAP_IPC_LOCK)) {
2448 ret = -EPERM;
2449 goto unlock;
2450 }
2451
2452 WARN_ON(event->data);
Peter Zijlstra906010b2009-09-21 16:08:49 +02002453
2454 data = perf_mmap_data_alloc(event, nr_pages);
2455 ret = -ENOMEM;
2456 if (!data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002457 goto unlock;
2458
Peter Zijlstra906010b2009-09-21 16:08:49 +02002459 ret = 0;
2460 perf_mmap_data_init(event, data);
2461
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002462 atomic_set(&event->mmap_count, 1);
2463 atomic_long_add(user_extra, &user->locked_vm);
2464 vma->vm_mm->locked_vm += extra;
2465 event->data->nr_locked = extra;
2466 if (vma->vm_flags & VM_WRITE)
2467 event->data->writable = 1;
2468
2469unlock:
2470 mutex_unlock(&event->mmap_mutex);
2471
2472 vma->vm_flags |= VM_RESERVED;
2473 vma->vm_ops = &perf_mmap_vmops;
2474
2475 return ret;
2476}
2477
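/*
 * A minimal user-space sketch of the sizing rules enforced by perf_mmap()
 * above: one control page plus a power-of-two number of data pages,
 * MAP_SHARED, at offset 0 ('event_fd' is assumed, illustrative only):
 *
 *	long page = sysconf(_SC_PAGESIZE);
 *	int data_pages = 8;		any power of two (or 0)
 *	void *base = mmap(NULL, (data_pages + 1) * page,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, event_fd, 0);
 */
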
2478static int perf_fasync(int fd, struct file *filp, int on)
2479{
2480 struct inode *inode = filp->f_path.dentry->d_inode;
2481 struct perf_event *event = filp->private_data;
2482 int retval;
2483
2484 mutex_lock(&inode->i_mutex);
2485 retval = fasync_helper(fd, filp, on, &event->fasync);
2486 mutex_unlock(&inode->i_mutex);
2487
2488 if (retval < 0)
2489 return retval;
2490
2491 return 0;
2492}
2493
2494static const struct file_operations perf_fops = {
2495 .release = perf_release,
2496 .read = perf_read,
2497 .poll = perf_poll,
2498 .unlocked_ioctl = perf_ioctl,
2499 .compat_ioctl = perf_ioctl,
2500 .mmap = perf_mmap,
2501 .fasync = perf_fasync,
2502};
2503
2504/*
2505 * Perf event wakeup
2506 *
2507 * If there's data, ensure we set the poll() state and publish everything
2508 * to user-space before waking everybody up.
2509 */
2510
2511void perf_event_wakeup(struct perf_event *event)
2512{
2513 wake_up_all(&event->waitq);
2514
2515 if (event->pending_kill) {
2516 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2517 event->pending_kill = 0;
2518 }
2519}
2520
2521/*
2522 * Pending wakeups
2523 *
2524 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2525 *
2526 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2527 * single linked list and use cmpxchg() to add entries lockless.
2528 */
2529
2530static void perf_pending_event(struct perf_pending_entry *entry)
2531{
2532 struct perf_event *event = container_of(entry,
2533 struct perf_event, pending);
2534
2535 if (event->pending_disable) {
2536 event->pending_disable = 0;
2537 __perf_event_disable(event);
2538 }
2539
2540 if (event->pending_wakeup) {
2541 event->pending_wakeup = 0;
2542 perf_event_wakeup(event);
2543 }
2544}
2545
2546#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2547
2548static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2549 PENDING_TAIL,
2550};
2551
2552static void perf_pending_queue(struct perf_pending_entry *entry,
2553 void (*func)(struct perf_pending_entry *))
2554{
2555 struct perf_pending_entry **head;
2556
2557 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2558 return;
2559
2560 entry->func = func;
2561
2562 head = &get_cpu_var(perf_pending_head);
2563
2564 do {
2565 entry->next = *head;
2566 } while (cmpxchg(head, entry->next, entry) != entry->next);
2567
2568 set_perf_event_pending();
2569
2570 put_cpu_var(perf_pending_head);
2571}
2572
2573static int __perf_pending_run(void)
2574{
2575 struct perf_pending_entry *list;
2576 int nr = 0;
2577
2578 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2579 while (list != PENDING_TAIL) {
2580 void (*func)(struct perf_pending_entry *);
2581 struct perf_pending_entry *entry = list;
2582
2583 list = list->next;
2584
2585 func = entry->func;
2586 entry->next = NULL;
2587 /*
2588 * Ensure we observe the unqueue before we issue the wakeup,
2589 * so that we won't be waiting forever.
2590 * -- see perf_not_pending().
2591 */
2592 smp_wmb();
2593
2594 func(entry);
2595 nr++;
2596 }
2597
2598 return nr;
2599}
2600
2601static inline int perf_not_pending(struct perf_event *event)
2602{
2603 /*
2604 * If we flush on whatever cpu we run, there is a chance we don't
2605 * need to wait.
2606 */
2607 get_cpu();
2608 __perf_pending_run();
2609 put_cpu();
2610
2611 /*
2612 * Ensure we see the proper queue state before going to sleep
2613 * so that we do not miss the wakeup. -- see __perf_pending_run()
2614 */
2615 smp_rmb();
2616 return event->pending.next == NULL;
2617}
2618
2619static void perf_pending_sync(struct perf_event *event)
2620{
2621 wait_event(event->waitq, perf_not_pending(event));
2622}
2623
2624void perf_event_do_pending(void)
2625{
2626 __perf_pending_run();
2627}
2628
2629/*
2630 * Callchain support -- arch specific
2631 */
2632
2633__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2634{
2635 return NULL;
2636}
2637
2638/*
2639 * Output
2640 */
2641static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2642 unsigned long offset, unsigned long head)
2643{
2644 unsigned long mask;
2645
2646 if (!data->writable)
2647 return true;
2648
Peter Zijlstra906010b2009-09-21 16:08:49 +02002649 mask = perf_data_size(data) - 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002650
2651 offset = (offset - tail) & mask;
2652 head = (head - tail) & mask;
2653
2654 if ((int)(head - offset) < 0)
2655 return false;
2656
2657 return true;
2658}
2659
2660static void perf_output_wakeup(struct perf_output_handle *handle)
2661{
2662 atomic_set(&handle->data->poll, POLL_IN);
2663
2664 if (handle->nmi) {
2665 handle->event->pending_wakeup = 1;
2666 perf_pending_queue(&handle->event->pending,
2667 perf_pending_event);
2668 } else
2669 perf_event_wakeup(handle->event);
2670}
2671
2672/*
2673 * Curious locking construct.
2674 *
2675 * We need to ensure a later event doesn't publish a head when a former
2676 * event isn't done writing. However, since we need to deal with NMIs we
2677 * cannot fully serialize things.
2678 *
2679 * What we do is serialize between CPUs so we only have to deal with NMI
2680 * nesting on a single CPU.
2681 *
2682 * We only publish the head (and generate a wakeup) when the outer-most
2683 * event completes.
2684 */
2685static void perf_output_lock(struct perf_output_handle *handle)
2686{
2687 struct perf_mmap_data *data = handle->data;
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002688 int cur, cpu = get_cpu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002689
2690 handle->locked = 0;
2691
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002692 for (;;) {
2693 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2694 if (cur == -1) {
2695 handle->locked = 1;
2696 break;
2697 }
2698 if (cur == cpu)
2699 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002700
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002701 cpu_relax();
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002702 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002703}
2704
2705static void perf_output_unlock(struct perf_output_handle *handle)
2706{
2707 struct perf_mmap_data *data = handle->data;
2708 unsigned long head;
2709 int cpu;
2710
2711 data->done_head = data->head;
2712
2713 if (!handle->locked)
2714 goto out;
2715
2716again:
2717 /*
2718 * The xchg implies a full barrier that ensures all writes are done
2719 * before we publish the new head, matched by a rmb() in userspace when
2720 * reading this position.
2721 */
2722 while ((head = atomic_long_xchg(&data->done_head, 0)))
2723 data->user_page->data_head = head;
2724
2725 /*
2726 * NMI can happen here, which means we can miss a done_head update.
2727 */
2728
2729 cpu = atomic_xchg(&data->lock, -1);
2730 WARN_ON_ONCE(cpu != smp_processor_id());
2731
2732 /*
2733 * Therefore we have to validate that we did not indeed miss one.
2734 */
2735 if (unlikely(atomic_long_read(&data->done_head))) {
2736 /*
2737 * Since we had it locked, we can lock it again.
2738 */
2739 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2740 cpu_relax();
2741
2742 goto again;
2743 }
2744
2745 if (atomic_xchg(&data->wakeup, 0))
2746 perf_output_wakeup(handle);
2747out:
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002748 put_cpu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002749}
2750
2751void perf_output_copy(struct perf_output_handle *handle,
2752 const void *buf, unsigned int len)
2753{
2754 unsigned int pages_mask;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002755 unsigned long offset;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002756 unsigned int size;
2757 void **pages;
2758
2759 offset = handle->offset;
2760 pages_mask = handle->data->nr_pages - 1;
2761 pages = handle->data->data_pages;
2762
2763 do {
Peter Zijlstra906010b2009-09-21 16:08:49 +02002764 unsigned long page_offset;
2765 unsigned long page_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002766 int nr;
2767
2768 nr = (offset >> PAGE_SHIFT) & pages_mask;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002769 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2770 page_offset = offset & (page_size - 1);
2771 size = min_t(unsigned int, page_size - page_offset, len);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002772
2773 memcpy(pages[nr] + page_offset, buf, size);
2774
2775 len -= size;
2776 buf += size;
2777 offset += size;
2778 } while (len);
2779
2780 handle->offset = offset;
2781
2782 /*
2783 * Check we didn't copy past our reservation window, taking the
2784 * possible unsigned int wrap into account.
2785 */
2786 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2787}
2788
2789int perf_output_begin(struct perf_output_handle *handle,
2790 struct perf_event *event, unsigned int size,
2791 int nmi, int sample)
2792{
2793 struct perf_event *output_event;
2794 struct perf_mmap_data *data;
2795 unsigned long tail, offset, head;
2796 int have_lost;
2797 struct {
2798 struct perf_event_header header;
2799 u64 id;
2800 u64 lost;
2801 } lost_event;
2802
2803 rcu_read_lock();
2804 /*
2805 * For inherited events we send all the output towards the parent.
2806 */
2807 if (event->parent)
2808 event = event->parent;
2809
2810 output_event = rcu_dereference(event->output);
2811 if (output_event)
2812 event = output_event;
2813
2814 data = rcu_dereference(event->data);
2815 if (!data)
2816 goto out;
2817
2818 handle->data = data;
2819 handle->event = event;
2820 handle->nmi = nmi;
2821 handle->sample = sample;
2822
2823 if (!data->nr_pages)
2824 goto fail;
2825
2826 have_lost = atomic_read(&data->lost);
2827 if (have_lost)
2828 size += sizeof(lost_event);
2829
2830 perf_output_lock(handle);
2831
2832 do {
2833 /*
2834 * Userspace could choose to issue an mb() before updating the
2835 * tail pointer, so that all reads will be completed before the
2836 * write is issued.
2837 */
2838 tail = ACCESS_ONCE(data->user_page->data_tail);
2839 smp_rmb();
2840 offset = head = atomic_long_read(&data->head);
2841 head += size;
2842 if (unlikely(!perf_output_space(data, tail, offset, head)))
2843 goto fail;
2844 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2845
2846 handle->offset = offset;
2847 handle->head = head;
2848
2849 if (head - tail > data->watermark)
2850 atomic_set(&data->wakeup, 1);
2851
2852 if (have_lost) {
2853 lost_event.header.type = PERF_RECORD_LOST;
2854 lost_event.header.misc = 0;
2855 lost_event.header.size = sizeof(lost_event);
2856 lost_event.id = event->id;
2857 lost_event.lost = atomic_xchg(&data->lost, 0);
2858
2859 perf_output_put(handle, lost_event);
2860 }
2861
2862 return 0;
2863
2864fail:
2865 atomic_inc(&data->lost);
2866 perf_output_unlock(handle);
2867out:
2868 rcu_read_unlock();
2869
2870 return -ENOSPC;
2871}
2872
2873void perf_output_end(struct perf_output_handle *handle)
2874{
2875 struct perf_event *event = handle->event;
2876 struct perf_mmap_data *data = handle->data;
2877
2878 int wakeup_events = event->attr.wakeup_events;
2879
2880 if (handle->sample && wakeup_events) {
2881 int events = atomic_inc_return(&data->events);
2882 if (events >= wakeup_events) {
2883 atomic_sub(wakeup_events, &data->events);
2884 atomic_set(&data->wakeup, 1);
2885 }
2886 }
2887
2888 perf_output_unlock(handle);
2889 rcu_read_unlock();
2890}
2891
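/*
 * A sketch of the consumer side of the data_head/data_tail handshake
 * described in the comments above (illustrative only; 'pc' is the
 * mmap()ed control page, 'data' the byte buffer that follows it and
 * 'data_size' its power-of-two size).  Offsets wrap modulo data_size,
 * and data_tail is only honoured when the buffer was mapped writable:
 *
 *	u64 head = pc->data_head;
 *	rmb();			pairs with the xchg in perf_output_unlock()
 *	while (tail != head) {
 *		struct perf_event_header *hdr;
 *
 *		hdr = data + (tail & (data_size - 1));
 *		... consume the record, which may wrap ...
 *		tail += hdr->size;
 *	}
 *	mb();			complete all reads before publishing
 *	pc->data_tail = tail;
 */
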
2892static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2893{
2894 /*
2895 * only top level events have the pid namespace they were created in
2896 */
2897 if (event->parent)
2898 event = event->parent;
2899
2900 return task_tgid_nr_ns(p, event->ns);
2901}
2902
2903static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2904{
2905 /*
2906 * only top level events have the pid namespace they were created in
2907 */
2908 if (event->parent)
2909 event = event->parent;
2910
2911 return task_pid_nr_ns(p, event->ns);
2912}
2913
2914static void perf_output_read_one(struct perf_output_handle *handle,
2915 struct perf_event *event)
2916{
2917 u64 read_format = event->attr.read_format;
2918 u64 values[4];
2919 int n = 0;
2920
2921 values[n++] = atomic64_read(&event->count);
2922 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2923 values[n++] = event->total_time_enabled +
2924 atomic64_read(&event->child_total_time_enabled);
2925 }
2926 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2927 values[n++] = event->total_time_running +
2928 atomic64_read(&event->child_total_time_running);
2929 }
2930 if (read_format & PERF_FORMAT_ID)
2931 values[n++] = primary_event_id(event);
2932
2933 perf_output_copy(handle, values, n * sizeof(u64));
2934}
2935
2936/*
2937 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2938 */
2939static void perf_output_read_group(struct perf_output_handle *handle,
2940 struct perf_event *event)
2941{
2942 struct perf_event *leader = event->group_leader, *sub;
2943 u64 read_format = event->attr.read_format;
2944 u64 values[5];
2945 int n = 0;
2946
2947 values[n++] = 1 + leader->nr_siblings;
2948
2949 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2950 values[n++] = leader->total_time_enabled;
2951
2952 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2953 values[n++] = leader->total_time_running;
2954
2955 if (leader != event)
2956 leader->pmu->read(leader);
2957
2958 values[n++] = atomic64_read(&leader->count);
2959 if (read_format & PERF_FORMAT_ID)
2960 values[n++] = primary_event_id(leader);
2961
2962 perf_output_copy(handle, values, n * sizeof(u64));
2963
2964 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2965 n = 0;
2966
2967 if (sub != event)
2968 sub->pmu->read(sub);
2969
2970 values[n++] = atomic64_read(&sub->count);
2971 if (read_format & PERF_FORMAT_ID)
2972 values[n++] = primary_event_id(sub);
2973
2974 perf_output_copy(handle, values, n * sizeof(u64));
2975 }
2976}
2977
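/*
 * A sketch of the byte stream perf_output_read_group() emits above, as a
 * consumer would decode it (field presence depends on read_format; the
 * layout below assumes all bits are set):
 *
 *	u64 nr;				1 + nr_siblings
 *	u64 time_enabled;		PERF_FORMAT_TOTAL_TIME_ENABLED
 *	u64 time_running;		PERF_FORMAT_TOTAL_TIME_RUNNING
 *	struct {
 *		u64 value;
 *		u64 id;			PERF_FORMAT_ID
 *	} cnt[nr];			group leader first, then siblings
 */
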
2978static void perf_output_read(struct perf_output_handle *handle,
2979 struct perf_event *event)
2980{
2981 if (event->attr.read_format & PERF_FORMAT_GROUP)
2982 perf_output_read_group(handle, event);
2983 else
2984 perf_output_read_one(handle, event);
2985}
2986
2987void perf_output_sample(struct perf_output_handle *handle,
2988 struct perf_event_header *header,
2989 struct perf_sample_data *data,
2990 struct perf_event *event)
2991{
2992 u64 sample_type = data->type;
2993
2994 perf_output_put(handle, *header);
2995
2996 if (sample_type & PERF_SAMPLE_IP)
2997 perf_output_put(handle, data->ip);
2998
2999 if (sample_type & PERF_SAMPLE_TID)
3000 perf_output_put(handle, data->tid_entry);
3001
3002 if (sample_type & PERF_SAMPLE_TIME)
3003 perf_output_put(handle, data->time);
3004
3005 if (sample_type & PERF_SAMPLE_ADDR)
3006 perf_output_put(handle, data->addr);
3007
3008 if (sample_type & PERF_SAMPLE_ID)
3009 perf_output_put(handle, data->id);
3010
3011 if (sample_type & PERF_SAMPLE_STREAM_ID)
3012 perf_output_put(handle, data->stream_id);
3013
3014 if (sample_type & PERF_SAMPLE_CPU)
3015 perf_output_put(handle, data->cpu_entry);
3016
3017 if (sample_type & PERF_SAMPLE_PERIOD)
3018 perf_output_put(handle, data->period);
3019
3020 if (sample_type & PERF_SAMPLE_READ)
3021 perf_output_read(handle, event);
3022
3023 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3024 if (data->callchain) {
3025 int size = 1;
3026
3027 if (data->callchain)
3028 size += data->callchain->nr;
3029
3030 size *= sizeof(u64);
3031
3032 perf_output_copy(handle, data->callchain, size);
3033 } else {
3034 u64 nr = 0;
3035 perf_output_put(handle, nr);
3036 }
3037 }
3038
3039 if (sample_type & PERF_SAMPLE_RAW) {
3040 if (data->raw) {
3041 perf_output_put(handle, data->raw->size);
3042 perf_output_copy(handle, data->raw->data,
3043 data->raw->size);
3044 } else {
3045 struct {
3046 u32 size;
3047 u32 data;
3048 } raw = {
3049 .size = sizeof(u32),
3050 .data = 0,
3051 };
3052 perf_output_put(handle, raw);
3053 }
3054 }
3055}
3056
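/*
 * A sketch of the resulting record for a common sample_type of
 * PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD,
 * in the order the fields are written above (illustrative only):
 *
 *	struct perf_event_header header;	type == PERF_RECORD_SAMPLE
 *	u64 ip;
 *	u32 pid, tid;
 *	u64 time;
 *	u64 period;
 */
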
3057void perf_prepare_sample(struct perf_event_header *header,
3058 struct perf_sample_data *data,
3059 struct perf_event *event,
3060 struct pt_regs *regs)
3061{
3062 u64 sample_type = event->attr.sample_type;
3063
3064 data->type = sample_type;
3065
3066 header->type = PERF_RECORD_SAMPLE;
3067 header->size = sizeof(*header);
3068
3069 header->misc = 0;
3070 header->misc |= perf_misc_flags(regs);
3071
3072 if (sample_type & PERF_SAMPLE_IP) {
3073 data->ip = perf_instruction_pointer(regs);
3074
3075 header->size += sizeof(data->ip);
3076 }
3077
3078 if (sample_type & PERF_SAMPLE_TID) {
3079 /* namespace issues */
3080 data->tid_entry.pid = perf_event_pid(event, current);
3081 data->tid_entry.tid = perf_event_tid(event, current);
3082
3083 header->size += sizeof(data->tid_entry);
3084 }
3085
3086 if (sample_type & PERF_SAMPLE_TIME) {
3087 data->time = perf_clock();
3088
3089 header->size += sizeof(data->time);
3090 }
3091
3092 if (sample_type & PERF_SAMPLE_ADDR)
3093 header->size += sizeof(data->addr);
3094
3095 if (sample_type & PERF_SAMPLE_ID) {
3096 data->id = primary_event_id(event);
3097
3098 header->size += sizeof(data->id);
3099 }
3100
3101 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3102 data->stream_id = event->id;
3103
3104 header->size += sizeof(data->stream_id);
3105 }
3106
3107 if (sample_type & PERF_SAMPLE_CPU) {
3108 data->cpu_entry.cpu = raw_smp_processor_id();
3109 data->cpu_entry.reserved = 0;
3110
3111 header->size += sizeof(data->cpu_entry);
3112 }
3113
3114 if (sample_type & PERF_SAMPLE_PERIOD)
3115 header->size += sizeof(data->period);
3116
3117 if (sample_type & PERF_SAMPLE_READ)
3118 header->size += perf_event_read_size(event);
3119
3120 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3121 int size = 1;
3122
3123 data->callchain = perf_callchain(regs);
3124
3125 if (data->callchain)
3126 size += data->callchain->nr;
3127
3128 header->size += size * sizeof(u64);
3129 }
3130
3131 if (sample_type & PERF_SAMPLE_RAW) {
3132 int size = sizeof(u32);
3133
3134 if (data->raw)
3135 size += data->raw->size;
3136 else
3137 size += sizeof(u32);
3138
3139 WARN_ON_ONCE(size & (sizeof(u64)-1));
3140 header->size += size;
3141 }
3142}
3143
3144static void perf_event_output(struct perf_event *event, int nmi,
3145 struct perf_sample_data *data,
3146 struct pt_regs *regs)
3147{
3148 struct perf_output_handle handle;
3149 struct perf_event_header header;
3150
3151 perf_prepare_sample(&header, data, event, regs);
3152
3153 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3154 return;
3155
3156 perf_output_sample(&handle, &header, data, event);
3157
3158 perf_output_end(&handle);
3159}
3160
3161/*
3162 * read event
3163 */
3164
3165struct perf_read_event {
3166 struct perf_event_header header;
3167
3168 u32 pid;
3169 u32 tid;
3170};
3171
3172static void
3173perf_event_read_event(struct perf_event *event,
3174 struct task_struct *task)
3175{
3176 struct perf_output_handle handle;
3177 struct perf_read_event read_event = {
3178 .header = {
3179 .type = PERF_RECORD_READ,
3180 .misc = 0,
3181 .size = sizeof(read_event) + perf_event_read_size(event),
3182 },
3183 .pid = perf_event_pid(event, task),
3184 .tid = perf_event_tid(event, task),
3185 };
3186 int ret;
3187
3188 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3189 if (ret)
3190 return;
3191
3192 perf_output_put(&handle, read_event);
3193 perf_output_read(&handle, event);
3194
3195 perf_output_end(&handle);
3196}
3197
3198/*
3199 * task tracking -- fork/exit
3200 *
3201 * enabled by: attr.comm | attr.mmap | attr.task
3202 */
3203
3204struct perf_task_event {
3205 struct task_struct *task;
3206 struct perf_event_context *task_ctx;
3207
3208 struct {
3209 struct perf_event_header header;
3210
3211 u32 pid;
3212 u32 ppid;
3213 u32 tid;
3214 u32 ptid;
3215 u64 time;
3216 } event_id;
3217};
3218
3219static void perf_event_task_output(struct perf_event *event,
3220 struct perf_task_event *task_event)
3221{
3222 struct perf_output_handle handle;
3223 int size;
3224 struct task_struct *task = task_event->task;
3225 int ret;
3226
3227 size = task_event->event_id.header.size;
3228 ret = perf_output_begin(&handle, event, size, 0, 0);
3229
3230 if (ret)
3231 return;
3232
3233 task_event->event_id.pid = perf_event_pid(event, task);
3234 task_event->event_id.ppid = perf_event_pid(event, current);
3235
3236 task_event->event_id.tid = perf_event_tid(event, task);
3237 task_event->event_id.ptid = perf_event_tid(event, current);
3238
3239 task_event->event_id.time = perf_clock();
3240
3241 perf_output_put(&handle, task_event->event_id);
3242
3243 perf_output_end(&handle);
3244}
3245
3246static int perf_event_task_match(struct perf_event *event)
3247{
3248 if (event->attr.comm || event->attr.mmap || event->attr.task)
3249 return 1;
3250
3251 return 0;
3252}
3253
3254static void perf_event_task_ctx(struct perf_event_context *ctx,
3255 struct perf_task_event *task_event)
3256{
3257 struct perf_event *event;
3258
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003259 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3260 if (perf_event_task_match(event))
3261 perf_event_task_output(event, task_event);
3262 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003263}
3264
3265static void perf_event_task_event(struct perf_task_event *task_event)
3266{
3267 struct perf_cpu_context *cpuctx;
3268 struct perf_event_context *ctx = task_event->task_ctx;
3269
Peter Zijlstrad6ff86c2009-11-20 22:19:46 +01003270 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003271 cpuctx = &get_cpu_var(perf_cpu_context);
3272 perf_event_task_ctx(&cpuctx->ctx, task_event);
3273 put_cpu_var(perf_cpu_context);
3274
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003275 if (!ctx)
3276 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3277 if (ctx)
3278 perf_event_task_ctx(ctx, task_event);
3279 rcu_read_unlock();
3280}
3281
3282static void perf_event_task(struct task_struct *task,
3283 struct perf_event_context *task_ctx,
3284 int new)
3285{
3286 struct perf_task_event task_event;
3287
3288 if (!atomic_read(&nr_comm_events) &&
3289 !atomic_read(&nr_mmap_events) &&
3290 !atomic_read(&nr_task_events))
3291 return;
3292
3293 task_event = (struct perf_task_event){
3294 .task = task,
3295 .task_ctx = task_ctx,
3296 .event_id = {
3297 .header = {
3298 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3299 .misc = 0,
3300 .size = sizeof(task_event.event_id),
3301 },
3302 /* .pid */
3303 /* .ppid */
3304 /* .tid */
3305 /* .ptid */
3306 },
3307 };
3308
3309 perf_event_task_event(&task_event);
3310}
3311
3312void perf_event_fork(struct task_struct *task)
3313{
3314 perf_event_task(task, NULL, 1);
3315}
3316
3317/*
3318 * comm tracking
3319 */
3320
3321struct perf_comm_event {
3322 struct task_struct *task;
3323 char *comm;
3324 int comm_size;
3325
3326 struct {
3327 struct perf_event_header header;
3328
3329 u32 pid;
3330 u32 tid;
3331 } event_id;
3332};
3333
3334static void perf_event_comm_output(struct perf_event *event,
3335 struct perf_comm_event *comm_event)
3336{
3337 struct perf_output_handle handle;
3338 int size = comm_event->event_id.header.size;
3339 int ret = perf_output_begin(&handle, event, size, 0, 0);
3340
3341 if (ret)
3342 return;
3343
3344 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3345 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3346
3347 perf_output_put(&handle, comm_event->event_id);
3348 perf_output_copy(&handle, comm_event->comm,
3349 comm_event->comm_size);
3350 perf_output_end(&handle);
3351}
3352
3353static int perf_event_comm_match(struct perf_event *event)
3354{
3355 if (event->attr.comm)
3356 return 1;
3357
3358 return 0;
3359}
3360
3361static void perf_event_comm_ctx(struct perf_event_context *ctx,
3362 struct perf_comm_event *comm_event)
3363{
3364 struct perf_event *event;
3365
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003366 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3367 if (perf_event_comm_match(event))
3368 perf_event_comm_output(event, comm_event);
3369 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003370}
3371
3372static void perf_event_comm_event(struct perf_comm_event *comm_event)
3373{
3374 struct perf_cpu_context *cpuctx;
3375 struct perf_event_context *ctx;
3376 unsigned int size;
3377 char comm[TASK_COMM_LEN];
3378
3379 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01003380 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003381 size = ALIGN(strlen(comm)+1, sizeof(u64));
3382
3383 comm_event->comm = comm;
3384 comm_event->comm_size = size;
3385
3386 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3387
Peter Zijlstraf6595f32009-11-20 22:19:47 +01003388 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003389 cpuctx = &get_cpu_var(perf_cpu_context);
3390 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3391 put_cpu_var(perf_cpu_context);
3392
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003393 /*
3394 * doesn't really matter which of the child contexts the
3395 * event ends up in.
3396 */
3397 ctx = rcu_dereference(current->perf_event_ctxp);
3398 if (ctx)
3399 perf_event_comm_ctx(ctx, comm_event);
3400 rcu_read_unlock();
3401}
3402
3403void perf_event_comm(struct task_struct *task)
3404{
3405 struct perf_comm_event comm_event;
3406
3407 if (task->perf_event_ctxp)
3408 perf_event_enable_on_exec(task);
3409
3410 if (!atomic_read(&nr_comm_events))
3411 return;
3412
3413 comm_event = (struct perf_comm_event){
3414 .task = task,
3415 /* .comm */
3416 /* .comm_size */
3417 .event_id = {
3418 .header = {
3419 .type = PERF_RECORD_COMM,
3420 .misc = 0,
3421 /* .size */
3422 },
3423 /* .pid */
3424 /* .tid */
3425 },
3426 };
3427
3428 perf_event_comm_event(&comm_event);
3429}
3430
3431/*
3432 * mmap tracking
3433 */
3434
3435struct perf_mmap_event {
3436 struct vm_area_struct *vma;
3437
3438 const char *file_name;
3439 int file_size;
3440
3441 struct {
3442 struct perf_event_header header;
3443
3444 u32 pid;
3445 u32 tid;
3446 u64 start;
3447 u64 len;
3448 u64 pgoff;
3449 } event_id;
3450};
3451
3452static void perf_event_mmap_output(struct perf_event *event,
3453 struct perf_mmap_event *mmap_event)
3454{
3455 struct perf_output_handle handle;
3456 int size = mmap_event->event_id.header.size;
3457 int ret = perf_output_begin(&handle, event, size, 0, 0);
3458
3459 if (ret)
3460 return;
3461
3462 mmap_event->event_id.pid = perf_event_pid(event, current);
3463 mmap_event->event_id.tid = perf_event_tid(event, current);
3464
3465 perf_output_put(&handle, mmap_event->event_id);
3466 perf_output_copy(&handle, mmap_event->file_name,
3467 mmap_event->file_size);
3468 perf_output_end(&handle);
3469}
3470
3471static int perf_event_mmap_match(struct perf_event *event,
3472 struct perf_mmap_event *mmap_event)
3473{
3474 if (event->attr.mmap)
3475 return 1;
3476
3477 return 0;
3478}
3479
3480static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3481 struct perf_mmap_event *mmap_event)
3482{
3483 struct perf_event *event;
3484
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003485 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3486 if (perf_event_mmap_match(event, mmap_event))
3487 perf_event_mmap_output(event, mmap_event);
3488 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003489}
3490
3491static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3492{
3493 struct perf_cpu_context *cpuctx;
3494 struct perf_event_context *ctx;
3495 struct vm_area_struct *vma = mmap_event->vma;
3496 struct file *file = vma->vm_file;
3497 unsigned int size;
3498 char tmp[16];
3499 char *buf = NULL;
3500 const char *name;
3501
3502 memset(tmp, 0, sizeof(tmp));
3503
3504 if (file) {
3505 /*
3506 * d_path works from the end of the buffer backwards, so we
3507 * need to add enough zero bytes after the string to handle
3508 * the 64bit alignment we do later.
3509 */
3510 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3511 if (!buf) {
3512 name = strncpy(tmp, "//enomem", sizeof(tmp));
3513 goto got_name;
3514 }
3515 name = d_path(&file->f_path, buf, PATH_MAX);
3516 if (IS_ERR(name)) {
3517 name = strncpy(tmp, "//toolong", sizeof(tmp));
3518 goto got_name;
3519 }
3520 } else {
3521 if (arch_vma_name(mmap_event->vma)) {
3522 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3523 sizeof(tmp));
3524 goto got_name;
3525 }
3526
3527 if (!vma->vm_mm) {
3528 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3529 goto got_name;
3530 }
3531
3532 name = strncpy(tmp, "//anon", sizeof(tmp));
3533 goto got_name;
3534 }
3535
3536got_name:
3537 size = ALIGN(strlen(name)+1, sizeof(u64));
3538
3539 mmap_event->file_name = name;
3540 mmap_event->file_size = size;
3541
3542 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3543
Peter Zijlstraf6d9dd22009-11-20 22:19:48 +01003544 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003545 cpuctx = &get_cpu_var(perf_cpu_context);
3546 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3547 put_cpu_var(perf_cpu_context);
3548
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003549 /*
3550 * doesn't really matter which of the child contexts the
3551 * event ends up in.
3552 */
3553 ctx = rcu_dereference(current->perf_event_ctxp);
3554 if (ctx)
3555 perf_event_mmap_ctx(ctx, mmap_event);
3556 rcu_read_unlock();
3557
3558 kfree(buf);
3559}
3560
3561void __perf_event_mmap(struct vm_area_struct *vma)
3562{
3563 struct perf_mmap_event mmap_event;
3564
3565 if (!atomic_read(&nr_mmap_events))
3566 return;
3567
3568 mmap_event = (struct perf_mmap_event){
3569 .vma = vma,
3570 /* .file_name */
3571 /* .file_size */
3572 .event_id = {
3573 .header = {
3574 .type = PERF_RECORD_MMAP,
3575 .misc = 0,
3576 /* .size */
3577 },
3578 /* .pid */
3579 /* .tid */
3580 .start = vma->vm_start,
3581 .len = vma->vm_end - vma->vm_start,
3582 .pgoff = vma->vm_pgoff,
3583 },
3584 };
3585
3586 perf_event_mmap_event(&mmap_event);
3587}
3588
3589/*
3590 * IRQ throttle logging
3591 */
3592
3593static void perf_log_throttle(struct perf_event *event, int enable)
3594{
3595 struct perf_output_handle handle;
3596 int ret;
3597
3598 struct {
3599 struct perf_event_header header;
3600 u64 time;
3601 u64 id;
3602 u64 stream_id;
3603 } throttle_event = {
3604 .header = {
3605 .type = PERF_RECORD_THROTTLE,
3606 .misc = 0,
3607 .size = sizeof(throttle_event),
3608 },
3609 .time = perf_clock(),
3610 .id = primary_event_id(event),
3611 .stream_id = event->id,
3612 };
3613
3614 if (enable)
3615 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3616
3617 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3618 if (ret)
3619 return;
3620
3621 perf_output_put(&handle, throttle_event);
3622 perf_output_end(&handle);
3623}
3624
3625/*
3626 * Generic event overflow handling, sampling.
3627 */
3628
3629static int __perf_event_overflow(struct perf_event *event, int nmi,
3630 int throttle, struct perf_sample_data *data,
3631 struct pt_regs *regs)
3632{
3633 int events = atomic_read(&event->event_limit);
3634 struct hw_perf_event *hwc = &event->hw;
3635 int ret = 0;
3636
3637 throttle = (throttle && event->pmu->unthrottle != NULL);
3638
3639 if (!throttle) {
3640 hwc->interrupts++;
3641 } else {
3642 if (hwc->interrupts != MAX_INTERRUPTS) {
3643 hwc->interrupts++;
3644 if (HZ * hwc->interrupts >
3645 (u64)sysctl_perf_event_sample_rate) {
3646 hwc->interrupts = MAX_INTERRUPTS;
3647 perf_log_throttle(event, 0);
3648 ret = 1;
3649 }
3650 } else {
3651 /*
3652 * Keep re-disabling the event even though on the previous
3653 * pass we disabled it - just in case we raced with a
3654 * sched-in and the event got enabled again:
3655 */
3656 ret = 1;
3657 }
3658 }
3659
3660 if (event->attr.freq) {
3661 u64 now = perf_clock();
3662 s64 delta = now - hwc->freq_stamp;
3663
3664 hwc->freq_stamp = now;
3665
3666 if (delta > 0 && delta < TICK_NSEC)
3667 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3668 }
3669
3670 /*
3671 * XXX event_limit might not quite work as expected on inherited
3672 * events
3673 */
3674
3675 event->pending_kill = POLL_IN;
3676 if (events && atomic_dec_and_test(&event->event_limit)) {
3677 ret = 1;
3678 event->pending_kill = POLL_HUP;
3679 if (nmi) {
3680 event->pending_disable = 1;
3681 perf_pending_queue(&event->pending,
3682 perf_pending_event);
3683 } else
3684 perf_event_disable(event);
3685 }
3686
Peter Zijlstra453f19e2009-11-20 22:19:43 +01003687 if (event->overflow_handler)
3688 event->overflow_handler(event, nmi, data, regs);
3689 else
3690 perf_event_output(event, nmi, data, regs);
3691
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003692 return ret;
3693}
3694
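/*
 * A worked example of the throttling check above (illustrative only):
 * with HZ == 1000 and sysctl_perf_event_sample_rate set to 100000, an
 * event that takes more than 100 overflow interrupts between ticks
 * trips 1000 * 101 > 100000, is marked MAX_INTERRUPTS and throttled;
 * the periodic tick later resets hwc->interrupts and unthrottles it.
 */
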
3695int perf_event_overflow(struct perf_event *event, int nmi,
3696 struct perf_sample_data *data,
3697 struct pt_regs *regs)
3698{
3699 return __perf_event_overflow(event, nmi, 1, data, regs);
3700}
3701
3702/*
3703 * Generic software event infrastructure
3704 */
3705
3706/*
3707 * We directly increment event->count and keep a second value in
3708 * event->hw.period_left to count intervals. This period value
3709 * is kept in the range [-sample_period, 0] so that we can use the
3710 * sign as trigger.
3711 */
3712
3713static u64 perf_swevent_set_period(struct perf_event *event)
3714{
3715 struct hw_perf_event *hwc = &event->hw;
3716 u64 period = hwc->last_period;
3717 u64 nr, offset;
3718 s64 old, val;
3719
3720 hwc->last_period = hwc->sample_period;
3721
3722again:
3723 old = val = atomic64_read(&hwc->period_left);
3724 if (val < 0)
3725 return 0;
3726
3727 nr = div64_u64(period + val, period);
3728 offset = nr * period;
3729 val -= offset;
3730 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3731 goto again;
3732
3733 return nr;
3734}
3735
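/*
 * A worked example of the re-arm arithmetic above (illustrative only):
 * for sample_period == 100, period_left is armed to -100 and each
 * software event adds its count to it.  If it has climbed to +250 when
 * perf_swevent_set_period() runs, nr = (100 + 250) / 100 = 3 overflows
 * are reported and period_left is re-armed to 250 - 3 * 100 = -50.
 */
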
Peter Zijlstra0cff7842009-11-20 22:19:44 +01003736static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003737 int nmi, struct perf_sample_data *data,
3738 struct pt_regs *regs)
3739{
3740 struct hw_perf_event *hwc = &event->hw;
3741 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003742
3743 data->period = event->hw.last_period;
Peter Zijlstra0cff7842009-11-20 22:19:44 +01003744 if (!overflow)
3745 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003746
3747 if (hwc->interrupts == MAX_INTERRUPTS)
3748 return;
3749
3750 for (; overflow; overflow--) {
3751 if (__perf_event_overflow(event, nmi, throttle,
3752 data, regs)) {
3753 /*
3754 * We inhibit the overflow from happening when
3755 * hwc->interrupts == MAX_INTERRUPTS.
3756 */
3757 break;
3758 }
3759 throttle = 1;
3760 }
3761}
3762
3763static void perf_swevent_unthrottle(struct perf_event *event)
3764{
3765 /*
3766 * Nothing to do, we already reset hwc->interrupts.
3767 */
3768}
3769
3770static void perf_swevent_add(struct perf_event *event, u64 nr,
3771 int nmi, struct perf_sample_data *data,
3772 struct pt_regs *regs)
3773{
3774 struct hw_perf_event *hwc = &event->hw;
3775
3776 atomic64_add(nr, &event->count);
3777
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003778 if (!regs)
3779 return;
3780
Peter Zijlstra0cff7842009-11-20 22:19:44 +01003781 if (!hwc->sample_period)
3782 return;
3783
3784 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3785 return perf_swevent_overflow(event, 1, nmi, data, regs);
3786
3787 if (atomic64_add_negative(nr, &hwc->period_left))
3788 return;
3789
3790 perf_swevent_overflow(event, 0, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003791}
3792
3793static int perf_swevent_is_counting(struct perf_event *event)
3794{
3795 /*
3796 * The event is active, we're good!
3797 */
3798 if (event->state == PERF_EVENT_STATE_ACTIVE)
3799 return 1;
3800
3801 /*
3802 * The event is off/error, not counting.
3803 */
3804 if (event->state != PERF_EVENT_STATE_INACTIVE)
3805 return 0;
3806
3807 /*
3808 * The event is inactive, if the context is active
3809 * we're part of a group that didn't make it on the 'pmu',
3810 * not counting.
3811 */
3812 if (event->ctx->is_active)
3813 return 0;
3814
3815 /*
3816 * We're inactive and the context is too; this means the
3817 * task is scheduled out, and we're counting events that happen
3818 * to us, like migration events.
3819 */
3820 return 1;
3821}
3822
Li Zefan6fb29152009-10-15 11:21:42 +08003823static int perf_tp_event_match(struct perf_event *event,
3824 struct perf_sample_data *data);
3825
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003826static int perf_swevent_match(struct perf_event *event,
3827 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08003828 u32 event_id,
3829 struct perf_sample_data *data,
3830 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003831{
3832 if (!perf_swevent_is_counting(event))
3833 return 0;
3834
3835 if (event->attr.type != type)
3836 return 0;
3837 if (event->attr.config != event_id)
3838 return 0;
3839
3840 if (regs) {
3841 if (event->attr.exclude_user && user_mode(regs))
3842 return 0;
3843
3844 if (event->attr.exclude_kernel && !user_mode(regs))
3845 return 0;
3846 }
3847
Li Zefan6fb29152009-10-15 11:21:42 +08003848 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3849 !perf_tp_event_match(event, data))
3850 return 0;
3851
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003852 return 1;
3853}
3854
3855static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3856 enum perf_type_id type,
3857 u32 event_id, u64 nr, int nmi,
3858 struct perf_sample_data *data,
3859 struct pt_regs *regs)
3860{
3861 struct perf_event *event;
3862
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003863 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08003864 if (perf_swevent_match(event, type, event_id, data, regs))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003865 perf_swevent_add(event, nr, nmi, data, regs);
3866 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003867}
3868
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003869/*
3870 * Must be called with preemption disabled
3871 */
3872int perf_swevent_get_recursion_context(int **recursion)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003873{
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003874 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3875
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003876 if (in_nmi())
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003877 *recursion = &cpuctx->recursion[3];
3878 else if (in_irq())
3879 *recursion = &cpuctx->recursion[2];
3880 else if (in_softirq())
3881 *recursion = &cpuctx->recursion[1];
3882 else
3883 *recursion = &cpuctx->recursion[0];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003884
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003885 if (**recursion)
3886 return -1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003887
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003888 (**recursion)++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003889
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003890 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003891}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01003892EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003893
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003894void perf_swevent_put_recursion_context(int *recursion)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003895{
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003896 (*recursion)--;
3897}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01003898EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003899
3900static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
3901 u64 nr, int nmi,
3902 struct perf_sample_data *data,
3903 struct pt_regs *regs)
3904{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003905 struct perf_event_context *ctx;
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003906 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003907
Peter Zijlstra81520182009-11-20 22:19:45 +01003908 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003909 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3910 nr, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003911 /*
3912 * doesn't really matter which of the child contexts the
3913 * event ends up in.
3914 */
3915 ctx = rcu_dereference(current->perf_event_ctxp);
3916 if (ctx)
3917 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3918 rcu_read_unlock();
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003919}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003920
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003921static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3922 u64 nr, int nmi,
3923 struct perf_sample_data *data,
3924 struct pt_regs *regs)
3925{
3926 int *recursion;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003927
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003928 preempt_disable();
3929
3930 if (perf_swevent_get_recursion_context(&recursion))
3931 goto out;
3932
3933 __do_perf_sw_event(type, event_id, nr, nmi, data, regs);
3934
3935 perf_swevent_put_recursion_context(recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003936out:
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003937 preempt_enable();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003938}
3939
3940void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3941 struct pt_regs *regs, u64 addr)
3942{
Ingo Molnara4234bf2009-11-23 10:57:59 +01003943 struct perf_sample_data data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003944
Ingo Molnara4234bf2009-11-23 10:57:59 +01003945 data.addr = addr;
3946 data.raw = NULL;
3947
3948 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003949}
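
/*
 * Illustrative sketch: callers normally reach the function above through
 * the perf_sw_event() inline wrapper in <linux/perf_event.h>, which first
 * checks perf_swevent_enabled[].  A fault handler, for example, would
 * account a page fault roughly like this (hypothetical helper, mirroring
 * the arch fault-handler call sites):
 */
static inline void count_example_page_fault(struct pt_regs *regs,
					    unsigned long address)
{
	/* nmi == 0: ordinary process/IRQ context, not NMI */
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
}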
3950
3951static void perf_swevent_read(struct perf_event *event)
3952{
3953}
3954
3955static int perf_swevent_enable(struct perf_event *event)
3956{
3957 struct hw_perf_event *hwc = &event->hw;
3958
3959 if (hwc->sample_period) {
3960 hwc->last_period = hwc->sample_period;
3961 perf_swevent_set_period(event);
3962 }
3963 return 0;
3964}
3965
3966static void perf_swevent_disable(struct perf_event *event)
3967{
3968}
3969
3970static const struct pmu perf_ops_generic = {
3971 .enable = perf_swevent_enable,
3972 .disable = perf_swevent_disable,
3973 .read = perf_swevent_read,
3974 .unthrottle = perf_swevent_unthrottle,
3975};
3976
3977/*
3978 * hrtimer based swevent callback
3979 */
3980
3981static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3982{
3983 enum hrtimer_restart ret = HRTIMER_RESTART;
3984 struct perf_sample_data data;
3985 struct pt_regs *regs;
3986 struct perf_event *event;
3987 u64 period;
3988
3989 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3990 event->pmu->read(event);
3991
3992 data.addr = 0;
3993 regs = get_irq_regs();
3994 /*
3995 * In case we exclude kernel IPs or are somehow not in interrupt
3996 * context, provide the next best thing, the user IP.
3997 */
3998 if ((event->attr.exclude_kernel || !regs) &&
3999 !event->attr.exclude_user)
4000 regs = task_pt_regs(current);
4001
4002 if (regs) {
Soeren Sandmann54f44072009-10-22 18:34:08 +02004003 if (!(event->attr.exclude_idle && current->pid == 0))
4004 if (perf_event_overflow(event, 0, &data, regs))
4005 ret = HRTIMER_NORESTART;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004006 }
4007
4008 period = max_t(u64, 10000, event->hw.sample_period);
4009 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4010
4011 return ret;
4012}
4013
Soeren Sandmann721a6692009-09-15 14:33:08 +02004014static void perf_swevent_start_hrtimer(struct perf_event *event)
4015{
4016 struct hw_perf_event *hwc = &event->hw;
4017
4018 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4019 hwc->hrtimer.function = perf_swevent_hrtimer;
4020 if (hwc->sample_period) {
4021 u64 period;
4022
4023 if (hwc->remaining) {
4024 if (hwc->remaining < 0)
4025 period = 10000;
4026 else
4027 period = hwc->remaining;
4028 hwc->remaining = 0;
4029 } else {
4030 period = max_t(u64, 10000, hwc->sample_period);
4031 }
4032 __hrtimer_start_range_ns(&hwc->hrtimer,
4033 ns_to_ktime(period), 0,
4034 HRTIMER_MODE_REL, 0);
4035 }
4036}
4037
4038static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4039{
4040 struct hw_perf_event *hwc = &event->hw;
4041
4042 if (hwc->sample_period) {
4043 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4044 hwc->remaining = ktime_to_ns(remaining);
4045
4046 hrtimer_cancel(&hwc->hrtimer);
4047 }
4048}
4049
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004050/*
4051 * Software event: cpu wall time clock
4052 */
4053
4054static void cpu_clock_perf_event_update(struct perf_event *event)
4055{
4056 int cpu = raw_smp_processor_id();
4057 s64 prev;
4058 u64 now;
4059
4060 now = cpu_clock(cpu);
4061 prev = atomic64_read(&event->hw.prev_count);
4062 atomic64_set(&event->hw.prev_count, now);
4063 atomic64_add(now - prev, &event->count);
4064}
4065
4066static int cpu_clock_perf_event_enable(struct perf_event *event)
4067{
4068 struct hw_perf_event *hwc = &event->hw;
4069 int cpu = raw_smp_processor_id();
4070
4071 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
Soeren Sandmann721a6692009-09-15 14:33:08 +02004072 perf_swevent_start_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004073
4074 return 0;
4075}
4076
4077static void cpu_clock_perf_event_disable(struct perf_event *event)
4078{
Soeren Sandmann721a6692009-09-15 14:33:08 +02004079 perf_swevent_cancel_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004080 cpu_clock_perf_event_update(event);
4081}
4082
4083static void cpu_clock_perf_event_read(struct perf_event *event)
4084{
4085 cpu_clock_perf_event_update(event);
4086}
4087
4088static const struct pmu perf_ops_cpu_clock = {
4089 .enable = cpu_clock_perf_event_enable,
4090 .disable = cpu_clock_perf_event_disable,
4091 .read = cpu_clock_perf_event_read,
4092};
4093
4094/*
4095 * Software event: task time clock
4096 */
4097
4098static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4099{
4100 u64 prev;
4101 s64 delta;
4102
4103 prev = atomic64_xchg(&event->hw.prev_count, now);
4104 delta = now - prev;
4105 atomic64_add(delta, &event->count);
4106}
4107
4108static int task_clock_perf_event_enable(struct perf_event *event)
4109{
4110 struct hw_perf_event *hwc = &event->hw;
4111 u64 now;
4112
4113 now = event->ctx->time;
4114
4115 atomic64_set(&hwc->prev_count, now);
Soeren Sandmann721a6692009-09-15 14:33:08 +02004116
4117 perf_swevent_start_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004118
4119 return 0;
4120}
4121
4122static void task_clock_perf_event_disable(struct perf_event *event)
4123{
Soeren Sandmann721a6692009-09-15 14:33:08 +02004124 perf_swevent_cancel_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004125 task_clock_perf_event_update(event, event->ctx->time);
4126
4127}
4128
4129static void task_clock_perf_event_read(struct perf_event *event)
4130{
4131 u64 time;
4132
4133 if (!in_nmi()) {
4134 update_context_time(event->ctx);
4135 time = event->ctx->time;
4136 } else {
4137 u64 now = perf_clock();
4138 u64 delta = now - event->ctx->timestamp;
4139 time = event->ctx->time + delta;
4140 }
4141
4142 task_clock_perf_event_update(event, time);
4143}
4144
4145static const struct pmu perf_ops_task_clock = {
4146 .enable = task_clock_perf_event_enable,
4147 .disable = task_clock_perf_event_disable,
4148 .read = task_clock_perf_event_read,
4149};
4150
4151#ifdef CONFIG_EVENT_PROFILE
Li Zefan6fb29152009-10-15 11:21:42 +08004152
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004153void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4154 int entry_size)
4155{
4156 struct perf_raw_record raw = {
4157 .size = entry_size,
4158 .data = record,
4159 };
4160
4161 struct perf_sample_data data = {
4162 .addr = addr,
4163 .raw = &raw,
4164 };
4165
4166 struct pt_regs *regs = get_irq_regs();
4167
4168 if (!regs)
4169 regs = task_pt_regs(current);
4170
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01004171	/* Trace events are already protected against recursion */
4172 __do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004173 &data, regs);
4174}
4175EXPORT_SYMBOL_GPL(perf_tp_event);
4176
Li Zefan6fb29152009-10-15 11:21:42 +08004177static int perf_tp_event_match(struct perf_event *event,
4178 struct perf_sample_data *data)
4179{
4180 void *record = data->raw->data;
4181
4182 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4183 return 1;
4184 return 0;
4185}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004186
4187static void tp_perf_event_destroy(struct perf_event *event)
4188{
4189 ftrace_profile_disable(event->attr.config);
4190}
4191
4192static const struct pmu *tp_perf_event_init(struct perf_event *event)
4193{
4194 /*
4195	 * Raw tracepoint data is a severe data leak, so only allow root to
4196 * have these.
4197 */
4198 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4199 perf_paranoid_tracepoint_raw() &&
4200 !capable(CAP_SYS_ADMIN))
4201 return ERR_PTR(-EPERM);
4202
4203 if (ftrace_profile_enable(event->attr.config))
4204 return NULL;
4205
4206 event->destroy = tp_perf_event_destroy;
4207
4208 return &perf_ops_generic;
4209}
Li Zefan6fb29152009-10-15 11:21:42 +08004210
4211static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4212{
4213 char *filter_str;
4214 int ret;
4215
4216 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4217 return -EINVAL;
4218
4219 filter_str = strndup_user(arg, PAGE_SIZE);
4220 if (IS_ERR(filter_str))
4221 return PTR_ERR(filter_str);
4222
4223 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4224
4225 kfree(filter_str);
4226 return ret;
4227}
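
/*
 * Userspace-side sketch of how the filter above is typically installed:
 * open a PERF_TYPE_TRACEPOINT event (attr.config is the tracepoint id,
 * normally read from /sys/kernel/debug/tracing/events/<subsys>/<event>/id)
 * and pass the filter string via the PERF_EVENT_IOC_SET_FILTER ioctl.
 * Hypothetical values; error handling omitted.
 */
#if 0
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_TRACEPOINT,
		.size		= sizeof(attr),
		.config		= tracepoint_id,	/* from debugfs */
		.sample_type	= PERF_SAMPLE_RAW,
		.sample_period	= 1,
	};
	int fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);

	ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 0");
#endif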
4228
4229static void perf_event_free_filter(struct perf_event *event)
4230{
4231 ftrace_profile_free_filter(event);
4232}
4233
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004234#else
Li Zefan6fb29152009-10-15 11:21:42 +08004235
4236static int perf_tp_event_match(struct perf_event *event,
4237 struct perf_sample_data *data)
4238{
4239 return 1;
4240}
4241
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004242static const struct pmu *tp_perf_event_init(struct perf_event *event)
4243{
4244 return NULL;
4245}
Li Zefan6fb29152009-10-15 11:21:42 +08004246
4247static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4248{
4249 return -ENOENT;
4250}
4251
4252static void perf_event_free_filter(struct perf_event *event)
4253{
4254}
4255
4256#endif /* CONFIG_EVENT_PROFILE */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004257
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02004258#ifdef CONFIG_HAVE_HW_BREAKPOINT
4259static void bp_perf_event_destroy(struct perf_event *event)
4260{
4261 release_bp_slot(event);
4262}
4263
4264static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4265{
4266 int err;
4267 /*
4268	 * The breakpoint is already filled in if we didn't create the counter
4269	 * through the perf syscall.
4270	 * FIXME: arrange for 'triggered' to be NULL if the event comes from the syscall
4271 */
4272 if (!bp->callback)
4273 err = register_perf_hw_breakpoint(bp);
4274 else
4275 err = __register_perf_hw_breakpoint(bp);
4276 if (err)
4277 return ERR_PTR(err);
4278
4279 bp->destroy = bp_perf_event_destroy;
4280
4281 return &perf_ops_bp;
4282}
4283
4284void perf_bp_event(struct perf_event *bp, void *regs)
4285{
4286 /* TODO */
4287}
4288#else
4289static void bp_perf_event_destroy(struct perf_event *event)
4290{
4291}
4292
4293static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4294{
4295 return NULL;
4296}
4297
4298void perf_bp_event(struct perf_event *bp, void *regs)
4299{
4300}
4301#endif
4302
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004303atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4304
4305static void sw_perf_event_destroy(struct perf_event *event)
4306{
4307 u64 event_id = event->attr.config;
4308
4309 WARN_ON(event->parent);
4310
4311 atomic_dec(&perf_swevent_enabled[event_id]);
4312}
4313
4314static const struct pmu *sw_perf_event_init(struct perf_event *event)
4315{
4316 const struct pmu *pmu = NULL;
4317 u64 event_id = event->attr.config;
4318
4319 /*
4320 * Software events (currently) can't in general distinguish
4321 * between user, kernel and hypervisor events.
4322 * However, context switches and cpu migrations are considered
4323 * to be kernel events, and page faults are never hypervisor
4324 * events.
4325 */
4326 switch (event_id) {
4327 case PERF_COUNT_SW_CPU_CLOCK:
4328 pmu = &perf_ops_cpu_clock;
4329
4330 break;
4331 case PERF_COUNT_SW_TASK_CLOCK:
4332 /*
4333 * If the user instantiates this as a per-cpu event,
4334 * use the cpu_clock event instead.
4335 */
4336 if (event->ctx->task)
4337 pmu = &perf_ops_task_clock;
4338 else
4339 pmu = &perf_ops_cpu_clock;
4340
4341 break;
4342 case PERF_COUNT_SW_PAGE_FAULTS:
4343 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4344 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4345 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4346 case PERF_COUNT_SW_CPU_MIGRATIONS:
Anton Blanchardf7d79862009-10-18 01:09:29 +00004347 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4348 case PERF_COUNT_SW_EMULATION_FAULTS:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004349 if (!event->parent) {
4350 atomic_inc(&perf_swevent_enabled[event_id]);
4351 event->destroy = sw_perf_event_destroy;
4352 }
4353 pmu = &perf_ops_generic;
4354 break;
4355 }
4356
4357 return pmu;
4358}
4359
4360/*
4361 * Allocate and initialize an event structure
4362 */
4363static struct perf_event *
4364perf_event_alloc(struct perf_event_attr *attr,
4365 int cpu,
4366 struct perf_event_context *ctx,
4367 struct perf_event *group_leader,
4368 struct perf_event *parent_event,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004369 perf_callback_t callback,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004370 gfp_t gfpflags)
4371{
4372 const struct pmu *pmu;
4373 struct perf_event *event;
4374 struct hw_perf_event *hwc;
4375 long err;
4376
4377 event = kzalloc(sizeof(*event), gfpflags);
4378 if (!event)
4379 return ERR_PTR(-ENOMEM);
4380
4381 /*
4382 * Single events are their own group leaders, with an
4383 * empty sibling list:
4384 */
4385 if (!group_leader)
4386 group_leader = event;
4387
4388 mutex_init(&event->child_mutex);
4389 INIT_LIST_HEAD(&event->child_list);
4390
4391 INIT_LIST_HEAD(&event->group_entry);
4392 INIT_LIST_HEAD(&event->event_entry);
4393 INIT_LIST_HEAD(&event->sibling_list);
4394 init_waitqueue_head(&event->waitq);
4395
4396 mutex_init(&event->mmap_mutex);
4397
4398 event->cpu = cpu;
4399 event->attr = *attr;
4400 event->group_leader = group_leader;
4401 event->pmu = NULL;
4402 event->ctx = ctx;
4403 event->oncpu = -1;
4404
4405 event->parent = parent_event;
4406
4407 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4408 event->id = atomic64_inc_return(&perf_event_id);
4409
4410 event->state = PERF_EVENT_STATE_INACTIVE;
4411
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004412 if (!callback && parent_event)
4413 callback = parent_event->callback;
4414
4415 event->callback = callback;
4416
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004417 if (attr->disabled)
4418 event->state = PERF_EVENT_STATE_OFF;
4419
4420 pmu = NULL;
4421
4422 hwc = &event->hw;
4423 hwc->sample_period = attr->sample_period;
4424 if (attr->freq && attr->sample_freq)
4425 hwc->sample_period = 1;
4426 hwc->last_period = hwc->sample_period;
4427
4428 atomic64_set(&hwc->period_left, hwc->sample_period);
4429
4430 /*
4431 * we currently do not support PERF_FORMAT_GROUP on inherited events
4432 */
4433 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4434 goto done;
4435
4436 switch (attr->type) {
4437 case PERF_TYPE_RAW:
4438 case PERF_TYPE_HARDWARE:
4439 case PERF_TYPE_HW_CACHE:
4440 pmu = hw_perf_event_init(event);
4441 break;
4442
4443 case PERF_TYPE_SOFTWARE:
4444 pmu = sw_perf_event_init(event);
4445 break;
4446
4447 case PERF_TYPE_TRACEPOINT:
4448 pmu = tp_perf_event_init(event);
4449 break;
4450
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02004451 case PERF_TYPE_BREAKPOINT:
4452 pmu = bp_perf_event_init(event);
4453 break;
4454
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004456 default:
4457 break;
4458 }
4459done:
4460 err = 0;
4461 if (!pmu)
4462 err = -EINVAL;
4463 else if (IS_ERR(pmu))
4464 err = PTR_ERR(pmu);
4465
4466 if (err) {
4467 if (event->ns)
4468 put_pid_ns(event->ns);
4469 kfree(event);
4470 return ERR_PTR(err);
4471 }
4472
4473 event->pmu = pmu;
4474
4475 if (!event->parent) {
4476 atomic_inc(&nr_events);
4477 if (event->attr.mmap)
4478 atomic_inc(&nr_mmap_events);
4479 if (event->attr.comm)
4480 atomic_inc(&nr_comm_events);
4481 if (event->attr.task)
4482 atomic_inc(&nr_task_events);
4483 }
4484
4485 return event;
4486}
4487
4488static int perf_copy_attr(struct perf_event_attr __user *uattr,
4489 struct perf_event_attr *attr)
4490{
4491 u32 size;
4492 int ret;
4493
4494 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4495 return -EFAULT;
4496
4497 /*
4498	 * Zero the full structure, so that a short copy leaves the rest zeroed.
4499 */
4500 memset(attr, 0, sizeof(*attr));
4501
4502 ret = get_user(size, &uattr->size);
4503 if (ret)
4504 return ret;
4505
4506 if (size > PAGE_SIZE) /* silly large */
4507 goto err_size;
4508
4509 if (!size) /* abi compat */
4510 size = PERF_ATTR_SIZE_VER0;
4511
4512 if (size < PERF_ATTR_SIZE_VER0)
4513 goto err_size;
4514
4515 /*
4516 * If we're handed a bigger struct than we know of,
4517 * ensure all the unknown bits are 0 - i.e. new
4518 * user-space does not rely on any kernel feature
4519	 * extensions we don't know about yet.
4520 */
4521 if (size > sizeof(*attr)) {
4522 unsigned char __user *addr;
4523 unsigned char __user *end;
4524 unsigned char val;
4525
4526 addr = (void __user *)uattr + sizeof(*attr);
4527 end = (void __user *)uattr + size;
4528
4529 for (; addr < end; addr++) {
4530 ret = get_user(val, addr);
4531 if (ret)
4532 return ret;
4533 if (val)
4534 goto err_size;
4535 }
4536 size = sizeof(*attr);
4537 }
4538
4539 ret = copy_from_user(attr, uattr, size);
4540 if (ret)
4541 return -EFAULT;
4542
4543 /*
4544	 * If the type exists, the type-specific event initialization
4545	 * will verify the attr->config.
4546 */
4547 if (attr->type >= PERF_TYPE_MAX)
4548 return -EINVAL;
4549
4550 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4551 return -EINVAL;
4552
4553 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4554 return -EINVAL;
4555
4556 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4557 return -EINVAL;
4558
4559out:
4560 return ret;
4561
4562err_size:
4563 put_user(sizeof(*attr), &uattr->size);
4564 ret = -E2BIG;
4565 goto out;
4566}
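
/*
 * Userspace-side sketch of the size handshake implemented above: the
 * caller advertises how large a perf_event_attr it knows about, and a
 * kernel that knows about less reports its own size back via -E2BIG.
 * Illustrative fragment only; most error handling omitted.
 */
#if 0
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);	/* what this userspace was built with */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0 && errno == E2BIG) {
		/*
		 * attr.size now holds the kernel's struct size; a newer
		 * userspace can zero the fields beyond it and retry.
		 */
	}
#endif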
4567
Li Zefan6fb29152009-10-15 11:21:42 +08004568static int perf_event_set_output(struct perf_event *event, int output_fd)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004569{
4570 struct perf_event *output_event = NULL;
4571 struct file *output_file = NULL;
4572 struct perf_event *old_output;
4573 int fput_needed = 0;
4574 int ret = -EINVAL;
4575
4576 if (!output_fd)
4577 goto set;
4578
4579 output_file = fget_light(output_fd, &fput_needed);
4580 if (!output_file)
4581 return -EBADF;
4582
4583 if (output_file->f_op != &perf_fops)
4584 goto out;
4585
4586 output_event = output_file->private_data;
4587
4588 /* Don't chain output fds */
4589 if (output_event->output)
4590 goto out;
4591
4592 /* Don't set an output fd when we already have an output channel */
4593 if (event->data)
4594 goto out;
4595
4596 atomic_long_inc(&output_file->f_count);
4597
4598set:
4599 mutex_lock(&event->mmap_mutex);
4600 old_output = event->output;
4601 rcu_assign_pointer(event->output, output_event);
4602 mutex_unlock(&event->mmap_mutex);
4603
4604 if (old_output) {
4605 /*
4606 * we need to make sure no existing perf_output_*()
4607 * is still referencing this event.
4608 */
4609 synchronize_rcu();
4610 fput(old_output->filp);
4611 }
4612
4613 ret = 0;
4614out:
4615 fput_light(output_file, fput_needed);
4616 return ret;
4617}
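
/*
 * Userspace-side sketch of the two ways to reach the function above:
 * redirect an already-open event into another event's ring buffer with
 * the PERF_EVENT_IOC_SET_OUTPUT ioctl, or name the output fd at open time
 * via PERF_FLAG_FD_OUTPUT (the group_fd argument doubles as the output
 * fd).  Per the checks above, the target must not itself redirect and the
 * event must not already have a buffer mmap()ed.  fd_b / fd_out / attr_b /
 * pid / cpu are placeholders.
 */
#if 0
	/* after the fact, on an already-open event fd_b: */
	ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_out);

	/* or at open time, without also grouping under fd_out: */
	fd_b = syscall(__NR_perf_event_open, &attr_b, pid, cpu, fd_out,
		       PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT);
#endif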
4618
4619/**
4620 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4621 *
4622 * @attr_uptr: event_id type attributes for monitoring/sampling
4623 * @pid: target pid
4624 * @cpu: target cpu
4625 * @group_fd: group leader event fd
4626 */
4627SYSCALL_DEFINE5(perf_event_open,
4628 struct perf_event_attr __user *, attr_uptr,
4629 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4630{
4631 struct perf_event *event, *group_leader;
4632 struct perf_event_attr attr;
4633 struct perf_event_context *ctx;
4634 struct file *event_file = NULL;
4635 struct file *group_file = NULL;
4636 int fput_needed = 0;
4637 int fput_needed2 = 0;
4638 int err;
4639
4640 /* for future expandability... */
4641 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4642 return -EINVAL;
4643
4644 err = perf_copy_attr(attr_uptr, &attr);
4645 if (err)
4646 return err;
4647
4648 if (!attr.exclude_kernel) {
4649 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4650 return -EACCES;
4651 }
4652
4653 if (attr.freq) {
4654 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4655 return -EINVAL;
4656 }
4657
4658 /*
4659 * Get the target context (task or percpu):
4660 */
4661 ctx = find_get_context(pid, cpu);
4662 if (IS_ERR(ctx))
4663 return PTR_ERR(ctx);
4664
4665 /*
4666 * Look up the group leader (we will attach this event to it):
4667 */
4668 group_leader = NULL;
4669 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4670 err = -EINVAL;
4671 group_file = fget_light(group_fd, &fput_needed);
4672 if (!group_file)
4673 goto err_put_context;
4674 if (group_file->f_op != &perf_fops)
4675 goto err_put_context;
4676
4677 group_leader = group_file->private_data;
4678 /*
4679 * Do not allow a recursive hierarchy (this new sibling
4680 * becoming part of another group-sibling):
4681 */
4682 if (group_leader->group_leader != group_leader)
4683 goto err_put_context;
4684 /*
4685		 * Do not allow attaching to a group in a different
4686 * task or CPU context:
4687 */
4688 if (group_leader->ctx != ctx)
4689 goto err_put_context;
4690 /*
4691 * Only a group leader can be exclusive or pinned
4692 */
4693 if (attr.exclusive || attr.pinned)
4694 goto err_put_context;
4695 }
4696
4697 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004698 NULL, NULL, GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004699 err = PTR_ERR(event);
4700 if (IS_ERR(event))
4701 goto err_put_context;
4702
4703 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4704 if (err < 0)
4705 goto err_free_put_context;
4706
4707 event_file = fget_light(err, &fput_needed2);
4708 if (!event_file)
4709 goto err_free_put_context;
4710
4711 if (flags & PERF_FLAG_FD_OUTPUT) {
4712 err = perf_event_set_output(event, group_fd);
4713 if (err)
4714 goto err_fput_free_put_context;
4715 }
4716
4717 event->filp = event_file;
4718 WARN_ON_ONCE(ctx->parent_ctx);
4719 mutex_lock(&ctx->mutex);
4720 perf_install_in_context(ctx, event, cpu);
4721 ++ctx->generation;
4722 mutex_unlock(&ctx->mutex);
4723
4724 event->owner = current;
4725 get_task_struct(current);
4726 mutex_lock(&current->perf_event_mutex);
4727 list_add_tail(&event->owner_entry, &current->perf_event_list);
4728 mutex_unlock(&current->perf_event_mutex);
4729
4730err_fput_free_put_context:
4731 fput_light(event_file, fput_needed2);
4732
4733err_free_put_context:
4734 if (err < 0)
4735 kfree(event);
4736
4737err_put_context:
4738 if (err < 0)
4739 put_ctx(ctx);
4740
4741 fput_light(group_file, fput_needed);
4742
4743 return err;
4744}
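
/*
 * Userspace-side sketch of the syscall above: count task clock for a
 * stretch of the calling process and read the result back.  Assumes
 * headers exposing __NR_perf_event_open (2.6.31+); error handling is
 * minimal.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type	= PERF_TYPE_SOFTWARE;
	attr.size	= sizeof(attr);
	attr.config	= PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled	= 1;

	/* current task (pid 0), any cpu (-1), no group leader, no flags */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", (unsigned long long)count);

	close(fd);
	return 0;
}
#endif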
4745
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004746/**
4747 * perf_event_create_kernel_counter
4748 *
4749 * @attr: attributes of the counter to create
4750 * @cpu: cpu to which the counter is bound
4751 * @pid: task to profile
4752 */
4753struct perf_event *
4754perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004755 pid_t pid, perf_callback_t callback)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004756{
4757 struct perf_event *event;
4758 struct perf_event_context *ctx;
4759 int err;
4760
4761 /*
4762 * Get the target context (task or percpu):
4763 */
4764
4765 ctx = find_get_context(pid, cpu);
4766 if (IS_ERR(ctx))
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02004767 return NULL;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004768
4769 event = perf_event_alloc(attr, cpu, ctx, NULL,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004770 NULL, callback, GFP_KERNEL);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004771 err = PTR_ERR(event);
4772 if (IS_ERR(event))
4773 goto err_put_context;
4774
4775 event->filp = NULL;
4776 WARN_ON_ONCE(ctx->parent_ctx);
4777 mutex_lock(&ctx->mutex);
4778 perf_install_in_context(ctx, event, cpu);
4779 ++ctx->generation;
4780 mutex_unlock(&ctx->mutex);
4781
4782 event->owner = current;
4783 get_task_struct(current);
4784 mutex_lock(&current->perf_event_mutex);
4785 list_add_tail(&event->owner_entry, &current->perf_event_list);
4786 mutex_unlock(&current->perf_event_mutex);
4787
4788 return event;
4789
4790err_put_context:
4791 if (err < 0)
4792 put_ctx(ctx);
4793
4794 return NULL;
4795}
4796EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
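
/*
 * In-kernel sketch of using the export above: a hypothetical caller
 * creating a cpu-bound software counter.  Note that in this version the
 * function returns NULL, not an ERR_PTR, on failure.
 */
static __maybe_unused struct perf_event *
example_create_cpu_clock_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,
		.size	= sizeof(attr),
	};

	/* cpu-bound: pid == -1; no callback */
	return perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
}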
4797
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004798/*
4799 * inherit an event from parent task to child task:
4800 */
4801static struct perf_event *
4802inherit_event(struct perf_event *parent_event,
4803 struct task_struct *parent,
4804 struct perf_event_context *parent_ctx,
4805 struct task_struct *child,
4806 struct perf_event *group_leader,
4807 struct perf_event_context *child_ctx)
4808{
4809 struct perf_event *child_event;
4810
4811 /*
4812 * Instead of creating recursive hierarchies of events,
4813 * we link inherited events back to the original parent,
4814 * which has a filp for sure, which we use as the reference
4815	 * which is guaranteed to have a filp that we use as the
4816	 * reference count:
4817 if (parent_event->parent)
4818 parent_event = parent_event->parent;
4819
4820 child_event = perf_event_alloc(&parent_event->attr,
4821 parent_event->cpu, child_ctx,
4822 group_leader, parent_event,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004823 NULL, GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004824 if (IS_ERR(child_event))
4825 return child_event;
4826 get_ctx(child_ctx);
4827
4828 /*
4829 * Make the child state follow the state of the parent event,
4830 * not its attr.disabled bit. We hold the parent's mutex,
4831 * so we won't race with perf_event_{en, dis}able_family.
4832 */
4833 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4834 child_event->state = PERF_EVENT_STATE_INACTIVE;
4835 else
4836 child_event->state = PERF_EVENT_STATE_OFF;
4837
4838 if (parent_event->attr.freq)
4839 child_event->hw.sample_period = parent_event->hw.sample_period;
4840
Peter Zijlstra453f19e2009-11-20 22:19:43 +01004841 child_event->overflow_handler = parent_event->overflow_handler;
4842
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004843 /*
4844 * Link it up in the child's context:
4845 */
4846 add_event_to_ctx(child_event, child_ctx);
4847
4848 /*
4849 * Get a reference to the parent filp - we will fput it
4850 * when the child event exits. This is safe to do because
4851 * we are in the parent and we know that the filp still
4852 * exists and has a nonzero count:
4853 */
4854 atomic_long_inc(&parent_event->filp->f_count);
4855
4856 /*
4857 * Link this into the parent event's child list
4858 */
4859 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4860 mutex_lock(&parent_event->child_mutex);
4861 list_add_tail(&child_event->child_list, &parent_event->child_list);
4862 mutex_unlock(&parent_event->child_mutex);
4863
4864 return child_event;
4865}
4866
4867static int inherit_group(struct perf_event *parent_event,
4868 struct task_struct *parent,
4869 struct perf_event_context *parent_ctx,
4870 struct task_struct *child,
4871 struct perf_event_context *child_ctx)
4872{
4873 struct perf_event *leader;
4874 struct perf_event *sub;
4875 struct perf_event *child_ctr;
4876
4877 leader = inherit_event(parent_event, parent, parent_ctx,
4878 child, NULL, child_ctx);
4879 if (IS_ERR(leader))
4880 return PTR_ERR(leader);
4881 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4882 child_ctr = inherit_event(sub, parent, parent_ctx,
4883 child, leader, child_ctx);
4884 if (IS_ERR(child_ctr))
4885 return PTR_ERR(child_ctr);
4886 }
4887 return 0;
4888}
4889
4890static void sync_child_event(struct perf_event *child_event,
4891 struct task_struct *child)
4892{
4893 struct perf_event *parent_event = child_event->parent;
4894 u64 child_val;
4895
4896 if (child_event->attr.inherit_stat)
4897 perf_event_read_event(child_event, child);
4898
4899 child_val = atomic64_read(&child_event->count);
4900
4901 /*
4902 * Add back the child's count to the parent's count:
4903 */
4904 atomic64_add(child_val, &parent_event->count);
4905 atomic64_add(child_event->total_time_enabled,
4906 &parent_event->child_total_time_enabled);
4907 atomic64_add(child_event->total_time_running,
4908 &parent_event->child_total_time_running);
4909
4910 /*
4911 * Remove this event from the parent's list
4912 */
4913 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4914 mutex_lock(&parent_event->child_mutex);
4915 list_del_init(&child_event->child_list);
4916 mutex_unlock(&parent_event->child_mutex);
4917
4918 /*
4919 * Release the parent event, if this was the last
4920 * reference to it.
4921 */
4922 fput(parent_event->filp);
4923}
4924
4925static void
4926__perf_event_exit_task(struct perf_event *child_event,
4927 struct perf_event_context *child_ctx,
4928 struct task_struct *child)
4929{
4930 struct perf_event *parent_event;
4931
4932 update_event_times(child_event);
4933 perf_event_remove_from_context(child_event);
4934
4935 parent_event = child_event->parent;
4936 /*
4937	 * It can happen that the parent exits first, and has events
4938	 * that are still around due to the child reference. These
4939	 * events need to be zapped; otherwise they would linger.
4940 */
4941 if (parent_event) {
4942 sync_child_event(child_event, child);
4943 free_event(child_event);
4944 }
4945}
4946
4947/*
4948 * When a child task exits, feed back event values to parent events.
4949 */
4950void perf_event_exit_task(struct task_struct *child)
4951{
4952 struct perf_event *child_event, *tmp;
4953 struct perf_event_context *child_ctx;
4954 unsigned long flags;
4955
4956 if (likely(!child->perf_event_ctxp)) {
4957 perf_event_task(child, NULL, 0);
4958 return;
4959 }
4960
4961 local_irq_save(flags);
4962 /*
4963 * We can't reschedule here because interrupts are disabled,
4964 * and either child is current or it is a task that can't be
4965 * scheduled, so we are now safe from rescheduling changing
4966 * our context.
4967 */
4968 child_ctx = child->perf_event_ctxp;
4969 __perf_event_task_sched_out(child_ctx);
4970
4971 /*
4972 * Take the context lock here so that if find_get_context is
4973 * reading child->perf_event_ctxp, we wait until it has
4974 * incremented the context's refcount before we do put_ctx below.
4975 */
4976 spin_lock(&child_ctx->lock);
4977 child->perf_event_ctxp = NULL;
4978 /*
4979	 * If this context is a clone, unclone it so it can't get
4980 * swapped to another process while we're removing all
4981 * the events from it.
4982 */
4983 unclone_ctx(child_ctx);
4984 spin_unlock_irqrestore(&child_ctx->lock, flags);
4985
4986 /*
4987 * Report the task dead after unscheduling the events so that we
4988 * won't get any samples after PERF_RECORD_EXIT. We can however still
4989 * get a few PERF_RECORD_READ events.
4990 */
4991 perf_event_task(child, child_ctx, 0);
4992
4993 /*
4994 * We can recurse on the same lock type through:
4995 *
4996 * __perf_event_exit_task()
4997 * sync_child_event()
4998 * fput(parent_event->filp)
4999 * perf_release()
5000 * mutex_lock(&ctx->mutex)
5001 *
5002	 * But since it's the parent context it won't be the same instance.
5003 */
5004 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5005
5006again:
5007 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
5008 group_entry)
5009 __perf_event_exit_task(child_event, child_ctx, child);
5010
5011 /*
5012 * If the last event was a group event, it will have appended all
5013 * its siblings to the list, but we obtained 'tmp' before that which
5014 * will still point to the list head terminating the iteration.
5015 */
5016 if (!list_empty(&child_ctx->group_list))
5017 goto again;
5018
5019 mutex_unlock(&child_ctx->mutex);
5020
5021 put_ctx(child_ctx);
5022}
5023
5024/*
5025 * Free an unexposed, unused context, as created by inheritance in
5026 * perf_event_init_task() below; used by fork() in case of failure.
5027 */
5028void perf_event_free_task(struct task_struct *task)
5029{
5030 struct perf_event_context *ctx = task->perf_event_ctxp;
5031 struct perf_event *event, *tmp;
5032
5033 if (!ctx)
5034 return;
5035
5036 mutex_lock(&ctx->mutex);
5037again:
5038 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
5039 struct perf_event *parent = event->parent;
5040
5041 if (WARN_ON_ONCE(!parent))
5042 continue;
5043
5044 mutex_lock(&parent->child_mutex);
5045 list_del_init(&event->child_list);
5046 mutex_unlock(&parent->child_mutex);
5047
5048 fput(parent->filp);
5049
5050 list_del_event(event, ctx);
5051 free_event(event);
5052 }
5053
5054 if (!list_empty(&ctx->group_list))
5055 goto again;
5056
5057 mutex_unlock(&ctx->mutex);
5058
5059 put_ctx(ctx);
5060}
5061
5062/*
5063 * Initialize the perf_event context in task_struct
5064 */
5065int perf_event_init_task(struct task_struct *child)
5066{
5067 struct perf_event_context *child_ctx, *parent_ctx;
5068 struct perf_event_context *cloned_ctx;
5069 struct perf_event *event;
5070 struct task_struct *parent = current;
5071 int inherited_all = 1;
5072 int ret = 0;
5073
5074 child->perf_event_ctxp = NULL;
5075
5076 mutex_init(&child->perf_event_mutex);
5077 INIT_LIST_HEAD(&child->perf_event_list);
5078
5079 if (likely(!parent->perf_event_ctxp))
5080 return 0;
5081
5082 /*
5083 * This is executed from the parent task context, so inherit
5084 * events that have been marked for cloning.
5085 * First allocate and initialize a context for the child.
5086 */
5087
5088 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5089 if (!child_ctx)
5090 return -ENOMEM;
5091
5092 __perf_event_init_context(child_ctx, child);
5093 child->perf_event_ctxp = child_ctx;
5094 get_task_struct(child);
5095
5096 /*
5097 * If the parent's context is a clone, pin it so it won't get
5098 * swapped under us.
5099 */
5100 parent_ctx = perf_pin_task_context(parent);
5101
5102 /*
5103 * No need to check if parent_ctx != NULL here; since we saw
5104 * it non-NULL earlier, the only reason for it to become NULL
5105 * is if we exit, and since we're currently in the middle of
5106 * a fork we can't be exiting at the same time.
5107 */
5108
5109 /*
5110 * Lock the parent list. No need to lock the child - not PID
5111 * hashed yet and not running, so nobody can access it.
5112 */
5113 mutex_lock(&parent_ctx->mutex);
5114
5115 /*
5116	 * We don't have to disable NMIs - we are only looking at
5117 * the list, not manipulating it:
5118 */
Xiao Guangrong27f99942009-09-25 13:54:01 +08005119 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005120
5121 if (!event->attr.inherit) {
5122 inherited_all = 0;
5123 continue;
5124 }
5125
5126 ret = inherit_group(event, parent, parent_ctx,
5127 child, child_ctx);
5128 if (ret) {
5129 inherited_all = 0;
5130 break;
5131 }
5132 }
5133
5134 if (inherited_all) {
5135 /*
5136 * Mark the child context as a clone of the parent
5137 * context, or of whatever the parent is a clone of.
5138 * Note that if the parent is a clone, it could get
5139 * uncloned at any point, but that doesn't matter
5140 * because the list of events and the generation
5141 * count can't have changed since we took the mutex.
5142 */
5143 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5144 if (cloned_ctx) {
5145 child_ctx->parent_ctx = cloned_ctx;
5146 child_ctx->parent_gen = parent_ctx->parent_gen;
5147 } else {
5148 child_ctx->parent_ctx = parent_ctx;
5149 child_ctx->parent_gen = parent_ctx->generation;
5150 }
5151 get_ctx(child_ctx->parent_ctx);
5152 }
5153
5154 mutex_unlock(&parent_ctx->mutex);
5155
5156 perf_unpin_context(parent_ctx);
5157
5158 return ret;
5159}
5160
5161static void __cpuinit perf_event_init_cpu(int cpu)
5162{
5163 struct perf_cpu_context *cpuctx;
5164
5165 cpuctx = &per_cpu(perf_cpu_context, cpu);
5166 __perf_event_init_context(&cpuctx->ctx, NULL);
5167
5168 spin_lock(&perf_resource_lock);
5169 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5170 spin_unlock(&perf_resource_lock);
5171
5172 hw_perf_event_setup(cpu);
5173}
5174
5175#ifdef CONFIG_HOTPLUG_CPU
5176static void __perf_event_exit_cpu(void *info)
5177{
5178 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5179 struct perf_event_context *ctx = &cpuctx->ctx;
5180 struct perf_event *event, *tmp;
5181
5182 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
5183 __perf_event_remove_from_context(event);
5184}
5185static void perf_event_exit_cpu(int cpu)
5186{
5187 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5188 struct perf_event_context *ctx = &cpuctx->ctx;
5189
5190 mutex_lock(&ctx->mutex);
5191 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5192 mutex_unlock(&ctx->mutex);
5193}
5194#else
5195static inline void perf_event_exit_cpu(int cpu) { }
5196#endif
5197
5198static int __cpuinit
5199perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5200{
5201 unsigned int cpu = (long)hcpu;
5202
5203 switch (action) {
5204
5205 case CPU_UP_PREPARE:
5206 case CPU_UP_PREPARE_FROZEN:
5207 perf_event_init_cpu(cpu);
5208 break;
5209
5210 case CPU_ONLINE:
5211 case CPU_ONLINE_FROZEN:
5212 hw_perf_event_setup_online(cpu);
5213 break;
5214
5215 case CPU_DOWN_PREPARE:
5216 case CPU_DOWN_PREPARE_FROZEN:
5217 perf_event_exit_cpu(cpu);
5218 break;
5219
5220 default:
5221 break;
5222 }
5223
5224 return NOTIFY_OK;
5225}
5226
5227/*
5228 * This has to have a higher priority than migration_notifier in sched.c.
5229 */
5230static struct notifier_block __cpuinitdata perf_cpu_nb = {
5231 .notifier_call = perf_cpu_notify,
5232 .priority = 20,
5233};
5234
5235void __init perf_event_init(void)
5236{
5237 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5238 (void *)(long)smp_processor_id());
5239 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5240 (void *)(long)smp_processor_id());
5241 register_cpu_notifier(&perf_cpu_nb);
5242}
5243
5244static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5245{
5246 return sprintf(buf, "%d\n", perf_reserved_percpu);
5247}
5248
5249static ssize_t
5250perf_set_reserve_percpu(struct sysdev_class *class,
5251 const char *buf,
5252 size_t count)
5253{
5254 struct perf_cpu_context *cpuctx;
5255 unsigned long val;
5256 int err, cpu, mpt;
5257
5258 err = strict_strtoul(buf, 10, &val);
5259 if (err)
5260 return err;
5261 if (val > perf_max_events)
5262 return -EINVAL;
5263
5264 spin_lock(&perf_resource_lock);
5265 perf_reserved_percpu = val;
5266 for_each_online_cpu(cpu) {
5267 cpuctx = &per_cpu(perf_cpu_context, cpu);
5268 spin_lock_irq(&cpuctx->ctx.lock);
5269 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5270 perf_max_events - perf_reserved_percpu);
5271 cpuctx->max_pertask = mpt;
5272 spin_unlock_irq(&cpuctx->ctx.lock);
5273 }
5274 spin_unlock(&perf_resource_lock);
5275
5276 return count;
5277}
5278
5279static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5280{
5281 return sprintf(buf, "%d\n", perf_overcommit);
5282}
5283
5284static ssize_t
5285perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5286{
5287 unsigned long val;
5288 int err;
5289
5290 err = strict_strtoul(buf, 10, &val);
5291 if (err)
5292 return err;
5293 if (val > 1)
5294 return -EINVAL;
5295
5296 spin_lock(&perf_resource_lock);
5297 perf_overcommit = val;
5298 spin_unlock(&perf_resource_lock);
5299
5300 return count;
5301}
5302
5303static SYSDEV_CLASS_ATTR(
5304 reserve_percpu,
5305 0644,
5306 perf_show_reserve_percpu,
5307 perf_set_reserve_percpu
5308 );
5309
5310static SYSDEV_CLASS_ATTR(
5311 overcommit,
5312 0644,
5313 perf_show_overcommit,
5314 perf_set_overcommit
5315 );
5316
5317static struct attribute *perfclass_attrs[] = {
5318 &attr_reserve_percpu.attr,
5319 &attr_overcommit.attr,
5320 NULL
5321};
5322
5323static struct attribute_group perfclass_attr_group = {
5324 .attrs = perfclass_attrs,
5325 .name = "perf_events",
5326};
5327
5328static int __init perf_event_sysfs_init(void)
5329{
5330 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5331 &perfclass_attr_group);
5332}
5333device_initcall(perf_event_sysfs_init);
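
/*
 * The two attributes registered above should show up as writable files
 * under the cpu sysdev class, i.e. (assuming the usual sysfs layout):
 *
 *	/sys/devices/system/cpu/perf_events/reserve_percpu
 *	/sys/devices/system/cpu/perf_events/overcommit
 *
 * Userspace-side sketch: reserving two counters per CPU for cpu-bound
 * events, which lowers each context's max_pertask accordingly
 * (illustrative only; path derived from the registration above):
 */
#if 0
	int fd = open("/sys/devices/system/cpu/perf_events/reserve_percpu",
		      O_WRONLY);

	if (fd >= 0) {
		write(fd, "2", 1);
		close(fd);
	}
#endif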