blob: fb851ec34461c932620374f8d8236dbbcb57bb84 [file] [log] [blame]
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001/*
Ingo Molnar57c0c152009-09-21 12:20:38 +02002 * Performance events core code:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
Ingo Molnar57c0c152009-09-21 12:20:38 +02009 * For licensing details see kernel-base/COPYING
Ingo Molnarcdd6c482009-09-21 12:02:48 +020010 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
Peter Zijlstra906010b2009-09-21 16:08:49 +020023#include <linux/vmalloc.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020024#include <linux/hardirq.h>
25#include <linux/rculist.h>
26#include <linux/uaccess.h>
27#include <linux/syscalls.h>
28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h>
Li Zefan6fb29152009-10-15 11:21:42 +080031#include <linux/ftrace_event.h>
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +020032#include <linux/hw_breakpoint.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020033
34#include <asm/irq_regs.h>
35
36/*
37 * Each CPU has a list of per CPU events:
38 */
39DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40
41int perf_max_events __read_mostly = 1;
42static int perf_reserved_percpu __read_mostly;
43static int perf_overcommit __read_mostly = 1;
44
45static atomic_t nr_events __read_mostly;
46static atomic_t nr_mmap_events __read_mostly;
47static atomic_t nr_comm_events __read_mostly;
48static atomic_t nr_task_events __read_mostly;
49
50/*
51 * perf event paranoia level:
52 * -1 - not paranoid at all
53 * 0 - disallow raw tracepoint access for unpriv
54 * 1 - disallow cpu events for unpriv
55 * 2 - disallow kernel profiling for unpriv
56 */
57int sysctl_perf_event_paranoid __read_mostly = 1;
58
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75
76/*
77 * max perf event sample rate
78 */
79int sysctl_perf_event_sample_rate __read_mostly = 100000;
80
81static atomic64_t perf_event_id;
82
83/*
84 * Lock for (sysadmin-configurable) event reservations:
85 */
86static DEFINE_SPINLOCK(perf_resource_lock);
87
88/*
89 * Architecture provided APIs - weak aliases:
90 */
91extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
92{
93 return NULL;
94}
95
96void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); }
98
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu)
106{
107 return 0;
108}
109
110void __weak perf_event_print_debug(void) { }
111
112static DEFINE_PER_CPU(int, perf_disable_count);
113
114void __perf_disable(void)
115{
116 __get_cpu_var(perf_disable_count)++;
117}
118
119bool __perf_enable(void)
120{
121 return !--__get_cpu_var(perf_disable_count);
122}
123
124void perf_disable(void)
125{
126 __perf_disable();
127 hw_perf_disable();
128}
129
130void perf_enable(void)
131{
132 if (__perf_enable())
133 hw_perf_enable();
134}
135
136static void get_ctx(struct perf_event_context *ctx)
137{
138 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
139}
140
141static void free_ctx(struct rcu_head *head)
142{
143 struct perf_event_context *ctx;
144
145 ctx = container_of(head, struct perf_event_context, rcu_head);
146 kfree(ctx);
147}
148
149static void put_ctx(struct perf_event_context *ctx)
150{
151 if (atomic_dec_and_test(&ctx->refcount)) {
152 if (ctx->parent_ctx)
153 put_ctx(ctx->parent_ctx);
154 if (ctx->task)
155 put_task_struct(ctx->task);
156 call_rcu(&ctx->rcu_head, free_ctx);
157 }
158}
159
160static void unclone_ctx(struct perf_event_context *ctx)
161{
162 if (ctx->parent_ctx) {
163 put_ctx(ctx->parent_ctx);
164 ctx->parent_ctx = NULL;
165 }
166}
167
168/*
169 * If we inherit events we want to return the parent event id
170 * to userspace.
171 */
172static u64 primary_event_id(struct perf_event *event)
173{
174 u64 id = event->id;
175
176 if (event->parent)
177 id = event->parent->id;
178
179 return id;
180}
181
182/*
183 * Get the perf_event_context for a task and lock it.
184 * This has to cope with with the fact that until it is locked,
185 * the context could get moved to another task.
186 */
187static struct perf_event_context *
188perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189{
190 struct perf_event_context *ctx;
191
192 rcu_read_lock();
193 retry:
194 ctx = rcu_dereference(task->perf_event_ctxp);
195 if (ctx) {
196 /*
197 * If this context is a clone of another, it might
198 * get swapped for another underneath us by
199 * perf_event_task_sched_out, though the
200 * rcu_read_lock() protects us from any context
201 * getting freed. Lock the context and check if it
202 * got swapped before we could get the lock, and retry
203 * if so. If we locked the right context, then it
204 * can't get swapped on us any more.
205 */
206 spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry;
210 }
211
212 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL;
215 }
216 }
217 rcu_read_unlock();
218 return ctx;
219}
220
221/*
222 * Get the context for a task and increment its pin_count so it
223 * can't get swapped to another task. This also increments its
224 * reference count so that the context can't get freed.
225 */
226static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
227{
228 struct perf_event_context *ctx;
229 unsigned long flags;
230
231 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) {
233 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags);
235 }
236 return ctx;
237}
238
239static void perf_unpin_context(struct perf_event_context *ctx)
240{
241 unsigned long flags;
242
243 spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx);
247}
248
249/*
250 * Add a event from the lists for its context.
251 * Must be called with ctx->mutex and ctx->lock held.
252 */
253static void
254list_add_event(struct perf_event *event, struct perf_event_context *ctx)
255{
256 struct perf_event *group_leader = event->group_leader;
257
258 /*
259 * Depending on whether it is a standalone or sibling event,
260 * add it straight to the context's event list, or to the group
261 * leader's sibling list:
262 */
263 if (group_leader == event)
264 list_add_tail(&event->group_entry, &ctx->group_list);
265 else {
266 list_add_tail(&event->group_entry, &group_leader->sibling_list);
267 group_leader->nr_siblings++;
268 }
269
270 list_add_rcu(&event->event_entry, &ctx->event_list);
271 ctx->nr_events++;
272 if (event->attr.inherit_stat)
273 ctx->nr_stat++;
274}
275
276/*
277 * Remove a event from the lists for its context.
278 * Must be called with ctx->mutex and ctx->lock held.
279 */
280static void
281list_del_event(struct perf_event *event, struct perf_event_context *ctx)
282{
283 struct perf_event *sibling, *tmp;
284
285 if (list_empty(&event->group_entry))
286 return;
287 ctx->nr_events--;
288 if (event->attr.inherit_stat)
289 ctx->nr_stat--;
290
291 list_del_init(&event->group_entry);
292 list_del_rcu(&event->event_entry);
293
294 if (event->group_leader != event)
295 event->group_leader->nr_siblings--;
296
Peter Zijlstra2e2af502009-11-23 11:37:25 +0100297 event->state = PERF_EVENT_STATE_OFF;
298
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200299 /*
300 * If this was a group event with sibling events then
301 * upgrade the siblings to singleton events by adding them
302 * to the context list directly:
303 */
304 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
305
306 list_move_tail(&sibling->group_entry, &ctx->group_list);
307 sibling->group_leader = sibling;
308 }
309}
310
311static void
312event_sched_out(struct perf_event *event,
313 struct perf_cpu_context *cpuctx,
314 struct perf_event_context *ctx)
315{
316 if (event->state != PERF_EVENT_STATE_ACTIVE)
317 return;
318
319 event->state = PERF_EVENT_STATE_INACTIVE;
320 if (event->pending_disable) {
321 event->pending_disable = 0;
322 event->state = PERF_EVENT_STATE_OFF;
323 }
324 event->tstamp_stopped = ctx->time;
325 event->pmu->disable(event);
326 event->oncpu = -1;
327
328 if (!is_software_event(event))
329 cpuctx->active_oncpu--;
330 ctx->nr_active--;
331 if (event->attr.exclusive || !cpuctx->active_oncpu)
332 cpuctx->exclusive = 0;
333}
334
335static void
336group_sched_out(struct perf_event *group_event,
337 struct perf_cpu_context *cpuctx,
338 struct perf_event_context *ctx)
339{
340 struct perf_event *event;
341
342 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
343 return;
344
345 event_sched_out(group_event, cpuctx, ctx);
346
347 /*
348 * Schedule out siblings (if any):
349 */
350 list_for_each_entry(event, &group_event->sibling_list, group_entry)
351 event_sched_out(event, cpuctx, ctx);
352
353 if (group_event->attr.exclusive)
354 cpuctx->exclusive = 0;
355}
356
357/*
358 * Cross CPU call to remove a performance event
359 *
360 * We disable the event on the hardware level first. After that we
361 * remove it from the context list.
362 */
363static void __perf_event_remove_from_context(void *info)
364{
365 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
366 struct perf_event *event = info;
367 struct perf_event_context *ctx = event->ctx;
368
369 /*
370 * If this is a task context, we need to check whether it is
371 * the current task context of this cpu. If not it has been
372 * scheduled out before the smp call arrived.
373 */
374 if (ctx->task && cpuctx->task_ctx != ctx)
375 return;
376
377 spin_lock(&ctx->lock);
378 /*
379 * Protect the list operation against NMI by disabling the
380 * events on a global level.
381 */
382 perf_disable();
383
384 event_sched_out(event, cpuctx, ctx);
385
386 list_del_event(event, ctx);
387
388 if (!ctx->task) {
389 /*
390 * Allow more per task events with respect to the
391 * reservation:
392 */
393 cpuctx->max_pertask =
394 min(perf_max_events - ctx->nr_events,
395 perf_max_events - perf_reserved_percpu);
396 }
397
398 perf_enable();
399 spin_unlock(&ctx->lock);
400}
401
402
403/*
404 * Remove the event from a task's (or a CPU's) list of events.
405 *
406 * Must be called with ctx->mutex held.
407 *
408 * CPU events are removed with a smp call. For task events we only
409 * call when the task is on a CPU.
410 *
411 * If event->ctx is a cloned context, callers must make sure that
412 * every task struct that event->ctx->task could possibly point to
413 * remains valid. This is OK when called from perf_release since
414 * that only calls us on the top-level context, which can't be a clone.
415 * When called from perf_event_exit_task, it's OK because the
416 * context has been detached from its task.
417 */
418static void perf_event_remove_from_context(struct perf_event *event)
419{
420 struct perf_event_context *ctx = event->ctx;
421 struct task_struct *task = ctx->task;
422
423 if (!task) {
424 /*
425 * Per cpu events are removed via an smp call and
426 * the removal is always sucessful.
427 */
428 smp_call_function_single(event->cpu,
429 __perf_event_remove_from_context,
430 event, 1);
431 return;
432 }
433
434retry:
435 task_oncpu_function_call(task, __perf_event_remove_from_context,
436 event);
437
438 spin_lock_irq(&ctx->lock);
439 /*
440 * If the context is active we need to retry the smp call.
441 */
442 if (ctx->nr_active && !list_empty(&event->group_entry)) {
443 spin_unlock_irq(&ctx->lock);
444 goto retry;
445 }
446
447 /*
448 * The lock prevents that this context is scheduled in so we
449 * can remove the event safely, if the call above did not
450 * succeed.
451 */
Peter Zijlstra6c2bfcb2009-11-23 11:37:24 +0100452 if (!list_empty(&event->group_entry))
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200453 list_del_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200454 spin_unlock_irq(&ctx->lock);
455}
456
457static inline u64 perf_clock(void)
458{
459 return cpu_clock(smp_processor_id());
460}
461
462/*
463 * Update the record of the current time in a context.
464 */
465static void update_context_time(struct perf_event_context *ctx)
466{
467 u64 now = perf_clock();
468
469 ctx->time += now - ctx->timestamp;
470 ctx->timestamp = now;
471}
472
473/*
474 * Update the total_time_enabled and total_time_running fields for a event.
475 */
476static void update_event_times(struct perf_event *event)
477{
478 struct perf_event_context *ctx = event->ctx;
479 u64 run_end;
480
481 if (event->state < PERF_EVENT_STATE_INACTIVE ||
482 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
483 return;
484
485 event->total_time_enabled = ctx->time - event->tstamp_enabled;
486
487 if (event->state == PERF_EVENT_STATE_INACTIVE)
488 run_end = event->tstamp_stopped;
489 else
490 run_end = ctx->time;
491
492 event->total_time_running = run_end - event->tstamp_running;
493}
494
495/*
496 * Update total_time_enabled and total_time_running for all events in a group.
497 */
498static void update_group_times(struct perf_event *leader)
499{
500 struct perf_event *event;
501
502 update_event_times(leader);
503 list_for_each_entry(event, &leader->sibling_list, group_entry)
504 update_event_times(event);
505}
506
507/*
508 * Cross CPU call to disable a performance event
509 */
510static void __perf_event_disable(void *info)
511{
512 struct perf_event *event = info;
513 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
514 struct perf_event_context *ctx = event->ctx;
515
516 /*
517 * If this is a per-task event, need to check whether this
518 * event's task is the current task on this cpu.
519 */
520 if (ctx->task && cpuctx->task_ctx != ctx)
521 return;
522
523 spin_lock(&ctx->lock);
524
525 /*
526 * If the event is on, turn it off.
527 * If it is in error state, leave it in error state.
528 */
529 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
530 update_context_time(ctx);
531 update_group_times(event);
532 if (event == event->group_leader)
533 group_sched_out(event, cpuctx, ctx);
534 else
535 event_sched_out(event, cpuctx, ctx);
536 event->state = PERF_EVENT_STATE_OFF;
537 }
538
539 spin_unlock(&ctx->lock);
540}
541
542/*
543 * Disable a event.
544 *
545 * If event->ctx is a cloned context, callers must make sure that
546 * every task struct that event->ctx->task could possibly point to
547 * remains valid. This condition is satisifed when called through
548 * perf_event_for_each_child or perf_event_for_each because they
549 * hold the top-level event's child_mutex, so any descendant that
550 * goes to exit will block in sync_child_event.
551 * When called from perf_pending_event it's OK because event->ctx
552 * is the current context on this CPU and preemption is disabled,
553 * hence we can't get into perf_event_task_sched_out for this context.
554 */
555static void perf_event_disable(struct perf_event *event)
556{
557 struct perf_event_context *ctx = event->ctx;
558 struct task_struct *task = ctx->task;
559
560 if (!task) {
561 /*
562 * Disable the event on the cpu that it's on
563 */
564 smp_call_function_single(event->cpu, __perf_event_disable,
565 event, 1);
566 return;
567 }
568
569 retry:
570 task_oncpu_function_call(task, __perf_event_disable, event);
571
572 spin_lock_irq(&ctx->lock);
573 /*
574 * If the event is still active, we need to retry the cross-call.
575 */
576 if (event->state == PERF_EVENT_STATE_ACTIVE) {
577 spin_unlock_irq(&ctx->lock);
578 goto retry;
579 }
580
581 /*
582 * Since we have the lock this context can't be scheduled
583 * in, so we can change the state safely.
584 */
585 if (event->state == PERF_EVENT_STATE_INACTIVE) {
586 update_group_times(event);
587 event->state = PERF_EVENT_STATE_OFF;
588 }
589
590 spin_unlock_irq(&ctx->lock);
591}
592
593static int
594event_sched_in(struct perf_event *event,
595 struct perf_cpu_context *cpuctx,
596 struct perf_event_context *ctx,
597 int cpu)
598{
599 if (event->state <= PERF_EVENT_STATE_OFF)
600 return 0;
601
602 event->state = PERF_EVENT_STATE_ACTIVE;
603 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
604 /*
605 * The new state must be visible before we turn it on in the hardware:
606 */
607 smp_wmb();
608
609 if (event->pmu->enable(event)) {
610 event->state = PERF_EVENT_STATE_INACTIVE;
611 event->oncpu = -1;
612 return -EAGAIN;
613 }
614
615 event->tstamp_running += ctx->time - event->tstamp_stopped;
616
617 if (!is_software_event(event))
618 cpuctx->active_oncpu++;
619 ctx->nr_active++;
620
621 if (event->attr.exclusive)
622 cpuctx->exclusive = 1;
623
624 return 0;
625}
626
627static int
628group_sched_in(struct perf_event *group_event,
629 struct perf_cpu_context *cpuctx,
630 struct perf_event_context *ctx,
631 int cpu)
632{
633 struct perf_event *event, *partial_group;
634 int ret;
635
636 if (group_event->state == PERF_EVENT_STATE_OFF)
637 return 0;
638
639 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
640 if (ret)
641 return ret < 0 ? ret : 0;
642
643 if (event_sched_in(group_event, cpuctx, ctx, cpu))
644 return -EAGAIN;
645
646 /*
647 * Schedule in siblings as one group (if any):
648 */
649 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
650 if (event_sched_in(event, cpuctx, ctx, cpu)) {
651 partial_group = event;
652 goto group_error;
653 }
654 }
655
656 return 0;
657
658group_error:
659 /*
660 * Groups can be scheduled in as one unit only, so undo any
661 * partial group before returning:
662 */
663 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
664 if (event == partial_group)
665 break;
666 event_sched_out(event, cpuctx, ctx);
667 }
668 event_sched_out(group_event, cpuctx, ctx);
669
670 return -EAGAIN;
671}
672
673/*
674 * Return 1 for a group consisting entirely of software events,
675 * 0 if the group contains any hardware events.
676 */
677static int is_software_only_group(struct perf_event *leader)
678{
679 struct perf_event *event;
680
681 if (!is_software_event(leader))
682 return 0;
683
684 list_for_each_entry(event, &leader->sibling_list, group_entry)
685 if (!is_software_event(event))
686 return 0;
687
688 return 1;
689}
690
691/*
692 * Work out whether we can put this event group on the CPU now.
693 */
694static int group_can_go_on(struct perf_event *event,
695 struct perf_cpu_context *cpuctx,
696 int can_add_hw)
697{
698 /*
699 * Groups consisting entirely of software events can always go on.
700 */
701 if (is_software_only_group(event))
702 return 1;
703 /*
704 * If an exclusive group is already on, no other hardware
705 * events can go on.
706 */
707 if (cpuctx->exclusive)
708 return 0;
709 /*
710 * If this group is exclusive and there are already
711 * events on the CPU, it can't go on.
712 */
713 if (event->attr.exclusive && cpuctx->active_oncpu)
714 return 0;
715 /*
716 * Otherwise, try to add it if all previous groups were able
717 * to go on.
718 */
719 return can_add_hw;
720}
721
722static void add_event_to_ctx(struct perf_event *event,
723 struct perf_event_context *ctx)
724{
725 list_add_event(event, ctx);
726 event->tstamp_enabled = ctx->time;
727 event->tstamp_running = ctx->time;
728 event->tstamp_stopped = ctx->time;
729}
730
731/*
732 * Cross CPU call to install and enable a performance event
733 *
734 * Must be called with ctx->mutex held
735 */
736static void __perf_install_in_context(void *info)
737{
738 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
739 struct perf_event *event = info;
740 struct perf_event_context *ctx = event->ctx;
741 struct perf_event *leader = event->group_leader;
742 int cpu = smp_processor_id();
743 int err;
744
745 /*
746 * If this is a task context, we need to check whether it is
747 * the current task context of this cpu. If not it has been
748 * scheduled out before the smp call arrived.
749 * Or possibly this is the right context but it isn't
750 * on this cpu because it had no events.
751 */
752 if (ctx->task && cpuctx->task_ctx != ctx) {
753 if (cpuctx->task_ctx || ctx->task != current)
754 return;
755 cpuctx->task_ctx = ctx;
756 }
757
758 spin_lock(&ctx->lock);
759 ctx->is_active = 1;
760 update_context_time(ctx);
761
762 /*
763 * Protect the list operation against NMI by disabling the
764 * events on a global level. NOP for non NMI based events.
765 */
766 perf_disable();
767
768 add_event_to_ctx(event, ctx);
769
770 /*
771 * Don't put the event on if it is disabled or if
772 * it is in a group and the group isn't on.
773 */
774 if (event->state != PERF_EVENT_STATE_INACTIVE ||
775 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
776 goto unlock;
777
778 /*
779 * An exclusive event can't go on if there are already active
780 * hardware events, and no hardware event can go on if there
781 * is already an exclusive event on.
782 */
783 if (!group_can_go_on(event, cpuctx, 1))
784 err = -EEXIST;
785 else
786 err = event_sched_in(event, cpuctx, ctx, cpu);
787
788 if (err) {
789 /*
790 * This event couldn't go on. If it is in a group
791 * then we have to pull the whole group off.
792 * If the event group is pinned then put it in error state.
793 */
794 if (leader != event)
795 group_sched_out(leader, cpuctx, ctx);
796 if (leader->attr.pinned) {
797 update_group_times(leader);
798 leader->state = PERF_EVENT_STATE_ERROR;
799 }
800 }
801
802 if (!err && !ctx->task && cpuctx->max_pertask)
803 cpuctx->max_pertask--;
804
805 unlock:
806 perf_enable();
807
808 spin_unlock(&ctx->lock);
809}
810
811/*
812 * Attach a performance event to a context
813 *
814 * First we add the event to the list with the hardware enable bit
815 * in event->hw_config cleared.
816 *
817 * If the event is attached to a task which is on a CPU we use a smp
818 * call to enable it in the task context. The task might have been
819 * scheduled away, but we check this in the smp call again.
820 *
821 * Must be called with ctx->mutex held.
822 */
823static void
824perf_install_in_context(struct perf_event_context *ctx,
825 struct perf_event *event,
826 int cpu)
827{
828 struct task_struct *task = ctx->task;
829
830 if (!task) {
831 /*
832 * Per cpu events are installed via an smp call and
833 * the install is always sucessful.
834 */
835 smp_call_function_single(cpu, __perf_install_in_context,
836 event, 1);
837 return;
838 }
839
840retry:
841 task_oncpu_function_call(task, __perf_install_in_context,
842 event);
843
844 spin_lock_irq(&ctx->lock);
845 /*
846 * we need to retry the smp call.
847 */
848 if (ctx->is_active && list_empty(&event->group_entry)) {
849 spin_unlock_irq(&ctx->lock);
850 goto retry;
851 }
852
853 /*
854 * The lock prevents that this context is scheduled in so we
855 * can add the event safely, if it the call above did not
856 * succeed.
857 */
858 if (list_empty(&event->group_entry))
859 add_event_to_ctx(event, ctx);
860 spin_unlock_irq(&ctx->lock);
861}
862
863/*
864 * Put a event into inactive state and update time fields.
865 * Enabling the leader of a group effectively enables all
866 * the group members that aren't explicitly disabled, so we
867 * have to update their ->tstamp_enabled also.
868 * Note: this works for group members as well as group leaders
869 * since the non-leader members' sibling_lists will be empty.
870 */
871static void __perf_event_mark_enabled(struct perf_event *event,
872 struct perf_event_context *ctx)
873{
874 struct perf_event *sub;
875
876 event->state = PERF_EVENT_STATE_INACTIVE;
877 event->tstamp_enabled = ctx->time - event->total_time_enabled;
878 list_for_each_entry(sub, &event->sibling_list, group_entry)
879 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
880 sub->tstamp_enabled =
881 ctx->time - sub->total_time_enabled;
882}
883
884/*
885 * Cross CPU call to enable a performance event
886 */
887static void __perf_event_enable(void *info)
888{
889 struct perf_event *event = info;
890 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
891 struct perf_event_context *ctx = event->ctx;
892 struct perf_event *leader = event->group_leader;
893 int err;
894
895 /*
896 * If this is a per-task event, need to check whether this
897 * event's task is the current task on this cpu.
898 */
899 if (ctx->task && cpuctx->task_ctx != ctx) {
900 if (cpuctx->task_ctx || ctx->task != current)
901 return;
902 cpuctx->task_ctx = ctx;
903 }
904
905 spin_lock(&ctx->lock);
906 ctx->is_active = 1;
907 update_context_time(ctx);
908
909 if (event->state >= PERF_EVENT_STATE_INACTIVE)
910 goto unlock;
911 __perf_event_mark_enabled(event, ctx);
912
913 /*
914 * If the event is in a group and isn't the group leader,
915 * then don't put it on unless the group is on.
916 */
917 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
918 goto unlock;
919
920 if (!group_can_go_on(event, cpuctx, 1)) {
921 err = -EEXIST;
922 } else {
923 perf_disable();
924 if (event == leader)
925 err = group_sched_in(event, cpuctx, ctx,
926 smp_processor_id());
927 else
928 err = event_sched_in(event, cpuctx, ctx,
929 smp_processor_id());
930 perf_enable();
931 }
932
933 if (err) {
934 /*
935 * If this event can't go on and it's part of a
936 * group, then the whole group has to come off.
937 */
938 if (leader != event)
939 group_sched_out(leader, cpuctx, ctx);
940 if (leader->attr.pinned) {
941 update_group_times(leader);
942 leader->state = PERF_EVENT_STATE_ERROR;
943 }
944 }
945
946 unlock:
947 spin_unlock(&ctx->lock);
948}
949
950/*
951 * Enable a event.
952 *
953 * If event->ctx is a cloned context, callers must make sure that
954 * every task struct that event->ctx->task could possibly point to
955 * remains valid. This condition is satisfied when called through
956 * perf_event_for_each_child or perf_event_for_each as described
957 * for perf_event_disable.
958 */
959static void perf_event_enable(struct perf_event *event)
960{
961 struct perf_event_context *ctx = event->ctx;
962 struct task_struct *task = ctx->task;
963
964 if (!task) {
965 /*
966 * Enable the event on the cpu that it's on
967 */
968 smp_call_function_single(event->cpu, __perf_event_enable,
969 event, 1);
970 return;
971 }
972
973 spin_lock_irq(&ctx->lock);
974 if (event->state >= PERF_EVENT_STATE_INACTIVE)
975 goto out;
976
977 /*
978 * If the event is in error state, clear that first.
979 * That way, if we see the event in error state below, we
980 * know that it has gone back into error state, as distinct
981 * from the task having been scheduled away before the
982 * cross-call arrived.
983 */
984 if (event->state == PERF_EVENT_STATE_ERROR)
985 event->state = PERF_EVENT_STATE_OFF;
986
987 retry:
988 spin_unlock_irq(&ctx->lock);
989 task_oncpu_function_call(task, __perf_event_enable, event);
990
991 spin_lock_irq(&ctx->lock);
992
993 /*
994 * If the context is active and the event is still off,
995 * we need to retry the cross-call.
996 */
997 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
998 goto retry;
999
1000 /*
1001 * Since we have the lock this context can't be scheduled
1002 * in, so we can change the state safely.
1003 */
1004 if (event->state == PERF_EVENT_STATE_OFF)
1005 __perf_event_mark_enabled(event, ctx);
1006
1007 out:
1008 spin_unlock_irq(&ctx->lock);
1009}
1010
1011static int perf_event_refresh(struct perf_event *event, int refresh)
1012{
1013 /*
1014 * not supported on inherited events
1015 */
1016 if (event->attr.inherit)
1017 return -EINVAL;
1018
1019 atomic_add(refresh, &event->event_limit);
1020 perf_event_enable(event);
1021
1022 return 0;
1023}
1024
1025void __perf_event_sched_out(struct perf_event_context *ctx,
1026 struct perf_cpu_context *cpuctx)
1027{
1028 struct perf_event *event;
1029
1030 spin_lock(&ctx->lock);
1031 ctx->is_active = 0;
1032 if (likely(!ctx->nr_events))
1033 goto out;
1034 update_context_time(ctx);
1035
1036 perf_disable();
Peter Zijlstra6c2bfcb2009-11-23 11:37:24 +01001037 if (ctx->nr_active) {
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001038 list_for_each_entry(event, &ctx->group_list, group_entry)
1039 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra6c2bfcb2009-11-23 11:37:24 +01001040 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001041 perf_enable();
1042 out:
1043 spin_unlock(&ctx->lock);
1044}
1045
1046/*
1047 * Test whether two contexts are equivalent, i.e. whether they
1048 * have both been cloned from the same version of the same context
1049 * and they both have the same number of enabled events.
1050 * If the number of enabled events is the same, then the set
1051 * of enabled events should be the same, because these are both
1052 * inherited contexts, therefore we can't access individual events
1053 * in them directly with an fd; we can only enable/disable all
1054 * events via prctl, or enable/disable all events in a family
1055 * via ioctl, which will have the same effect on both contexts.
1056 */
1057static int context_equiv(struct perf_event_context *ctx1,
1058 struct perf_event_context *ctx2)
1059{
1060 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1061 && ctx1->parent_gen == ctx2->parent_gen
1062 && !ctx1->pin_count && !ctx2->pin_count;
1063}
1064
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001065static void __perf_event_sync_stat(struct perf_event *event,
1066 struct perf_event *next_event)
1067{
1068 u64 value;
1069
1070 if (!event->attr.inherit_stat)
1071 return;
1072
1073 /*
1074 * Update the event value, we cannot use perf_event_read()
1075 * because we're in the middle of a context switch and have IRQs
1076 * disabled, which upsets smp_call_function_single(), however
1077 * we know the event must be on the current CPU, therefore we
1078 * don't need to use it.
1079 */
1080 switch (event->state) {
1081 case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra3dbebf12009-11-20 22:19:52 +01001082 event->pmu->read(event);
1083 /* fall-through */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001084
1085 case PERF_EVENT_STATE_INACTIVE:
1086 update_event_times(event);
1087 break;
1088
1089 default:
1090 break;
1091 }
1092
1093 /*
1094 * In order to keep per-task stats reliable we need to flip the event
1095 * values when we flip the contexts.
1096 */
1097 value = atomic64_read(&next_event->count);
1098 value = atomic64_xchg(&event->count, value);
1099 atomic64_set(&next_event->count, value);
1100
1101 swap(event->total_time_enabled, next_event->total_time_enabled);
1102 swap(event->total_time_running, next_event->total_time_running);
1103
1104 /*
1105 * Since we swizzled the values, update the user visible data too.
1106 */
1107 perf_event_update_userpage(event);
1108 perf_event_update_userpage(next_event);
1109}
1110
1111#define list_next_entry(pos, member) \
1112 list_entry(pos->member.next, typeof(*pos), member)
1113
1114static void perf_event_sync_stat(struct perf_event_context *ctx,
1115 struct perf_event_context *next_ctx)
1116{
1117 struct perf_event *event, *next_event;
1118
1119 if (!ctx->nr_stat)
1120 return;
1121
Peter Zijlstra02ffdbc2009-11-20 22:19:50 +01001122 update_context_time(ctx);
1123
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001124 event = list_first_entry(&ctx->event_list,
1125 struct perf_event, event_entry);
1126
1127 next_event = list_first_entry(&next_ctx->event_list,
1128 struct perf_event, event_entry);
1129
1130 while (&event->event_entry != &ctx->event_list &&
1131 &next_event->event_entry != &next_ctx->event_list) {
1132
1133 __perf_event_sync_stat(event, next_event);
1134
1135 event = list_next_entry(event, event_entry);
1136 next_event = list_next_entry(next_event, event_entry);
1137 }
1138}
1139
1140/*
1141 * Called from scheduler to remove the events of the current task,
1142 * with interrupts disabled.
1143 *
1144 * We stop each event and update the event value in event->count.
1145 *
1146 * This does not protect us against NMI, but disable()
1147 * sets the disabled bit in the control field of event _before_
1148 * accessing the event control register. If a NMI hits, then it will
1149 * not restart the event.
1150 */
1151void perf_event_task_sched_out(struct task_struct *task,
1152 struct task_struct *next, int cpu)
1153{
1154 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1155 struct perf_event_context *ctx = task->perf_event_ctxp;
1156 struct perf_event_context *next_ctx;
1157 struct perf_event_context *parent;
1158 struct pt_regs *regs;
1159 int do_switch = 1;
1160
1161 regs = task_pt_regs(task);
1162 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1163
1164 if (likely(!ctx || !cpuctx->task_ctx))
1165 return;
1166
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001167 rcu_read_lock();
1168 parent = rcu_dereference(ctx->parent_ctx);
1169 next_ctx = next->perf_event_ctxp;
1170 if (parent && next_ctx &&
1171 rcu_dereference(next_ctx->parent_ctx) == parent) {
1172 /*
1173 * Looks like the two contexts are clones, so we might be
1174 * able to optimize the context switch. We lock both
1175 * contexts and check that they are clones under the
1176 * lock (including re-checking that neither has been
1177 * uncloned in the meantime). It doesn't matter which
1178 * order we take the locks because no other cpu could
1179 * be trying to lock both of these tasks.
1180 */
1181 spin_lock(&ctx->lock);
1182 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1183 if (context_equiv(ctx, next_ctx)) {
1184 /*
1185 * XXX do we need a memory barrier of sorts
1186 * wrt to rcu_dereference() of perf_event_ctxp
1187 */
1188 task->perf_event_ctxp = next_ctx;
1189 next->perf_event_ctxp = ctx;
1190 ctx->task = next;
1191 next_ctx->task = task;
1192 do_switch = 0;
1193
1194 perf_event_sync_stat(ctx, next_ctx);
1195 }
1196 spin_unlock(&next_ctx->lock);
1197 spin_unlock(&ctx->lock);
1198 }
1199 rcu_read_unlock();
1200
1201 if (do_switch) {
1202 __perf_event_sched_out(ctx, cpuctx);
1203 cpuctx->task_ctx = NULL;
1204 }
1205}
1206
1207/*
1208 * Called with IRQs disabled
1209 */
1210static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1211{
1212 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1213
1214 if (!cpuctx->task_ctx)
1215 return;
1216
1217 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1218 return;
1219
1220 __perf_event_sched_out(ctx, cpuctx);
1221 cpuctx->task_ctx = NULL;
1222}
1223
1224/*
1225 * Called with IRQs disabled
1226 */
1227static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1228{
1229 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1230}
1231
1232static void
1233__perf_event_sched_in(struct perf_event_context *ctx,
1234 struct perf_cpu_context *cpuctx, int cpu)
1235{
1236 struct perf_event *event;
1237 int can_add_hw = 1;
1238
1239 spin_lock(&ctx->lock);
1240 ctx->is_active = 1;
1241 if (likely(!ctx->nr_events))
1242 goto out;
1243
1244 ctx->timestamp = perf_clock();
1245
1246 perf_disable();
1247
1248 /*
1249 * First go through the list and put on any pinned groups
1250 * in order to give them the best chance of going on.
1251 */
1252 list_for_each_entry(event, &ctx->group_list, group_entry) {
1253 if (event->state <= PERF_EVENT_STATE_OFF ||
1254 !event->attr.pinned)
1255 continue;
1256 if (event->cpu != -1 && event->cpu != cpu)
1257 continue;
1258
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001259 if (group_can_go_on(event, cpuctx, 1))
1260 group_sched_in(event, cpuctx, ctx, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001261
1262 /*
1263 * If this pinned group hasn't been scheduled,
1264 * put it in error state.
1265 */
1266 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1267 update_group_times(event);
1268 event->state = PERF_EVENT_STATE_ERROR;
1269 }
1270 }
1271
1272 list_for_each_entry(event, &ctx->group_list, group_entry) {
1273 /*
1274 * Ignore events in OFF or ERROR state, and
1275 * ignore pinned events since we did them already.
1276 */
1277 if (event->state <= PERF_EVENT_STATE_OFF ||
1278 event->attr.pinned)
1279 continue;
1280
1281 /*
1282 * Listen to the 'cpu' scheduling filter constraint
1283 * of events:
1284 */
1285 if (event->cpu != -1 && event->cpu != cpu)
1286 continue;
1287
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001288 if (group_can_go_on(event, cpuctx, can_add_hw))
1289 if (group_sched_in(event, cpuctx, ctx, cpu))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001290 can_add_hw = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001291 }
1292 perf_enable();
1293 out:
1294 spin_unlock(&ctx->lock);
1295}
1296
1297/*
1298 * Called from scheduler to add the events of the current task
1299 * with interrupts disabled.
1300 *
1301 * We restore the event value and then enable it.
1302 *
1303 * This does not protect us against NMI, but enable()
1304 * sets the enabled bit in the control field of event _before_
1305 * accessing the event control register. If a NMI hits, then it will
1306 * keep the event running.
1307 */
1308void perf_event_task_sched_in(struct task_struct *task, int cpu)
1309{
1310 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1311 struct perf_event_context *ctx = task->perf_event_ctxp;
1312
1313 if (likely(!ctx))
1314 return;
1315 if (cpuctx->task_ctx == ctx)
1316 return;
1317 __perf_event_sched_in(ctx, cpuctx, cpu);
1318 cpuctx->task_ctx = ctx;
1319}
1320
1321static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1322{
1323 struct perf_event_context *ctx = &cpuctx->ctx;
1324
1325 __perf_event_sched_in(ctx, cpuctx, cpu);
1326}
1327
1328#define MAX_INTERRUPTS (~0ULL)
1329
1330static void perf_log_throttle(struct perf_event *event, int enable);
1331
1332static void perf_adjust_period(struct perf_event *event, u64 events)
1333{
1334 struct hw_perf_event *hwc = &event->hw;
1335 u64 period, sample_period;
1336 s64 delta;
1337
1338 events *= hwc->sample_period;
1339 period = div64_u64(events, event->attr.sample_freq);
1340
1341 delta = (s64)(period - hwc->sample_period);
1342 delta = (delta + 7) / 8; /* low pass filter */
1343
1344 sample_period = hwc->sample_period + delta;
1345
1346 if (!sample_period)
1347 sample_period = 1;
1348
1349 hwc->sample_period = sample_period;
1350}
1351
1352static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1353{
1354 struct perf_event *event;
1355 struct hw_perf_event *hwc;
1356 u64 interrupts, freq;
1357
1358 spin_lock(&ctx->lock);
Paul Mackerras03541f82009-10-14 16:58:03 +11001359 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001360 if (event->state != PERF_EVENT_STATE_ACTIVE)
1361 continue;
1362
1363 hwc = &event->hw;
1364
1365 interrupts = hwc->interrupts;
1366 hwc->interrupts = 0;
1367
1368 /*
1369 * unthrottle events on the tick
1370 */
1371 if (interrupts == MAX_INTERRUPTS) {
1372 perf_log_throttle(event, 1);
1373 event->pmu->unthrottle(event);
1374 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1375 }
1376
1377 if (!event->attr.freq || !event->attr.sample_freq)
1378 continue;
1379
1380 /*
1381 * if the specified freq < HZ then we need to skip ticks
1382 */
1383 if (event->attr.sample_freq < HZ) {
1384 freq = event->attr.sample_freq;
1385
1386 hwc->freq_count += freq;
1387 hwc->freq_interrupts += interrupts;
1388
1389 if (hwc->freq_count < HZ)
1390 continue;
1391
1392 interrupts = hwc->freq_interrupts;
1393 hwc->freq_interrupts = 0;
1394 hwc->freq_count -= HZ;
1395 } else
1396 freq = HZ;
1397
1398 perf_adjust_period(event, freq * interrupts);
1399
1400 /*
1401 * In order to avoid being stalled by an (accidental) huge
1402 * sample period, force reset the sample period if we didn't
1403 * get any events in this freq period.
1404 */
1405 if (!interrupts) {
1406 perf_disable();
1407 event->pmu->disable(event);
1408 atomic64_set(&hwc->period_left, 0);
1409 event->pmu->enable(event);
1410 perf_enable();
1411 }
1412 }
1413 spin_unlock(&ctx->lock);
1414}
1415
1416/*
1417 * Round-robin a context's events:
1418 */
1419static void rotate_ctx(struct perf_event_context *ctx)
1420{
1421 struct perf_event *event;
1422
1423 if (!ctx->nr_events)
1424 return;
1425
1426 spin_lock(&ctx->lock);
1427 /*
1428 * Rotate the first entry last (works just fine for group events too):
1429 */
1430 perf_disable();
1431 list_for_each_entry(event, &ctx->group_list, group_entry) {
1432 list_move_tail(&event->group_entry, &ctx->group_list);
1433 break;
1434 }
1435 perf_enable();
1436
1437 spin_unlock(&ctx->lock);
1438}
1439
1440void perf_event_task_tick(struct task_struct *curr, int cpu)
1441{
1442 struct perf_cpu_context *cpuctx;
1443 struct perf_event_context *ctx;
1444
1445 if (!atomic_read(&nr_events))
1446 return;
1447
1448 cpuctx = &per_cpu(perf_cpu_context, cpu);
1449 ctx = curr->perf_event_ctxp;
1450
1451 perf_ctx_adjust_freq(&cpuctx->ctx);
1452 if (ctx)
1453 perf_ctx_adjust_freq(ctx);
1454
1455 perf_event_cpu_sched_out(cpuctx);
1456 if (ctx)
1457 __perf_event_task_sched_out(ctx);
1458
1459 rotate_ctx(&cpuctx->ctx);
1460 if (ctx)
1461 rotate_ctx(ctx);
1462
1463 perf_event_cpu_sched_in(cpuctx, cpu);
1464 if (ctx)
1465 perf_event_task_sched_in(curr, cpu);
1466}
1467
1468/*
1469 * Enable all of a task's events that have been marked enable-on-exec.
1470 * This expects task == current.
1471 */
1472static void perf_event_enable_on_exec(struct task_struct *task)
1473{
1474 struct perf_event_context *ctx;
1475 struct perf_event *event;
1476 unsigned long flags;
1477 int enabled = 0;
1478
1479 local_irq_save(flags);
1480 ctx = task->perf_event_ctxp;
1481 if (!ctx || !ctx->nr_events)
1482 goto out;
1483
1484 __perf_event_task_sched_out(ctx);
1485
1486 spin_lock(&ctx->lock);
1487
1488 list_for_each_entry(event, &ctx->group_list, group_entry) {
1489 if (!event->attr.enable_on_exec)
1490 continue;
1491 event->attr.enable_on_exec = 0;
1492 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1493 continue;
1494 __perf_event_mark_enabled(event, ctx);
1495 enabled = 1;
1496 }
1497
1498 /*
1499 * Unclone this context if we enabled any event.
1500 */
1501 if (enabled)
1502 unclone_ctx(ctx);
1503
1504 spin_unlock(&ctx->lock);
1505
1506 perf_event_task_sched_in(task, smp_processor_id());
1507 out:
1508 local_irq_restore(flags);
1509}
1510
1511/*
1512 * Cross CPU call to read the hardware event
1513 */
1514static void __perf_event_read(void *info)
1515{
1516 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1517 struct perf_event *event = info;
1518 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001519
1520 /*
1521 * If this is a task context, we need to check whether it is
1522 * the current task context of this cpu. If not it has been
1523 * scheduled out before the smp call arrived. In that case
1524 * event->count would have been updated to a recent sample
1525 * when the event was scheduled out.
1526 */
1527 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return;
1529
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01001530 spin_lock(&ctx->lock);
Peter Zijlstra58e5ad12009-11-20 22:19:53 +01001531 update_context_time(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001532 update_event_times(event);
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01001533 spin_unlock(&ctx->lock);
1534
Peter Zijlstra58e5ad12009-11-20 22:19:53 +01001535 event->pmu->read(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001536}
1537
1538static u64 perf_event_read(struct perf_event *event)
1539{
1540 /*
1541 * If event is enabled and currently active on a CPU, update the
1542 * value in the event structure:
1543 */
1544 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1545 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01001548 struct perf_event_context *ctx = event->ctx;
1549 unsigned long flags;
1550
1551 spin_lock_irqsave(&ctx->lock, flags);
1552 update_context_time(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001553 update_event_times(event);
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01001554 spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001555 }
1556
1557 return atomic64_read(&event->count);
1558}
1559
1560/*
1561 * Initialize the perf_event context in a task_struct:
1562 */
1563static void
1564__perf_event_init_context(struct perf_event_context *ctx,
1565 struct task_struct *task)
1566{
1567 memset(ctx, 0, sizeof(*ctx));
1568 spin_lock_init(&ctx->lock);
1569 mutex_init(&ctx->mutex);
1570 INIT_LIST_HEAD(&ctx->group_list);
1571 INIT_LIST_HEAD(&ctx->event_list);
1572 atomic_set(&ctx->refcount, 1);
1573 ctx->task = task;
1574}
1575
1576static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1577{
1578 struct perf_event_context *ctx;
1579 struct perf_cpu_context *cpuctx;
1580 struct task_struct *task;
1581 unsigned long flags;
1582 int err;
1583
1584 /*
1585 * If cpu is not a wildcard then this is a percpu event:
1586 */
1587 if (cpu != -1) {
1588 /* Must be root to operate on a CPU event: */
1589 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1590 return ERR_PTR(-EACCES);
1591
1592 if (cpu < 0 || cpu > num_possible_cpus())
1593 return ERR_PTR(-EINVAL);
1594
1595 /*
1596 * We could be clever and allow to attach a event to an
1597 * offline CPU and activate it when the CPU comes up, but
1598 * that's for later.
1599 */
1600 if (!cpu_isset(cpu, cpu_online_map))
1601 return ERR_PTR(-ENODEV);
1602
1603 cpuctx = &per_cpu(perf_cpu_context, cpu);
1604 ctx = &cpuctx->ctx;
1605 get_ctx(ctx);
1606
1607 return ctx;
1608 }
1609
1610 rcu_read_lock();
1611 if (!pid)
1612 task = current;
1613 else
1614 task = find_task_by_vpid(pid);
1615 if (task)
1616 get_task_struct(task);
1617 rcu_read_unlock();
1618
1619 if (!task)
1620 return ERR_PTR(-ESRCH);
1621
1622 /*
1623 * Can't attach events to a dying task.
1624 */
1625 err = -ESRCH;
1626 if (task->flags & PF_EXITING)
1627 goto errout;
1628
1629 /* Reuse ptrace permission checks for now. */
1630 err = -EACCES;
1631 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1632 goto errout;
1633
1634 retry:
1635 ctx = perf_lock_task_context(task, &flags);
1636 if (ctx) {
1637 unclone_ctx(ctx);
1638 spin_unlock_irqrestore(&ctx->lock, flags);
1639 }
1640
1641 if (!ctx) {
1642 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1643 err = -ENOMEM;
1644 if (!ctx)
1645 goto errout;
1646 __perf_event_init_context(ctx, task);
1647 get_ctx(ctx);
1648 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1649 /*
1650 * We raced with some other task; use
1651 * the context they set.
1652 */
1653 kfree(ctx);
1654 goto retry;
1655 }
1656 get_task_struct(task);
1657 }
1658
1659 put_task_struct(task);
1660 return ctx;
1661
1662 errout:
1663 put_task_struct(task);
1664 return ERR_PTR(err);
1665}
1666
Li Zefan6fb29152009-10-15 11:21:42 +08001667static void perf_event_free_filter(struct perf_event *event);
1668
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001669static void free_event_rcu(struct rcu_head *head)
1670{
1671 struct perf_event *event;
1672
1673 event = container_of(head, struct perf_event, rcu_head);
1674 if (event->ns)
1675 put_pid_ns(event->ns);
Li Zefan6fb29152009-10-15 11:21:42 +08001676 perf_event_free_filter(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001677 kfree(event);
1678}
1679
1680static void perf_pending_sync(struct perf_event *event);
1681
1682static void free_event(struct perf_event *event)
1683{
1684 perf_pending_sync(event);
1685
1686 if (!event->parent) {
1687 atomic_dec(&nr_events);
1688 if (event->attr.mmap)
1689 atomic_dec(&nr_mmap_events);
1690 if (event->attr.comm)
1691 atomic_dec(&nr_comm_events);
1692 if (event->attr.task)
1693 atomic_dec(&nr_task_events);
1694 }
1695
1696 if (event->output) {
1697 fput(event->output->filp);
1698 event->output = NULL;
1699 }
1700
1701 if (event->destroy)
1702 event->destroy(event);
1703
1704 put_ctx(event->ctx);
1705 call_rcu(&event->rcu_head, free_event_rcu);
1706}
1707
Arjan van de Venfb0459d2009-09-25 12:25:56 +02001708int perf_event_release_kernel(struct perf_event *event)
1709{
1710 struct perf_event_context *ctx = event->ctx;
1711
1712 WARN_ON_ONCE(ctx->parent_ctx);
1713 mutex_lock(&ctx->mutex);
1714 perf_event_remove_from_context(event);
1715 mutex_unlock(&ctx->mutex);
1716
1717 mutex_lock(&event->owner->perf_event_mutex);
1718 list_del_init(&event->owner_entry);
1719 mutex_unlock(&event->owner->perf_event_mutex);
1720 put_task_struct(event->owner);
1721
1722 free_event(event);
1723
1724 return 0;
1725}
1726EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1727
Peter Zijlstraa66a3052009-11-23 11:37:23 +01001728/*
1729 * Called when the last reference to the file is gone.
1730 */
1731static int perf_release(struct inode *inode, struct file *file)
1732{
1733 struct perf_event *event = file->private_data;
1734
1735 file->private_data = NULL;
1736
1737 return perf_event_release_kernel(event);
1738}
1739
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001740static int perf_event_read_size(struct perf_event *event)
1741{
1742 int entry = sizeof(u64); /* value */
1743 int size = 0;
1744 int nr = 1;
1745
1746 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1747 size += sizeof(u64);
1748
1749 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1750 size += sizeof(u64);
1751
1752 if (event->attr.read_format & PERF_FORMAT_ID)
1753 entry += sizeof(u64);
1754
1755 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1756 nr += event->group_leader->nr_siblings;
1757 size += sizeof(u64);
1758 }
1759
1760 size += entry * nr;
1761
1762 return size;
1763}
1764
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001765u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001766{
1767 struct perf_event *child;
1768 u64 total = 0;
1769
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001770 *enabled = 0;
1771 *running = 0;
1772
Peter Zijlstra6f105812009-11-20 22:19:56 +01001773 mutex_lock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001774 total += perf_event_read(event);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001775 *enabled += event->total_time_enabled +
1776 atomic64_read(&event->child_total_time_enabled);
1777 *running += event->total_time_running +
1778 atomic64_read(&event->child_total_time_running);
1779
1780 list_for_each_entry(child, &event->child_list, child_list) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001781 total += perf_event_read(child);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001782 *enabled += child->total_time_enabled;
1783 *running += child->total_time_running;
1784 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01001785 mutex_unlock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001786
1787 return total;
1788}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02001789EXPORT_SYMBOL_GPL(perf_event_read_value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001790
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001791static int perf_event_read_group(struct perf_event *event,
1792 u64 read_format, char __user *buf)
1793{
1794 struct perf_event *leader = event->group_leader, *sub;
Peter Zijlstra6f105812009-11-20 22:19:56 +01001795 int n = 0, size = 0, ret = -EFAULT;
1796 struct perf_event_context *ctx = leader->ctx;
Peter Zijlstraabf48682009-11-20 22:19:49 +01001797 u64 values[5];
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001798 u64 count, enabled, running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01001799
Peter Zijlstra6f105812009-11-20 22:19:56 +01001800 mutex_lock(&ctx->mutex);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001801 count = perf_event_read_value(leader, &enabled, &running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001802
1803 values[n++] = 1 + leader->nr_siblings;
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001804 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1805 values[n++] = enabled;
1806 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1807 values[n++] = running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01001808 values[n++] = count;
1809 if (read_format & PERF_FORMAT_ID)
1810 values[n++] = primary_event_id(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001811
1812 size = n * sizeof(u64);
1813
1814 if (copy_to_user(buf, values, size))
Peter Zijlstra6f105812009-11-20 22:19:56 +01001815 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001816
Peter Zijlstra6f105812009-11-20 22:19:56 +01001817 ret = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001818
1819 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstraabf48682009-11-20 22:19:49 +01001820 n = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001821
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001822 values[n++] = perf_event_read_value(sub, &enabled, &running);
Peter Zijlstraabf48682009-11-20 22:19:49 +01001823 if (read_format & PERF_FORMAT_ID)
1824 values[n++] = primary_event_id(sub);
1825
1826 size = n * sizeof(u64);
1827
Peter Zijlstra6f105812009-11-20 22:19:56 +01001828 if (copy_to_user(buf + size, values, size)) {
1829 ret = -EFAULT;
1830 goto unlock;
1831 }
Peter Zijlstraabf48682009-11-20 22:19:49 +01001832
1833 ret += size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001834 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01001835unlock:
1836 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001837
Peter Zijlstraabf48682009-11-20 22:19:49 +01001838 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001839}
1840
1841static int perf_event_read_one(struct perf_event *event,
1842 u64 read_format, char __user *buf)
1843{
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001844 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001845 u64 values[4];
1846 int n = 0;
1847
Peter Zijlstra59ed4462009-11-20 22:19:55 +01001848 values[n++] = perf_event_read_value(event, &enabled, &running);
1849 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1850 values[n++] = enabled;
1851 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1852 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001853 if (read_format & PERF_FORMAT_ID)
1854 values[n++] = primary_event_id(event);
1855
1856 if (copy_to_user(buf, values, n * sizeof(u64)))
1857 return -EFAULT;
1858
1859 return n * sizeof(u64);
1860}
1861
1862/*
1863 * Read the performance event - simple non blocking version for now
1864 */
1865static ssize_t
1866perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1867{
1868 u64 read_format = event->attr.read_format;
1869 int ret;
1870
1871 /*
1872 * Return end-of-file for a read on a event that is in
1873 * error state (i.e. because it was pinned but it couldn't be
1874 * scheduled on to the CPU at some point).
1875 */
1876 if (event->state == PERF_EVENT_STATE_ERROR)
1877 return 0;
1878
1879 if (count < perf_event_read_size(event))
1880 return -ENOSPC;
1881
1882 WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001883 if (read_format & PERF_FORMAT_GROUP)
1884 ret = perf_event_read_group(event, read_format, buf);
1885 else
1886 ret = perf_event_read_one(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001887
1888 return ret;
1889}
1890
1891static ssize_t
1892perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1893{
1894 struct perf_event *event = file->private_data;
1895
1896 return perf_read_hw(event, buf, count);
1897}
1898
1899static unsigned int perf_poll(struct file *file, poll_table *wait)
1900{
1901 struct perf_event *event = file->private_data;
1902 struct perf_mmap_data *data;
1903 unsigned int events = POLL_HUP;
1904
1905 rcu_read_lock();
1906 data = rcu_dereference(event->data);
1907 if (data)
1908 events = atomic_xchg(&data->poll, 0);
1909 rcu_read_unlock();
1910
1911 poll_wait(file, &event->waitq, wait);
1912
1913 return events;
1914}
1915
1916static void perf_event_reset(struct perf_event *event)
1917{
1918 (void)perf_event_read(event);
1919 atomic64_set(&event->count, 0);
1920 perf_event_update_userpage(event);
1921}
1922
1923/*
1924 * Holding the top-level event's child_mutex means that any
1925 * descendant process that has inherited this event will block
1926 * in sync_child_event if it goes to exit, thus satisfying the
1927 * task existence requirements of perf_event_enable/disable.
1928 */
1929static void perf_event_for_each_child(struct perf_event *event,
1930 void (*func)(struct perf_event *))
1931{
1932 struct perf_event *child;
1933
1934 WARN_ON_ONCE(event->ctx->parent_ctx);
1935 mutex_lock(&event->child_mutex);
1936 func(event);
1937 list_for_each_entry(child, &event->child_list, child_list)
1938 func(child);
1939 mutex_unlock(&event->child_mutex);
1940}
1941
1942static void perf_event_for_each(struct perf_event *event,
1943 void (*func)(struct perf_event *))
1944{
1945 struct perf_event_context *ctx = event->ctx;
1946 struct perf_event *sibling;
1947
1948 WARN_ON_ONCE(ctx->parent_ctx);
1949 mutex_lock(&ctx->mutex);
1950 event = event->group_leader;
1951
1952 perf_event_for_each_child(event, func);
1953 func(event);
1954 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1955 perf_event_for_each_child(event, func);
1956 mutex_unlock(&ctx->mutex);
1957}
1958
1959static int perf_event_period(struct perf_event *event, u64 __user *arg)
1960{
1961 struct perf_event_context *ctx = event->ctx;
1962 unsigned long size;
1963 int ret = 0;
1964 u64 value;
1965
1966 if (!event->attr.sample_period)
1967 return -EINVAL;
1968
1969 size = copy_from_user(&value, arg, sizeof(value));
1970 if (size != sizeof(value))
1971 return -EFAULT;
1972
1973 if (!value)
1974 return -EINVAL;
1975
1976 spin_lock_irq(&ctx->lock);
1977 if (event->attr.freq) {
1978 if (value > sysctl_perf_event_sample_rate) {
1979 ret = -EINVAL;
1980 goto unlock;
1981 }
1982
1983 event->attr.sample_freq = value;
1984 } else {
1985 event->attr.sample_period = value;
1986 event->hw.sample_period = value;
1987 }
1988unlock:
1989 spin_unlock_irq(&ctx->lock);
1990
1991 return ret;
1992}
1993
Li Zefan6fb29152009-10-15 11:21:42 +08001994static int perf_event_set_output(struct perf_event *event, int output_fd);
1995static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001996
1997static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1998{
1999 struct perf_event *event = file->private_data;
2000 void (*func)(struct perf_event *);
2001 u32 flags = arg;
2002
2003 switch (cmd) {
2004 case PERF_EVENT_IOC_ENABLE:
2005 func = perf_event_enable;
2006 break;
2007 case PERF_EVENT_IOC_DISABLE:
2008 func = perf_event_disable;
2009 break;
2010 case PERF_EVENT_IOC_RESET:
2011 func = perf_event_reset;
2012 break;
2013
2014 case PERF_EVENT_IOC_REFRESH:
2015 return perf_event_refresh(event, arg);
2016
2017 case PERF_EVENT_IOC_PERIOD:
2018 return perf_event_period(event, (u64 __user *)arg);
2019
2020 case PERF_EVENT_IOC_SET_OUTPUT:
2021 return perf_event_set_output(event, arg);
2022
Li Zefan6fb29152009-10-15 11:21:42 +08002023 case PERF_EVENT_IOC_SET_FILTER:
2024 return perf_event_set_filter(event, (void __user *)arg);
2025
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002026 default:
2027 return -ENOTTY;
2028 }
2029
2030 if (flags & PERF_IOC_FLAG_GROUP)
2031 perf_event_for_each(event, func);
2032 else
2033 perf_event_for_each_child(event, func);
2034
2035 return 0;
2036}
2037
2038int perf_event_task_enable(void)
2039{
2040 struct perf_event *event;
2041
2042 mutex_lock(&current->perf_event_mutex);
2043 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2044 perf_event_for_each_child(event, perf_event_enable);
2045 mutex_unlock(&current->perf_event_mutex);
2046
2047 return 0;
2048}
2049
2050int perf_event_task_disable(void)
2051{
2052 struct perf_event *event;
2053
2054 mutex_lock(&current->perf_event_mutex);
2055 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2056 perf_event_for_each_child(event, perf_event_disable);
2057 mutex_unlock(&current->perf_event_mutex);
2058
2059 return 0;
2060}
2061
2062#ifndef PERF_EVENT_INDEX_OFFSET
2063# define PERF_EVENT_INDEX_OFFSET 0
2064#endif
2065
2066static int perf_event_index(struct perf_event *event)
2067{
2068 if (event->state != PERF_EVENT_STATE_ACTIVE)
2069 return 0;
2070
2071 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2072}
2073
2074/*
2075 * Callers need to ensure there can be no nesting of this function, otherwise
2076 * the seqlock logic goes bad. We can not serialize this because the arch
2077 * code calls this from NMI context.
2078 */
2079void perf_event_update_userpage(struct perf_event *event)
2080{
2081 struct perf_event_mmap_page *userpg;
2082 struct perf_mmap_data *data;
2083
2084 rcu_read_lock();
2085 data = rcu_dereference(event->data);
2086 if (!data)
2087 goto unlock;
2088
2089 userpg = data->user_page;
2090
2091 /*
2092 * Disable preemption so as to not let the corresponding user-space
2093 * spin too long if we get preempted.
2094 */
2095 preempt_disable();
2096 ++userpg->lock;
2097 barrier();
2098 userpg->index = perf_event_index(event);
2099 userpg->offset = atomic64_read(&event->count);
2100 if (event->state == PERF_EVENT_STATE_ACTIVE)
2101 userpg->offset -= atomic64_read(&event->hw.prev_count);
2102
2103 userpg->time_enabled = event->total_time_enabled +
2104 atomic64_read(&event->child_total_time_enabled);
2105
2106 userpg->time_running = event->total_time_running +
2107 atomic64_read(&event->child_total_time_running);
2108
2109 barrier();
2110 ++userpg->lock;
2111 preempt_enable();
2112unlock:
2113 rcu_read_unlock();
2114}
2115
Peter Zijlstra906010b2009-09-21 16:08:49 +02002116static unsigned long perf_data_size(struct perf_mmap_data *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002117{
Peter Zijlstra906010b2009-09-21 16:08:49 +02002118 return data->nr_pages << (PAGE_SHIFT + data->data_order);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002119}
2120
Peter Zijlstra906010b2009-09-21 16:08:49 +02002121#ifndef CONFIG_PERF_USE_VMALLOC
2122
2123/*
2124 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2125 */
2126
2127static struct page *
2128perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2129{
2130 if (pgoff > data->nr_pages)
2131 return NULL;
2132
2133 if (pgoff == 0)
2134 return virt_to_page(data->user_page);
2135
2136 return virt_to_page(data->data_pages[pgoff - 1]);
2137}
2138
2139static struct perf_mmap_data *
2140perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002141{
2142 struct perf_mmap_data *data;
2143 unsigned long size;
2144 int i;
2145
2146 WARN_ON(atomic_read(&event->mmap_count));
2147
2148 size = sizeof(struct perf_mmap_data);
2149 size += nr_pages * sizeof(void *);
2150
2151 data = kzalloc(size, GFP_KERNEL);
2152 if (!data)
2153 goto fail;
2154
2155 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2156 if (!data->user_page)
2157 goto fail_user_page;
2158
2159 for (i = 0; i < nr_pages; i++) {
2160 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2161 if (!data->data_pages[i])
2162 goto fail_data_pages;
2163 }
2164
Peter Zijlstra906010b2009-09-21 16:08:49 +02002165 data->data_order = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002166 data->nr_pages = nr_pages;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002167
Peter Zijlstra906010b2009-09-21 16:08:49 +02002168 return data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002169
2170fail_data_pages:
2171 for (i--; i >= 0; i--)
2172 free_page((unsigned long)data->data_pages[i]);
2173
2174 free_page((unsigned long)data->user_page);
2175
2176fail_user_page:
2177 kfree(data);
2178
2179fail:
Peter Zijlstra906010b2009-09-21 16:08:49 +02002180 return NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002181}
2182
2183static void perf_mmap_free_page(unsigned long addr)
2184{
2185 struct page *page = virt_to_page((void *)addr);
2186
2187 page->mapping = NULL;
2188 __free_page(page);
2189}
2190
Peter Zijlstra906010b2009-09-21 16:08:49 +02002191static void perf_mmap_data_free(struct perf_mmap_data *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002192{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002193 int i;
2194
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002195 perf_mmap_free_page((unsigned long)data->user_page);
2196 for (i = 0; i < data->nr_pages; i++)
2197 perf_mmap_free_page((unsigned long)data->data_pages[i]);
Peter Zijlstra906010b2009-09-21 16:08:49 +02002198}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002199
Peter Zijlstra906010b2009-09-21 16:08:49 +02002200#else
2201
2202/*
2203 * Back perf_mmap() with vmalloc memory.
2204 *
2205 * Required for architectures that have d-cache aliasing issues.
2206 */
2207
2208static struct page *
2209perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2210{
2211 if (pgoff > (1UL << data->data_order))
2212 return NULL;
2213
2214 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2215}
2216
2217static void perf_mmap_unmark_page(void *addr)
2218{
2219 struct page *page = vmalloc_to_page(addr);
2220
2221 page->mapping = NULL;
2222}
2223
2224static void perf_mmap_data_free_work(struct work_struct *work)
2225{
2226 struct perf_mmap_data *data;
2227 void *base;
2228 int i, nr;
2229
2230 data = container_of(work, struct perf_mmap_data, work);
2231 nr = 1 << data->data_order;
2232
2233 base = data->user_page;
2234 for (i = 0; i < nr + 1; i++)
2235 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2236
2237 vfree(base);
2238}
2239
2240static void perf_mmap_data_free(struct perf_mmap_data *data)
2241{
2242 schedule_work(&data->work);
2243}
2244
2245static struct perf_mmap_data *
2246perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2247{
2248 struct perf_mmap_data *data;
2249 unsigned long size;
2250 void *all_buf;
2251
2252 WARN_ON(atomic_read(&event->mmap_count));
2253
2254 size = sizeof(struct perf_mmap_data);
2255 size += sizeof(void *);
2256
2257 data = kzalloc(size, GFP_KERNEL);
2258 if (!data)
2259 goto fail;
2260
2261 INIT_WORK(&data->work, perf_mmap_data_free_work);
2262
2263 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2264 if (!all_buf)
2265 goto fail_all_buf;
2266
2267 data->user_page = all_buf;
2268 data->data_pages[0] = all_buf + PAGE_SIZE;
2269 data->data_order = ilog2(nr_pages);
2270 data->nr_pages = 1;
2271
2272 return data;
2273
2274fail_all_buf:
2275 kfree(data);
2276
2277fail:
2278 return NULL;
2279}
2280
2281#endif
2282
2283static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2284{
2285 struct perf_event *event = vma->vm_file->private_data;
2286 struct perf_mmap_data *data;
2287 int ret = VM_FAULT_SIGBUS;
2288
2289 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2290 if (vmf->pgoff == 0)
2291 ret = 0;
2292 return ret;
2293 }
2294
2295 rcu_read_lock();
2296 data = rcu_dereference(event->data);
2297 if (!data)
2298 goto unlock;
2299
2300 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2301 goto unlock;
2302
2303 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2304 if (!vmf->page)
2305 goto unlock;
2306
2307 get_page(vmf->page);
2308 vmf->page->mapping = vma->vm_file->f_mapping;
2309 vmf->page->index = vmf->pgoff;
2310
2311 ret = 0;
2312unlock:
2313 rcu_read_unlock();
2314
2315 return ret;
2316}
2317
2318static void
2319perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2320{
2321 long max_size = perf_data_size(data);
2322
2323 atomic_set(&data->lock, -1);
2324
2325 if (event->attr.watermark) {
2326 data->watermark = min_t(long, max_size,
2327 event->attr.wakeup_watermark);
2328 }
2329
2330 if (!data->watermark)
Stephane Eranian8904b182009-11-20 22:19:57 +01002331 data->watermark = max_size / 2;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002332
2333
2334 rcu_assign_pointer(event->data, data);
2335}
2336
2337static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2338{
2339 struct perf_mmap_data *data;
2340
2341 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2342 perf_mmap_data_free(data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002343 kfree(data);
2344}
2345
Peter Zijlstra906010b2009-09-21 16:08:49 +02002346static void perf_mmap_data_release(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002347{
2348 struct perf_mmap_data *data = event->data;
2349
2350 WARN_ON(atomic_read(&event->mmap_count));
2351
2352 rcu_assign_pointer(event->data, NULL);
Peter Zijlstra906010b2009-09-21 16:08:49 +02002353 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002354}
2355
2356static void perf_mmap_open(struct vm_area_struct *vma)
2357{
2358 struct perf_event *event = vma->vm_file->private_data;
2359
2360 atomic_inc(&event->mmap_count);
2361}
2362
2363static void perf_mmap_close(struct vm_area_struct *vma)
2364{
2365 struct perf_event *event = vma->vm_file->private_data;
2366
2367 WARN_ON_ONCE(event->ctx->parent_ctx);
2368 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Peter Zijlstra906010b2009-09-21 16:08:49 +02002369 unsigned long size = perf_data_size(event->data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002370 struct user_struct *user = current_user();
2371
Peter Zijlstra906010b2009-09-21 16:08:49 +02002372 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002373 vma->vm_mm->locked_vm -= event->data->nr_locked;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002374 perf_mmap_data_release(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002375 mutex_unlock(&event->mmap_mutex);
2376 }
2377}
2378
Alexey Dobriyanf0f37e22009-09-27 22:29:37 +04002379static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002380 .open = perf_mmap_open,
2381 .close = perf_mmap_close,
2382 .fault = perf_mmap_fault,
2383 .page_mkwrite = perf_mmap_fault,
2384};
2385
2386static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2387{
2388 struct perf_event *event = file->private_data;
2389 unsigned long user_locked, user_lock_limit;
2390 struct user_struct *user = current_user();
2391 unsigned long locked, lock_limit;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002392 struct perf_mmap_data *data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002393 unsigned long vma_size;
2394 unsigned long nr_pages;
2395 long user_extra, extra;
2396 int ret = 0;
2397
2398 if (!(vma->vm_flags & VM_SHARED))
2399 return -EINVAL;
2400
2401 vma_size = vma->vm_end - vma->vm_start;
2402 nr_pages = (vma_size / PAGE_SIZE) - 1;
2403
2404 /*
2405 * If we have data pages ensure they're a power-of-two number, so we
2406 * can do bitmasks instead of modulo.
2407 */
2408 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2409 return -EINVAL;
2410
2411 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2412 return -EINVAL;
2413
2414 if (vma->vm_pgoff != 0)
2415 return -EINVAL;
2416
2417 WARN_ON_ONCE(event->ctx->parent_ctx);
2418 mutex_lock(&event->mmap_mutex);
2419 if (event->output) {
2420 ret = -EINVAL;
2421 goto unlock;
2422 }
2423
2424 if (atomic_inc_not_zero(&event->mmap_count)) {
2425 if (nr_pages != event->data->nr_pages)
2426 ret = -EINVAL;
2427 goto unlock;
2428 }
2429
2430 user_extra = nr_pages + 1;
2431 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2432
2433 /*
2434 * Increase the limit linearly with more CPUs:
2435 */
2436 user_lock_limit *= num_online_cpus();
2437
2438 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2439
2440 extra = 0;
2441 if (user_locked > user_lock_limit)
2442 extra = user_locked - user_lock_limit;
2443
2444 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2445 lock_limit >>= PAGE_SHIFT;
2446 locked = vma->vm_mm->locked_vm + extra;
2447
2448 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2449 !capable(CAP_IPC_LOCK)) {
2450 ret = -EPERM;
2451 goto unlock;
2452 }
2453
2454 WARN_ON(event->data);
Peter Zijlstra906010b2009-09-21 16:08:49 +02002455
2456 data = perf_mmap_data_alloc(event, nr_pages);
2457 ret = -ENOMEM;
2458 if (!data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002459 goto unlock;
2460
Peter Zijlstra906010b2009-09-21 16:08:49 +02002461 ret = 0;
2462 perf_mmap_data_init(event, data);
2463
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002464 atomic_set(&event->mmap_count, 1);
2465 atomic_long_add(user_extra, &user->locked_vm);
2466 vma->vm_mm->locked_vm += extra;
2467 event->data->nr_locked = extra;
2468 if (vma->vm_flags & VM_WRITE)
2469 event->data->writable = 1;
2470
2471unlock:
2472 mutex_unlock(&event->mmap_mutex);
2473
2474 vma->vm_flags |= VM_RESERVED;
2475 vma->vm_ops = &perf_mmap_vmops;
2476
2477 return ret;
2478}
2479
2480static int perf_fasync(int fd, struct file *filp, int on)
2481{
2482 struct inode *inode = filp->f_path.dentry->d_inode;
2483 struct perf_event *event = filp->private_data;
2484 int retval;
2485
2486 mutex_lock(&inode->i_mutex);
2487 retval = fasync_helper(fd, filp, on, &event->fasync);
2488 mutex_unlock(&inode->i_mutex);
2489
2490 if (retval < 0)
2491 return retval;
2492
2493 return 0;
2494}
2495
2496static const struct file_operations perf_fops = {
2497 .release = perf_release,
2498 .read = perf_read,
2499 .poll = perf_poll,
2500 .unlocked_ioctl = perf_ioctl,
2501 .compat_ioctl = perf_ioctl,
2502 .mmap = perf_mmap,
2503 .fasync = perf_fasync,
2504};
2505
2506/*
2507 * Perf event wakeup
2508 *
2509 * If there's data, ensure we set the poll() state and publish everything
2510 * to user-space before waking everybody up.
2511 */
2512
2513void perf_event_wakeup(struct perf_event *event)
2514{
2515 wake_up_all(&event->waitq);
2516
2517 if (event->pending_kill) {
2518 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2519 event->pending_kill = 0;
2520 }
2521}
2522
2523/*
2524 * Pending wakeups
2525 *
2526 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2527 *
2528 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2529 * single linked list and use cmpxchg() to add entries lockless.
2530 */
2531
2532static void perf_pending_event(struct perf_pending_entry *entry)
2533{
2534 struct perf_event *event = container_of(entry,
2535 struct perf_event, pending);
2536
2537 if (event->pending_disable) {
2538 event->pending_disable = 0;
2539 __perf_event_disable(event);
2540 }
2541
2542 if (event->pending_wakeup) {
2543 event->pending_wakeup = 0;
2544 perf_event_wakeup(event);
2545 }
2546}
2547
2548#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2549
2550static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2551 PENDING_TAIL,
2552};
2553
2554static void perf_pending_queue(struct perf_pending_entry *entry,
2555 void (*func)(struct perf_pending_entry *))
2556{
2557 struct perf_pending_entry **head;
2558
2559 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2560 return;
2561
2562 entry->func = func;
2563
2564 head = &get_cpu_var(perf_pending_head);
2565
2566 do {
2567 entry->next = *head;
2568 } while (cmpxchg(head, entry->next, entry) != entry->next);
2569
2570 set_perf_event_pending();
2571
2572 put_cpu_var(perf_pending_head);
2573}
2574
2575static int __perf_pending_run(void)
2576{
2577 struct perf_pending_entry *list;
2578 int nr = 0;
2579
2580 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2581 while (list != PENDING_TAIL) {
2582 void (*func)(struct perf_pending_entry *);
2583 struct perf_pending_entry *entry = list;
2584
2585 list = list->next;
2586
2587 func = entry->func;
2588 entry->next = NULL;
2589 /*
2590 * Ensure we observe the unqueue before we issue the wakeup,
2591 * so that we won't be waiting forever.
2592 * -- see perf_not_pending().
2593 */
2594 smp_wmb();
2595
2596 func(entry);
2597 nr++;
2598 }
2599
2600 return nr;
2601}
2602
2603static inline int perf_not_pending(struct perf_event *event)
2604{
2605 /*
2606 * If we flush on whatever cpu we run, there is a chance we don't
2607 * need to wait.
2608 */
2609 get_cpu();
2610 __perf_pending_run();
2611 put_cpu();
2612
2613 /*
2614 * Ensure we see the proper queue state before going to sleep
2615 * so that we do not miss the wakeup. -- see perf_pending_handle()
2616 */
2617 smp_rmb();
2618 return event->pending.next == NULL;
2619}
2620
2621static void perf_pending_sync(struct perf_event *event)
2622{
2623 wait_event(event->waitq, perf_not_pending(event));
2624}
2625
2626void perf_event_do_pending(void)
2627{
2628 __perf_pending_run();
2629}
2630
2631/*
2632 * Callchain support -- arch specific
2633 */
2634
2635__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2636{
2637 return NULL;
2638}
2639
2640/*
2641 * Output
2642 */
2643static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2644 unsigned long offset, unsigned long head)
2645{
2646 unsigned long mask;
2647
2648 if (!data->writable)
2649 return true;
2650
Peter Zijlstra906010b2009-09-21 16:08:49 +02002651 mask = perf_data_size(data) - 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002652
2653 offset = (offset - tail) & mask;
2654 head = (head - tail) & mask;
2655
2656 if ((int)(head - offset) < 0)
2657 return false;
2658
2659 return true;
2660}
2661
2662static void perf_output_wakeup(struct perf_output_handle *handle)
2663{
2664 atomic_set(&handle->data->poll, POLL_IN);
2665
2666 if (handle->nmi) {
2667 handle->event->pending_wakeup = 1;
2668 perf_pending_queue(&handle->event->pending,
2669 perf_pending_event);
2670 } else
2671 perf_event_wakeup(handle->event);
2672}
2673
2674/*
2675 * Curious locking construct.
2676 *
2677 * We need to ensure a later event_id doesn't publish a head when a former
2678 * event_id isn't done writing. However since we need to deal with NMIs we
2679 * cannot fully serialize things.
2680 *
2681 * What we do is serialize between CPUs so we only have to deal with NMI
2682 * nesting on a single CPU.
2683 *
2684 * We only publish the head (and generate a wakeup) when the outer-most
2685 * event_id completes.
2686 */
2687static void perf_output_lock(struct perf_output_handle *handle)
2688{
2689 struct perf_mmap_data *data = handle->data;
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002690 int cur, cpu = get_cpu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002691
2692 handle->locked = 0;
2693
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002694 for (;;) {
2695 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2696 if (cur == -1) {
2697 handle->locked = 1;
2698 break;
2699 }
2700 if (cur == cpu)
2701 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002702
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002703 cpu_relax();
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002704 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002705}
2706
2707static void perf_output_unlock(struct perf_output_handle *handle)
2708{
2709 struct perf_mmap_data *data = handle->data;
2710 unsigned long head;
2711 int cpu;
2712
2713 data->done_head = data->head;
2714
2715 if (!handle->locked)
2716 goto out;
2717
2718again:
2719 /*
2720 * The xchg implies a full barrier that ensures all writes are done
2721 * before we publish the new head, matched by a rmb() in userspace when
2722 * reading this position.
2723 */
2724 while ((head = atomic_long_xchg(&data->done_head, 0)))
2725 data->user_page->data_head = head;
2726
2727 /*
2728 * NMI can happen here, which means we can miss a done_head update.
2729 */
2730
2731 cpu = atomic_xchg(&data->lock, -1);
2732 WARN_ON_ONCE(cpu != smp_processor_id());
2733
2734 /*
2735 * Therefore we have to validate we did not indeed do so.
2736 */
2737 if (unlikely(atomic_long_read(&data->done_head))) {
2738 /*
2739 * Since we had it locked, we can lock it again.
2740 */
2741 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2742 cpu_relax();
2743
2744 goto again;
2745 }
2746
2747 if (atomic_xchg(&data->wakeup, 0))
2748 perf_output_wakeup(handle);
2749out:
Peter Zijlstra559fdc32009-11-16 12:45:14 +01002750 put_cpu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002751}
2752
2753void perf_output_copy(struct perf_output_handle *handle,
2754 const void *buf, unsigned int len)
2755{
2756 unsigned int pages_mask;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002757 unsigned long offset;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002758 unsigned int size;
2759 void **pages;
2760
2761 offset = handle->offset;
2762 pages_mask = handle->data->nr_pages - 1;
2763 pages = handle->data->data_pages;
2764
2765 do {
Peter Zijlstra906010b2009-09-21 16:08:49 +02002766 unsigned long page_offset;
2767 unsigned long page_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002768 int nr;
2769
2770 nr = (offset >> PAGE_SHIFT) & pages_mask;
Peter Zijlstra906010b2009-09-21 16:08:49 +02002771 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2772 page_offset = offset & (page_size - 1);
2773 size = min_t(unsigned int, page_size - page_offset, len);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002774
2775 memcpy(pages[nr] + page_offset, buf, size);
2776
2777 len -= size;
2778 buf += size;
2779 offset += size;
2780 } while (len);
2781
2782 handle->offset = offset;
2783
2784 /*
2785 * Check we didn't copy past our reservation window, taking the
2786 * possible unsigned int wrap into account.
2787 */
2788 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2789}
2790
2791int perf_output_begin(struct perf_output_handle *handle,
2792 struct perf_event *event, unsigned int size,
2793 int nmi, int sample)
2794{
2795 struct perf_event *output_event;
2796 struct perf_mmap_data *data;
2797 unsigned long tail, offset, head;
2798 int have_lost;
2799 struct {
2800 struct perf_event_header header;
2801 u64 id;
2802 u64 lost;
2803 } lost_event;
2804
2805 rcu_read_lock();
2806 /*
2807 * For inherited events we send all the output towards the parent.
2808 */
2809 if (event->parent)
2810 event = event->parent;
2811
2812 output_event = rcu_dereference(event->output);
2813 if (output_event)
2814 event = output_event;
2815
2816 data = rcu_dereference(event->data);
2817 if (!data)
2818 goto out;
2819
2820 handle->data = data;
2821 handle->event = event;
2822 handle->nmi = nmi;
2823 handle->sample = sample;
2824
2825 if (!data->nr_pages)
2826 goto fail;
2827
2828 have_lost = atomic_read(&data->lost);
2829 if (have_lost)
2830 size += sizeof(lost_event);
2831
2832 perf_output_lock(handle);
2833
2834 do {
2835 /*
2836 * Userspace could choose to issue a mb() before updating the
2837 * tail pointer. So that all reads will be completed before the
2838 * write is issued.
2839 */
2840 tail = ACCESS_ONCE(data->user_page->data_tail);
2841 smp_rmb();
2842 offset = head = atomic_long_read(&data->head);
2843 head += size;
2844 if (unlikely(!perf_output_space(data, tail, offset, head)))
2845 goto fail;
2846 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2847
2848 handle->offset = offset;
2849 handle->head = head;
2850
2851 if (head - tail > data->watermark)
2852 atomic_set(&data->wakeup, 1);
2853
2854 if (have_lost) {
2855 lost_event.header.type = PERF_RECORD_LOST;
2856 lost_event.header.misc = 0;
2857 lost_event.header.size = sizeof(lost_event);
2858 lost_event.id = event->id;
2859 lost_event.lost = atomic_xchg(&data->lost, 0);
2860
2861 perf_output_put(handle, lost_event);
2862 }
2863
2864 return 0;
2865
2866fail:
2867 atomic_inc(&data->lost);
2868 perf_output_unlock(handle);
2869out:
2870 rcu_read_unlock();
2871
2872 return -ENOSPC;
2873}
2874
2875void perf_output_end(struct perf_output_handle *handle)
2876{
2877 struct perf_event *event = handle->event;
2878 struct perf_mmap_data *data = handle->data;
2879
2880 int wakeup_events = event->attr.wakeup_events;
2881
2882 if (handle->sample && wakeup_events) {
2883 int events = atomic_inc_return(&data->events);
2884 if (events >= wakeup_events) {
2885 atomic_sub(wakeup_events, &data->events);
2886 atomic_set(&data->wakeup, 1);
2887 }
2888 }
2889
2890 perf_output_unlock(handle);
2891 rcu_read_unlock();
2892}
2893
2894static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2895{
2896 /*
2897 * only top level events have the pid namespace they were created in
2898 */
2899 if (event->parent)
2900 event = event->parent;
2901
2902 return task_tgid_nr_ns(p, event->ns);
2903}
2904
2905static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2906{
2907 /*
2908 * only top level events have the pid namespace they were created in
2909 */
2910 if (event->parent)
2911 event = event->parent;
2912
2913 return task_pid_nr_ns(p, event->ns);
2914}
2915
2916static void perf_output_read_one(struct perf_output_handle *handle,
2917 struct perf_event *event)
2918{
2919 u64 read_format = event->attr.read_format;
2920 u64 values[4];
2921 int n = 0;
2922
2923 values[n++] = atomic64_read(&event->count);
2924 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2925 values[n++] = event->total_time_enabled +
2926 atomic64_read(&event->child_total_time_enabled);
2927 }
2928 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2929 values[n++] = event->total_time_running +
2930 atomic64_read(&event->child_total_time_running);
2931 }
2932 if (read_format & PERF_FORMAT_ID)
2933 values[n++] = primary_event_id(event);
2934
2935 perf_output_copy(handle, values, n * sizeof(u64));
2936}
2937
2938/*
2939 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2940 */
2941static void perf_output_read_group(struct perf_output_handle *handle,
2942 struct perf_event *event)
2943{
2944 struct perf_event *leader = event->group_leader, *sub;
2945 u64 read_format = event->attr.read_format;
2946 u64 values[5];
2947 int n = 0;
2948
2949 values[n++] = 1 + leader->nr_siblings;
2950
2951 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2952 values[n++] = leader->total_time_enabled;
2953
2954 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2955 values[n++] = leader->total_time_running;
2956
2957 if (leader != event)
2958 leader->pmu->read(leader);
2959
2960 values[n++] = atomic64_read(&leader->count);
2961 if (read_format & PERF_FORMAT_ID)
2962 values[n++] = primary_event_id(leader);
2963
2964 perf_output_copy(handle, values, n * sizeof(u64));
2965
2966 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2967 n = 0;
2968
2969 if (sub != event)
2970 sub->pmu->read(sub);
2971
2972 values[n++] = atomic64_read(&sub->count);
2973 if (read_format & PERF_FORMAT_ID)
2974 values[n++] = primary_event_id(sub);
2975
2976 perf_output_copy(handle, values, n * sizeof(u64));
2977 }
2978}
2979
2980static void perf_output_read(struct perf_output_handle *handle,
2981 struct perf_event *event)
2982{
2983 if (event->attr.read_format & PERF_FORMAT_GROUP)
2984 perf_output_read_group(handle, event);
2985 else
2986 perf_output_read_one(handle, event);
2987}
2988
2989void perf_output_sample(struct perf_output_handle *handle,
2990 struct perf_event_header *header,
2991 struct perf_sample_data *data,
2992 struct perf_event *event)
2993{
2994 u64 sample_type = data->type;
2995
2996 perf_output_put(handle, *header);
2997
2998 if (sample_type & PERF_SAMPLE_IP)
2999 perf_output_put(handle, data->ip);
3000
3001 if (sample_type & PERF_SAMPLE_TID)
3002 perf_output_put(handle, data->tid_entry);
3003
3004 if (sample_type & PERF_SAMPLE_TIME)
3005 perf_output_put(handle, data->time);
3006
3007 if (sample_type & PERF_SAMPLE_ADDR)
3008 perf_output_put(handle, data->addr);
3009
3010 if (sample_type & PERF_SAMPLE_ID)
3011 perf_output_put(handle, data->id);
3012
3013 if (sample_type & PERF_SAMPLE_STREAM_ID)
3014 perf_output_put(handle, data->stream_id);
3015
3016 if (sample_type & PERF_SAMPLE_CPU)
3017 perf_output_put(handle, data->cpu_entry);
3018
3019 if (sample_type & PERF_SAMPLE_PERIOD)
3020 perf_output_put(handle, data->period);
3021
3022 if (sample_type & PERF_SAMPLE_READ)
3023 perf_output_read(handle, event);
3024
3025 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3026 if (data->callchain) {
3027 int size = 1;
3028
3029 if (data->callchain)
3030 size += data->callchain->nr;
3031
3032 size *= sizeof(u64);
3033
3034 perf_output_copy(handle, data->callchain, size);
3035 } else {
3036 u64 nr = 0;
3037 perf_output_put(handle, nr);
3038 }
3039 }
3040
3041 if (sample_type & PERF_SAMPLE_RAW) {
3042 if (data->raw) {
3043 perf_output_put(handle, data->raw->size);
3044 perf_output_copy(handle, data->raw->data,
3045 data->raw->size);
3046 } else {
3047 struct {
3048 u32 size;
3049 u32 data;
3050 } raw = {
3051 .size = sizeof(u32),
3052 .data = 0,
3053 };
3054 perf_output_put(handle, raw);
3055 }
3056 }
3057}
3058
3059void perf_prepare_sample(struct perf_event_header *header,
3060 struct perf_sample_data *data,
3061 struct perf_event *event,
3062 struct pt_regs *regs)
3063{
3064 u64 sample_type = event->attr.sample_type;
3065
3066 data->type = sample_type;
3067
3068 header->type = PERF_RECORD_SAMPLE;
3069 header->size = sizeof(*header);
3070
3071 header->misc = 0;
3072 header->misc |= perf_misc_flags(regs);
3073
3074 if (sample_type & PERF_SAMPLE_IP) {
3075 data->ip = perf_instruction_pointer(regs);
3076
3077 header->size += sizeof(data->ip);
3078 }
3079
3080 if (sample_type & PERF_SAMPLE_TID) {
3081 /* namespace issues */
3082 data->tid_entry.pid = perf_event_pid(event, current);
3083 data->tid_entry.tid = perf_event_tid(event, current);
3084
3085 header->size += sizeof(data->tid_entry);
3086 }
3087
3088 if (sample_type & PERF_SAMPLE_TIME) {
3089 data->time = perf_clock();
3090
3091 header->size += sizeof(data->time);
3092 }
3093
3094 if (sample_type & PERF_SAMPLE_ADDR)
3095 header->size += sizeof(data->addr);
3096
3097 if (sample_type & PERF_SAMPLE_ID) {
3098 data->id = primary_event_id(event);
3099
3100 header->size += sizeof(data->id);
3101 }
3102
3103 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3104 data->stream_id = event->id;
3105
3106 header->size += sizeof(data->stream_id);
3107 }
3108
3109 if (sample_type & PERF_SAMPLE_CPU) {
3110 data->cpu_entry.cpu = raw_smp_processor_id();
3111 data->cpu_entry.reserved = 0;
3112
3113 header->size += sizeof(data->cpu_entry);
3114 }
3115
3116 if (sample_type & PERF_SAMPLE_PERIOD)
3117 header->size += sizeof(data->period);
3118
3119 if (sample_type & PERF_SAMPLE_READ)
3120 header->size += perf_event_read_size(event);
3121
3122 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3123 int size = 1;
3124
3125 data->callchain = perf_callchain(regs);
3126
3127 if (data->callchain)
3128 size += data->callchain->nr;
3129
3130 header->size += size * sizeof(u64);
3131 }
3132
3133 if (sample_type & PERF_SAMPLE_RAW) {
3134 int size = sizeof(u32);
3135
3136 if (data->raw)
3137 size += data->raw->size;
3138 else
3139 size += sizeof(u32);
3140
3141 WARN_ON_ONCE(size & (sizeof(u64)-1));
3142 header->size += size;
3143 }
3144}
3145
3146static void perf_event_output(struct perf_event *event, int nmi,
3147 struct perf_sample_data *data,
3148 struct pt_regs *regs)
3149{
3150 struct perf_output_handle handle;
3151 struct perf_event_header header;
3152
3153 perf_prepare_sample(&header, data, event, regs);
3154
3155 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3156 return;
3157
3158 perf_output_sample(&handle, &header, data, event);
3159
3160 perf_output_end(&handle);
3161}
3162
3163/*
3164 * read event_id
3165 */
3166
3167struct perf_read_event {
3168 struct perf_event_header header;
3169
3170 u32 pid;
3171 u32 tid;
3172};
3173
3174static void
3175perf_event_read_event(struct perf_event *event,
3176 struct task_struct *task)
3177{
3178 struct perf_output_handle handle;
3179 struct perf_read_event read_event = {
3180 .header = {
3181 .type = PERF_RECORD_READ,
3182 .misc = 0,
3183 .size = sizeof(read_event) + perf_event_read_size(event),
3184 },
3185 .pid = perf_event_pid(event, task),
3186 .tid = perf_event_tid(event, task),
3187 };
3188 int ret;
3189
3190 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3191 if (ret)
3192 return;
3193
3194 perf_output_put(&handle, read_event);
3195 perf_output_read(&handle, event);
3196
3197 perf_output_end(&handle);
3198}
3199
3200/*
3201 * task tracking -- fork/exit
3202 *
3203 * enabled by: attr.comm | attr.mmap | attr.task
3204 */
3205
3206struct perf_task_event {
3207 struct task_struct *task;
3208 struct perf_event_context *task_ctx;
3209
3210 struct {
3211 struct perf_event_header header;
3212
3213 u32 pid;
3214 u32 ppid;
3215 u32 tid;
3216 u32 ptid;
3217 u64 time;
3218 } event_id;
3219};
3220
3221static void perf_event_task_output(struct perf_event *event,
3222 struct perf_task_event *task_event)
3223{
3224 struct perf_output_handle handle;
3225 int size;
3226 struct task_struct *task = task_event->task;
3227 int ret;
3228
3229 size = task_event->event_id.header.size;
3230 ret = perf_output_begin(&handle, event, size, 0, 0);
3231
3232 if (ret)
3233 return;
3234
3235 task_event->event_id.pid = perf_event_pid(event, task);
3236 task_event->event_id.ppid = perf_event_pid(event, current);
3237
3238 task_event->event_id.tid = perf_event_tid(event, task);
3239 task_event->event_id.ptid = perf_event_tid(event, current);
3240
3241 task_event->event_id.time = perf_clock();
3242
3243 perf_output_put(&handle, task_event->event_id);
3244
3245 perf_output_end(&handle);
3246}
3247
3248static int perf_event_task_match(struct perf_event *event)
3249{
3250 if (event->attr.comm || event->attr.mmap || event->attr.task)
3251 return 1;
3252
3253 return 0;
3254}
3255
3256static void perf_event_task_ctx(struct perf_event_context *ctx,
3257 struct perf_task_event *task_event)
3258{
3259 struct perf_event *event;
3260
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003261 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3262 if (perf_event_task_match(event))
3263 perf_event_task_output(event, task_event);
3264 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003265}
3266
3267static void perf_event_task_event(struct perf_task_event *task_event)
3268{
3269 struct perf_cpu_context *cpuctx;
3270 struct perf_event_context *ctx = task_event->task_ctx;
3271
Peter Zijlstrad6ff86c2009-11-20 22:19:46 +01003272 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003273 cpuctx = &get_cpu_var(perf_cpu_context);
3274 perf_event_task_ctx(&cpuctx->ctx, task_event);
3275 put_cpu_var(perf_cpu_context);
3276
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003277 if (!ctx)
3278 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3279 if (ctx)
3280 perf_event_task_ctx(ctx, task_event);
3281 rcu_read_unlock();
3282}
3283
3284static void perf_event_task(struct task_struct *task,
3285 struct perf_event_context *task_ctx,
3286 int new)
3287{
3288 struct perf_task_event task_event;
3289
3290 if (!atomic_read(&nr_comm_events) &&
3291 !atomic_read(&nr_mmap_events) &&
3292 !atomic_read(&nr_task_events))
3293 return;
3294
3295 task_event = (struct perf_task_event){
3296 .task = task,
3297 .task_ctx = task_ctx,
3298 .event_id = {
3299 .header = {
3300 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3301 .misc = 0,
3302 .size = sizeof(task_event.event_id),
3303 },
3304 /* .pid */
3305 /* .ppid */
3306 /* .tid */
3307 /* .ptid */
3308 },
3309 };
3310
3311 perf_event_task_event(&task_event);
3312}
3313
3314void perf_event_fork(struct task_struct *task)
3315{
3316 perf_event_task(task, NULL, 1);
3317}
3318
3319/*
3320 * comm tracking
3321 */
3322
3323struct perf_comm_event {
3324 struct task_struct *task;
3325 char *comm;
3326 int comm_size;
3327
3328 struct {
3329 struct perf_event_header header;
3330
3331 u32 pid;
3332 u32 tid;
3333 } event_id;
3334};
3335
3336static void perf_event_comm_output(struct perf_event *event,
3337 struct perf_comm_event *comm_event)
3338{
3339 struct perf_output_handle handle;
3340 int size = comm_event->event_id.header.size;
3341 int ret = perf_output_begin(&handle, event, size, 0, 0);
3342
3343 if (ret)
3344 return;
3345
3346 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3347 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3348
3349 perf_output_put(&handle, comm_event->event_id);
3350 perf_output_copy(&handle, comm_event->comm,
3351 comm_event->comm_size);
3352 perf_output_end(&handle);
3353}
3354
3355static int perf_event_comm_match(struct perf_event *event)
3356{
3357 if (event->attr.comm)
3358 return 1;
3359
3360 return 0;
3361}
3362
3363static void perf_event_comm_ctx(struct perf_event_context *ctx,
3364 struct perf_comm_event *comm_event)
3365{
3366 struct perf_event *event;
3367
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003368 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3369 if (perf_event_comm_match(event))
3370 perf_event_comm_output(event, comm_event);
3371 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003372}
3373
3374static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375{
3376 struct perf_cpu_context *cpuctx;
3377 struct perf_event_context *ctx;
3378 unsigned int size;
3379 char comm[TASK_COMM_LEN];
3380
3381 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01003382 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003383 size = ALIGN(strlen(comm)+1, sizeof(u64));
3384
3385 comm_event->comm = comm;
3386 comm_event->comm_size = size;
3387
3388 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3389
Peter Zijlstraf6595f32009-11-20 22:19:47 +01003390 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003391 cpuctx = &get_cpu_var(perf_cpu_context);
3392 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3393 put_cpu_var(perf_cpu_context);
3394
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003395 /*
3396 * doesn't really matter which of the child contexts the
3397 * events ends up in.
3398 */
3399 ctx = rcu_dereference(current->perf_event_ctxp);
3400 if (ctx)
3401 perf_event_comm_ctx(ctx, comm_event);
3402 rcu_read_unlock();
3403}
3404
3405void perf_event_comm(struct task_struct *task)
3406{
3407 struct perf_comm_event comm_event;
3408
3409 if (task->perf_event_ctxp)
3410 perf_event_enable_on_exec(task);
3411
3412 if (!atomic_read(&nr_comm_events))
3413 return;
3414
3415 comm_event = (struct perf_comm_event){
3416 .task = task,
3417 /* .comm */
3418 /* .comm_size */
3419 .event_id = {
3420 .header = {
3421 .type = PERF_RECORD_COMM,
3422 .misc = 0,
3423 /* .size */
3424 },
3425 /* .pid */
3426 /* .tid */
3427 },
3428 };
3429
3430 perf_event_comm_event(&comm_event);
3431}
3432
3433/*
3434 * mmap tracking
3435 */
3436
3437struct perf_mmap_event {
3438 struct vm_area_struct *vma;
3439
3440 const char *file_name;
3441 int file_size;
3442
3443 struct {
3444 struct perf_event_header header;
3445
3446 u32 pid;
3447 u32 tid;
3448 u64 start;
3449 u64 len;
3450 u64 pgoff;
3451 } event_id;
3452};
3453
3454static void perf_event_mmap_output(struct perf_event *event,
3455 struct perf_mmap_event *mmap_event)
3456{
3457 struct perf_output_handle handle;
3458 int size = mmap_event->event_id.header.size;
3459 int ret = perf_output_begin(&handle, event, size, 0, 0);
3460
3461 if (ret)
3462 return;
3463
3464 mmap_event->event_id.pid = perf_event_pid(event, current);
3465 mmap_event->event_id.tid = perf_event_tid(event, current);
3466
3467 perf_output_put(&handle, mmap_event->event_id);
3468 perf_output_copy(&handle, mmap_event->file_name,
3469 mmap_event->file_size);
3470 perf_output_end(&handle);
3471}
3472
3473static int perf_event_mmap_match(struct perf_event *event,
3474 struct perf_mmap_event *mmap_event)
3475{
3476 if (event->attr.mmap)
3477 return 1;
3478
3479 return 0;
3480}
3481
3482static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3483 struct perf_mmap_event *mmap_event)
3484{
3485 struct perf_event *event;
3486
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003487 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3488 if (perf_event_mmap_match(event, mmap_event))
3489 perf_event_mmap_output(event, mmap_event);
3490 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003491}
3492
3493static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3494{
3495 struct perf_cpu_context *cpuctx;
3496 struct perf_event_context *ctx;
3497 struct vm_area_struct *vma = mmap_event->vma;
3498 struct file *file = vma->vm_file;
3499 unsigned int size;
3500 char tmp[16];
3501 char *buf = NULL;
3502 const char *name;
3503
3504 memset(tmp, 0, sizeof(tmp));
3505
3506 if (file) {
3507 /*
3508 * d_path works from the end of the buffer backwards, so we
3509 * need to add enough zero bytes after the string to handle
3510 * the 64bit alignment we do later.
3511 */
3512 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3513 if (!buf) {
3514 name = strncpy(tmp, "//enomem", sizeof(tmp));
3515 goto got_name;
3516 }
3517 name = d_path(&file->f_path, buf, PATH_MAX);
3518 if (IS_ERR(name)) {
3519 name = strncpy(tmp, "//toolong", sizeof(tmp));
3520 goto got_name;
3521 }
3522 } else {
3523 if (arch_vma_name(mmap_event->vma)) {
3524 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3525 sizeof(tmp));
3526 goto got_name;
3527 }
3528
3529 if (!vma->vm_mm) {
3530 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3531 goto got_name;
3532 }
3533
3534 name = strncpy(tmp, "//anon", sizeof(tmp));
3535 goto got_name;
3536 }
3537
3538got_name:
3539 size = ALIGN(strlen(name)+1, sizeof(u64));
3540
3541 mmap_event->file_name = name;
3542 mmap_event->file_size = size;
3543
3544 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3545
Peter Zijlstraf6d9dd22009-11-20 22:19:48 +01003546 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003547 cpuctx = &get_cpu_var(perf_cpu_context);
3548 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3549 put_cpu_var(perf_cpu_context);
3550
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003551 /*
3552 * doesn't really matter which of the child contexts the
3553 * events ends up in.
3554 */
3555 ctx = rcu_dereference(current->perf_event_ctxp);
3556 if (ctx)
3557 perf_event_mmap_ctx(ctx, mmap_event);
3558 rcu_read_unlock();
3559
3560 kfree(buf);
3561}
3562
3563void __perf_event_mmap(struct vm_area_struct *vma)
3564{
3565 struct perf_mmap_event mmap_event;
3566
3567 if (!atomic_read(&nr_mmap_events))
3568 return;
3569
3570 mmap_event = (struct perf_mmap_event){
3571 .vma = vma,
3572 /* .file_name */
3573 /* .file_size */
3574 .event_id = {
3575 .header = {
3576 .type = PERF_RECORD_MMAP,
3577 .misc = 0,
3578 /* .size */
3579 },
3580 /* .pid */
3581 /* .tid */
3582 .start = vma->vm_start,
3583 .len = vma->vm_end - vma->vm_start,
3584 .pgoff = vma->vm_pgoff,
3585 },
3586 };
3587
3588 perf_event_mmap_event(&mmap_event);
3589}
3590
3591/*
3592 * IRQ throttle logging
3593 */
3594
3595static void perf_log_throttle(struct perf_event *event, int enable)
3596{
3597 struct perf_output_handle handle;
3598 int ret;
3599
3600 struct {
3601 struct perf_event_header header;
3602 u64 time;
3603 u64 id;
3604 u64 stream_id;
3605 } throttle_event = {
3606 .header = {
3607 .type = PERF_RECORD_THROTTLE,
3608 .misc = 0,
3609 .size = sizeof(throttle_event),
3610 },
3611 .time = perf_clock(),
3612 .id = primary_event_id(event),
3613 .stream_id = event->id,
3614 };
3615
3616 if (enable)
3617 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3618
3619 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3620 if (ret)
3621 return;
3622
3623 perf_output_put(&handle, throttle_event);
3624 perf_output_end(&handle);
3625}
3626
3627/*
3628 * Generic event overflow handling, sampling.
3629 */
3630
3631static int __perf_event_overflow(struct perf_event *event, int nmi,
3632 int throttle, struct perf_sample_data *data,
3633 struct pt_regs *regs)
3634{
3635 int events = atomic_read(&event->event_limit);
3636 struct hw_perf_event *hwc = &event->hw;
3637 int ret = 0;
3638
3639 throttle = (throttle && event->pmu->unthrottle != NULL);
3640
3641 if (!throttle) {
3642 hwc->interrupts++;
3643 } else {
3644 if (hwc->interrupts != MAX_INTERRUPTS) {
3645 hwc->interrupts++;
3646 if (HZ * hwc->interrupts >
3647 (u64)sysctl_perf_event_sample_rate) {
3648 hwc->interrupts = MAX_INTERRUPTS;
3649 perf_log_throttle(event, 0);
3650 ret = 1;
3651 }
3652 } else {
3653 /*
3654 * Keep re-disabling events even though on the previous
3655 * pass we disabled it - just in case we raced with a
3656 * sched-in and the event got enabled again:
3657 */
3658 ret = 1;
3659 }
3660 }
3661
3662 if (event->attr.freq) {
3663 u64 now = perf_clock();
3664 s64 delta = now - hwc->freq_stamp;
3665
3666 hwc->freq_stamp = now;
3667
3668 if (delta > 0 && delta < TICK_NSEC)
3669 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3670 }
3671
3672 /*
3673 * XXX event_limit might not quite work as expected on inherited
3674 * events
3675 */
3676
3677 event->pending_kill = POLL_IN;
3678 if (events && atomic_dec_and_test(&event->event_limit)) {
3679 ret = 1;
3680 event->pending_kill = POLL_HUP;
3681 if (nmi) {
3682 event->pending_disable = 1;
3683 perf_pending_queue(&event->pending,
3684 perf_pending_event);
3685 } else
3686 perf_event_disable(event);
3687 }
3688
Peter Zijlstra453f19e2009-11-20 22:19:43 +01003689 if (event->overflow_handler)
3690 event->overflow_handler(event, nmi, data, regs);
3691 else
3692 perf_event_output(event, nmi, data, regs);
3693
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003694 return ret;
3695}
3696
3697int perf_event_overflow(struct perf_event *event, int nmi,
3698 struct perf_sample_data *data,
3699 struct pt_regs *regs)
3700{
3701 return __perf_event_overflow(event, nmi, 1, data, regs);
3702}
3703
3704/*
3705 * Generic software event infrastructure
3706 */
3707
3708/*
3709 * We directly increment event->count and keep a second value in
3710 * event->hw.period_left to count intervals. This period event
3711 * is kept in the range [-sample_period, 0] so that we can use the
3712 * sign as trigger.
3713 */
3714
3715static u64 perf_swevent_set_period(struct perf_event *event)
3716{
3717 struct hw_perf_event *hwc = &event->hw;
3718 u64 period = hwc->last_period;
3719 u64 nr, offset;
3720 s64 old, val;
3721
3722 hwc->last_period = hwc->sample_period;
3723
3724again:
3725 old = val = atomic64_read(&hwc->period_left);
3726 if (val < 0)
3727 return 0;
3728
3729 nr = div64_u64(period + val, period);
3730 offset = nr * period;
3731 val -= offset;
3732 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3733 goto again;
3734
3735 return nr;
3736}
3737
Peter Zijlstra0cff7842009-11-20 22:19:44 +01003738static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003739 int nmi, struct perf_sample_data *data,
3740 struct pt_regs *regs)
3741{
3742 struct hw_perf_event *hwc = &event->hw;
3743 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003744
3745 data->period = event->hw.last_period;
Peter Zijlstra0cff7842009-11-20 22:19:44 +01003746 if (!overflow)
3747 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003748
3749 if (hwc->interrupts == MAX_INTERRUPTS)
3750 return;
3751
3752 for (; overflow; overflow--) {
3753 if (__perf_event_overflow(event, nmi, throttle,
3754 data, regs)) {
3755 /*
3756 * We inhibit the overflow from happening when
3757 * hwc->interrupts == MAX_INTERRUPTS.
3758 */
3759 break;
3760 }
3761 throttle = 1;
3762 }
3763}
3764
3765static void perf_swevent_unthrottle(struct perf_event *event)
3766{
3767 /*
3768 * Nothing to do, we already reset hwc->interrupts.
3769 */
3770}
3771
3772static void perf_swevent_add(struct perf_event *event, u64 nr,
3773 int nmi, struct perf_sample_data *data,
3774 struct pt_regs *regs)
3775{
3776 struct hw_perf_event *hwc = &event->hw;
3777
3778 atomic64_add(nr, &event->count);
3779
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003780 if (!regs)
3781 return;
3782
Peter Zijlstra0cff7842009-11-20 22:19:44 +01003783 if (!hwc->sample_period)
3784 return;
3785
3786 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3787 return perf_swevent_overflow(event, 1, nmi, data, regs);
3788
3789 if (atomic64_add_negative(nr, &hwc->period_left))
3790 return;
3791
3792 perf_swevent_overflow(event, 0, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003793}
3794
3795static int perf_swevent_is_counting(struct perf_event *event)
3796{
3797 /*
3798 * The event is active, we're good!
3799 */
3800 if (event->state == PERF_EVENT_STATE_ACTIVE)
3801 return 1;
3802
3803 /*
3804 * The event is off/error, not counting.
3805 */
3806 if (event->state != PERF_EVENT_STATE_INACTIVE)
3807 return 0;
3808
3809 /*
3810 * The event is inactive, if the context is active
3811 * we're part of a group that didn't make it on the 'pmu',
3812 * not counting.
3813 */
3814 if (event->ctx->is_active)
3815 return 0;
3816
3817 /*
3818 * We're inactive and the context is too, this means the
3819 * task is scheduled out, we're counting events that happen
3820 * to us, like migration events.
3821 */
3822 return 1;
3823}
3824
Li Zefan6fb29152009-10-15 11:21:42 +08003825static int perf_tp_event_match(struct perf_event *event,
3826 struct perf_sample_data *data);
3827
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003828static int perf_swevent_match(struct perf_event *event,
3829 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08003830 u32 event_id,
3831 struct perf_sample_data *data,
3832 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003833{
3834 if (!perf_swevent_is_counting(event))
3835 return 0;
3836
3837 if (event->attr.type != type)
3838 return 0;
3839 if (event->attr.config != event_id)
3840 return 0;
3841
3842 if (regs) {
3843 if (event->attr.exclude_user && user_mode(regs))
3844 return 0;
3845
3846 if (event->attr.exclude_kernel && !user_mode(regs))
3847 return 0;
3848 }
3849
Li Zefan6fb29152009-10-15 11:21:42 +08003850 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3851 !perf_tp_event_match(event, data))
3852 return 0;
3853
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003854 return 1;
3855}
3856
3857static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3858 enum perf_type_id type,
3859 u32 event_id, u64 nr, int nmi,
3860 struct perf_sample_data *data,
3861 struct pt_regs *regs)
3862{
3863 struct perf_event *event;
3864
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003865 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08003866 if (perf_swevent_match(event, type, event_id, data, regs))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003867 perf_swevent_add(event, nr, nmi, data, regs);
3868 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003869}
3870
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003871/*
3872 * Must be called with preemption disabled
3873 */
3874int perf_swevent_get_recursion_context(int **recursion)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003875{
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003876 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3877
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003878 if (in_nmi())
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003879 *recursion = &cpuctx->recursion[3];
3880 else if (in_irq())
3881 *recursion = &cpuctx->recursion[2];
3882 else if (in_softirq())
3883 *recursion = &cpuctx->recursion[1];
3884 else
3885 *recursion = &cpuctx->recursion[0];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003886
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003887 if (**recursion)
3888 return -1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003889
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003890 (**recursion)++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003891
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003892 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003893}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01003894EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003895
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003896void perf_swevent_put_recursion_context(int *recursion)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003897{
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003898 (*recursion)--;
3899}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01003900EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003901
3902static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
3903 u64 nr, int nmi,
3904 struct perf_sample_data *data,
3905 struct pt_regs *regs)
3906{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003907 struct perf_event_context *ctx;
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003908 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003909
Peter Zijlstra81520182009-11-20 22:19:45 +01003910 rcu_read_lock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003911 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3912 nr, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003913 /*
3914 * doesn't really matter which of the child contexts the
3915 * events ends up in.
3916 */
3917 ctx = rcu_dereference(current->perf_event_ctxp);
3918 if (ctx)
3919 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3920 rcu_read_unlock();
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003921}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003922
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003923static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3924 u64 nr, int nmi,
3925 struct perf_sample_data *data,
3926 struct pt_regs *regs)
3927{
3928 int *recursion;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003929
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003930 preempt_disable();
3931
3932 if (perf_swevent_get_recursion_context(&recursion))
3933 goto out;
3934
3935 __do_perf_sw_event(type, event_id, nr, nmi, data, regs);
3936
3937 perf_swevent_put_recursion_context(recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003938out:
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01003939 preempt_enable();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003940}
3941
3942void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3943 struct pt_regs *regs, u64 addr)
3944{
Ingo Molnara4234bf2009-11-23 10:57:59 +01003945 struct perf_sample_data data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003946
Ingo Molnara4234bf2009-11-23 10:57:59 +01003947 data.addr = addr;
3948 data.raw = NULL;
3949
3950 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003951}
3952
3953static void perf_swevent_read(struct perf_event *event)
3954{
3955}
3956
3957static int perf_swevent_enable(struct perf_event *event)
3958{
3959 struct hw_perf_event *hwc = &event->hw;
3960
3961 if (hwc->sample_period) {
3962 hwc->last_period = hwc->sample_period;
3963 perf_swevent_set_period(event);
3964 }
3965 return 0;
3966}
3967
3968static void perf_swevent_disable(struct perf_event *event)
3969{
3970}
3971
3972static const struct pmu perf_ops_generic = {
3973 .enable = perf_swevent_enable,
3974 .disable = perf_swevent_disable,
3975 .read = perf_swevent_read,
3976 .unthrottle = perf_swevent_unthrottle,
3977};
3978
3979/*
3980 * hrtimer based swevent callback
3981 */
3982
3983static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3984{
3985 enum hrtimer_restart ret = HRTIMER_RESTART;
3986 struct perf_sample_data data;
3987 struct pt_regs *regs;
3988 struct perf_event *event;
3989 u64 period;
3990
3991 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3992 event->pmu->read(event);
3993
3994 data.addr = 0;
3995 regs = get_irq_regs();
3996 /*
3997 * In case we exclude kernel IPs or are somehow not in interrupt
3998 * context, provide the next best thing, the user IP.
3999 */
4000 if ((event->attr.exclude_kernel || !regs) &&
4001 !event->attr.exclude_user)
4002 regs = task_pt_regs(current);
4003
4004 if (regs) {
Soeren Sandmann54f44072009-10-22 18:34:08 +02004005 if (!(event->attr.exclude_idle && current->pid == 0))
4006 if (perf_event_overflow(event, 0, &data, regs))
4007 ret = HRTIMER_NORESTART;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004008 }
4009
4010 period = max_t(u64, 10000, event->hw.sample_period);
4011 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4012
4013 return ret;
4014}
4015
Soeren Sandmann721a6692009-09-15 14:33:08 +02004016static void perf_swevent_start_hrtimer(struct perf_event *event)
4017{
4018 struct hw_perf_event *hwc = &event->hw;
4019
4020 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4021 hwc->hrtimer.function = perf_swevent_hrtimer;
4022 if (hwc->sample_period) {
4023 u64 period;
4024
4025 if (hwc->remaining) {
4026 if (hwc->remaining < 0)
4027 period = 10000;
4028 else
4029 period = hwc->remaining;
4030 hwc->remaining = 0;
4031 } else {
4032 period = max_t(u64, 10000, hwc->sample_period);
4033 }
4034 __hrtimer_start_range_ns(&hwc->hrtimer,
4035 ns_to_ktime(period), 0,
4036 HRTIMER_MODE_REL, 0);
4037 }
4038}
4039
4040static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4041{
4042 struct hw_perf_event *hwc = &event->hw;
4043
4044 if (hwc->sample_period) {
4045 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4046 hwc->remaining = ktime_to_ns(remaining);
4047
4048 hrtimer_cancel(&hwc->hrtimer);
4049 }
4050}
4051
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004052/*
4053 * Software event: cpu wall time clock
4054 */
4055
4056static void cpu_clock_perf_event_update(struct perf_event *event)
4057{
4058 int cpu = raw_smp_processor_id();
4059 s64 prev;
4060 u64 now;
4061
4062 now = cpu_clock(cpu);
4063 prev = atomic64_read(&event->hw.prev_count);
4064 atomic64_set(&event->hw.prev_count, now);
4065 atomic64_add(now - prev, &event->count);
4066}
4067
4068static int cpu_clock_perf_event_enable(struct perf_event *event)
4069{
4070 struct hw_perf_event *hwc = &event->hw;
4071 int cpu = raw_smp_processor_id();
4072
4073 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
Soeren Sandmann721a6692009-09-15 14:33:08 +02004074 perf_swevent_start_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004075
4076 return 0;
4077}
4078
4079static void cpu_clock_perf_event_disable(struct perf_event *event)
4080{
Soeren Sandmann721a6692009-09-15 14:33:08 +02004081 perf_swevent_cancel_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004082 cpu_clock_perf_event_update(event);
4083}
4084
4085static void cpu_clock_perf_event_read(struct perf_event *event)
4086{
4087 cpu_clock_perf_event_update(event);
4088}
4089
4090static const struct pmu perf_ops_cpu_clock = {
4091 .enable = cpu_clock_perf_event_enable,
4092 .disable = cpu_clock_perf_event_disable,
4093 .read = cpu_clock_perf_event_read,
4094};
4095
4096/*
4097 * Software event: task time clock
4098 */
4099
4100static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4101{
4102 u64 prev;
4103 s64 delta;
4104
4105 prev = atomic64_xchg(&event->hw.prev_count, now);
4106 delta = now - prev;
4107 atomic64_add(delta, &event->count);
4108}
4109
4110static int task_clock_perf_event_enable(struct perf_event *event)
4111{
4112 struct hw_perf_event *hwc = &event->hw;
4113 u64 now;
4114
4115 now = event->ctx->time;
4116
4117 atomic64_set(&hwc->prev_count, now);
Soeren Sandmann721a6692009-09-15 14:33:08 +02004118
4119 perf_swevent_start_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004120
4121 return 0;
4122}
4123
4124static void task_clock_perf_event_disable(struct perf_event *event)
4125{
Soeren Sandmann721a6692009-09-15 14:33:08 +02004126 perf_swevent_cancel_hrtimer(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004127 task_clock_perf_event_update(event, event->ctx->time);
4128
4129}
4130
4131static void task_clock_perf_event_read(struct perf_event *event)
4132{
4133 u64 time;
4134
4135 if (!in_nmi()) {
4136 update_context_time(event->ctx);
4137 time = event->ctx->time;
4138 } else {
4139 u64 now = perf_clock();
4140 u64 delta = now - event->ctx->timestamp;
4141 time = event->ctx->time + delta;
4142 }
4143
4144 task_clock_perf_event_update(event, time);
4145}
4146
4147static const struct pmu perf_ops_task_clock = {
4148 .enable = task_clock_perf_event_enable,
4149 .disable = task_clock_perf_event_disable,
4150 .read = task_clock_perf_event_read,
4151};
4152
4153#ifdef CONFIG_EVENT_PROFILE
Li Zefan6fb29152009-10-15 11:21:42 +08004154
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004155void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4156 int entry_size)
4157{
4158 struct perf_raw_record raw = {
4159 .size = entry_size,
4160 .data = record,
4161 };
4162
4163 struct perf_sample_data data = {
4164 .addr = addr,
4165 .raw = &raw,
4166 };
4167
4168 struct pt_regs *regs = get_irq_regs();
4169
4170 if (!regs)
4171 regs = task_pt_regs(current);
4172
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01004173 /* Trace events already protected against recursion */
4174 __do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004175 &data, regs);
4176}
4177EXPORT_SYMBOL_GPL(perf_tp_event);
4178
Li Zefan6fb29152009-10-15 11:21:42 +08004179static int perf_tp_event_match(struct perf_event *event,
4180 struct perf_sample_data *data)
4181{
4182 void *record = data->raw->data;
4183
4184 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4185 return 1;
4186 return 0;
4187}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004188
4189static void tp_perf_event_destroy(struct perf_event *event)
4190{
4191 ftrace_profile_disable(event->attr.config);
4192}
4193
4194static const struct pmu *tp_perf_event_init(struct perf_event *event)
4195{
4196 /*
4197 * Raw tracepoint data is a severe data leak, only allow root to
4198 * have these.
4199 */
4200 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4201 perf_paranoid_tracepoint_raw() &&
4202 !capable(CAP_SYS_ADMIN))
4203 return ERR_PTR(-EPERM);
4204
4205 if (ftrace_profile_enable(event->attr.config))
4206 return NULL;
4207
4208 event->destroy = tp_perf_event_destroy;
4209
4210 return &perf_ops_generic;
4211}
Li Zefan6fb29152009-10-15 11:21:42 +08004212
4213static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4214{
4215 char *filter_str;
4216 int ret;
4217
4218 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4219 return -EINVAL;
4220
4221 filter_str = strndup_user(arg, PAGE_SIZE);
4222 if (IS_ERR(filter_str))
4223 return PTR_ERR(filter_str);
4224
4225 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4226
4227 kfree(filter_str);
4228 return ret;
4229}
4230
4231static void perf_event_free_filter(struct perf_event *event)
4232{
4233 ftrace_profile_free_filter(event);
4234}
4235
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004236#else
Li Zefan6fb29152009-10-15 11:21:42 +08004237
4238static int perf_tp_event_match(struct perf_event *event,
4239 struct perf_sample_data *data)
4240{
4241 return 1;
4242}
4243
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004244static const struct pmu *tp_perf_event_init(struct perf_event *event)
4245{
4246 return NULL;
4247}
Li Zefan6fb29152009-10-15 11:21:42 +08004248
4249static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4250{
4251 return -ENOENT;
4252}
4253
4254static void perf_event_free_filter(struct perf_event *event)
4255{
4256}
4257
4258#endif /* CONFIG_EVENT_PROFILE */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004259
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02004260#ifdef CONFIG_HAVE_HW_BREAKPOINT
4261static void bp_perf_event_destroy(struct perf_event *event)
4262{
4263 release_bp_slot(event);
4264}
4265
4266static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4267{
4268 int err;
4269 /*
4270 * The breakpoint is already filled if we haven't created the counter
4271 * through perf syscall
4272 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4273 */
4274 if (!bp->callback)
4275 err = register_perf_hw_breakpoint(bp);
4276 else
4277 err = __register_perf_hw_breakpoint(bp);
4278 if (err)
4279 return ERR_PTR(err);
4280
4281 bp->destroy = bp_perf_event_destroy;
4282
4283 return &perf_ops_bp;
4284}
4285
4286void perf_bp_event(struct perf_event *bp, void *regs)
4287{
4288 /* TODO */
4289}
4290#else
4291static void bp_perf_event_destroy(struct perf_event *event)
4292{
4293}
4294
4295static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4296{
4297 return NULL;
4298}
4299
4300void perf_bp_event(struct perf_event *bp, void *regs)
4301{
4302}
4303#endif
4304
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004305atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4306
4307static void sw_perf_event_destroy(struct perf_event *event)
4308{
4309 u64 event_id = event->attr.config;
4310
4311 WARN_ON(event->parent);
4312
4313 atomic_dec(&perf_swevent_enabled[event_id]);
4314}
4315
4316static const struct pmu *sw_perf_event_init(struct perf_event *event)
4317{
4318 const struct pmu *pmu = NULL;
4319 u64 event_id = event->attr.config;
4320
4321 /*
4322 * Software events (currently) can't in general distinguish
4323 * between user, kernel and hypervisor events.
4324 * However, context switches and cpu migrations are considered
4325 * to be kernel events, and page faults are never hypervisor
4326 * events.
4327 */
4328 switch (event_id) {
4329 case PERF_COUNT_SW_CPU_CLOCK:
4330 pmu = &perf_ops_cpu_clock;
4331
4332 break;
4333 case PERF_COUNT_SW_TASK_CLOCK:
4334 /*
4335 * If the user instantiates this as a per-cpu event,
4336 * use the cpu_clock event instead.
4337 */
4338 if (event->ctx->task)
4339 pmu = &perf_ops_task_clock;
4340 else
4341 pmu = &perf_ops_cpu_clock;
4342
4343 break;
4344 case PERF_COUNT_SW_PAGE_FAULTS:
4345 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4346 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4347 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4348 case PERF_COUNT_SW_CPU_MIGRATIONS:
Anton Blanchardf7d79862009-10-18 01:09:29 +00004349 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4350 case PERF_COUNT_SW_EMULATION_FAULTS:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004351 if (!event->parent) {
4352 atomic_inc(&perf_swevent_enabled[event_id]);
4353 event->destroy = sw_perf_event_destroy;
4354 }
4355 pmu = &perf_ops_generic;
4356 break;
4357 }
4358
4359 return pmu;
4360}
4361
4362/*
4363 * Allocate and initialize a event structure
4364 */
4365static struct perf_event *
4366perf_event_alloc(struct perf_event_attr *attr,
4367 int cpu,
4368 struct perf_event_context *ctx,
4369 struct perf_event *group_leader,
4370 struct perf_event *parent_event,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004371 perf_callback_t callback,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004372 gfp_t gfpflags)
4373{
4374 const struct pmu *pmu;
4375 struct perf_event *event;
4376 struct hw_perf_event *hwc;
4377 long err;
4378
4379 event = kzalloc(sizeof(*event), gfpflags);
4380 if (!event)
4381 return ERR_PTR(-ENOMEM);
4382
4383 /*
4384 * Single events are their own group leaders, with an
4385 * empty sibling list:
4386 */
4387 if (!group_leader)
4388 group_leader = event;
4389
4390 mutex_init(&event->child_mutex);
4391 INIT_LIST_HEAD(&event->child_list);
4392
4393 INIT_LIST_HEAD(&event->group_entry);
4394 INIT_LIST_HEAD(&event->event_entry);
4395 INIT_LIST_HEAD(&event->sibling_list);
4396 init_waitqueue_head(&event->waitq);
4397
4398 mutex_init(&event->mmap_mutex);
4399
4400 event->cpu = cpu;
4401 event->attr = *attr;
4402 event->group_leader = group_leader;
4403 event->pmu = NULL;
4404 event->ctx = ctx;
4405 event->oncpu = -1;
4406
4407 event->parent = parent_event;
4408
4409 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4410 event->id = atomic64_inc_return(&perf_event_id);
4411
4412 event->state = PERF_EVENT_STATE_INACTIVE;
4413
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004414 if (!callback && parent_event)
4415 callback = parent_event->callback;
4416
4417 event->callback = callback;
4418
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004419 if (attr->disabled)
4420 event->state = PERF_EVENT_STATE_OFF;
4421
4422 pmu = NULL;
4423
4424 hwc = &event->hw;
4425 hwc->sample_period = attr->sample_period;
4426 if (attr->freq && attr->sample_freq)
4427 hwc->sample_period = 1;
4428 hwc->last_period = hwc->sample_period;
4429
4430 atomic64_set(&hwc->period_left, hwc->sample_period);
4431
4432 /*
4433 * we currently do not support PERF_FORMAT_GROUP on inherited events
4434 */
4435 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4436 goto done;
4437
4438 switch (attr->type) {
4439 case PERF_TYPE_RAW:
4440 case PERF_TYPE_HARDWARE:
4441 case PERF_TYPE_HW_CACHE:
4442 pmu = hw_perf_event_init(event);
4443 break;
4444
4445 case PERF_TYPE_SOFTWARE:
4446 pmu = sw_perf_event_init(event);
4447 break;
4448
4449 case PERF_TYPE_TRACEPOINT:
4450 pmu = tp_perf_event_init(event);
4451 break;
4452
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02004453 case PERF_TYPE_BREAKPOINT:
4454 pmu = bp_perf_event_init(event);
4455 break;
4456
4457
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004458 default:
4459 break;
4460 }
4461done:
4462 err = 0;
4463 if (!pmu)
4464 err = -EINVAL;
4465 else if (IS_ERR(pmu))
4466 err = PTR_ERR(pmu);
4467
4468 if (err) {
4469 if (event->ns)
4470 put_pid_ns(event->ns);
4471 kfree(event);
4472 return ERR_PTR(err);
4473 }
4474
4475 event->pmu = pmu;
4476
4477 if (!event->parent) {
4478 atomic_inc(&nr_events);
4479 if (event->attr.mmap)
4480 atomic_inc(&nr_mmap_events);
4481 if (event->attr.comm)
4482 atomic_inc(&nr_comm_events);
4483 if (event->attr.task)
4484 atomic_inc(&nr_task_events);
4485 }
4486
4487 return event;
4488}
4489
4490static int perf_copy_attr(struct perf_event_attr __user *uattr,
4491 struct perf_event_attr *attr)
4492{
4493 u32 size;
4494 int ret;
4495
4496 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4497 return -EFAULT;
4498
4499 /*
4500 * zero the full structure, so that a short copy will be nice.
4501 */
4502 memset(attr, 0, sizeof(*attr));
4503
4504 ret = get_user(size, &uattr->size);
4505 if (ret)
4506 return ret;
4507
4508 if (size > PAGE_SIZE) /* silly large */
4509 goto err_size;
4510
4511 if (!size) /* abi compat */
4512 size = PERF_ATTR_SIZE_VER0;
4513
4514 if (size < PERF_ATTR_SIZE_VER0)
4515 goto err_size;
4516
4517 /*
4518 * If we're handed a bigger struct than we know of,
4519 * ensure all the unknown bits are 0 - i.e. new
4520 * user-space does not rely on any kernel feature
4521 * extensions we dont know about yet.
4522 */
4523 if (size > sizeof(*attr)) {
4524 unsigned char __user *addr;
4525 unsigned char __user *end;
4526 unsigned char val;
4527
4528 addr = (void __user *)uattr + sizeof(*attr);
4529 end = (void __user *)uattr + size;
4530
4531 for (; addr < end; addr++) {
4532 ret = get_user(val, addr);
4533 if (ret)
4534 return ret;
4535 if (val)
4536 goto err_size;
4537 }
4538 size = sizeof(*attr);
4539 }
4540
4541 ret = copy_from_user(attr, uattr, size);
4542 if (ret)
4543 return -EFAULT;
4544
4545 /*
4546 * If the type exists, the corresponding creation will verify
4547 * the attr->config.
4548 */
4549 if (attr->type >= PERF_TYPE_MAX)
4550 return -EINVAL;
4551
4552 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4553 return -EINVAL;
4554
4555 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4556 return -EINVAL;
4557
4558 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4559 return -EINVAL;
4560
4561out:
4562 return ret;
4563
4564err_size:
4565 put_user(sizeof(*attr), &uattr->size);
4566 ret = -E2BIG;
4567 goto out;
4568}
4569
Li Zefan6fb29152009-10-15 11:21:42 +08004570static int perf_event_set_output(struct perf_event *event, int output_fd)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004571{
4572 struct perf_event *output_event = NULL;
4573 struct file *output_file = NULL;
4574 struct perf_event *old_output;
4575 int fput_needed = 0;
4576 int ret = -EINVAL;
4577
4578 if (!output_fd)
4579 goto set;
4580
4581 output_file = fget_light(output_fd, &fput_needed);
4582 if (!output_file)
4583 return -EBADF;
4584
4585 if (output_file->f_op != &perf_fops)
4586 goto out;
4587
4588 output_event = output_file->private_data;
4589
4590 /* Don't chain output fds */
4591 if (output_event->output)
4592 goto out;
4593
4594 /* Don't set an output fd when we already have an output channel */
4595 if (event->data)
4596 goto out;
4597
4598 atomic_long_inc(&output_file->f_count);
4599
4600set:
4601 mutex_lock(&event->mmap_mutex);
4602 old_output = event->output;
4603 rcu_assign_pointer(event->output, output_event);
4604 mutex_unlock(&event->mmap_mutex);
4605
4606 if (old_output) {
4607 /*
4608 * we need to make sure no existing perf_output_*()
4609 * is still referencing this event.
4610 */
4611 synchronize_rcu();
4612 fput(old_output->filp);
4613 }
4614
4615 ret = 0;
4616out:
4617 fput_light(output_file, fput_needed);
4618 return ret;
4619}
4620
4621/**
4622 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4623 *
4624 * @attr_uptr: event_id type attributes for monitoring/sampling
4625 * @pid: target pid
4626 * @cpu: target cpu
4627 * @group_fd: group leader event fd
4628 */
4629SYSCALL_DEFINE5(perf_event_open,
4630 struct perf_event_attr __user *, attr_uptr,
4631 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4632{
4633 struct perf_event *event, *group_leader;
4634 struct perf_event_attr attr;
4635 struct perf_event_context *ctx;
4636 struct file *event_file = NULL;
4637 struct file *group_file = NULL;
4638 int fput_needed = 0;
4639 int fput_needed2 = 0;
4640 int err;
4641
4642 /* for future expandability... */
4643 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4644 return -EINVAL;
4645
4646 err = perf_copy_attr(attr_uptr, &attr);
4647 if (err)
4648 return err;
4649
4650 if (!attr.exclude_kernel) {
4651 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4652 return -EACCES;
4653 }
4654
4655 if (attr.freq) {
4656 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4657 return -EINVAL;
4658 }
4659
4660 /*
4661 * Get the target context (task or percpu):
4662 */
4663 ctx = find_get_context(pid, cpu);
4664 if (IS_ERR(ctx))
4665 return PTR_ERR(ctx);
4666
4667 /*
4668 * Look up the group leader (we will attach this event to it):
4669 */
4670 group_leader = NULL;
4671 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4672 err = -EINVAL;
4673 group_file = fget_light(group_fd, &fput_needed);
4674 if (!group_file)
4675 goto err_put_context;
4676 if (group_file->f_op != &perf_fops)
4677 goto err_put_context;
4678
4679 group_leader = group_file->private_data;
4680 /*
4681 * Do not allow a recursive hierarchy (this new sibling
4682 * becoming part of another group-sibling):
4683 */
4684 if (group_leader->group_leader != group_leader)
4685 goto err_put_context;
4686 /*
4687 * Do not allow to attach to a group in a different
4688 * task or CPU context:
4689 */
4690 if (group_leader->ctx != ctx)
4691 goto err_put_context;
4692 /*
4693 * Only a group leader can be exclusive or pinned
4694 */
4695 if (attr.exclusive || attr.pinned)
4696 goto err_put_context;
4697 }
4698
4699 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004700 NULL, NULL, GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004701 err = PTR_ERR(event);
4702 if (IS_ERR(event))
4703 goto err_put_context;
4704
4705 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4706 if (err < 0)
4707 goto err_free_put_context;
4708
4709 event_file = fget_light(err, &fput_needed2);
4710 if (!event_file)
4711 goto err_free_put_context;
4712
4713 if (flags & PERF_FLAG_FD_OUTPUT) {
4714 err = perf_event_set_output(event, group_fd);
4715 if (err)
4716 goto err_fput_free_put_context;
4717 }
4718
4719 event->filp = event_file;
4720 WARN_ON_ONCE(ctx->parent_ctx);
4721 mutex_lock(&ctx->mutex);
4722 perf_install_in_context(ctx, event, cpu);
4723 ++ctx->generation;
4724 mutex_unlock(&ctx->mutex);
4725
4726 event->owner = current;
4727 get_task_struct(current);
4728 mutex_lock(&current->perf_event_mutex);
4729 list_add_tail(&event->owner_entry, &current->perf_event_list);
4730 mutex_unlock(&current->perf_event_mutex);
4731
4732err_fput_free_put_context:
4733 fput_light(event_file, fput_needed2);
4734
4735err_free_put_context:
4736 if (err < 0)
4737 kfree(event);
4738
4739err_put_context:
4740 if (err < 0)
4741 put_ctx(ctx);
4742
4743 fput_light(group_file, fput_needed);
4744
4745 return err;
4746}
4747
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004748/**
4749 * perf_event_create_kernel_counter
4750 *
4751 * @attr: attributes of the counter to create
4752 * @cpu: cpu in which the counter is bound
4753 * @pid: task to profile
4754 */
4755struct perf_event *
4756perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004757 pid_t pid, perf_callback_t callback)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004758{
4759 struct perf_event *event;
4760 struct perf_event_context *ctx;
4761 int err;
4762
4763 /*
4764 * Get the target context (task or percpu):
4765 */
4766
4767 ctx = find_get_context(pid, cpu);
4768 if (IS_ERR(ctx))
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02004769 return NULL;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004770
4771 event = perf_event_alloc(attr, cpu, ctx, NULL,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004772 NULL, callback, GFP_KERNEL);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02004773 err = PTR_ERR(event);
4774 if (IS_ERR(event))
4775 goto err_put_context;
4776
4777 event->filp = NULL;
4778 WARN_ON_ONCE(ctx->parent_ctx);
4779 mutex_lock(&ctx->mutex);
4780 perf_install_in_context(ctx, event, cpu);
4781 ++ctx->generation;
4782 mutex_unlock(&ctx->mutex);
4783
4784 event->owner = current;
4785 get_task_struct(current);
4786 mutex_lock(&current->perf_event_mutex);
4787 list_add_tail(&event->owner_entry, &current->perf_event_list);
4788 mutex_unlock(&current->perf_event_mutex);
4789
4790 return event;
4791
4792err_put_context:
4793 if (err < 0)
4794 put_ctx(ctx);
4795
4796 return NULL;
4797}
4798EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4799
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004800/*
4801 * inherit a event from parent task to child task:
4802 */
4803static struct perf_event *
4804inherit_event(struct perf_event *parent_event,
4805 struct task_struct *parent,
4806 struct perf_event_context *parent_ctx,
4807 struct task_struct *child,
4808 struct perf_event *group_leader,
4809 struct perf_event_context *child_ctx)
4810{
4811 struct perf_event *child_event;
4812
4813 /*
4814 * Instead of creating recursive hierarchies of events,
4815 * we link inherited events back to the original parent,
4816 * which has a filp for sure, which we use as the reference
4817 * count:
4818 */
4819 if (parent_event->parent)
4820 parent_event = parent_event->parent;
4821
4822 child_event = perf_event_alloc(&parent_event->attr,
4823 parent_event->cpu, child_ctx,
4824 group_leader, parent_event,
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02004825 NULL, GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004826 if (IS_ERR(child_event))
4827 return child_event;
4828 get_ctx(child_ctx);
4829
4830 /*
4831 * Make the child state follow the state of the parent event,
4832 * not its attr.disabled bit. We hold the parent's mutex,
4833 * so we won't race with perf_event_{en, dis}able_family.
4834 */
4835 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4836 child_event->state = PERF_EVENT_STATE_INACTIVE;
4837 else
4838 child_event->state = PERF_EVENT_STATE_OFF;
4839
4840 if (parent_event->attr.freq)
4841 child_event->hw.sample_period = parent_event->hw.sample_period;
4842
Peter Zijlstra453f19e2009-11-20 22:19:43 +01004843 child_event->overflow_handler = parent_event->overflow_handler;
4844
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004845 /*
4846 * Link it up in the child's context:
4847 */
4848 add_event_to_ctx(child_event, child_ctx);
4849
4850 /*
4851 * Get a reference to the parent filp - we will fput it
4852 * when the child event exits. This is safe to do because
4853 * we are in the parent and we know that the filp still
4854 * exists and has a nonzero count:
4855 */
4856 atomic_long_inc(&parent_event->filp->f_count);
4857
4858 /*
4859 * Link this into the parent event's child list
4860 */
4861 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4862 mutex_lock(&parent_event->child_mutex);
4863 list_add_tail(&child_event->child_list, &parent_event->child_list);
4864 mutex_unlock(&parent_event->child_mutex);
4865
4866 return child_event;
4867}
4868
4869static int inherit_group(struct perf_event *parent_event,
4870 struct task_struct *parent,
4871 struct perf_event_context *parent_ctx,
4872 struct task_struct *child,
4873 struct perf_event_context *child_ctx)
4874{
4875 struct perf_event *leader;
4876 struct perf_event *sub;
4877 struct perf_event *child_ctr;
4878
4879 leader = inherit_event(parent_event, parent, parent_ctx,
4880 child, NULL, child_ctx);
4881 if (IS_ERR(leader))
4882 return PTR_ERR(leader);
4883 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4884 child_ctr = inherit_event(sub, parent, parent_ctx,
4885 child, leader, child_ctx);
4886 if (IS_ERR(child_ctr))
4887 return PTR_ERR(child_ctr);
4888 }
4889 return 0;
4890}
4891
4892static void sync_child_event(struct perf_event *child_event,
4893 struct task_struct *child)
4894{
4895 struct perf_event *parent_event = child_event->parent;
4896 u64 child_val;
4897
4898 if (child_event->attr.inherit_stat)
4899 perf_event_read_event(child_event, child);
4900
4901 child_val = atomic64_read(&child_event->count);
4902
4903 /*
4904 * Add back the child's count to the parent's count:
4905 */
4906 atomic64_add(child_val, &parent_event->count);
4907 atomic64_add(child_event->total_time_enabled,
4908 &parent_event->child_total_time_enabled);
4909 atomic64_add(child_event->total_time_running,
4910 &parent_event->child_total_time_running);
4911
4912 /*
4913 * Remove this event from the parent's list
4914 */
4915 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4916 mutex_lock(&parent_event->child_mutex);
4917 list_del_init(&child_event->child_list);
4918 mutex_unlock(&parent_event->child_mutex);
4919
4920 /*
4921 * Release the parent event, if this was the last
4922 * reference to it.
4923 */
4924 fput(parent_event->filp);
4925}
4926
4927static void
4928__perf_event_exit_task(struct perf_event *child_event,
4929 struct perf_event_context *child_ctx,
4930 struct task_struct *child)
4931{
4932 struct perf_event *parent_event;
4933
4934 update_event_times(child_event);
4935 perf_event_remove_from_context(child_event);
4936
4937 parent_event = child_event->parent;
4938 /*
4939 * It can happen that parent exits first, and has events
4940 * that are still around due to the child reference. These
4941 * events need to be zapped - but otherwise linger.
4942 */
4943 if (parent_event) {
4944 sync_child_event(child_event, child);
4945 free_event(child_event);
4946 }
4947}
4948
4949/*
4950 * When a child task exits, feed back event values to parent events.
4951 */
4952void perf_event_exit_task(struct task_struct *child)
4953{
4954 struct perf_event *child_event, *tmp;
4955 struct perf_event_context *child_ctx;
4956 unsigned long flags;
4957
4958 if (likely(!child->perf_event_ctxp)) {
4959 perf_event_task(child, NULL, 0);
4960 return;
4961 }
4962
4963 local_irq_save(flags);
4964 /*
4965 * We can't reschedule here because interrupts are disabled,
4966 * and either child is current or it is a task that can't be
4967 * scheduled, so we are now safe from rescheduling changing
4968 * our context.
4969 */
4970 child_ctx = child->perf_event_ctxp;
4971 __perf_event_task_sched_out(child_ctx);
4972
4973 /*
4974 * Take the context lock here so that if find_get_context is
4975 * reading child->perf_event_ctxp, we wait until it has
4976 * incremented the context's refcount before we do put_ctx below.
4977 */
4978 spin_lock(&child_ctx->lock);
4979 child->perf_event_ctxp = NULL;
4980 /*
4981 * If this context is a clone; unclone it so it can't get
4982 * swapped to another process while we're removing all
4983 * the events from it.
4984 */
4985 unclone_ctx(child_ctx);
4986 spin_unlock_irqrestore(&child_ctx->lock, flags);
4987
4988 /*
4989 * Report the task dead after unscheduling the events so that we
4990 * won't get any samples after PERF_RECORD_EXIT. We can however still
4991 * get a few PERF_RECORD_READ events.
4992 */
4993 perf_event_task(child, child_ctx, 0);
4994
4995 /*
4996 * We can recurse on the same lock type through:
4997 *
4998 * __perf_event_exit_task()
4999 * sync_child_event()
5000 * fput(parent_event->filp)
5001 * perf_release()
5002 * mutex_lock(&ctx->mutex)
5003 *
5004 * But since its the parent context it won't be the same instance.
5005 */
5006 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5007
5008again:
5009 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
5010 group_entry)
5011 __perf_event_exit_task(child_event, child_ctx, child);
5012
5013 /*
5014 * If the last event was a group event, it will have appended all
5015 * its siblings to the list, but we obtained 'tmp' before that which
5016 * will still point to the list head terminating the iteration.
5017 */
5018 if (!list_empty(&child_ctx->group_list))
5019 goto again;
5020
5021 mutex_unlock(&child_ctx->mutex);
5022
5023 put_ctx(child_ctx);
5024}
5025
5026/*
5027 * free an unexposed, unused context as created by inheritance by
5028 * init_task below, used by fork() in case of fail.
5029 */
5030void perf_event_free_task(struct task_struct *task)
5031{
5032 struct perf_event_context *ctx = task->perf_event_ctxp;
5033 struct perf_event *event, *tmp;
5034
5035 if (!ctx)
5036 return;
5037
5038 mutex_lock(&ctx->mutex);
5039again:
5040 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
5041 struct perf_event *parent = event->parent;
5042
5043 if (WARN_ON_ONCE(!parent))
5044 continue;
5045
5046 mutex_lock(&parent->child_mutex);
5047 list_del_init(&event->child_list);
5048 mutex_unlock(&parent->child_mutex);
5049
5050 fput(parent->filp);
5051
5052 list_del_event(event, ctx);
5053 free_event(event);
5054 }
5055
5056 if (!list_empty(&ctx->group_list))
5057 goto again;
5058
5059 mutex_unlock(&ctx->mutex);
5060
5061 put_ctx(ctx);
5062}
5063
5064/*
5065 * Initialize the perf_event context in task_struct
5066 */
5067int perf_event_init_task(struct task_struct *child)
5068{
5069 struct perf_event_context *child_ctx, *parent_ctx;
5070 struct perf_event_context *cloned_ctx;
5071 struct perf_event *event;
5072 struct task_struct *parent = current;
5073 int inherited_all = 1;
5074 int ret = 0;
5075
5076 child->perf_event_ctxp = NULL;
5077
5078 mutex_init(&child->perf_event_mutex);
5079 INIT_LIST_HEAD(&child->perf_event_list);
5080
5081 if (likely(!parent->perf_event_ctxp))
5082 return 0;
5083
5084 /*
5085 * This is executed from the parent task context, so inherit
5086 * events that have been marked for cloning.
5087 * First allocate and initialize a context for the child.
5088 */
5089
5090 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5091 if (!child_ctx)
5092 return -ENOMEM;
5093
5094 __perf_event_init_context(child_ctx, child);
5095 child->perf_event_ctxp = child_ctx;
5096 get_task_struct(child);
5097
5098 /*
5099 * If the parent's context is a clone, pin it so it won't get
5100 * swapped under us.
5101 */
5102 parent_ctx = perf_pin_task_context(parent);
5103
5104 /*
5105 * No need to check if parent_ctx != NULL here; since we saw
5106 * it non-NULL earlier, the only reason for it to become NULL
5107 * is if we exit, and since we're currently in the middle of
5108 * a fork we can't be exiting at the same time.
5109 */
5110
5111 /*
5112 * Lock the parent list. No need to lock the child - not PID
5113 * hashed yet and not running, so nobody can access it.
5114 */
5115 mutex_lock(&parent_ctx->mutex);
5116
5117 /*
5118 * We dont have to disable NMIs - we are only looking at
5119 * the list, not manipulating it:
5120 */
Xiao Guangrong27f99942009-09-25 13:54:01 +08005121 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005122
5123 if (!event->attr.inherit) {
5124 inherited_all = 0;
5125 continue;
5126 }
5127
5128 ret = inherit_group(event, parent, parent_ctx,
5129 child, child_ctx);
5130 if (ret) {
5131 inherited_all = 0;
5132 break;
5133 }
5134 }
5135
5136 if (inherited_all) {
5137 /*
5138 * Mark the child context as a clone of the parent
5139 * context, or of whatever the parent is a clone of.
5140 * Note that if the parent is a clone, it could get
5141 * uncloned at any point, but that doesn't matter
5142 * because the list of events and the generation
5143 * count can't have changed since we took the mutex.
5144 */
5145 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5146 if (cloned_ctx) {
5147 child_ctx->parent_ctx = cloned_ctx;
5148 child_ctx->parent_gen = parent_ctx->parent_gen;
5149 } else {
5150 child_ctx->parent_ctx = parent_ctx;
5151 child_ctx->parent_gen = parent_ctx->generation;
5152 }
5153 get_ctx(child_ctx->parent_ctx);
5154 }
5155
5156 mutex_unlock(&parent_ctx->mutex);
5157
5158 perf_unpin_context(parent_ctx);
5159
5160 return ret;
5161}
5162
5163static void __cpuinit perf_event_init_cpu(int cpu)
5164{
5165 struct perf_cpu_context *cpuctx;
5166
5167 cpuctx = &per_cpu(perf_cpu_context, cpu);
5168 __perf_event_init_context(&cpuctx->ctx, NULL);
5169
5170 spin_lock(&perf_resource_lock);
5171 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5172 spin_unlock(&perf_resource_lock);
5173
5174 hw_perf_event_setup(cpu);
5175}
5176
5177#ifdef CONFIG_HOTPLUG_CPU
5178static void __perf_event_exit_cpu(void *info)
5179{
5180 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5181 struct perf_event_context *ctx = &cpuctx->ctx;
5182 struct perf_event *event, *tmp;
5183
5184 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
5185 __perf_event_remove_from_context(event);
5186}
5187static void perf_event_exit_cpu(int cpu)
5188{
5189 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5190 struct perf_event_context *ctx = &cpuctx->ctx;
5191
5192 mutex_lock(&ctx->mutex);
5193 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5194 mutex_unlock(&ctx->mutex);
5195}
5196#else
5197static inline void perf_event_exit_cpu(int cpu) { }
5198#endif
5199
5200static int __cpuinit
5201perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5202{
5203 unsigned int cpu = (long)hcpu;
5204
5205 switch (action) {
5206
5207 case CPU_UP_PREPARE:
5208 case CPU_UP_PREPARE_FROZEN:
5209 perf_event_init_cpu(cpu);
5210 break;
5211
5212 case CPU_ONLINE:
5213 case CPU_ONLINE_FROZEN:
5214 hw_perf_event_setup_online(cpu);
5215 break;
5216
5217 case CPU_DOWN_PREPARE:
5218 case CPU_DOWN_PREPARE_FROZEN:
5219 perf_event_exit_cpu(cpu);
5220 break;
5221
5222 default:
5223 break;
5224 }
5225
5226 return NOTIFY_OK;
5227}
5228
5229/*
5230 * This has to have a higher priority than migration_notifier in sched.c.
5231 */
5232static struct notifier_block __cpuinitdata perf_cpu_nb = {
5233 .notifier_call = perf_cpu_notify,
5234 .priority = 20,
5235};
5236
5237void __init perf_event_init(void)
5238{
5239 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5240 (void *)(long)smp_processor_id());
5241 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5242 (void *)(long)smp_processor_id());
5243 register_cpu_notifier(&perf_cpu_nb);
5244}
5245
5246static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5247{
5248 return sprintf(buf, "%d\n", perf_reserved_percpu);
5249}
5250
5251static ssize_t
5252perf_set_reserve_percpu(struct sysdev_class *class,
5253 const char *buf,
5254 size_t count)
5255{
5256 struct perf_cpu_context *cpuctx;
5257 unsigned long val;
5258 int err, cpu, mpt;
5259
5260 err = strict_strtoul(buf, 10, &val);
5261 if (err)
5262 return err;
5263 if (val > perf_max_events)
5264 return -EINVAL;
5265
5266 spin_lock(&perf_resource_lock);
5267 perf_reserved_percpu = val;
5268 for_each_online_cpu(cpu) {
5269 cpuctx = &per_cpu(perf_cpu_context, cpu);
5270 spin_lock_irq(&cpuctx->ctx.lock);
5271 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5272 perf_max_events - perf_reserved_percpu);
5273 cpuctx->max_pertask = mpt;
5274 spin_unlock_irq(&cpuctx->ctx.lock);
5275 }
5276 spin_unlock(&perf_resource_lock);
5277
5278 return count;
5279}
5280
5281static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5282{
5283 return sprintf(buf, "%d\n", perf_overcommit);
5284}
5285
5286static ssize_t
5287perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5288{
5289 unsigned long val;
5290 int err;
5291
5292 err = strict_strtoul(buf, 10, &val);
5293 if (err)
5294 return err;
5295 if (val > 1)
5296 return -EINVAL;
5297
5298 spin_lock(&perf_resource_lock);
5299 perf_overcommit = val;
5300 spin_unlock(&perf_resource_lock);
5301
5302 return count;
5303}
5304
5305static SYSDEV_CLASS_ATTR(
5306 reserve_percpu,
5307 0644,
5308 perf_show_reserve_percpu,
5309 perf_set_reserve_percpu
5310 );
5311
5312static SYSDEV_CLASS_ATTR(
5313 overcommit,
5314 0644,
5315 perf_show_overcommit,
5316 perf_set_overcommit
5317 );
5318
5319static struct attribute *perfclass_attrs[] = {
5320 &attr_reserve_percpu.attr,
5321 &attr_overcommit.attr,
5322 NULL
5323};
5324
5325static struct attribute_group perfclass_attr_group = {
5326 .attrs = perfclass_attrs,
5327 .name = "perf_events",
5328};
5329
5330static int __init perf_event_sysfs_init(void)
5331{
5332 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5333 &perfclass_attr_group);
5334}
5335device_initcall(perf_event_sysfs_init);