/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>

#include <asm/irq_regs.h>

struct remote_function_call {
	struct task_struct	*p;
	int			(*func)(void *info);
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		tfc->ret = -EAGAIN;
		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -ESRCH, /* No such (running) process */
	};

	if (task_curr(p))
		smp_call_function_single(task_cpu(p), remote_function, &data, 1);

	return data.ret;
}

/**
 * cpu_function_call - call a function on a given cpu
 * @cpu:	the cpu on which to run the function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}
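
/*
 * Example usage (see perf_remove_from_context() and perf_event_disable()
 * below):
 *
 *	cpu_function_call(event->cpu, __perf_remove_from_context, event);
 *	task_function_call(task, __perf_event_disable, event);
 *
 * The callback runs on the target CPU in IPI context via
 * smp_call_function_single(), so it must not sleep.
 */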

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP)

enum event_type_t {
	EVENT_FLEXIBLE = 0x1,
	EVENT_PINNED = 0x2,
	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
atomic_t perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;
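
/*
 * Example (assuming the usual sysctl hookup in kernel/sysctl.c): the knob
 * above is exposed as /proc/sys/kernel/perf_event_paranoid, so e.g.
 *
 *	echo 2 > /proc/sys/kernel/perf_event_paranoid
 *
 * restricts unprivileged users to per-task, user-space-only profiling.
 */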

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE 100000
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly =
	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);

int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);

	return 0;
}
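
/*
 * Worked example (assuming HZ=1000): with the default sample rate of
 * 100000, max_samples_per_tick = DIV_ROUND_UP(100000, 1000) = 100 samples
 * per timer tick; writing 250000 to the sysctl raises it to 250.
 */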

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
			      enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
			     enum event_type_t event_type,
			     struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
	return "pmu";
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

#ifdef CONFIG_CGROUP_PERF

/*
 * Must ensure cgroup is pinned (css_get) before calling
 * this function. In other words, we cannot call this function
 * if there is no cgroup event for the current CPU context.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
	return container_of(task_subsys_state(task, perf_subsys_id),
			struct perf_cgroup, css);
}

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	return !event->cgrp || event->cgrp == cpuctx->cgrp;
}

static inline void perf_get_cgroup(struct perf_event *event)
{
	css_get(&event->cgrp->css);
}

static inline void perf_put_cgroup(struct perf_event *event)
{
	css_put(&event->cgrp->css);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
	perf_put_cgroup(event);
	event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
	return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	struct perf_cgroup_info *t;

	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
	struct perf_cgroup_info *info;
	u64 now;

	now = perf_clock();

	info = this_cpu_ptr(cgrp->info);

	info->time += now - info->timestamp;
	info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
	if (cgrp_out)
		__update_cgrp_time(cgrp_out);
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
	struct perf_cgroup *cgrp;

	/*
	 * ensure we access cgroup data only when needed and
	 * when we know the cgroup is pinned (css_get)
	 */
	if (!is_cgroup_event(event))
		return;

	cgrp = perf_cgroup_from_task(current);
	/*
	 * Do not update time when cgroup is not active
	 */
	if (cgrp == event->cgrp)
		__update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
	struct perf_cgroup *cgrp;
	struct perf_cgroup_info *info;

	/*
	 * ctx->lock held by caller
	 * ensure we do not access cgroup data
	 * unless we have the cgroup pinned (css_get)
	 */
	if (!task || !ctx->nr_cgroups)
		return;

	cgrp = perf_cgroup_from_task(task);
	info = this_cpu_ptr(cgrp->info);
	info->timestamp = ctx->timestamp;
}

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
void perf_cgroup_switch(struct task_struct *task, int mode)
{
	struct perf_cpu_context *cpuctx;
	struct pmu *pmu;
	unsigned long flags;

	/*
	 * disable interrupts to avoid getting nr_cgroup
	 * changes via __perf_event_disable(). Also
	 * avoids preemption.
	 */
	local_irq_save(flags);

	/*
	 * we reschedule only in the presence of cgroup
	 * constrained events.
	 */
	rcu_read_lock();

	list_for_each_entry_rcu(pmu, &pmus, entry) {

		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

		perf_pmu_disable(cpuctx->ctx.pmu);

		/*
		 * perf_cgroup_events says at least one
		 * context on this CPU has cgroup events.
		 *
		 * ctx->nr_cgroups reports the number of cgroup
		 * events for a context.
		 */
		if (cpuctx->ctx.nr_cgroups > 0) {

			if (mode & PERF_CGROUP_SWOUT) {
				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
				/*
				 * must not be done before ctxswout due
				 * to event_filter_match() in event_sched_out()
				 */
				cpuctx->cgrp = NULL;
			}

			if (mode & PERF_CGROUP_SWIN) {
				/*
				 * set cgrp before ctxsw in to
				 * allow event_filter_match() to not
				 * have to pass task around
				 */
				cpuctx->cgrp = perf_cgroup_from_task(task);
				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
			}
		}

		perf_pmu_enable(cpuctx->ctx.pmu);
	}

	rcu_read_unlock();

	local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task)
{
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
}

static inline void perf_cgroup_sched_in(struct task_struct *task)
{
	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	struct perf_cgroup *cgrp;
	struct cgroup_subsys_state *css;
	struct file *file;
	int ret = 0, fput_needed;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	css = cgroup_css_from_dir(file, perf_subsys_id);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		goto out;
	}

	cgrp = container_of(css, struct perf_cgroup, css);
	event->cgrp = cgrp;

	/* must be done before we fput() the file */
	perf_get_cgroup(event);

	/*
	 * all events in a group must monitor
	 * the same cgroup because a task belongs
	 * to only one perf cgroup at a time
	 */
	if (group_leader && group_leader->cgrp != cgrp) {
		perf_detach_cgroup(event);
		ret = -EINVAL;
	}
out:
	fput_light(file, fput_needed);
	return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
	struct perf_cgroup_info *t;
	t = per_cpu_ptr(event->cgrp->info, event->cpu);
	event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
	/*
	 * when the current task's perf cgroup does not match
	 * the event's, we need to remember to call the
	 * perf_cgroup_mark_enabled() function the first time a task with
	 * a matching perf cgroup is scheduled in.
	 */
	if (is_cgroup_event(event) && !perf_cgroup_match(event))
		event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
	struct perf_event *sub;
	u64 tstamp = perf_event_time(event);

	if (!event->cgrp_defer_enabled)
		return;

	event->cgrp_defer_enabled = 0;

	event->tstamp_enabled = tstamp - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry) {
		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
			sub->cgrp_defer_enabled = 0;
		}
	}
}
#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
	return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
	return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
	return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				      struct perf_event_attr *attr,
				      struct perf_event *group_leader)
{
	return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
			  struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
	return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
{
}
#endif

void perf_pmu_disable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!(*count)++)
		pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
	int *count = this_cpu_ptr(pmu->pmu_disable_count);
	if (!--(*count))
		pmu->pmu_enable(pmu);
}
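
/*
 * Note (illustrative): perf_pmu_disable()/perf_pmu_enable() nest via the
 * per-cpu pmu_disable_count, so a sequence like
 *
 *	perf_pmu_disable(pmu);
 *	perf_pmu_disable(pmu);
 *	perf_pmu_enable(pmu);
 *	perf_pmu_enable(pmu);
 *
 * only invokes pmu->pmu_disable() on the first call and pmu->pmu_enable()
 * on the last one (the 0->1 and 1->0 transitions of the count).
 */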

static DEFINE_PER_CPU(struct list_head, rotation_list);

/*
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
 */
static void perf_pmu_rotate_start(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
	struct list_head *head = &__get_cpu_var(rotation_list);

	WARN_ON(!irqs_disabled());

	if (list_empty(&cpuctx->rotation_list))
		list_add(&cpuctx->rotation_list, head);
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
	/*
	 * only top level events have the pid namespace they were created in
	 */
	if (event->parent)
		event = event->parent;

	return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
retry:
	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed. Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so. If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task. This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, ctxn, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;

	if (is_cgroup_event(event))
		return perf_cgroup_event_time(event);

	return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;
	/*
	 * in cgroup mode, time_enabled represents
	 * the time the event was enabled AND active
	 * tasks were in the monitored cgroup. This is
	 * independent of the activity of the context as
	 * there may be a mix of cgroup and non-cgroup events.
	 *
	 * That is why we treat cgroup events differently
	 * here.
	 */
	if (is_cgroup_event(event))
		run_end = perf_event_time(event);
	else if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = perf_event_time(event);

	event->total_time_running = run_end - event->tstamp_running;

}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		struct list_head *list;

		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

	if (is_cgroup_event(event)) {
		ctx->nr_cgroups++;
		/*
		 * one more event:
		 * - that has cgroup constraint on event->cpu
		 * - that may need work on context switch
		 */
		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
		jump_label_inc(&perf_sched_events);
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	if (!ctx->nr_events)
		perf_pmu_rotate_start(ctx->pmu);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__read_size(struct perf_event *event)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (event->attr.read_format & PERF_FORMAT_GROUP) {
		nr += event->group_leader->nr_siblings;
		size += sizeof(u64);
	}

	size += entry * nr;
	event->read_size = size;
}
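
/*
 * Worked example (illustrative): for a single event opened with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID,
 * entry = 8 + 8 (value + id), size = 8 (time_enabled), nr = 1,
 * so read_size = 8 + 16 = 24 bytes per read().
 */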

static void perf_event__header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	perf_event__read_size(event);

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(data->ip);

	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(data->addr);

	if (sample_type & PERF_SAMPLE_PERIOD)
		size += sizeof(data->period);

	if (sample_type & PERF_SAMPLE_READ)
		size += event->read_size;

	event->header_size = size;
}

static void perf_event__id_header_size(struct perf_event *event)
{
	struct perf_sample_data *data;
	u64 sample_type = event->attr.sample_type;
	u16 size = 0;

	if (sample_type & PERF_SAMPLE_TID)
		size += sizeof(data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(data->time);

	if (sample_type & PERF_SAMPLE_ID)
		size += sizeof(data->id);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		size += sizeof(data->stream_id);

	if (sample_type & PERF_SAMPLE_CPU)
		size += sizeof(data->cpu_entry);

	event->id_header_size = size;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader, *pos;

	/*
	 * We can have double attach due to group movement in perf_event_open.
	 */
	if (event->attach_state & PERF_ATTACH_GROUP)
		return;

	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
			!is_software_event(event))
		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;

	perf_event__header_size(group_leader);

	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
		perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	if (is_cgroup_event(event)) {
		ctx->nr_cgroups--;
		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
		jump_label_dec(&perf_sched_events);
	}

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		list_del_init(&event->group_entry);

	update_group_times(event);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct list_head *list = NULL;

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		goto out;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we are on.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		if (list)
			list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;
	}

out:
	perf_event__header_size(event->group_leader);

	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
		perf_event__header_size(tmp);
}

static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id())
	    && perf_cgroup_match(event);
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);
	u64 delta;
	/*
	 * An event which could not be activated because of
	 * filter mismatch still needs to have its timings
	 * maintained, otherwise bogus information is returned
	 * via read() for time_enabled, time_running:
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE
	    && !event_filter_match(event)) {
		delta = tstamp - event->tstamp_stopped;
		event->tstamp_running += delta;
		event->tstamp_stopped = tstamp;
	}

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = tstamp;
	event->pmu->del(event, 0);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;
	int state = group_event->state;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static int __perf_remove_from_context(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	raw_spin_lock(&ctx->lock);
	event_sched_out(event, cpuctx, ctx);
	list_del_event(event, ctx);
	raw_spin_unlock(&ctx->lock);

	return 0;
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	lockdep_assert_held(&ctx->mutex);

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		cpu_function_call(event->cpu, __perf_remove_from_context, event);
		return;
	}

retry:
	if (!task_function_call(task, __perf_remove_from_context, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If we failed to find a running task, but find the context active now
	 * that we've acquired the ctx->lock, retry.
	 */
	if (ctx->is_active) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since the task isn't running, it's safe to remove the event; our
	 * holding the ctx->lock ensures the task won't get scheduled in.
	 */
	list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static int __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

	/*
	 * If this is a per-task event, we need to check whether this
	 * event's task is the current task on this cpu.
	 *
	 * Can trigger due to concurrent perf_event_context_sched_out()
	 * flipping contexts around.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return -EINVAL;

	raw_spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_cgrp_time_from_event(event);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock(&ctx->lock);

	return 0;
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		cpu_function_call(event->cpu, __perf_event_disable, event);
		return;
	}

retry:
	if (!task_function_call(task, __perf_event_disable, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		raw_spin_unlock_irq(&ctx->lock);
		/*
		 * Reload the task pointer, it might have been changed by
		 * a concurrent perf_event_context_sched_out().
		 */
		task = ctx->task;
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}
	raw_spin_unlock_irq(&ctx->lock);
}

static void perf_set_shadow_time(struct perf_event *event,
				 struct perf_event_context *ctx,
				 u64 tstamp)
{
	/*
	 * use the correct time source for the time snapshot
	 *
	 * We could get by without this by leveraging the
	 * fact that to get to this function, the caller
	 * has most likely already called update_context_time()
	 * and update_cgrp_time_xx() and thus both timestamps
	 * are identical (or very close). Given that tstamp is
	 * already adjusted for cgroup, we could say that:
	 *	tstamp - ctx->timestamp
	 * is equivalent to
	 *	tstamp - cgrp->timestamp.
	 *
	 * Then, in perf_output_read(), the calculation would
	 * work with no changes because:
	 * - event is guaranteed scheduled in
	 * - no scheduled out in between
	 * - thus the timestamp would be the same
	 *
	 * But this is a bit hairy.
	 *
	 * So instead, we have an explicit cgroup call to remain
	 * within the time source all along. We believe it
	 * is cleaner and simpler to understand.
	 */
	if (is_cgroup_event(event))
		perf_cgroup_set_shadow_time(event, tstamp);
	else
		event->shadow_ctx_time = tstamp - ctx->timestamp;
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);

static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);

	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();

	/*
	 * Unthrottle events, since we scheduled we might have missed several
	 * ticks already, also for a heavily scheduling task there is little
	 * guarantee it'll get a tick in a timely manner.
	 */
	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
		perf_log_throttle(event, 1);
		event->hw.interrupts = 0;
	}

	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->add(event, PERF_EF_START)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		return -EAGAIN;
	}

	event->tstamp_running += tstamp - event->tstamp_stopped;

	perf_set_shadow_time(event, ctx, tstamp);

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	struct pmu *pmu = group_event->pmu;
	u64 now = ctx->time;
	bool simulate = false;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	pmu->start_txn(pmu);

	if (event_sched_in(group_event, cpuctx, ctx)) {
		pmu->cancel_txn(pmu);
		return -EAGAIN;
	}

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!pmu->commit_txn(pmu))
		return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 * The events up to the failed event are scheduled out normally,
	 * tstamp_stopped will be updated.
	 *
	 * The failed events and the remaining siblings need to have
	 * their timings updated as if they had gone through event_sched_in()
	 * and event_sched_out(). This is required to get consistent timings
	 * across the group. This also takes care of the case where the group
	 * could never be scheduled by ensuring tstamp_stopped is set to mark
	 * the time the event was actually stopped, such that time delta
	 * calculation in update_event_times() is correct.
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			simulate = true;

		if (simulate) {
			event->tstamp_running += now - event->tstamp_stopped;
			event->tstamp_stopped = now;
		} else {
			event_sched_out(event, cpuctx, ctx);
		}
	}
	event_sched_out(group_event, cpuctx, ctx);

	pmu->cancel_txn(pmu);

	return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_flags & PERF_GROUP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}
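
/*
 * Example (illustrative): a group made up only of software events always
 * passes the check above; a hardware group created with attr.exclusive set
 * is rejected whenever another hardware event is already on the CPU, and
 * once such an exclusive group is on, no further hardware groups may join.
 */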
1451
1452static void add_event_to_ctx(struct perf_event *event,
1453 struct perf_event_context *ctx)
1454{
Stephane Eranian41587552011-01-03 18:20:01 +02001455 u64 tstamp = perf_event_time(event);
1456
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001457 list_add_event(event, ctx);
Peter Zijlstra8a495422010-05-27 15:47:49 +02001458 perf_group_attach(event);
Stephane Eranian41587552011-01-03 18:20:01 +02001459 event->tstamp_enabled = tstamp;
1460 event->tstamp_running = tstamp;
1461 event->tstamp_stopped = tstamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001462}
1463
Stephane Eraniane5d13672011-02-14 11:20:01 +02001464static void perf_event_context_sched_in(struct perf_event_context *ctx,
1465 struct task_struct *tsk);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001466
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001467/*
1468 * Cross CPU call to install and enable a performance event
1469 *
1470 * Must be called with ctx->mutex held
1471 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001472static int __perf_install_in_context(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001473{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001474 struct perf_event *event = info;
1475 struct perf_event_context *ctx = event->ctx;
1476 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001477 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001478 int err;
1479
1480 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001481 * In case we're installing a new context to an already running task,
1482 * could also happen before perf_event_task_sched_in() on architectures
1483 * which do context switches with IRQs enabled.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001484 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001485 if (ctx->task && !cpuctx->task_ctx)
Stephane Eraniane5d13672011-02-14 11:20:01 +02001486 perf_event_context_sched_in(ctx, ctx->task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001487
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001488 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001489 ctx->is_active = 1;
1490 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001491 /*
1492 * update cgrp time only if current cgrp
1493 * matches event->cgrp. Must be done before
1494 * calling add_event_to_ctx()
1495 */
1496 update_cgrp_time_from_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001497
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001498 add_event_to_ctx(event, ctx);
1499
Stephane Eranian5632ab12011-01-03 18:20:01 +02001500 if (!event_filter_match(event))
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001501 goto unlock;
1502
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001503 /*
1504 * Don't put the event on if it is disabled or if
1505 * it is in a group and the group isn't on.
1506 */
1507 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1508 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1509 goto unlock;
1510
1511 /*
1512 * An exclusive event can't go on if there are already active
1513 * hardware events, and no hardware event can go on if there
1514 * is already an exclusive event on.
1515 */
1516 if (!group_can_go_on(event, cpuctx, 1))
1517 err = -EEXIST;
1518 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001519 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001520
1521 if (err) {
1522 /*
1523 * This event couldn't go on. If it is in a group
1524 * then we have to pull the whole group off.
1525 * If the event group is pinned then put it in error state.
1526 */
1527 if (leader != event)
1528 group_sched_out(leader, cpuctx, ctx);
1529 if (leader->attr.pinned) {
1530 update_group_times(leader);
1531 leader->state = PERF_EVENT_STATE_ERROR;
1532 }
1533 }
1534
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001535unlock:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001536 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001537
1538 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001539}
1540
1541/*
1542 * Attach a performance event to a context
1543 *
1544 * First we add the event to the list with the hardware enable bit
1545 * in event->hw_config cleared.
1546 *
1547 * If the event is attached to a task which is on a CPU we use an smp
1548 * call to enable it in the task context. The task might have been
1549 * scheduled away, but we check this in the smp call again.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001550 */
1551static void
1552perf_install_in_context(struct perf_event_context *ctx,
1553 struct perf_event *event,
1554 int cpu)
1555{
1556 struct task_struct *task = ctx->task;
1557
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001558 lockdep_assert_held(&ctx->mutex);
1559
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02001560 event->ctx = ctx;
1561
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001562 if (!task) {
1563 /*
1564 * Per cpu events are installed via an smp call and
André Goddard Rosaaf901ca2009-11-14 13:09:05 -02001565 * the install is always successful.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001566 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001567 cpu_function_call(cpu, __perf_install_in_context, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001568 return;
1569 }
1570
1571retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001572 if (!task_function_call(task, __perf_install_in_context, event))
1573 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001574
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001575 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001576 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001577 * If we failed to find a running task, but find the context active now
1578 * that we've acquired the ctx->lock, retry.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001579 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001580 if (ctx->is_active) {
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001581 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001582 goto retry;
1583 }
1584
1585 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001586 * Since the task isn't running, it's safe to add the event; holding
1587 * the ctx->lock ensures the task won't get scheduled in.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001588 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001589 add_event_to_ctx(event, ctx);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001590 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001591}
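/*
 * Minimal usage sketch (illustrative only; error handling and pin-count
 * management elided), assuming the caller created @event and obtained
 * @ctx via find_get_context():
 *
 *	mutex_lock(&ctx->mutex);
 *	perf_install_in_context(ctx, event, event->cpu);
 *	mutex_unlock(&ctx->mutex);
 */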
1592
1593/*
1594 * Put an event into inactive state and update time fields.
1595 * Enabling the leader of a group effectively enables all
1596 * the group members that aren't explicitly disabled, so we
1597 * have to update their ->tstamp_enabled also.
1598 * Note: this works for group members as well as group leaders
1599 * since the non-leader members' sibling_lists will be empty.
1600 */
1601static void __perf_event_mark_enabled(struct perf_event *event,
1602 struct perf_event_context *ctx)
1603{
1604 struct perf_event *sub;
Stephane Eranian41587552011-01-03 18:20:01 +02001605 u64 tstamp = perf_event_time(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001606
1607 event->state = PERF_EVENT_STATE_INACTIVE;
Stephane Eranian41587552011-01-03 18:20:01 +02001608 event->tstamp_enabled = tstamp - event->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001609 list_for_each_entry(sub, &event->sibling_list, group_entry) {
Stephane Eranian41587552011-01-03 18:20:01 +02001610 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1611 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001612 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001613}
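/*
 * Worked example of the adjustment above (illustrative numbers): if the
 * event had accumulated total_time_enabled = 30 before being disabled and
 * is marked enabled again at tstamp = 100, then tstamp_enabled becomes
 * 100 - 30 = 70, so a later update at time T yields
 * total_time_enabled = T - 70, which still accounts for the earlier 30.
 */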
1614
1615/*
1616 * Cross CPU call to enable a performance event
1617 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001618static int __perf_event_enable(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001619{
1620 struct perf_event *event = info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001621 struct perf_event_context *ctx = event->ctx;
1622 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001623 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001624 int err;
1625
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001626 if (WARN_ON_ONCE(!ctx->is_active))
1627 return -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001628
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001629 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001630 update_context_time(ctx);
1631
1632 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1633 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001634
1635 /*
1636 * set current task's cgroup time reference point
1637 */
Stephane Eranian3f7cce32011-02-18 14:40:01 +02001638 perf_cgroup_set_timestamp(current, ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001639
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001640 __perf_event_mark_enabled(event, ctx);
1641
Stephane Eraniane5d13672011-02-14 11:20:01 +02001642 if (!event_filter_match(event)) {
1643 if (is_cgroup_event(event))
1644 perf_cgroup_defer_enabled(event);
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001645 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001646 }
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001647
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001648 /*
1649 * If the event is in a group and isn't the group leader,
1650 * then don't put it on unless the group is on.
1651 */
1652 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1653 goto unlock;
1654
1655 if (!group_can_go_on(event, cpuctx, 1)) {
1656 err = -EEXIST;
1657 } else {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001658 if (event == leader)
Peter Zijlstra6e377382010-02-11 13:21:58 +01001659 err = group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001660 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001661 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001662 }
1663
1664 if (err) {
1665 /*
1666 * If this event can't go on and it's part of a
1667 * group, then the whole group has to come off.
1668 */
1669 if (leader != event)
1670 group_sched_out(leader, cpuctx, ctx);
1671 if (leader->attr.pinned) {
1672 update_group_times(leader);
1673 leader->state = PERF_EVENT_STATE_ERROR;
1674 }
1675 }
1676
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001677unlock:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001678 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001679
1680 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001681}
1682
1683/*
1684 * Enable an event.
1685 *
1686 * If event->ctx is a cloned context, callers must make sure that
1687 * every task struct that event->ctx->task could possibly point to
1688 * remains valid. This condition is satisfied when called through
1689 * perf_event_for_each_child or perf_event_for_each as described
1690 * for perf_event_disable.
1691 */
Frederic Weisbecker44234ad2009-12-09 09:25:48 +01001692void perf_event_enable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001693{
1694 struct perf_event_context *ctx = event->ctx;
1695 struct task_struct *task = ctx->task;
1696
1697 if (!task) {
1698 /*
1699 * Enable the event on the cpu that it's on
1700 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001701 cpu_function_call(event->cpu, __perf_event_enable, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001702 return;
1703 }
1704
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001705 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001706 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1707 goto out;
1708
1709 /*
1710 * If the event is in error state, clear that first.
1711 * That way, if we see the event in error state below, we
1712 * know that it has gone back into error state, as distinct
1713 * from the task having been scheduled away before the
1714 * cross-call arrived.
1715 */
1716 if (event->state == PERF_EVENT_STATE_ERROR)
1717 event->state = PERF_EVENT_STATE_OFF;
1718
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001719retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001720 if (!ctx->is_active) {
1721 __perf_event_mark_enabled(event, ctx);
1722 goto out;
1723 }
1724
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001725 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001726
1727 if (!task_function_call(task, __perf_event_enable, event))
1728 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001729
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001730 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001731
1732 /*
1733 * If the context is active and the event is still off,
1734 * we need to retry the cross-call.
1735 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001736 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1737 /*
1738 * task could have been flipped by a concurrent
1739 * perf_event_context_sched_out()
1740 */
1741 task = ctx->task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001742 goto retry;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001743 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001744
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001745out:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001746 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001747}
1748
1749static int perf_event_refresh(struct perf_event *event, int refresh)
1750{
1751 /*
1752 * not supported on inherited events
1753 */
Franck Bui-Huu2e939d12010-11-23 16:21:44 +01001754 if (event->attr.inherit || !is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001755 return -EINVAL;
1756
1757 atomic_add(refresh, &event->event_limit);
1758 perf_event_enable(event);
1759
1760 return 0;
1761}
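/*
 * Illustrative sketch, assuming @event is a sampling event owned by the
 * caller: request another 10 overflows before the event disables itself,
 * roughly what the PERF_EVENT_IOC_REFRESH ioctl path passes down.
 *
 *	err = perf_event_refresh(event, 10);
 *	if (err)
 *		;	// inherited or non-sampling event
 */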
1762
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001763static void ctx_sched_out(struct perf_event_context *ctx,
1764 struct perf_cpu_context *cpuctx,
1765 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001766{
1767 struct perf_event *event;
1768
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001769 raw_spin_lock(&ctx->lock);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001770 perf_pmu_disable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001771 ctx->is_active = 0;
1772 if (likely(!ctx->nr_events))
1773 goto out;
1774 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001775 update_cgrp_time_from_cpuctx(cpuctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001776
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001777 if (!ctx->nr_active)
Peter Zijlstra24cd7f52010-06-11 17:32:03 +02001778 goto out;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001779
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001780 if (event_type & EVENT_PINNED) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001781 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1782 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001783 }
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001784
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001785 if (event_type & EVENT_FLEXIBLE) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001786 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001787 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001788 }
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001789out:
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001790 perf_pmu_enable(ctx->pmu);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001791 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001792}
1793
1794/*
1795 * Test whether two contexts are equivalent, i.e. whether they
1796 * have both been cloned from the same version of the same context
1797 * and they both have the same number of enabled events.
1798 * If the number of enabled events is the same, then the set
1799 * of enabled events should be the same, because these are both
1800 * inherited contexts, therefore we can't access individual events
1801 * in them directly with an fd; we can only enable/disable all
1802 * events via prctl, or enable/disable all events in a family
1803 * via ioctl, which will have the same effect on both contexts.
1804 */
1805static int context_equiv(struct perf_event_context *ctx1,
1806 struct perf_event_context *ctx2)
1807{
1808 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1809 && ctx1->parent_gen == ctx2->parent_gen
1810 && !ctx1->pin_count && !ctx2->pin_count;
1811}
1812
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001813static void __perf_event_sync_stat(struct perf_event *event,
1814 struct perf_event *next_event)
1815{
1816 u64 value;
1817
1818 if (!event->attr.inherit_stat)
1819 return;
1820
1821 /*
1822 * Update the event value, we cannot use perf_event_read()
1823 * because we're in the middle of a context switch and have IRQs
1824 * disabled, which upsets smp_call_function_single(), however
1825 * we know the event must be on the current CPU, therefore we
1826 * don't need to use it.
1827 */
1828 switch (event->state) {
1829 case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra3dbebf12009-11-20 22:19:52 +01001830 event->pmu->read(event);
1831 /* fall-through */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001832
1833 case PERF_EVENT_STATE_INACTIVE:
1834 update_event_times(event);
1835 break;
1836
1837 default:
1838 break;
1839 }
1840
1841 /*
1842 * In order to keep per-task stats reliable we need to flip the event
1843 * values when we flip the contexts.
1844 */
Peter Zijlstrae7850592010-05-21 14:43:08 +02001845 value = local64_read(&next_event->count);
1846 value = local64_xchg(&event->count, value);
1847 local64_set(&next_event->count, value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001848
1849 swap(event->total_time_enabled, next_event->total_time_enabled);
1850 swap(event->total_time_running, next_event->total_time_running);
1851
1852 /*
1853 * Since we swizzled the values, update the user visible data too.
1854 */
1855 perf_event_update_userpage(event);
1856 perf_event_update_userpage(next_event);
1857}
1858
1859#define list_next_entry(pos, member) \
1860 list_entry(pos->member.next, typeof(*pos), member)
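/*
 * Example use of the helper above (a sketch; the caller must know that
 * @event is not the list tail, as in perf_event_sync_stat() below):
 *
 *	struct perf_event *next = list_next_entry(event, event_entry);
 */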
1861
1862static void perf_event_sync_stat(struct perf_event_context *ctx,
1863 struct perf_event_context *next_ctx)
1864{
1865 struct perf_event *event, *next_event;
1866
1867 if (!ctx->nr_stat)
1868 return;
1869
Peter Zijlstra02ffdbc2009-11-20 22:19:50 +01001870 update_context_time(ctx);
1871
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001872 event = list_first_entry(&ctx->event_list,
1873 struct perf_event, event_entry);
1874
1875 next_event = list_first_entry(&next_ctx->event_list,
1876 struct perf_event, event_entry);
1877
1878 while (&event->event_entry != &ctx->event_list &&
1879 &next_event->event_entry != &next_ctx->event_list) {
1880
1881 __perf_event_sync_stat(event, next_event);
1882
1883 event = list_next_entry(event, event_entry);
1884 next_event = list_next_entry(next_event, event_entry);
1885 }
1886}
1887
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001888static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1889 struct task_struct *next)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001890{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001891 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001892 struct perf_event_context *next_ctx;
1893 struct perf_event_context *parent;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001894 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001895 int do_switch = 1;
1896
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001897 if (likely(!ctx))
1898 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001899
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001900 cpuctx = __get_cpu_context(ctx);
1901 if (!cpuctx->task_ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001902 return;
1903
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001904 rcu_read_lock();
1905 parent = rcu_dereference(ctx->parent_ctx);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001906 next_ctx = next->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001907 if (parent && next_ctx &&
1908 rcu_dereference(next_ctx->parent_ctx) == parent) {
1909 /*
1910 * Looks like the two contexts are clones, so we might be
1911 * able to optimize the context switch. We lock both
1912 * contexts and check that they are clones under the
1913 * lock (including re-checking that neither has been
1914 * uncloned in the meantime). It doesn't matter which
1915 * order we take the locks because no other cpu could
1916 * be trying to lock both of these tasks.
1917 */
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001918 raw_spin_lock(&ctx->lock);
1919 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001920 if (context_equiv(ctx, next_ctx)) {
1921 /*
1922 * XXX do we need a memory barrier of sorts
1923 * wrt to rcu_dereference() of perf_event_ctxp
1924 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001925 task->perf_event_ctxp[ctxn] = next_ctx;
1926 next->perf_event_ctxp[ctxn] = ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001927 ctx->task = next;
1928 next_ctx->task = task;
1929 do_switch = 0;
1930
1931 perf_event_sync_stat(ctx, next_ctx);
1932 }
Thomas Gleixnere625cce2009-11-17 18:02:06 +01001933 raw_spin_unlock(&next_ctx->lock);
1934 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001935 }
1936 rcu_read_unlock();
1937
1938 if (do_switch) {
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001939 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001940 cpuctx->task_ctx = NULL;
1941 }
1942}
1943
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001944#define for_each_task_context_nr(ctxn) \
1945 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1946
1947/*
1948 * Called from scheduler to remove the events of the current task,
1949 * with interrupts disabled.
1950 *
1951 * We stop each event and update the event value in event->count.
1952 *
1953 * This does not protect us against NMI, but disable()
1954 * sets the disabled bit in the control field of event _before_
1955 * accessing the event control register. If an NMI hits, then it will
1956 * not restart the event.
1957 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02001958void __perf_event_task_sched_out(struct task_struct *task,
1959 struct task_struct *next)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001960{
1961 int ctxn;
1962
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001963 for_each_task_context_nr(ctxn)
1964 perf_event_context_sched_out(task, ctxn, next);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001965
1966 /*
1967 * if cgroup events exist on this CPU, then we need
1968 * to check if we have to switch out PMU state.
1969 * cgroup events are system-wide mode only
1970 */
1971 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1972 perf_cgroup_sched_out(task);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001973}
1974
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001975static void task_ctx_sched_out(struct perf_event_context *ctx,
1976 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001977{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001978 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001979
1980 if (!cpuctx->task_ctx)
1981 return;
1982
1983 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1984 return;
1985
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001986 ctx_sched_out(ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001987 cpuctx->task_ctx = NULL;
1988}
1989
1990/*
1991 * Called with IRQs disabled
1992 */
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001993static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1994 enum event_type_t event_type)
1995{
1996 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001997}
1998
1999static void
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002000ctx_pinned_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002001 struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002002{
2003 struct perf_event *event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002004
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002005 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2006 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002007 continue;
Stephane Eranian5632ab12011-01-03 18:20:01 +02002008 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002009 continue;
2010
Stephane Eraniane5d13672011-02-14 11:20:01 +02002011 /* may need to reset tstamp_enabled */
2012 if (is_cgroup_event(event))
2013 perf_cgroup_mark_enabled(event, ctx);
2014
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08002015 if (group_can_go_on(event, cpuctx, 1))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002016 group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002017
2018 /*
2019 * If this pinned group hasn't been scheduled,
2020 * put it in error state.
2021 */
2022 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2023 update_group_times(event);
2024 event->state = PERF_EVENT_STATE_ERROR;
2025 }
2026 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002027}
2028
2029static void
2030ctx_flexible_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002031 struct perf_cpu_context *cpuctx)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002032{
2033 struct perf_event *event;
2034 int can_add_hw = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002035
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002036 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2037 /* Ignore events in OFF or ERROR state */
2038 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002039 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002040 /*
2041 * Listen to the 'cpu' scheduling filter constraint
2042 * of events:
2043 */
Stephane Eranian5632ab12011-01-03 18:20:01 +02002044 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002045 continue;
2046
Stephane Eraniane5d13672011-02-14 11:20:01 +02002047 /* may need to reset tstamp_enabled */
2048 if (is_cgroup_event(event))
2049 perf_cgroup_mark_enabled(event, ctx);
2050
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002051 if (group_can_go_on(event, cpuctx, can_add_hw)) {
Peter Zijlstra6e377382010-02-11 13:21:58 +01002052 if (group_sched_in(event, cpuctx, ctx))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002053 can_add_hw = 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002054 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002055 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002056}
2057
2058static void
2059ctx_sched_in(struct perf_event_context *ctx,
2060 struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002061 enum event_type_t event_type,
2062 struct task_struct *task)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002063{
Stephane Eraniane5d13672011-02-14 11:20:01 +02002064 u64 now;
2065
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002066 raw_spin_lock(&ctx->lock);
2067 ctx->is_active = 1;
2068 if (likely(!ctx->nr_events))
2069 goto out;
2070
Stephane Eraniane5d13672011-02-14 11:20:01 +02002071 now = perf_clock();
2072 ctx->timestamp = now;
Stephane Eranian3f7cce32011-02-18 14:40:01 +02002073 perf_cgroup_set_timestamp(task, ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002074 /*
2075 * First go through the list and put on any pinned groups
2076 * in order to give them the best chance of going on.
2077 */
2078 if (event_type & EVENT_PINNED)
Peter Zijlstra6e377382010-02-11 13:21:58 +01002079 ctx_pinned_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002080
2081 /* Then walk through the lower prio flexible groups */
2082 if (event_type & EVENT_FLEXIBLE)
Peter Zijlstra6e377382010-02-11 13:21:58 +01002083 ctx_flexible_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002084
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002085out:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002086 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002087}
2088
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002089static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002090 enum event_type_t event_type,
2091 struct task_struct *task)
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002092{
2093 struct perf_event_context *ctx = &cpuctx->ctx;
2094
Stephane Eraniane5d13672011-02-14 11:20:01 +02002095 ctx_sched_in(ctx, cpuctx, event_type, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002096}
2097
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002098static void task_ctx_sched_in(struct perf_event_context *ctx,
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002099 enum event_type_t event_type)
2100{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002101 struct perf_cpu_context *cpuctx;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002102
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002103 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002104 if (cpuctx->task_ctx == ctx)
2105 return;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002106
Stephane Eraniane5d13672011-02-14 11:20:01 +02002107 ctx_sched_in(ctx, cpuctx, event_type, NULL);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002108 cpuctx->task_ctx = ctx;
2109}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002110
Stephane Eraniane5d13672011-02-14 11:20:01 +02002111static void perf_event_context_sched_in(struct perf_event_context *ctx,
2112 struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002113{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002114 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002115
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002116 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002117 if (cpuctx->task_ctx == ctx)
2118 return;
2119
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002120 perf_pmu_disable(ctx->pmu);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002121 /*
2122 * We want to keep the following priority order:
2123 * cpu pinned (that don't need to move), task pinned,
2124 * cpu flexible, task flexible.
2125 */
2126 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2127
Stephane Eraniane5d13672011-02-14 11:20:01 +02002128 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2129 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2130 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002131
2132 cpuctx->task_ctx = ctx;
eranian@google.com9b33fa62010-03-10 22:26:05 -08002133
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002134 /*
2135 * Since these rotations are per-cpu, we need to ensure the
2136 * cpu-context we got scheduled on is actually rotating.
2137 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002138 perf_pmu_rotate_start(ctx->pmu);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002139 perf_pmu_enable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002140}
2141
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002142/*
2143 * Called from scheduler to add the events of the current task
2144 * with interrupts disabled.
2145 *
2146 * We restore the event value and then enable it.
2147 *
2148 * This does not protect us against NMI, but enable()
2149 * sets the enabled bit in the control field of event _before_
2150 * accessing the event control register. If an NMI hits, then it will
2151 * keep the event running.
2152 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002153void __perf_event_task_sched_in(struct task_struct *task)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002154{
2155 struct perf_event_context *ctx;
2156 int ctxn;
2157
2158 for_each_task_context_nr(ctxn) {
2159 ctx = task->perf_event_ctxp[ctxn];
2160 if (likely(!ctx))
2161 continue;
2162
Stephane Eraniane5d13672011-02-14 11:20:01 +02002163 perf_event_context_sched_in(ctx, task);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002164 }
Stephane Eraniane5d13672011-02-14 11:20:01 +02002165 /*
2166 * if cgroup events exist on this CPU, then we need
2167 * to check if we have to switch in PMU state.
2168 * cgroup events are system-wide mode only
2169 */
2170 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2171 perf_cgroup_sched_in(task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002172}
2173
Peter Zijlstraabd50712010-01-26 18:50:16 +01002174static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2175{
2176 u64 frequency = event->attr.sample_freq;
2177 u64 sec = NSEC_PER_SEC;
2178 u64 divisor, dividend;
2179
2180 int count_fls, nsec_fls, frequency_fls, sec_fls;
2181
2182 count_fls = fls64(count);
2183 nsec_fls = fls64(nsec);
2184 frequency_fls = fls64(frequency);
2185 sec_fls = 30;
2186
2187 /*
2188 * We got @count in @nsec, with a target of sample_freq HZ;
2189 * the target period becomes:
2190 *
2191 *              @count * 10^9
2192 * period = -------------------
2193 *          @nsec * sample_freq
2194 *
2195 */
2196
2197 /*
2198 * Reduce accuracy by one bit such that @a and @b converge
2199 * to a similar magnitude.
2200 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002201#define REDUCE_FLS(a, b) \
Peter Zijlstraabd50712010-01-26 18:50:16 +01002202do { \
2203 if (a##_fls > b##_fls) { \
2204 a >>= 1; \
2205 a##_fls--; \
2206 } else { \
2207 b >>= 1; \
2208 b##_fls--; \
2209 } \
2210} while (0)
2211
2212 /*
2213 * Reduce accuracy until either term fits in a u64, then proceed with
2214 * the other, so that finally we can do a u64/u64 division.
2215 */
2216 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2217 REDUCE_FLS(nsec, frequency);
2218 REDUCE_FLS(sec, count);
2219 }
2220
2221 if (count_fls + sec_fls > 64) {
2222 divisor = nsec * frequency;
2223
2224 while (count_fls + sec_fls > 64) {
2225 REDUCE_FLS(count, sec);
2226 divisor >>= 1;
2227 }
2228
2229 dividend = count * sec;
2230 } else {
2231 dividend = count * sec;
2232
2233 while (nsec_fls + frequency_fls > 64) {
2234 REDUCE_FLS(nsec, frequency);
2235 dividend >>= 1;
2236 }
2237
2238 divisor = nsec * frequency;
2239 }
2240
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002241 if (!divisor)
2242 return dividend;
2243
Peter Zijlstraabd50712010-01-26 18:50:16 +01002244 return div64_u64(dividend, divisor);
2245}
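/*
 * Worked example (illustrative numbers): with sample_freq = 1000 HZ and
 * count = 2,000,000 events observed over nsec = 1,000,000 ns,
 *
 *	period = (2,000,000 * 10^9) / (1,000,000 * 1000) = 2,000,000
 *
 * i.e. one sample roughly every 2M events, which at the observed rate
 * works out to about 1000 samples per second.  The REDUCE_FLS() games
 * above merely keep the intermediate products within 64 bits.
 */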
2246
2247static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002248{
2249 struct hw_perf_event *hwc = &event->hw;
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002250 s64 period, sample_period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002251 s64 delta;
2252
Peter Zijlstraabd50712010-01-26 18:50:16 +01002253 period = perf_calculate_period(event, nsec, count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002254
2255 delta = (s64)(period - hwc->sample_period);
2256 delta = (delta + 7) / 8; /* low pass filter */
2257
2258 sample_period = hwc->sample_period + delta;
2259
2260 if (!sample_period)
2261 sample_period = 1;
2262
2263 hwc->sample_period = sample_period;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002264
Peter Zijlstrae7850592010-05-21 14:43:08 +02002265 if (local64_read(&hwc->period_left) > 8*sample_period) {
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002266 event->pmu->stop(event, PERF_EF_UPDATE);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002267 local64_set(&hwc->period_left, 0);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002268 event->pmu->start(event, PERF_EF_RELOAD);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002269 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002270}
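/*
 * Worked example of the low pass filter above (illustrative numbers):
 * with hwc->sample_period = 1,000,000 and a freshly computed
 * period = 1,800,000,
 *
 *	delta = (1,800,000 - 1,000,000 + 7) / 8 = 100,000
 *
 * so the sample period only moves to 1,100,000 on this adjustment,
 * converging on the target over several ticks instead of jumping there.
 */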
2271
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002272static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002273{
2274 struct perf_event *event;
2275 struct hw_perf_event *hwc;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002276 u64 interrupts, now;
2277 s64 delta;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002278
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002279 raw_spin_lock(&ctx->lock);
Paul Mackerras03541f82009-10-14 16:58:03 +11002280 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002281 if (event->state != PERF_EVENT_STATE_ACTIVE)
2282 continue;
2283
Stephane Eranian5632ab12011-01-03 18:20:01 +02002284 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01002285 continue;
2286
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002287 hwc = &event->hw;
2288
2289 interrupts = hwc->interrupts;
2290 hwc->interrupts = 0;
2291
2292 /*
2293 * unthrottle events on the tick
2294 */
2295 if (interrupts == MAX_INTERRUPTS) {
2296 perf_log_throttle(event, 1);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002297 event->pmu->start(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002298 }
2299
2300 if (!event->attr.freq || !event->attr.sample_freq)
2301 continue;
2302
Peter Zijlstraabd50712010-01-26 18:50:16 +01002303 event->pmu->read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002304 now = local64_read(&event->count);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002305 delta = now - hwc->freq_count_stamp;
2306 hwc->freq_count_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002307
Peter Zijlstraabd50712010-01-26 18:50:16 +01002308 if (delta > 0)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002309 perf_adjust_period(event, period, delta);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002310 }
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002311 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002312}
2313
2314/*
2315 * Round-robin a context's events:
2316 */
2317static void rotate_ctx(struct perf_event_context *ctx)
2318{
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002319 raw_spin_lock(&ctx->lock);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002320
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01002321 /*
2322 * Rotate the first entry of the non-pinned groups to the end of the
2323 * list. Rotation might be disabled by the inheritance code.
2324 */
2325 if (!ctx->rotate_disable)
2326 list_rotate_left(&ctx->flexible_groups);
Frederic Weisbeckere2864172010-01-09 21:05:28 +01002327
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002328 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002329}
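/*
 * Sketch of what list_rotate_left() does above, for hypothetical flexible
 * groups A, B and C on ctx->flexible_groups:
 *
 *	before:	A -> B -> C
 *	after:	B -> C -> A
 *
 * so a group that missed a counter last time around is considered earlier
 * by ctx_flexible_sched_in() on the next pass.
 */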
2330
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002331/*
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002332 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2333 * because they're strictly cpu affine and rotate_start is called with IRQs
2334 * disabled, while rotate_context is called from IRQ context.
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002335 */
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002336static void perf_rotate_context(struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002337{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002338 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002339 struct perf_event_context *ctx = NULL;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002340 int rotate = 0, remove = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002341
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002342 if (cpuctx->ctx.nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002343 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002344 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2345 rotate = 1;
2346 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002347
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002348 ctx = cpuctx->task_ctx;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002349 if (ctx && ctx->nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002350 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002351 if (ctx->nr_events != ctx->nr_active)
2352 rotate = 1;
2353 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002354
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002355 perf_pmu_disable(cpuctx->ctx.pmu);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002356 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002357 if (ctx)
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002358 perf_ctx_adjust_freq(ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002359
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002360 if (!rotate)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002361 goto done;
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002362
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002363 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002364 if (ctx)
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002365 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002366
2367 rotate_ctx(&cpuctx->ctx);
2368 if (ctx)
2369 rotate_ctx(ctx);
2370
Stephane Eraniane5d13672011-02-14 11:20:01 +02002371 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002372 if (ctx)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002373 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002374
2375done:
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002376 if (remove)
2377 list_del_init(&cpuctx->rotation_list);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002378
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002379 perf_pmu_enable(cpuctx->ctx.pmu);
2380}
2381
2382void perf_event_task_tick(void)
2383{
2384 struct list_head *head = &__get_cpu_var(rotation_list);
2385 struct perf_cpu_context *cpuctx, *tmp;
2386
2387 WARN_ON(!irqs_disabled());
2388
2389 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2390 if (cpuctx->jiffies_interval == 1 ||
2391 !(jiffies % cpuctx->jiffies_interval))
2392 perf_rotate_context(cpuctx);
2393 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002394}
2395
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002396static int event_enable_on_exec(struct perf_event *event,
2397 struct perf_event_context *ctx)
2398{
2399 if (!event->attr.enable_on_exec)
2400 return 0;
2401
2402 event->attr.enable_on_exec = 0;
2403 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2404 return 0;
2405
2406 __perf_event_mark_enabled(event, ctx);
2407
2408 return 1;
2409}
2410
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002411/*
2412 * Enable all of a task's events that have been marked enable-on-exec.
2413 * This expects task == current.
2414 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002415static void perf_event_enable_on_exec(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002416{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002417 struct perf_event *event;
2418 unsigned long flags;
2419 int enabled = 0;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002420 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002421
2422 local_irq_save(flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002423 if (!ctx || !ctx->nr_events)
2424 goto out;
2425
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002426 task_ctx_sched_out(ctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002427
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002428 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002429
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002430 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2431 ret = event_enable_on_exec(event, ctx);
2432 if (ret)
2433 enabled = 1;
2434 }
2435
2436 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2437 ret = event_enable_on_exec(event, ctx);
2438 if (ret)
2439 enabled = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002440 }
2441
2442 /*
2443 * Unclone this context if we enabled any event.
2444 */
2445 if (enabled)
2446 unclone_ctx(ctx);
2447
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002448 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002449
Stephane Eraniane5d13672011-02-14 11:20:01 +02002450 perf_event_context_sched_in(ctx, ctx->task);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002451out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002452 local_irq_restore(flags);
2453}
2454
2455/*
2456 * Cross CPU call to read the hardware event
2457 */
2458static void __perf_event_read(void *info)
2459{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002460 struct perf_event *event = info;
2461 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002462 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002463
2464 /*
2465 * If this is a task context, we need to check whether it is
2466 * the current task context of this cpu. If not, it has been
2467 * scheduled out before the smp call arrived. In that case
2468 * event->count would have been updated to a recent sample
2469 * when the event was scheduled out.
2470 */
2471 if (ctx->task && cpuctx->task_ctx != ctx)
2472 return;
2473
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002474 raw_spin_lock(&ctx->lock);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002475 if (ctx->is_active) {
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002476 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002477 update_cgrp_time_from_event(event);
2478 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002479 update_event_times(event);
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002480 if (event->state == PERF_EVENT_STATE_ACTIVE)
2481 event->pmu->read(event);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002482 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002483}
2484
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002485static inline u64 perf_event_count(struct perf_event *event)
2486{
Peter Zijlstrae7850592010-05-21 14:43:08 +02002487 return local64_read(&event->count) + atomic64_read(&event->child_count);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002488}
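/*
 * Example (illustrative numbers): if this event has counted 700 itself
 * and its inherited children have accumulated child_count = 300,
 * perf_event_count() reports 1000, which is the total perf_event_read()
 * below hands back.
 */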
2489
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002490static u64 perf_event_read(struct perf_event *event)
2491{
2492 /*
2493 * If event is enabled and currently active on a CPU, update the
2494 * value in the event structure:
2495 */
2496 if (event->state == PERF_EVENT_STATE_ACTIVE) {
2497 smp_call_function_single(event->oncpu,
2498 __perf_event_read, event, 1);
2499 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01002500 struct perf_event_context *ctx = event->ctx;
2501 unsigned long flags;
2502
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002503 raw_spin_lock_irqsave(&ctx->lock, flags);
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002504 /*
2505 * We may read while the context is not active
2506 * (e.g., the thread is blocked); in that case
2507 * we cannot update the context time.
2508 */
Stephane Eraniane5d13672011-02-14 11:20:01 +02002509 if (ctx->is_active) {
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002510 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002511 update_cgrp_time_from_event(event);
2512 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002513 update_event_times(event);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002514 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002515 }
2516
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002517 return perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002518}
2519
2520/*
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002521 * Callchain support
2522 */
2523
2524struct callchain_cpus_entries {
2525 struct rcu_head rcu_head;
2526 struct perf_callchain_entry *cpu_entries[0];
2527};
2528
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002529static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002530static atomic_t nr_callchain_events;
2531static DEFINE_MUTEX(callchain_mutex);
2532struct callchain_cpus_entries *callchain_cpus_entries;
2533
2534
2535__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2536 struct pt_regs *regs)
2537{
2538}
2539
2540__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2541 struct pt_regs *regs)
2542{
2543}
2544
2545static void release_callchain_buffers_rcu(struct rcu_head *head)
2546{
2547 struct callchain_cpus_entries *entries;
2548 int cpu;
2549
2550 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2551
2552 for_each_possible_cpu(cpu)
2553 kfree(entries->cpu_entries[cpu]);
2554
2555 kfree(entries);
2556}
2557
2558static void release_callchain_buffers(void)
2559{
2560 struct callchain_cpus_entries *entries;
2561
2562 entries = callchain_cpus_entries;
2563 rcu_assign_pointer(callchain_cpus_entries, NULL);
2564 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2565}
2566
2567static int alloc_callchain_buffers(void)
2568{
2569 int cpu;
2570 int size;
2571 struct callchain_cpus_entries *entries;
2572
2573 /*
2574 * We can't use the percpu allocation API for data that can be
2575 * accessed from NMI. Use a temporary manual per cpu allocation
2576 * until that gets sorted out.
2577 */
Eric Dumazet88d4f0d2011-01-25 19:40:51 +01002578 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002579
2580 entries = kzalloc(size, GFP_KERNEL);
2581 if (!entries)
2582 return -ENOMEM;
2583
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002584 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002585
2586 for_each_possible_cpu(cpu) {
2587 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2588 cpu_to_node(cpu));
2589 if (!entries->cpu_entries[cpu])
2590 goto fail;
2591 }
2592
2593 rcu_assign_pointer(callchain_cpus_entries, entries);
2594
2595 return 0;
2596
2597fail:
2598 for_each_possible_cpu(cpu)
2599 kfree(entries->cpu_entries[cpu]);
2600 kfree(entries);
2601
2602 return -ENOMEM;
2603}
2604
2605static int get_callchain_buffers(void)
2606{
2607 int err = 0;
2608 int count;
2609
2610 mutex_lock(&callchain_mutex);
2611
2612 count = atomic_inc_return(&nr_callchain_events);
2613 if (WARN_ON_ONCE(count < 1)) {
2614 err = -EINVAL;
2615 goto exit;
2616 }
2617
2618 if (count > 1) {
2619 /* If the allocation failed, give up */
2620 if (!callchain_cpus_entries)
2621 err = -ENOMEM;
2622 goto exit;
2623 }
2624
2625 err = alloc_callchain_buffers();
2626 if (err)
2627 release_callchain_buffers();
2628exit:
2629 mutex_unlock(&callchain_mutex);
2630
2631 return err;
2632}
2633
2634static void put_callchain_buffers(void)
2635{
2636 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2637 release_callchain_buffers();
2638 mutex_unlock(&callchain_mutex);
2639 }
2640}
2641
2642static int get_recursion_context(int *recursion)
2643{
2644 int rctx;
2645
2646 if (in_nmi())
2647 rctx = 3;
2648 else if (in_irq())
2649 rctx = 2;
2650 else if (in_softirq())
2651 rctx = 1;
2652 else
2653 rctx = 0;
2654
2655 if (recursion[rctx])
2656 return -1;
2657
2658 recursion[rctx]++;
2659 barrier();
2660
2661 return rctx;
2662}
2663
2664static inline void put_recursion_context(int *recursion, int rctx)
2665{
2666 barrier();
2667 recursion[rctx]--;
2668}
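/*
 * Sketch of the intended pairing (as used by get_callchain_entry() and
 * put_callchain_entry() below); @recursion is the per-cpu
 * callchain_recursion array:
 *
 *	rctx = get_recursion_context(recursion);
 *	if (rctx == -1)
 *		return;		// already in use at this context level
 *	... fill the per-context callchain buffer ...
 *	put_recursion_context(recursion, rctx);
 */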
2669
2670static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2671{
2672 int cpu;
2673 struct callchain_cpus_entries *entries;
2674
2675 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2676 if (*rctx == -1)
2677 return NULL;
2678
2679 entries = rcu_dereference(callchain_cpus_entries);
2680 if (!entries)
2681 return NULL;
2682
2683 cpu = smp_processor_id();
2684
2685 return &entries->cpu_entries[cpu][*rctx];
2686}
2687
2688static void
2689put_callchain_entry(int rctx)
2690{
2691 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2692}
2693
2694static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2695{
2696 int rctx;
2697 struct perf_callchain_entry *entry;
2698
2699
2700 entry = get_callchain_entry(&rctx);
2701 if (rctx == -1)
2702 return NULL;
2703
2704 if (!entry)
2705 goto exit_put;
2706
2707 entry->nr = 0;
2708
2709 if (!user_mode(regs)) {
2710 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2711 perf_callchain_kernel(entry, regs);
2712 if (current->mm)
2713 regs = task_pt_regs(current);
2714 else
2715 regs = NULL;
2716 }
2717
2718 if (regs) {
2719 perf_callchain_store(entry, PERF_CONTEXT_USER);
2720 perf_callchain_user(entry, regs);
2721 }
2722
2723exit_put:
2724 put_callchain_entry(rctx);
2725
2726 return entry;
2727}
2728
2729/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002730 * Initialize the perf_event context in a task_struct:
2731 */
Peter Zijlstraeb184472010-09-07 15:55:13 +02002732static void __perf_event_init_context(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002733{
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002734 raw_spin_lock_init(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002735 mutex_init(&ctx->mutex);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002736 INIT_LIST_HEAD(&ctx->pinned_groups);
2737 INIT_LIST_HEAD(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002738 INIT_LIST_HEAD(&ctx->event_list);
2739 atomic_set(&ctx->refcount, 1);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002740}
2741
Peter Zijlstraeb184472010-09-07 15:55:13 +02002742static struct perf_event_context *
2743alloc_perf_context(struct pmu *pmu, struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002744{
2745 struct perf_event_context *ctx;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002746
2747 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2748 if (!ctx)
2749 return NULL;
2750
2751 __perf_event_init_context(ctx);
2752 if (task) {
2753 ctx->task = task;
2754 get_task_struct(task);
2755 }
2756 ctx->pmu = pmu;
2757
2758 return ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002759}
2760
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002761static struct task_struct *
2762find_lively_task_by_vpid(pid_t vpid)
2763{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002764 struct task_struct *task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002765 int err;
2766
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002767 rcu_read_lock();
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002768 if (!vpid)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002769 task = current;
2770 else
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002771 task = find_task_by_vpid(vpid);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002772 if (task)
2773 get_task_struct(task);
2774 rcu_read_unlock();
2775
2776 if (!task)
2777 return ERR_PTR(-ESRCH);
2778
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002779 /* Reuse ptrace permission checks for now. */
2780 err = -EACCES;
2781 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2782 goto errout;
2783
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002784 return task;
2785errout:
2786 put_task_struct(task);
2787 return ERR_PTR(err);
2788
2789}
2790
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002791/*
2792 * Returns a matching context with refcount and pincount.
2793 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002794static struct perf_event_context *
Matt Helsley38a81da2010-09-13 13:01:20 -07002795find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002796{
2797 struct perf_event_context *ctx;
2798 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002799 unsigned long flags;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002800 int ctxn, err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002801
Oleg Nesterov22a4ec72011-01-18 17:10:08 +01002802 if (!task) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002803 /* Must be root to operate on a CPU event: */
2804 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2805 return ERR_PTR(-EACCES);
2806
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002807 /*
2808 * We could be clever and allow attaching an event to an
2809 * offline CPU and activate it when the CPU comes up, but
2810 * that's for later.
2811 */
2812 if (!cpu_online(cpu))
2813 return ERR_PTR(-ENODEV);
2814
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002815 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002816 ctx = &cpuctx->ctx;
2817 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002818 ++ctx->pin_count;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002819
2820 return ctx;
2821 }
2822
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002823 err = -EINVAL;
2824 ctxn = pmu->task_ctx_nr;
2825 if (ctxn < 0)
2826 goto errout;
2827
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002828retry:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002829 ctx = perf_lock_task_context(task, ctxn, &flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002830 if (ctx) {
2831 unclone_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002832 ++ctx->pin_count;
Thomas Gleixnere625cce2009-11-17 18:02:06 +01002833 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002834 }
2835
2836 if (!ctx) {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002837 ctx = alloc_perf_context(pmu, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002838 err = -ENOMEM;
2839 if (!ctx)
2840 goto errout;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002841
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002842 get_ctx(ctx);
Peter Zijlstraeb184472010-09-07 15:55:13 +02002843
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002844 err = 0;
2845 mutex_lock(&task->perf_event_mutex);
2846 /*
2847 * If it has already passed perf_event_exit_task(),
2848 * we must see PF_EXITING; it takes this mutex too.
2849 */
2850 if (task->flags & PF_EXITING)
2851 err = -ESRCH;
2852 else if (task->perf_event_ctxp[ctxn])
2853 err = -EAGAIN;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002854 else {
2855 ++ctx->pin_count;
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002856 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002857 }
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002858 mutex_unlock(&task->perf_event_mutex);
2859
2860 if (unlikely(err)) {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002861 put_task_struct(task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002862 kfree(ctx);
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002863
2864 if (err == -EAGAIN)
2865 goto retry;
2866 goto errout;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002867 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002868 }
2869
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002870 return ctx;
2871
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002872errout:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002873 return ERR_PTR(err);
2874}
2875
Li Zefan6fb29152009-10-15 11:21:42 +08002876static void perf_event_free_filter(struct perf_event *event);
2877
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002878static void free_event_rcu(struct rcu_head *head)
2879{
2880 struct perf_event *event;
2881
2882 event = container_of(head, struct perf_event, rcu_head);
2883 if (event->ns)
2884 put_pid_ns(event->ns);
Li Zefan6fb29152009-10-15 11:21:42 +08002885 perf_event_free_filter(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002886 kfree(event);
2887}
2888
Peter Zijlstraca5135e2010-05-28 19:33:23 +02002889static void perf_buffer_put(struct perf_buffer *buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002890
2891static void free_event(struct perf_event *event)
2892{
Peter Zijlstrae360adb2010-10-14 14:01:34 +08002893 irq_work_sync(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002894
2895 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002896 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02002897 jump_label_dec(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01002898 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002899 atomic_dec(&nr_mmap_events);
2900 if (event->attr.comm)
2901 atomic_dec(&nr_comm_events);
2902 if (event->attr.task)
2903 atomic_dec(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002904 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2905 put_callchain_buffers();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002906 }
2907
Peter Zijlstraca5135e2010-05-28 19:33:23 +02002908 if (event->buffer) {
2909 perf_buffer_put(event->buffer);
2910 event->buffer = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002911 }
2912
Stephane Eraniane5d13672011-02-14 11:20:01 +02002913 if (is_cgroup_event(event))
2914 perf_detach_cgroup(event);
2915
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002916 if (event->destroy)
2917 event->destroy(event);
2918
Peter Zijlstra0c67b402010-09-13 11:15:58 +02002919 if (event->ctx)
2920 put_ctx(event->ctx);
2921
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002922 call_rcu(&event->rcu_head, free_event_rcu);
2923}
2924
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002925int perf_event_release_kernel(struct perf_event *event)
2926{
2927 struct perf_event_context *ctx = event->ctx;
2928
Peter Zijlstra050735b2010-05-11 11:51:53 +02002929 /*
2930 * Remove from the PMU, can't get re-enabled since we got
2931 * here because the last ref went.
2932 */
2933 perf_event_disable(event);
2934
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002935 WARN_ON_ONCE(ctx->parent_ctx);
Peter Zijlstraa0507c82010-05-06 15:42:53 +02002936 /*
2937 * There are two ways this annotation is useful:
2938 *
2939 * 1) there is a lock recursion from perf_event_exit_task
2940 * see the comment there.
2941 *
2942 * 2) there is a lock-inversion with mmap_sem through
2943 * perf_event_read_group(), which takes faults while
2944 * holding ctx->mutex, however this is called after
2945 * the last filedesc died, so there is no possibility
2946 * to trigger the AB-BA case.
2947 */
2948 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002949 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra8a495422010-05-27 15:47:49 +02002950 perf_group_detach(event);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002951 list_del_event(event, ctx);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002952 raw_spin_unlock_irq(&ctx->lock);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002953 mutex_unlock(&ctx->mutex);
2954
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002955 free_event(event);
2956
2957 return 0;
2958}
2959EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2960
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002961/*
2962 * Called when the last reference to the file is gone.
2963 */
2964static int perf_release(struct inode *inode, struct file *file)
2965{
2966 struct perf_event *event = file->private_data;
Peter Zijlstra8882135b2010-11-09 19:01:43 +01002967 struct task_struct *owner;
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002968
2969 file->private_data = NULL;
2970
Peter Zijlstra8882135b2010-11-09 19:01:43 +01002971 rcu_read_lock();
2972 owner = ACCESS_ONCE(event->owner);
2973 /*
2974 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2975 * !owner it means the list deletion is complete and we can indeed
2976 * free this event, otherwise we need to serialize on
2977 * owner->perf_event_mutex.
2978 */
2979 smp_read_barrier_depends();
2980 if (owner) {
2981 /*
2982 * Since delayed_put_task_struct() also drops the last
2983 * task reference we can safely take a new reference
2984 * while holding the rcu_read_lock().
2985 */
2986 get_task_struct(owner);
2987 }
2988 rcu_read_unlock();
2989
2990 if (owner) {
2991 mutex_lock(&owner->perf_event_mutex);
2992 /*
2993 * We have to re-check the event->owner field, if it is cleared
2994 * we raced with perf_event_exit_task(), acquiring the mutex
2995 * ensured they're done, and we can proceed with freeing the
2996 * event.
2997 */
2998 if (event->owner)
2999 list_del_init(&event->owner_entry);
3000 mutex_unlock(&owner->perf_event_mutex);
3001 put_task_struct(owner);
3002 }
3003
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003004 return perf_event_release_kernel(event);
3005}
3006
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003007u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003008{
3009 struct perf_event *child;
3010 u64 total = 0;
3011
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003012 *enabled = 0;
3013 *running = 0;
3014
Peter Zijlstra6f105812009-11-20 22:19:56 +01003015 mutex_lock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003016 total += perf_event_read(event);
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003017 *enabled += event->total_time_enabled +
3018 atomic64_read(&event->child_total_time_enabled);
3019 *running += event->total_time_running +
3020 atomic64_read(&event->child_total_time_running);
3021
3022 list_for_each_entry(child, &event->child_list, child_list) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003023 total += perf_event_read(child);
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003024 *enabled += child->total_time_enabled;
3025 *running += child->total_time_running;
3026 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003027 mutex_unlock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003028
3029 return total;
3030}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003031EXPORT_SYMBOL_GPL(perf_event_read_value);
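/*
 * Illustrative userspace sketch (not part of this file): consuming the
 * enabled/running times that perf_event_read_value() feeds into read(2).
 * Assumes "fd" came from perf_event_open(2) with attr.read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING and
 * without PERF_FORMAT_GROUP.
 */
#include <stdint.h>
#include <unistd.h>

struct read_one {			/* layout written by perf_event_read_one() */
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
};

static double read_scaled_count(int fd)
{
	struct read_one rf;

	if (read(fd, &rf, sizeof(rf)) != sizeof(rf))
		return -1.0;

	/* Scale for multiplexing: estimate what the count would have been
	 * had the event been on the PMU for the whole enabled time. */
	if (!rf.time_running)
		return 0.0;
	return (double)rf.value * rf.time_enabled / rf.time_running;
}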
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003032
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003033static int perf_event_read_group(struct perf_event *event,
3034 u64 read_format, char __user *buf)
3035{
3036 struct perf_event *leader = event->group_leader, *sub;
Peter Zijlstra6f105812009-11-20 22:19:56 +01003037 int n = 0, size = 0, ret = -EFAULT;
3038 struct perf_event_context *ctx = leader->ctx;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003039 u64 values[5];
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003040 u64 count, enabled, running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003041
Peter Zijlstra6f105812009-11-20 22:19:56 +01003042 mutex_lock(&ctx->mutex);
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003043 count = perf_event_read_value(leader, &enabled, &running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003044
3045 values[n++] = 1 + leader->nr_siblings;
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003046 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3047 values[n++] = enabled;
3048 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3049 values[n++] = running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003050 values[n++] = count;
3051 if (read_format & PERF_FORMAT_ID)
3052 values[n++] = primary_event_id(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003053
3054 size = n * sizeof(u64);
3055
3056 if (copy_to_user(buf, values, size))
Peter Zijlstra6f105812009-11-20 22:19:56 +01003057 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003058
Peter Zijlstra6f105812009-11-20 22:19:56 +01003059 ret = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003060
3061 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstraabf48682009-11-20 22:19:49 +01003062 n = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003063
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003064 values[n++] = perf_event_read_value(sub, &enabled, &running);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003065 if (read_format & PERF_FORMAT_ID)
3066 values[n++] = primary_event_id(sub);
3067
3068 size = n * sizeof(u64);
3069
Stephane Eranian184d3da2009-11-23 21:40:49 -08003070 if (copy_to_user(buf + ret, values, size)) {
Peter Zijlstra6f105812009-11-20 22:19:56 +01003071 ret = -EFAULT;
3072 goto unlock;
3073 }
Peter Zijlstraabf48682009-11-20 22:19:49 +01003074
3075 ret += size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003076 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003077unlock:
3078 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003079
Peter Zijlstraabf48682009-11-20 22:19:49 +01003080 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003081}
3082
3083static int perf_event_read_one(struct perf_event *event,
3084 u64 read_format, char __user *buf)
3085{
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003086 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003087 u64 values[4];
3088 int n = 0;
3089
Peter Zijlstra59ed446f2009-11-20 22:19:55 +01003090 values[n++] = perf_event_read_value(event, &enabled, &running);
3091 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3092 values[n++] = enabled;
3093 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3094 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003095 if (read_format & PERF_FORMAT_ID)
3096 values[n++] = primary_event_id(event);
3097
3098 if (copy_to_user(buf, values, n * sizeof(u64)))
3099 return -EFAULT;
3100
3101 return n * sizeof(u64);
3102}
3103
3104/*
 3105	 * Read the performance event - simple non-blocking version for now
3106 */
3107static ssize_t
3108perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3109{
3110 u64 read_format = event->attr.read_format;
3111 int ret;
3112
3113 /*
 3114	 * Return end-of-file for a read on an event that is in
3115 * error state (i.e. because it was pinned but it couldn't be
3116 * scheduled on to the CPU at some point).
3117 */
3118 if (event->state == PERF_EVENT_STATE_ERROR)
3119 return 0;
3120
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02003121 if (count < event->read_size)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003122 return -ENOSPC;
3123
3124 WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003125 if (read_format & PERF_FORMAT_GROUP)
3126 ret = perf_event_read_group(event, read_format, buf);
3127 else
3128 ret = perf_event_read_one(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003129
3130 return ret;
3131}
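/*
 * Illustrative userspace sketch (not part of this file): the buffer layout
 * produced by perf_event_read_group() above.  Assumes the group leader was
 * opened with attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; the
 * cnt[16] bound is an arbitrary assumption of this example.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct group_read {
	uint64_t nr;			/* 1 + leader->nr_siblings */
	uint64_t time_enabled;		/* leader's enabled time */
	uint64_t time_running;		/* leader's running time */
	struct {
		uint64_t value;
		uint64_t id;		/* primary_event_id() of the member */
	} cnt[16];
};

static void dump_group(int leader_fd)
{
	struct group_read gr;
	uint64_t i;

	/* perf_read_hw() returns -ENOSPC if the buffer is smaller than
	 * the event's read_size, so over-sizing it is fine. */
	if (read(leader_fd, &gr, sizeof(gr)) < 0)
		return;

	for (i = 0; i < gr.nr && i < 16; i++)
		printf("event id %llu: count %llu\n",
		       (unsigned long long)gr.cnt[i].id,
		       (unsigned long long)gr.cnt[i].value);
}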
3132
3133static ssize_t
3134perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3135{
3136 struct perf_event *event = file->private_data;
3137
3138 return perf_read_hw(event, buf, count);
3139}
3140
3141static unsigned int perf_poll(struct file *file, poll_table *wait)
3142{
3143 struct perf_event *event = file->private_data;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003144 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003145	unsigned int events = POLLHUP;
3146
3147 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003148 buffer = rcu_dereference(event->buffer);
3149 if (buffer)
3150 events = atomic_xchg(&buffer->poll, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003151 rcu_read_unlock();
3152
3153 poll_wait(file, &event->waitq, wait);
3154
3155 return events;
3156}
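/*
 * Illustrative userspace sketch (not part of this file): waiting for the
 * wakeups that perf_poll() reports.  Assumes the event was opened with
 * attr.wakeup_events (or attr.wakeup_watermark) set and its buffer mmap()ed.
 */
#include <poll.h>

static int wait_for_wakeup(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	/* > 0: the buffer crossed the watermark, 0: timeout, < 0: error */
	return poll(&pfd, 1, timeout_ms);
}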
3157
3158static void perf_event_reset(struct perf_event *event)
3159{
3160 (void)perf_event_read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02003161 local64_set(&event->count, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003162 perf_event_update_userpage(event);
3163}
3164
3165/*
3166 * Holding the top-level event's child_mutex means that any
3167 * descendant process that has inherited this event will block
3168 * in sync_child_event if it goes to exit, thus satisfying the
3169 * task existence requirements of perf_event_enable/disable.
3170 */
3171static void perf_event_for_each_child(struct perf_event *event,
3172 void (*func)(struct perf_event *))
3173{
3174 struct perf_event *child;
3175
3176 WARN_ON_ONCE(event->ctx->parent_ctx);
3177 mutex_lock(&event->child_mutex);
3178 func(event);
3179 list_for_each_entry(child, &event->child_list, child_list)
3180 func(child);
3181 mutex_unlock(&event->child_mutex);
3182}
3183
3184static void perf_event_for_each(struct perf_event *event,
3185 void (*func)(struct perf_event *))
3186{
3187 struct perf_event_context *ctx = event->ctx;
3188 struct perf_event *sibling;
3189
3190 WARN_ON_ONCE(ctx->parent_ctx);
3191 mutex_lock(&ctx->mutex);
3192 event = event->group_leader;
3193
3194 perf_event_for_each_child(event, func);
3195 func(event);
3196 list_for_each_entry(sibling, &event->sibling_list, group_entry)
 3197		perf_event_for_each_child(sibling, func);
3198 mutex_unlock(&ctx->mutex);
3199}
3200
3201static int perf_event_period(struct perf_event *event, u64 __user *arg)
3202{
3203 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003204 int ret = 0;
3205 u64 value;
3206
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01003207 if (!is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003208 return -EINVAL;
3209
John Blackwoodad0cf342010-09-28 18:03:11 -04003210 if (copy_from_user(&value, arg, sizeof(value)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003211 return -EFAULT;
3212
3213 if (!value)
3214 return -EINVAL;
3215
Thomas Gleixnere625cce2009-11-17 18:02:06 +01003216 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003217 if (event->attr.freq) {
3218 if (value > sysctl_perf_event_sample_rate) {
3219 ret = -EINVAL;
3220 goto unlock;
3221 }
3222
3223 event->attr.sample_freq = value;
3224 } else {
3225 event->attr.sample_period = value;
3226 event->hw.sample_period = value;
3227 }
3228unlock:
Thomas Gleixnere625cce2009-11-17 18:02:06 +01003229 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003230
3231 return ret;
3232}
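/*
 * Illustrative userspace sketch (not part of this file): the ioctl that
 * lands in perf_event_period() above.  Note the argument is passed by
 * pointer; for attr.freq events the same call updates sample_freq instead.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_sample_period(int fd, uint64_t period)
{
	return ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);	/* 0 or -1/errno */
}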
3233
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003234static const struct file_operations perf_fops;
3235
3236static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3237{
3238 struct file *file;
3239
3240 file = fget_light(fd, fput_needed);
3241 if (!file)
3242 return ERR_PTR(-EBADF);
3243
3244 if (file->f_op != &perf_fops) {
3245 fput_light(file, *fput_needed);
3246 *fput_needed = 0;
3247 return ERR_PTR(-EBADF);
3248 }
3249
3250 return file->private_data;
3251}
3252
3253static int perf_event_set_output(struct perf_event *event,
3254 struct perf_event *output_event);
Li Zefan6fb29152009-10-15 11:21:42 +08003255static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003256
3257static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3258{
3259 struct perf_event *event = file->private_data;
3260 void (*func)(struct perf_event *);
3261 u32 flags = arg;
3262
3263 switch (cmd) {
3264 case PERF_EVENT_IOC_ENABLE:
3265 func = perf_event_enable;
3266 break;
3267 case PERF_EVENT_IOC_DISABLE:
3268 func = perf_event_disable;
3269 break;
3270 case PERF_EVENT_IOC_RESET:
3271 func = perf_event_reset;
3272 break;
3273
3274 case PERF_EVENT_IOC_REFRESH:
3275 return perf_event_refresh(event, arg);
3276
3277 case PERF_EVENT_IOC_PERIOD:
3278 return perf_event_period(event, (u64 __user *)arg);
3279
3280 case PERF_EVENT_IOC_SET_OUTPUT:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003281 {
3282 struct perf_event *output_event = NULL;
3283 int fput_needed = 0;
3284 int ret;
3285
3286 if (arg != -1) {
3287 output_event = perf_fget_light(arg, &fput_needed);
3288 if (IS_ERR(output_event))
3289 return PTR_ERR(output_event);
3290 }
3291
3292 ret = perf_event_set_output(event, output_event);
3293 if (output_event)
3294 fput_light(output_event->filp, fput_needed);
3295
3296 return ret;
3297 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003298
Li Zefan6fb29152009-10-15 11:21:42 +08003299 case PERF_EVENT_IOC_SET_FILTER:
3300 return perf_event_set_filter(event, (void __user *)arg);
3301
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003302 default:
3303 return -ENOTTY;
3304 }
3305
3306 if (flags & PERF_IOC_FLAG_GROUP)
3307 perf_event_for_each(event, func);
3308 else
3309 perf_event_for_each_child(event, func);
3310
3311 return 0;
3312}
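/*
 * Illustrative userspace sketch (not part of this file): PERF_IOC_FLAG_GROUP
 * makes perf_ioctl() walk the whole group via perf_event_for_each() instead
 * of touching only the one event.  Assumes "leader_fd" refers to the group
 * leader.
 */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static void restart_group(int leader_fd)
{
	ioctl(leader_fd, PERF_EVENT_IOC_RESET,  PERF_IOC_FLAG_GROUP);
	ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}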
3313
3314int perf_event_task_enable(void)
3315{
3316 struct perf_event *event;
3317
3318 mutex_lock(&current->perf_event_mutex);
3319 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3320 perf_event_for_each_child(event, perf_event_enable);
3321 mutex_unlock(&current->perf_event_mutex);
3322
3323 return 0;
3324}
3325
3326int perf_event_task_disable(void)
3327{
3328 struct perf_event *event;
3329
3330 mutex_lock(&current->perf_event_mutex);
3331 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3332 perf_event_for_each_child(event, perf_event_disable);
3333 mutex_unlock(&current->perf_event_mutex);
3334
3335 return 0;
3336}
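/*
 * Illustrative userspace sketch (not part of this file): the prctl(2) calls
 * that end up in perf_event_task_enable()/perf_event_task_disable() above,
 * toggling every counter the calling task has opened (its owner_entry list).
 * The PR_TASK_PERF_EVENTS_* constants come from linux/prctl.h.
 */
#include <sys/prctl.h>

static void count_only_this(void (*region)(void))
{
	prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
	region();
	prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
}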
3337
3338#ifndef PERF_EVENT_INDEX_OFFSET
3339# define PERF_EVENT_INDEX_OFFSET 0
3340#endif
3341
3342static int perf_event_index(struct perf_event *event)
3343{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02003344 if (event->hw.state & PERF_HES_STOPPED)
3345 return 0;
3346
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003347 if (event->state != PERF_EVENT_STATE_ACTIVE)
3348 return 0;
3349
3350 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3351}
3352
3353/*
3354 * Callers need to ensure there can be no nesting of this function, otherwise
3355 * the seqlock logic goes bad. We can not serialize this because the arch
3356 * code calls this from NMI context.
3357 */
3358void perf_event_update_userpage(struct perf_event *event)
3359{
3360 struct perf_event_mmap_page *userpg;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003361 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003362
3363 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003364 buffer = rcu_dereference(event->buffer);
3365 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003366 goto unlock;
3367
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003368 userpg = buffer->user_page;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003369
3370 /*
3371 * Disable preemption so as to not let the corresponding user-space
3372 * spin too long if we get preempted.
3373 */
3374 preempt_disable();
3375 ++userpg->lock;
3376 barrier();
3377 userpg->index = perf_event_index(event);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003378 userpg->offset = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003379 if (event->state == PERF_EVENT_STATE_ACTIVE)
Peter Zijlstrae7850592010-05-21 14:43:08 +02003380 userpg->offset -= local64_read(&event->hw.prev_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003381
3382 userpg->time_enabled = event->total_time_enabled +
3383 atomic64_read(&event->child_total_time_enabled);
3384
3385 userpg->time_running = event->total_time_running +
3386 atomic64_read(&event->child_total_time_running);
3387
3388 barrier();
3389 ++userpg->lock;
3390 preempt_enable();
3391unlock:
3392 rcu_read_unlock();
3393}
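/*
 * Illustrative userspace sketch (not part of this file): the reader side of
 * the seqlock-style protocol used by perf_event_update_userpage().  "pc" is
 * the first page of the event's mmap()ed area; the rmb() macro below is a
 * stand-in full barrier, an assumption of this example.
 */
#include <stdint.h>
#include <linux/perf_event.h>

#define rmb()	__sync_synchronize()

static uint64_t read_selfmon(volatile struct perf_event_mmap_page *pc,
			     uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;
	uint64_t count;

	for (;;) {
		seq = pc->lock;
		if (seq & 1)			/* writer is mid-update */
			continue;
		rmb();
		count    = pc->offset;		/* the full count only while the
						 * event is off the PMU; otherwise
						 * add the hardware counter chosen
						 * by pc->index (arch specific,
						 * omitted here) */
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		rmb();
		if (pc->lock == seq)		/* no update raced with us */
			break;
	}

	return count;
}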
3394
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003395static unsigned long perf_data_size(struct perf_buffer *buffer);
3396
3397static void
3398perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3399{
3400 long max_size = perf_data_size(buffer);
3401
3402 if (watermark)
3403 buffer->watermark = min(max_size, watermark);
3404
3405 if (!buffer->watermark)
3406 buffer->watermark = max_size / 2;
3407
3408 if (flags & PERF_BUFFER_WRITABLE)
3409 buffer->writable = 1;
3410
3411 atomic_set(&buffer->refcount, 1);
3412}
3413
Peter Zijlstra906010b2009-09-21 16:08:49 +02003414#ifndef CONFIG_PERF_USE_VMALLOC
3415
3416/*
3417 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3418 */
3419
3420static struct page *
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003421perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003422{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003423 if (pgoff > buffer->nr_pages)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003424 return NULL;
3425
3426 if (pgoff == 0)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003427 return virt_to_page(buffer->user_page);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003428
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003429 return virt_to_page(buffer->data_pages[pgoff - 1]);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003430}
3431
Peter Zijlstraa19d35c2010-05-17 18:48:00 +02003432static void *perf_mmap_alloc_page(int cpu)
3433{
3434 struct page *page;
3435 int node;
3436
3437 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3438 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3439 if (!page)
3440 return NULL;
3441
3442 return page_address(page);
3443}
3444
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003445static struct perf_buffer *
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003446perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003447{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003448 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003449 unsigned long size;
3450 int i;
3451
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003452 size = sizeof(struct perf_buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003453 size += nr_pages * sizeof(void *);
3454
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003455 buffer = kzalloc(size, GFP_KERNEL);
3456 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003457 goto fail;
3458
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003459 buffer->user_page = perf_mmap_alloc_page(cpu);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003460 if (!buffer->user_page)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003461 goto fail_user_page;
3462
3463 for (i = 0; i < nr_pages; i++) {
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003464 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003465 if (!buffer->data_pages[i])
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003466 goto fail_data_pages;
3467 }
3468
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003469 buffer->nr_pages = nr_pages;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003470
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003471 perf_buffer_init(buffer, watermark, flags);
3472
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003473 return buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003474
3475fail_data_pages:
3476 for (i--; i >= 0; i--)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003477 free_page((unsigned long)buffer->data_pages[i]);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003478
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003479 free_page((unsigned long)buffer->user_page);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003480
3481fail_user_page:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003482 kfree(buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003483
3484fail:
Peter Zijlstra906010b2009-09-21 16:08:49 +02003485 return NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003486}
3487
3488static void perf_mmap_free_page(unsigned long addr)
3489{
3490 struct page *page = virt_to_page((void *)addr);
3491
3492 page->mapping = NULL;
3493 __free_page(page);
3494}
3495
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003496static void perf_buffer_free(struct perf_buffer *buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003497{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003498 int i;
3499
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003500 perf_mmap_free_page((unsigned long)buffer->user_page);
3501 for (i = 0; i < buffer->nr_pages; i++)
3502 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3503 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003504}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003505
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003506static inline int page_order(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003507{
3508 return 0;
3509}
3510
Peter Zijlstra906010b2009-09-21 16:08:49 +02003511#else
3512
3513/*
3514 * Back perf_mmap() with vmalloc memory.
3515 *
3516 * Required for architectures that have d-cache aliasing issues.
3517 */
3518
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003519static inline int page_order(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003520{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003521 return buffer->page_order;
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003522}
3523
Peter Zijlstra906010b2009-09-21 16:08:49 +02003524static struct page *
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003525perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003526{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003527 if (pgoff > (1UL << page_order(buffer)))
Peter Zijlstra906010b2009-09-21 16:08:49 +02003528 return NULL;
3529
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003530 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003531}
3532
3533static void perf_mmap_unmark_page(void *addr)
3534{
3535 struct page *page = vmalloc_to_page(addr);
3536
3537 page->mapping = NULL;
3538}
3539
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003540static void perf_buffer_free_work(struct work_struct *work)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003541{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003542 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003543 void *base;
3544 int i, nr;
3545
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003546 buffer = container_of(work, struct perf_buffer, work);
3547 nr = 1 << page_order(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003548
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003549 base = buffer->user_page;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003550 for (i = 0; i < nr + 1; i++)
3551 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3552
3553 vfree(base);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003554 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003555}
3556
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003557static void perf_buffer_free(struct perf_buffer *buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003558{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003559 schedule_work(&buffer->work);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003560}
3561
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003562static struct perf_buffer *
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003563perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003564{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003565 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003566 unsigned long size;
3567 void *all_buf;
3568
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003569 size = sizeof(struct perf_buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003570 size += sizeof(void *);
3571
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003572 buffer = kzalloc(size, GFP_KERNEL);
3573 if (!buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003574 goto fail;
3575
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003576 INIT_WORK(&buffer->work, perf_buffer_free_work);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003577
3578 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3579 if (!all_buf)
3580 goto fail_all_buf;
3581
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003582 buffer->user_page = all_buf;
3583 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3584 buffer->page_order = ilog2(nr_pages);
3585 buffer->nr_pages = 1;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003586
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003587 perf_buffer_init(buffer, watermark, flags);
3588
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003589 return buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003590
3591fail_all_buf:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003592 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003593
3594fail:
3595 return NULL;
3596}
3597
3598#endif
3599
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003600static unsigned long perf_data_size(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003601{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003602 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003603}
3604
Peter Zijlstra906010b2009-09-21 16:08:49 +02003605static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3606{
3607 struct perf_event *event = vma->vm_file->private_data;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003608 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003609 int ret = VM_FAULT_SIGBUS;
3610
3611 if (vmf->flags & FAULT_FLAG_MKWRITE) {
3612 if (vmf->pgoff == 0)
3613 ret = 0;
3614 return ret;
3615 }
3616
3617 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003618 buffer = rcu_dereference(event->buffer);
3619 if (!buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003620 goto unlock;
3621
3622 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3623 goto unlock;
3624
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003625 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003626 if (!vmf->page)
3627 goto unlock;
3628
3629 get_page(vmf->page);
3630 vmf->page->mapping = vma->vm_file->f_mapping;
3631 vmf->page->index = vmf->pgoff;
3632
3633 ret = 0;
3634unlock:
3635 rcu_read_unlock();
3636
3637 return ret;
3638}
3639
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003640static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003641{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003642 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003643
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003644 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
3645 perf_buffer_free(buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003646}
3647
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003648static struct perf_buffer *perf_buffer_get(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003649{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003650 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003651
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003652 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003653 buffer = rcu_dereference(event->buffer);
3654 if (buffer) {
3655 if (!atomic_inc_not_zero(&buffer->refcount))
3656 buffer = NULL;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003657 }
3658 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003659
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003660 return buffer;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003661}
3662
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003663static void perf_buffer_put(struct perf_buffer *buffer)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003664{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003665 if (!atomic_dec_and_test(&buffer->refcount))
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003666 return;
3667
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003668 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003669}
3670
3671static void perf_mmap_open(struct vm_area_struct *vma)
3672{
3673 struct perf_event *event = vma->vm_file->private_data;
3674
3675 atomic_inc(&event->mmap_count);
3676}
3677
3678static void perf_mmap_close(struct vm_area_struct *vma)
3679{
3680 struct perf_event *event = vma->vm_file->private_data;
3681
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003682 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003683 unsigned long size = perf_data_size(event->buffer);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003684 struct user_struct *user = event->mmap_user;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003685 struct perf_buffer *buffer = event->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003686
Peter Zijlstra906010b2009-09-21 16:08:49 +02003687 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003688 vma->vm_mm->locked_vm -= event->mmap_locked;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003689 rcu_assign_pointer(event->buffer, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003690 mutex_unlock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003691
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003692 perf_buffer_put(buffer);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003693 free_uid(user);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003694 }
3695}
3696
Alexey Dobriyanf0f37e22009-09-27 22:29:37 +04003697static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003698 .open = perf_mmap_open,
3699 .close = perf_mmap_close,
3700 .fault = perf_mmap_fault,
3701 .page_mkwrite = perf_mmap_fault,
3702};
3703
3704static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3705{
3706 struct perf_event *event = file->private_data;
3707 unsigned long user_locked, user_lock_limit;
3708 struct user_struct *user = current_user();
3709 unsigned long locked, lock_limit;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003710 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003711 unsigned long vma_size;
3712 unsigned long nr_pages;
3713 long user_extra, extra;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003714 int ret = 0, flags = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003715
Peter Zijlstrac7920612010-05-18 10:33:24 +02003716 /*
3717 * Don't allow mmap() of inherited per-task counters. This would
3718 * create a performance issue due to all children writing to the
3719 * same buffer.
3720 */
3721 if (event->cpu == -1 && event->attr.inherit)
3722 return -EINVAL;
3723
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003724 if (!(vma->vm_flags & VM_SHARED))
3725 return -EINVAL;
3726
3727 vma_size = vma->vm_end - vma->vm_start;
3728 nr_pages = (vma_size / PAGE_SIZE) - 1;
3729
3730 /*
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003731 * If we have buffer pages ensure they're a power-of-two number, so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003732 * can do bitmasks instead of modulo.
3733 */
3734 if (nr_pages != 0 && !is_power_of_2(nr_pages))
3735 return -EINVAL;
3736
3737 if (vma_size != PAGE_SIZE * (1 + nr_pages))
3738 return -EINVAL;
3739
3740 if (vma->vm_pgoff != 0)
3741 return -EINVAL;
3742
3743 WARN_ON_ONCE(event->ctx->parent_ctx);
3744 mutex_lock(&event->mmap_mutex);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003745 if (event->buffer) {
3746 if (event->buffer->nr_pages == nr_pages)
3747 atomic_inc(&event->buffer->refcount);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003748 else
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003749 ret = -EINVAL;
3750 goto unlock;
3751 }
3752
3753 user_extra = nr_pages + 1;
3754 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3755
3756 /*
3757 * Increase the limit linearly with more CPUs:
3758 */
3759 user_lock_limit *= num_online_cpus();
3760
3761 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3762
3763 extra = 0;
3764 if (user_locked > user_lock_limit)
3765 extra = user_locked - user_lock_limit;
3766
Jiri Slaby78d7d402010-03-05 13:42:54 -08003767 lock_limit = rlimit(RLIMIT_MEMLOCK);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003768 lock_limit >>= PAGE_SHIFT;
3769 locked = vma->vm_mm->locked_vm + extra;
3770
3771 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3772 !capable(CAP_IPC_LOCK)) {
3773 ret = -EPERM;
3774 goto unlock;
3775 }
3776
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003777 WARN_ON(event->buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003778
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003779 if (vma->vm_flags & VM_WRITE)
3780 flags |= PERF_BUFFER_WRITABLE;
3781
3782 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
3783 event->cpu, flags);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003784 if (!buffer) {
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003785 ret = -ENOMEM;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003786 goto unlock;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003787 }
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003788 rcu_assign_pointer(event->buffer, buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003789
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003790 atomic_long_add(user_extra, &user->locked_vm);
3791 event->mmap_locked = extra;
3792 event->mmap_user = get_current_user();
3793 vma->vm_mm->locked_vm += event->mmap_locked;
3794
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003795unlock:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003796 if (!ret)
3797 atomic_inc(&event->mmap_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003798 mutex_unlock(&event->mmap_mutex);
3799
3800 vma->vm_flags |= VM_RESERVED;
3801 vma->vm_ops = &perf_mmap_vmops;
3802
3803 return ret;
3804}
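/*
 * Illustrative userspace sketch (not part of this file): mapping the buffer
 * that perf_mmap() backs.  The data area must be a power-of-two number of
 * pages, preceded by one page for struct perf_event_mmap_page.
 */
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

static struct perf_event_mmap_page *map_ring(int fd, size_t data_pages /* power of 2 */)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	void *base = mmap(NULL, (data_pages + 1) * page,
			  PROT_READ | PROT_WRITE,	/* PROT_WRITE => writable tail */
			  MAP_SHARED, fd, 0);

	return base == MAP_FAILED ? NULL : (struct perf_event_mmap_page *)base;
}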
3805
3806static int perf_fasync(int fd, struct file *filp, int on)
3807{
3808 struct inode *inode = filp->f_path.dentry->d_inode;
3809 struct perf_event *event = filp->private_data;
3810 int retval;
3811
3812 mutex_lock(&inode->i_mutex);
3813 retval = fasync_helper(fd, filp, on, &event->fasync);
3814 mutex_unlock(&inode->i_mutex);
3815
3816 if (retval < 0)
3817 return retval;
3818
3819 return 0;
3820}
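/*
 * Illustrative userspace sketch (not part of this file): asking for SIGIO
 * instead of poll(2).  perf_fasync() above wires the fd into the generic
 * fasync machinery, and perf_event_wakeup() delivers through kill_fasync().
 */
#include <fcntl.h>
#include <unistd.h>

static int request_sigio(int fd)
{
	if (fcntl(fd, F_SETOWN, getpid()) < 0)
		return -1;
	return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}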
3821
3822static const struct file_operations perf_fops = {
Arnd Bergmann3326c1c2010-03-23 19:09:33 +01003823 .llseek = no_llseek,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003824 .release = perf_release,
3825 .read = perf_read,
3826 .poll = perf_poll,
3827 .unlocked_ioctl = perf_ioctl,
3828 .compat_ioctl = perf_ioctl,
3829 .mmap = perf_mmap,
3830 .fasync = perf_fasync,
3831};
3832
3833/*
3834 * Perf event wakeup
3835 *
3836 * If there's data, ensure we set the poll() state and publish everything
3837 * to user-space before waking everybody up.
3838 */
3839
3840void perf_event_wakeup(struct perf_event *event)
3841{
3842 wake_up_all(&event->waitq);
3843
3844 if (event->pending_kill) {
3845 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3846 event->pending_kill = 0;
3847 }
3848}
3849
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003850static void perf_pending_event(struct irq_work *entry)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003851{
3852 struct perf_event *event = container_of(entry,
3853 struct perf_event, pending);
3854
3855 if (event->pending_disable) {
3856 event->pending_disable = 0;
3857 __perf_event_disable(event);
3858 }
3859
3860 if (event->pending_wakeup) {
3861 event->pending_wakeup = 0;
3862 perf_event_wakeup(event);
3863 }
3864}
3865
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003866/*
Zhang, Yanmin39447b32010-04-19 13:32:41 +08003867 * We assume there is only KVM supporting the callbacks.
3868 * Later on, we might change it to a list if there is
3869 * another virtualization implementation supporting the callbacks.
3870 */
3871struct perf_guest_info_callbacks *perf_guest_cbs;
3872
3873int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3874{
3875 perf_guest_cbs = cbs;
3876 return 0;
3877}
3878EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3879
3880int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3881{
3882 perf_guest_cbs = NULL;
3883 return 0;
3884}
3885EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3886
3887/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003888 * Output
3889 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003890static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003891 unsigned long offset, unsigned long head)
3892{
3893 unsigned long mask;
3894
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003895 if (!buffer->writable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003896 return true;
3897
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003898 mask = perf_data_size(buffer) - 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003899
3900 offset = (offset - tail) & mask;
3901 head = (head - tail) & mask;
3902
3903 if ((int)(head - offset) < 0)
3904 return false;
3905
3906 return true;
3907}
3908
3909static void perf_output_wakeup(struct perf_output_handle *handle)
3910{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003911 atomic_set(&handle->buffer->poll, POLL_IN);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003912
3913 if (handle->nmi) {
3914 handle->event->pending_wakeup = 1;
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003915 irq_work_queue(&handle->event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003916 } else
3917 perf_event_wakeup(handle->event);
3918}
3919
3920/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003921 * We need to ensure a later event_id doesn't publish a head when a former
Peter Zijlstraef607772010-05-18 10:50:41 +02003922 * event isn't done writing. However since we need to deal with NMIs we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003923 * cannot fully serialize things.
3924 *
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003925 * We only publish the head (and generate a wakeup) when the outer-most
Peter Zijlstraef607772010-05-18 10:50:41 +02003926 * event completes.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003927 */
Peter Zijlstraef607772010-05-18 10:50:41 +02003928static void perf_output_get_handle(struct perf_output_handle *handle)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003929{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003930 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003931
Peter Zijlstraef607772010-05-18 10:50:41 +02003932 preempt_disable();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003933 local_inc(&buffer->nest);
3934 handle->wakeup = local_read(&buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003935}
3936
Peter Zijlstraef607772010-05-18 10:50:41 +02003937static void perf_output_put_handle(struct perf_output_handle *handle)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003938{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003939 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003940 unsigned long head;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003941
3942again:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003943 head = local_read(&buffer->head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003944
3945 /*
Peter Zijlstraef607772010-05-18 10:50:41 +02003946 * IRQ/NMI can happen here, which means we can miss a head update.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003947 */
3948
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003949 if (!local_dec_and_test(&buffer->nest))
Frederic Weisbeckeracd35a42010-05-20 21:28:34 +02003950 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003951
3952 /*
Peter Zijlstraef607772010-05-18 10:50:41 +02003953 * Publish the known good head. Rely on the full barrier implied
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003954	 * by local_dec_and_test() to order the buffer->head read and this
Peter Zijlstraef607772010-05-18 10:50:41 +02003955 * write.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003956 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003957 buffer->user_page->data_head = head;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003958
Peter Zijlstraef607772010-05-18 10:50:41 +02003959 /*
3960 * Now check if we missed an update, rely on the (compiler)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003961	 * barrier in local_dec_and_test() to re-read buffer->head.
Peter Zijlstraef607772010-05-18 10:50:41 +02003962 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003963 if (unlikely(head != local_read(&buffer->head))) {
3964 local_inc(&buffer->nest);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003965 goto again;
3966 }
3967
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003968 if (handle->wakeup != local_read(&buffer->wakeup))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003969 perf_output_wakeup(handle);
Peter Zijlstraef607772010-05-18 10:50:41 +02003970
Peter Zijlstra9ed60602010-06-11 17:36:35 +02003971out:
Peter Zijlstraef607772010-05-18 10:50:41 +02003972 preempt_enable();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003973}
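/*
 * Illustrative userspace sketch (not part of this file): the consumer side
 * of the head/tail protocol above.  Read data_head once, consume complete
 * records, then publish data_tail so perf_output_space() sees the freed
 * room.  The barrier macros are stand-ins, and record wrap-around at the
 * end of the buffer is ignored to keep the sketch short.
 */
#include <stdint.h>
#include <linux/perf_event.h>

#define rmb()	__sync_synchronize()
#define mb()	__sync_synchronize()

static void drain_ring(struct perf_event_mmap_page *pc, unsigned char *data,
		       uint64_t data_size,	/* power-of-two size in bytes */
		       void (*handle)(struct perf_event_header *))
{
	uint64_t head = pc->data_head;
	uint64_t tail = pc->data_tail;

	rmb();					/* read head before any record data */

	while (tail < head) {
		struct perf_event_header *hdr =
			(struct perf_event_header *)(data + (tail & (data_size - 1)));

		handle(hdr);
		tail += hdr->size;
	}

	mb();					/* finish reading before freeing the space */
	pc->data_tail = tail;
}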
3974
Peter Zijlstraa94ffaa2010-05-20 19:50:07 +02003975__always_inline void perf_output_copy(struct perf_output_handle *handle,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003976 const void *buf, unsigned int len)
3977{
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003978 do {
Peter Zijlstraa94ffaa2010-05-20 19:50:07 +02003979 unsigned long size = min_t(unsigned long, handle->size, len);
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003980
3981 memcpy(handle->addr, buf, size);
3982
3983 len -= size;
3984 handle->addr += size;
Frederic Weisbecker74048f82010-05-27 21:34:58 +02003985 buf += size;
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003986 handle->size -= size;
3987 if (!handle->size) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003988 struct perf_buffer *buffer = handle->buffer;
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003989
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003990 handle->page++;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003991 handle->page &= buffer->nr_pages - 1;
3992 handle->addr = buffer->data_pages[handle->page];
3993 handle->size = PAGE_SIZE << page_order(buffer);
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003994 }
3995 } while (len);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003996}
3997
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02003998static void __perf_event_header__init_id(struct perf_event_header *header,
3999 struct perf_sample_data *data,
4000 struct perf_event *event)
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02004001{
4002 u64 sample_type = event->attr.sample_type;
4003
4004 data->type = sample_type;
4005 header->size += event->id_header_size;
4006
4007 if (sample_type & PERF_SAMPLE_TID) {
4008 /* namespace issues */
4009 data->tid_entry.pid = perf_event_pid(event, current);
4010 data->tid_entry.tid = perf_event_tid(event, current);
4011 }
4012
4013 if (sample_type & PERF_SAMPLE_TIME)
4014 data->time = perf_clock();
4015
4016 if (sample_type & PERF_SAMPLE_ID)
4017 data->id = primary_event_id(event);
4018
4019 if (sample_type & PERF_SAMPLE_STREAM_ID)
4020 data->stream_id = event->id;
4021
4022 if (sample_type & PERF_SAMPLE_CPU) {
4023 data->cpu_entry.cpu = raw_smp_processor_id();
4024 data->cpu_entry.reserved = 0;
4025 }
4026}
4027
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004028static void perf_event_header__init_id(struct perf_event_header *header,
4029 struct perf_sample_data *data,
4030 struct perf_event *event)
4031{
4032 if (event->attr.sample_id_all)
4033 __perf_event_header__init_id(header, data, event);
4034}
4035
4036static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4037 struct perf_sample_data *data)
4038{
4039 u64 sample_type = data->type;
4040
4041 if (sample_type & PERF_SAMPLE_TID)
4042 perf_output_put(handle, data->tid_entry);
4043
4044 if (sample_type & PERF_SAMPLE_TIME)
4045 perf_output_put(handle, data->time);
4046
4047 if (sample_type & PERF_SAMPLE_ID)
4048 perf_output_put(handle, data->id);
4049
4050 if (sample_type & PERF_SAMPLE_STREAM_ID)
4051 perf_output_put(handle, data->stream_id);
4052
4053 if (sample_type & PERF_SAMPLE_CPU)
4054 perf_output_put(handle, data->cpu_entry);
4055}
4056
4057static void perf_event__output_id_sample(struct perf_event *event,
4058 struct perf_output_handle *handle,
4059 struct perf_sample_data *sample)
4060{
4061 if (event->attr.sample_id_all)
4062 __perf_event__output_id_sample(handle, sample);
4063}
4064
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004065int perf_output_begin(struct perf_output_handle *handle,
4066 struct perf_event *event, unsigned int size,
4067 int nmi, int sample)
4068{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004069 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004070 unsigned long tail, offset, head;
4071 int have_lost;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004072 struct perf_sample_data sample_data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004073 struct {
4074 struct perf_event_header header;
4075 u64 id;
4076 u64 lost;
4077 } lost_event;
4078
4079 rcu_read_lock();
4080 /*
4081 * For inherited events we send all the output towards the parent.
4082 */
4083 if (event->parent)
4084 event = event->parent;
4085
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004086 buffer = rcu_dereference(event->buffer);
4087 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004088 goto out;
4089
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004090 handle->buffer = buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004091 handle->event = event;
4092 handle->nmi = nmi;
4093 handle->sample = sample;
4094
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004095 if (!buffer->nr_pages)
Stephane Eranian00d1d0b2010-05-17 12:46:01 +02004096 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004097
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004098 have_lost = local_read(&buffer->lost);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004099 if (have_lost) {
4100 lost_event.header.size = sizeof(lost_event);
4101 perf_event_header__init_id(&lost_event.header, &sample_data,
4102 event);
4103 size += lost_event.header.size;
4104 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004105
Peter Zijlstraef607772010-05-18 10:50:41 +02004106 perf_output_get_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004107
4108 do {
4109 /*
4110 * Userspace could choose to issue a mb() before updating the
 4111		 * tail pointer, so that all reads will be completed before the
4112 * write is issued.
4113 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004114 tail = ACCESS_ONCE(buffer->user_page->data_tail);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004115 smp_rmb();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004116 offset = head = local_read(&buffer->head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004117 head += size;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004118 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004119 goto fail;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004120 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004121
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004122 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4123 local_add(buffer->watermark, &buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004124
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004125 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4126 handle->page &= buffer->nr_pages - 1;
4127 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4128 handle->addr = buffer->data_pages[handle->page];
Peter Zijlstra5d967a82010-05-20 16:46:39 +02004129 handle->addr += handle->size;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004130 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
Peter Zijlstra5d967a82010-05-20 16:46:39 +02004131
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004132 if (have_lost) {
4133 lost_event.header.type = PERF_RECORD_LOST;
4134 lost_event.header.misc = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004135 lost_event.id = event->id;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004136 lost_event.lost = local_xchg(&buffer->lost, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004137
4138 perf_output_put(handle, lost_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004139 perf_event__output_id_sample(event, handle, &sample_data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004140 }
4141
4142 return 0;
4143
4144fail:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004145 local_inc(&buffer->lost);
Peter Zijlstraef607772010-05-18 10:50:41 +02004146 perf_output_put_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004147out:
4148 rcu_read_unlock();
4149
4150 return -ENOSPC;
4151}
4152
4153void perf_output_end(struct perf_output_handle *handle)
4154{
4155 struct perf_event *event = handle->event;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004156 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004157
4158 int wakeup_events = event->attr.wakeup_events;
4159
4160 if (handle->sample && wakeup_events) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004161 int events = local_inc_return(&buffer->events);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004162 if (events >= wakeup_events) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004163 local_sub(wakeup_events, &buffer->events);
4164 local_inc(&buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004165 }
4166 }
4167
Peter Zijlstraef607772010-05-18 10:50:41 +02004168 perf_output_put_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004169 rcu_read_unlock();
4170}
4171
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004172static void perf_output_read_one(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02004173 struct perf_event *event,
4174 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004175{
4176 u64 read_format = event->attr.read_format;
4177 u64 values[4];
4178 int n = 0;
4179
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004180 values[n++] = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004181 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
Stephane Eranianeed01522010-10-26 16:08:01 +02004182 values[n++] = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004183 atomic64_read(&event->child_total_time_enabled);
4184 }
4185 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
Stephane Eranianeed01522010-10-26 16:08:01 +02004186 values[n++] = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004187 atomic64_read(&event->child_total_time_running);
4188 }
4189 if (read_format & PERF_FORMAT_ID)
4190 values[n++] = primary_event_id(event);
4191
4192 perf_output_copy(handle, values, n * sizeof(u64));
4193}
4194
4195/*
4196 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4197 */
4198static void perf_output_read_group(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02004199 struct perf_event *event,
4200 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004201{
4202 struct perf_event *leader = event->group_leader, *sub;
4203 u64 read_format = event->attr.read_format;
4204 u64 values[5];
4205 int n = 0;
4206
4207 values[n++] = 1 + leader->nr_siblings;
4208
4209 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
Stephane Eranianeed01522010-10-26 16:08:01 +02004210 values[n++] = enabled;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004211
4212 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
Stephane Eranianeed01522010-10-26 16:08:01 +02004213 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004214
4215 if (leader != event)
4216 leader->pmu->read(leader);
4217
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004218 values[n++] = perf_event_count(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004219 if (read_format & PERF_FORMAT_ID)
4220 values[n++] = primary_event_id(leader);
4221
4222 perf_output_copy(handle, values, n * sizeof(u64));
4223
4224 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4225 n = 0;
4226
4227 if (sub != event)
4228 sub->pmu->read(sub);
4229
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004230 values[n++] = perf_event_count(sub);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004231 if (read_format & PERF_FORMAT_ID)
4232 values[n++] = primary_event_id(sub);
4233
4234 perf_output_copy(handle, values, n * sizeof(u64));
4235 }
4236}
4237
Stephane Eranianeed01522010-10-26 16:08:01 +02004238#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4239 PERF_FORMAT_TOTAL_TIME_RUNNING)
4240
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004241static void perf_output_read(struct perf_output_handle *handle,
4242 struct perf_event *event)
4243{
Stephane Eranianeed01522010-10-26 16:08:01 +02004244 u64 enabled = 0, running = 0, now, ctx_time;
4245 u64 read_format = event->attr.read_format;
4246
4247 /*
4248 * compute total_time_enabled, total_time_running
4249 * based on snapshot values taken when the event
4250 * was last scheduled in.
4251 *
 4252	 * we cannot simply call update_context_time()
 4253	 * because of locking issues, as we are called in
4254 * NMI context
4255 */
4256 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
4257 now = perf_clock();
4258 ctx_time = event->shadow_ctx_time + now;
4259 enabled = ctx_time - event->tstamp_enabled;
4260 running = ctx_time - event->tstamp_running;
4261 }
4262
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004263 if (event->attr.read_format & PERF_FORMAT_GROUP)
Stephane Eranianeed01522010-10-26 16:08:01 +02004264 perf_output_read_group(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004265 else
Stephane Eranianeed01522010-10-26 16:08:01 +02004266 perf_output_read_one(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004267}
4268
4269void perf_output_sample(struct perf_output_handle *handle,
4270 struct perf_event_header *header,
4271 struct perf_sample_data *data,
4272 struct perf_event *event)
4273{
4274 u64 sample_type = data->type;
4275
4276 perf_output_put(handle, *header);
4277
4278 if (sample_type & PERF_SAMPLE_IP)
4279 perf_output_put(handle, data->ip);
4280
4281 if (sample_type & PERF_SAMPLE_TID)
4282 perf_output_put(handle, data->tid_entry);
4283
4284 if (sample_type & PERF_SAMPLE_TIME)
4285 perf_output_put(handle, data->time);
4286
4287 if (sample_type & PERF_SAMPLE_ADDR)
4288 perf_output_put(handle, data->addr);
4289
4290 if (sample_type & PERF_SAMPLE_ID)
4291 perf_output_put(handle, data->id);
4292
4293 if (sample_type & PERF_SAMPLE_STREAM_ID)
4294 perf_output_put(handle, data->stream_id);
4295
4296 if (sample_type & PERF_SAMPLE_CPU)
4297 perf_output_put(handle, data->cpu_entry);
4298
4299 if (sample_type & PERF_SAMPLE_PERIOD)
4300 perf_output_put(handle, data->period);
4301
4302 if (sample_type & PERF_SAMPLE_READ)
4303 perf_output_read(handle, event);
4304
4305 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4306 if (data->callchain) {
4307 int size = 1;
4308
4309 size += data->callchain->nr;
4311
4312 size *= sizeof(u64);
4313
4314 perf_output_copy(handle, data->callchain, size);
4315 } else {
4316 u64 nr = 0;
4317 perf_output_put(handle, nr);
4318 }
4319 }
4320
4321 if (sample_type & PERF_SAMPLE_RAW) {
4322 if (data->raw) {
4323 perf_output_put(handle, data->raw->size);
4324 perf_output_copy(handle, data->raw->data,
4325 data->raw->size);
4326 } else {
4327 struct {
4328 u32 size;
4329 u32 data;
4330 } raw = {
4331 .size = sizeof(u32),
4332 .data = 0,
4333 };
4334 perf_output_put(handle, raw);
4335 }
4336 }
4337}
4338
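/*
 * The output order above defines the PERF_RECORD_SAMPLE wire format.  With
 * sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
 * PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD, for example, a consumer sees
 * (sketch, struct name illustrative only):
 *
 *	struct sample {
 *		struct perf_event_header header;
 *		u64 ip;             PERF_SAMPLE_IP
 *		u32 pid, tid;       PERF_SAMPLE_TID
 *		u64 time;           PERF_SAMPLE_TIME
 *		u32 cpu, res;       PERF_SAMPLE_CPU
 *		u64 period;         PERF_SAMPLE_PERIOD
 *	};
 *
 * Fields for bits that were not requested are simply absent, so records
 * must be parsed in exactly this order.
 */
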
4339void perf_prepare_sample(struct perf_event_header *header,
4340 struct perf_sample_data *data,
4341 struct perf_event *event,
4342 struct pt_regs *regs)
4343{
4344 u64 sample_type = event->attr.sample_type;
4345
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004346 header->type = PERF_RECORD_SAMPLE;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004347 header->size = sizeof(*header) + event->header_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004348
4349 header->misc = 0;
4350 header->misc |= perf_misc_flags(regs);
4351
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004352 __perf_event_header__init_id(header, data, event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02004353
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004354 if (sample_type & PERF_SAMPLE_IP)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004355 data->ip = perf_instruction_pointer(regs);
4356
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004357 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4358 int size = 1;
4359
4360 data->callchain = perf_callchain(regs);
4361
4362 if (data->callchain)
4363 size += data->callchain->nr;
4364
4365 header->size += size * sizeof(u64);
4366 }
4367
4368 if (sample_type & PERF_SAMPLE_RAW) {
4369 int size = sizeof(u32);
4370
4371 if (data->raw)
4372 size += data->raw->size;
4373 else
4374 size += sizeof(u32);
4375
4376 WARN_ON_ONCE(size & (sizeof(u64)-1));
4377 header->size += size;
4378 }
4379}
4380
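/*
 * Worked size example: for an event sampling PERF_SAMPLE_IP |
 * PERF_SAMPLE_TID | PERF_SAMPLE_TIME the precomputed fixed-size fields
 * add up to 8 (header) + 8 (ip) + 8 (pid/tid) + 8 (time) = 32 bytes.
 * Only the variable-length parts are sized here: a 5-entry callchain
 * grows the record by (1 + 5) * 8 = 48 bytes to 80, and PERF_SAMPLE_RAW
 * adds its u32 size field plus the u64-padded payload.
 */
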
4381static void perf_event_output(struct perf_event *event, int nmi,
4382 struct perf_sample_data *data,
4383 struct pt_regs *regs)
4384{
4385 struct perf_output_handle handle;
4386 struct perf_event_header header;
4387
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004388 /* protect the callchain buffers */
4389 rcu_read_lock();
4390
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004391 perf_prepare_sample(&header, data, event, regs);
4392
4393 if (perf_output_begin(&handle, event, header.size, nmi, 1))
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004394 goto exit;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004395
4396 perf_output_sample(&handle, &header, data, event);
4397
4398 perf_output_end(&handle);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004399
4400exit:
4401 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004402}
4403
4404/*
4405 * read event_id
4406 */
4407
4408struct perf_read_event {
4409 struct perf_event_header header;
4410
4411 u32 pid;
4412 u32 tid;
4413};
4414
4415static void
4416perf_event_read_event(struct perf_event *event,
4417 struct task_struct *task)
4418{
4419 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004420 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004421 struct perf_read_event read_event = {
4422 .header = {
4423 .type = PERF_RECORD_READ,
4424 .misc = 0,
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004425 .size = sizeof(read_event) + event->read_size,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004426 },
4427 .pid = perf_event_pid(event, task),
4428 .tid = perf_event_tid(event, task),
4429 };
4430 int ret;
4431
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004432 perf_event_header__init_id(&read_event.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004433 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
4434 if (ret)
4435 return;
4436
4437 perf_output_put(&handle, read_event);
4438 perf_output_read(&handle, event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004439 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004440
4441 perf_output_end(&handle);
4442}
4443
4444/*
4445 * task tracking -- fork/exit
4446 *
Eric B Munson3af9e852010-05-18 15:30:49 +01004447 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004448 */
4449
4450struct perf_task_event {
4451 struct task_struct *task;
4452 struct perf_event_context *task_ctx;
4453
4454 struct {
4455 struct perf_event_header header;
4456
4457 u32 pid;
4458 u32 ppid;
4459 u32 tid;
4460 u32 ptid;
4461 u64 time;
4462 } event_id;
4463};
4464
4465static void perf_event_task_output(struct perf_event *event,
4466 struct perf_task_event *task_event)
4467{
4468 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004469 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004470 struct task_struct *task = task_event->task;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004471 int ret, size = task_event->event_id.header.size;
Mike Galbraith8bb39f92010-03-26 11:11:33 +01004472
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004473 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004474
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004475 ret = perf_output_begin(&handle, event,
4476 task_event->event_id.header.size, 0, 0);
Peter Zijlstraef607772010-05-18 10:50:41 +02004477 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004478 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004479
4480 task_event->event_id.pid = perf_event_pid(event, task);
4481 task_event->event_id.ppid = perf_event_pid(event, current);
4482
4483 task_event->event_id.tid = perf_event_tid(event, task);
4484 task_event->event_id.ptid = perf_event_tid(event, current);
4485
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004486 perf_output_put(&handle, task_event->event_id);
4487
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004488 perf_event__output_id_sample(event, &handle, &sample);
4489
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004490 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004491out:
4492 task_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004493}
4494
4495static int perf_event_task_match(struct perf_event *event)
4496{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004497 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004498 return 0;
4499
Stephane Eranian5632ab12011-01-03 18:20:01 +02004500 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004501 return 0;
4502
Eric B Munson3af9e852010-05-18 15:30:49 +01004503 if (event->attr.comm || event->attr.mmap ||
4504 event->attr.mmap_data || event->attr.task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004505 return 1;
4506
4507 return 0;
4508}
4509
4510static void perf_event_task_ctx(struct perf_event_context *ctx,
4511 struct perf_task_event *task_event)
4512{
4513 struct perf_event *event;
4514
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004515 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4516 if (perf_event_task_match(event))
4517 perf_event_task_output(event, task_event);
4518 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004519}
4520
4521static void perf_event_task_event(struct perf_task_event *task_event)
4522{
4523 struct perf_cpu_context *cpuctx;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004524 struct perf_event_context *ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004525 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004526 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004527
Peter Zijlstrad6ff86c2009-11-20 22:19:46 +01004528 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004529 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004530 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004531 if (cpuctx->active_pmu != pmu)
4532 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004533 perf_event_task_ctx(&cpuctx->ctx, task_event);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004534
4535 ctx = task_event->task_ctx;
4536 if (!ctx) {
4537 ctxn = pmu->task_ctx_nr;
4538 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004539 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004540 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4541 }
4542 if (ctx)
4543 perf_event_task_ctx(ctx, task_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004544next:
4545 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004546 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004547 rcu_read_unlock();
4548}
4549
4550static void perf_event_task(struct task_struct *task,
4551 struct perf_event_context *task_ctx,
4552 int new)
4553{
4554 struct perf_task_event task_event;
4555
4556 if (!atomic_read(&nr_comm_events) &&
4557 !atomic_read(&nr_mmap_events) &&
4558 !atomic_read(&nr_task_events))
4559 return;
4560
4561 task_event = (struct perf_task_event){
4562 .task = task,
4563 .task_ctx = task_ctx,
4564 .event_id = {
4565 .header = {
4566 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4567 .misc = 0,
4568 .size = sizeof(task_event.event_id),
4569 },
4570 /* .pid */
4571 /* .ppid */
4572 /* .tid */
4573 /* .ptid */
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004574 .time = perf_clock(),
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004575 },
4576 };
4577
4578 perf_event_task_event(&task_event);
4579}
4580
4581void perf_event_fork(struct task_struct *task)
4582{
4583 perf_event_task(task, NULL, 1);
4584}
4585
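/*
 * Consumer view: a PERF_RECORD_FORK or PERF_RECORD_EXIT record carries
 * exactly the event_id layout above,
 *
 *	struct perf_event_header header;
 *	u32 pid, ppid;
 *	u32 tid, ptid;
 *	u64 time;
 *
 * followed, when attr.sample_id_all is set, by the sample_id fields
 * appended by perf_event__output_id_sample().
 */
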
4586/*
4587 * comm tracking
4588 */
4589
4590struct perf_comm_event {
4591 struct task_struct *task;
4592 char *comm;
4593 int comm_size;
4594
4595 struct {
4596 struct perf_event_header header;
4597
4598 u32 pid;
4599 u32 tid;
4600 } event_id;
4601};
4602
4603static void perf_event_comm_output(struct perf_event *event,
4604 struct perf_comm_event *comm_event)
4605{
4606 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004607 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004608 int size = comm_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004609 int ret;
4610
4611 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4612 ret = perf_output_begin(&handle, event,
4613 comm_event->event_id.header.size, 0, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004614
4615 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004616 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004617
4618 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4619 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4620
4621 perf_output_put(&handle, comm_event->event_id);
4622 perf_output_copy(&handle, comm_event->comm,
4623 comm_event->comm_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004624
4625 perf_event__output_id_sample(event, &handle, &sample);
4626
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004627 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004628out:
4629 comm_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004630}
4631
4632static int perf_event_comm_match(struct perf_event *event)
4633{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004634 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004635 return 0;
4636
Stephane Eranian5632ab12011-01-03 18:20:01 +02004637 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004638 return 0;
4639
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004640 if (event->attr.comm)
4641 return 1;
4642
4643 return 0;
4644}
4645
4646static void perf_event_comm_ctx(struct perf_event_context *ctx,
4647 struct perf_comm_event *comm_event)
4648{
4649 struct perf_event *event;
4650
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004651 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4652 if (perf_event_comm_match(event))
4653 perf_event_comm_output(event, comm_event);
4654 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004655}
4656
4657static void perf_event_comm_event(struct perf_comm_event *comm_event)
4658{
4659 struct perf_cpu_context *cpuctx;
4660 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004661 char comm[TASK_COMM_LEN];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004662 unsigned int size;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004663 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004664 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004665
4666 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01004667 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004668 size = ALIGN(strlen(comm)+1, sizeof(u64));
4669
4670 comm_event->comm = comm;
4671 comm_event->comm_size = size;
4672
4673 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
Peter Zijlstraf6595f32009-11-20 22:19:47 +01004674 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004675 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004676 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004677 if (cpuctx->active_pmu != pmu)
4678 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004679 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004680
4681 ctxn = pmu->task_ctx_nr;
4682 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004683 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004684
4685 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4686 if (ctx)
4687 perf_event_comm_ctx(ctx, comm_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004688next:
4689 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004690 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004691 rcu_read_unlock();
4692}
4693
4694void perf_event_comm(struct task_struct *task)
4695{
4696 struct perf_comm_event comm_event;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004697 struct perf_event_context *ctx;
4698 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004699
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004700 for_each_task_context_nr(ctxn) {
4701 ctx = task->perf_event_ctxp[ctxn];
4702 if (!ctx)
4703 continue;
4704
4705 perf_event_enable_on_exec(ctx);
4706 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004707
4708 if (!atomic_read(&nr_comm_events))
4709 return;
4710
4711 comm_event = (struct perf_comm_event){
4712 .task = task,
4713 /* .comm */
4714 /* .comm_size */
4715 .event_id = {
4716 .header = {
4717 .type = PERF_RECORD_COMM,
4718 .misc = 0,
4719 /* .size */
4720 },
4721 /* .pid */
4722 /* .tid */
4723 },
4724 };
4725
4726 perf_event_comm_event(&comm_event);
4727}
4728
4729/*
4730 * mmap tracking
4731 */
4732
4733struct perf_mmap_event {
4734 struct vm_area_struct *vma;
4735
4736 const char *file_name;
4737 int file_size;
4738
4739 struct {
4740 struct perf_event_header header;
4741
4742 u32 pid;
4743 u32 tid;
4744 u64 start;
4745 u64 len;
4746 u64 pgoff;
4747 } event_id;
4748};
4749
4750static void perf_event_mmap_output(struct perf_event *event,
4751 struct perf_mmap_event *mmap_event)
4752{
4753 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004754 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004755 int size = mmap_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004756 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004757
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004758 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4759 ret = perf_output_begin(&handle, event,
4760 mmap_event->event_id.header.size, 0, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004761 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004762 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004763
4764 mmap_event->event_id.pid = perf_event_pid(event, current);
4765 mmap_event->event_id.tid = perf_event_tid(event, current);
4766
4767 perf_output_put(&handle, mmap_event->event_id);
4768 perf_output_copy(&handle, mmap_event->file_name,
4769 mmap_event->file_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004770
4771 perf_event__output_id_sample(event, &handle, &sample);
4772
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004773 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004774out:
4775 mmap_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004776}
4777
4778static int perf_event_mmap_match(struct perf_event *event,
Eric B Munson3af9e852010-05-18 15:30:49 +01004779 struct perf_mmap_event *mmap_event,
4780 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004781{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004782 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004783 return 0;
4784
Stephane Eranian5632ab12011-01-03 18:20:01 +02004785 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004786 return 0;
4787
Eric B Munson3af9e852010-05-18 15:30:49 +01004788 if ((!executable && event->attr.mmap_data) ||
4789 (executable && event->attr.mmap))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004790 return 1;
4791
4792 return 0;
4793}
4794
4795static void perf_event_mmap_ctx(struct perf_event_context *ctx,
Eric B Munson3af9e852010-05-18 15:30:49 +01004796 struct perf_mmap_event *mmap_event,
4797 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004798{
4799 struct perf_event *event;
4800
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004801 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Eric B Munson3af9e852010-05-18 15:30:49 +01004802 if (perf_event_mmap_match(event, mmap_event, executable))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004803 perf_event_mmap_output(event, mmap_event);
4804 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004805}
4806
4807static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4808{
4809 struct perf_cpu_context *cpuctx;
4810 struct perf_event_context *ctx;
4811 struct vm_area_struct *vma = mmap_event->vma;
4812 struct file *file = vma->vm_file;
4813 unsigned int size;
4814 char tmp[16];
4815 char *buf = NULL;
4816 const char *name;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004817 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004818 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004819
4820 memset(tmp, 0, sizeof(tmp));
4821
4822 if (file) {
4823 /*
4824 * d_path works from the end of the buffer backwards, so we
4825 * need to add enough zero bytes after the string to handle
4826 * the 64bit alignment we do later.
4827 */
4828 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4829 if (!buf) {
4830 name = strncpy(tmp, "//enomem", sizeof(tmp));
4831 goto got_name;
4832 }
4833 name = d_path(&file->f_path, buf, PATH_MAX);
4834 if (IS_ERR(name)) {
4835 name = strncpy(tmp, "//toolong", sizeof(tmp));
4836 goto got_name;
4837 }
4838 } else {
4839 if (arch_vma_name(mmap_event->vma)) {
4840 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4841 sizeof(tmp));
4842 goto got_name;
4843 }
4844
4845 if (!vma->vm_mm) {
4846 name = strncpy(tmp, "[vdso]", sizeof(tmp));
4847 goto got_name;
Eric B Munson3af9e852010-05-18 15:30:49 +01004848 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4849 vma->vm_end >= vma->vm_mm->brk) {
4850 name = strncpy(tmp, "[heap]", sizeof(tmp));
4851 goto got_name;
4852 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4853 vma->vm_end >= vma->vm_mm->start_stack) {
4854 name = strncpy(tmp, "[stack]", sizeof(tmp));
4855 goto got_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004856 }
4857
4858 name = strncpy(tmp, "//anon", sizeof(tmp));
4859 goto got_name;
4860 }
4861
4862got_name:
4863 size = ALIGN(strlen(name)+1, sizeof(u64));
4864
4865 mmap_event->file_name = name;
4866 mmap_event->file_size = size;
4867
4868 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4869
Peter Zijlstraf6d9dd22009-11-20 22:19:48 +01004870 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004871 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004872 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004873 if (cpuctx->active_pmu != pmu)
4874 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004875 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4876 vma->vm_flags & VM_EXEC);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004877
4878 ctxn = pmu->task_ctx_nr;
4879 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004880 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004881
4882 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4883 if (ctx) {
4884 perf_event_mmap_ctx(ctx, mmap_event,
4885 vma->vm_flags & VM_EXEC);
4886 }
Peter Zijlstra41945f62010-09-16 19:17:24 +02004887next:
4888 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004889 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004890 rcu_read_unlock();
4891
4892 kfree(buf);
4893}
4894
Eric B Munson3af9e852010-05-18 15:30:49 +01004895void perf_event_mmap(struct vm_area_struct *vma)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004896{
4897 struct perf_mmap_event mmap_event;
4898
4899 if (!atomic_read(&nr_mmap_events))
4900 return;
4901
4902 mmap_event = (struct perf_mmap_event){
4903 .vma = vma,
4904 /* .file_name */
4905 /* .file_size */
4906 .event_id = {
4907 .header = {
4908 .type = PERF_RECORD_MMAP,
Zhang, Yanmin39447b32010-04-19 13:32:41 +08004909 .misc = PERF_RECORD_MISC_USER,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004910 /* .size */
4911 },
4912 /* .pid */
4913 /* .tid */
4914 .start = vma->vm_start,
4915 .len = vma->vm_end - vma->vm_start,
Peter Zijlstra3a0304e2010-02-26 10:33:41 +01004916 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004917 },
4918 };
4919
4920 perf_event_mmap_event(&mmap_event);
4921}
4922
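/*
 * The resulting PERF_RECORD_MMAP record, as seen in the ring buffer:
 *
 *	struct perf_event_header header;
 *	u32 pid, tid;
 *	u64 addr;                vma->vm_start
 *	u64 len;                 vma->vm_end - vma->vm_start
 *	u64 pgoff;               file offset in bytes
 *	char filename[];         d_path() result, "//anon", "[vdso]", ...,
 *	                         zero padded to a multiple of 8 bytes
 *
 * Tools replay these records to map sampled instruction pointers back to
 * the DSO and symbol they fell in.
 */
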
4923/*
4924 * IRQ throttle logging
4925 */
4926
4927static void perf_log_throttle(struct perf_event *event, int enable)
4928{
4929 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004930 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004931 int ret;
4932
4933 struct {
4934 struct perf_event_header header;
4935 u64 time;
4936 u64 id;
4937 u64 stream_id;
4938 } throttle_event = {
4939 .header = {
4940 .type = PERF_RECORD_THROTTLE,
4941 .misc = 0,
4942 .size = sizeof(throttle_event),
4943 },
4944 .time = perf_clock(),
4945 .id = primary_event_id(event),
4946 .stream_id = event->id,
4947 };
4948
4949 if (enable)
4950 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4951
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004952 perf_event_header__init_id(&throttle_event.header, &sample, event);
4953
4954 ret = perf_output_begin(&handle, event,
4955 throttle_event.header.size, 1, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004956 if (ret)
4957 return;
4958
4959 perf_output_put(&handle, throttle_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004960 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004961 perf_output_end(&handle);
4962}
4963
4964/*
4965 * Generic event overflow handling, sampling.
4966 */
4967
4968static int __perf_event_overflow(struct perf_event *event, int nmi,
4969 int throttle, struct perf_sample_data *data,
4970 struct pt_regs *regs)
4971{
4972 int events = atomic_read(&event->event_limit);
4973 struct hw_perf_event *hwc = &event->hw;
4974 int ret = 0;
4975
Peter Zijlstra96398822010-11-24 18:55:29 +01004976 /*
4977 * Non-sampling counters might still use the PMI to fold short
4978 * hardware counters; ignore those.
4979 */
4980 if (unlikely(!is_sampling_event(event)))
4981 return 0;
4982
Peter Zijlstra163ec432011-02-16 11:22:34 +01004983 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4984 if (throttle) {
4985 hwc->interrupts = MAX_INTERRUPTS;
4986 perf_log_throttle(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004987 ret = 1;
4988 }
Peter Zijlstra163ec432011-02-16 11:22:34 +01004989 } else
4990 hwc->interrupts++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004991
4992 if (event->attr.freq) {
4993 u64 now = perf_clock();
Peter Zijlstraabd50712010-01-26 18:50:16 +01004994 s64 delta = now - hwc->freq_time_stamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004995
Peter Zijlstraabd50712010-01-26 18:50:16 +01004996 hwc->freq_time_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004997
Peter Zijlstraabd50712010-01-26 18:50:16 +01004998 if (delta > 0 && delta < 2*TICK_NSEC)
4999 perf_adjust_period(event, delta, hwc->last_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005000 }
5001
5002 /*
5003 * XXX event_limit might not quite work as expected on inherited
5004 * events
5005 */
5006
5007 event->pending_kill = POLL_IN;
5008 if (events && atomic_dec_and_test(&event->event_limit)) {
5009 ret = 1;
5010 event->pending_kill = POLL_HUP;
5011 if (nmi) {
5012 event->pending_disable = 1;
Peter Zijlstrae360adb2010-10-14 14:01:34 +08005013 irq_work_queue(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005014 } else
5015 perf_event_disable(event);
5016 }
5017
Peter Zijlstra453f19e2009-11-20 22:19:43 +01005018 if (event->overflow_handler)
5019 event->overflow_handler(event, nmi, data, regs);
5020 else
5021 perf_event_output(event, nmi, data, regs);
5022
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005023 return ret;
5024}
5025
5026int perf_event_overflow(struct perf_event *event, int nmi,
5027 struct perf_sample_data *data,
5028 struct pt_regs *regs)
5029{
5030 return __perf_event_overflow(event, nmi, 1, data, regs);
5031}
5032
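/*
 * The event_limit / POLL_HUP path above is what PERF_EVENT_IOC_REFRESH
 * drives: the ioctl adds its argument to event_limit and enables the
 * event, each overflow decrements the limit, and when it hits zero the
 * event is disabled and POLL_HUP is signalled.  A rough userspace sketch
 * of the usual self-monitoring pattern (fd is a sampling event from
 * perf_event_open(), error handling omitted):
 *
 *	fcntl(fd, F_SETFL, O_ASYNC);
 *	fcntl(fd, F_SETSIG, SIGIO);
 *	fcntl(fd, F_SETOWN, getpid());
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);     arm for one overflow
 *
 * and in the SIGIO handler, after consuming the sample:
 *
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);     re-arm
 */
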
5033/*
5034 * Generic software event infrastructure
5035 */
5036
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005037struct swevent_htable {
5038 struct swevent_hlist *swevent_hlist;
5039 struct mutex hlist_mutex;
5040 int hlist_refcount;
5041
5042 /* Recursion avoidance in each contexts */
5043 int recursion[PERF_NR_CONTEXTS];
5044};
5045
5046static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5047
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005048/*
5049 * We directly increment event->count and keep a second value in
5050 * event->hw.period_left to count intervals. This period value
5051 * is kept in the range [-sample_period, 0] so that we can use the
5052 * sign as trigger.
5053 */
5054
5055static u64 perf_swevent_set_period(struct perf_event *event)
5056{
5057 struct hw_perf_event *hwc = &event->hw;
5058 u64 period = hwc->last_period;
5059 u64 nr, offset;
5060 s64 old, val;
5061
5062 hwc->last_period = hwc->sample_period;
5063
5064again:
Peter Zijlstrae7850592010-05-21 14:43:08 +02005065 old = val = local64_read(&hwc->period_left);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005066 if (val < 0)
5067 return 0;
5068
5069 nr = div64_u64(period + val, period);
5070 offset = nr * period;
5071 val -= offset;
Peter Zijlstrae7850592010-05-21 14:43:08 +02005072 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005073 goto again;
5074
5075 return nr;
5076}
5077
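/*
 * Worked example for perf_swevent_set_period(): with a sample_period of
 * 100, period_left normally sits in [-100, 0].  Suppose accumulated
 * events pushed it to +30: nr = (100 + 30) / 100 = 1, offset = 100, and
 * period_left becomes 30 - 100 = -70, i.e. one overflow is reported and
 * the next one is due after 70 more events.  A large burst that pushed
 * period_left to +230 would give nr = 3 and again leave -70 behind.
 */
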
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005078static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005079 int nmi, struct perf_sample_data *data,
5080 struct pt_regs *regs)
5081{
5082 struct hw_perf_event *hwc = &event->hw;
5083 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005084
5085 data->period = event->hw.last_period;
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005086 if (!overflow)
5087 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005088
5089 if (hwc->interrupts == MAX_INTERRUPTS)
5090 return;
5091
5092 for (; overflow; overflow--) {
5093 if (__perf_event_overflow(event, nmi, throttle,
5094 data, regs)) {
5095 /*
5096 * We inhibit the overflow from happening when
5097 * hwc->interrupts == MAX_INTERRUPTS.
5098 */
5099 break;
5100 }
5101 throttle = 1;
5102 }
5103}
5104
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005105static void perf_swevent_event(struct perf_event *event, u64 nr,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005106 int nmi, struct perf_sample_data *data,
5107 struct pt_regs *regs)
5108{
5109 struct hw_perf_event *hwc = &event->hw;
5110
Peter Zijlstrae7850592010-05-21 14:43:08 +02005111 local64_add(nr, &event->count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005112
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005113 if (!regs)
5114 return;
5115
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005116 if (!is_sampling_event(event))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005117 return;
5118
5119 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5120 return perf_swevent_overflow(event, 1, nmi, data, regs);
5121
Peter Zijlstrae7850592010-05-21 14:43:08 +02005122 if (local64_add_negative(nr, &hwc->period_left))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005123 return;
5124
5125 perf_swevent_overflow(event, 0, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005126}
5127
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005128static int perf_exclude_event(struct perf_event *event,
5129 struct pt_regs *regs)
5130{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005131 if (event->hw.state & PERF_HES_STOPPED)
5132 return 0;
5133
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005134 if (regs) {
5135 if (event->attr.exclude_user && user_mode(regs))
5136 return 1;
5137
5138 if (event->attr.exclude_kernel && !user_mode(regs))
5139 return 1;
5140 }
5141
5142 return 0;
5143}
5144
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005145static int perf_swevent_match(struct perf_event *event,
5146 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08005147 u32 event_id,
5148 struct perf_sample_data *data,
5149 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005150{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005151 if (event->attr.type != type)
5152 return 0;
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005153
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005154 if (event->attr.config != event_id)
5155 return 0;
5156
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005157 if (perf_exclude_event(event, regs))
5158 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005159
5160 return 1;
5161}
5162
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005163static inline u64 swevent_hash(u64 type, u32 event_id)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005164{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005165 u64 val = event_id | (type << 32);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005166
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005167 return hash_64(val, SWEVENT_HLIST_BITS);
5168}
5169
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005170static inline struct hlist_head *
5171__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005172{
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005173 u64 hash = swevent_hash(type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005174
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005175 return &hlist->heads[hash];
5176}
5177
5178/* For the read side: events when they trigger */
5179static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005180find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005181{
5182 struct swevent_hlist *hlist;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005183
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005184 hlist = rcu_dereference(swhash->swevent_hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005185 if (!hlist)
5186 return NULL;
5187
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005188 return __find_swevent_head(hlist, type, event_id);
5189}
5190
5191/* For the event head insertion and removal in the hlist */
5192static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005193find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005194{
5195 struct swevent_hlist *hlist;
5196 u32 event_id = event->attr.config;
5197 u64 type = event->attr.type;
5198
5199 /*
5200 * Event scheduling is always serialized against hlist allocation
5201 * and release, which makes the protected version suitable here.
5202 * The context lock guarantees that.
5203 */
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005204 hlist = rcu_dereference_protected(swhash->swevent_hlist,
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005205 lockdep_is_held(&event->ctx->lock));
5206 if (!hlist)
5207 return NULL;
5208
5209 return __find_swevent_head(hlist, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005210}
5211
5212static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5213 u64 nr, int nmi,
5214 struct perf_sample_data *data,
5215 struct pt_regs *regs)
5216{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005217 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005218 struct perf_event *event;
5219 struct hlist_node *node;
5220 struct hlist_head *head;
5221
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005222 rcu_read_lock();
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005223 head = find_swevent_head_rcu(swhash, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005224 if (!head)
5225 goto end;
5226
5227 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08005228 if (perf_swevent_match(event, type, event_id, data, regs))
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005229 perf_swevent_event(event, nr, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005230 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005231end:
5232 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005233}
5234
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005235int perf_swevent_get_recursion_context(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005236{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005237 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01005238
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005239 return get_recursion_context(swhash->recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005240}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01005241EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005242
Jesper Juhlfa9f90b2010-11-28 21:39:34 +01005243inline void perf_swevent_put_recursion_context(int rctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005244{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005245 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005246
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005247 put_recursion_context(swhash->recursion, rctx);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01005248}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005249
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005250void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5251 struct pt_regs *regs, u64 addr)
5252{
Ingo Molnara4234bf2009-11-23 10:57:59 +01005253 struct perf_sample_data data;
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005254 int rctx;
5255
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005256 preempt_disable_notrace();
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005257 rctx = perf_swevent_get_recursion_context();
5258 if (rctx < 0)
5259 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005260
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005261 perf_sample_data_init(&data, addr);
Ingo Molnara4234bf2009-11-23 10:57:59 +01005262
5263 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005264
5265 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005266 preempt_enable_notrace();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005267}
5268
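/*
 * In-kernel producers normally use the perf_sw_event() wrapper, which
 * only falls into __perf_sw_event() when perf_swevent_enabled[event_id]
 * indicates there is a listener.  The page fault path, for instance,
 * counts faults with something along the lines of:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 *
 * where nr is the increment, the nmi flag tells the output code it runs
 * in NMI context so ring-buffer wakeups are deferred, and addr ends up
 * in PERF_SAMPLE_ADDR if the consumer asked for it.
 */
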
5269static void perf_swevent_read(struct perf_event *event)
5270{
5271}
5272
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005273static int perf_swevent_add(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005274{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005275 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005276 struct hw_perf_event *hwc = &event->hw;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005277 struct hlist_head *head;
5278
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005279 if (is_sampling_event(event)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005280 hwc->last_period = hwc->sample_period;
5281 perf_swevent_set_period(event);
5282 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005283
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005284 hwc->state = !(flags & PERF_EF_START);
5285
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005286 head = find_swevent_head(swhash, event);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005287 if (WARN_ON_ONCE(!head))
5288 return -EINVAL;
5289
5290 hlist_add_head_rcu(&event->hlist_entry, head);
5291
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005292 return 0;
5293}
5294
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005295static void perf_swevent_del(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005296{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005297 hlist_del_rcu(&event->hlist_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005298}
5299
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005300static void perf_swevent_start(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005301{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005302 event->hw.state = 0;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005303}
5304
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005305static void perf_swevent_stop(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005306{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005307 event->hw.state = PERF_HES_STOPPED;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005308}
5309
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005310/* Deref the hlist from the update side */
5311static inline struct swevent_hlist *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005312swevent_hlist_deref(struct swevent_htable *swhash)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005313{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005314 return rcu_dereference_protected(swhash->swevent_hlist,
5315 lockdep_is_held(&swhash->hlist_mutex));
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005316}
5317
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005318static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5319{
5320 struct swevent_hlist *hlist;
5321
5322 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5323 kfree(hlist);
5324}
5325
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005326static void swevent_hlist_release(struct swevent_htable *swhash)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005327{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005328 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005329
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005330 if (!hlist)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005331 return;
5332
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005333 rcu_assign_pointer(swhash->swevent_hlist, NULL);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005334 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
5335}
5336
5337static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5338{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005339 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005340
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005341 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005342
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005343 if (!--swhash->hlist_refcount)
5344 swevent_hlist_release(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005345
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005346 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005347}
5348
5349static void swevent_hlist_put(struct perf_event *event)
5350{
5351 int cpu;
5352
5353 if (event->cpu != -1) {
5354 swevent_hlist_put_cpu(event, event->cpu);
5355 return;
5356 }
5357
5358 for_each_possible_cpu(cpu)
5359 swevent_hlist_put_cpu(event, cpu);
5360}
5361
5362static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5363{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005364 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005365 int err = 0;
5366
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005367 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005368
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005369 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005370 struct swevent_hlist *hlist;
5371
5372 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5373 if (!hlist) {
5374 err = -ENOMEM;
5375 goto exit;
5376 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005377 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005378 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005379 swhash->hlist_refcount++;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005380exit:
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005381 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005382
5383 return err;
5384}
5385
5386static int swevent_hlist_get(struct perf_event *event)
5387{
5388 int err;
5389 int cpu, failed_cpu;
5390
5391 if (event->cpu != -1)
5392 return swevent_hlist_get_cpu(event, event->cpu);
5393
5394 get_online_cpus();
5395 for_each_possible_cpu(cpu) {
5396 err = swevent_hlist_get_cpu(event, cpu);
5397 if (err) {
5398 failed_cpu = cpu;
5399 goto fail;
5400 }
5401 }
5402 put_online_cpus();
5403
5404 return 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005405fail:
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005406 for_each_possible_cpu(cpu) {
5407 if (cpu == failed_cpu)
5408 break;
5409 swevent_hlist_put_cpu(event, cpu);
5410 }
5411
5412 put_online_cpus();
5413 return err;
5414}
5415
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005416atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005417
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005418static void sw_perf_event_destroy(struct perf_event *event)
5419{
5420 u64 event_id = event->attr.config;
5421
5422 WARN_ON(event->parent);
5423
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005424 jump_label_dec(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005425 swevent_hlist_put(event);
5426}
5427
5428static int perf_swevent_init(struct perf_event *event)
5429{
5430 int event_id = event->attr.config;
5431
5432 if (event->attr.type != PERF_TYPE_SOFTWARE)
5433 return -ENOENT;
5434
5435 switch (event_id) {
5436 case PERF_COUNT_SW_CPU_CLOCK:
5437 case PERF_COUNT_SW_TASK_CLOCK:
5438 return -ENOENT;
5439
5440 default:
5441 break;
5442 }
5443
Dan Carpenterce677832010-10-24 21:50:42 +02005444 if (event_id >= PERF_COUNT_SW_MAX)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005445 return -ENOENT;
5446
5447 if (!event->parent) {
5448 int err;
5449
5450 err = swevent_hlist_get(event);
5451 if (err)
5452 return err;
5453
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005454 jump_label_inc(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005455 event->destroy = sw_perf_event_destroy;
5456 }
5457
5458 return 0;
5459}
5460
5461static struct pmu perf_swevent = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005462 .task_ctx_nr = perf_sw_context,
5463
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005464 .event_init = perf_swevent_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005465 .add = perf_swevent_add,
5466 .del = perf_swevent_del,
5467 .start = perf_swevent_start,
5468 .stop = perf_swevent_stop,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005469 .read = perf_swevent_read,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005470};
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005471
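/*
 * Consumer view of the pmu above: software events are opened through
 * perf_event_open() with type PERF_TYPE_SOFTWARE.  Minimal userspace
 * sketch (there is no glibc wrapper, so the raw syscall is used; error
 * handling omitted):
 *
 *	struct perf_event_attr attr = {
 *		.type   = PERF_TYPE_SOFTWARE,
 *		.size   = sizeof(attr),
 *		.config = PERF_COUNT_SW_CONTEXT_SWITCHES,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	unsigned long long count;
 *	read(fd, &count, sizeof(count));
 *
 * pid 0 / cpu -1 means "this task, on any cpu"; the value read back is
 * the count maintained by perf_swevent_event() via local64_add().
 */
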
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005472#ifdef CONFIG_EVENT_TRACING
5473
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005474static int perf_tp_filter_match(struct perf_event *event,
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005475 struct perf_sample_data *data)
5476{
5477 void *record = data->raw->data;
5478
5479 if (likely(!event->filter) || filter_match_preds(event->filter, record))
5480 return 1;
5481 return 0;
5482}
5483
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005484static int perf_tp_event_match(struct perf_event *event,
5485 struct perf_sample_data *data,
5486 struct pt_regs *regs)
5487{
Peter Zijlstra580d6072010-05-20 20:54:31 +02005488 /*
5489 * All tracepoints are from kernel-space.
5490 */
5491 if (event->attr.exclude_kernel)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005492 return 0;
5493
5494 if (!perf_tp_filter_match(event, data))
5495 return 0;
5496
5497 return 1;
5498}
5499
5500void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005501 struct pt_regs *regs, struct hlist_head *head, int rctx)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005502{
5503 struct perf_sample_data data;
5504 struct perf_event *event;
5505 struct hlist_node *node;
5506
5507 struct perf_raw_record raw = {
5508 .size = entry_size,
5509 .data = record,
5510 };
5511
5512 perf_sample_data_init(&data, addr);
5513 data.raw = &raw;
5514
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005515 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5516 if (perf_tp_event_match(event, &data, regs))
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005517 perf_swevent_event(event, count, 1, &data, regs);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005518 }
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005519
5520 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005521}
5522EXPORT_SYMBOL_GPL(perf_tp_event);
5523
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005524static void tp_perf_event_destroy(struct perf_event *event)
5525{
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005526 perf_trace_destroy(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005527}
5528
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005529static int perf_tp_event_init(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005530{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005531 int err;
5532
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005533 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5534 return -ENOENT;
5535
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005536 err = perf_trace_init(event);
5537 if (err)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005538 return err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005539
5540 event->destroy = tp_perf_event_destroy;
5541
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005542 return 0;
5543}
5544
5545static struct pmu perf_tracepoint = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005546 .task_ctx_nr = perf_sw_context,
5547
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005548 .event_init = perf_tp_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005549 .add = perf_trace_add,
5550 .del = perf_trace_del,
5551 .start = perf_swevent_start,
5552 .stop = perf_swevent_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005553 .read = perf_swevent_read,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005554};
5555
5556static inline void perf_tp_register(void)
5557{
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005558 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005559}
Li Zefan6fb29152009-10-15 11:21:42 +08005560
5561static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5562{
5563 char *filter_str;
5564 int ret;
5565
5566 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5567 return -EINVAL;
5568
5569 filter_str = strndup_user(arg, PAGE_SIZE);
5570 if (IS_ERR(filter_str))
5571 return PTR_ERR(filter_str);
5572
5573 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5574
5575 kfree(filter_str);
5576 return ret;
5577}
5578
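/*
 * The filter string arrives from userspace via the
 * PERF_EVENT_IOC_SET_FILTER ioctl on a tracepoint event fd, e.g.
 * (sketch, fd assumed to be a PERF_TYPE_TRACEPOINT event):
 *
 *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 0");
 *
 * The expression syntax is the same as that of the ftrace per-event
 * 'filter' files under /sys/kernel/debug/tracing/events/.
 */
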
5579static void perf_event_free_filter(struct perf_event *event)
5580{
5581 ftrace_profile_free_filter(event);
5582}
5583
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005584#else
Li Zefan6fb29152009-10-15 11:21:42 +08005585
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005586static inline void perf_tp_register(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005587{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005588}
Li Zefan6fb29152009-10-15 11:21:42 +08005589
5590static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5591{
5592 return -ENOENT;
5593}
5594
5595static void perf_event_free_filter(struct perf_event *event)
5596{
5597}
5598
Li Zefan07b139c2009-12-21 14:27:35 +08005599#endif /* CONFIG_EVENT_TRACING */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005600
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005601#ifdef CONFIG_HAVE_HW_BREAKPOINT
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005602void perf_bp_event(struct perf_event *bp, void *data)
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005603{
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005604 struct perf_sample_data sample;
5605 struct pt_regs *regs = data;
5606
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005607 perf_sample_data_init(&sample, bp->attr.bp_addr);
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005608
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005609 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5610 perf_swevent_event(bp, 1, 1, &sample, regs);
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005611}
5612#endif
5613
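/*
 * Breakpoint events reuse the software event delivery above.  From
 * userspace they are created like any other event, with the watched
 * location described in perf_event_attr (sketch, watched_variable is
 * illustrative only):
 *
 *	struct perf_event_attr attr = {
 *		.type          = PERF_TYPE_BREAKPOINT,
 *		.size          = sizeof(attr),
 *		.bp_type       = HW_BREAKPOINT_W,
 *		.bp_addr       = (unsigned long)&watched_variable,
 *		.bp_len        = HW_BREAKPOINT_LEN_4,
 *		.sample_period = 1,
 *	};
 *
 * Each write to the watched address then surfaces as an overflow of this
 * event, delivered through perf_bp_event() and perf_swevent_event().
 */
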
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005614/*
5615 * hrtimer based swevent callback
5616 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005617
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005618static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005619{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005620 enum hrtimer_restart ret = HRTIMER_RESTART;
5621 struct perf_sample_data data;
5622 struct pt_regs *regs;
5623 struct perf_event *event;
5624 u64 period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005625
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005626 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005627
5628 if (event->state != PERF_EVENT_STATE_ACTIVE)
5629 return HRTIMER_NORESTART;
5630
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005631 event->pmu->read(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005632
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005633 perf_sample_data_init(&data, 0);
5634 data.period = event->hw.last_period;
5635 regs = get_irq_regs();
5636
5637 if (regs && !perf_exclude_event(event, regs)) {
5638 if (!(event->attr.exclude_idle && current->pid == 0))
5639 if (perf_event_overflow(event, 0, &data, regs))
5640 ret = HRTIMER_NORESTART;
5641 }
5642
5643 period = max_t(u64, 10000, event->hw.sample_period);
5644 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5645
5646 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005647}
5648
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005649static void perf_swevent_start_hrtimer(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005650{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005651 struct hw_perf_event *hwc = &event->hw;
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005652 s64 period;
5653
5654 if (!is_sampling_event(event))
5655 return;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005656
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005657 period = local64_read(&hwc->period_left);
5658 if (period) {
5659 if (period < 0)
5660 period = 10000;
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005661
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005662 local64_set(&hwc->period_left, 0);
5663 } else {
5664 period = max_t(u64, 10000, hwc->sample_period);
5665 }
5666 __hrtimer_start_range_ns(&hwc->hrtimer,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005667 ns_to_ktime(period), 0,
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02005668 HRTIMER_MODE_REL_PINNED, 0);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005669}
5670
5671static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5672{
5673 struct hw_perf_event *hwc = &event->hw;
5674
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005675 if (is_sampling_event(event)) {
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005676 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005677 local64_set(&hwc->period_left, ktime_to_ns(remaining));
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005678
5679 hrtimer_cancel(&hwc->hrtimer);
5680 }
5681}
5682
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005683static void perf_swevent_init_hrtimer(struct perf_event *event)
5684{
5685 struct hw_perf_event *hwc = &event->hw;
5686
5687 if (!is_sampling_event(event))
5688 return;
5689
5690 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5691 hwc->hrtimer.function = perf_swevent_hrtimer;
5692
5693 /*
5694 * Since hrtimers have a fixed rate, we can do a static freq->period
5695 * mapping and avoid the whole period adjust feedback stuff.
5696 */
5697 if (event->attr.freq) {
5698 long freq = event->attr.sample_freq;
5699
5700 event->attr.sample_period = NSEC_PER_SEC / freq;
5701 hwc->sample_period = event->attr.sample_period;
5702 local64_set(&hwc->period_left, hwc->sample_period);
5703 event->attr.freq = 0;
5704 }
5705}
5706
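/*
 * Example of the static mapping above: attr.freq = 1 with
 * attr.sample_freq = 4000 becomes a fixed sample_period of
 * NSEC_PER_SEC / 4000 = 250000ns, i.e. the hrtimer fires every 250us
 * no matter how busy the CPU is, which is why no period-adjustment
 * feedback is needed for these events.
 */
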
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005707/*
5708 * Software event: cpu wall time clock
5709 */
5710
5711static void cpu_clock_event_update(struct perf_event *event)
5712{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005713 s64 prev;
5714 u64 now;
5715
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005716 now = local_clock();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005717 prev = local64_xchg(&event->hw.prev_count, now);
5718 local64_add(now - prev, &event->count);
5719}
5720
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005721static void cpu_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005722{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005723 local64_set(&event->hw.prev_count, local_clock());
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005724 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005725}
5726
5727static void cpu_clock_event_stop(struct perf_event *event, int flags)
5728{
5729 perf_swevent_cancel_hrtimer(event);
5730 cpu_clock_event_update(event);
5731}
5732
5733static int cpu_clock_event_add(struct perf_event *event, int flags)
5734{
5735 if (flags & PERF_EF_START)
5736 cpu_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005737
5738 return 0;
5739}
5740
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005741static void cpu_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005742{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005743 cpu_clock_event_stop(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005744}
5745
5746static void cpu_clock_event_read(struct perf_event *event)
5747{
5748 cpu_clock_event_update(event);
5749}
5750
5751static int cpu_clock_event_init(struct perf_event *event)
5752{
5753 if (event->attr.type != PERF_TYPE_SOFTWARE)
5754 return -ENOENT;
5755
5756 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5757 return -ENOENT;
5758
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005759 perf_swevent_init_hrtimer(event);
5760
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005761 return 0;
5762}
5763
5764static struct pmu perf_cpu_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005765 .task_ctx_nr = perf_sw_context,
5766
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005767 .event_init = cpu_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005768 .add = cpu_clock_event_add,
5769 .del = cpu_clock_event_del,
5770 .start = cpu_clock_event_start,
5771 .stop = cpu_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005772 .read = cpu_clock_event_read,
5773};
5774
5775/*
5776 * Software event: task time clock
5777 */
5778
5779static void task_clock_event_update(struct perf_event *event, u64 now)
5780{
5781 u64 prev;
5782 s64 delta;
5783
5784 prev = local64_xchg(&event->hw.prev_count, now);
5785 delta = now - prev;
5786 local64_add(delta, &event->count);
5787}
5788
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005789static void task_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005790{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005791 local64_set(&event->hw.prev_count, event->ctx->time);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005792 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005793}
5794
5795static void task_clock_event_stop(struct perf_event *event, int flags)
5796{
5797 perf_swevent_cancel_hrtimer(event);
5798 task_clock_event_update(event, event->ctx->time);
5799}
5800
5801static int task_clock_event_add(struct perf_event *event, int flags)
5802{
5803 if (flags & PERF_EF_START)
5804 task_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005805
5806 return 0;
5807}
5808
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005809static void task_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005810{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005811 task_clock_event_stop(event, PERF_EF_UPDATE);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005812}
5813
5814static void task_clock_event_read(struct perf_event *event)
5815{
Peter Zijlstra768a06e2011-02-22 16:52:24 +01005816 u64 now = perf_clock();
5817 u64 delta = now - event->ctx->timestamp;
5818 u64 time = event->ctx->time + delta;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005819
5820 task_clock_event_update(event, time);
5821}
5822
5823static int task_clock_event_init(struct perf_event *event)
5824{
5825 if (event->attr.type != PERF_TYPE_SOFTWARE)
5826 return -ENOENT;
5827
5828 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5829 return -ENOENT;
5830
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005831 perf_swevent_init_hrtimer(event);
5832
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005833 return 0;
5834}
5835
5836static struct pmu perf_task_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005837 .task_ctx_nr = perf_sw_context,
5838
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005839 .event_init = task_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005840 .add = task_clock_event_add,
5841 .del = task_clock_event_del,
5842 .start = task_clock_event_start,
5843 .stop = task_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005844 .read = task_clock_event_read,
5845};
5846
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005847static void perf_pmu_nop_void(struct pmu *pmu)
5848{
5849}
5850
5851static int perf_pmu_nop_int(struct pmu *pmu)
5852{
5853 return 0;
5854}
5855
5856static void perf_pmu_start_txn(struct pmu *pmu)
5857{
5858 perf_pmu_disable(pmu);
5859}
5860
5861static int perf_pmu_commit_txn(struct pmu *pmu)
5862{
5863 perf_pmu_enable(pmu);
5864 return 0;
5865}
5866
5867static void perf_pmu_cancel_txn(struct pmu *pmu)
5868{
5869 perf_pmu_enable(pmu);
5870}
5871
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005872/*
5873 * Ensures all contexts with the same task_ctx_nr have the same
5874 * pmu_cpu_context too.
5875 */
5876static void *find_pmu_context(int ctxn)
5877{
5878 struct pmu *pmu;
5879
5880 if (ctxn < 0)
5881 return NULL;
5882
5883 list_for_each_entry(pmu, &pmus, entry) {
5884 if (pmu->task_ctx_nr == ctxn)
5885 return pmu->pmu_cpu_context;
5886 }
5887
5888 return NULL;
5889}
5890
Peter Zijlstra51676952010-12-07 14:18:20 +01005891static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005892{
Peter Zijlstra51676952010-12-07 14:18:20 +01005893 int cpu;
5894
5895 for_each_possible_cpu(cpu) {
5896 struct perf_cpu_context *cpuctx;
5897
5898 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5899
5900 if (cpuctx->active_pmu == old_pmu)
5901 cpuctx->active_pmu = pmu;
5902 }
5903}
5904
5905static void free_pmu_context(struct pmu *pmu)
5906{
5907 struct pmu *i;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005908
5909 mutex_lock(&pmus_lock);
5910 /*
5911 * A poor man's refcount: only free the per-cpu context once no other pmu shares it.
5912 */
Peter Zijlstra51676952010-12-07 14:18:20 +01005913 list_for_each_entry(i, &pmus, entry) {
5914 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5915 update_pmu_context(i, pmu);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005916 goto out;
Peter Zijlstra51676952010-12-07 14:18:20 +01005917 }
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005918 }
5919
Peter Zijlstra51676952010-12-07 14:18:20 +01005920 free_percpu(pmu->pmu_cpu_context);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005921out:
5922 mutex_unlock(&pmus_lock);
5923}
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005924static struct idr pmu_idr;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005925
Peter Zijlstraabe43402010-11-17 23:17:37 +01005926static ssize_t
5927type_show(struct device *dev, struct device_attribute *attr, char *page)
5928{
5929 struct pmu *pmu = dev_get_drvdata(dev);
5930
5931 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5932}
5933
5934static struct device_attribute pmu_dev_attrs[] = {
5935 __ATTR_RO(type),
5936 __ATTR_NULL,
5937};
5938
5939static int pmu_bus_running;
5940static struct bus_type pmu_bus = {
5941 .name = "event_source",
5942 .dev_attrs = pmu_dev_attrs,
5943};
5944
5945static void pmu_dev_release(struct device *dev)
5946{
5947 kfree(dev);
5948}
5949
5950static int pmu_dev_alloc(struct pmu *pmu)
5951{
5952 int ret = -ENOMEM;
5953
5954 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5955 if (!pmu->dev)
5956 goto out;
5957
5958 device_initialize(pmu->dev);
5959 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5960 if (ret)
5961 goto free_dev;
5962
5963 dev_set_drvdata(pmu->dev, pmu);
5964 pmu->dev->bus = &pmu_bus;
5965 pmu->dev->release = pmu_dev_release;
5966 ret = device_add(pmu->dev);
5967 if (ret)
5968 goto free_dev;
5969
5970out:
5971 return ret;
5972
5973free_dev:
5974 put_device(pmu->dev);
5975 goto out;
5976}
5977
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01005978static struct lock_class_key cpuctx_mutex;
5979
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005980int perf_pmu_register(struct pmu *pmu, char *name, int type)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005981{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005982 int cpu, ret;
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005983
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005984 mutex_lock(&pmus_lock);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005985 ret = -ENOMEM;
5986 pmu->pmu_disable_count = alloc_percpu(int);
5987 if (!pmu->pmu_disable_count)
5988 goto unlock;
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005989
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005990 pmu->type = -1;
5991 if (!name)
5992 goto skip_type;
5993 pmu->name = name;
5994
5995 if (type < 0) {
5996 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5997 if (!err)
5998 goto free_pdc;
5999
6000 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
6001 if (err) {
6002 ret = err;
6003 goto free_pdc;
6004 }
6005 }
6006 pmu->type = type;
6007
Peter Zijlstraabe43402010-11-17 23:17:37 +01006008 if (pmu_bus_running) {
6009 ret = pmu_dev_alloc(pmu);
6010 if (ret)
6011 goto free_idr;
6012 }
6013
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006014skip_type:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006015 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6016 if (pmu->pmu_cpu_context)
6017 goto got_cpu_context;
6018
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006019 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6020 if (!pmu->pmu_cpu_context)
Peter Zijlstraabe43402010-11-17 23:17:37 +01006021 goto free_dev;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006022
6023 for_each_possible_cpu(cpu) {
6024 struct perf_cpu_context *cpuctx;
6025
6026 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Peter Zijlstraeb184472010-09-07 15:55:13 +02006027 __perf_event_init_context(&cpuctx->ctx);
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01006028 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006029 cpuctx->ctx.type = cpu_context;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006030 cpuctx->ctx.pmu = pmu;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02006031 cpuctx->jiffies_interval = 1;
6032 INIT_LIST_HEAD(&cpuctx->rotation_list);
Peter Zijlstra51676952010-12-07 14:18:20 +01006033 cpuctx->active_pmu = pmu;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006034 }
6035
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006036got_cpu_context:
Peter Zijlstraad5133b2010-06-15 12:22:39 +02006037 if (!pmu->start_txn) {
6038 if (pmu->pmu_enable) {
6039 /*
6040 * If we have pmu_enable/pmu_disable calls, install
6041 * transaction stubs that use them to try and batch
6042 * hardware accesses.
6043 */
6044 pmu->start_txn = perf_pmu_start_txn;
6045 pmu->commit_txn = perf_pmu_commit_txn;
6046 pmu->cancel_txn = perf_pmu_cancel_txn;
6047 } else {
6048 pmu->start_txn = perf_pmu_nop_void;
6049 pmu->commit_txn = perf_pmu_nop_int;
6050 pmu->cancel_txn = perf_pmu_nop_void;
6051 }
6052 }
6053
6054 if (!pmu->pmu_enable) {
6055 pmu->pmu_enable = perf_pmu_nop_void;
6056 pmu->pmu_disable = perf_pmu_nop_void;
6057 }
6058
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006059 list_add_rcu(&pmu->entry, &pmus);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006060 ret = 0;
6061unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006062 mutex_unlock(&pmus_lock);
6063
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006064 return ret;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006065
Peter Zijlstraabe43402010-11-17 23:17:37 +01006066free_dev:
6067 device_del(pmu->dev);
6068 put_device(pmu->dev);
6069
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006070free_idr:
6071 if (pmu->type >= PERF_TYPE_MAX)
6072 idr_remove(&pmu_idr, pmu->type);
6073
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006074free_pdc:
6075 free_percpu(pmu->pmu_disable_count);
6076 goto unlock;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006077}
6078
6079void perf_pmu_unregister(struct pmu *pmu)
6080{
6081 mutex_lock(&pmus_lock);
6082 list_del_rcu(&pmu->entry);
6083 mutex_unlock(&pmus_lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006084
6085 /*
Peter Zijlstracde8e882010-09-13 11:06:55 +02006086 * We dereference the pmu list under both SRCU and regular RCU, so
6087 * synchronize against both of those.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006088 */
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006089 synchronize_srcu(&pmus_srcu);
Peter Zijlstracde8e882010-09-13 11:06:55 +02006090 synchronize_rcu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006091
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006092 free_percpu(pmu->pmu_disable_count);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006093 if (pmu->type >= PERF_TYPE_MAX)
6094 idr_remove(&pmu_idr, pmu->type);
Peter Zijlstraabe43402010-11-17 23:17:37 +01006095 device_del(pmu->dev);
6096 put_device(pmu->dev);
Peter Zijlstra51676952010-12-07 14:18:20 +01006097 free_pmu_context(pmu);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006098}
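/*
 * Editorial sketch (not part of the original source): how a minimal
 * software-style PMU could be wired up against perf_pmu_register() and
 * perf_pmu_unregister() above, modelled on the perf_cpu_clock PMU. All
 * my_pmu_* names are hypothetical; passing type == -1 asks
 * perf_pmu_register() to allocate a dynamic type id from pmu_idr.
 *
 *	static struct pmu my_pmu;
 *
 *	static int my_pmu_event_init(struct perf_event *event)
 *	{
 *		if (event->attr.type != my_pmu.type)
 *			return -ENOENT;
 *		return 0;
 *	}
 *
 *	static int my_pmu_add(struct perf_event *event, int flags)	{ return 0; }
 *	static void my_pmu_del(struct perf_event *event, int flags)	{ }
 *	static void my_pmu_start(struct perf_event *event, int flags)	{ }
 *	static void my_pmu_stop(struct perf_event *event, int flags)	{ }
 *	static void my_pmu_read(struct perf_event *event)		{ }
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.event_init	= my_pmu_event_init,
 *		.add		= my_pmu_add,
 *		.del		= my_pmu_del,
 *		.start		= my_pmu_start,
 *		.stop		= my_pmu_stop,
 *		.read		= my_pmu_read,
 *	};
 *
 *	static int __init my_pmu_init(void)
 *	{
 *		return perf_pmu_register(&my_pmu, "my_pmu", -1);
 *	}
 *
 * and later, on teardown: perf_pmu_unregister(&my_pmu);
 */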
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006099
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006100struct pmu *perf_init_event(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006101{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02006102 struct pmu *pmu = NULL;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006103 int idx;
Lin Ming940c5b22011-02-27 21:13:31 +08006104 int ret;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006105
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006106 idx = srcu_read_lock(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006107
6108 rcu_read_lock();
6109 pmu = idr_find(&pmu_idr, event->attr.type);
6110 rcu_read_unlock();
Lin Ming940c5b22011-02-27 21:13:31 +08006111 if (pmu) {
6112 ret = pmu->event_init(event);
6113 if (ret)
6114 pmu = ERR_PTR(ret);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006115 goto unlock;
Lin Ming940c5b22011-02-27 21:13:31 +08006116 }
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006117
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006118 list_for_each_entry_rcu(pmu, &pmus, entry) {
Lin Ming940c5b22011-02-27 21:13:31 +08006119 ret = pmu->event_init(event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006120 if (!ret)
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006121 goto unlock;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006122
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006123 if (ret != -ENOENT) {
6124 pmu = ERR_PTR(ret);
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006125 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006126 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006127 }
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006128 pmu = ERR_PTR(-ENOENT);
6129unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006130 srcu_read_unlock(&pmus_srcu, idx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006131
6132 return pmu;
6133}
6134
6135/*
6136 * Allocate and initialize an event structure
6137 */
6138static struct perf_event *
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006139perf_event_alloc(struct perf_event_attr *attr, int cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006140 struct task_struct *task,
6141 struct perf_event *group_leader,
6142 struct perf_event *parent_event,
6143 perf_overflow_handler_t overflow_handler)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006144{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02006145 struct pmu *pmu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006146 struct perf_event *event;
6147 struct hw_perf_event *hwc;
6148 long err;
6149
Oleg Nesterov66832eb2011-01-18 17:10:32 +01006150 if ((unsigned)cpu >= nr_cpu_ids) {
6151 if (!task || cpu != -1)
6152 return ERR_PTR(-EINVAL);
6153 }
6154
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006155 event = kzalloc(sizeof(*event), GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006156 if (!event)
6157 return ERR_PTR(-ENOMEM);
6158
6159 /*
6160 * Single events are their own group leaders, with an
6161 * empty sibling list:
6162 */
6163 if (!group_leader)
6164 group_leader = event;
6165
6166 mutex_init(&event->child_mutex);
6167 INIT_LIST_HEAD(&event->child_list);
6168
6169 INIT_LIST_HEAD(&event->group_entry);
6170 INIT_LIST_HEAD(&event->event_entry);
6171 INIT_LIST_HEAD(&event->sibling_list);
6172 init_waitqueue_head(&event->waitq);
Peter Zijlstrae360adb2010-10-14 14:01:34 +08006173 init_irq_work(&event->pending, perf_pending_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006174
6175 mutex_init(&event->mmap_mutex);
6176
6177 event->cpu = cpu;
6178 event->attr = *attr;
6179 event->group_leader = group_leader;
6180 event->pmu = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006181 event->oncpu = -1;
6182
6183 event->parent = parent_event;
6184
6185 event->ns = get_pid_ns(current->nsproxy->pid_ns);
6186 event->id = atomic64_inc_return(&perf_event_id);
6187
6188 event->state = PERF_EVENT_STATE_INACTIVE;
6189
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006190 if (task) {
6191 event->attach_state = PERF_ATTACH_TASK;
6192#ifdef CONFIG_HAVE_HW_BREAKPOINT
6193 /*
6194 * hw_breakpoint is a bit difficult here..
6195 */
6196 if (attr->type == PERF_TYPE_BREAKPOINT)
6197 event->hw.bp_target = task;
6198#endif
6199 }
6200
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006201 if (!overflow_handler && parent_event)
6202 overflow_handler = parent_event->overflow_handler;
Oleg Nesterov66832eb2011-01-18 17:10:32 +01006203
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006204 event->overflow_handler = overflow_handler;
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02006205
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006206 if (attr->disabled)
6207 event->state = PERF_EVENT_STATE_OFF;
6208
6209 pmu = NULL;
6210
6211 hwc = &event->hw;
6212 hwc->sample_period = attr->sample_period;
6213 if (attr->freq && attr->sample_freq)
6214 hwc->sample_period = 1;
6215 hwc->last_period = hwc->sample_period;
6216
Peter Zijlstrae7850592010-05-21 14:43:08 +02006217 local64_set(&hwc->period_left, hwc->sample_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006218
6219 /*
6220 * we currently do not support PERF_FORMAT_GROUP on inherited events
6221 */
6222 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6223 goto done;
6224
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006225 pmu = perf_init_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006226
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006227done:
6228 err = 0;
6229 if (!pmu)
6230 err = -EINVAL;
6231 else if (IS_ERR(pmu))
6232 err = PTR_ERR(pmu);
6233
6234 if (err) {
6235 if (event->ns)
6236 put_pid_ns(event->ns);
6237 kfree(event);
6238 return ERR_PTR(err);
6239 }
6240
6241 event->pmu = pmu;
6242
6243 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02006244 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02006245 jump_label_inc(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01006246 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006247 atomic_inc(&nr_mmap_events);
6248 if (event->attr.comm)
6249 atomic_inc(&nr_comm_events);
6250 if (event->attr.task)
6251 atomic_inc(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02006252 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6253 err = get_callchain_buffers();
6254 if (err) {
6255 free_event(event);
6256 return ERR_PTR(err);
6257 }
6258 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006259 }
6260
6261 return event;
6262}
6263
6264static int perf_copy_attr(struct perf_event_attr __user *uattr,
6265 struct perf_event_attr *attr)
6266{
6267 u32 size;
6268 int ret;
6269
6270 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6271 return -EFAULT;
6272
6273 /*
6274 * zero the full structure, so that a short copy leaves the remaining fields zeroed.
6275 */
6276 memset(attr, 0, sizeof(*attr));
6277
6278 ret = get_user(size, &uattr->size);
6279 if (ret)
6280 return ret;
6281
6282 if (size > PAGE_SIZE) /* silly large */
6283 goto err_size;
6284
6285 if (!size) /* abi compat */
6286 size = PERF_ATTR_SIZE_VER0;
6287
6288 if (size < PERF_ATTR_SIZE_VER0)
6289 goto err_size;
6290
6291 /*
6292 * If we're handed a bigger struct than we know of,
6293 * ensure all the unknown bits are 0 - i.e. new
6294 * user-space does not rely on any kernel feature
6295 * extensions we dont know about yet.
6296 */
6297 if (size > sizeof(*attr)) {
6298 unsigned char __user *addr;
6299 unsigned char __user *end;
6300 unsigned char val;
6301
6302 addr = (void __user *)uattr + sizeof(*attr);
6303 end = (void __user *)uattr + size;
6304
6305 for (; addr < end; addr++) {
6306 ret = get_user(val, addr);
6307 if (ret)
6308 return ret;
6309 if (val)
6310 goto err_size;
6311 }
6312 size = sizeof(*attr);
6313 }
6314
6315 ret = copy_from_user(attr, uattr, size);
6316 if (ret)
6317 return -EFAULT;
6318
6319 /*
6320 * If the type exists, the corresponding creation will verify
6321 * the attr->config.
6322 */
6323 if (attr->type >= PERF_TYPE_MAX)
6324 return -EINVAL;
6325
Mahesh Salgaonkarcd757642010-01-30 10:25:18 +05306326 if (attr->__reserved_1)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006327 return -EINVAL;
6328
6329 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6330 return -EINVAL;
6331
6332 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6333 return -EINVAL;
6334
6335out:
6336 return ret;
6337
6338err_size:
6339 put_user(sizeof(*attr), &uattr->size);
6340 ret = -E2BIG;
6341 goto out;
6342}
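/*
 * Editorial illustration (not part of the original source) of the size
 * handshake above: if a newer userspace passes size = 128 while this
 * kernel's sizeof(*attr) is, say, 96, the bytes at offsets 96..127 are
 * probed and the call fails with -E2BIG unless they are all zero; on
 * success size is clamped to 96 and only the known fields are copied.
 * An older userspace passing size = 0 is treated as PERF_ATTR_SIZE_VER0.
 */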
6343
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006344static int
6345perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006346{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006347 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006348 int ret = -EINVAL;
6349
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006350 if (!output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006351 goto set;
6352
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006353 /* don't allow circular references */
6354 if (event == output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006355 goto out;
6356
Peter Zijlstra0f139302010-05-20 14:35:15 +02006357 /*
6358 * Don't allow cross-cpu buffers
6359 */
6360 if (output_event->cpu != event->cpu)
6361 goto out;
6362
6363 /*
6364 * If it's not a per-cpu buffer, it must be the same task.
6365 */
6366 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6367 goto out;
6368
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006369set:
6370 mutex_lock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006371 /* Can't redirect output if we've got an active mmap() */
6372 if (atomic_read(&event->mmap_count))
6373 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006374
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006375 if (output_event) {
6376 /* get the buffer we want to redirect to */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006377 buffer = perf_buffer_get(output_event);
6378 if (!buffer)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006379 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006380 }
6381
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006382 old_buffer = event->buffer;
6383 rcu_assign_pointer(event->buffer, buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006384 ret = 0;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006385unlock:
6386 mutex_unlock(&event->mmap_mutex);
6387
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006388 if (old_buffer)
6389 perf_buffer_put(old_buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006390out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006391 return ret;
6392}
6393
6394/**
6395 * sys_perf_event_open - open a performance event, associate it to a task/cpu
6396 *
6397 * @attr_uptr: event_id type attributes for monitoring/sampling
6398 * @pid: target pid
6399 * @cpu: target cpu
6400 * @group_fd: group leader event fd
6401 */
6402SYSCALL_DEFINE5(perf_event_open,
6403 struct perf_event_attr __user *, attr_uptr,
6404 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6405{
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006406 struct perf_event *group_leader = NULL, *output_event = NULL;
6407 struct perf_event *event, *sibling;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006408 struct perf_event_attr attr;
6409 struct perf_event_context *ctx;
6410 struct file *event_file = NULL;
6411 struct file *group_file = NULL;
Matt Helsley38a81da2010-09-13 13:01:20 -07006412 struct task_struct *task = NULL;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006413 struct pmu *pmu;
Al Viroea635c62010-05-26 17:40:29 -04006414 int event_fd;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006415 int move_group = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006416 int fput_needed = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006417 int err;
6418
6419 /* for future expandability... */
Stephane Eraniane5d13672011-02-14 11:20:01 +02006420 if (flags & ~PERF_FLAG_ALL)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006421 return -EINVAL;
6422
6423 err = perf_copy_attr(attr_uptr, &attr);
6424 if (err)
6425 return err;
6426
6427 if (!attr.exclude_kernel) {
6428 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6429 return -EACCES;
6430 }
6431
6432 if (attr.freq) {
6433 if (attr.sample_freq > sysctl_perf_event_sample_rate)
6434 return -EINVAL;
6435 }
6436
Stephane Eraniane5d13672011-02-14 11:20:01 +02006437 /*
6438 * In cgroup mode, the pid argument is used to pass the fd
6439 * opened to the cgroup directory in cgroupfs. The cpu argument
6440 * designates the cpu on which to monitor threads from that
6441 * cgroup.
6442 */
6443 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6444 return -EINVAL;
6445
Al Viroea635c62010-05-26 17:40:29 -04006446 event_fd = get_unused_fd_flags(O_RDWR);
6447 if (event_fd < 0)
6448 return event_fd;
6449
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006450 if (group_fd != -1) {
6451 group_leader = perf_fget_light(group_fd, &fput_needed);
6452 if (IS_ERR(group_leader)) {
6453 err = PTR_ERR(group_leader);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006454 goto err_fd;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006455 }
6456 group_file = group_leader->filp;
6457 if (flags & PERF_FLAG_FD_OUTPUT)
6458 output_event = group_leader;
6459 if (flags & PERF_FLAG_FD_NO_GROUP)
6460 group_leader = NULL;
6461 }
6462
Stephane Eraniane5d13672011-02-14 11:20:01 +02006463 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006464 task = find_lively_task_by_vpid(pid);
6465 if (IS_ERR(task)) {
6466 err = PTR_ERR(task);
6467 goto err_group_fd;
6468 }
6469 }
6470
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006471 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006472 if (IS_ERR(event)) {
6473 err = PTR_ERR(event);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006474 goto err_task;
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006475 }
6476
Stephane Eraniane5d13672011-02-14 11:20:01 +02006477 if (flags & PERF_FLAG_PID_CGROUP) {
6478 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6479 if (err)
6480 goto err_alloc;
6481 }
6482
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006483 /*
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006484 * Special case software events and allow them to be part of
6485 * any hardware group.
6486 */
6487 pmu = event->pmu;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006488
6489 if (group_leader &&
6490 (is_software_event(event) != is_software_event(group_leader))) {
6491 if (is_software_event(event)) {
6492 /*
6493 * If event and group_leader are not both a software
6494 * event, and event is, then group leader is not.
6495 *
6496 * Allow the addition of software events to !software
6497 * groups, this is safe because software events never
6498 * fail to schedule.
6499 */
6500 pmu = group_leader->pmu;
6501 } else if (is_software_event(group_leader) &&
6502 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6503 /*
6504 * In case the group is a pure software group, and we
6505 * try to add a hardware event, move the whole group to
6506 * the hardware context.
6507 */
6508 move_group = 1;
6509 }
6510 }
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006511
6512 /*
6513 * Get the target context (task or percpu):
6514 */
Matt Helsley38a81da2010-09-13 13:01:20 -07006515 ctx = find_get_context(pmu, task, cpu);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006516 if (IS_ERR(ctx)) {
6517 err = PTR_ERR(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006518 goto err_alloc;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006519 }
6520
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006521 /*
6522 * Look up the group leader (we will attach this event to it):
6523 */
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006524 if (group_leader) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006525 err = -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006526
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006527 /*
6528 * Do not allow a recursive hierarchy (this new sibling
6529 * becoming part of another group-sibling):
6530 */
6531 if (group_leader->group_leader != group_leader)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006532 goto err_context;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006533 /*
6534 * Do not allow attaching to a group in a different
6535 * task or CPU context:
6536 */
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006537 if (move_group) {
6538 if (group_leader->ctx->type != ctx->type)
6539 goto err_context;
6540 } else {
6541 if (group_leader->ctx != ctx)
6542 goto err_context;
6543 }
6544
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006545 /*
6546 * Only a group leader can be exclusive or pinned
6547 */
6548 if (attr.exclusive || attr.pinned)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006549 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006550 }
6551
6552 if (output_event) {
6553 err = perf_event_set_output(event, output_event);
6554 if (err)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006555 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006556 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006557
Al Viroea635c62010-05-26 17:40:29 -04006558 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6559 if (IS_ERR(event_file)) {
6560 err = PTR_ERR(event_file);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006561 goto err_context;
Al Viroea635c62010-05-26 17:40:29 -04006562 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006563
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006564 if (move_group) {
6565 struct perf_event_context *gctx = group_leader->ctx;
6566
6567 mutex_lock(&gctx->mutex);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006568 perf_remove_from_context(group_leader);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006569 list_for_each_entry(sibling, &group_leader->sibling_list,
6570 group_entry) {
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006571 perf_remove_from_context(sibling);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006572 put_ctx(gctx);
6573 }
6574 mutex_unlock(&gctx->mutex);
6575 put_ctx(gctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006576 }
6577
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006578 event->filp = event_file;
6579 WARN_ON_ONCE(ctx->parent_ctx);
6580 mutex_lock(&ctx->mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006581
6582 if (move_group) {
6583 perf_install_in_context(ctx, group_leader, cpu);
6584 get_ctx(ctx);
6585 list_for_each_entry(sibling, &group_leader->sibling_list,
6586 group_entry) {
6587 perf_install_in_context(ctx, sibling, cpu);
6588 get_ctx(ctx);
6589 }
6590 }
6591
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006592 perf_install_in_context(ctx, event, cpu);
6593 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006594 perf_unpin_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006595 mutex_unlock(&ctx->mutex);
6596
6597 event->owner = current;
Peter Zijlstra8882135b2010-11-09 19:01:43 +01006598
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006599 mutex_lock(&current->perf_event_mutex);
6600 list_add_tail(&event->owner_entry, &current->perf_event_list);
6601 mutex_unlock(&current->perf_event_mutex);
6602
Peter Zijlstra8a495422010-05-27 15:47:49 +02006603 /*
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006604 * Precalculate sample_data sizes
6605 */
6606 perf_event__header_size(event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006607 perf_event__id_header_size(event);
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006608
6609 /*
Peter Zijlstra8a495422010-05-27 15:47:49 +02006610 * Drop the reference on the group leader's file after placing the
6611 * new event on the sibling_list. This ensures destruction
6612 * of the group leader will find the pointer to itself in
6613 * perf_group_detach().
6614 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006615 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006616 fd_install(event_fd, event_file);
6617 return event_fd;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006618
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006619err_context:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006620 perf_unpin_context(ctx);
Al Viroea635c62010-05-26 17:40:29 -04006621 put_ctx(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006622err_alloc:
6623 free_event(event);
Peter Zijlstrae7d0bc02010-10-14 16:54:51 +02006624err_task:
6625 if (task)
6626 put_task_struct(task);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006627err_group_fd:
6628 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006629err_fd:
6630 put_unused_fd(event_fd);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006631 return err;
6632}
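/*
 * Usage sketch (editorial illustration, not part of the original
 * source): user space typically reaches this syscall through syscall(2),
 * as glibc provides no wrapper. The snippet counts the task clock of the
 * calling thread; error handling is omitted for brevity.
 *
 *	struct perf_event_attr attr;
 *	long long count;
 *	int fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_SOFTWARE;
 *	attr.config = PERF_COUNT_SW_TASK_CLOCK;
 *	attr.disabled = 1;
 *
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	... measured workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *	close(fd);
 */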
6633
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006634/**
6635 * perf_event_create_kernel_counter
6636 *
6637 * @attr: attributes of the counter to create
6638 * @cpu: cpu to which the counter is bound
Matt Helsley38a81da2010-09-13 13:01:20 -07006639 * @task: task to profile (NULL for percpu)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006640 */
6641struct perf_event *
6642perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Matt Helsley38a81da2010-09-13 13:01:20 -07006643 struct task_struct *task,
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006644 perf_overflow_handler_t overflow_handler)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006645{
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006646 struct perf_event_context *ctx;
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006647 struct perf_event *event;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006648 int err;
6649
6650 /*
6651 * Get the target context (task or percpu):
6652 */
6653
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006654 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006655 if (IS_ERR(event)) {
6656 err = PTR_ERR(event);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006657 goto err;
6658 }
6659
Matt Helsley38a81da2010-09-13 13:01:20 -07006660 ctx = find_get_context(event->pmu, task, cpu);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006661 if (IS_ERR(ctx)) {
6662 err = PTR_ERR(ctx);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006663 goto err_free;
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006664 }
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006665
6666 event->filp = NULL;
6667 WARN_ON_ONCE(ctx->parent_ctx);
6668 mutex_lock(&ctx->mutex);
6669 perf_install_in_context(ctx, event, cpu);
6670 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006671 perf_unpin_context(ctx);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006672 mutex_unlock(&ctx->mutex);
6673
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006674 return event;
6675
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006676err_free:
6677 free_event(event);
6678err:
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006679 return ERR_PTR(err);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006680}
6681EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6682
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006683static void sync_child_event(struct perf_event *child_event,
6684 struct task_struct *child)
6685{
6686 struct perf_event *parent_event = child_event->parent;
6687 u64 child_val;
6688
6689 if (child_event->attr.inherit_stat)
6690 perf_event_read_event(child_event, child);
6691
Peter Zijlstrab5e58792010-05-21 14:43:12 +02006692 child_val = perf_event_count(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006693
6694 /*
6695 * Add back the child's count to the parent's count:
6696 */
Peter Zijlstraa6e6dea2010-05-21 14:27:58 +02006697 atomic64_add(child_val, &parent_event->child_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006698 atomic64_add(child_event->total_time_enabled,
6699 &parent_event->child_total_time_enabled);
6700 atomic64_add(child_event->total_time_running,
6701 &parent_event->child_total_time_running);
6702
6703 /*
6704 * Remove this event from the parent's list
6705 */
6706 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6707 mutex_lock(&parent_event->child_mutex);
6708 list_del_init(&child_event->child_list);
6709 mutex_unlock(&parent_event->child_mutex);
6710
6711 /*
6712 * Release the parent event, if this was the last
6713 * reference to it.
6714 */
6715 fput(parent_event->filp);
6716}
6717
6718static void
6719__perf_event_exit_task(struct perf_event *child_event,
6720 struct perf_event_context *child_ctx,
6721 struct task_struct *child)
6722{
6723 struct perf_event *parent_event;
6724
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006725 perf_remove_from_context(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006726
6727 parent_event = child_event->parent;
6728 /*
6729 * It can happen that the parent exits first, and has events
6730 * that are still around due to the child reference. These
6731 * events need to be zapped - but otherwise linger.
6732 */
6733 if (parent_event) {
6734 sync_child_event(child_event, child);
6735 free_event(child_event);
6736 }
6737}
6738
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006739static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006740{
6741 struct perf_event *child_event, *tmp;
6742 struct perf_event_context *child_ctx;
6743 unsigned long flags;
6744
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006745 if (likely(!child->perf_event_ctxp[ctxn])) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006746 perf_event_task(child, NULL, 0);
6747 return;
6748 }
6749
6750 local_irq_save(flags);
6751 /*
6752 * We can't reschedule here because interrupts are disabled,
6753 * and either child is current or it is a task that can't be
6754 * scheduled, so we are now safe from rescheduling changing
6755 * our context.
6756 */
Oleg Nesterov806839b2011-01-21 18:45:47 +01006757 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02006758 task_ctx_sched_out(child_ctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006759
6760 /*
6761 * Take the context lock here so that if find_get_context is
6762 * reading child->perf_event_ctxp, we wait until it has
6763 * incremented the context's refcount before we do put_ctx below.
6764 */
Thomas Gleixnere625cce2009-11-17 18:02:06 +01006765 raw_spin_lock(&child_ctx->lock);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006766 child->perf_event_ctxp[ctxn] = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006767 /*
6768 * If this context is a clone; unclone it so it can't get
6769 * swapped to another process while we're removing all
6770 * the events from it.
6771 */
6772 unclone_ctx(child_ctx);
Peter Zijlstra5e942bb2009-11-23 11:37:26 +01006773 update_context_time(child_ctx);
Thomas Gleixnere625cce2009-11-17 18:02:06 +01006774 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006775
6776 /*
6777 * Report the task dead after unscheduling the events so that we
6778 * won't get any samples after PERF_RECORD_EXIT. We can however still
6779 * get a few PERF_RECORD_READ events.
6780 */
6781 perf_event_task(child, child_ctx, 0);
6782
6783 /*
6784 * We can recurse on the same lock type through:
6785 *
6786 * __perf_event_exit_task()
6787 * sync_child_event()
6788 * fput(parent_event->filp)
6789 * perf_release()
6790 * mutex_lock(&ctx->mutex)
6791 *
6792 * But since it's the parent context it won't be the same instance.
6793 */
Peter Zijlstraa0507c82010-05-06 15:42:53 +02006794 mutex_lock(&child_ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006795
6796again:
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006797 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6798 group_entry)
6799 __perf_event_exit_task(child_event, child_ctx, child);
6800
6801 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006802 group_entry)
6803 __perf_event_exit_task(child_event, child_ctx, child);
6804
6805 /*
6806 * If the last event was a group event, it will have appended all
6807 * its siblings to the list, but we obtained 'tmp' before that, so it
6808 * will still point to the list head terminating the iteration.
6809 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006810 if (!list_empty(&child_ctx->pinned_groups) ||
6811 !list_empty(&child_ctx->flexible_groups))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006812 goto again;
6813
6814 mutex_unlock(&child_ctx->mutex);
6815
6816 put_ctx(child_ctx);
6817}
6818
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006819/*
6820 * When a child task exits, feed back event values to parent events.
6821 */
6822void perf_event_exit_task(struct task_struct *child)
6823{
Peter Zijlstra8882135b2010-11-09 19:01:43 +01006824 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006825 int ctxn;
6826
Peter Zijlstra8882135b2010-11-09 19:01:43 +01006827 mutex_lock(&child->perf_event_mutex);
6828 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6829 owner_entry) {
6830 list_del_init(&event->owner_entry);
6831
6832 /*
6833 * Ensure the list deletion is visible before we clear
6834 * the owner; this closes a race against perf_release() where
6835 * we need to serialize on the owner->perf_event_mutex.
6836 */
6837 smp_wmb();
6838 event->owner = NULL;
6839 }
6840 mutex_unlock(&child->perf_event_mutex);
6841
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006842 for_each_task_context_nr(ctxn)
6843 perf_event_exit_task_context(child, ctxn);
6844}
6845
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006846static void perf_free_event(struct perf_event *event,
6847 struct perf_event_context *ctx)
6848{
6849 struct perf_event *parent = event->parent;
6850
6851 if (WARN_ON_ONCE(!parent))
6852 return;
6853
6854 mutex_lock(&parent->child_mutex);
6855 list_del_init(&event->child_list);
6856 mutex_unlock(&parent->child_mutex);
6857
6858 fput(parent->filp);
6859
Peter Zijlstra8a495422010-05-27 15:47:49 +02006860 perf_group_detach(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006861 list_del_event(event, ctx);
6862 free_event(event);
6863}
6864
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006865/*
6866 * free an unexposed, unused context as created by inheritance by
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006867 * perf_event_init_task below, used by fork() in case of failure.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006868 */
6869void perf_event_free_task(struct task_struct *task)
6870{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006871 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006872 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006873 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006874
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006875 for_each_task_context_nr(ctxn) {
6876 ctx = task->perf_event_ctxp[ctxn];
6877 if (!ctx)
6878 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006879
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006880 mutex_lock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006881again:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006882 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6883 group_entry)
6884 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006885
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006886 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6887 group_entry)
6888 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006889
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006890 if (!list_empty(&ctx->pinned_groups) ||
6891 !list_empty(&ctx->flexible_groups))
6892 goto again;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006893
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006894 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006895
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006896 put_ctx(ctx);
6897 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006898}
6899
Peter Zijlstra4e231c72010-09-09 21:01:59 +02006900void perf_event_delayed_put(struct task_struct *task)
6901{
6902 int ctxn;
6903
6904 for_each_task_context_nr(ctxn)
6905 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6906}
6907
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006908/*
6909 * inherit an event from parent task to child task:
6910 */
6911static struct perf_event *
6912inherit_event(struct perf_event *parent_event,
6913 struct task_struct *parent,
6914 struct perf_event_context *parent_ctx,
6915 struct task_struct *child,
6916 struct perf_event *group_leader,
6917 struct perf_event_context *child_ctx)
6918{
6919 struct perf_event *child_event;
Peter Zijlstracee010e2010-09-10 12:51:54 +02006920 unsigned long flags;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006921
6922 /*
6923 * Instead of creating recursive hierarchies of events,
6924 * we link inherited events back to the original parent,
6925 * which is certain to have a filp that we use as the reference
6926 * count:
6927 */
6928 if (parent_event->parent)
6929 parent_event = parent_event->parent;
6930
6931 child_event = perf_event_alloc(&parent_event->attr,
6932 parent_event->cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006933 child,
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006934 group_leader, parent_event,
6935 NULL);
6936 if (IS_ERR(child_event))
6937 return child_event;
6938 get_ctx(child_ctx);
6939
6940 /*
6941 * Make the child state follow the state of the parent event,
6942 * not its attr.disabled bit. We hold the parent's mutex,
6943 * so we won't race with perf_event_{en, dis}able_family.
6944 */
6945 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6946 child_event->state = PERF_EVENT_STATE_INACTIVE;
6947 else
6948 child_event->state = PERF_EVENT_STATE_OFF;
6949
6950 if (parent_event->attr.freq) {
6951 u64 sample_period = parent_event->hw.sample_period;
6952 struct hw_perf_event *hwc = &child_event->hw;
6953
6954 hwc->sample_period = sample_period;
6955 hwc->last_period = sample_period;
6956
6957 local64_set(&hwc->period_left, sample_period);
6958 }
6959
6960 child_event->ctx = child_ctx;
6961 child_event->overflow_handler = parent_event->overflow_handler;
6962
6963 /*
Thomas Gleixner614b6782010-12-03 16:24:32 -02006964 * Precalculate sample_data sizes
6965 */
6966 perf_event__header_size(child_event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006967 perf_event__id_header_size(child_event);
Thomas Gleixner614b6782010-12-03 16:24:32 -02006968
6969 /*
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006970 * Link it up in the child's context:
6971 */
Peter Zijlstracee010e2010-09-10 12:51:54 +02006972 raw_spin_lock_irqsave(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006973 add_event_to_ctx(child_event, child_ctx);
Peter Zijlstracee010e2010-09-10 12:51:54 +02006974 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006975
6976 /*
6977 * Get a reference to the parent filp - we will fput it
6978 * when the child event exits. This is safe to do because
6979 * we are in the parent and we know that the filp still
6980 * exists and has a nonzero count:
6981 */
6982 atomic_long_inc(&parent_event->filp->f_count);
6983
6984 /*
6985 * Link this into the parent event's child list
6986 */
6987 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6988 mutex_lock(&parent_event->child_mutex);
6989 list_add_tail(&child_event->child_list, &parent_event->child_list);
6990 mutex_unlock(&parent_event->child_mutex);
6991
6992 return child_event;
6993}
6994
6995static int inherit_group(struct perf_event *parent_event,
6996 struct task_struct *parent,
6997 struct perf_event_context *parent_ctx,
6998 struct task_struct *child,
6999 struct perf_event_context *child_ctx)
7000{
7001 struct perf_event *leader;
7002 struct perf_event *sub;
7003 struct perf_event *child_ctr;
7004
7005 leader = inherit_event(parent_event, parent, parent_ctx,
7006 child, NULL, child_ctx);
7007 if (IS_ERR(leader))
7008 return PTR_ERR(leader);
7009 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7010 child_ctr = inherit_event(sub, parent, parent_ctx,
7011 child, leader, child_ctx);
7012 if (IS_ERR(child_ctr))
7013 return PTR_ERR(child_ctr);
7014 }
7015 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007016}
7017
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007018static int
7019inherit_task_group(struct perf_event *event, struct task_struct *parent,
7020 struct perf_event_context *parent_ctx,
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007021 struct task_struct *child, int ctxn,
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007022 int *inherited_all)
7023{
7024 int ret;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007025 struct perf_event_context *child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007026
7027 if (!event->attr.inherit) {
7028 *inherited_all = 0;
7029 return 0;
7030 }
7031
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007032 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007033 if (!child_ctx) {
7034 /*
7035 * This is executed from the parent task context, so
7036 * inherit events that have been marked for cloning.
7037 * First allocate and initialize a context for the
7038 * child.
7039 */
7040
Peter Zijlstraeb184472010-09-07 15:55:13 +02007041 child_ctx = alloc_perf_context(event->pmu, child);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007042 if (!child_ctx)
7043 return -ENOMEM;
7044
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007045 child->perf_event_ctxp[ctxn] = child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007046 }
7047
7048 ret = inherit_group(event, parent, parent_ctx,
7049 child, child_ctx);
7050
7051 if (ret)
7052 *inherited_all = 0;
7053
7054 return ret;
7055}
7056
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007057/*
7058 * Initialize the perf_event context in task_struct
7059 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007060int perf_event_init_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007061{
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007062 struct perf_event_context *child_ctx, *parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007063 struct perf_event_context *cloned_ctx;
7064 struct perf_event *event;
7065 struct task_struct *parent = current;
7066 int inherited_all = 1;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007067 unsigned long flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007068 int ret = 0;
7069
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007070 if (likely(!parent->perf_event_ctxp[ctxn]))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007071 return 0;
7072
7073 /*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007074 * If the parent's context is a clone, pin it so it won't get
7075 * swapped under us.
7076 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007077 parent_ctx = perf_pin_task_context(parent, ctxn);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007078
7079 /*
7080 * No need to check if parent_ctx != NULL here; since we saw
7081 * it non-NULL earlier, the only reason for it to become NULL
7082 * is if we exit, and since we're currently in the middle of
7083 * a fork we can't be exiting at the same time.
7084 */
7085
7086 /*
7087 * Lock the parent list. No need to lock the child - not PID
7088 * hashed yet and not running, so nobody can access it.
7089 */
7090 mutex_lock(&parent_ctx->mutex);
7091
7092 /*
7093 * We don't have to disable NMIs - we are only looking at
7094 * the list, not manipulating it:
7095 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007096 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007097 ret = inherit_task_group(event, parent, parent_ctx,
7098 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007099 if (ret)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007100 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007101 }
7102
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007103 /*
7104 * We can't hold ctx->lock when iterating the ->flexible_groups list due
7105 * to allocations, but we need to prevent rotation because
7106 * rotate_ctx() will change the list from interrupt context.
7107 */
7108 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7109 parent_ctx->rotate_disable = 1;
7110 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7111
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007112 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007113 ret = inherit_task_group(event, parent, parent_ctx,
7114 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007115 if (ret)
7116 break;
7117 }
7118
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007119 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7120 parent_ctx->rotate_disable = 0;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007121
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007122 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007123
Peter Zijlstra05cbaa22009-12-30 16:00:35 +01007124 if (child_ctx && inherited_all) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007125 /*
7126 * Mark the child context as a clone of the parent
7127 * context, or of whatever the parent is a clone of.
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007128 *
7129 * Note that if the parent is a clone, the holding of
7130 * parent_ctx->lock avoids it from being uncloned.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007131 */
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007132 cloned_ctx = parent_ctx->parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007133 if (cloned_ctx) {
7134 child_ctx->parent_ctx = cloned_ctx;
7135 child_ctx->parent_gen = parent_ctx->parent_gen;
7136 } else {
7137 child_ctx->parent_ctx = parent_ctx;
7138 child_ctx->parent_gen = parent_ctx->generation;
7139 }
7140 get_ctx(child_ctx->parent_ctx);
7141 }
7142
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007143 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007144 mutex_unlock(&parent_ctx->mutex);
7145
7146 perf_unpin_context(parent_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007147 put_ctx(parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007148
7149 return ret;
7150}
7151
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007152/*
7153 * Initialize the perf_event context in task_struct
7154 */
7155int perf_event_init_task(struct task_struct *child)
7156{
7157 int ctxn, ret;
7158
Oleg Nesterov8550d7c2011-01-19 19:22:28 +01007159 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7160 mutex_init(&child->perf_event_mutex);
7161 INIT_LIST_HEAD(&child->perf_event_list);
7162
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007163 for_each_task_context_nr(ctxn) {
7164 ret = perf_event_init_context(child, ctxn);
7165 if (ret)
7166 return ret;
7167 }
7168
7169 return 0;
7170}
7171
Paul Mackerras220b1402010-03-10 20:45:52 +11007172static void __init perf_event_init_all_cpus(void)
7173{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007174 struct swevent_htable *swhash;
Paul Mackerras220b1402010-03-10 20:45:52 +11007175 int cpu;
Paul Mackerras220b1402010-03-10 20:45:52 +11007176
7177 for_each_possible_cpu(cpu) {
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007178 swhash = &per_cpu(swevent_htable, cpu);
7179 mutex_init(&swhash->hlist_mutex);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007180 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
Paul Mackerras220b1402010-03-10 20:45:52 +11007181 }
7182}
7183
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007184static void __cpuinit perf_event_init_cpu(int cpu)
7185{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007186 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007187
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007188 mutex_lock(&swhash->hlist_mutex);
7189 if (swhash->hlist_refcount > 0) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007190 struct swevent_hlist *hlist;
7191
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007192 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7193 WARN_ON(!hlist);
7194 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007195 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007196 mutex_unlock(&swhash->hlist_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007197}
7198
Peter Zijlstrac2774432010-12-08 15:29:02 +01007199#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007200static void perf_pmu_rotate_stop(struct pmu *pmu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007201{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007202 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7203
7204 WARN_ON(!irqs_disabled());
7205
7206 list_del_init(&cpuctx->rotation_list);
7207}
7208
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007209static void __perf_event_exit_context(void *__info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007210{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007211 struct perf_event_context *ctx = __info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007212 struct perf_event *event, *tmp;
7213
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007214 perf_pmu_rotate_stop(ctx->pmu);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02007215
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007216 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007217 __perf_remove_from_context(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007218 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007219 __perf_remove_from_context(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007220}
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007221
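/*
 * Tear down the per-cpu context of every registered PMU for the given
 * CPU; the pmus list is walked under SRCU and the teardown itself runs
 * on that CPU via a cross-call.
 */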
7222static void perf_event_exit_cpu_context(int cpu)
7223{
7224 struct perf_event_context *ctx;
7225 struct pmu *pmu;
7226 int idx;
7227
7228 idx = srcu_read_lock(&pmus_srcu);
7229 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra917bdd12010-09-17 11:28:49 +02007230 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007231
7232 mutex_lock(&ctx->mutex);
7233 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7234 mutex_unlock(&ctx->mutex);
7235 }
7236 srcu_read_unlock(&pmus_srcu, idx);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007237}
7238
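/*
 * CPU teardown: release the CPU's swevent hashtable and exit its
 * per-PMU contexts.
 */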
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007239static void perf_event_exit_cpu(int cpu)
7240{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007241 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007242
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007243 mutex_lock(&swhash->hlist_mutex);
7244 swevent_hlist_release(swhash);
7245 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007246
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007247 perf_event_exit_cpu_context(cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007248}
7249#else
7250static inline void perf_event_exit_cpu(int cpu) { }
7251#endif
7252
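/*
 * Reboot/kexec notifier: tear down perf on every online CPU before the
 * machine goes down.
 */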
Peter Zijlstrac2774432010-12-08 15:29:02 +01007253static int
7254perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7255{
7256 int cpu;
7257
7258 for_each_online_cpu(cpu)
7259 perf_event_exit_cpu(cpu);
7260
7261 return NOTIFY_OK;
7262}
7263
7264/*
7265 * Run the perf reboot notifier at the very last possible moment so that
7266 * the generic watchdog code runs as long as possible.
7267 */
7268static struct notifier_block perf_reboot_notifier = {
7269 .notifier_call = perf_reboot,
7270 .priority = INT_MIN,
7271};
7272
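/*
 * CPU hotplug notifier: set up perf state on CPU_UP_PREPARE /
 * CPU_DOWN_FAILED and tear it down on CPU_UP_CANCELED /
 * CPU_DOWN_PREPARE.
 */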
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007273static int __cpuinit
7274perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7275{
7276 unsigned int cpu = (long)hcpu;
7277
Peter Zijlstra5e116372010-06-11 13:35:08 +02007278 switch (action & ~CPU_TASKS_FROZEN) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007279
7280 case CPU_UP_PREPARE:
Peter Zijlstra5e116372010-06-11 13:35:08 +02007281 case CPU_DOWN_FAILED:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007282 perf_event_init_cpu(cpu);
7283 break;
7284
Peter Zijlstra5e116372010-06-11 13:35:08 +02007285 case CPU_UP_CANCELED:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007286 case CPU_DOWN_PREPARE:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007287 perf_event_exit_cpu(cpu);
7288 break;
7289
7290 default:
7291 break;
7292 }
7293
7294 return NOTIFY_OK;
7295}
7296
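/*
 * Core boot-time initialization: PMU idr, per-cpu state, pmus SRCU,
 * the software, cpu-clock and task-clock PMUs, tracepoint support,
 * the CPU hotplug and reboot notifiers, and hardware breakpoints.
 */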
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007297void __init perf_event_init(void)
7298{
Jason Wessel3c502e72010-11-04 17:33:01 -05007299 int ret;
7300
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007301 idr_init(&pmu_idr);
7302
Paul Mackerras220b1402010-03-10 20:45:52 +11007303 perf_event_init_all_cpus();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007304 init_srcu_struct(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007305 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7306 perf_pmu_register(&perf_cpu_clock, NULL, -1);
7307 perf_pmu_register(&perf_task_clock, NULL, -1);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007308 perf_tp_register();
7309 perf_cpu_notifier(perf_cpu_notify);
Peter Zijlstrac2774432010-12-08 15:29:02 +01007310 register_reboot_notifier(&perf_reboot_notifier);
Jason Wessel3c502e72010-11-04 17:33:01 -05007311
7312 ret = init_hw_breakpoint();
7313 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007314}
Peter Zijlstraabe43402010-11-17 23:17:37 +01007315
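/*
 * Late initcall: register the pmu bus and create sysfs devices for the
 * PMUs that registered before the bus was available.
 */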
7316static int __init perf_event_sysfs_init(void)
7317{
7318 struct pmu *pmu;
7319 int ret;
7320
7321 mutex_lock(&pmus_lock);
7322
7323 ret = bus_register(&pmu_bus);
7324 if (ret)
7325 goto unlock;
7326
7327 list_for_each_entry(pmu, &pmus, entry) {
7328 if (!pmu->name || pmu->type < 0)
7329 continue;
7330
7331 ret = pmu_dev_alloc(pmu);
7332 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
7333 }
7334 pmu_bus_running = 1;
7335 ret = 0;
7336
7337unlock:
7338 mutex_unlock(&pmus_lock);
7339
7340 return ret;
7341}
7342device_initcall(perf_event_sysfs_init);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007343
7344#ifdef CONFIG_CGROUP_PERF
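/*
 * Allocate a new perf cgroup: the css itself plus its per-cpu timing
 * info.
 */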
7345static struct cgroup_subsys_state *perf_cgroup_create(
7346 struct cgroup_subsys *ss, struct cgroup *cont)
7347{
7348 struct perf_cgroup *jc;
Stephane Eraniane5d13672011-02-14 11:20:01 +02007349
Li Zefan1b15d052011-03-03 14:26:06 +08007350 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007351 if (!jc)
7352 return ERR_PTR(-ENOMEM);
7353
Stephane Eraniane5d13672011-02-14 11:20:01 +02007354 jc->info = alloc_percpu(struct perf_cgroup_info);
7355 if (!jc->info) {
7356 kfree(jc);
7357 return ERR_PTR(-ENOMEM);
7358 }
7359
Stephane Eraniane5d13672011-02-14 11:20:01 +02007360 return &jc->css;
7361}
7362
7363static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7364 struct cgroup *cont)
7365{
7366 struct perf_cgroup *jc;
7367 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7368 struct perf_cgroup, css);
7369 free_percpu(jc->info);
7370 kfree(jc);
7371}
7372
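/*
 * Switch the task's cgroup events out and back in on the CPU it is
 * currently running on (via task_function_call()).
 */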
7373static int __perf_cgroup_move(void *info)
7374{
7375 struct task_struct *task = info;
7376 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7377 return 0;
7378}
7379
7380static void perf_cgroup_move(struct task_struct *task)
7381{
7382 task_function_call(task, __perf_cgroup_move, task);
7383}
7384
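/*
 * cgroup attach callback: re-schedule the cgroup events of the task
 * and, for threadgroup attaches, of every thread in the group.
 */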
7385static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7386 struct cgroup *old_cgrp, struct task_struct *task,
7387 bool threadgroup)
7388{
7389 perf_cgroup_move(task);
7390 if (threadgroup) {
7391 struct task_struct *c;
7392 rcu_read_lock();
7393 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7394 perf_cgroup_move(c);
7395 }
7396 rcu_read_unlock();
7397 }
7398}
7399
7400static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7401 struct cgroup *old_cgrp, struct task_struct *task)
7402{
7403 /*
7404 * cgroup_exit() is called in the copy_process() failure path.
7405	 * Ignore this case since the task hasn't run yet; this avoids
7406	 * trying to poke a half-freed task state from generic code.
7407 */
7408 if (!(task->flags & PF_EXITING))
7409 return;
7410
7411 perf_cgroup_move(task);
7412}
7413
7414struct cgroup_subsys perf_subsys = {
7415 .name = "perf_event",
7416 .subsys_id = perf_subsys_id,
7417 .create = perf_cgroup_create,
7418 .destroy = perf_cgroup_destroy,
7419 .exit = perf_cgroup_exit,
7420 .attach = perf_cgroup_attach,
7421};
7422#endif /* CONFIG_CGROUP_PERF */