Blame - kernel/perf_event.c - android_kernel_htc_msm8960

blob: fb851ec34461c932620374f8d8236dbbcb57bb84 [file] [log] [blame]

Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1	/*
Ingo Molnar	57c0c15	2009-09-21 12:20:38 +0200	[diff] [blame]	2	* Performance events core code:
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3	*
				4	* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
				5	* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
				6	* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
				7	* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
				8	*
Ingo Molnar	57c0c15	2009-09-21 12:20:38 +0200	[diff] [blame]	9	* For licensing details see kernel-base/COPYING
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	10	*/
				11
				12	#include <linux/fs.h>
				13	#include <linux/mm.h>
				14	#include <linux/cpu.h>
				15	#include <linux/smp.h>
				16	#include <linux/file.h>
				17	#include <linux/poll.h>
				18	#include <linux/sysfs.h>
				19	#include <linux/dcache.h>
				20	#include <linux/percpu.h>
				21	#include <linux/ptrace.h>
				22	#include <linux/vmstat.h>
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	23	#include <linux/vmalloc.h>
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	24	#include <linux/hardirq.h>
				25	#include <linux/rculist.h>
				26	#include <linux/uaccess.h>
				27	#include <linux/syscalls.h>
				28	#include <linux/anon_inodes.h>
				29	#include <linux/kernel_stat.h>
				30	#include <linux/perf_event.h>
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	31	#include <linux/ftrace_event.h>
Frederic Weisbecker	24f1e32c	2009-09-09 19:22:48 +0200	[diff] [blame]	32	#include <linux/hw_breakpoint.h>
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	33
				34	#include <asm/irq_regs.h>
				35
				36	/*
				37	* Each CPU has a list of per CPU events:
				38	*/
				39	DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
				40
				41	int perf_max_events __read_mostly = 1;
				42	static int perf_reserved_percpu __read_mostly;
				43	static int perf_overcommit __read_mostly = 1;
				44
				45	static atomic_t nr_events __read_mostly;
				46	static atomic_t nr_mmap_events __read_mostly;
				47	static atomic_t nr_comm_events __read_mostly;
				48	static atomic_t nr_task_events __read_mostly;
				49
				50	/*
				51	* perf event paranoia level:
				52	* -1 - not paranoid at all
				53	* 0 - disallow raw tracepoint access for unpriv
				54	* 1 - disallow cpu events for unpriv
				55	* 2 - disallow kernel profiling for unpriv
				56	*/
				57	int sysctl_perf_event_paranoid __read_mostly = 1;
				58
				59	static inline bool perf_paranoid_tracepoint_raw(void)
				60	{
				61	return sysctl_perf_event_paranoid > -1;
				62	}
				63
				64	static inline bool perf_paranoid_cpu(void)
				65	{
				66	return sysctl_perf_event_paranoid > 0;
				67	}
				68
				69	static inline bool perf_paranoid_kernel(void)
				70	{
				71	return sysctl_perf_event_paranoid > 1;
				72	}
				73
				74	int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
				75
				76	/*
				77	* max perf event sample rate
				78	*/
				79	int sysctl_perf_event_sample_rate __read_mostly = 100000;
				80
				81	static atomic64_t perf_event_id;
				82
				83	/*
				84	* Lock for (sysadmin-configurable) event reservations:
				85	*/
				86	static DEFINE_SPINLOCK(perf_resource_lock);
				87
				88	/*
				89	* Architecture provided APIs - weak aliases:
				90	*/
				91	extern __weak const struct pmu hw_perf_event_init(struct perf_event event)
				92	{
				93	return NULL;
				94	}
				95
				96	void __weak hw_perf_disable(void) { barrier(); }
				97	void __weak hw_perf_enable(void) { barrier(); }
				98
				99	void __weak hw_perf_event_setup(int cpu) { barrier(); }
				100	void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
				101
				102	int __weak
				103	hw_perf_group_sched_in(struct perf_event *group_leader,
				104	struct perf_cpu_context *cpuctx,
				105	struct perf_event_context *ctx, int cpu)
				106	{
				107	return 0;
				108	}
				109
				110	void __weak perf_event_print_debug(void) { }
				111
				112	static DEFINE_PER_CPU(int, perf_disable_count);
				113
				114	void __perf_disable(void)
				115	{
				116	__get_cpu_var(perf_disable_count)++;
				117	}
				118
				119	bool __perf_enable(void)
				120	{
				121	return !--__get_cpu_var(perf_disable_count);
				122	}
				123
				124	void perf_disable(void)
				125	{
				126	__perf_disable();
				127	hw_perf_disable();
				128	}
				129
				130	void perf_enable(void)
				131	{
				132	if (__perf_enable())
				133	hw_perf_enable();
				134	}
				135
				136	static void get_ctx(struct perf_event_context *ctx)
				137	{
				138	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
				139	}
				140
				141	static void free_ctx(struct rcu_head *head)
				142	{
				143	struct perf_event_context *ctx;
				144
				145	ctx = container_of(head, struct perf_event_context, rcu_head);
				146	kfree(ctx);
				147	}
				148
				149	static void put_ctx(struct perf_event_context *ctx)
				150	{
				151	if (atomic_dec_and_test(&ctx->refcount)) {
				152	if (ctx->parent_ctx)
				153	put_ctx(ctx->parent_ctx);
				154	if (ctx->task)
				155	put_task_struct(ctx->task);
				156	call_rcu(&ctx->rcu_head, free_ctx);
				157	}
				158	}
				159
				160	static void unclone_ctx(struct perf_event_context *ctx)
				161	{
				162	if (ctx->parent_ctx) {
				163	put_ctx(ctx->parent_ctx);
				164	ctx->parent_ctx = NULL;
				165	}
				166	}
				167
				168	/*
				169	* If we inherit events we want to return the parent event id
				170	* to userspace.
				171	*/
				172	static u64 primary_event_id(struct perf_event *event)
				173	{
				174	u64 id = event->id;
				175
				176	if (event->parent)
				177	id = event->parent->id;
				178
				179	return id;
				180	}
				181
				182	/*
				183	* Get the perf_event_context for a task and lock it.
				184	* This has to cope with with the fact that until it is locked,
				185	* the context could get moved to another task.
				186	*/
				187	static struct perf_event_context *
				188	perf_lock_task_context(struct task_struct task, unsigned long flags)
				189	{
				190	struct perf_event_context *ctx;
				191
				192	rcu_read_lock();
				193	retry:
				194	ctx = rcu_dereference(task->perf_event_ctxp);
				195	if (ctx) {
				196	/*
				197	* If this context is a clone of another, it might
				198	* get swapped for another underneath us by
				199	* perf_event_task_sched_out, though the
				200	* rcu_read_lock() protects us from any context
				201	* getting freed. Lock the context and check if it
				202	* got swapped before we could get the lock, and retry
				203	* if so. If we locked the right context, then it
				204	* can't get swapped on us any more.
				205	*/
				206	spin_lock_irqsave(&ctx->lock, *flags);
				207	if (ctx != rcu_dereference(task->perf_event_ctxp)) {
				208	spin_unlock_irqrestore(&ctx->lock, *flags);
				209	goto retry;
				210	}
				211
				212	if (!atomic_inc_not_zero(&ctx->refcount)) {
				213	spin_unlock_irqrestore(&ctx->lock, *flags);
				214	ctx = NULL;
				215	}
				216	}
				217	rcu_read_unlock();
				218	return ctx;
				219	}
				220
				221	/*
				222	* Get the context for a task and increment its pin_count so it
				223	* can't get swapped to another task. This also increments its
				224	* reference count so that the context can't get freed.
				225	*/
				226	static struct perf_event_context perf_pin_task_context(struct task_struct task)
				227	{
				228	struct perf_event_context *ctx;
				229	unsigned long flags;
				230
				231	ctx = perf_lock_task_context(task, &flags);
				232	if (ctx) {
				233	++ctx->pin_count;
				234	spin_unlock_irqrestore(&ctx->lock, flags);
				235	}
				236	return ctx;
				237	}
				238
				239	static void perf_unpin_context(struct perf_event_context *ctx)
				240	{
				241	unsigned long flags;
				242
				243	spin_lock_irqsave(&ctx->lock, flags);
				244	--ctx->pin_count;
				245	spin_unlock_irqrestore(&ctx->lock, flags);
				246	put_ctx(ctx);
				247	}
				248
				249	/*
				250	* Add a event from the lists for its context.
				251	* Must be called with ctx->mutex and ctx->lock held.
				252	*/
				253	static void
				254	list_add_event(struct perf_event event, struct perf_event_context ctx)
				255	{
				256	struct perf_event *group_leader = event->group_leader;
				257
				258	/*
				259	* Depending on whether it is a standalone or sibling event,
				260	* add it straight to the context's event list, or to the group
				261	* leader's sibling list:
				262	*/
				263	if (group_leader == event)
				264	list_add_tail(&event->group_entry, &ctx->group_list);
				265	else {
				266	list_add_tail(&event->group_entry, &group_leader->sibling_list);
				267	group_leader->nr_siblings++;
				268	}
				269
				270	list_add_rcu(&event->event_entry, &ctx->event_list);
				271	ctx->nr_events++;
				272	if (event->attr.inherit_stat)
				273	ctx->nr_stat++;
				274	}
				275
				276	/*
				277	* Remove a event from the lists for its context.
				278	* Must be called with ctx->mutex and ctx->lock held.
				279	*/
				280	static void
				281	list_del_event(struct perf_event event, struct perf_event_context ctx)
				282	{
				283	struct perf_event sibling, tmp;
				284
				285	if (list_empty(&event->group_entry))
				286	return;
				287	ctx->nr_events--;
				288	if (event->attr.inherit_stat)
				289	ctx->nr_stat--;
				290
				291	list_del_init(&event->group_entry);
				292	list_del_rcu(&event->event_entry);
				293
				294	if (event->group_leader != event)
				295	event->group_leader->nr_siblings--;
				296
Peter Zijlstra	2e2af50	2009-11-23 11:37:25 +0100	[diff] [blame^]	297	event->state = PERF_EVENT_STATE_OFF;
				298
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	299	/*
				300	* If this was a group event with sibling events then
				301	* upgrade the siblings to singleton events by adding them
				302	* to the context list directly:
				303	*/
				304	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
				305
				306	list_move_tail(&sibling->group_entry, &ctx->group_list);
				307	sibling->group_leader = sibling;
				308	}
				309	}
				310
				311	static void
				312	event_sched_out(struct perf_event *event,
				313	struct perf_cpu_context *cpuctx,
				314	struct perf_event_context *ctx)
				315	{
				316	if (event->state != PERF_EVENT_STATE_ACTIVE)
				317	return;
				318
				319	event->state = PERF_EVENT_STATE_INACTIVE;
				320	if (event->pending_disable) {
				321	event->pending_disable = 0;
				322	event->state = PERF_EVENT_STATE_OFF;
				323	}
				324	event->tstamp_stopped = ctx->time;
				325	event->pmu->disable(event);
				326	event->oncpu = -1;
				327
				328	if (!is_software_event(event))
				329	cpuctx->active_oncpu--;
				330	ctx->nr_active--;
				331	if (event->attr.exclusive \|\| !cpuctx->active_oncpu)
				332	cpuctx->exclusive = 0;
				333	}
				334
				335	static void
				336	group_sched_out(struct perf_event *group_event,
				337	struct perf_cpu_context *cpuctx,
				338	struct perf_event_context *ctx)
				339	{
				340	struct perf_event *event;
				341
				342	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
				343	return;
				344
				345	event_sched_out(group_event, cpuctx, ctx);
				346
				347	/*
				348	* Schedule out siblings (if any):
				349	*/
				350	list_for_each_entry(event, &group_event->sibling_list, group_entry)
				351	event_sched_out(event, cpuctx, ctx);
				352
				353	if (group_event->attr.exclusive)
				354	cpuctx->exclusive = 0;
				355	}
				356
				357	/*
				358	* Cross CPU call to remove a performance event
				359	*
				360	* We disable the event on the hardware level first. After that we
				361	* remove it from the context list.
				362	*/
				363	static void __perf_event_remove_from_context(void *info)
				364	{
				365	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				366	struct perf_event *event = info;
				367	struct perf_event_context *ctx = event->ctx;
				368
				369	/*
				370	* If this is a task context, we need to check whether it is
				371	* the current task context of this cpu. If not it has been
				372	* scheduled out before the smp call arrived.
				373	*/
				374	if (ctx->task && cpuctx->task_ctx != ctx)
				375	return;
				376
				377	spin_lock(&ctx->lock);
				378	/*
				379	* Protect the list operation against NMI by disabling the
				380	* events on a global level.
				381	*/
				382	perf_disable();
				383
				384	event_sched_out(event, cpuctx, ctx);
				385
				386	list_del_event(event, ctx);
				387
				388	if (!ctx->task) {
				389	/*
				390	* Allow more per task events with respect to the
				391	* reservation:
				392	*/
				393	cpuctx->max_pertask =
				394	min(perf_max_events - ctx->nr_events,
				395	perf_max_events - perf_reserved_percpu);
				396	}
				397
				398	perf_enable();
				399	spin_unlock(&ctx->lock);
				400	}
				401
				402
				403	/*
				404	* Remove the event from a task's (or a CPU's) list of events.
				405	*
				406	* Must be called with ctx->mutex held.
				407	*
				408	* CPU events are removed with a smp call. For task events we only
				409	* call when the task is on a CPU.
				410	*
				411	* If event->ctx is a cloned context, callers must make sure that
				412	* every task struct that event->ctx->task could possibly point to
				413	* remains valid. This is OK when called from perf_release since
				414	* that only calls us on the top-level context, which can't be a clone.
				415	* When called from perf_event_exit_task, it's OK because the
				416	* context has been detached from its task.
				417	*/
				418	static void perf_event_remove_from_context(struct perf_event *event)
				419	{
				420	struct perf_event_context *ctx = event->ctx;
				421	struct task_struct *task = ctx->task;
				422
				423	if (!task) {
				424	/*
				425	* Per cpu events are removed via an smp call and
				426	* the removal is always sucessful.
				427	*/
				428	smp_call_function_single(event->cpu,
				429	__perf_event_remove_from_context,
				430	event, 1);
				431	return;
				432	}
				433
				434	retry:
				435	task_oncpu_function_call(task, __perf_event_remove_from_context,
				436	event);
				437
				438	spin_lock_irq(&ctx->lock);
				439	/*
				440	* If the context is active we need to retry the smp call.
				441	*/
				442	if (ctx->nr_active && !list_empty(&event->group_entry)) {
				443	spin_unlock_irq(&ctx->lock);
				444	goto retry;
				445	}
				446
				447	/*
				448	* The lock prevents that this context is scheduled in so we
				449	* can remove the event safely, if the call above did not
				450	* succeed.
				451	*/
Peter Zijlstra	6c2bfcb	2009-11-23 11:37:24 +0100	[diff] [blame]	452	if (!list_empty(&event->group_entry))
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	453	list_del_event(event, ctx);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	454	spin_unlock_irq(&ctx->lock);
				455	}
				456
				457	static inline u64 perf_clock(void)
				458	{
				459	return cpu_clock(smp_processor_id());
				460	}
				461
				462	/*
				463	* Update the record of the current time in a context.
				464	*/
				465	static void update_context_time(struct perf_event_context *ctx)
				466	{
				467	u64 now = perf_clock();
				468
				469	ctx->time += now - ctx->timestamp;
				470	ctx->timestamp = now;
				471	}
				472
				473	/*
				474	* Update the total_time_enabled and total_time_running fields for a event.
				475	*/
				476	static void update_event_times(struct perf_event *event)
				477	{
				478	struct perf_event_context *ctx = event->ctx;
				479	u64 run_end;
				480
				481	if (event->state < PERF_EVENT_STATE_INACTIVE \|\|
				482	event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
				483	return;
				484
				485	event->total_time_enabled = ctx->time - event->tstamp_enabled;
				486
				487	if (event->state == PERF_EVENT_STATE_INACTIVE)
				488	run_end = event->tstamp_stopped;
				489	else
				490	run_end = ctx->time;
				491
				492	event->total_time_running = run_end - event->tstamp_running;
				493	}
				494
				495	/*
				496	* Update total_time_enabled and total_time_running for all events in a group.
				497	*/
				498	static void update_group_times(struct perf_event *leader)
				499	{
				500	struct perf_event *event;
				501
				502	update_event_times(leader);
				503	list_for_each_entry(event, &leader->sibling_list, group_entry)
				504	update_event_times(event);
				505	}
				506
				507	/*
				508	* Cross CPU call to disable a performance event
				509	*/
				510	static void __perf_event_disable(void *info)
				511	{
				512	struct perf_event *event = info;
				513	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				514	struct perf_event_context *ctx = event->ctx;
				515
				516	/*
				517	* If this is a per-task event, need to check whether this
				518	* event's task is the current task on this cpu.
				519	*/
				520	if (ctx->task && cpuctx->task_ctx != ctx)
				521	return;
				522
				523	spin_lock(&ctx->lock);
				524
				525	/*
				526	* If the event is on, turn it off.
				527	* If it is in error state, leave it in error state.
				528	*/
				529	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
				530	update_context_time(ctx);
				531	update_group_times(event);
				532	if (event == event->group_leader)
				533	group_sched_out(event, cpuctx, ctx);
				534	else
				535	event_sched_out(event, cpuctx, ctx);
				536	event->state = PERF_EVENT_STATE_OFF;
				537	}
				538
				539	spin_unlock(&ctx->lock);
				540	}
				541
				542	/*
				543	* Disable a event.
				544	*
				545	* If event->ctx is a cloned context, callers must make sure that
				546	* every task struct that event->ctx->task could possibly point to
				547	* remains valid. This condition is satisifed when called through
				548	* perf_event_for_each_child or perf_event_for_each because they
				549	* hold the top-level event's child_mutex, so any descendant that
				550	* goes to exit will block in sync_child_event.
				551	* When called from perf_pending_event it's OK because event->ctx
				552	* is the current context on this CPU and preemption is disabled,
				553	* hence we can't get into perf_event_task_sched_out for this context.
				554	*/
				555	static void perf_event_disable(struct perf_event *event)
				556	{
				557	struct perf_event_context *ctx = event->ctx;
				558	struct task_struct *task = ctx->task;
				559
				560	if (!task) {
				561	/*
				562	* Disable the event on the cpu that it's on
				563	*/
				564	smp_call_function_single(event->cpu, __perf_event_disable,
				565	event, 1);
				566	return;
				567	}
				568
				569	retry:
				570	task_oncpu_function_call(task, __perf_event_disable, event);
				571
				572	spin_lock_irq(&ctx->lock);
				573	/*
				574	* If the event is still active, we need to retry the cross-call.
				575	*/
				576	if (event->state == PERF_EVENT_STATE_ACTIVE) {
				577	spin_unlock_irq(&ctx->lock);
				578	goto retry;
				579	}
				580
				581	/*
				582	* Since we have the lock this context can't be scheduled
				583	* in, so we can change the state safely.
				584	*/
				585	if (event->state == PERF_EVENT_STATE_INACTIVE) {
				586	update_group_times(event);
				587	event->state = PERF_EVENT_STATE_OFF;
				588	}
				589
				590	spin_unlock_irq(&ctx->lock);
				591	}
				592
				593	static int
				594	event_sched_in(struct perf_event *event,
				595	struct perf_cpu_context *cpuctx,
				596	struct perf_event_context *ctx,
				597	int cpu)
				598	{
				599	if (event->state <= PERF_EVENT_STATE_OFF)
				600	return 0;
				601
				602	event->state = PERF_EVENT_STATE_ACTIVE;
				603	event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
				604	/*
				605	* The new state must be visible before we turn it on in the hardware:
				606	*/
				607	smp_wmb();
				608
				609	if (event->pmu->enable(event)) {
				610	event->state = PERF_EVENT_STATE_INACTIVE;
				611	event->oncpu = -1;
				612	return -EAGAIN;
				613	}
				614
				615	event->tstamp_running += ctx->time - event->tstamp_stopped;
				616
				617	if (!is_software_event(event))
				618	cpuctx->active_oncpu++;
				619	ctx->nr_active++;
				620
				621	if (event->attr.exclusive)
				622	cpuctx->exclusive = 1;
				623
				624	return 0;
				625	}
				626
				627	static int
				628	group_sched_in(struct perf_event *group_event,
				629	struct perf_cpu_context *cpuctx,
				630	struct perf_event_context *ctx,
				631	int cpu)
				632	{
				633	struct perf_event event, partial_group;
				634	int ret;
				635
				636	if (group_event->state == PERF_EVENT_STATE_OFF)
				637	return 0;
				638
				639	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
				640	if (ret)
				641	return ret < 0 ? ret : 0;
				642
				643	if (event_sched_in(group_event, cpuctx, ctx, cpu))
				644	return -EAGAIN;
				645
				646	/*
				647	* Schedule in siblings as one group (if any):
				648	*/
				649	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
				650	if (event_sched_in(event, cpuctx, ctx, cpu)) {
				651	partial_group = event;
				652	goto group_error;
				653	}
				654	}
				655
				656	return 0;
				657
				658	group_error:
				659	/*
				660	* Groups can be scheduled in as one unit only, so undo any
				661	* partial group before returning:
				662	*/
				663	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
				664	if (event == partial_group)
				665	break;
				666	event_sched_out(event, cpuctx, ctx);
				667	}
				668	event_sched_out(group_event, cpuctx, ctx);
				669
				670	return -EAGAIN;
				671	}
				672
				673	/*
				674	* Return 1 for a group consisting entirely of software events,
				675	* 0 if the group contains any hardware events.
				676	*/
				677	static int is_software_only_group(struct perf_event *leader)
				678	{
				679	struct perf_event *event;
				680
				681	if (!is_software_event(leader))
				682	return 0;
				683
				684	list_for_each_entry(event, &leader->sibling_list, group_entry)
				685	if (!is_software_event(event))
				686	return 0;
				687
				688	return 1;
				689	}
				690
				691	/*
				692	* Work out whether we can put this event group on the CPU now.
				693	*/
				694	static int group_can_go_on(struct perf_event *event,
				695	struct perf_cpu_context *cpuctx,
				696	int can_add_hw)
				697	{
				698	/*
				699	* Groups consisting entirely of software events can always go on.
				700	*/
				701	if (is_software_only_group(event))
				702	return 1;
				703	/*
				704	* If an exclusive group is already on, no other hardware
				705	* events can go on.
				706	*/
				707	if (cpuctx->exclusive)
				708	return 0;
				709	/*
				710	* If this group is exclusive and there are already
				711	* events on the CPU, it can't go on.
				712	*/
				713	if (event->attr.exclusive && cpuctx->active_oncpu)
				714	return 0;
				715	/*
				716	* Otherwise, try to add it if all previous groups were able
				717	* to go on.
				718	*/
				719	return can_add_hw;
				720	}
				721
				722	static void add_event_to_ctx(struct perf_event *event,
				723	struct perf_event_context *ctx)
				724	{
				725	list_add_event(event, ctx);
				726	event->tstamp_enabled = ctx->time;
				727	event->tstamp_running = ctx->time;
				728	event->tstamp_stopped = ctx->time;
				729	}
				730
				731	/*
				732	* Cross CPU call to install and enable a performance event
				733	*
				734	* Must be called with ctx->mutex held
				735	*/
				736	static void __perf_install_in_context(void *info)
				737	{
				738	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				739	struct perf_event *event = info;
				740	struct perf_event_context *ctx = event->ctx;
				741	struct perf_event *leader = event->group_leader;
				742	int cpu = smp_processor_id();
				743	int err;
				744
				745	/*
				746	* If this is a task context, we need to check whether it is
				747	* the current task context of this cpu. If not it has been
				748	* scheduled out before the smp call arrived.
				749	* Or possibly this is the right context but it isn't
				750	* on this cpu because it had no events.
				751	*/
				752	if (ctx->task && cpuctx->task_ctx != ctx) {
				753	if (cpuctx->task_ctx \|\| ctx->task != current)
				754	return;
				755	cpuctx->task_ctx = ctx;
				756	}
				757
				758	spin_lock(&ctx->lock);
				759	ctx->is_active = 1;
				760	update_context_time(ctx);
				761
				762	/*
				763	* Protect the list operation against NMI by disabling the
				764	* events on a global level. NOP for non NMI based events.
				765	*/
				766	perf_disable();
				767
				768	add_event_to_ctx(event, ctx);
				769
				770	/*
				771	* Don't put the event on if it is disabled or if
				772	* it is in a group and the group isn't on.
				773	*/
				774	if (event->state != PERF_EVENT_STATE_INACTIVE \|\|
				775	(leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
				776	goto unlock;
				777
				778	/*
				779	* An exclusive event can't go on if there are already active
				780	* hardware events, and no hardware event can go on if there
				781	* is already an exclusive event on.
				782	*/
				783	if (!group_can_go_on(event, cpuctx, 1))
				784	err = -EEXIST;
				785	else
				786	err = event_sched_in(event, cpuctx, ctx, cpu);
				787
				788	if (err) {
				789	/*
				790	* This event couldn't go on. If it is in a group
				791	* then we have to pull the whole group off.
				792	* If the event group is pinned then put it in error state.
				793	*/
				794	if (leader != event)
				795	group_sched_out(leader, cpuctx, ctx);
				796	if (leader->attr.pinned) {
				797	update_group_times(leader);
				798	leader->state = PERF_EVENT_STATE_ERROR;
				799	}
				800	}
				801
				802	if (!err && !ctx->task && cpuctx->max_pertask)
				803	cpuctx->max_pertask--;
				804
				805	unlock:
				806	perf_enable();
				807
				808	spin_unlock(&ctx->lock);
				809	}
				810
				811	/*
				812	* Attach a performance event to a context
				813	*
				814	* First we add the event to the list with the hardware enable bit
				815	* in event->hw_config cleared.
				816	*
				817	* If the event is attached to a task which is on a CPU we use a smp
				818	* call to enable it in the task context. The task might have been
				819	* scheduled away, but we check this in the smp call again.
				820	*
				821	* Must be called with ctx->mutex held.
				822	*/
				823	static void
				824	perf_install_in_context(struct perf_event_context *ctx,
				825	struct perf_event *event,
				826	int cpu)
				827	{
				828	struct task_struct *task = ctx->task;
				829
				830	if (!task) {
				831	/*
				832	* Per cpu events are installed via an smp call and
				833	* the install is always sucessful.
				834	*/
				835	smp_call_function_single(cpu, __perf_install_in_context,
				836	event, 1);
				837	return;
				838	}
				839
				840	retry:
				841	task_oncpu_function_call(task, __perf_install_in_context,
				842	event);
				843
				844	spin_lock_irq(&ctx->lock);
				845	/*
				846	* we need to retry the smp call.
				847	*/
				848	if (ctx->is_active && list_empty(&event->group_entry)) {
				849	spin_unlock_irq(&ctx->lock);
				850	goto retry;
				851	}
				852
				853	/*
				854	* The lock prevents that this context is scheduled in so we
				855	* can add the event safely, if it the call above did not
				856	* succeed.
				857	*/
				858	if (list_empty(&event->group_entry))
				859	add_event_to_ctx(event, ctx);
				860	spin_unlock_irq(&ctx->lock);
				861	}
				862
				863	/*
				864	* Put a event into inactive state and update time fields.
				865	* Enabling the leader of a group effectively enables all
				866	* the group members that aren't explicitly disabled, so we
				867	* have to update their ->tstamp_enabled also.
				868	* Note: this works for group members as well as group leaders
				869	* since the non-leader members' sibling_lists will be empty.
				870	*/
				871	static void __perf_event_mark_enabled(struct perf_event *event,
				872	struct perf_event_context *ctx)
				873	{
				874	struct perf_event *sub;
				875
				876	event->state = PERF_EVENT_STATE_INACTIVE;
				877	event->tstamp_enabled = ctx->time - event->total_time_enabled;
				878	list_for_each_entry(sub, &event->sibling_list, group_entry)
				879	if (sub->state >= PERF_EVENT_STATE_INACTIVE)
				880	sub->tstamp_enabled =
				881	ctx->time - sub->total_time_enabled;
				882	}
				883
				884	/*
				885	* Cross CPU call to enable a performance event
				886	*/
				887	static void __perf_event_enable(void *info)
				888	{
				889	struct perf_event *event = info;
				890	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				891	struct perf_event_context *ctx = event->ctx;
				892	struct perf_event *leader = event->group_leader;
				893	int err;
				894
				895	/*
				896	* If this is a per-task event, need to check whether this
				897	* event's task is the current task on this cpu.
				898	*/
				899	if (ctx->task && cpuctx->task_ctx != ctx) {
				900	if (cpuctx->task_ctx \|\| ctx->task != current)
				901	return;
				902	cpuctx->task_ctx = ctx;
				903	}
				904
				905	spin_lock(&ctx->lock);
				906	ctx->is_active = 1;
				907	update_context_time(ctx);
				908
				909	if (event->state >= PERF_EVENT_STATE_INACTIVE)
				910	goto unlock;
				911	__perf_event_mark_enabled(event, ctx);
				912
				913	/*
				914	* If the event is in a group and isn't the group leader,
				915	* then don't put it on unless the group is on.
				916	*/
				917	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
				918	goto unlock;
				919
				920	if (!group_can_go_on(event, cpuctx, 1)) {
				921	err = -EEXIST;
				922	} else {
				923	perf_disable();
				924	if (event == leader)
				925	err = group_sched_in(event, cpuctx, ctx,
				926	smp_processor_id());
				927	else
				928	err = event_sched_in(event, cpuctx, ctx,
				929	smp_processor_id());
				930	perf_enable();
				931	}
				932
				933	if (err) {
				934	/*
				935	* If this event can't go on and it's part of a
				936	* group, then the whole group has to come off.
				937	*/
				938	if (leader != event)
				939	group_sched_out(leader, cpuctx, ctx);
				940	if (leader->attr.pinned) {
				941	update_group_times(leader);
				942	leader->state = PERF_EVENT_STATE_ERROR;
				943	}
				944	}
				945
				946	unlock:
				947	spin_unlock(&ctx->lock);
				948	}
				949
				950	/*
				951	* Enable a event.
				952	*
				953	* If event->ctx is a cloned context, callers must make sure that
				954	* every task struct that event->ctx->task could possibly point to
				955	* remains valid. This condition is satisfied when called through
				956	* perf_event_for_each_child or perf_event_for_each as described
				957	* for perf_event_disable.
				958	*/
				959	static void perf_event_enable(struct perf_event *event)
				960	{
				961	struct perf_event_context *ctx = event->ctx;
				962	struct task_struct *task = ctx->task;
				963
				964	if (!task) {
				965	/*
				966	* Enable the event on the cpu that it's on
				967	*/
				968	smp_call_function_single(event->cpu, __perf_event_enable,
				969	event, 1);
				970	return;
				971	}
				972
				973	spin_lock_irq(&ctx->lock);
				974	if (event->state >= PERF_EVENT_STATE_INACTIVE)
				975	goto out;
				976
				977	/*
				978	* If the event is in error state, clear that first.
				979	* That way, if we see the event in error state below, we
				980	* know that it has gone back into error state, as distinct
				981	* from the task having been scheduled away before the
				982	* cross-call arrived.
				983	*/
				984	if (event->state == PERF_EVENT_STATE_ERROR)
				985	event->state = PERF_EVENT_STATE_OFF;
				986
				987	retry:
				988	spin_unlock_irq(&ctx->lock);
				989	task_oncpu_function_call(task, __perf_event_enable, event);
				990
				991	spin_lock_irq(&ctx->lock);
				992
				993	/*
				994	* If the context is active and the event is still off,
				995	* we need to retry the cross-call.
				996	*/
				997	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
				998	goto retry;
				999
				1000	/*
				1001	* Since we have the lock this context can't be scheduled
				1002	* in, so we can change the state safely.
				1003	*/
				1004	if (event->state == PERF_EVENT_STATE_OFF)
				1005	__perf_event_mark_enabled(event, ctx);
				1006
				1007	out:
				1008	spin_unlock_irq(&ctx->lock);
				1009	}
				1010
				1011	static int perf_event_refresh(struct perf_event *event, int refresh)
				1012	{
				1013	/*
				1014	* not supported on inherited events
				1015	*/
				1016	if (event->attr.inherit)
				1017	return -EINVAL;
				1018
				1019	atomic_add(refresh, &event->event_limit);
				1020	perf_event_enable(event);
				1021
				1022	return 0;
				1023	}
				1024
				1025	void __perf_event_sched_out(struct perf_event_context *ctx,
				1026	struct perf_cpu_context *cpuctx)
				1027	{
				1028	struct perf_event *event;
				1029
				1030	spin_lock(&ctx->lock);
				1031	ctx->is_active = 0;
				1032	if (likely(!ctx->nr_events))
				1033	goto out;
				1034	update_context_time(ctx);
				1035
				1036	perf_disable();
Peter Zijlstra	6c2bfcb	2009-11-23 11:37:24 +0100	[diff] [blame]	1037	if (ctx->nr_active) {
Xiao Guangrong	8c9ed8e	2009-09-25 13:51:17 +0800	[diff] [blame]	1038	list_for_each_entry(event, &ctx->group_list, group_entry)
				1039	group_sched_out(event, cpuctx, ctx);
Peter Zijlstra	6c2bfcb	2009-11-23 11:37:24 +0100	[diff] [blame]	1040	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1041	perf_enable();
				1042	out:
				1043	spin_unlock(&ctx->lock);
				1044	}
				1045
				1046	/*
				1047	* Test whether two contexts are equivalent, i.e. whether they
				1048	* have both been cloned from the same version of the same context
				1049	* and they both have the same number of enabled events.
				1050	* If the number of enabled events is the same, then the set
				1051	* of enabled events should be the same, because these are both
				1052	* inherited contexts, therefore we can't access individual events
				1053	* in them directly with an fd; we can only enable/disable all
				1054	* events via prctl, or enable/disable all events in a family
				1055	* via ioctl, which will have the same effect on both contexts.
				1056	*/
				1057	static int context_equiv(struct perf_event_context *ctx1,
				1058	struct perf_event_context *ctx2)
				1059	{
				1060	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
				1061	&& ctx1->parent_gen == ctx2->parent_gen
				1062	&& !ctx1->pin_count && !ctx2->pin_count;
				1063	}
				1064
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1065	static void __perf_event_sync_stat(struct perf_event *event,
				1066	struct perf_event *next_event)
				1067	{
				1068	u64 value;
				1069
				1070	if (!event->attr.inherit_stat)
				1071	return;
				1072
				1073	/*
				1074	* Update the event value, we cannot use perf_event_read()
				1075	* because we're in the middle of a context switch and have IRQs
				1076	* disabled, which upsets smp_call_function_single(), however
				1077	* we know the event must be on the current CPU, therefore we
				1078	* don't need to use it.
				1079	*/
				1080	switch (event->state) {
				1081	case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra	3dbebf1	2009-11-20 22:19:52 +0100	[diff] [blame]	1082	event->pmu->read(event);
				1083	/* fall-through */
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1084
				1085	case PERF_EVENT_STATE_INACTIVE:
				1086	update_event_times(event);
				1087	break;
				1088
				1089	default:
				1090	break;
				1091	}
				1092
				1093	/*
				1094	* In order to keep per-task stats reliable we need to flip the event
				1095	* values when we flip the contexts.
				1096	*/
				1097	value = atomic64_read(&next_event->count);
				1098	value = atomic64_xchg(&event->count, value);
				1099	atomic64_set(&next_event->count, value);
				1100
				1101	swap(event->total_time_enabled, next_event->total_time_enabled);
				1102	swap(event->total_time_running, next_event->total_time_running);
				1103
				1104	/*
				1105	* Since we swizzled the values, update the user visible data too.
				1106	*/
				1107	perf_event_update_userpage(event);
				1108	perf_event_update_userpage(next_event);
				1109	}
				1110
				1111	#define list_next_entry(pos, member) \
				1112	list_entry(pos->member.next, typeof(*pos), member)
				1113
				1114	static void perf_event_sync_stat(struct perf_event_context *ctx,
				1115	struct perf_event_context *next_ctx)
				1116	{
				1117	struct perf_event event, next_event;
				1118
				1119	if (!ctx->nr_stat)
				1120	return;
				1121
Peter Zijlstra	02ffdbc	2009-11-20 22:19:50 +0100	[diff] [blame]	1122	update_context_time(ctx);
				1123
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1124	event = list_first_entry(&ctx->event_list,
				1125	struct perf_event, event_entry);
				1126
				1127	next_event = list_first_entry(&next_ctx->event_list,
				1128	struct perf_event, event_entry);
				1129
				1130	while (&event->event_entry != &ctx->event_list &&
				1131	&next_event->event_entry != &next_ctx->event_list) {
				1132
				1133	__perf_event_sync_stat(event, next_event);
				1134
				1135	event = list_next_entry(event, event_entry);
				1136	next_event = list_next_entry(next_event, event_entry);
				1137	}
				1138	}
				1139
				1140	/*
				1141	* Called from scheduler to remove the events of the current task,
				1142	* with interrupts disabled.
				1143	*
				1144	* We stop each event and update the event value in event->count.
				1145	*
				1146	* This does not protect us against NMI, but disable()
				1147	* sets the disabled bit in the control field of event _before_
				1148	* accessing the event control register. If a NMI hits, then it will
				1149	* not restart the event.
				1150	*/
				1151	void perf_event_task_sched_out(struct task_struct *task,
				1152	struct task_struct *next, int cpu)
				1153	{
				1154	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
				1155	struct perf_event_context *ctx = task->perf_event_ctxp;
				1156	struct perf_event_context *next_ctx;
				1157	struct perf_event_context *parent;
				1158	struct pt_regs *regs;
				1159	int do_switch = 1;
				1160
				1161	regs = task_pt_regs(task);
				1162	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
				1163
				1164	if (likely(!ctx \|\| !cpuctx->task_ctx))
				1165	return;
				1166
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1167	rcu_read_lock();
				1168	parent = rcu_dereference(ctx->parent_ctx);
				1169	next_ctx = next->perf_event_ctxp;
				1170	if (parent && next_ctx &&
				1171	rcu_dereference(next_ctx->parent_ctx) == parent) {
				1172	/*
				1173	* Looks like the two contexts are clones, so we might be
				1174	* able to optimize the context switch. We lock both
				1175	* contexts and check that they are clones under the
				1176	* lock (including re-checking that neither has been
				1177	* uncloned in the meantime). It doesn't matter which
				1178	* order we take the locks because no other cpu could
				1179	* be trying to lock both of these tasks.
				1180	*/
				1181	spin_lock(&ctx->lock);
				1182	spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
				1183	if (context_equiv(ctx, next_ctx)) {
				1184	/*
				1185	* XXX do we need a memory barrier of sorts
				1186	* wrt to rcu_dereference() of perf_event_ctxp
				1187	*/
				1188	task->perf_event_ctxp = next_ctx;
				1189	next->perf_event_ctxp = ctx;
				1190	ctx->task = next;
				1191	next_ctx->task = task;
				1192	do_switch = 0;
				1193
				1194	perf_event_sync_stat(ctx, next_ctx);
				1195	}
				1196	spin_unlock(&next_ctx->lock);
				1197	spin_unlock(&ctx->lock);
				1198	}
				1199	rcu_read_unlock();
				1200
				1201	if (do_switch) {
				1202	__perf_event_sched_out(ctx, cpuctx);
				1203	cpuctx->task_ctx = NULL;
				1204	}
				1205	}
				1206
				1207	/*
				1208	* Called with IRQs disabled
				1209	*/
				1210	static void __perf_event_task_sched_out(struct perf_event_context *ctx)
				1211	{
				1212	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				1213
				1214	if (!cpuctx->task_ctx)
				1215	return;
				1216
				1217	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
				1218	return;
				1219
				1220	__perf_event_sched_out(ctx, cpuctx);
				1221	cpuctx->task_ctx = NULL;
				1222	}
				1223
				1224	/*
				1225	* Called with IRQs disabled
				1226	*/
				1227	static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
				1228	{
				1229	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
				1230	}
				1231
				1232	static void
				1233	__perf_event_sched_in(struct perf_event_context *ctx,
				1234	struct perf_cpu_context *cpuctx, int cpu)
				1235	{
				1236	struct perf_event *event;
				1237	int can_add_hw = 1;
				1238
				1239	spin_lock(&ctx->lock);
				1240	ctx->is_active = 1;
				1241	if (likely(!ctx->nr_events))
				1242	goto out;
				1243
				1244	ctx->timestamp = perf_clock();
				1245
				1246	perf_disable();
				1247
				1248	/*
				1249	* First go through the list and put on any pinned groups
				1250	* in order to give them the best chance of going on.
				1251	*/
				1252	list_for_each_entry(event, &ctx->group_list, group_entry) {
				1253	if (event->state <= PERF_EVENT_STATE_OFF \|\|
				1254	!event->attr.pinned)
				1255	continue;
				1256	if (event->cpu != -1 && event->cpu != cpu)
				1257	continue;
				1258
Xiao Guangrong	8c9ed8e	2009-09-25 13:51:17 +0800	[diff] [blame]	1259	if (group_can_go_on(event, cpuctx, 1))
				1260	group_sched_in(event, cpuctx, ctx, cpu);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1261
				1262	/*
				1263	* If this pinned group hasn't been scheduled,
				1264	* put it in error state.
				1265	*/
				1266	if (event->state == PERF_EVENT_STATE_INACTIVE) {
				1267	update_group_times(event);
				1268	event->state = PERF_EVENT_STATE_ERROR;
				1269	}
				1270	}
				1271
				1272	list_for_each_entry(event, &ctx->group_list, group_entry) {
				1273	/*
				1274	* Ignore events in OFF or ERROR state, and
				1275	* ignore pinned events since we did them already.
				1276	*/
				1277	if (event->state <= PERF_EVENT_STATE_OFF \|\|
				1278	event->attr.pinned)
				1279	continue;
				1280
				1281	/*
				1282	* Listen to the 'cpu' scheduling filter constraint
				1283	* of events:
				1284	*/
				1285	if (event->cpu != -1 && event->cpu != cpu)
				1286	continue;
				1287
Xiao Guangrong	8c9ed8e	2009-09-25 13:51:17 +0800	[diff] [blame]	1288	if (group_can_go_on(event, cpuctx, can_add_hw))
				1289	if (group_sched_in(event, cpuctx, ctx, cpu))
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1290	can_add_hw = 0;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1291	}
				1292	perf_enable();
				1293	out:
				1294	spin_unlock(&ctx->lock);
				1295	}
				1296
				1297	/*
				1298	* Called from scheduler to add the events of the current task
				1299	* with interrupts disabled.
				1300	*
				1301	* We restore the event value and then enable it.
				1302	*
				1303	* This does not protect us against NMI, but enable()
				1304	* sets the enabled bit in the control field of event _before_
				1305	* accessing the event control register. If a NMI hits, then it will
				1306	* keep the event running.
				1307	*/
				1308	void perf_event_task_sched_in(struct task_struct *task, int cpu)
				1309	{
				1310	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
				1311	struct perf_event_context *ctx = task->perf_event_ctxp;
				1312
				1313	if (likely(!ctx))
				1314	return;
				1315	if (cpuctx->task_ctx == ctx)
				1316	return;
				1317	__perf_event_sched_in(ctx, cpuctx, cpu);
				1318	cpuctx->task_ctx = ctx;
				1319	}
				1320
				1321	static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
				1322	{
				1323	struct perf_event_context *ctx = &cpuctx->ctx;
				1324
				1325	__perf_event_sched_in(ctx, cpuctx, cpu);
				1326	}
				1327
				1328	#define MAX_INTERRUPTS (~0ULL)
				1329
				1330	static void perf_log_throttle(struct perf_event *event, int enable);
				1331
				1332	static void perf_adjust_period(struct perf_event *event, u64 events)
				1333	{
				1334	struct hw_perf_event *hwc = &event->hw;
				1335	u64 period, sample_period;
				1336	s64 delta;
				1337
				1338	events *= hwc->sample_period;
				1339	period = div64_u64(events, event->attr.sample_freq);
				1340
				1341	delta = (s64)(period - hwc->sample_period);
				1342	delta = (delta + 7) / 8; /* low pass filter */
				1343
				1344	sample_period = hwc->sample_period + delta;
				1345
				1346	if (!sample_period)
				1347	sample_period = 1;
				1348
				1349	hwc->sample_period = sample_period;
				1350	}
				1351
				1352	static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
				1353	{
				1354	struct perf_event *event;
				1355	struct hw_perf_event *hwc;
				1356	u64 interrupts, freq;
				1357
				1358	spin_lock(&ctx->lock);
Paul Mackerras	03541f8	2009-10-14 16:58:03 +1100	[diff] [blame]	1359	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1360	if (event->state != PERF_EVENT_STATE_ACTIVE)
				1361	continue;
				1362
				1363	hwc = &event->hw;
				1364
				1365	interrupts = hwc->interrupts;
				1366	hwc->interrupts = 0;
				1367
				1368	/*
				1369	* unthrottle events on the tick
				1370	*/
				1371	if (interrupts == MAX_INTERRUPTS) {
				1372	perf_log_throttle(event, 1);
				1373	event->pmu->unthrottle(event);
				1374	interrupts = 2*sysctl_perf_event_sample_rate/HZ;
				1375	}
				1376
				1377	if (!event->attr.freq \|\| !event->attr.sample_freq)
				1378	continue;
				1379
				1380	/*
				1381	* if the specified freq < HZ then we need to skip ticks
				1382	*/
				1383	if (event->attr.sample_freq < HZ) {
				1384	freq = event->attr.sample_freq;
				1385
				1386	hwc->freq_count += freq;
				1387	hwc->freq_interrupts += interrupts;
				1388
				1389	if (hwc->freq_count < HZ)
				1390	continue;
				1391
				1392	interrupts = hwc->freq_interrupts;
				1393	hwc->freq_interrupts = 0;
				1394	hwc->freq_count -= HZ;
				1395	} else
				1396	freq = HZ;
				1397
				1398	perf_adjust_period(event, freq * interrupts);
				1399
				1400	/*
				1401	* In order to avoid being stalled by an (accidental) huge
				1402	* sample period, force reset the sample period if we didn't
				1403	* get any events in this freq period.
				1404	*/
				1405	if (!interrupts) {
				1406	perf_disable();
				1407	event->pmu->disable(event);
				1408	atomic64_set(&hwc->period_left, 0);
				1409	event->pmu->enable(event);
				1410	perf_enable();
				1411	}
				1412	}
				1413	spin_unlock(&ctx->lock);
				1414	}
				1415
				1416	/*
				1417	* Round-robin a context's events:
				1418	*/
				1419	static void rotate_ctx(struct perf_event_context *ctx)
				1420	{
				1421	struct perf_event *event;
				1422
				1423	if (!ctx->nr_events)
				1424	return;
				1425
				1426	spin_lock(&ctx->lock);
				1427	/*
				1428	* Rotate the first entry last (works just fine for group events too):
				1429	*/
				1430	perf_disable();
				1431	list_for_each_entry(event, &ctx->group_list, group_entry) {
				1432	list_move_tail(&event->group_entry, &ctx->group_list);
				1433	break;
				1434	}
				1435	perf_enable();
				1436
				1437	spin_unlock(&ctx->lock);
				1438	}
				1439
				1440	void perf_event_task_tick(struct task_struct *curr, int cpu)
				1441	{
				1442	struct perf_cpu_context *cpuctx;
				1443	struct perf_event_context *ctx;
				1444
				1445	if (!atomic_read(&nr_events))
				1446	return;
				1447
				1448	cpuctx = &per_cpu(perf_cpu_context, cpu);
				1449	ctx = curr->perf_event_ctxp;
				1450
				1451	perf_ctx_adjust_freq(&cpuctx->ctx);
				1452	if (ctx)
				1453	perf_ctx_adjust_freq(ctx);
				1454
				1455	perf_event_cpu_sched_out(cpuctx);
				1456	if (ctx)
				1457	__perf_event_task_sched_out(ctx);
				1458
				1459	rotate_ctx(&cpuctx->ctx);
				1460	if (ctx)
				1461	rotate_ctx(ctx);
				1462
				1463	perf_event_cpu_sched_in(cpuctx, cpu);
				1464	if (ctx)
				1465	perf_event_task_sched_in(curr, cpu);
				1466	}
				1467
				1468	/*
				1469	* Enable all of a task's events that have been marked enable-on-exec.
				1470	* This expects task == current.
				1471	*/
				1472	static void perf_event_enable_on_exec(struct task_struct *task)
				1473	{
				1474	struct perf_event_context *ctx;
				1475	struct perf_event *event;
				1476	unsigned long flags;
				1477	int enabled = 0;
				1478
				1479	local_irq_save(flags);
				1480	ctx = task->perf_event_ctxp;
				1481	if (!ctx \|\| !ctx->nr_events)
				1482	goto out;
				1483
				1484	__perf_event_task_sched_out(ctx);
				1485
				1486	spin_lock(&ctx->lock);
				1487
				1488	list_for_each_entry(event, &ctx->group_list, group_entry) {
				1489	if (!event->attr.enable_on_exec)
				1490	continue;
				1491	event->attr.enable_on_exec = 0;
				1492	if (event->state >= PERF_EVENT_STATE_INACTIVE)
				1493	continue;
				1494	__perf_event_mark_enabled(event, ctx);
				1495	enabled = 1;
				1496	}
				1497
				1498	/*
				1499	* Unclone this context if we enabled any event.
				1500	*/
				1501	if (enabled)
				1502	unclone_ctx(ctx);
				1503
				1504	spin_unlock(&ctx->lock);
				1505
				1506	perf_event_task_sched_in(task, smp_processor_id());
				1507	out:
				1508	local_irq_restore(flags);
				1509	}
				1510
				1511	/*
				1512	* Cross CPU call to read the hardware event
				1513	*/
				1514	static void __perf_event_read(void *info)
				1515	{
				1516	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				1517	struct perf_event *event = info;
				1518	struct perf_event_context *ctx = event->ctx;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1519
				1520	/*
				1521	* If this is a task context, we need to check whether it is
				1522	* the current task context of this cpu. If not it has been
				1523	* scheduled out before the smp call arrived. In that case
				1524	* event->count would have been updated to a recent sample
				1525	* when the event was scheduled out.
				1526	*/
				1527	if (ctx->task && cpuctx->task_ctx != ctx)
				1528	return;
				1529
Peter Zijlstra	2b8988c	2009-11-20 22:19:54 +0100	[diff] [blame]	1530	spin_lock(&ctx->lock);
Peter Zijlstra	58e5ad1	2009-11-20 22:19:53 +0100	[diff] [blame]	1531	update_context_time(ctx);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1532	update_event_times(event);
Peter Zijlstra	2b8988c	2009-11-20 22:19:54 +0100	[diff] [blame]	1533	spin_unlock(&ctx->lock);
				1534
Peter Zijlstra	58e5ad1	2009-11-20 22:19:53 +0100	[diff] [blame]	1535	event->pmu->read(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1536	}
				1537
				1538	static u64 perf_event_read(struct perf_event *event)
				1539	{
				1540	/*
				1541	* If event is enabled and currently active on a CPU, update the
				1542	* value in the event structure:
				1543	*/
				1544	if (event->state == PERF_EVENT_STATE_ACTIVE) {
				1545	smp_call_function_single(event->oncpu,
				1546	__perf_event_read, event, 1);
				1547	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra	2b8988c	2009-11-20 22:19:54 +0100	[diff] [blame]	1548	struct perf_event_context *ctx = event->ctx;
				1549	unsigned long flags;
				1550
				1551	spin_lock_irqsave(&ctx->lock, flags);
				1552	update_context_time(ctx);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1553	update_event_times(event);
Peter Zijlstra	2b8988c	2009-11-20 22:19:54 +0100	[diff] [blame]	1554	spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1555	}
				1556
				1557	return atomic64_read(&event->count);
				1558	}
				1559
				1560	/*
				1561	* Initialize the perf_event context in a task_struct:
				1562	*/
				1563	static void
				1564	__perf_event_init_context(struct perf_event_context *ctx,
				1565	struct task_struct *task)
				1566	{
				1567	memset(ctx, 0, sizeof(*ctx));
				1568	spin_lock_init(&ctx->lock);
				1569	mutex_init(&ctx->mutex);
				1570	INIT_LIST_HEAD(&ctx->group_list);
				1571	INIT_LIST_HEAD(&ctx->event_list);
				1572	atomic_set(&ctx->refcount, 1);
				1573	ctx->task = task;
				1574	}
				1575
				1576	static struct perf_event_context *find_get_context(pid_t pid, int cpu)
				1577	{
				1578	struct perf_event_context *ctx;
				1579	struct perf_cpu_context *cpuctx;
				1580	struct task_struct *task;
				1581	unsigned long flags;
				1582	int err;
				1583
				1584	/*
				1585	* If cpu is not a wildcard then this is a percpu event:
				1586	*/
				1587	if (cpu != -1) {
				1588	/* Must be root to operate on a CPU event: */
				1589	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
				1590	return ERR_PTR(-EACCES);
				1591
				1592	if (cpu < 0 \|\| cpu > num_possible_cpus())
				1593	return ERR_PTR(-EINVAL);
				1594
				1595	/*
				1596	* We could be clever and allow to attach a event to an
				1597	* offline CPU and activate it when the CPU comes up, but
				1598	* that's for later.
				1599	*/
				1600	if (!cpu_isset(cpu, cpu_online_map))
				1601	return ERR_PTR(-ENODEV);
				1602
				1603	cpuctx = &per_cpu(perf_cpu_context, cpu);
				1604	ctx = &cpuctx->ctx;
				1605	get_ctx(ctx);
				1606
				1607	return ctx;
				1608	}
				1609
				1610	rcu_read_lock();
				1611	if (!pid)
				1612	task = current;
				1613	else
				1614	task = find_task_by_vpid(pid);
				1615	if (task)
				1616	get_task_struct(task);
				1617	rcu_read_unlock();
				1618
				1619	if (!task)
				1620	return ERR_PTR(-ESRCH);
				1621
				1622	/*
				1623	* Can't attach events to a dying task.
				1624	*/
				1625	err = -ESRCH;
				1626	if (task->flags & PF_EXITING)
				1627	goto errout;
				1628
				1629	/* Reuse ptrace permission checks for now. */
				1630	err = -EACCES;
				1631	if (!ptrace_may_access(task, PTRACE_MODE_READ))
				1632	goto errout;
				1633
				1634	retry:
				1635	ctx = perf_lock_task_context(task, &flags);
				1636	if (ctx) {
				1637	unclone_ctx(ctx);
				1638	spin_unlock_irqrestore(&ctx->lock, flags);
				1639	}
				1640
				1641	if (!ctx) {
				1642	ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
				1643	err = -ENOMEM;
				1644	if (!ctx)
				1645	goto errout;
				1646	__perf_event_init_context(ctx, task);
				1647	get_ctx(ctx);
				1648	if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
				1649	/*
				1650	* We raced with some other task; use
				1651	* the context they set.
				1652	*/
				1653	kfree(ctx);
				1654	goto retry;
				1655	}
				1656	get_task_struct(task);
				1657	}
				1658
				1659	put_task_struct(task);
				1660	return ctx;
				1661
				1662	errout:
				1663	put_task_struct(task);
				1664	return ERR_PTR(err);
				1665	}
				1666
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	1667	static void perf_event_free_filter(struct perf_event *event);
				1668
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1669	static void free_event_rcu(struct rcu_head *head)
				1670	{
				1671	struct perf_event *event;
				1672
				1673	event = container_of(head, struct perf_event, rcu_head);
				1674	if (event->ns)
				1675	put_pid_ns(event->ns);
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	1676	perf_event_free_filter(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1677	kfree(event);
				1678	}
				1679
				1680	static void perf_pending_sync(struct perf_event *event);
				1681
				1682	static void free_event(struct perf_event *event)
				1683	{
				1684	perf_pending_sync(event);
				1685
				1686	if (!event->parent) {
				1687	atomic_dec(&nr_events);
				1688	if (event->attr.mmap)
				1689	atomic_dec(&nr_mmap_events);
				1690	if (event->attr.comm)
				1691	atomic_dec(&nr_comm_events);
				1692	if (event->attr.task)
				1693	atomic_dec(&nr_task_events);
				1694	}
				1695
				1696	if (event->output) {
				1697	fput(event->output->filp);
				1698	event->output = NULL;
				1699	}
				1700
				1701	if (event->destroy)
				1702	event->destroy(event);
				1703
				1704	put_ctx(event->ctx);
				1705	call_rcu(&event->rcu_head, free_event_rcu);
				1706	}
				1707
Arjan van de Ven	fb0459d	2009-09-25 12:25:56 +0200	[diff] [blame]	1708	int perf_event_release_kernel(struct perf_event *event)
				1709	{
				1710	struct perf_event_context *ctx = event->ctx;
				1711
				1712	WARN_ON_ONCE(ctx->parent_ctx);
				1713	mutex_lock(&ctx->mutex);
				1714	perf_event_remove_from_context(event);
				1715	mutex_unlock(&ctx->mutex);
				1716
				1717	mutex_lock(&event->owner->perf_event_mutex);
				1718	list_del_init(&event->owner_entry);
				1719	mutex_unlock(&event->owner->perf_event_mutex);
				1720	put_task_struct(event->owner);
				1721
				1722	free_event(event);
				1723
				1724	return 0;
				1725	}
				1726	EXPORT_SYMBOL_GPL(perf_event_release_kernel);
				1727
Peter Zijlstra	a66a305	2009-11-23 11:37:23 +0100	[diff] [blame]	1728	/*
				1729	* Called when the last reference to the file is gone.
				1730	*/
				1731	static int perf_release(struct inode inode, struct file file)
				1732	{
				1733	struct perf_event *event = file->private_data;
				1734
				1735	file->private_data = NULL;
				1736
				1737	return perf_event_release_kernel(event);
				1738	}
				1739
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1740	static int perf_event_read_size(struct perf_event *event)
				1741	{
				1742	int entry = sizeof(u64); /* value */
				1743	int size = 0;
				1744	int nr = 1;
				1745
				1746	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				1747	size += sizeof(u64);
				1748
				1749	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				1750	size += sizeof(u64);
				1751
				1752	if (event->attr.read_format & PERF_FORMAT_ID)
				1753	entry += sizeof(u64);
				1754
				1755	if (event->attr.read_format & PERF_FORMAT_GROUP) {
				1756	nr += event->group_leader->nr_siblings;
				1757	size += sizeof(u64);
				1758	}
				1759
				1760	size += entry * nr;
				1761
				1762	return size;
				1763	}
				1764
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1765	u64 perf_event_read_value(struct perf_event event, u64 enabled, u64 *running)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1766	{
				1767	struct perf_event *child;
				1768	u64 total = 0;
				1769
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1770	*enabled = 0;
				1771	*running = 0;
				1772
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1773	mutex_lock(&event->child_mutex);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1774	total += perf_event_read(event);
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1775	*enabled += event->total_time_enabled +
				1776	atomic64_read(&event->child_total_time_enabled);
				1777	*running += event->total_time_running +
				1778	atomic64_read(&event->child_total_time_running);
				1779
				1780	list_for_each_entry(child, &event->child_list, child_list) {
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1781	total += perf_event_read(child);
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1782	*enabled += child->total_time_enabled;
				1783	*running += child->total_time_running;
				1784	}
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1785	mutex_unlock(&event->child_mutex);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1786
				1787	return total;
				1788	}
Arjan van de Ven	fb0459d	2009-09-25 12:25:56 +0200	[diff] [blame]	1789	EXPORT_SYMBOL_GPL(perf_event_read_value);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1790
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1791	static int perf_event_read_group(struct perf_event *event,
				1792	u64 read_format, char __user *buf)
				1793	{
				1794	struct perf_event leader = event->group_leader, sub;
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1795	int n = 0, size = 0, ret = -EFAULT;
				1796	struct perf_event_context *ctx = leader->ctx;
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1797	u64 values[5];
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1798	u64 count, enabled, running;
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1799
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1800	mutex_lock(&ctx->mutex);
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1801	count = perf_event_read_value(leader, &enabled, &running);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1802
				1803	values[n++] = 1 + leader->nr_siblings;
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1804	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				1805	values[n++] = enabled;
				1806	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				1807	values[n++] = running;
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1808	values[n++] = count;
				1809	if (read_format & PERF_FORMAT_ID)
				1810	values[n++] = primary_event_id(leader);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1811
				1812	size = n * sizeof(u64);
				1813
				1814	if (copy_to_user(buf, values, size))
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1815	goto unlock;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1816
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1817	ret = size;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1818
				1819	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1820	n = 0;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1821
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1822	values[n++] = perf_event_read_value(sub, &enabled, &running);
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1823	if (read_format & PERF_FORMAT_ID)
				1824	values[n++] = primary_event_id(sub);
				1825
				1826	size = n * sizeof(u64);
				1827
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1828	if (copy_to_user(buf + size, values, size)) {
				1829	ret = -EFAULT;
				1830	goto unlock;
				1831	}
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1832
				1833	ret += size;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1834	}
Peter Zijlstra	6f10581	2009-11-20 22:19:56 +0100	[diff] [blame]	1835	unlock:
				1836	mutex_unlock(&ctx->mutex);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1837
Peter Zijlstra	abf4868	2009-11-20 22:19:49 +0100	[diff] [blame]	1838	return ret;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1839	}
				1840
				1841	static int perf_event_read_one(struct perf_event *event,
				1842	u64 read_format, char __user *buf)
				1843	{
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1844	u64 enabled, running;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1845	u64 values[4];
				1846	int n = 0;
				1847
Peter Zijlstra	59ed446	2009-11-20 22:19:55 +0100	[diff] [blame]	1848	values[n++] = perf_event_read_value(event, &enabled, &running);
				1849	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				1850	values[n++] = enabled;
				1851	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				1852	values[n++] = running;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1853	if (read_format & PERF_FORMAT_ID)
				1854	values[n++] = primary_event_id(event);
				1855
				1856	if (copy_to_user(buf, values, n * sizeof(u64)))
				1857	return -EFAULT;
				1858
				1859	return n * sizeof(u64);
				1860	}
				1861
				1862	/*
				1863	* Read the performance event - simple non blocking version for now
				1864	*/
				1865	static ssize_t
				1866	perf_read_hw(struct perf_event event, char __user buf, size_t count)
				1867	{
				1868	u64 read_format = event->attr.read_format;
				1869	int ret;
				1870
				1871	/*
				1872	* Return end-of-file for a read on a event that is in
				1873	* error state (i.e. because it was pinned but it couldn't be
				1874	* scheduled on to the CPU at some point).
				1875	*/
				1876	if (event->state == PERF_EVENT_STATE_ERROR)
				1877	return 0;
				1878
				1879	if (count < perf_event_read_size(event))
				1880	return -ENOSPC;
				1881
				1882	WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1883	if (read_format & PERF_FORMAT_GROUP)
				1884	ret = perf_event_read_group(event, read_format, buf);
				1885	else
				1886	ret = perf_event_read_one(event, read_format, buf);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1887
				1888	return ret;
				1889	}
				1890
				1891	static ssize_t
				1892	perf_read(struct file file, char __user buf, size_t count, loff_t *ppos)
				1893	{
				1894	struct perf_event *event = file->private_data;
				1895
				1896	return perf_read_hw(event, buf, count);
				1897	}
				1898
				1899	static unsigned int perf_poll(struct file file, poll_table wait)
				1900	{
				1901	struct perf_event *event = file->private_data;
				1902	struct perf_mmap_data *data;
				1903	unsigned int events = POLL_HUP;
				1904
				1905	rcu_read_lock();
				1906	data = rcu_dereference(event->data);
				1907	if (data)
				1908	events = atomic_xchg(&data->poll, 0);
				1909	rcu_read_unlock();
				1910
				1911	poll_wait(file, &event->waitq, wait);
				1912
				1913	return events;
				1914	}
				1915
				1916	static void perf_event_reset(struct perf_event *event)
				1917	{
				1918	(void)perf_event_read(event);
				1919	atomic64_set(&event->count, 0);
				1920	perf_event_update_userpage(event);
				1921	}
				1922
				1923	/*
				1924	* Holding the top-level event's child_mutex means that any
				1925	* descendant process that has inherited this event will block
				1926	* in sync_child_event if it goes to exit, thus satisfying the
				1927	* task existence requirements of perf_event_enable/disable.
				1928	*/
				1929	static void perf_event_for_each_child(struct perf_event *event,
				1930	void (func)(struct perf_event ))
				1931	{
				1932	struct perf_event *child;
				1933
				1934	WARN_ON_ONCE(event->ctx->parent_ctx);
				1935	mutex_lock(&event->child_mutex);
				1936	func(event);
				1937	list_for_each_entry(child, &event->child_list, child_list)
				1938	func(child);
				1939	mutex_unlock(&event->child_mutex);
				1940	}
				1941
				1942	static void perf_event_for_each(struct perf_event *event,
				1943	void (func)(struct perf_event ))
				1944	{
				1945	struct perf_event_context *ctx = event->ctx;
				1946	struct perf_event *sibling;
				1947
				1948	WARN_ON_ONCE(ctx->parent_ctx);
				1949	mutex_lock(&ctx->mutex);
				1950	event = event->group_leader;
				1951
				1952	perf_event_for_each_child(event, func);
				1953	func(event);
				1954	list_for_each_entry(sibling, &event->sibling_list, group_entry)
				1955	perf_event_for_each_child(event, func);
				1956	mutex_unlock(&ctx->mutex);
				1957	}
				1958
				1959	static int perf_event_period(struct perf_event event, u64 __user arg)
				1960	{
				1961	struct perf_event_context *ctx = event->ctx;
				1962	unsigned long size;
				1963	int ret = 0;
				1964	u64 value;
				1965
				1966	if (!event->attr.sample_period)
				1967	return -EINVAL;
				1968
				1969	size = copy_from_user(&value, arg, sizeof(value));
				1970	if (size != sizeof(value))
				1971	return -EFAULT;
				1972
				1973	if (!value)
				1974	return -EINVAL;
				1975
				1976	spin_lock_irq(&ctx->lock);
				1977	if (event->attr.freq) {
				1978	if (value > sysctl_perf_event_sample_rate) {
				1979	ret = -EINVAL;
				1980	goto unlock;
				1981	}
				1982
				1983	event->attr.sample_freq = value;
				1984	} else {
				1985	event->attr.sample_period = value;
				1986	event->hw.sample_period = value;
				1987	}
				1988	unlock:
				1989	spin_unlock_irq(&ctx->lock);
				1990
				1991	return ret;
				1992	}
				1993
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	1994	static int perf_event_set_output(struct perf_event *event, int output_fd);
				1995	static int perf_event_set_filter(struct perf_event event, void __user arg);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	1996
				1997	static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
				1998	{
				1999	struct perf_event *event = file->private_data;
				2000	void (func)(struct perf_event );
				2001	u32 flags = arg;
				2002
				2003	switch (cmd) {
				2004	case PERF_EVENT_IOC_ENABLE:
				2005	func = perf_event_enable;
				2006	break;
				2007	case PERF_EVENT_IOC_DISABLE:
				2008	func = perf_event_disable;
				2009	break;
				2010	case PERF_EVENT_IOC_RESET:
				2011	func = perf_event_reset;
				2012	break;
				2013
				2014	case PERF_EVENT_IOC_REFRESH:
				2015	return perf_event_refresh(event, arg);
				2016
				2017	case PERF_EVENT_IOC_PERIOD:
				2018	return perf_event_period(event, (u64 __user *)arg);
				2019
				2020	case PERF_EVENT_IOC_SET_OUTPUT:
				2021	return perf_event_set_output(event, arg);
				2022
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	2023	case PERF_EVENT_IOC_SET_FILTER:
				2024	return perf_event_set_filter(event, (void __user *)arg);
				2025
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2026	default:
				2027	return -ENOTTY;
				2028	}
				2029
				2030	if (flags & PERF_IOC_FLAG_GROUP)
				2031	perf_event_for_each(event, func);
				2032	else
				2033	perf_event_for_each_child(event, func);
				2034
				2035	return 0;
				2036	}
				2037
				2038	int perf_event_task_enable(void)
				2039	{
				2040	struct perf_event *event;
				2041
				2042	mutex_lock(&current->perf_event_mutex);
				2043	list_for_each_entry(event, &current->perf_event_list, owner_entry)
				2044	perf_event_for_each_child(event, perf_event_enable);
				2045	mutex_unlock(&current->perf_event_mutex);
				2046
				2047	return 0;
				2048	}
				2049
				2050	int perf_event_task_disable(void)
				2051	{
				2052	struct perf_event *event;
				2053
				2054	mutex_lock(&current->perf_event_mutex);
				2055	list_for_each_entry(event, &current->perf_event_list, owner_entry)
				2056	perf_event_for_each_child(event, perf_event_disable);
				2057	mutex_unlock(&current->perf_event_mutex);
				2058
				2059	return 0;
				2060	}
				2061
				2062	#ifndef PERF_EVENT_INDEX_OFFSET
				2063	# define PERF_EVENT_INDEX_OFFSET 0
				2064	#endif
				2065
				2066	static int perf_event_index(struct perf_event *event)
				2067	{
				2068	if (event->state != PERF_EVENT_STATE_ACTIVE)
				2069	return 0;
				2070
				2071	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
				2072	}
				2073
				2074	/*
				2075	* Callers need to ensure there can be no nesting of this function, otherwise
				2076	* the seqlock logic goes bad. We can not serialize this because the arch
				2077	* code calls this from NMI context.
				2078	*/
				2079	void perf_event_update_userpage(struct perf_event *event)
				2080	{
				2081	struct perf_event_mmap_page *userpg;
				2082	struct perf_mmap_data *data;
				2083
				2084	rcu_read_lock();
				2085	data = rcu_dereference(event->data);
				2086	if (!data)
				2087	goto unlock;
				2088
				2089	userpg = data->user_page;
				2090
				2091	/*
				2092	* Disable preemption so as to not let the corresponding user-space
				2093	* spin too long if we get preempted.
				2094	*/
				2095	preempt_disable();
				2096	++userpg->lock;
				2097	barrier();
				2098	userpg->index = perf_event_index(event);
				2099	userpg->offset = atomic64_read(&event->count);
				2100	if (event->state == PERF_EVENT_STATE_ACTIVE)
				2101	userpg->offset -= atomic64_read(&event->hw.prev_count);
				2102
				2103	userpg->time_enabled = event->total_time_enabled +
				2104	atomic64_read(&event->child_total_time_enabled);
				2105
				2106	userpg->time_running = event->total_time_running +
				2107	atomic64_read(&event->child_total_time_running);
				2108
				2109	barrier();
				2110	++userpg->lock;
				2111	preempt_enable();
				2112	unlock:
				2113	rcu_read_unlock();
				2114	}
				2115
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2116	static unsigned long perf_data_size(struct perf_mmap_data *data)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2117	{
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2118	return data->nr_pages << (PAGE_SHIFT + data->data_order);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2119	}
				2120
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2121	#ifndef CONFIG_PERF_USE_VMALLOC
				2122
				2123	/*
				2124	* Back perf_mmap() with regular GFP_KERNEL-0 pages.
				2125	*/
				2126
				2127	static struct page *
				2128	perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
				2129	{
				2130	if (pgoff > data->nr_pages)
				2131	return NULL;
				2132
				2133	if (pgoff == 0)
				2134	return virt_to_page(data->user_page);
				2135
				2136	return virt_to_page(data->data_pages[pgoff - 1]);
				2137	}
				2138
				2139	static struct perf_mmap_data *
				2140	perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2141	{
				2142	struct perf_mmap_data *data;
				2143	unsigned long size;
				2144	int i;
				2145
				2146	WARN_ON(atomic_read(&event->mmap_count));
				2147
				2148	size = sizeof(struct perf_mmap_data);
				2149	size += nr_pages * sizeof(void *);
				2150
				2151	data = kzalloc(size, GFP_KERNEL);
				2152	if (!data)
				2153	goto fail;
				2154
				2155	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
				2156	if (!data->user_page)
				2157	goto fail_user_page;
				2158
				2159	for (i = 0; i < nr_pages; i++) {
				2160	data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
				2161	if (!data->data_pages[i])
				2162	goto fail_data_pages;
				2163	}
				2164
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2165	data->data_order = 0;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2166	data->nr_pages = nr_pages;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2167
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2168	return data;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2169
				2170	fail_data_pages:
				2171	for (i--; i >= 0; i--)
				2172	free_page((unsigned long)data->data_pages[i]);
				2173
				2174	free_page((unsigned long)data->user_page);
				2175
				2176	fail_user_page:
				2177	kfree(data);
				2178
				2179	fail:
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2180	return NULL;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2181	}
				2182
				2183	static void perf_mmap_free_page(unsigned long addr)
				2184	{
				2185	struct page page = virt_to_page((void )addr);
				2186
				2187	page->mapping = NULL;
				2188	__free_page(page);
				2189	}
				2190
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2191	static void perf_mmap_data_free(struct perf_mmap_data *data)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2192	{
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2193	int i;
				2194
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2195	perf_mmap_free_page((unsigned long)data->user_page);
				2196	for (i = 0; i < data->nr_pages; i++)
				2197	perf_mmap_free_page((unsigned long)data->data_pages[i]);
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2198	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2199
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2200	#else
				2201
				2202	/*
				2203	* Back perf_mmap() with vmalloc memory.
				2204	*
				2205	* Required for architectures that have d-cache aliasing issues.
				2206	*/
				2207
				2208	static struct page *
				2209	perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
				2210	{
				2211	if (pgoff > (1UL << data->data_order))
				2212	return NULL;
				2213
				2214	return vmalloc_to_page((void )data->user_page + pgoff PAGE_SIZE);
				2215	}
				2216
				2217	static void perf_mmap_unmark_page(void *addr)
				2218	{
				2219	struct page *page = vmalloc_to_page(addr);
				2220
				2221	page->mapping = NULL;
				2222	}
				2223
				2224	static void perf_mmap_data_free_work(struct work_struct *work)
				2225	{
				2226	struct perf_mmap_data *data;
				2227	void *base;
				2228	int i, nr;
				2229
				2230	data = container_of(work, struct perf_mmap_data, work);
				2231	nr = 1 << data->data_order;
				2232
				2233	base = data->user_page;
				2234	for (i = 0; i < nr + 1; i++)
				2235	perf_mmap_unmark_page(base + (i * PAGE_SIZE));
				2236
				2237	vfree(base);
				2238	}
				2239
				2240	static void perf_mmap_data_free(struct perf_mmap_data *data)
				2241	{
				2242	schedule_work(&data->work);
				2243	}
				2244
				2245	static struct perf_mmap_data *
				2246	perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
				2247	{
				2248	struct perf_mmap_data *data;
				2249	unsigned long size;
				2250	void *all_buf;
				2251
				2252	WARN_ON(atomic_read(&event->mmap_count));
				2253
				2254	size = sizeof(struct perf_mmap_data);
				2255	size += sizeof(void *);
				2256
				2257	data = kzalloc(size, GFP_KERNEL);
				2258	if (!data)
				2259	goto fail;
				2260
				2261	INIT_WORK(&data->work, perf_mmap_data_free_work);
				2262
				2263	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
				2264	if (!all_buf)
				2265	goto fail_all_buf;
				2266
				2267	data->user_page = all_buf;
				2268	data->data_pages[0] = all_buf + PAGE_SIZE;
				2269	data->data_order = ilog2(nr_pages);
				2270	data->nr_pages = 1;
				2271
				2272	return data;
				2273
				2274	fail_all_buf:
				2275	kfree(data);
				2276
				2277	fail:
				2278	return NULL;
				2279	}
				2280
				2281	#endif
				2282
				2283	static int perf_mmap_fault(struct vm_area_struct vma, struct vm_fault vmf)
				2284	{
				2285	struct perf_event *event = vma->vm_file->private_data;
				2286	struct perf_mmap_data *data;
				2287	int ret = VM_FAULT_SIGBUS;
				2288
				2289	if (vmf->flags & FAULT_FLAG_MKWRITE) {
				2290	if (vmf->pgoff == 0)
				2291	ret = 0;
				2292	return ret;
				2293	}
				2294
				2295	rcu_read_lock();
				2296	data = rcu_dereference(event->data);
				2297	if (!data)
				2298	goto unlock;
				2299
				2300	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
				2301	goto unlock;
				2302
				2303	vmf->page = perf_mmap_to_page(data, vmf->pgoff);
				2304	if (!vmf->page)
				2305	goto unlock;
				2306
				2307	get_page(vmf->page);
				2308	vmf->page->mapping = vma->vm_file->f_mapping;
				2309	vmf->page->index = vmf->pgoff;
				2310
				2311	ret = 0;
				2312	unlock:
				2313	rcu_read_unlock();
				2314
				2315	return ret;
				2316	}
				2317
				2318	static void
				2319	perf_mmap_data_init(struct perf_event event, struct perf_mmap_data data)
				2320	{
				2321	long max_size = perf_data_size(data);
				2322
				2323	atomic_set(&data->lock, -1);
				2324
				2325	if (event->attr.watermark) {
				2326	data->watermark = min_t(long, max_size,
				2327	event->attr.wakeup_watermark);
				2328	}
				2329
				2330	if (!data->watermark)
Stephane Eranian	8904b18	2009-11-20 22:19:57 +0100	[diff] [blame]	2331	data->watermark = max_size / 2;
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2332
				2333
				2334	rcu_assign_pointer(event->data, data);
				2335	}
				2336
				2337	static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
				2338	{
				2339	struct perf_mmap_data *data;
				2340
				2341	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
				2342	perf_mmap_data_free(data);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2343	kfree(data);
				2344	}
				2345
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2346	static void perf_mmap_data_release(struct perf_event *event)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2347	{
				2348	struct perf_mmap_data *data = event->data;
				2349
				2350	WARN_ON(atomic_read(&event->mmap_count));
				2351
				2352	rcu_assign_pointer(event->data, NULL);
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2353	call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2354	}
				2355
				2356	static void perf_mmap_open(struct vm_area_struct *vma)
				2357	{
				2358	struct perf_event *event = vma->vm_file->private_data;
				2359
				2360	atomic_inc(&event->mmap_count);
				2361	}
				2362
				2363	static void perf_mmap_close(struct vm_area_struct *vma)
				2364	{
				2365	struct perf_event *event = vma->vm_file->private_data;
				2366
				2367	WARN_ON_ONCE(event->ctx->parent_ctx);
				2368	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2369	unsigned long size = perf_data_size(event->data);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2370	struct user_struct *user = current_user();
				2371
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2372	atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2373	vma->vm_mm->locked_vm -= event->data->nr_locked;
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2374	perf_mmap_data_release(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2375	mutex_unlock(&event->mmap_mutex);
				2376	}
				2377	}
				2378
Alexey Dobriyan	f0f37e2	2009-09-27 22:29:37 +0400	[diff] [blame]	2379	static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2380	.open = perf_mmap_open,
				2381	.close = perf_mmap_close,
				2382	.fault = perf_mmap_fault,
				2383	.page_mkwrite = perf_mmap_fault,
				2384	};
				2385
				2386	static int perf_mmap(struct file file, struct vm_area_struct vma)
				2387	{
				2388	struct perf_event *event = file->private_data;
				2389	unsigned long user_locked, user_lock_limit;
				2390	struct user_struct *user = current_user();
				2391	unsigned long locked, lock_limit;
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2392	struct perf_mmap_data *data;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2393	unsigned long vma_size;
				2394	unsigned long nr_pages;
				2395	long user_extra, extra;
				2396	int ret = 0;
				2397
				2398	if (!(vma->vm_flags & VM_SHARED))
				2399	return -EINVAL;
				2400
				2401	vma_size = vma->vm_end - vma->vm_start;
				2402	nr_pages = (vma_size / PAGE_SIZE) - 1;
				2403
				2404	/*
				2405	* If we have data pages ensure they're a power-of-two number, so we
				2406	* can do bitmasks instead of modulo.
				2407	*/
				2408	if (nr_pages != 0 && !is_power_of_2(nr_pages))
				2409	return -EINVAL;
				2410
				2411	if (vma_size != PAGE_SIZE * (1 + nr_pages))
				2412	return -EINVAL;
				2413
				2414	if (vma->vm_pgoff != 0)
				2415	return -EINVAL;
				2416
				2417	WARN_ON_ONCE(event->ctx->parent_ctx);
				2418	mutex_lock(&event->mmap_mutex);
				2419	if (event->output) {
				2420	ret = -EINVAL;
				2421	goto unlock;
				2422	}
				2423
				2424	if (atomic_inc_not_zero(&event->mmap_count)) {
				2425	if (nr_pages != event->data->nr_pages)
				2426	ret = -EINVAL;
				2427	goto unlock;
				2428	}
				2429
				2430	user_extra = nr_pages + 1;
				2431	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
				2432
				2433	/*
				2434	* Increase the limit linearly with more CPUs:
				2435	*/
				2436	user_lock_limit *= num_online_cpus();
				2437
				2438	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
				2439
				2440	extra = 0;
				2441	if (user_locked > user_lock_limit)
				2442	extra = user_locked - user_lock_limit;
				2443
				2444	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
				2445	lock_limit >>= PAGE_SHIFT;
				2446	locked = vma->vm_mm->locked_vm + extra;
				2447
				2448	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
				2449	!capable(CAP_IPC_LOCK)) {
				2450	ret = -EPERM;
				2451	goto unlock;
				2452	}
				2453
				2454	WARN_ON(event->data);
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2455
				2456	data = perf_mmap_data_alloc(event, nr_pages);
				2457	ret = -ENOMEM;
				2458	if (!data)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2459	goto unlock;
				2460
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2461	ret = 0;
				2462	perf_mmap_data_init(event, data);
				2463
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2464	atomic_set(&event->mmap_count, 1);
				2465	atomic_long_add(user_extra, &user->locked_vm);
				2466	vma->vm_mm->locked_vm += extra;
				2467	event->data->nr_locked = extra;
				2468	if (vma->vm_flags & VM_WRITE)
				2469	event->data->writable = 1;
				2470
				2471	unlock:
				2472	mutex_unlock(&event->mmap_mutex);
				2473
				2474	vma->vm_flags \|= VM_RESERVED;
				2475	vma->vm_ops = &perf_mmap_vmops;
				2476
				2477	return ret;
				2478	}
				2479
				2480	static int perf_fasync(int fd, struct file *filp, int on)
				2481	{
				2482	struct inode *inode = filp->f_path.dentry->d_inode;
				2483	struct perf_event *event = filp->private_data;
				2484	int retval;
				2485
				2486	mutex_lock(&inode->i_mutex);
				2487	retval = fasync_helper(fd, filp, on, &event->fasync);
				2488	mutex_unlock(&inode->i_mutex);
				2489
				2490	if (retval < 0)
				2491	return retval;
				2492
				2493	return 0;
				2494	}
				2495
				2496	static const struct file_operations perf_fops = {
				2497	.release = perf_release,
				2498	.read = perf_read,
				2499	.poll = perf_poll,
				2500	.unlocked_ioctl = perf_ioctl,
				2501	.compat_ioctl = perf_ioctl,
				2502	.mmap = perf_mmap,
				2503	.fasync = perf_fasync,
				2504	};
				2505
				2506	/*
				2507	* Perf event wakeup
				2508	*
				2509	* If there's data, ensure we set the poll() state and publish everything
				2510	* to user-space before waking everybody up.
				2511	*/
				2512
				2513	void perf_event_wakeup(struct perf_event *event)
				2514	{
				2515	wake_up_all(&event->waitq);
				2516
				2517	if (event->pending_kill) {
				2518	kill_fasync(&event->fasync, SIGIO, event->pending_kill);
				2519	event->pending_kill = 0;
				2520	}
				2521	}
				2522
				2523	/*
				2524	* Pending wakeups
				2525	*
				2526	* Handle the case where we need to wakeup up from NMI (or rq->lock) context.
				2527	*
				2528	* The NMI bit means we cannot possibly take locks. Therefore, maintain a
				2529	* single linked list and use cmpxchg() to add entries lockless.
				2530	*/
				2531
				2532	static void perf_pending_event(struct perf_pending_entry *entry)
				2533	{
				2534	struct perf_event *event = container_of(entry,
				2535	struct perf_event, pending);
				2536
				2537	if (event->pending_disable) {
				2538	event->pending_disable = 0;
				2539	__perf_event_disable(event);
				2540	}
				2541
				2542	if (event->pending_wakeup) {
				2543	event->pending_wakeup = 0;
				2544	perf_event_wakeup(event);
				2545	}
				2546	}
				2547
				2548	#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
				2549
				2550	static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
				2551	PENDING_TAIL,
				2552	};
				2553
				2554	static void perf_pending_queue(struct perf_pending_entry *entry,
				2555	void (func)(struct perf_pending_entry ))
				2556	{
				2557	struct perf_pending_entry **head;
				2558
				2559	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
				2560	return;
				2561
				2562	entry->func = func;
				2563
				2564	head = &get_cpu_var(perf_pending_head);
				2565
				2566	do {
				2567	entry->next = *head;
				2568	} while (cmpxchg(head, entry->next, entry) != entry->next);
				2569
				2570	set_perf_event_pending();
				2571
				2572	put_cpu_var(perf_pending_head);
				2573	}
				2574
				2575	static int __perf_pending_run(void)
				2576	{
				2577	struct perf_pending_entry *list;
				2578	int nr = 0;
				2579
				2580	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
				2581	while (list != PENDING_TAIL) {
				2582	void (func)(struct perf_pending_entry );
				2583	struct perf_pending_entry *entry = list;
				2584
				2585	list = list->next;
				2586
				2587	func = entry->func;
				2588	entry->next = NULL;
				2589	/*
				2590	* Ensure we observe the unqueue before we issue the wakeup,
				2591	* so that we won't be waiting forever.
				2592	* -- see perf_not_pending().
				2593	*/
				2594	smp_wmb();
				2595
				2596	func(entry);
				2597	nr++;
				2598	}
				2599
				2600	return nr;
				2601	}
				2602
				2603	static inline int perf_not_pending(struct perf_event *event)
				2604	{
				2605	/*
				2606	* If we flush on whatever cpu we run, there is a chance we don't
				2607	* need to wait.
				2608	*/
				2609	get_cpu();
				2610	__perf_pending_run();
				2611	put_cpu();
				2612
				2613	/*
				2614	* Ensure we see the proper queue state before going to sleep
				2615	* so that we do not miss the wakeup. -- see perf_pending_handle()
				2616	*/
				2617	smp_rmb();
				2618	return event->pending.next == NULL;
				2619	}
				2620
				2621	static void perf_pending_sync(struct perf_event *event)
				2622	{
				2623	wait_event(event->waitq, perf_not_pending(event));
				2624	}
				2625
				2626	void perf_event_do_pending(void)
				2627	{
				2628	__perf_pending_run();
				2629	}
				2630
				2631	/*
				2632	* Callchain support -- arch specific
				2633	*/
				2634
				2635	__weak struct perf_callchain_entry perf_callchain(struct pt_regs regs)
				2636	{
				2637	return NULL;
				2638	}
				2639
				2640	/*
				2641	* Output
				2642	*/
				2643	static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
				2644	unsigned long offset, unsigned long head)
				2645	{
				2646	unsigned long mask;
				2647
				2648	if (!data->writable)
				2649	return true;
				2650
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2651	mask = perf_data_size(data) - 1;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2652
				2653	offset = (offset - tail) & mask;
				2654	head = (head - tail) & mask;
				2655
				2656	if ((int)(head - offset) < 0)
				2657	return false;
				2658
				2659	return true;
				2660	}
				2661
				2662	static void perf_output_wakeup(struct perf_output_handle *handle)
				2663	{
				2664	atomic_set(&handle->data->poll, POLL_IN);
				2665
				2666	if (handle->nmi) {
				2667	handle->event->pending_wakeup = 1;
				2668	perf_pending_queue(&handle->event->pending,
				2669	perf_pending_event);
				2670	} else
				2671	perf_event_wakeup(handle->event);
				2672	}
				2673
				2674	/*
				2675	* Curious locking construct.
				2676	*
				2677	* We need to ensure a later event_id doesn't publish a head when a former
				2678	* event_id isn't done writing. However since we need to deal with NMIs we
				2679	* cannot fully serialize things.
				2680	*
				2681	* What we do is serialize between CPUs so we only have to deal with NMI
				2682	* nesting on a single CPU.
				2683	*
				2684	* We only publish the head (and generate a wakeup) when the outer-most
				2685	* event_id completes.
				2686	*/
				2687	static void perf_output_lock(struct perf_output_handle *handle)
				2688	{
				2689	struct perf_mmap_data *data = handle->data;
Peter Zijlstra	559fdc3	2009-11-16 12:45:14 +0100	[diff] [blame]	2690	int cur, cpu = get_cpu();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2691
				2692	handle->locked = 0;
				2693
Peter Zijlstra	559fdc3	2009-11-16 12:45:14 +0100	[diff] [blame]	2694	for (;;) {
				2695	cur = atomic_cmpxchg(&data->lock, -1, cpu);
				2696	if (cur == -1) {
				2697	handle->locked = 1;
				2698	break;
				2699	}
				2700	if (cur == cpu)
				2701	break;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2702
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2703	cpu_relax();
Peter Zijlstra	559fdc3	2009-11-16 12:45:14 +0100	[diff] [blame]	2704	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2705	}
				2706
				2707	static void perf_output_unlock(struct perf_output_handle *handle)
				2708	{
				2709	struct perf_mmap_data *data = handle->data;
				2710	unsigned long head;
				2711	int cpu;
				2712
				2713	data->done_head = data->head;
				2714
				2715	if (!handle->locked)
				2716	goto out;
				2717
				2718	again:
				2719	/*
				2720	* The xchg implies a full barrier that ensures all writes are done
				2721	* before we publish the new head, matched by a rmb() in userspace when
				2722	* reading this position.
				2723	*/
				2724	while ((head = atomic_long_xchg(&data->done_head, 0)))
				2725	data->user_page->data_head = head;
				2726
				2727	/*
				2728	* NMI can happen here, which means we can miss a done_head update.
				2729	*/
				2730
				2731	cpu = atomic_xchg(&data->lock, -1);
				2732	WARN_ON_ONCE(cpu != smp_processor_id());
				2733
				2734	/*
				2735	* Therefore we have to validate we did not indeed do so.
				2736	*/
				2737	if (unlikely(atomic_long_read(&data->done_head))) {
				2738	/*
				2739	* Since we had it locked, we can lock it again.
				2740	*/
				2741	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
				2742	cpu_relax();
				2743
				2744	goto again;
				2745	}
				2746
				2747	if (atomic_xchg(&data->wakeup, 0))
				2748	perf_output_wakeup(handle);
				2749	out:
Peter Zijlstra	559fdc3	2009-11-16 12:45:14 +0100	[diff] [blame]	2750	put_cpu();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2751	}
				2752
				2753	void perf_output_copy(struct perf_output_handle *handle,
				2754	const void *buf, unsigned int len)
				2755	{
				2756	unsigned int pages_mask;
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2757	unsigned long offset;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2758	unsigned int size;
				2759	void **pages;
				2760
				2761	offset = handle->offset;
				2762	pages_mask = handle->data->nr_pages - 1;
				2763	pages = handle->data->data_pages;
				2764
				2765	do {
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2766	unsigned long page_offset;
				2767	unsigned long page_size;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2768	int nr;
				2769
				2770	nr = (offset >> PAGE_SHIFT) & pages_mask;
Peter Zijlstra	906010b	2009-09-21 16:08:49 +0200	[diff] [blame]	2771	page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
				2772	page_offset = offset & (page_size - 1);
				2773	size = min_t(unsigned int, page_size - page_offset, len);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	2774
				2775	memcpy(pages[nr] + page_offset, buf, size);
				2776
				2777	len -= size;
				2778	buf += size;
				2779	offset += size;
				2780	} while (len);
				2781
				2782	handle->offset = offset;
				2783
				2784	/*
				2785	* Check we didn't copy past our reservation window, taking the
				2786	* possible unsigned int wrap into account.
				2787	*/
				2788	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
				2789	}
				2790
				2791	int perf_output_begin(struct perf_output_handle *handle,
				2792	struct perf_event *event, unsigned int size,
				2793	int nmi, int sample)
				2794	{
				2795	struct perf_event *output_event;
				2796	struct perf_mmap_data *data;
				2797	unsigned long tail, offset, head;
				2798	int have_lost;
				2799	struct {
				2800	struct perf_event_header header;
				2801	u64 id;
				2802	u64 lost;
				2803	} lost_event;
				2804
				2805	rcu_read_lock();
				2806	/*
				2807	* For inherited events we send all the output towards the parent.
				2808	*/
				2809	if (event->parent)
				2810	event = event->parent;
				2811
				2812	output_event = rcu_dereference(event->output);
				2813	if (output_event)
				2814	event = output_event;
				2815
				2816	data = rcu_dereference(event->data);
				2817	if (!data)
				2818	goto out;
				2819
				2820	handle->data = data;
				2821	handle->event = event;
				2822	handle->nmi = nmi;
				2823	handle->sample = sample;
				2824
				2825	if (!data->nr_pages)
				2826	goto fail;
				2827
				2828	have_lost = atomic_read(&data->lost);
				2829	if (have_lost)
				2830	size += sizeof(lost_event);
				2831
				2832	perf_output_lock(handle);
				2833
				2834	do {
				2835	/*
				2836	* Userspace could choose to issue a mb() before updating the
				2837	* tail pointer. So that all reads will be completed before the
				2838	* write is issued.
				2839	*/
				2840	tail = ACCESS_ONCE(data->user_page->data_tail);
				2841	smp_rmb();
				2842	offset = head = atomic_long_read(&data->head);
				2843	head += size;
				2844	if (unlikely(!perf_output_space(data, tail, offset, head)))
				2845	goto fail;
				2846	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
				2847
				2848	handle->offset = offset;
				2849	handle->head = head;
				2850
				2851	if (head - tail > data->watermark)
				2852	atomic_set(&data->wakeup, 1);
				2853
				2854	if (have_lost) {
				2855	lost_event.header.type = PERF_RECORD_LOST;
				2856	lost_event.header.misc = 0;
				2857	lost_event.header.size = sizeof(lost_event);
				2858	lost_event.id = event->id;
				2859	lost_event.lost = atomic_xchg(&data->lost, 0);
				2860
				2861	perf_output_put(handle, lost_event);
				2862	}
				2863
				2864	return 0;
				2865
				2866	fail:
				2867	atomic_inc(&data->lost);
				2868	perf_output_unlock(handle);
				2869	out:
				2870	rcu_read_unlock();
				2871
				2872	return -ENOSPC;
				2873	}
				2874
				2875	void perf_output_end(struct perf_output_handle *handle)
				2876	{
				2877	struct perf_event *event = handle->event;
				2878	struct perf_mmap_data *data = handle->data;
				2879
				2880	int wakeup_events = event->attr.wakeup_events;
				2881
				2882	if (handle->sample && wakeup_events) {
				2883	int events = atomic_inc_return(&data->events);
				2884	if (events >= wakeup_events) {
				2885	atomic_sub(wakeup_events, &data->events);
				2886	atomic_set(&data->wakeup, 1);
				2887	}
				2888	}
				2889
				2890	perf_output_unlock(handle);
				2891	rcu_read_unlock();
				2892	}
				2893
				2894	static u32 perf_event_pid(struct perf_event event, struct task_struct p)
				2895	{
				2896	/*
				2897	* only top level events have the pid namespace they were created in
				2898	*/
				2899	if (event->parent)
				2900	event = event->parent;
				2901
				2902	return task_tgid_nr_ns(p, event->ns);
				2903	}
				2904
				2905	static u32 perf_event_tid(struct perf_event event, struct task_struct p)
				2906	{
				2907	/*
				2908	* only top level events have the pid namespace they were created in
				2909	*/
				2910	if (event->parent)
				2911	event = event->parent;
				2912
				2913	return task_pid_nr_ns(p, event->ns);
				2914	}
				2915
				2916	static void perf_output_read_one(struct perf_output_handle *handle,
				2917	struct perf_event *event)
				2918	{
				2919	u64 read_format = event->attr.read_format;
				2920	u64 values[4];
				2921	int n = 0;
				2922
				2923	values[n++] = atomic64_read(&event->count);
				2924	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
				2925	values[n++] = event->total_time_enabled +
				2926	atomic64_read(&event->child_total_time_enabled);
				2927	}
				2928	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
				2929	values[n++] = event->total_time_running +
				2930	atomic64_read(&event->child_total_time_running);
				2931	}
				2932	if (read_format & PERF_FORMAT_ID)
				2933	values[n++] = primary_event_id(event);
				2934
				2935	perf_output_copy(handle, values, n * sizeof(u64));
				2936	}
				2937
				2938	/*
				2939	* XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
				2940	*/
				2941	static void perf_output_read_group(struct perf_output_handle *handle,
				2942	struct perf_event *event)
				2943	{
				2944	struct perf_event leader = event->group_leader, sub;
				2945	u64 read_format = event->attr.read_format;
				2946	u64 values[5];
				2947	int n = 0;
				2948
				2949	values[n++] = 1 + leader->nr_siblings;
				2950
				2951	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				2952	values[n++] = leader->total_time_enabled;
				2953
				2954	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				2955	values[n++] = leader->total_time_running;
				2956
				2957	if (leader != event)
				2958	leader->pmu->read(leader);
				2959
				2960	values[n++] = atomic64_read(&leader->count);
				2961	if (read_format & PERF_FORMAT_ID)
				2962	values[n++] = primary_event_id(leader);
				2963
				2964	perf_output_copy(handle, values, n * sizeof(u64));
				2965
				2966	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
				2967	n = 0;
				2968
				2969	if (sub != event)
				2970	sub->pmu->read(sub);
				2971
				2972	values[n++] = atomic64_read(&sub->count);
				2973	if (read_format & PERF_FORMAT_ID)
				2974	values[n++] = primary_event_id(sub);
				2975
				2976	perf_output_copy(handle, values, n * sizeof(u64));
				2977	}
				2978	}
				2979
				2980	static void perf_output_read(struct perf_output_handle *handle,
				2981	struct perf_event *event)
				2982	{
				2983	if (event->attr.read_format & PERF_FORMAT_GROUP)
				2984	perf_output_read_group(handle, event);
				2985	else
				2986	perf_output_read_one(handle, event);
				2987	}
				2988
				2989	void perf_output_sample(struct perf_output_handle *handle,
				2990	struct perf_event_header *header,
				2991	struct perf_sample_data *data,
				2992	struct perf_event *event)
				2993	{
				2994	u64 sample_type = data->type;
				2995
				2996	perf_output_put(handle, *header);
				2997
				2998	if (sample_type & PERF_SAMPLE_IP)
				2999	perf_output_put(handle, data->ip);
				3000
				3001	if (sample_type & PERF_SAMPLE_TID)
				3002	perf_output_put(handle, data->tid_entry);
				3003
				3004	if (sample_type & PERF_SAMPLE_TIME)
				3005	perf_output_put(handle, data->time);
				3006
				3007	if (sample_type & PERF_SAMPLE_ADDR)
				3008	perf_output_put(handle, data->addr);
				3009
				3010	if (sample_type & PERF_SAMPLE_ID)
				3011	perf_output_put(handle, data->id);
				3012
				3013	if (sample_type & PERF_SAMPLE_STREAM_ID)
				3014	perf_output_put(handle, data->stream_id);
				3015
				3016	if (sample_type & PERF_SAMPLE_CPU)
				3017	perf_output_put(handle, data->cpu_entry);
				3018
				3019	if (sample_type & PERF_SAMPLE_PERIOD)
				3020	perf_output_put(handle, data->period);
				3021
				3022	if (sample_type & PERF_SAMPLE_READ)
				3023	perf_output_read(handle, event);
				3024
				3025	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
				3026	if (data->callchain) {
				3027	int size = 1;
				3028
				3029	if (data->callchain)
				3030	size += data->callchain->nr;
				3031
				3032	size *= sizeof(u64);
				3033
				3034	perf_output_copy(handle, data->callchain, size);
				3035	} else {
				3036	u64 nr = 0;
				3037	perf_output_put(handle, nr);
				3038	}
				3039	}
				3040
				3041	if (sample_type & PERF_SAMPLE_RAW) {
				3042	if (data->raw) {
				3043	perf_output_put(handle, data->raw->size);
				3044	perf_output_copy(handle, data->raw->data,
				3045	data->raw->size);
				3046	} else {
				3047	struct {
				3048	u32 size;
				3049	u32 data;
				3050	} raw = {
				3051	.size = sizeof(u32),
				3052	.data = 0,
				3053	};
				3054	perf_output_put(handle, raw);
				3055	}
				3056	}
				3057	}
				3058
				3059	void perf_prepare_sample(struct perf_event_header *header,
				3060	struct perf_sample_data *data,
				3061	struct perf_event *event,
				3062	struct pt_regs *regs)
				3063	{
				3064	u64 sample_type = event->attr.sample_type;
				3065
				3066	data->type = sample_type;
				3067
				3068	header->type = PERF_RECORD_SAMPLE;
				3069	header->size = sizeof(*header);
				3070
				3071	header->misc = 0;
				3072	header->misc \|= perf_misc_flags(regs);
				3073
				3074	if (sample_type & PERF_SAMPLE_IP) {
				3075	data->ip = perf_instruction_pointer(regs);
				3076
				3077	header->size += sizeof(data->ip);
				3078	}
				3079
				3080	if (sample_type & PERF_SAMPLE_TID) {
				3081	/* namespace issues */
				3082	data->tid_entry.pid = perf_event_pid(event, current);
				3083	data->tid_entry.tid = perf_event_tid(event, current);
				3084
				3085	header->size += sizeof(data->tid_entry);
				3086	}
				3087
				3088	if (sample_type & PERF_SAMPLE_TIME) {
				3089	data->time = perf_clock();
				3090
				3091	header->size += sizeof(data->time);
				3092	}
				3093
				3094	if (sample_type & PERF_SAMPLE_ADDR)
				3095	header->size += sizeof(data->addr);
				3096
				3097	if (sample_type & PERF_SAMPLE_ID) {
				3098	data->id = primary_event_id(event);
				3099
				3100	header->size += sizeof(data->id);
				3101	}
				3102
				3103	if (sample_type & PERF_SAMPLE_STREAM_ID) {
				3104	data->stream_id = event->id;
				3105
				3106	header->size += sizeof(data->stream_id);
				3107	}
				3108
				3109	if (sample_type & PERF_SAMPLE_CPU) {
				3110	data->cpu_entry.cpu = raw_smp_processor_id();
				3111	data->cpu_entry.reserved = 0;
				3112
				3113	header->size += sizeof(data->cpu_entry);
				3114	}
				3115
				3116	if (sample_type & PERF_SAMPLE_PERIOD)
				3117	header->size += sizeof(data->period);
				3118
				3119	if (sample_type & PERF_SAMPLE_READ)
				3120	header->size += perf_event_read_size(event);
				3121
				3122	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
				3123	int size = 1;
				3124
				3125	data->callchain = perf_callchain(regs);
				3126
				3127	if (data->callchain)
				3128	size += data->callchain->nr;
				3129
				3130	header->size += size * sizeof(u64);
				3131	}
				3132
				3133	if (sample_type & PERF_SAMPLE_RAW) {
				3134	int size = sizeof(u32);
				3135
				3136	if (data->raw)
				3137	size += data->raw->size;
				3138	else
				3139	size += sizeof(u32);
				3140
				3141	WARN_ON_ONCE(size & (sizeof(u64)-1));
				3142	header->size += size;
				3143	}
				3144	}
				3145
				3146	static void perf_event_output(struct perf_event *event, int nmi,
				3147	struct perf_sample_data *data,
				3148	struct pt_regs *regs)
				3149	{
				3150	struct perf_output_handle handle;
				3151	struct perf_event_header header;
				3152
				3153	perf_prepare_sample(&header, data, event, regs);
				3154
				3155	if (perf_output_begin(&handle, event, header.size, nmi, 1))
				3156	return;
				3157
				3158	perf_output_sample(&handle, &header, data, event);
				3159
				3160	perf_output_end(&handle);
				3161	}
				3162
				3163	/*
				3164	* read event_id
				3165	*/
				3166
				3167	struct perf_read_event {
				3168	struct perf_event_header header;
				3169
				3170	u32 pid;
				3171	u32 tid;
				3172	};
				3173
				3174	static void
				3175	perf_event_read_event(struct perf_event *event,
				3176	struct task_struct *task)
				3177	{
				3178	struct perf_output_handle handle;
				3179	struct perf_read_event read_event = {
				3180	.header = {
				3181	.type = PERF_RECORD_READ,
				3182	.misc = 0,
				3183	.size = sizeof(read_event) + perf_event_read_size(event),
				3184	},
				3185	.pid = perf_event_pid(event, task),
				3186	.tid = perf_event_tid(event, task),
				3187	};
				3188	int ret;
				3189
				3190	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
				3191	if (ret)
				3192	return;
				3193
				3194	perf_output_put(&handle, read_event);
				3195	perf_output_read(&handle, event);
				3196
				3197	perf_output_end(&handle);
				3198	}
				3199
				3200	/*
				3201	* task tracking -- fork/exit
				3202	*
				3203	* enabled by: attr.comm \| attr.mmap \| attr.task
				3204	*/
				3205
				3206	struct perf_task_event {
				3207	struct task_struct *task;
				3208	struct perf_event_context *task_ctx;
				3209
				3210	struct {
				3211	struct perf_event_header header;
				3212
				3213	u32 pid;
				3214	u32 ppid;
				3215	u32 tid;
				3216	u32 ptid;
				3217	u64 time;
				3218	} event_id;
				3219	};
				3220
				3221	static void perf_event_task_output(struct perf_event *event,
				3222	struct perf_task_event *task_event)
				3223	{
				3224	struct perf_output_handle handle;
				3225	int size;
				3226	struct task_struct *task = task_event->task;
				3227	int ret;
				3228
				3229	size = task_event->event_id.header.size;
				3230	ret = perf_output_begin(&handle, event, size, 0, 0);
				3231
				3232	if (ret)
				3233	return;
				3234
				3235	task_event->event_id.pid = perf_event_pid(event, task);
				3236	task_event->event_id.ppid = perf_event_pid(event, current);
				3237
				3238	task_event->event_id.tid = perf_event_tid(event, task);
				3239	task_event->event_id.ptid = perf_event_tid(event, current);
				3240
				3241	task_event->event_id.time = perf_clock();
				3242
				3243	perf_output_put(&handle, task_event->event_id);
				3244
				3245	perf_output_end(&handle);
				3246	}
				3247
				3248	static int perf_event_task_match(struct perf_event *event)
				3249	{
				3250	if (event->attr.comm \|\| event->attr.mmap \|\| event->attr.task)
				3251	return 1;
				3252
				3253	return 0;
				3254	}
				3255
				3256	static void perf_event_task_ctx(struct perf_event_context *ctx,
				3257	struct perf_task_event *task_event)
				3258	{
				3259	struct perf_event *event;
				3260
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3261	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
				3262	if (perf_event_task_match(event))
				3263	perf_event_task_output(event, task_event);
				3264	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3265	}
				3266
				3267	static void perf_event_task_event(struct perf_task_event *task_event)
				3268	{
				3269	struct perf_cpu_context *cpuctx;
				3270	struct perf_event_context *ctx = task_event->task_ctx;
				3271
Peter Zijlstra	d6ff86c	2009-11-20 22:19:46 +0100	[diff] [blame]	3272	rcu_read_lock();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3273	cpuctx = &get_cpu_var(perf_cpu_context);
				3274	perf_event_task_ctx(&cpuctx->ctx, task_event);
				3275	put_cpu_var(perf_cpu_context);
				3276
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3277	if (!ctx)
				3278	ctx = rcu_dereference(task_event->task->perf_event_ctxp);
				3279	if (ctx)
				3280	perf_event_task_ctx(ctx, task_event);
				3281	rcu_read_unlock();
				3282	}
				3283
				3284	static void perf_event_task(struct task_struct *task,
				3285	struct perf_event_context *task_ctx,
				3286	int new)
				3287	{
				3288	struct perf_task_event task_event;
				3289
				3290	if (!atomic_read(&nr_comm_events) &&
				3291	!atomic_read(&nr_mmap_events) &&
				3292	!atomic_read(&nr_task_events))
				3293	return;
				3294
				3295	task_event = (struct perf_task_event){
				3296	.task = task,
				3297	.task_ctx = task_ctx,
				3298	.event_id = {
				3299	.header = {
				3300	.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
				3301	.misc = 0,
				3302	.size = sizeof(task_event.event_id),
				3303	},
				3304	/* .pid */
				3305	/* .ppid */
				3306	/* .tid */
				3307	/* .ptid */
				3308	},
				3309	};
				3310
				3311	perf_event_task_event(&task_event);
				3312	}
				3313
				3314	void perf_event_fork(struct task_struct *task)
				3315	{
				3316	perf_event_task(task, NULL, 1);
				3317	}
				3318
				3319	/*
				3320	* comm tracking
				3321	*/
				3322
				3323	struct perf_comm_event {
				3324	struct task_struct *task;
				3325	char *comm;
				3326	int comm_size;
				3327
				3328	struct {
				3329	struct perf_event_header header;
				3330
				3331	u32 pid;
				3332	u32 tid;
				3333	} event_id;
				3334	};
				3335
				3336	static void perf_event_comm_output(struct perf_event *event,
				3337	struct perf_comm_event *comm_event)
				3338	{
				3339	struct perf_output_handle handle;
				3340	int size = comm_event->event_id.header.size;
				3341	int ret = perf_output_begin(&handle, event, size, 0, 0);
				3342
				3343	if (ret)
				3344	return;
				3345
				3346	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
				3347	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
				3348
				3349	perf_output_put(&handle, comm_event->event_id);
				3350	perf_output_copy(&handle, comm_event->comm,
				3351	comm_event->comm_size);
				3352	perf_output_end(&handle);
				3353	}
				3354
				3355	static int perf_event_comm_match(struct perf_event *event)
				3356	{
				3357	if (event->attr.comm)
				3358	return 1;
				3359
				3360	return 0;
				3361	}
				3362
				3363	static void perf_event_comm_ctx(struct perf_event_context *ctx,
				3364	struct perf_comm_event *comm_event)
				3365	{
				3366	struct perf_event *event;
				3367
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3368	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
				3369	if (perf_event_comm_match(event))
				3370	perf_event_comm_output(event, comm_event);
				3371	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3372	}
				3373
				3374	static void perf_event_comm_event(struct perf_comm_event *comm_event)
				3375	{
				3376	struct perf_cpu_context *cpuctx;
				3377	struct perf_event_context *ctx;
				3378	unsigned int size;
				3379	char comm[TASK_COMM_LEN];
				3380
				3381	memset(comm, 0, sizeof(comm));
Márton Németh	96b02d7	2009-11-21 23:10:15 +0100	[diff] [blame]	3382	strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3383	size = ALIGN(strlen(comm)+1, sizeof(u64));
				3384
				3385	comm_event->comm = comm;
				3386	comm_event->comm_size = size;
				3387
				3388	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
				3389
Peter Zijlstra	f6595f3	2009-11-20 22:19:47 +0100	[diff] [blame]	3390	rcu_read_lock();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3391	cpuctx = &get_cpu_var(perf_cpu_context);
				3392	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
				3393	put_cpu_var(perf_cpu_context);
				3394
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3395	/*
				3396	* doesn't really matter which of the child contexts the
				3397	* events ends up in.
				3398	*/
				3399	ctx = rcu_dereference(current->perf_event_ctxp);
				3400	if (ctx)
				3401	perf_event_comm_ctx(ctx, comm_event);
				3402	rcu_read_unlock();
				3403	}
				3404
				3405	void perf_event_comm(struct task_struct *task)
				3406	{
				3407	struct perf_comm_event comm_event;
				3408
				3409	if (task->perf_event_ctxp)
				3410	perf_event_enable_on_exec(task);
				3411
				3412	if (!atomic_read(&nr_comm_events))
				3413	return;
				3414
				3415	comm_event = (struct perf_comm_event){
				3416	.task = task,
				3417	/* .comm */
				3418	/* .comm_size */
				3419	.event_id = {
				3420	.header = {
				3421	.type = PERF_RECORD_COMM,
				3422	.misc = 0,
				3423	/* .size */
				3424	},
				3425	/* .pid */
				3426	/* .tid */
				3427	},
				3428	};
				3429
				3430	perf_event_comm_event(&comm_event);
				3431	}
				3432
				3433	/*
				3434	* mmap tracking
				3435	*/
				3436
				3437	struct perf_mmap_event {
				3438	struct vm_area_struct *vma;
				3439
				3440	const char *file_name;
				3441	int file_size;
				3442
				3443	struct {
				3444	struct perf_event_header header;
				3445
				3446	u32 pid;
				3447	u32 tid;
				3448	u64 start;
				3449	u64 len;
				3450	u64 pgoff;
				3451	} event_id;
				3452	};
				3453
				3454	static void perf_event_mmap_output(struct perf_event *event,
				3455	struct perf_mmap_event *mmap_event)
				3456	{
				3457	struct perf_output_handle handle;
				3458	int size = mmap_event->event_id.header.size;
				3459	int ret = perf_output_begin(&handle, event, size, 0, 0);
				3460
				3461	if (ret)
				3462	return;
				3463
				3464	mmap_event->event_id.pid = perf_event_pid(event, current);
				3465	mmap_event->event_id.tid = perf_event_tid(event, current);
				3466
				3467	perf_output_put(&handle, mmap_event->event_id);
				3468	perf_output_copy(&handle, mmap_event->file_name,
				3469	mmap_event->file_size);
				3470	perf_output_end(&handle);
				3471	}
				3472
				3473	static int perf_event_mmap_match(struct perf_event *event,
				3474	struct perf_mmap_event *mmap_event)
				3475	{
				3476	if (event->attr.mmap)
				3477	return 1;
				3478
				3479	return 0;
				3480	}
				3481
				3482	static void perf_event_mmap_ctx(struct perf_event_context *ctx,
				3483	struct perf_mmap_event *mmap_event)
				3484	{
				3485	struct perf_event *event;
				3486
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3487	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
				3488	if (perf_event_mmap_match(event, mmap_event))
				3489	perf_event_mmap_output(event, mmap_event);
				3490	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3491	}
				3492
				3493	static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
				3494	{
				3495	struct perf_cpu_context *cpuctx;
				3496	struct perf_event_context *ctx;
				3497	struct vm_area_struct *vma = mmap_event->vma;
				3498	struct file *file = vma->vm_file;
				3499	unsigned int size;
				3500	char tmp[16];
				3501	char *buf = NULL;
				3502	const char *name;
				3503
				3504	memset(tmp, 0, sizeof(tmp));
				3505
				3506	if (file) {
				3507	/*
				3508	* d_path works from the end of the buffer backwards, so we
				3509	* need to add enough zero bytes after the string to handle
				3510	* the 64bit alignment we do later.
				3511	*/
				3512	buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
				3513	if (!buf) {
				3514	name = strncpy(tmp, "//enomem", sizeof(tmp));
				3515	goto got_name;
				3516	}
				3517	name = d_path(&file->f_path, buf, PATH_MAX);
				3518	if (IS_ERR(name)) {
				3519	name = strncpy(tmp, "//toolong", sizeof(tmp));
				3520	goto got_name;
				3521	}
				3522	} else {
				3523	if (arch_vma_name(mmap_event->vma)) {
				3524	name = strncpy(tmp, arch_vma_name(mmap_event->vma),
				3525	sizeof(tmp));
				3526	goto got_name;
				3527	}
				3528
				3529	if (!vma->vm_mm) {
				3530	name = strncpy(tmp, "[vdso]", sizeof(tmp));
				3531	goto got_name;
				3532	}
				3533
				3534	name = strncpy(tmp, "//anon", sizeof(tmp));
				3535	goto got_name;
				3536	}
				3537
				3538	got_name:
				3539	size = ALIGN(strlen(name)+1, sizeof(u64));
				3540
				3541	mmap_event->file_name = name;
				3542	mmap_event->file_size = size;
				3543
				3544	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
				3545
Peter Zijlstra	f6d9dd2	2009-11-20 22:19:48 +0100	[diff] [blame]	3546	rcu_read_lock();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3547	cpuctx = &get_cpu_var(perf_cpu_context);
				3548	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
				3549	put_cpu_var(perf_cpu_context);
				3550
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3551	/*
				3552	* doesn't really matter which of the child contexts the
				3553	* events ends up in.
				3554	*/
				3555	ctx = rcu_dereference(current->perf_event_ctxp);
				3556	if (ctx)
				3557	perf_event_mmap_ctx(ctx, mmap_event);
				3558	rcu_read_unlock();
				3559
				3560	kfree(buf);
				3561	}
				3562
				3563	void __perf_event_mmap(struct vm_area_struct *vma)
				3564	{
				3565	struct perf_mmap_event mmap_event;
				3566
				3567	if (!atomic_read(&nr_mmap_events))
				3568	return;
				3569
				3570	mmap_event = (struct perf_mmap_event){
				3571	.vma = vma,
				3572	/* .file_name */
				3573	/* .file_size */
				3574	.event_id = {
				3575	.header = {
				3576	.type = PERF_RECORD_MMAP,
				3577	.misc = 0,
				3578	/* .size */
				3579	},
				3580	/* .pid */
				3581	/* .tid */
				3582	.start = vma->vm_start,
				3583	.len = vma->vm_end - vma->vm_start,
				3584	.pgoff = vma->vm_pgoff,
				3585	},
				3586	};
				3587
				3588	perf_event_mmap_event(&mmap_event);
				3589	}
				3590
				3591	/*
				3592	* IRQ throttle logging
				3593	*/
				3594
				3595	static void perf_log_throttle(struct perf_event *event, int enable)
				3596	{
				3597	struct perf_output_handle handle;
				3598	int ret;
				3599
				3600	struct {
				3601	struct perf_event_header header;
				3602	u64 time;
				3603	u64 id;
				3604	u64 stream_id;
				3605	} throttle_event = {
				3606	.header = {
				3607	.type = PERF_RECORD_THROTTLE,
				3608	.misc = 0,
				3609	.size = sizeof(throttle_event),
				3610	},
				3611	.time = perf_clock(),
				3612	.id = primary_event_id(event),
				3613	.stream_id = event->id,
				3614	};
				3615
				3616	if (enable)
				3617	throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
				3618
				3619	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
				3620	if (ret)
				3621	return;
				3622
				3623	perf_output_put(&handle, throttle_event);
				3624	perf_output_end(&handle);
				3625	}
				3626
				3627	/*
				3628	* Generic event overflow handling, sampling.
				3629	*/
				3630
				3631	static int __perf_event_overflow(struct perf_event *event, int nmi,
				3632	int throttle, struct perf_sample_data *data,
				3633	struct pt_regs *regs)
				3634	{
				3635	int events = atomic_read(&event->event_limit);
				3636	struct hw_perf_event *hwc = &event->hw;
				3637	int ret = 0;
				3638
				3639	throttle = (throttle && event->pmu->unthrottle != NULL);
				3640
				3641	if (!throttle) {
				3642	hwc->interrupts++;
				3643	} else {
				3644	if (hwc->interrupts != MAX_INTERRUPTS) {
				3645	hwc->interrupts++;
				3646	if (HZ * hwc->interrupts >
				3647	(u64)sysctl_perf_event_sample_rate) {
				3648	hwc->interrupts = MAX_INTERRUPTS;
				3649	perf_log_throttle(event, 0);
				3650	ret = 1;
				3651	}
				3652	} else {
				3653	/*
				3654	* Keep re-disabling events even though on the previous
				3655	* pass we disabled it - just in case we raced with a
				3656	* sched-in and the event got enabled again:
				3657	*/
				3658	ret = 1;
				3659	}
				3660	}
				3661
				3662	if (event->attr.freq) {
				3663	u64 now = perf_clock();
				3664	s64 delta = now - hwc->freq_stamp;
				3665
				3666	hwc->freq_stamp = now;
				3667
				3668	if (delta > 0 && delta < TICK_NSEC)
				3669	perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
				3670	}
				3671
				3672	/*
				3673	* XXX event_limit might not quite work as expected on inherited
				3674	* events
				3675	*/
				3676
				3677	event->pending_kill = POLL_IN;
				3678	if (events && atomic_dec_and_test(&event->event_limit)) {
				3679	ret = 1;
				3680	event->pending_kill = POLL_HUP;
				3681	if (nmi) {
				3682	event->pending_disable = 1;
				3683	perf_pending_queue(&event->pending,
				3684	perf_pending_event);
				3685	} else
				3686	perf_event_disable(event);
				3687	}
				3688
Peter Zijlstra	453f19e	2009-11-20 22:19:43 +0100	[diff] [blame]	3689	if (event->overflow_handler)
				3690	event->overflow_handler(event, nmi, data, regs);
				3691	else
				3692	perf_event_output(event, nmi, data, regs);
				3693
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3694	return ret;
				3695	}
				3696
				3697	int perf_event_overflow(struct perf_event *event, int nmi,
				3698	struct perf_sample_data *data,
				3699	struct pt_regs *regs)
				3700	{
				3701	return __perf_event_overflow(event, nmi, 1, data, regs);
				3702	}
				3703
				3704	/*
				3705	* Generic software event infrastructure
				3706	*/
				3707
				3708	/*
				3709	* We directly increment event->count and keep a second value in
				3710	* event->hw.period_left to count intervals. This period event
				3711	* is kept in the range [-sample_period, 0] so that we can use the
				3712	* sign as trigger.
				3713	*/
				3714
				3715	static u64 perf_swevent_set_period(struct perf_event *event)
				3716	{
				3717	struct hw_perf_event *hwc = &event->hw;
				3718	u64 period = hwc->last_period;
				3719	u64 nr, offset;
				3720	s64 old, val;
				3721
				3722	hwc->last_period = hwc->sample_period;
				3723
				3724	again:
				3725	old = val = atomic64_read(&hwc->period_left);
				3726	if (val < 0)
				3727	return 0;
				3728
				3729	nr = div64_u64(period + val, period);
				3730	offset = nr * period;
				3731	val -= offset;
				3732	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
				3733	goto again;
				3734
				3735	return nr;
				3736	}
				3737
Peter Zijlstra	0cff784	2009-11-20 22:19:44 +0100	[diff] [blame]	3738	static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3739	int nmi, struct perf_sample_data *data,
				3740	struct pt_regs *regs)
				3741	{
				3742	struct hw_perf_event *hwc = &event->hw;
				3743	int throttle = 0;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3744
				3745	data->period = event->hw.last_period;
Peter Zijlstra	0cff784	2009-11-20 22:19:44 +0100	[diff] [blame]	3746	if (!overflow)
				3747	overflow = perf_swevent_set_period(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3748
				3749	if (hwc->interrupts == MAX_INTERRUPTS)
				3750	return;
				3751
				3752	for (; overflow; overflow--) {
				3753	if (__perf_event_overflow(event, nmi, throttle,
				3754	data, regs)) {
				3755	/*
				3756	* We inhibit the overflow from happening when
				3757	* hwc->interrupts == MAX_INTERRUPTS.
				3758	*/
				3759	break;
				3760	}
				3761	throttle = 1;
				3762	}
				3763	}
				3764
				3765	static void perf_swevent_unthrottle(struct perf_event *event)
				3766	{
				3767	/*
				3768	* Nothing to do, we already reset hwc->interrupts.
				3769	*/
				3770	}
				3771
				3772	static void perf_swevent_add(struct perf_event *event, u64 nr,
				3773	int nmi, struct perf_sample_data *data,
				3774	struct pt_regs *regs)
				3775	{
				3776	struct hw_perf_event *hwc = &event->hw;
				3777
				3778	atomic64_add(nr, &event->count);
				3779
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3780	if (!regs)
				3781	return;
				3782
Peter Zijlstra	0cff784	2009-11-20 22:19:44 +0100	[diff] [blame]	3783	if (!hwc->sample_period)
				3784	return;
				3785
				3786	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
				3787	return perf_swevent_overflow(event, 1, nmi, data, regs);
				3788
				3789	if (atomic64_add_negative(nr, &hwc->period_left))
				3790	return;
				3791
				3792	perf_swevent_overflow(event, 0, nmi, data, regs);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3793	}
				3794
				3795	static int perf_swevent_is_counting(struct perf_event *event)
				3796	{
				3797	/*
				3798	* The event is active, we're good!
				3799	*/
				3800	if (event->state == PERF_EVENT_STATE_ACTIVE)
				3801	return 1;
				3802
				3803	/*
				3804	* The event is off/error, not counting.
				3805	*/
				3806	if (event->state != PERF_EVENT_STATE_INACTIVE)
				3807	return 0;
				3808
				3809	/*
				3810	* The event is inactive, if the context is active
				3811	* we're part of a group that didn't make it on the 'pmu',
				3812	* not counting.
				3813	*/
				3814	if (event->ctx->is_active)
				3815	return 0;
				3816
				3817	/*
				3818	* We're inactive and the context is too, this means the
				3819	* task is scheduled out, we're counting events that happen
				3820	* to us, like migration events.
				3821	*/
				3822	return 1;
				3823	}
				3824
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	3825	static int perf_tp_event_match(struct perf_event *event,
				3826	struct perf_sample_data *data);
				3827
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3828	static int perf_swevent_match(struct perf_event *event,
				3829	enum perf_type_id type,
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	3830	u32 event_id,
				3831	struct perf_sample_data *data,
				3832	struct pt_regs *regs)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3833	{
				3834	if (!perf_swevent_is_counting(event))
				3835	return 0;
				3836
				3837	if (event->attr.type != type)
				3838	return 0;
				3839	if (event->attr.config != event_id)
				3840	return 0;
				3841
				3842	if (regs) {
				3843	if (event->attr.exclude_user && user_mode(regs))
				3844	return 0;
				3845
				3846	if (event->attr.exclude_kernel && !user_mode(regs))
				3847	return 0;
				3848	}
				3849
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	3850	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
				3851	!perf_tp_event_match(event, data))
				3852	return 0;
				3853
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3854	return 1;
				3855	}
				3856
				3857	static void perf_swevent_ctx_event(struct perf_event_context *ctx,
				3858	enum perf_type_id type,
				3859	u32 event_id, u64 nr, int nmi,
				3860	struct perf_sample_data *data,
				3861	struct pt_regs *regs)
				3862	{
				3863	struct perf_event *event;
				3864
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3865	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	3866	if (perf_swevent_match(event, type, event_id, data, regs))
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3867	perf_swevent_add(event, nr, nmi, data, regs);
				3868	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3869	}
				3870
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3871	/*
				3872	* Must be called with preemption disabled
				3873	*/
				3874	int perf_swevent_get_recursion_context(int **recursion)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3875	{
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3876	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				3877
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3878	if (in_nmi())
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3879	*recursion = &cpuctx->recursion[3];
				3880	else if (in_irq())
				3881	*recursion = &cpuctx->recursion[2];
				3882	else if (in_softirq())
				3883	*recursion = &cpuctx->recursion[1];
				3884	else
				3885	*recursion = &cpuctx->recursion[0];
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3886
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3887	if (**recursion)
				3888	return -1;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3889
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3890	(**recursion)++;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3891
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3892	return 0;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3893	}
Ingo Molnar	645e8cc	2009-11-22 12:20:19 +0100	[diff] [blame]	3894	EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3895
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3896	void perf_swevent_put_recursion_context(int *recursion)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3897	{
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3898	(*recursion)--;
				3899	}
Ingo Molnar	645e8cc	2009-11-22 12:20:19 +0100	[diff] [blame]	3900	EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3901
				3902	static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
				3903	u64 nr, int nmi,
				3904	struct perf_sample_data *data,
				3905	struct pt_regs *regs)
				3906	{
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3907	struct perf_event_context *ctx;
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3908	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3909
Peter Zijlstra	8152018	2009-11-20 22:19:45 +0100	[diff] [blame]	3910	rcu_read_lock();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3911	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
				3912	nr, nmi, data, regs);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3913	/*
				3914	* doesn't really matter which of the child contexts the
				3915	* events ends up in.
				3916	*/
				3917	ctx = rcu_dereference(current->perf_event_ctxp);
				3918	if (ctx)
				3919	perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
				3920	rcu_read_unlock();
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3921	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3922
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3923	static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				3924	u64 nr, int nmi,
				3925	struct perf_sample_data *data,
				3926	struct pt_regs *regs)
				3927	{
				3928	int *recursion;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3929
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3930	preempt_disable();
				3931
				3932	if (perf_swevent_get_recursion_context(&recursion))
				3933	goto out;
				3934
				3935	__do_perf_sw_event(type, event_id, nr, nmi, data, regs);
				3936
				3937	perf_swevent_put_recursion_context(recursion);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3938	out:
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	3939	preempt_enable();
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3940	}
				3941
				3942	void __perf_sw_event(u32 event_id, u64 nr, int nmi,
				3943	struct pt_regs *regs, u64 addr)
				3944	{
Ingo Molnar	a4234bf	2009-11-23 10:57:59 +0100	[diff] [blame]	3945	struct perf_sample_data data;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3946
Ingo Molnar	a4234bf	2009-11-23 10:57:59 +0100	[diff] [blame]	3947	data.addr = addr;
				3948	data.raw = NULL;
				3949
				3950	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	3951	}
				3952
				3953	static void perf_swevent_read(struct perf_event *event)
				3954	{
				3955	}
				3956
				3957	static int perf_swevent_enable(struct perf_event *event)
				3958	{
				3959	struct hw_perf_event *hwc = &event->hw;
				3960
				3961	if (hwc->sample_period) {
				3962	hwc->last_period = hwc->sample_period;
				3963	perf_swevent_set_period(event);
				3964	}
				3965	return 0;
				3966	}
				3967
				3968	static void perf_swevent_disable(struct perf_event *event)
				3969	{
				3970	}
				3971
				3972	static const struct pmu perf_ops_generic = {
				3973	.enable = perf_swevent_enable,
				3974	.disable = perf_swevent_disable,
				3975	.read = perf_swevent_read,
				3976	.unthrottle = perf_swevent_unthrottle,
				3977	};
				3978
				3979	/*
				3980	* hrtimer based swevent callback
				3981	*/
				3982
				3983	static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
				3984	{
				3985	enum hrtimer_restart ret = HRTIMER_RESTART;
				3986	struct perf_sample_data data;
				3987	struct pt_regs *regs;
				3988	struct perf_event *event;
				3989	u64 period;
				3990
				3991	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
				3992	event->pmu->read(event);
				3993
				3994	data.addr = 0;
				3995	regs = get_irq_regs();
				3996	/*
				3997	* In case we exclude kernel IPs or are somehow not in interrupt
				3998	* context, provide the next best thing, the user IP.
				3999	*/
				4000	if ((event->attr.exclude_kernel \|\| !regs) &&
				4001	!event->attr.exclude_user)
				4002	regs = task_pt_regs(current);
				4003
				4004	if (regs) {
Soeren Sandmann	54f4407	2009-10-22 18:34:08 +0200	[diff] [blame]	4005	if (!(event->attr.exclude_idle && current->pid == 0))
				4006	if (perf_event_overflow(event, 0, &data, regs))
				4007	ret = HRTIMER_NORESTART;
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4008	}
				4009
				4010	period = max_t(u64, 10000, event->hw.sample_period);
				4011	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
				4012
				4013	return ret;
				4014	}
				4015
Soeren Sandmann	721a669	2009-09-15 14:33:08 +0200	[diff] [blame]	4016	static void perf_swevent_start_hrtimer(struct perf_event *event)
				4017	{
				4018	struct hw_perf_event *hwc = &event->hw;
				4019
				4020	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				4021	hwc->hrtimer.function = perf_swevent_hrtimer;
				4022	if (hwc->sample_period) {
				4023	u64 period;
				4024
				4025	if (hwc->remaining) {
				4026	if (hwc->remaining < 0)
				4027	period = 10000;
				4028	else
				4029	period = hwc->remaining;
				4030	hwc->remaining = 0;
				4031	} else {
				4032	period = max_t(u64, 10000, hwc->sample_period);
				4033	}
				4034	__hrtimer_start_range_ns(&hwc->hrtimer,
				4035	ns_to_ktime(period), 0,
				4036	HRTIMER_MODE_REL, 0);
				4037	}
				4038	}
				4039
				4040	static void perf_swevent_cancel_hrtimer(struct perf_event *event)
				4041	{
				4042	struct hw_perf_event *hwc = &event->hw;
				4043
				4044	if (hwc->sample_period) {
				4045	ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
				4046	hwc->remaining = ktime_to_ns(remaining);
				4047
				4048	hrtimer_cancel(&hwc->hrtimer);
				4049	}
				4050	}
				4051
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4052	/*
				4053	* Software event: cpu wall time clock
				4054	*/
				4055
				4056	static void cpu_clock_perf_event_update(struct perf_event *event)
				4057	{
				4058	int cpu = raw_smp_processor_id();
				4059	s64 prev;
				4060	u64 now;
				4061
				4062	now = cpu_clock(cpu);
				4063	prev = atomic64_read(&event->hw.prev_count);
				4064	atomic64_set(&event->hw.prev_count, now);
				4065	atomic64_add(now - prev, &event->count);
				4066	}
				4067
				4068	static int cpu_clock_perf_event_enable(struct perf_event *event)
				4069	{
				4070	struct hw_perf_event *hwc = &event->hw;
				4071	int cpu = raw_smp_processor_id();
				4072
				4073	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
Soeren Sandmann	721a669	2009-09-15 14:33:08 +0200	[diff] [blame]	4074	perf_swevent_start_hrtimer(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4075
				4076	return 0;
				4077	}
				4078
				4079	static void cpu_clock_perf_event_disable(struct perf_event *event)
				4080	{
Soeren Sandmann	721a669	2009-09-15 14:33:08 +0200	[diff] [blame]	4081	perf_swevent_cancel_hrtimer(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4082	cpu_clock_perf_event_update(event);
				4083	}
				4084
				4085	static void cpu_clock_perf_event_read(struct perf_event *event)
				4086	{
				4087	cpu_clock_perf_event_update(event);
				4088	}
				4089
				4090	static const struct pmu perf_ops_cpu_clock = {
				4091	.enable = cpu_clock_perf_event_enable,
				4092	.disable = cpu_clock_perf_event_disable,
				4093	.read = cpu_clock_perf_event_read,
				4094	};
				4095
				4096	/*
				4097	* Software event: task time clock
				4098	*/
				4099
				4100	static void task_clock_perf_event_update(struct perf_event *event, u64 now)
				4101	{
				4102	u64 prev;
				4103	s64 delta;
				4104
				4105	prev = atomic64_xchg(&event->hw.prev_count, now);
				4106	delta = now - prev;
				4107	atomic64_add(delta, &event->count);
				4108	}
				4109
				4110	static int task_clock_perf_event_enable(struct perf_event *event)
				4111	{
				4112	struct hw_perf_event *hwc = &event->hw;
				4113	u64 now;
				4114
				4115	now = event->ctx->time;
				4116
				4117	atomic64_set(&hwc->prev_count, now);
Soeren Sandmann	721a669	2009-09-15 14:33:08 +0200	[diff] [blame]	4118
				4119	perf_swevent_start_hrtimer(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4120
				4121	return 0;
				4122	}
				4123
				4124	static void task_clock_perf_event_disable(struct perf_event *event)
				4125	{
Soeren Sandmann	721a669	2009-09-15 14:33:08 +0200	[diff] [blame]	4126	perf_swevent_cancel_hrtimer(event);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4127	task_clock_perf_event_update(event, event->ctx->time);
				4128
				4129	}
				4130
				4131	static void task_clock_perf_event_read(struct perf_event *event)
				4132	{
				4133	u64 time;
				4134
				4135	if (!in_nmi()) {
				4136	update_context_time(event->ctx);
				4137	time = event->ctx->time;
				4138	} else {
				4139	u64 now = perf_clock();
				4140	u64 delta = now - event->ctx->timestamp;
				4141	time = event->ctx->time + delta;
				4142	}
				4143
				4144	task_clock_perf_event_update(event, time);
				4145	}
				4146
				4147	static const struct pmu perf_ops_task_clock = {
				4148	.enable = task_clock_perf_event_enable,
				4149	.disable = task_clock_perf_event_disable,
				4150	.read = task_clock_perf_event_read,
				4151	};
				4152
				4153	#ifdef CONFIG_EVENT_PROFILE
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	4154
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4155	void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
				4156	int entry_size)
				4157	{
				4158	struct perf_raw_record raw = {
				4159	.size = entry_size,
				4160	.data = record,
				4161	};
				4162
				4163	struct perf_sample_data data = {
				4164	.addr = addr,
				4165	.raw = &raw,
				4166	};
				4167
				4168	struct pt_regs *regs = get_irq_regs();
				4169
				4170	if (!regs)
				4171	regs = task_pt_regs(current);
				4172
Frederic Weisbecker	ce71b9d	2009-11-22 05:26:55 +0100	[diff] [blame]	4173	/* Trace events already protected against recursion */
				4174	__do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4175	&data, regs);
				4176	}
				4177	EXPORT_SYMBOL_GPL(perf_tp_event);
				4178
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	4179	static int perf_tp_event_match(struct perf_event *event,
				4180	struct perf_sample_data *data)
				4181	{
				4182	void *record = data->raw->data;
				4183
				4184	if (likely(!event->filter) \|\| filter_match_preds(event->filter, record))
				4185	return 1;
				4186	return 0;
				4187	}
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4188
				4189	static void tp_perf_event_destroy(struct perf_event *event)
				4190	{
				4191	ftrace_profile_disable(event->attr.config);
				4192	}
				4193
				4194	static const struct pmu tp_perf_event_init(struct perf_event event)
				4195	{
				4196	/*
				4197	* Raw tracepoint data is a severe data leak, only allow root to
				4198	* have these.
				4199	*/
				4200	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
				4201	perf_paranoid_tracepoint_raw() &&
				4202	!capable(CAP_SYS_ADMIN))
				4203	return ERR_PTR(-EPERM);
				4204
				4205	if (ftrace_profile_enable(event->attr.config))
				4206	return NULL;
				4207
				4208	event->destroy = tp_perf_event_destroy;
				4209
				4210	return &perf_ops_generic;
				4211	}
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	4212
				4213	static int perf_event_set_filter(struct perf_event event, void __user arg)
				4214	{
				4215	char *filter_str;
				4216	int ret;
				4217
				4218	if (event->attr.type != PERF_TYPE_TRACEPOINT)
				4219	return -EINVAL;
				4220
				4221	filter_str = strndup_user(arg, PAGE_SIZE);
				4222	if (IS_ERR(filter_str))
				4223	return PTR_ERR(filter_str);
				4224
				4225	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
				4226
				4227	kfree(filter_str);
				4228	return ret;
				4229	}
				4230
				4231	static void perf_event_free_filter(struct perf_event *event)
				4232	{
				4233	ftrace_profile_free_filter(event);
				4234	}
				4235
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4236	#else
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	4237
				4238	static int perf_tp_event_match(struct perf_event *event,
				4239	struct perf_sample_data *data)
				4240	{
				4241	return 1;
				4242	}
				4243
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4244	static const struct pmu tp_perf_event_init(struct perf_event event)
				4245	{
				4246	return NULL;
				4247	}
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	4248
				4249	static int perf_event_set_filter(struct perf_event event, void __user arg)
				4250	{
				4251	return -ENOENT;
				4252	}
				4253
				4254	static void perf_event_free_filter(struct perf_event *event)
				4255	{
				4256	}
				4257
				4258	#endif /* CONFIG_EVENT_PROFILE */
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4259
Frederic Weisbecker	24f1e32c	2009-09-09 19:22:48 +0200	[diff] [blame]	4260	#ifdef CONFIG_HAVE_HW_BREAKPOINT
				4261	static void bp_perf_event_destroy(struct perf_event *event)
				4262	{
				4263	release_bp_slot(event);
				4264	}
				4265
				4266	static const struct pmu bp_perf_event_init(struct perf_event bp)
				4267	{
				4268	int err;
				4269	/*
				4270	* The breakpoint is already filled if we haven't created the counter
				4271	* through perf syscall
				4272	* FIXME: manage to get trigerred to NULL if it comes from syscalls
				4273	*/
				4274	if (!bp->callback)
				4275	err = register_perf_hw_breakpoint(bp);
				4276	else
				4277	err = __register_perf_hw_breakpoint(bp);
				4278	if (err)
				4279	return ERR_PTR(err);
				4280
				4281	bp->destroy = bp_perf_event_destroy;
				4282
				4283	return &perf_ops_bp;
				4284	}
				4285
				4286	void perf_bp_event(struct perf_event bp, void regs)
				4287	{
				4288	/* TODO */
				4289	}
				4290	#else
				4291	static void bp_perf_event_destroy(struct perf_event *event)
				4292	{
				4293	}
				4294
				4295	static const struct pmu bp_perf_event_init(struct perf_event bp)
				4296	{
				4297	return NULL;
				4298	}
				4299
				4300	void perf_bp_event(struct perf_event bp, void regs)
				4301	{
				4302	}
				4303	#endif
				4304
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4305	atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
				4306
				4307	static void sw_perf_event_destroy(struct perf_event *event)
				4308	{
				4309	u64 event_id = event->attr.config;
				4310
				4311	WARN_ON(event->parent);
				4312
				4313	atomic_dec(&perf_swevent_enabled[event_id]);
				4314	}
				4315
				4316	static const struct pmu sw_perf_event_init(struct perf_event event)
				4317	{
				4318	const struct pmu *pmu = NULL;
				4319	u64 event_id = event->attr.config;
				4320
				4321	/*
				4322	* Software events (currently) can't in general distinguish
				4323	* between user, kernel and hypervisor events.
				4324	* However, context switches and cpu migrations are considered
				4325	* to be kernel events, and page faults are never hypervisor
				4326	* events.
				4327	*/
				4328	switch (event_id) {
				4329	case PERF_COUNT_SW_CPU_CLOCK:
				4330	pmu = &perf_ops_cpu_clock;
				4331
				4332	break;
				4333	case PERF_COUNT_SW_TASK_CLOCK:
				4334	/*
				4335	* If the user instantiates this as a per-cpu event,
				4336	* use the cpu_clock event instead.
				4337	*/
				4338	if (event->ctx->task)
				4339	pmu = &perf_ops_task_clock;
				4340	else
				4341	pmu = &perf_ops_cpu_clock;
				4342
				4343	break;
				4344	case PERF_COUNT_SW_PAGE_FAULTS:
				4345	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
				4346	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
				4347	case PERF_COUNT_SW_CONTEXT_SWITCHES:
				4348	case PERF_COUNT_SW_CPU_MIGRATIONS:
Anton Blanchard	f7d7986	2009-10-18 01:09:29 +0000	[diff] [blame]	4349	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
				4350	case PERF_COUNT_SW_EMULATION_FAULTS:
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4351	if (!event->parent) {
				4352	atomic_inc(&perf_swevent_enabled[event_id]);
				4353	event->destroy = sw_perf_event_destroy;
				4354	}
				4355	pmu = &perf_ops_generic;
				4356	break;
				4357	}
				4358
				4359	return pmu;
				4360	}
				4361
				4362	/*
				4363	* Allocate and initialize a event structure
				4364	*/
				4365	static struct perf_event *
				4366	perf_event_alloc(struct perf_event_attr *attr,
				4367	int cpu,
				4368	struct perf_event_context *ctx,
				4369	struct perf_event *group_leader,
				4370	struct perf_event *parent_event,
Frederic Weisbecker	97eaf53	2009-10-18 15:33:50 +0200	[diff] [blame]	4371	perf_callback_t callback,
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4372	gfp_t gfpflags)
				4373	{
				4374	const struct pmu *pmu;
				4375	struct perf_event *event;
				4376	struct hw_perf_event *hwc;
				4377	long err;
				4378
				4379	event = kzalloc(sizeof(*event), gfpflags);
				4380	if (!event)
				4381	return ERR_PTR(-ENOMEM);
				4382
				4383	/*
				4384	* Single events are their own group leaders, with an
				4385	* empty sibling list:
				4386	*/
				4387	if (!group_leader)
				4388	group_leader = event;
				4389
				4390	mutex_init(&event->child_mutex);
				4391	INIT_LIST_HEAD(&event->child_list);
				4392
				4393	INIT_LIST_HEAD(&event->group_entry);
				4394	INIT_LIST_HEAD(&event->event_entry);
				4395	INIT_LIST_HEAD(&event->sibling_list);
				4396	init_waitqueue_head(&event->waitq);
				4397
				4398	mutex_init(&event->mmap_mutex);
				4399
				4400	event->cpu = cpu;
				4401	event->attr = *attr;
				4402	event->group_leader = group_leader;
				4403	event->pmu = NULL;
				4404	event->ctx = ctx;
				4405	event->oncpu = -1;
				4406
				4407	event->parent = parent_event;
				4408
				4409	event->ns = get_pid_ns(current->nsproxy->pid_ns);
				4410	event->id = atomic64_inc_return(&perf_event_id);
				4411
				4412	event->state = PERF_EVENT_STATE_INACTIVE;
				4413
Frederic Weisbecker	97eaf53	2009-10-18 15:33:50 +0200	[diff] [blame]	4414	if (!callback && parent_event)
				4415	callback = parent_event->callback;
				4416
				4417	event->callback = callback;
				4418
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4419	if (attr->disabled)
				4420	event->state = PERF_EVENT_STATE_OFF;
				4421
				4422	pmu = NULL;
				4423
				4424	hwc = &event->hw;
				4425	hwc->sample_period = attr->sample_period;
				4426	if (attr->freq && attr->sample_freq)
				4427	hwc->sample_period = 1;
				4428	hwc->last_period = hwc->sample_period;
				4429
				4430	atomic64_set(&hwc->period_left, hwc->sample_period);
				4431
				4432	/*
				4433	* we currently do not support PERF_FORMAT_GROUP on inherited events
				4434	*/
				4435	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
				4436	goto done;
				4437
				4438	switch (attr->type) {
				4439	case PERF_TYPE_RAW:
				4440	case PERF_TYPE_HARDWARE:
				4441	case PERF_TYPE_HW_CACHE:
				4442	pmu = hw_perf_event_init(event);
				4443	break;
				4444
				4445	case PERF_TYPE_SOFTWARE:
				4446	pmu = sw_perf_event_init(event);
				4447	break;
				4448
				4449	case PERF_TYPE_TRACEPOINT:
				4450	pmu = tp_perf_event_init(event);
				4451	break;
				4452
Frederic Weisbecker	24f1e32c	2009-09-09 19:22:48 +0200	[diff] [blame]	4453	case PERF_TYPE_BREAKPOINT:
				4454	pmu = bp_perf_event_init(event);
				4455	break;
				4456
				4457
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4458	default:
				4459	break;
				4460	}
				4461	done:
				4462	err = 0;
				4463	if (!pmu)
				4464	err = -EINVAL;
				4465	else if (IS_ERR(pmu))
				4466	err = PTR_ERR(pmu);
				4467
				4468	if (err) {
				4469	if (event->ns)
				4470	put_pid_ns(event->ns);
				4471	kfree(event);
				4472	return ERR_PTR(err);
				4473	}
				4474
				4475	event->pmu = pmu;
				4476
				4477	if (!event->parent) {
				4478	atomic_inc(&nr_events);
				4479	if (event->attr.mmap)
				4480	atomic_inc(&nr_mmap_events);
				4481	if (event->attr.comm)
				4482	atomic_inc(&nr_comm_events);
				4483	if (event->attr.task)
				4484	atomic_inc(&nr_task_events);
				4485	}
				4486
				4487	return event;
				4488	}
				4489
				4490	static int perf_copy_attr(struct perf_event_attr __user *uattr,
				4491	struct perf_event_attr *attr)
				4492	{
				4493	u32 size;
				4494	int ret;
				4495
				4496	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
				4497	return -EFAULT;
				4498
				4499	/*
				4500	* zero the full structure, so that a short copy will be nice.
				4501	*/
				4502	memset(attr, 0, sizeof(*attr));
				4503
				4504	ret = get_user(size, &uattr->size);
				4505	if (ret)
				4506	return ret;
				4507
				4508	if (size > PAGE_SIZE) /* silly large */
				4509	goto err_size;
				4510
				4511	if (!size) /* abi compat */
				4512	size = PERF_ATTR_SIZE_VER0;
				4513
				4514	if (size < PERF_ATTR_SIZE_VER0)
				4515	goto err_size;
				4516
				4517	/*
				4518	* If we're handed a bigger struct than we know of,
				4519	* ensure all the unknown bits are 0 - i.e. new
				4520	* user-space does not rely on any kernel feature
				4521	* extensions we dont know about yet.
				4522	*/
				4523	if (size > sizeof(*attr)) {
				4524	unsigned char __user *addr;
				4525	unsigned char __user *end;
				4526	unsigned char val;
				4527
				4528	addr = (void __user )uattr + sizeof(attr);
				4529	end = (void __user *)uattr + size;
				4530
				4531	for (; addr < end; addr++) {
				4532	ret = get_user(val, addr);
				4533	if (ret)
				4534	return ret;
				4535	if (val)
				4536	goto err_size;
				4537	}
				4538	size = sizeof(*attr);
				4539	}
				4540
				4541	ret = copy_from_user(attr, uattr, size);
				4542	if (ret)
				4543	return -EFAULT;
				4544
				4545	/*
				4546	* If the type exists, the corresponding creation will verify
				4547	* the attr->config.
				4548	*/
				4549	if (attr->type >= PERF_TYPE_MAX)
				4550	return -EINVAL;
				4551
				4552	if (attr->__reserved_1 \|\| attr->__reserved_2 \|\| attr->__reserved_3)
				4553	return -EINVAL;
				4554
				4555	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
				4556	return -EINVAL;
				4557
				4558	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
				4559	return -EINVAL;
				4560
				4561	out:
				4562	return ret;
				4563
				4564	err_size:
				4565	put_user(sizeof(*attr), &uattr->size);
				4566	ret = -E2BIG;
				4567	goto out;
				4568	}
				4569
Li Zefan	6fb2915	2009-10-15 11:21:42 +0800	[diff] [blame]	4570	static int perf_event_set_output(struct perf_event *event, int output_fd)
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4571	{
				4572	struct perf_event *output_event = NULL;
				4573	struct file *output_file = NULL;
				4574	struct perf_event *old_output;
				4575	int fput_needed = 0;
				4576	int ret = -EINVAL;
				4577
				4578	if (!output_fd)
				4579	goto set;
				4580
				4581	output_file = fget_light(output_fd, &fput_needed);
				4582	if (!output_file)
				4583	return -EBADF;
				4584
				4585	if (output_file->f_op != &perf_fops)
				4586	goto out;
				4587
				4588	output_event = output_file->private_data;
				4589
				4590	/* Don't chain output fds */
				4591	if (output_event->output)
				4592	goto out;
				4593
				4594	/* Don't set an output fd when we already have an output channel */
				4595	if (event->data)
				4596	goto out;
				4597
				4598	atomic_long_inc(&output_file->f_count);
				4599
				4600	set:
				4601	mutex_lock(&event->mmap_mutex);
				4602	old_output = event->output;
				4603	rcu_assign_pointer(event->output, output_event);
				4604	mutex_unlock(&event->mmap_mutex);
				4605
				4606	if (old_output) {
				4607	/*
				4608	* we need to make sure no existing perf_output_*()
				4609	* is still referencing this event.
				4610	*/
				4611	synchronize_rcu();
				4612	fput(old_output->filp);
				4613	}
				4614
				4615	ret = 0;
				4616	out:
				4617	fput_light(output_file, fput_needed);
				4618	return ret;
				4619	}
				4620
				4621	/**
				4622	* sys_perf_event_open - open a performance event, associate it to a task/cpu
				4623	*
				4624	* @attr_uptr: event_id type attributes for monitoring/sampling
				4625	* @pid: target pid
				4626	* @cpu: target cpu
				4627	* @group_fd: group leader event fd
				4628	*/
				4629	SYSCALL_DEFINE5(perf_event_open,
				4630	struct perf_event_attr __user *, attr_uptr,
				4631	pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
				4632	{
				4633	struct perf_event event, group_leader;
				4634	struct perf_event_attr attr;
				4635	struct perf_event_context *ctx;
				4636	struct file *event_file = NULL;
				4637	struct file *group_file = NULL;
				4638	int fput_needed = 0;
				4639	int fput_needed2 = 0;
				4640	int err;
				4641
				4642	/* for future expandability... */
				4643	if (flags & ~(PERF_FLAG_FD_NO_GROUP \| PERF_FLAG_FD_OUTPUT))
				4644	return -EINVAL;
				4645
				4646	err = perf_copy_attr(attr_uptr, &attr);
				4647	if (err)
				4648	return err;
				4649
				4650	if (!attr.exclude_kernel) {
				4651	if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
				4652	return -EACCES;
				4653	}
				4654
				4655	if (attr.freq) {
				4656	if (attr.sample_freq > sysctl_perf_event_sample_rate)
				4657	return -EINVAL;
				4658	}
				4659
				4660	/*
				4661	* Get the target context (task or percpu):
				4662	*/
				4663	ctx = find_get_context(pid, cpu);
				4664	if (IS_ERR(ctx))
				4665	return PTR_ERR(ctx);
				4666
				4667	/*
				4668	* Look up the group leader (we will attach this event to it):
				4669	*/
				4670	group_leader = NULL;
				4671	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
				4672	err = -EINVAL;
				4673	group_file = fget_light(group_fd, &fput_needed);
				4674	if (!group_file)
				4675	goto err_put_context;
				4676	if (group_file->f_op != &perf_fops)
				4677	goto err_put_context;
				4678
				4679	group_leader = group_file->private_data;
				4680	/*
				4681	* Do not allow a recursive hierarchy (this new sibling
				4682	* becoming part of another group-sibling):
				4683	*/
				4684	if (group_leader->group_leader != group_leader)
				4685	goto err_put_context;
				4686	/*
				4687	* Do not allow to attach to a group in a different
				4688	* task or CPU context:
				4689	*/
				4690	if (group_leader->ctx != ctx)
				4691	goto err_put_context;
				4692	/*
				4693	* Only a group leader can be exclusive or pinned
				4694	*/
				4695	if (attr.exclusive \|\| attr.pinned)
				4696	goto err_put_context;
				4697	}
				4698
				4699	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
Frederic Weisbecker	97eaf53	2009-10-18 15:33:50 +0200	[diff] [blame]	4700	NULL, NULL, GFP_KERNEL);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4701	err = PTR_ERR(event);
				4702	if (IS_ERR(event))
				4703	goto err_put_context;
				4704
				4705	err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
				4706	if (err < 0)
				4707	goto err_free_put_context;
				4708
				4709	event_file = fget_light(err, &fput_needed2);
				4710	if (!event_file)
				4711	goto err_free_put_context;
				4712
				4713	if (flags & PERF_FLAG_FD_OUTPUT) {
				4714	err = perf_event_set_output(event, group_fd);
				4715	if (err)
				4716	goto err_fput_free_put_context;
				4717	}
				4718
				4719	event->filp = event_file;
				4720	WARN_ON_ONCE(ctx->parent_ctx);
				4721	mutex_lock(&ctx->mutex);
				4722	perf_install_in_context(ctx, event, cpu);
				4723	++ctx->generation;
				4724	mutex_unlock(&ctx->mutex);
				4725
				4726	event->owner = current;
				4727	get_task_struct(current);
				4728	mutex_lock(&current->perf_event_mutex);
				4729	list_add_tail(&event->owner_entry, &current->perf_event_list);
				4730	mutex_unlock(&current->perf_event_mutex);
				4731
				4732	err_fput_free_put_context:
				4733	fput_light(event_file, fput_needed2);
				4734
				4735	err_free_put_context:
				4736	if (err < 0)
				4737	kfree(event);
				4738
				4739	err_put_context:
				4740	if (err < 0)
				4741	put_ctx(ctx);
				4742
				4743	fput_light(group_file, fput_needed);
				4744
				4745	return err;
				4746	}
				4747
Arjan van de Ven	fb0459d	2009-09-25 12:25:56 +0200	[diff] [blame]	4748	/**
				4749	* perf_event_create_kernel_counter
				4750	*
				4751	* @attr: attributes of the counter to create
				4752	* @cpu: cpu in which the counter is bound
				4753	* @pid: task to profile
				4754	*/
				4755	struct perf_event *
				4756	perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Frederic Weisbecker	97eaf53	2009-10-18 15:33:50 +0200	[diff] [blame]	4757	pid_t pid, perf_callback_t callback)
Arjan van de Ven	fb0459d	2009-09-25 12:25:56 +0200	[diff] [blame]	4758	{
				4759	struct perf_event *event;
				4760	struct perf_event_context *ctx;
				4761	int err;
				4762
				4763	/*
				4764	* Get the target context (task or percpu):
				4765	*/
				4766
				4767	ctx = find_get_context(pid, cpu);
				4768	if (IS_ERR(ctx))
Frederic Weisbecker	24f1e32c	2009-09-09 19:22:48 +0200	[diff] [blame]	4769	return NULL;
Arjan van de Ven	fb0459d	2009-09-25 12:25:56 +0200	[diff] [blame]	4770
				4771	event = perf_event_alloc(attr, cpu, ctx, NULL,
Frederic Weisbecker	97eaf53	2009-10-18 15:33:50 +0200	[diff] [blame]	4772	NULL, callback, GFP_KERNEL);
Arjan van de Ven	fb0459d	2009-09-25 12:25:56 +0200	[diff] [blame]	4773	err = PTR_ERR(event);
				4774	if (IS_ERR(event))
				4775	goto err_put_context;
				4776
				4777	event->filp = NULL;
				4778	WARN_ON_ONCE(ctx->parent_ctx);
				4779	mutex_lock(&ctx->mutex);
				4780	perf_install_in_context(ctx, event, cpu);
				4781	++ctx->generation;
				4782	mutex_unlock(&ctx->mutex);
				4783
				4784	event->owner = current;
				4785	get_task_struct(current);
				4786	mutex_lock(&current->perf_event_mutex);
				4787	list_add_tail(&event->owner_entry, &current->perf_event_list);
				4788	mutex_unlock(&current->perf_event_mutex);
				4789
				4790	return event;
				4791
				4792	err_put_context:
				4793	if (err < 0)
				4794	put_ctx(ctx);
				4795
				4796	return NULL;
				4797	}
				4798	EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
				4799
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4800	/*
				4801	* inherit a event from parent task to child task:
				4802	*/
				4803	static struct perf_event *
				4804	inherit_event(struct perf_event *parent_event,
				4805	struct task_struct *parent,
				4806	struct perf_event_context *parent_ctx,
				4807	struct task_struct *child,
				4808	struct perf_event *group_leader,
				4809	struct perf_event_context *child_ctx)
				4810	{
				4811	struct perf_event *child_event;
				4812
				4813	/*
				4814	* Instead of creating recursive hierarchies of events,
				4815	* we link inherited events back to the original parent,
				4816	* which has a filp for sure, which we use as the reference
				4817	* count:
				4818	*/
				4819	if (parent_event->parent)
				4820	parent_event = parent_event->parent;
				4821
				4822	child_event = perf_event_alloc(&parent_event->attr,
				4823	parent_event->cpu, child_ctx,
				4824	group_leader, parent_event,
Frederic Weisbecker	97eaf53	2009-10-18 15:33:50 +0200	[diff] [blame]	4825	NULL, GFP_KERNEL);
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4826	if (IS_ERR(child_event))
				4827	return child_event;
				4828	get_ctx(child_ctx);
				4829
				4830	/*
				4831	* Make the child state follow the state of the parent event,
				4832	* not its attr.disabled bit. We hold the parent's mutex,
				4833	* so we won't race with perf_event_{en, dis}able_family.
				4834	*/
				4835	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
				4836	child_event->state = PERF_EVENT_STATE_INACTIVE;
				4837	else
				4838	child_event->state = PERF_EVENT_STATE_OFF;
				4839
				4840	if (parent_event->attr.freq)
				4841	child_event->hw.sample_period = parent_event->hw.sample_period;
				4842
Peter Zijlstra	453f19e	2009-11-20 22:19:43 +0100	[diff] [blame]	4843	child_event->overflow_handler = parent_event->overflow_handler;
				4844
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	4845	/*
				4846	* Link it up in the child's context:
				4847	*/
				4848	add_event_to_ctx(child_event, child_ctx);
				4849
				4850	/*
				4851	* Get a reference to the parent filp - we will fput it
				4852	* when the child event exits. This is safe to do because
				4853	* we are in the parent and we know that the filp still
				4854	* exists and has a nonzero count:
				4855	*/
				4856	atomic_long_inc(&parent_event->filp->f_count);
				4857
				4858	/*
				4859	* Link this into the parent event's child list
				4860	*/
				4861	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
				4862	mutex_lock(&parent_event->child_mutex);
				4863	list_add_tail(&child_event->child_list, &parent_event->child_list);
				4864	mutex_unlock(&parent_event->child_mutex);
				4865
				4866	return child_event;
				4867	}
				4868
				4869	static int inherit_group(struct perf_event *parent_event,
				4870	struct task_struct *parent,
				4871	struct perf_event_context *parent_ctx,
				4872	struct task_struct *child,
				4873	struct perf_event_context *child_ctx)
				4874	{
				4875	struct perf_event *leader;
				4876	struct perf_event *sub;
				4877	struct perf_event *child_ctr;
				4878
				4879	leader = inherit_event(parent_event, parent, parent_ctx,
				4880	child, NULL, child_ctx);
				4881	if (IS_ERR(leader))
				4882	return PTR_ERR(leader);
				4883	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
				4884	child_ctr = inherit_event(sub, parent, parent_ctx,
				4885	child, leader, child_ctx);
				4886	if (IS_ERR(child_ctr))
				4887	return PTR_ERR(child_ctr);
				4888	}
				4889	return 0;
				4890	}
				4891
				4892	static void sync_child_event(struct perf_event *child_event,
				4893	struct task_struct *child)
				4894	{
				4895	struct perf_event *parent_event = child_event->parent;
				4896	u64 child_val;
				4897
				4898	if (child_event->attr.inherit_stat)
				4899	perf_event_read_event(child_event, child);
				4900
				4901	child_val = atomic64_read(&child_event->count);
				4902
				4903	/*
				4904	* Add back the child's count to the parent's count:
				4905	*/
				4906	atomic64_add(child_val, &parent_event->count);
				4907	atomic64_add(child_event->total_time_enabled,
				4908	&parent_event->child_total_time_enabled);
				4909	atomic64_add(child_event->total_time_running,
				4910	&parent_event->child_total_time_running);
				4911
				4912	/*
				4913	* Remove this event from the parent's list
				4914	*/
				4915	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
				4916	mutex_lock(&parent_event->child_mutex);
				4917	list_del_init(&child_event->child_list);
				4918	mutex_unlock(&parent_event->child_mutex);
				4919
				4920	/*
				4921	* Release the parent event, if this was the last
				4922	* reference to it.
				4923	*/
				4924	fput(parent_event->filp);
				4925	}
				4926
				4927	static void
				4928	__perf_event_exit_task(struct perf_event *child_event,
				4929	struct perf_event_context *child_ctx,
				4930	struct task_struct *child)
				4931	{
				4932	struct perf_event *parent_event;
				4933
				4934	update_event_times(child_event);
				4935	perf_event_remove_from_context(child_event);
				4936
				4937	parent_event = child_event->parent;
				4938	/*
				4939	* It can happen that parent exits first, and has events
				4940	* that are still around due to the child reference. These
				4941	* events need to be zapped - but otherwise linger.
				4942	*/
				4943	if (parent_event) {
				4944	sync_child_event(child_event, child);
				4945	free_event(child_event);
				4946	}
				4947	}
				4948
				4949	/*
				4950	* When a child task exits, feed back event values to parent events.
				4951	*/
				4952	void perf_event_exit_task(struct task_struct *child)
				4953	{
				4954	struct perf_event child_event, tmp;
				4955	struct perf_event_context *child_ctx;
				4956	unsigned long flags;
				4957
				4958	if (likely(!child->perf_event_ctxp)) {
				4959	perf_event_task(child, NULL, 0);
				4960	return;
				4961	}
				4962
				4963	local_irq_save(flags);
				4964	/*
				4965	* We can't reschedule here because interrupts are disabled,
				4966	* and either child is current or it is a task that can't be
				4967	* scheduled, so we are now safe from rescheduling changing
				4968	* our context.
				4969	*/
				4970	child_ctx = child->perf_event_ctxp;
				4971	__perf_event_task_sched_out(child_ctx);
				4972
				4973	/*
				4974	* Take the context lock here so that if find_get_context is
				4975	* reading child->perf_event_ctxp, we wait until it has
				4976	* incremented the context's refcount before we do put_ctx below.
				4977	*/
				4978	spin_lock(&child_ctx->lock);
				4979	child->perf_event_ctxp = NULL;
				4980	/*
				4981	* If this context is a clone; unclone it so it can't get
				4982	* swapped to another process while we're removing all
				4983	* the events from it.
				4984	*/
				4985	unclone_ctx(child_ctx);
				4986	spin_unlock_irqrestore(&child_ctx->lock, flags);
				4987
				4988	/*
				4989	* Report the task dead after unscheduling the events so that we
				4990	* won't get any samples after PERF_RECORD_EXIT. We can however still
				4991	* get a few PERF_RECORD_READ events.
				4992	*/
				4993	perf_event_task(child, child_ctx, 0);
				4994
				4995	/*
				4996	* We can recurse on the same lock type through:
				4997	*
				4998	* __perf_event_exit_task()
				4999	* sync_child_event()
				5000	* fput(parent_event->filp)
				5001	* perf_release()
				5002	* mutex_lock(&ctx->mutex)
				5003	*
				5004	* But since its the parent context it won't be the same instance.
				5005	*/
				5006	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
				5007
				5008	again:
				5009	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
				5010	group_entry)
				5011	__perf_event_exit_task(child_event, child_ctx, child);
				5012
				5013	/*
				5014	* If the last event was a group event, it will have appended all
				5015	* its siblings to the list, but we obtained 'tmp' before that which
				5016	* will still point to the list head terminating the iteration.
				5017	*/
				5018	if (!list_empty(&child_ctx->group_list))
				5019	goto again;
				5020
				5021	mutex_unlock(&child_ctx->mutex);
				5022
				5023	put_ctx(child_ctx);
				5024	}
				5025
				5026	/*
				5027	* free an unexposed, unused context as created by inheritance by
				5028	* init_task below, used by fork() in case of fail.
				5029	*/
				5030	void perf_event_free_task(struct task_struct *task)
				5031	{
				5032	struct perf_event_context *ctx = task->perf_event_ctxp;
				5033	struct perf_event event, tmp;
				5034
				5035	if (!ctx)
				5036	return;
				5037
				5038	mutex_lock(&ctx->mutex);
				5039	again:
				5040	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
				5041	struct perf_event *parent = event->parent;
				5042
				5043	if (WARN_ON_ONCE(!parent))
				5044	continue;
				5045
				5046	mutex_lock(&parent->child_mutex);
				5047	list_del_init(&event->child_list);
				5048	mutex_unlock(&parent->child_mutex);
				5049
				5050	fput(parent->filp);
				5051
				5052	list_del_event(event, ctx);
				5053	free_event(event);
				5054	}
				5055
				5056	if (!list_empty(&ctx->group_list))
				5057	goto again;
				5058
				5059	mutex_unlock(&ctx->mutex);
				5060
				5061	put_ctx(ctx);
				5062	}
				5063
				5064	/*
				5065	* Initialize the perf_event context in task_struct
				5066	*/
				5067	int perf_event_init_task(struct task_struct *child)
				5068	{
				5069	struct perf_event_context child_ctx, parent_ctx;
				5070	struct perf_event_context *cloned_ctx;
				5071	struct perf_event *event;
				5072	struct task_struct *parent = current;
				5073	int inherited_all = 1;
				5074	int ret = 0;
				5075
				5076	child->perf_event_ctxp = NULL;
				5077
				5078	mutex_init(&child->perf_event_mutex);
				5079	INIT_LIST_HEAD(&child->perf_event_list);
				5080
				5081	if (likely(!parent->perf_event_ctxp))
				5082	return 0;
				5083
				5084	/*
				5085	* This is executed from the parent task context, so inherit
				5086	* events that have been marked for cloning.
				5087	* First allocate and initialize a context for the child.
				5088	*/
				5089
				5090	child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
				5091	if (!child_ctx)
				5092	return -ENOMEM;
				5093
				5094	__perf_event_init_context(child_ctx, child);
				5095	child->perf_event_ctxp = child_ctx;
				5096	get_task_struct(child);
				5097
				5098	/*
				5099	* If the parent's context is a clone, pin it so it won't get
				5100	* swapped under us.
				5101	*/
				5102	parent_ctx = perf_pin_task_context(parent);
				5103
				5104	/*
				5105	* No need to check if parent_ctx != NULL here; since we saw
				5106	* it non-NULL earlier, the only reason for it to become NULL
				5107	* is if we exit, and since we're currently in the middle of
				5108	* a fork we can't be exiting at the same time.
				5109	*/
				5110
				5111	/*
				5112	* Lock the parent list. No need to lock the child - not PID
				5113	* hashed yet and not running, so nobody can access it.
				5114	*/
				5115	mutex_lock(&parent_ctx->mutex);
				5116
				5117	/*
				5118	* We dont have to disable NMIs - we are only looking at
				5119	* the list, not manipulating it:
				5120	*/
Xiao Guangrong	27f9994	2009-09-25 13:54:01 +0800	[diff] [blame]	5121	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
Ingo Molnar	cdd6c48	2009-09-21 12:02:48 +0200	[diff] [blame]	5122
				5123	if (!event->attr.inherit) {
				5124	inherited_all = 0;
				5125	continue;
				5126	}
				5127
				5128	ret = inherit_group(event, parent, parent_ctx,
				5129	child, child_ctx);
				5130	if (ret) {
				5131	inherited_all = 0;
				5132	break;
				5133	}
				5134	}
				5135
				5136	if (inherited_all) {
				5137	/*
				5138	* Mark the child context as a clone of the parent
				5139	* context, or of whatever the parent is a clone of.
				5140	* Note that if the parent is a clone, it could get
				5141	* uncloned at any point, but that doesn't matter
				5142	* because the list of events and the generation
				5143	* count can't have changed since we took the mutex.
				5144	*/
				5145	cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
				5146	if (cloned_ctx) {
				5147	child_ctx->parent_ctx = cloned_ctx;
				5148	child_ctx->parent_gen = parent_ctx->parent_gen;
				5149	} else {
				5150	child_ctx->parent_ctx = parent_ctx;
				5151	child_ctx->parent_gen = parent_ctx->generation;
				5152	}
				5153	get_ctx(child_ctx->parent_ctx);
				5154	}
				5155
				5156	mutex_unlock(&parent_ctx->mutex);
				5157
				5158	perf_unpin_context(parent_ctx);
				5159
				5160	return ret;
				5161	}
				5162
				5163	static void __cpuinit perf_event_init_cpu(int cpu)
				5164	{
				5165	struct perf_cpu_context *cpuctx;
				5166
				5167	cpuctx = &per_cpu(perf_cpu_context, cpu);
				5168	__perf_event_init_context(&cpuctx->ctx, NULL);
				5169
				5170	spin_lock(&perf_resource_lock);
				5171	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
				5172	spin_unlock(&perf_resource_lock);
				5173
				5174	hw_perf_event_setup(cpu);
				5175	}
				5176
				5177	#ifdef CONFIG_HOTPLUG_CPU
				5178	static void __perf_event_exit_cpu(void *info)
				5179	{
				5180	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				5181	struct perf_event_context *ctx = &cpuctx->ctx;
				5182	struct perf_event event, tmp;
				5183
				5184	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
				5185	__perf_event_remove_from_context(event);
				5186	}
				5187	static void perf_event_exit_cpu(int cpu)
				5188	{
				5189	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
				5190	struct perf_event_context *ctx = &cpuctx->ctx;
				5191
				5192	mutex_lock(&ctx->mutex);
				5193	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
				5194	mutex_unlock(&ctx->mutex);
				5195	}
				5196	#else
				5197	static inline void perf_event_exit_cpu(int cpu) { }
				5198	#endif
				5199
				5200	static int __cpuinit
				5201	perf_cpu_notify(struct notifier_block self, unsigned long action, void hcpu)
				5202	{
				5203	unsigned int cpu = (long)hcpu;
				5204
				5205	switch (action) {
				5206
				5207	case CPU_UP_PREPARE:
				5208	case CPU_UP_PREPARE_FROZEN:
				5209	perf_event_init_cpu(cpu);
				5210	break;
				5211
				5212	case CPU_ONLINE:
				5213	case CPU_ONLINE_FROZEN:
				5214	hw_perf_event_setup_online(cpu);
				5215	break;
				5216
				5217	case CPU_DOWN_PREPARE:
				5218	case CPU_DOWN_PREPARE_FROZEN:
				5219	perf_event_exit_cpu(cpu);
				5220	break;
				5221
				5222	default:
				5223	break;
				5224	}
				5225
				5226	return NOTIFY_OK;
				5227	}
				5228
				5229	/*
				5230	* This has to have a higher priority than migration_notifier in sched.c.
				5231	*/
				5232	static struct notifier_block __cpuinitdata perf_cpu_nb = {
				5233	.notifier_call = perf_cpu_notify,
				5234	.priority = 20,
				5235	};
				5236
				5237	void __init perf_event_init(void)
				5238	{
				5239	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
				5240	(void *)(long)smp_processor_id());
				5241	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
				5242	(void *)(long)smp_processor_id());
				5243	register_cpu_notifier(&perf_cpu_nb);
				5244	}
				5245
				5246	static ssize_t perf_show_reserve_percpu(struct sysdev_class class, char buf)
				5247	{
				5248	return sprintf(buf, "%d\n", perf_reserved_percpu);
				5249	}
				5250
				5251	static ssize_t
				5252	perf_set_reserve_percpu(struct sysdev_class *class,
				5253	const char *buf,
				5254	size_t count)
				5255	{
				5256	struct perf_cpu_context *cpuctx;
				5257	unsigned long val;
				5258	int err, cpu, mpt;
				5259
				5260	err = strict_strtoul(buf, 10, &val);
				5261	if (err)
				5262	return err;
				5263	if (val > perf_max_events)
				5264	return -EINVAL;
				5265
				5266	spin_lock(&perf_resource_lock);
				5267	perf_reserved_percpu = val;
				5268	for_each_online_cpu(cpu) {
				5269	cpuctx = &per_cpu(perf_cpu_context, cpu);
				5270	spin_lock_irq(&cpuctx->ctx.lock);
				5271	mpt = min(perf_max_events - cpuctx->ctx.nr_events,
				5272	perf_max_events - perf_reserved_percpu);
				5273	cpuctx->max_pertask = mpt;
				5274	spin_unlock_irq(&cpuctx->ctx.lock);
				5275	}
				5276	spin_unlock(&perf_resource_lock);
				5277
				5278	return count;
				5279	}
				5280
				5281	static ssize_t perf_show_overcommit(struct sysdev_class class, char buf)
				5282	{
				5283	return sprintf(buf, "%d\n", perf_overcommit);
				5284	}
				5285
				5286	static ssize_t
				5287	perf_set_overcommit(struct sysdev_class class, const char buf, size_t count)
				5288	{
				5289	unsigned long val;
				5290	int err;
				5291
				5292	err = strict_strtoul(buf, 10, &val);
				5293	if (err)
				5294	return err;
				5295	if (val > 1)
				5296	return -EINVAL;
				5297
				5298	spin_lock(&perf_resource_lock);
				5299	perf_overcommit = val;
				5300	spin_unlock(&perf_resource_lock);
				5301
				5302	return count;
				5303	}
				5304
				5305	static SYSDEV_CLASS_ATTR(
				5306	reserve_percpu,
				5307	0644,
				5308	perf_show_reserve_percpu,
				5309	perf_set_reserve_percpu
				5310	);
				5311
				5312	static SYSDEV_CLASS_ATTR(
				5313	overcommit,
				5314	0644,
				5315	perf_show_overcommit,
				5316	perf_set_overcommit
				5317	);
				5318
				5319	static struct attribute *perfclass_attrs[] = {
				5320	&attr_reserve_percpu.attr,
				5321	&attr_overcommit.attr,
				5322	NULL
				5323	};
				5324
				5325	static struct attribute_group perfclass_attr_group = {
				5326	.attrs = perfclass_attrs,
				5327	.name = "perf_events",
				5328	};
				5329
				5330	static int __init perf_event_sysfs_init(void)
				5331	{
				5332	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				5333	&perfclass_attr_group);
				5334	}
				5335	device_initcall(perf_event_sysfs_init);