Blame - kernel/perf_counter.c - android_kernel_htc_msm8960

blob: 278209c547a870dd12a43997dc3c5915ca084665 [file] [log] [blame]

Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	1	/*
				2	* Performance counter core code
				3	*
				4	* Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
				5	* Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
				6	*
				7	* For licencing details see kernel-base/COPYING
				8	*/
				9
				10	#include <linux/fs.h>
				11	#include <linux/cpu.h>
				12	#include <linux/smp.h>
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	13	#include <linux/file.h>
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	14	#include <linux/poll.h>
				15	#include <linux/sysfs.h>
				16	#include <linux/ptrace.h>
				17	#include <linux/percpu.h>
				18	#include <linux/uaccess.h>
				19	#include <linux/syscalls.h>
				20	#include <linux/anon_inodes.h>
				21	#include <linux/perf_counter.h>
				22
/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

/* Counter limit compared against cpuctx->active_oncpu when installing. */
int perf_max_counters __read_mostly;
/* Subtracted from perf_max_counters when computing cpuctx->max_pertask. */
static int perf_reserved_percpu __read_mostly;
/* Defaults to 1. NOTE(review): not referenced in this part of the file. */
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);
				36
				37	/*
				38	* Architecture provided APIs - weak aliases:
				39	*/
				40
Thomas Gleixner	dfa7c89	2008-12-08 19:35:37 +0100	[diff] [blame]	41	int __weak hw_perf_counter_init(struct perf_counter *counter)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	42	{
				43	return -EINVAL;
				44	}
				45
				46	void __weak hw_perf_counter_enable(struct perf_counter *counter) { }
				47	void __weak hw_perf_counter_disable(struct perf_counter *counter) { }
				48	void __weak hw_perf_counter_read(struct perf_counter *counter) { }
				49	void __weak hw_perf_disable_all(void) { }
				50	void __weak hw_perf_enable_all(void) { }
				51	void __weak hw_perf_counter_setup(void) { }
				52
				53	#if BITS_PER_LONG == 64
				54
				55	/*
				56	* Read the cached counter in counter safe against cross CPU / NMI
				57	* modifications. 64 bit version - no complications.
				58	*/
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	59	static inline u64 perf_counter_read_safe(struct perf_counter *counter)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	60	{
				61	return (u64) atomic64_read(&counter->count);
				62	}
				63
				64	#else
				65
				66	/*
				67	* Read the cached counter in counter safe against cross CPU / NMI
				68	* modifications. 32 bit version.
				69	*/
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	70	static u64 perf_counter_read_safe(struct perf_counter *counter)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	71	{
				72	u32 cntl, cnth;
				73
				74	local_irq_disable();
				75	do {
				76	cnth = atomic_read(&counter->count32[1]);
				77	cntl = atomic_read(&counter->count32[0]);
				78	} while (cnth != atomic_read(&counter->count32[1]));
				79
				80	local_irq_enable();
				81
				82	return cntl \| ((u64) cnth) << 32;
				83	}
				84
				85	#endif
				86
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	87	static void
				88	list_add_counter(struct perf_counter counter, struct perf_counter_context ctx)
				89	{
				90	struct perf_counter *group_leader = counter->group_leader;
				91
				92	/*
				93	* Depending on whether it is a standalone or sibling counter,
				94	* add it straight to the context's counter list, or to the group
				95	* leader's sibling list:
				96	*/
				97	if (counter->group_leader == counter)
				98	list_add_tail(&counter->list_entry, &ctx->counter_list);
				99	else
				100	list_add_tail(&counter->list_entry, &group_leader->sibling_list);
				101	}
				102
				103	static void
				104	list_del_counter(struct perf_counter counter, struct perf_counter_context ctx)
				105	{
				106	struct perf_counter sibling, tmp;
				107
				108	list_del_init(&counter->list_entry);
				109
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	110	/*
				111	* If this was a group counter with sibling counters then
				112	* upgrade the siblings to singleton counters by adding them
				113	* to the context list directly:
				114	*/
				115	list_for_each_entry_safe(sibling, tmp,
				116	&counter->sibling_list, list_entry) {
				117
				118	list_del_init(&sibling->list_entry);
				119	list_add_tail(&sibling->list_entry, &ctx->counter_list);
				120	WARN_ON_ONCE(!sibling->group_leader);
				121	WARN_ON_ONCE(sibling->group_leader == sibling);
				122	sibling->group_leader = sibling;
				123	}
				124	}
				125
/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 *
 * @info: the struct perf_counter to remove.
 */
static void __perf_counter_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/* Stop a running counter and drop it from both active counts. */
	if (counter->active) {
		hw_perf_counter_disable(counter);
		counter->active = 0;
		ctx->nr_active--;
		cpuctx->active_oncpu--;
		counter->task = NULL;
	}
	ctx->nr_counters--;

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	hw_perf_disable_all();
	list_del_counter(counter, ctx);
	hw_perf_enable_all();

	if (!ctx->task) {
		/*
		 * Allow more per task counters with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_counters - ctx->nr_counters,
			    perf_max_counters - perf_reserved_percpu);
	}

	spin_unlock(&ctx->lock);
}
				177
				178
/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(counter->cpu,
					 __perf_counter_remove_from_context,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_remove_from_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 * (The counter is still linked, so the call above did not remove it.)
	 */
	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can remove the counter safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&counter->list_entry)) {
		ctx->nr_counters--;
		list_del_counter(counter, ctx);
		counter->task = NULL;
	}
	spin_unlock_irq(&ctx->lock);
}
				228
/*
 * Cross CPU call to install and enable a performance counter
 *
 * @info: the struct perf_counter to install; its ->ctx must already
 * point at the target context.
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	int cpu = smp_processor_id();

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	hw_perf_disable_all();
	list_add_counter(counter, ctx);
	hw_perf_enable_all();

	ctx->nr_counters++;

	/* Only start the counter if this CPU is below the counter limit. */
	if (cpuctx->active_oncpu < perf_max_counters) {
		hw_perf_counter_enable(counter);
		counter->active = 1;
		counter->oncpu = cpu;
		ctx->nr_active++;
		cpuctx->active_oncpu++;
	}

	/* A per cpu counter takes one slot away from the per task budget. */
	if (!ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

	spin_unlock(&ctx->lock);
}
				272
/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
			struct perf_counter *counter,
			int cpu)
{
	struct task_struct *task = ctx->task;

	counter->ctx = ctx;
	if (!task) {
		/*
		 * Per cpu counters are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 counter, 1);
		return;
	}

	counter->task = task;
retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the counter has not been added
	 * we need to retry the smp call.
	 */
	if (ctx->nr_active && list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can add the counter safely, if the call above did not
	 * succeed.
	 */
	if (list_empty(&counter->list_entry)) {
		list_add_counter(counter, ctx);
		ctx->nr_counters++;
	}
	spin_unlock_irq(&ctx->lock);
}
				327
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	328	static void
				329	counter_sched_out(struct perf_counter *counter,
				330	struct perf_cpu_context *cpuctx,
				331	struct perf_counter_context *ctx)
				332	{
				333	if (!counter->active)
				334	return;
				335
				336	hw_perf_counter_disable(counter);
				337	counter->active = 0;
				338	counter->oncpu = -1;
				339
				340	cpuctx->active_oncpu--;
				341	ctx->nr_active--;
				342	}
				343
				344	static void
				345	group_sched_out(struct perf_counter *group_counter,
				346	struct perf_cpu_context *cpuctx,
				347	struct perf_counter_context *ctx)
				348	{
				349	struct perf_counter *counter;
				350
				351	counter_sched_out(group_counter, cpuctx, ctx);
				352
				353	/*
				354	* Schedule out siblings (if any):
				355	*/
				356	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
				357	counter_sched_out(counter, cpuctx, ctx);
				358	}
				359
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	360	/*
				361	* Called from scheduler to remove the counters of the current task,
				362	* with interrupts disabled.
				363	*
				364	* We stop each counter and update the counter value in counter->count.
				365	*
				366	* This does not protect us against NMI, but hw_perf_counter_disable()
				367	* sets the disabled bit in the control field of counter _before_
				368	* accessing the counter control register. If a NMI hits, then it will
				369	* not restart the counter.
				370	*/
				371	void perf_counter_task_sched_out(struct task_struct *task, int cpu)
				372	{
				373	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
				374	struct perf_counter_context *ctx = &task->perf_counter_ctx;
				375	struct perf_counter *counter;
				376
				377	if (likely(!cpuctx->task_ctx))
				378	return;
				379
				380	spin_lock(&ctx->lock);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	381	if (ctx->nr_active) {
				382	list_for_each_entry(counter, &ctx->counter_list, list_entry)
				383	group_sched_out(counter, cpuctx, ctx);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	384	}
				385	spin_unlock(&ctx->lock);
				386	cpuctx->task_ctx = NULL;
				387	}
				388
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	389	static void
				390	counter_sched_in(struct perf_counter *counter,
				391	struct perf_cpu_context *cpuctx,
				392	struct perf_counter_context *ctx,
				393	int cpu)
				394	{
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	395	hw_perf_counter_enable(counter);
				396	counter->active = 1;
				397	counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
				398
				399	cpuctx->active_oncpu++;
				400	ctx->nr_active++;
				401	}
				402
				403	static void
				404	group_sched_in(struct perf_counter *group_counter,
				405	struct perf_cpu_context *cpuctx,
				406	struct perf_counter_context *ctx,
				407	int cpu)
				408	{
				409	struct perf_counter *counter;
				410
				411	counter_sched_in(group_counter, cpuctx, ctx, cpu);
				412
				413	/*
				414	* Schedule in siblings as one group (if any):
				415	*/
				416	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
				417	counter_sched_in(counter, cpuctx, ctx, cpu);
				418	}
				419
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	420	/*
				421	* Called from scheduler to add the counters of the current task
				422	* with interrupts disabled.
				423	*
				424	* We restore the counter value and then enable it.
				425	*
				426	* This does not protect us against NMI, but hw_perf_counter_enable()
				427	* sets the enabled bit in the control field of counter _before_
				428	* accessing the counter control register. If a NMI hits, then it will
				429	* keep the counter running.
				430	*/
				431	void perf_counter_task_sched_in(struct task_struct *task, int cpu)
				432	{
				433	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
				434	struct perf_counter_context *ctx = &task->perf_counter_ctx;
				435	struct perf_counter *counter;
				436
				437	if (likely(!ctx->nr_counters))
				438	return;
				439
				440	spin_lock(&ctx->lock);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	441	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	442	if (ctx->nr_active == cpuctx->max_pertask)
				443	break;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	444
				445	/*
				446	* Listen to the 'cpu' scheduling filter constraint
				447	* of counters:
				448	*/
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	449	if (counter->cpu != -1 && counter->cpu != cpu)
				450	continue;
				451
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	452	group_sched_in(counter, cpuctx, ctx, cpu);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	453	}
				454	spin_unlock(&ctx->lock);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	455
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	456	cpuctx->task_ctx = ctx;
				457	}
				458
				459	void perf_counter_task_tick(struct task_struct *curr, int cpu)
				460	{
				461	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
				462	struct perf_counter *counter;
				463
				464	if (likely(!ctx->nr_counters))
				465	return;
				466
				467	perf_counter_task_sched_out(curr, cpu);
				468
				469	spin_lock(&ctx->lock);
				470
				471	/*
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	472	* Rotate the first entry last (works just fine for group counters too):
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	473	*/
				474	hw_perf_disable_all();
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	475	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
				476	list_del(&counter->list_entry);
				477	list_add_tail(&counter->list_entry, &ctx->counter_list);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	478	break;
				479	}
				480	hw_perf_enable_all();
				481
				482	spin_unlock(&ctx->lock);
				483
				484	perf_counter_task_sched_in(curr, cpu);
				485	}
				486
				487	/*
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	488	* Initialize the perf_counter context in a task_struct:
				489	*/
				490	static void
				491	__perf_counter_init_context(struct perf_counter_context *ctx,
				492	struct task_struct *task)
				493	{
				494	spin_lock_init(&ctx->lock);
				495	INIT_LIST_HEAD(&ctx->counter_list);
				496	ctx->nr_counters = 0;
				497	ctx->task = task;
				498	}
				499	/*
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	500	* Initialize the perf_counter context in task_struct
				501	*/
				502	void perf_counter_init_task(struct task_struct *task)
				503	{
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	504	__perf_counter_init_context(&task->perf_counter_ctx, task);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	505	}
				506
				507	/*
				508	* Cross CPU call to read the hardware counter
				509	*/
				510	static void __hw_perf_counter_read(void *info)
				511	{
				512	hw_perf_counter_read(info);
				513	}
				514
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	515	static u64 perf_counter_read(struct perf_counter *counter)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	516	{
				517	/*
				518	* If counter is enabled and currently active on a CPU, update the
				519	* value in the counter structure:
				520	*/
				521	if (counter->active) {
				522	smp_call_function_single(counter->oncpu,
				523	__hw_perf_counter_read, counter, 1);
				524	}
				525
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	526	return perf_counter_read_safe(counter);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	527	}
				528
/*
 * Cross CPU call to switch performance data pointers
 *
 * Swaps counter->irqdata and counter->usrdata.
 *
 * @info: the struct perf_counter whose buffers are swapped.
 */
static void __perf_switch_irq_data(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task) {
		if (cpuctx->task_ctx != ctx)
			return;
		spin_lock(&ctx->lock);
	}

	/* Change the pointer NMI safe */
	atomic_long_set((atomic_long_t *)&counter->irqdata,
			(unsigned long) counter->usrdata);
	counter->usrdata = oldirqdata;

	/* The lock was only taken for task contexts. */
	if (ctx->task)
		spin_unlock(&ctx->lock);
}
				558
				559	static struct perf_data perf_switch_irq_data(struct perf_counter counter)
				560	{
				561	struct perf_counter_context *ctx = counter->ctx;
				562	struct perf_data *oldirqdata = counter->irqdata;
				563	struct task_struct *task = ctx->task;
				564
				565	if (!task) {
				566	smp_call_function_single(counter->cpu,
				567	__perf_switch_irq_data,
				568	counter, 1);
				569	return counter->usrdata;
				570	}
				571
				572	retry:
				573	spin_lock_irq(&ctx->lock);
				574	if (!counter->active) {
				575	counter->irqdata = counter->usrdata;
				576	counter->usrdata = oldirqdata;
				577	spin_unlock_irq(&ctx->lock);
				578	return oldirqdata;
				579	}
				580	spin_unlock_irq(&ctx->lock);
				581	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
				582	/* Might have failed, because task was scheduled out */
				583	if (counter->irqdata == oldirqdata)
				584	goto retry;
				585
				586	return counter->usrdata;
				587	}
				588
				589	static void put_context(struct perf_counter_context *ctx)
				590	{
				591	if (ctx->task)
				592	put_task_struct(ctx->task);
				593	}
				594
				595	static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
				596	{
				597	struct perf_cpu_context *cpuctx;
				598	struct perf_counter_context *ctx;
				599	struct task_struct *task;
				600
				601	/*
				602	* If cpu is not a wildcard then this is a percpu counter:
				603	*/
				604	if (cpu != -1) {
				605	/* Must be root to operate on a CPU counter: */
				606	if (!capable(CAP_SYS_ADMIN))
				607	return ERR_PTR(-EACCES);
				608
				609	if (cpu < 0 \|\| cpu > num_possible_cpus())
				610	return ERR_PTR(-EINVAL);
				611
				612	/*
				613	* We could be clever and allow to attach a counter to an
				614	* offline CPU and activate it when the CPU comes up, but
				615	* that's for later.
				616	*/
				617	if (!cpu_isset(cpu, cpu_online_map))
				618	return ERR_PTR(-ENODEV);
				619
				620	cpuctx = &per_cpu(perf_cpu_context, cpu);
				621	ctx = &cpuctx->ctx;
				622
				623	WARN_ON_ONCE(ctx->task);
				624	return ctx;
				625	}
				626
				627	rcu_read_lock();
				628	if (!pid)
				629	task = current;
				630	else
				631	task = find_task_by_vpid(pid);
				632	if (task)
				633	get_task_struct(task);
				634	rcu_read_unlock();
				635
				636	if (!task)
				637	return ERR_PTR(-ESRCH);
				638
				639	ctx = &task->perf_counter_ctx;
				640	ctx->task = task;
				641
				642	/* Reuse ptrace permission checks for now. */
				643	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
				644	put_context(ctx);
				645	return ERR_PTR(-EACCES);
				646	}
				647
				648	return ctx;
				649	}
				650
				651	/*
				652	* Called when the last reference to the file is gone.
				653	*/
				654	static int perf_release(struct inode inode, struct file file)
				655	{
				656	struct perf_counter *counter = file->private_data;
				657	struct perf_counter_context *ctx = counter->ctx;
				658
				659	file->private_data = NULL;
				660
				661	mutex_lock(&counter->mutex);
				662
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	663	perf_counter_remove_from_context(counter);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	664	put_context(ctx);
				665
				666	mutex_unlock(&counter->mutex);
				667
				668	kfree(counter);
				669
				670	return 0;
				671	}
				672
				673	/*
				674	* Read the performance counter - simple non blocking version for now
				675	*/
				676	static ssize_t
				677	perf_read_hw(struct perf_counter counter, char __user buf, size_t count)
				678	{
				679	u64 cntval;
				680
				681	if (count != sizeof(cntval))
				682	return -EINVAL;
				683
				684	mutex_lock(&counter->mutex);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	685	cntval = perf_counter_read(counter);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	686	mutex_unlock(&counter->mutex);
				687
				688	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
				689	}
				690
				691	static ssize_t
				692	perf_copy_usrdata(struct perf_data usrdata, char __user buf, size_t count)
				693	{
				694	if (!usrdata->len)
				695	return 0;
				696
				697	count = min(count, (size_t)usrdata->len);
				698	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
				699	return -EFAULT;
				700
				701	/* Adjust the counters */
				702	usrdata->len -= count;
				703	if (!usrdata->len)
				704	usrdata->rd_idx = 0;
				705	else
				706	usrdata->rd_idx += count;
				707
				708	return count;
				709	}
				710
				711	static ssize_t
				712	perf_read_irq_data(struct perf_counter *counter,
				713	char __user *buf,
				714	size_t count,
				715	int nonblocking)
				716	{
				717	struct perf_data irqdata, usrdata;
				718	DECLARE_WAITQUEUE(wait, current);
				719	ssize_t res;
				720
				721	irqdata = counter->irqdata;
				722	usrdata = counter->usrdata;
				723
				724	if (usrdata->len + irqdata->len >= count)
				725	goto read_pending;
				726
				727	if (nonblocking)
				728	return -EAGAIN;
				729
				730	spin_lock_irq(&counter->waitq.lock);
				731	__add_wait_queue(&counter->waitq, &wait);
				732	for (;;) {
				733	set_current_state(TASK_INTERRUPTIBLE);
				734	if (usrdata->len + irqdata->len >= count)
				735	break;
				736
				737	if (signal_pending(current))
				738	break;
				739
				740	spin_unlock_irq(&counter->waitq.lock);
				741	schedule();
				742	spin_lock_irq(&counter->waitq.lock);
				743	}
				744	__remove_wait_queue(&counter->waitq, &wait);
				745	__set_current_state(TASK_RUNNING);
				746	spin_unlock_irq(&counter->waitq.lock);
				747
				748	if (usrdata->len + irqdata->len < count)
				749	return -ERESTARTSYS;
				750	read_pending:
				751	mutex_lock(&counter->mutex);
				752
				753	/* Drain pending data first: */
				754	res = perf_copy_usrdata(usrdata, buf, count);
				755	if (res < 0 \|\| res == count)
				756	goto out;
				757
				758	/* Switch irq buffer: */
				759	usrdata = perf_switch_irq_data(counter);
				760	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
				761	if (!res)
				762	res = -EFAULT;
				763	} else {
				764	res = count;
				765	}
				766	out:
				767	mutex_unlock(&counter->mutex);
				768
				769	return res;
				770	}
				771
				772	static ssize_t
				773	perf_read(struct file file, char __user buf, size_t count, loff_t *ppos)
				774	{
				775	struct perf_counter *counter = file->private_data;
				776
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	777	switch (counter->hw_event.record_type) {
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	778	case PERF_RECORD_SIMPLE:
				779	return perf_read_hw(counter, buf, count);
				780
				781	case PERF_RECORD_IRQ:
				782	case PERF_RECORD_GROUP:
				783	return perf_read_irq_data(counter, buf, count,
				784	file->f_flags & O_NONBLOCK);
				785	}
				786	return -EINVAL;
				787	}
				788
				789	static unsigned int perf_poll(struct file file, poll_table wait)
				790	{
				791	struct perf_counter *counter = file->private_data;
				792	unsigned int events = 0;
				793	unsigned long flags;
				794
				795	poll_wait(file, &counter->waitq, wait);
				796
				797	spin_lock_irqsave(&counter->waitq.lock, flags);
				798	if (counter->usrdata->len \|\| counter->irqdata->len)
				799	events \|= POLLIN;
				800	spin_unlock_irqrestore(&counter->waitq.lock, flags);
				801
				802	return events;
				803	}
				804
				805	static const struct file_operations perf_fops = {
				806	.release = perf_release,
				807	.read = perf_read,
				808	.poll = perf_poll,
				809	};
				810
				811	/*
				812	* Allocate and initialize a counter structure
				813	*/
				814	static struct perf_counter *
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	815	perf_counter_alloc(struct perf_counter_hw_event *hw_event,
				816	int cpu,
				817	struct perf_counter *group_leader)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	818	{
				819	struct perf_counter counter = kzalloc(sizeof(counter), GFP_KERNEL);
				820
				821	if (!counter)
				822	return NULL;
				823
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	824	/*
				825	* Single counters are their own group leaders, with an
				826	* empty sibling list:
				827	*/
				828	if (!group_leader)
				829	group_leader = counter;
				830
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	831	mutex_init(&counter->mutex);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	832	INIT_LIST_HEAD(&counter->list_entry);
				833	INIT_LIST_HEAD(&counter->sibling_list);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	834	init_waitqueue_head(&counter->waitq);
				835
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	836	counter->irqdata = &counter->data[0];
				837	counter->usrdata = &counter->data[1];
				838	counter->cpu = cpu;
				839	counter->hw_event = *hw_event;
				840	counter->wakeup_pending = 0;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	841	counter->group_leader = group_leader;
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	842
				843	return counter;
				844	}
				845
				846	/**
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	847	* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
				848	*
				849	* @hw_event_uptr: event type attributes for monitoring/sampling
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	850	* @pid: target pid
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	851	* @cpu: target cpu
				852	* @group_fd: group leader counter fd
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	853	*/
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	854	asmlinkage int sys_perf_counter_open(
				855
				856	struct perf_counter_hw_event *hw_event_uptr __user,
				857	pid_t pid,
				858	int cpu,
				859	int group_fd)
				860
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	861	{
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	862	struct perf_counter counter, group_leader;
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	863	struct perf_counter_hw_event hw_event;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	864	struct perf_counter_context *ctx;
				865	struct file *group_file = NULL;
				866	int fput_needed = 0;
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	867	int ret;
				868
Ingo Molnar	9f66a38	2008-12-10 12:33:23 +0100	[diff] [blame]	869	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
Thomas Gleixner	eab656a	2008-12-08 19:26:59 +0100	[diff] [blame]	870	return -EFAULT;
				871
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	872	/*
Ingo Molnar	ccff286	2008-12-11 11:26:29 +0100	[diff] [blame^]	873	* Get the target context (task or percpu):
				874	*/
				875	ctx = find_get_context(pid, cpu);
				876	if (IS_ERR(ctx))
				877	return PTR_ERR(ctx);
				878
				879	/*
				880	* Look up the group leader (we will attach this counter to it):
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	881	*/
				882	group_leader = NULL;
				883	if (group_fd != -1) {
				884	ret = -EINVAL;
				885	group_file = fget_light(group_fd, &fput_needed);
				886	if (!group_file)
Ingo Molnar	ccff286	2008-12-11 11:26:29 +0100	[diff] [blame^]	887	goto err_put_context;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	888	if (group_file->f_op != &perf_fops)
Ingo Molnar	ccff286	2008-12-11 11:26:29 +0100	[diff] [blame^]	889	goto err_put_context;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	890
				891	group_leader = group_file->private_data;
				892	/*
Ingo Molnar	ccff286	2008-12-11 11:26:29 +0100	[diff] [blame^]	893	* Do not allow a recursive hierarchy (this new sibling
				894	* becoming part of another group-sibling):
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	895	*/
Ingo Molnar	ccff286	2008-12-11 11:26:29 +0100	[diff] [blame^]	896	if (group_leader->group_leader != group_leader)
				897	goto err_put_context;
				898	/*
				899	* Do not allow to attach to a group in a different
				900	* task or CPU context:
				901	*/
				902	if (group_leader->ctx != ctx)
				903	goto err_put_context;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	904	}
				905
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	906	ret = -ENOMEM;
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	907	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	908	if (!counter)
				909	goto err_put_context;
				910
Thomas Gleixner	dfa7c89	2008-12-08 19:35:37 +0100	[diff] [blame]	911	ret = hw_perf_counter_init(counter);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	912	if (ret)
				913	goto err_free_put_context;
				914
				915	perf_install_in_context(ctx, counter, cpu);
				916
				917	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
				918	if (ret < 0)
				919	goto err_remove_free_put_context;
				920
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	921	out_fput:
				922	fput_light(group_file, fput_needed);
				923
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	924	return ret;
				925
				926	err_remove_free_put_context:
				927	mutex_lock(&counter->mutex);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	928	perf_counter_remove_from_context(counter);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	929	mutex_unlock(&counter->mutex);
				930
				931	err_free_put_context:
				932	kfree(counter);
				933
				934	err_put_context:
				935	put_context(ctx);
				936
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	937	goto out_fput;
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	938	}
				939
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	940	static void __cpuinit perf_counter_init_cpu(int cpu)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	941	{
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	942	struct perf_cpu_context *cpuctx;
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	943
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	944	cpuctx = &per_cpu(perf_cpu_context, cpu);
				945	__perf_counter_init_context(&cpuctx->ctx, NULL);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	946
				947	mutex_lock(&perf_resource_mutex);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	948	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	949	mutex_unlock(&perf_resource_mutex);
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	950
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	951	hw_perf_counter_setup();
				952	}
				953
				954	#ifdef CONFIG_HOTPLUG_CPU
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	955	static void __perf_counter_exit_cpu(void *info)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	956	{
				957	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
				958	struct perf_counter_context *ctx = &cpuctx->ctx;
				959	struct perf_counter counter, tmp;
				960
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	961	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
				962	__perf_counter_remove_from_context(counter);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	963
				964	}
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	965	static void perf_counter_exit_cpu(int cpu)
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	966	{
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	967	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	968	}
				969	#else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	971	#endif
				972
				973	static int __cpuinit
				974	perf_cpu_notify(struct notifier_block self, unsigned long action, void hcpu)
				975	{
				976	unsigned int cpu = (long)hcpu;
				977
				978	switch (action) {
				979
				980	case CPU_UP_PREPARE:
				981	case CPU_UP_PREPARE_FROZEN:
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	982	perf_counter_init_cpu(cpu);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	983	break;
				984
				985	case CPU_DOWN_PREPARE:
				986	case CPU_DOWN_PREPARE_FROZEN:
Ingo Molnar	04289bb	2008-12-11 08:38:42 +0100	[diff] [blame]	987	perf_counter_exit_cpu(cpu);
Thomas Gleixner	0793a61	2008-12-04 20:12:29 +0100	[diff] [blame]	988	break;
				989
				990	default:
				991	break;
				992	}
				993
				994	return NOTIFY_OK;
				995	}
				996
				997	static struct notifier_block __cpuinitdata perf_cpu_nb = {
				998	.notifier_call = perf_cpu_notify,
				999	};
				1000
				1001	static int __init perf_counter_init(void)
				1002	{
				1003	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
				1004	(void *)(long)smp_processor_id());
				1005	register_cpu_notifier(&perf_cpu_nb);
				1006
				1007	return 0;
				1008	}
				1009	early_initcall(perf_counter_init);
				1010
				1011	static ssize_t perf_show_reserve_percpu(struct sysdev_class class, char buf)
				1012	{
				1013	return sprintf(buf, "%d\n", perf_reserved_percpu);
				1014	}
				1015
				1016	static ssize_t
				1017	perf_set_reserve_percpu(struct sysdev_class *class,
				1018	const char *buf,
				1019	size_t count)
				1020	{
				1021	struct perf_cpu_context *cpuctx;
				1022	unsigned long val;
				1023	int err, cpu, mpt;
				1024
				1025	err = strict_strtoul(buf, 10, &val);
				1026	if (err)
				1027	return err;
				1028	if (val > perf_max_counters)
				1029	return -EINVAL;
				1030
				1031	mutex_lock(&perf_resource_mutex);
				1032	perf_reserved_percpu = val;
				1033	for_each_online_cpu(cpu) {
				1034	cpuctx = &per_cpu(perf_cpu_context, cpu);
				1035	spin_lock_irq(&cpuctx->ctx.lock);
				1036	mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
				1037	perf_max_counters - perf_reserved_percpu);
				1038	cpuctx->max_pertask = mpt;
				1039	spin_unlock_irq(&cpuctx->ctx.lock);
				1040	}
				1041	mutex_unlock(&perf_resource_mutex);
				1042
				1043	return count;
				1044	}
				1045
				1046	static ssize_t perf_show_overcommit(struct sysdev_class class, char buf)
				1047	{
				1048	return sprintf(buf, "%d\n", perf_overcommit);
				1049	}
				1050
				1051	static ssize_t
				1052	perf_set_overcommit(struct sysdev_class class, const char buf, size_t count)
				1053	{
				1054	unsigned long val;
				1055	int err;
				1056
				1057	err = strict_strtoul(buf, 10, &val);
				1058	if (err)
				1059	return err;
				1060	if (val > 1)
				1061	return -EINVAL;
				1062
				1063	mutex_lock(&perf_resource_mutex);
				1064	perf_overcommit = val;
				1065	mutex_unlock(&perf_resource_mutex);
				1066
				1067	return count;
				1068	}
				1069
				1070	static SYSDEV_CLASS_ATTR(
				1071	reserve_percpu,
				1072	0644,
				1073	perf_show_reserve_percpu,
				1074	perf_set_reserve_percpu
				1075	);
				1076
				1077	static SYSDEV_CLASS_ATTR(
				1078	overcommit,
				1079	0644,
				1080	perf_show_overcommit,
				1081	perf_set_overcommit
				1082	);
				1083
				1084	static struct attribute *perfclass_attrs[] = {
				1085	&attr_reserve_percpu.attr,
				1086	&attr_overcommit.attr,
				1087	NULL
				1088	};
				1089
				1090	static struct attribute_group perfclass_attr_group = {
				1091	.attrs = perfclass_attrs,
				1092	.name = "perf_counters",
				1093	};
				1094
				1095	static int __init perf_counter_sysfs_init(void)
				1096	{
				1097	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				1098	&perfclass_attr_group);
				1099	}
				1100	device_initcall(perf_counter_sysfs_init);
				1101