/*
 * Performance counter core code
 *
 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/perf_counter.h>

/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);

/*
 * Architecture provided APIs - weak aliases:
 */

int __weak hw_perf_counter_init(struct perf_counter *counter)
{
	return -EINVAL;
}

void __weak hw_perf_counter_enable(struct perf_counter *counter) { }
void __weak hw_perf_counter_disable(struct perf_counter *counter) { }
void __weak hw_perf_counter_read(struct perf_counter *counter) { }
void __weak hw_perf_disable_all(void) { }
void __weak hw_perf_enable_all(void) { }
void __weak hw_perf_counter_setup(void) { }

#if BITS_PER_LONG == 64

/*
 * Read the cached counter value in counter->count, safe against
 * cross CPU / NMI modifications. 64 bit version - no complications.
 */
static inline u64 perf_counter_read_safe(struct perf_counter *counter)
{
	return (u64) atomic64_read(&counter->count);
}

#else

/*
 * Read the cached counter value in counter->count, safe against
 * cross CPU / NMI modifications. 32 bit version.
 */
static u64 perf_counter_read_safe(struct perf_counter *counter)
{
	u32 cntl, cnth;

	local_irq_disable();
	do {
		cnth = atomic_read(&counter->count32[1]);
		cntl = atomic_read(&counter->count32[0]);
	} while (cnth != atomic_read(&counter->count32[1]));

	local_irq_enable();

	return cntl | ((u64) cnth) << 32;
}

#endif

static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *group_leader = counter->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling counter,
	 * add it straight to the context's counter list, or to the group
	 * leader's sibling list:
	 */
	if (counter->group_leader == counter)
		list_add_tail(&counter->list_entry, &ctx->counter_list);
	else
		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
}

static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *sibling, *tmp;

	list_del_init(&counter->list_entry);

	if (list_empty(&counter->sibling_list))
		return;

	/*
	 * If this was a group counter with sibling counters then
	 * upgrade the siblings to singleton counters by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp,
				 &counter->sibling_list, list_entry) {

		list_del_init(&sibling->list_entry);
		list_add_tail(&sibling->list_entry, &ctx->counter_list);
		WARN_ON_ONCE(!sibling->group_leader);
		WARN_ON_ONCE(sibling->group_leader == sibling);
		sibling->group_leader = sibling;
	}
}

/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	if (counter->active) {
		hw_perf_counter_disable(counter);
		counter->active = 0;
		ctx->nr_active--;
		cpuctx->active_oncpu--;
		counter->task = NULL;
	}
	ctx->nr_counters--;

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	hw_perf_disable_all();
	list_del_counter(counter, ctx);
	hw_perf_enable_all();

	if (!ctx->task) {
		/*
		 * Allow more per task counters with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_counters - ctx->nr_counters,
			    perf_max_counters - perf_reserved_percpu);
	}

	spin_unlock(&ctx->lock);
}

/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(counter->cpu,
					 __perf_counter_remove_from_context,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_remove_from_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the counter safely if the call above did not succeed.
	 */
	if (!list_empty(&counter->list_entry)) {
		ctx->nr_counters--;
		list_del_counter(counter, ctx);
		counter->task = NULL;
	}
	spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	int cpu = smp_processor_id();

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	hw_perf_disable_all();
	list_add_counter(counter, ctx);
	hw_perf_enable_all();

	ctx->nr_counters++;

	if (cpuctx->active_oncpu < perf_max_counters) {
		hw_perf_counter_enable(counter);
		counter->active = 1;
		counter->oncpu = cpu;
		ctx->nr_active++;
		cpuctx->active_oncpu++;
	}

	if (!ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

	spin_unlock(&ctx->lock);
}

/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
			struct perf_counter *counter,
			int cpu)
{
	struct task_struct *task = ctx->task;

	counter->ctx = ctx;
	if (!task) {
		/*
		 * Per cpu counters are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 counter, 1);
		return;
	}

	counter->task = task;
retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the counter has not been added
	 * we need to retry the smp call.
	 */
	if (ctx->nr_active && list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the counter safely if the call above did not succeed.
	 */
	if (list_empty(&counter->list_entry)) {
		list_add_counter(counter, ctx);
		ctx->nr_counters++;
	}
	spin_unlock_irq(&ctx->lock);
}

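/*
 * Deactivate a single counter: disable it in hardware, mark it
 * inactive and update the active-counter bookkeeping:
 */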
static void
counter_sched_out(struct perf_counter *counter,
		  struct perf_cpu_context *cpuctx,
		  struct perf_counter_context *ctx)
{
	if (!counter->active)
		return;

	hw_perf_counter_disable(counter);
	counter->active = 0;
	counter->oncpu = -1;

	cpuctx->active_oncpu--;
	ctx->nr_active--;
}

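/*
 * Schedule out a counter group: the group leader first, then all
 * of its siblings:
 */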
static void
group_sched_out(struct perf_counter *group_counter,
		struct perf_cpu_context *cpuctx,
		struct perf_counter_context *ctx)
{
	struct perf_counter *counter;

	counter_sched_out(group_counter, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_out(counter, cpuctx, ctx);
}

/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but hw_perf_counter_disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!cpuctx->task_ctx))
		return;

	spin_lock(&ctx->lock);
	if (ctx->nr_active) {
		list_for_each_entry(counter, &ctx->counter_list, list_entry)
			group_sched_out(counter, cpuctx, ctx);
	}
	spin_unlock(&ctx->lock);
	cpuctx->task_ctx = NULL;
}

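/*
 * Activate a single counter: enable it in hardware, record the CPU it
 * runs on and update the active-counter bookkeeping:
 */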
static void
counter_sched_in(struct perf_counter *counter,
		 struct perf_cpu_context *cpuctx,
		 struct perf_counter_context *ctx,
		 int cpu)
{
	if (!counter->active)
		return;

	hw_perf_counter_enable(counter);
	counter->active = 1;
	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */

	cpuctx->active_oncpu++;
	ctx->nr_active++;
}

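/*
 * Schedule in a counter group: the group leader first, then all
 * of its siblings:
 */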
static void
group_sched_in(struct perf_counter *group_counter,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx,
	       int cpu)
{
	struct perf_counter *counter;

	counter_sched_in(group_counter, cpuctx, ctx, cpu);

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_in(counter, cpuctx, ctx, cpu);
}

/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but hw_perf_counter_enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!ctx->nr_counters))
		return;

	spin_lock(&ctx->lock);
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (ctx->nr_active == cpuctx->max_pertask)
			break;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of counters:
		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		group_sched_in(counter, cpuctx, ctx, cpu);
	}
	spin_unlock(&ctx->lock);

	cpuctx->task_ctx = ctx;
}

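/*
 * Called from the scheduler tick. Reschedule the task's counters and
 * rotate the first counter to the end of the list, so that counters
 * beyond the per task limit also get their turn on the PMU over time:
 */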
void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!ctx->nr_counters))
		return;

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Rotate the first entry last (works just fine for group counters too):
	 */
	hw_perf_disable_all();
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		list_del(&counter->list_entry);
		list_add_tail(&counter->list_entry, &ctx->counter_list);
		break;
	}
	hw_perf_enable_all();

	spin_unlock(&ctx->lock);

	perf_counter_task_sched_in(curr, cpu);
}

/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	spin_lock_init(&ctx->lock);
	INIT_LIST_HEAD(&ctx->counter_list);
	ctx->nr_counters = 0;
	ctx->task = task;
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *task)
{
	__perf_counter_init_context(&task->perf_counter_ctx, task);
}

/*
 * Cross CPU call to read the hardware counter
 */
static void __hw_perf_counter_read(void *info)
{
	hw_perf_counter_read(info);
}

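/*
 * Read the current value of a counter. If the counter is active on a
 * CPU, make that CPU update the cached value first:
 */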
static u64 perf_counter_read(struct perf_counter *counter)
{
	/*
	 * If counter is enabled and currently active on a CPU, update the
	 * value in the counter structure:
	 */
	if (counter->active) {
		smp_call_function_single(counter->oncpu,
					 __hw_perf_counter_read, counter, 1);
	}

	return perf_counter_read_safe(counter);
}

/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task) {
		if (cpuctx->task_ctx != ctx)
			return;
		spin_lock(&ctx->lock);
	}

	/* Change the pointer NMI safe */
	atomic_long_set((atomic_long_t *)&counter->irqdata,
			(unsigned long) counter->usrdata);
	counter->usrdata = oldirqdata;

	if (ctx->task)
		spin_unlock(&ctx->lock);
}

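/*
 * Swap the irqdata and usrdata buffers of a counter, so that data
 * collected in IRQ context can be drained by a reader:
 */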
static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;
	struct task_struct *task = ctx->task;

	if (!task) {
		smp_call_function_single(counter->cpu,
					 __perf_switch_irq_data,
					 counter, 1);
		return counter->usrdata;
	}

retry:
	spin_lock_irq(&ctx->lock);
	if (!counter->active) {
		counter->irqdata = counter->usrdata;
		counter->usrdata = oldirqdata;
		spin_unlock_irq(&ctx->lock);
		return oldirqdata;
	}
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
	/* Might have failed, because task was scheduled out */
	if (counter->irqdata == oldirqdata)
		goto retry;

	return counter->usrdata;
}

static void put_context(struct perf_counter_context *ctx)
{
	if (ctx->task)
		put_task_struct(ctx->task);
}

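/*
 * Find the counter context for a pid/cpu pair: the per CPU context if
 * a CPU is specified (root only), otherwise the context of the target
 * task. A task reference is taken and later dropped via put_context():
 */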
static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct task_struct *task;

	/*
	 * If cpu is not a wildcard then this is a percpu counter:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU counter: */
		if (!capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow attaching a counter to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		WARN_ON_ONCE(ctx->task);
		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	ctx = &task->perf_counter_ctx;
	ctx->task = task;

	/* Reuse ptrace permission checks for now. */
	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
		put_context(ctx);
		return ERR_PTR(-EACCES);
	}

	return ctx;
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct perf_counter *counter = file->private_data;
	struct perf_counter_context *ctx = counter->ctx;

	file->private_data = NULL;

	mutex_lock(&counter->mutex);

	perf_counter_remove_from_context(counter);
	put_context(ctx);

	mutex_unlock(&counter->mutex);

	kfree(counter);

	return 0;
}

/*
 * Read the performance counter - simple non blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
	u64 cntval;

	if (count != sizeof(cntval))
		return -EINVAL;

	mutex_lock(&counter->mutex);
	cntval = perf_counter_read(counter);
	mutex_unlock(&counter->mutex);

	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}

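/*
 * Copy as much pending data as fits into the user buffer and advance
 * the read index accordingly:
 */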
static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
	if (!usrdata->len)
		return 0;

	count = min(count, (size_t)usrdata->len);
	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
		return -EFAULT;

	/* Adjust the counters */
	usrdata->len -= count;
	if (!usrdata->len)
		usrdata->rd_idx = 0;
	else
		usrdata->rd_idx += count;

	return count;
}

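/*
 * Read sampled data: wait (unless nonblocking) until enough data is
 * pending, then drain the user buffer and, if necessary, the switched
 * IRQ buffer as well:
 */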
static ssize_t
perf_read_irq_data(struct perf_counter *counter,
		   char __user *buf,
		   size_t count,
		   int nonblocking)
{
	struct perf_data *irqdata, *usrdata;
	DECLARE_WAITQUEUE(wait, current);
	ssize_t res;

	irqdata = counter->irqdata;
	usrdata = counter->usrdata;

	if (usrdata->len + irqdata->len >= count)
		goto read_pending;

	if (nonblocking)
		return -EAGAIN;

	spin_lock_irq(&counter->waitq.lock);
	__add_wait_queue(&counter->waitq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (usrdata->len + irqdata->len >= count)
			break;

		if (signal_pending(current))
			break;

		spin_unlock_irq(&counter->waitq.lock);
		schedule();
		spin_lock_irq(&counter->waitq.lock);
	}
	__remove_wait_queue(&counter->waitq, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&counter->waitq.lock);

	if (usrdata->len + irqdata->len < count)
		return -ERESTARTSYS;
read_pending:
	mutex_lock(&counter->mutex);

	/* Drain pending data first: */
	res = perf_copy_usrdata(usrdata, buf, count);
	if (res < 0 || res == count)
		goto out;

	/* Switch irq buffer: */
	usrdata = perf_switch_irq_data(counter);
	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
		if (!res)
			res = -EFAULT;
	} else {
		res = count;
	}
out:
	mutex_unlock(&counter->mutex);

	return res;
}

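/*
 * Dispatch a read to the plain counter read or to the sampled-data
 * read path, depending on the record type of the counter:
 */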
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_counter *counter = file->private_data;

	switch (counter->hw_event.record_type) {
	case PERF_RECORD_SIMPLE:
		return perf_read_hw(counter, buf, count);

	case PERF_RECORD_IRQ:
	case PERF_RECORD_GROUP:
		return perf_read_irq_data(counter, buf, count,
					  file->f_flags & O_NONBLOCK);
	}
	return -EINVAL;
}

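/*
 * Poll support: report POLLIN when either data buffer has pending data:
 */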
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_counter *counter = file->private_data;
	unsigned int events = 0;
	unsigned long flags;

	poll_wait(file, &counter->waitq, wait);

	spin_lock_irqsave(&counter->waitq.lock, flags);
	if (counter->usrdata->len || counter->irqdata->len)
		events |= POLLIN;
	spin_unlock_irqrestore(&counter->waitq.lock, flags);

	return events;
}

static const struct file_operations perf_fops = {
	.release	= perf_release,
	.read		= perf_read,
	.poll		= perf_poll,
};

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter *group_leader)
{
	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);

	if (!counter)
		return NULL;

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	counter->irqdata	= &counter->data[0];
	counter->usrdata	= &counter->data[1];
	counter->cpu		= cpu;
	counter->hw_event	= *hw_event;
	counter->wakeup_pending	= 0;
	counter->group_leader	= group_leader;

	return counter;
}

/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
asmlinkage int sys_perf_counter_open(
	struct perf_counter_hw_event *hw_event_uptr __user,
	pid_t pid,
	int cpu,
	int group_fd)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int ret;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Look up the group leader:
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto out_fput;
		if (group_file->f_op != &perf_fops)
			goto out_fput;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy:
		 */
		if (group_leader->group_leader)
			goto out_fput;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	ret = -ENOMEM;
	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
	if (!counter)
		goto err_put_context;

	ret = hw_perf_counter_init(counter);
	if (ret)
		goto err_free_put_context;

	perf_install_in_context(ctx, counter, cpu);

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_remove_free_put_context;

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_remove_free_put_context:
	mutex_lock(&counter->mutex);
	perf_counter_remove_from_context(counter);
	mutex_unlock(&counter->mutex);

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}

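/*
 * Set up the per CPU context and counter reservation for a CPU that is
 * coming online:
 */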
static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup();
}

#ifdef CONFIG_HOTPLUG_CPU
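/*
 * Remove all counters of the per CPU context when a CPU goes down;
 * __perf_counter_exit_cpu() runs on that CPU via an smp call:
 */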
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}

static void perf_counter_exit_cpu(int cpu)
{
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call	= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);

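/*
 * sysfs attributes for the per CPU counter reservation and the
 * overcommit setting:
 */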
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs		= perfclass_attrs,
	.name		= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);