Blame - kernel/cgroup.c - android_kernel_htc_msm8960

blob: 883928c0e147ff6a99da00cff89c191b230f0b25 [file] [log] [blame]

Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1	/*
				2	* kernel/cgroup.c
				3	*
				4	* Generic process-grouping system.
				5	*
				6	* Based originally on the cpuset system, extracted by Paul Menage
				7	* Copyright (C) 2006 Google, Inc
				8	*
				9	* Copyright notices from the original cpuset code:
				10	* --------------------------------------------------
				11	* Copyright (C) 2003 BULL SA.
				12	* Copyright (C) 2004-2006 Silicon Graphics, Inc.
				13	*
				14	* Portions derived from Patrick Mochel's sysfs code.
				15	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				16	*
				17	* 2003-10-10 Written by Simon Derr.
				18	* 2003-10-22 Updates by Stephen Hemminger.
				19	* 2004 May-July Rework by Paul Jackson.
				20	* ---------------------------------------------------
				21	*
				22	* This file is subject to the terms and conditions of the GNU General Public
				23	* License. See the file COPYING in the main directory of the Linux
				24	* distribution for more details.
				25	*/
				26
				27	#include <linux/cgroup.h>
				28	#include <linux/errno.h>
				29	#include <linux/fs.h>
				30	#include <linux/kernel.h>
				31	#include <linux/list.h>
				32	#include <linux/mm.h>
				33	#include <linux/mutex.h>
				34	#include <linux/mount.h>
				35	#include <linux/pagemap.h>
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	36	#include <linux/proc_fs.h>
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	37	#include <linux/rcupdate.h>
				38	#include <linux/sched.h>
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	39	#include <linux/backing-dev.h>
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	40	#include <linux/seq_file.h>
				41	#include <linux/slab.h>
				42	#include <linux/magic.h>
				43	#include <linux/spinlock.h>
				44	#include <linux/string.h>
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	45	#include <linux/sort.h>
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	46	#include <asm/atomic.h>
				47
				48	/* Generate an array of cgroup subsystem pointers */
				49	#define SUBSYS(_x) &_x ## _subsys,
				50
				51	static struct cgroup_subsys *subsys[] = {
				52	#include <linux/cgroup_subsys.h>
				53	};
				54
				55	/*
				56	* A cgroupfs_root represents the root of a cgroup hierarchy,
				57	* and may be associated with a superblock to form an active
				58	* hierarchy
				59	*/
				60	struct cgroupfs_root {
				61	struct super_block *sb;
				62
				63	/*
				64	* The bitmask of subsystems intended to be attached to this
				65	* hierarchy
				66	*/
				67	unsigned long subsys_bits;
				68
				69	/* The bitmask of subsystems currently attached to this hierarchy */
				70	unsigned long actual_subsys_bits;
				71
				72	/* A list running through the attached subsystems */
				73	struct list_head subsys_list;
				74
				75	/* The root cgroup for this hierarchy */
				76	struct cgroup top_cgroup;
				77
				78	/* Tracks how many cgroups are currently defined in hierarchy.*/
				79	int number_of_cgroups;
				80
				81	/* A list running through the mounted hierarchies */
				82	struct list_head root_list;
				83
				84	/* Hierarchy-specific flags */
				85	unsigned long flags;
				86	};
				87
				88
				89	/*
				90	* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
				91	* subsystems that are otherwise unattached - it never has more than a
				92	* single cgroup, and all tasks are part of that cgroup.
				93	*/
				94	static struct cgroupfs_root rootnode;
				95
				96	/* The list of hierarchy roots */
				97
				98	static LIST_HEAD(roots);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	99	static int root_count;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	100
				101	/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
				102	#define dummytop (&rootnode.top_cgroup)
				103
				104	/* This flag indicates whether tasks in the fork and exit paths should
				105	* take callback_mutex and check for fork/exit handlers to call. This
				106	* avoids us having to do extra work in the fork/exit path if none of the
				107	* subsystems need to be called.
				108	*/
				109	static int need_forkexit_callback;
				110
				111	/* bits in struct cgroup flags field */
				112	enum {
				113	CONT_REMOVED,
				114	};
				115
				116	/* convenient tests for these bits */
				117	inline int cgroup_is_removed(const struct cgroup *cont)
				118	{
				119	return test_bit(CONT_REMOVED, &cont->flags);
				120	}
				121
				122	/* bits in struct cgroupfs_root flags field */
				123	enum {
				124	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
				125	};
				126
				127	/*
				128	* for_each_subsys() allows you to iterate on each subsystem attached to
				129	* an active hierarchy
				130	*/
				131	#define for_each_subsys(_root, _ss) \
				132	list_for_each_entry(_ss, &_root->subsys_list, sibling)
				133
				134	/* for_each_root() allows you to iterate across the active hierarchies */
				135	#define for_each_root(_root) \
				136	list_for_each_entry(_root, &roots, root_list)
				137
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	138	/* Link structure for associating css_set objects with cgroups */
				139	struct cg_cgroup_link {
				140	/*
				141	* List running through cg_cgroup_links associated with a
				142	* cgroup, anchored on cgroup->css_sets
				143	*/
				144	struct list_head cont_link_list;
				145	/*
				146	* List running through cg_cgroup_links pointing at a
				147	* single css_set object, anchored on css_set->cg_links
				148	*/
				149	struct list_head cg_link_list;
				150	struct css_set *cg;
				151	};
				152
				153	/* The default css_set - used by init and its children prior to any
				154	* hierarchies being mounted. It contains a pointer to the root state
				155	* for each subsystem. Also used to anchor the list of css_sets. Not
				156	* reference-counted, to improve performance when child cgroups
				157	* haven't been created.
				158	*/
				159
				160	static struct css_set init_css_set;
				161	static struct cg_cgroup_link init_css_set_link;
				162
				163	/* css_set_lock protects the list of css_set objects, and the
				164	* chain of tasks off each css_set. Nests outside task->alloc_lock
				165	* due to cgroup_iter_start() */
				166	static DEFINE_RWLOCK(css_set_lock);
				167	static int css_set_count;
				168
				169	/* We don't maintain the lists running through each css_set to its
				170	* task until after the first call to cgroup_iter_start(). This
				171	* reduces the fork()/exit() overhead for people who have cgroups
				172	* compiled into their kernel but not actually in use */
				173	static int use_task_css_set_links;
				174
				175	/* When we create or destroy a css_set, the operation simply
				176	* takes/releases a reference count on all the cgroups referenced
				177	* by subsystems in this css_set. This can end up multiple-counting
				178	* some cgroups, but that's OK - the ref-count is just a
				179	* busy/not-busy indicator; ensuring that we only count each cgroup
				180	* once would require taking a global lock to ensure that no
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	181	* subsystems moved between hierarchies while we were doing so.
				182	*
				183	* Possible TODO: decide at boot time based on the number of
				184	* registered subsystems and the number of CPUs or NUMA nodes whether
				185	* it's better for performance to ref-count every subsystem, or to
				186	* take a global lock and only add one ref count to each hierarchy.
				187	*/
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	188
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	189	/*
				190	* unlink a css_set from the list and free it
				191	*/
				192	static void release_css_set(struct kref *k)
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	193	{
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	194	struct css_set *cg = container_of(k, struct css_set, ref);
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	195	int i;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	196
				197	write_lock(&css_set_lock);
				198	list_del(&cg->list);
				199	css_set_count--;
				200	while (!list_empty(&cg->cg_links)) {
				201	struct cg_cgroup_link *link;
				202	link = list_entry(cg->cg_links.next,
				203	struct cg_cgroup_link, cg_link_list);
				204	list_del(&link->cg_link_list);
				205	list_del(&link->cont_link_list);
				206	kfree(link);
				207	}
				208	write_unlock(&css_set_lock);
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	209	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
				210	atomic_dec(&cg->subsys[i]->cgroup->count);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	211	kfree(cg);
				212	}
				213
				214	/*
				215	* refcounted get/put for css_set objects
				216	*/
				217	static inline void get_css_set(struct css_set *cg)
				218	{
				219	kref_get(&cg->ref);
				220	}
				221
				222	static inline void put_css_set(struct css_set *cg)
				223	{
				224	kref_put(&cg->ref, release_css_set);
				225	}
				226
				227	/*
				228	* find_existing_css_set() is a helper for
				229	* find_css_set(), and checks to see whether an existing
				230	* css_set is suitable. This currently walks a linked-list for
				231	* simplicity; a later patch will use a hash table for better
				232	* performance
				233	*
				234	* oldcg: the cgroup group that we're using before the cgroup
				235	* transition
				236	*
				237	* cont: the cgroup that we're moving into
				238	*
				239	* template: location in which to build the desired set of subsystem
				240	* state objects for the new cgroup group
				241	*/
				242
				243	static struct css_set *find_existing_css_set(
				244	struct css_set *oldcg,
				245	struct cgroup *cont,
				246	struct cgroup_subsys_state *template[])
				247	{
				248	int i;
				249	struct cgroupfs_root *root = cont->root;
				250	struct list_head *l = &init_css_set.list;
				251
				252	/* Built the set of subsystem state objects that we want to
				253	* see in the new css_set */
				254	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				255	if (root->subsys_bits & (1ull << i)) {
				256	/* Subsystem is in this hierarchy. So we want
				257	* the subsystem state from the new
				258	* cgroup */
				259	template[i] = cont->subsys[i];
				260	} else {
				261	/* Subsystem is not in this hierarchy, so we
				262	* don't want to change the subsystem state */
				263	template[i] = oldcg->subsys[i];
				264	}
				265	}
				266
				267	/* Look through existing cgroup groups to find one to reuse */
				268	do {
				269	struct css_set *cg =
				270	list_entry(l, struct css_set, list);
				271
				272	if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
				273	/* All subsystems matched */
				274	return cg;
				275	}
				276	/* Try the next cgroup group */
				277	l = l->next;
				278	} while (l != &init_css_set.list);
				279
				280	/* No existing cgroup group matched */
				281	return NULL;
				282	}
				283
				284	/*
				285	* allocate_cg_links() allocates "count" cg_cgroup_link structures
				286	* and chains them on tmp through their cont_link_list fields. Returns 0 on
				287	* success or a negative error
				288	*/
				289
				290	static int allocate_cg_links(int count, struct list_head *tmp)
				291	{
				292	struct cg_cgroup_link *link;
				293	int i;
				294	INIT_LIST_HEAD(tmp);
				295	for (i = 0; i < count; i++) {
				296	link = kmalloc(sizeof(*link), GFP_KERNEL);
				297	if (!link) {
				298	while (!list_empty(tmp)) {
				299	link = list_entry(tmp->next,
				300	struct cg_cgroup_link,
				301	cont_link_list);
				302	list_del(&link->cont_link_list);
				303	kfree(link);
				304	}
				305	return -ENOMEM;
				306	}
				307	list_add(&link->cont_link_list, tmp);
				308	}
				309	return 0;
				310	}
				311
				312	static void free_cg_links(struct list_head *tmp)
				313	{
				314	while (!list_empty(tmp)) {
				315	struct cg_cgroup_link *link;
				316	link = list_entry(tmp->next,
				317	struct cg_cgroup_link,
				318	cont_link_list);
				319	list_del(&link->cont_link_list);
				320	kfree(link);
				321	}
				322	}
				323
				324	/*
				325	* find_css_set() takes an existing cgroup group and a
				326	* cgroup object, and returns a css_set object that's
				327	* equivalent to the old group, but with the given cgroup
				328	* substituted into the appropriate hierarchy. Must be called with
				329	* cgroup_mutex held
				330	*/
				331
				332	static struct css_set *find_css_set(
				333	struct css_set oldcg, struct cgroup cont)
				334	{
				335	struct css_set *res;
				336	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
				337	int i;
				338
				339	struct list_head tmp_cg_links;
				340	struct cg_cgroup_link *link;
				341
				342	/* First see if we already have a cgroup group that matches
				343	* the desired set */
				344	write_lock(&css_set_lock);
				345	res = find_existing_css_set(oldcg, cont, template);
				346	if (res)
				347	get_css_set(res);
				348	write_unlock(&css_set_lock);
				349
				350	if (res)
				351	return res;
				352
				353	res = kmalloc(sizeof(*res), GFP_KERNEL);
				354	if (!res)
				355	return NULL;
				356
				357	/* Allocate all the cg_cgroup_link objects that we'll need */
				358	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
				359	kfree(res);
				360	return NULL;
				361	}
				362
				363	kref_init(&res->ref);
				364	INIT_LIST_HEAD(&res->cg_links);
				365	INIT_LIST_HEAD(&res->tasks);
				366
				367	/* Copy the set of subsystem state objects generated in
				368	* find_existing_css_set() */
				369	memcpy(res->subsys, template, sizeof(res->subsys));
				370
				371	write_lock(&css_set_lock);
				372	/* Add reference counts and links from the new css_set. */
				373	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				374	struct cgroup *cont = res->subsys[i]->cgroup;
				375	struct cgroup_subsys *ss = subsys[i];
				376	atomic_inc(&cont->count);
				377	/*
				378	* We want to add a link once per cgroup, so we
				379	* only do it for the first subsystem in each
				380	* hierarchy
				381	*/
				382	if (ss->root->subsys_list.next == &ss->sibling) {
				383	BUG_ON(list_empty(&tmp_cg_links));
				384	link = list_entry(tmp_cg_links.next,
				385	struct cg_cgroup_link,
				386	cont_link_list);
				387	list_del(&link->cont_link_list);
				388	list_add(&link->cont_link_list, &cont->css_sets);
				389	link->cg = res;
				390	list_add(&link->cg_link_list, &res->cg_links);
				391	}
				392	}
				393	if (list_empty(&rootnode.subsys_list)) {
				394	link = list_entry(tmp_cg_links.next,
				395	struct cg_cgroup_link,
				396	cont_link_list);
				397	list_del(&link->cont_link_list);
				398	list_add(&link->cont_link_list, &dummytop->css_sets);
				399	link->cg = res;
				400	list_add(&link->cg_link_list, &res->cg_links);
				401	}
				402
				403	BUG_ON(!list_empty(&tmp_cg_links));
				404
				405	/* Link this cgroup group into the list */
				406	list_add(&res->list, &init_css_set.list);
				407	css_set_count++;
				408	INIT_LIST_HEAD(&res->tasks);
				409	write_unlock(&css_set_lock);
				410
				411	return res;
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	412	}
				413
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	414	/*
				415	* There is one global cgroup mutex. We also require taking
				416	* task_lock() when dereferencing a task's cgroup subsys pointers.
				417	* See "The task_lock() exception", at the end of this comment.
				418	*
				419	* A task must hold cgroup_mutex to modify cgroups.
				420	*
				421	* Any task can increment and decrement the count field without lock.
				422	* So in general, code holding cgroup_mutex can't rely on the count
				423	* field not changing. However, if the count goes to zero, then only
				424	* attach_task() can increment it again. Because a count of zero
				425	* means that no tasks are currently attached, therefore there is no
				426	* way a task attached to that cgroup can fork (the other way to
				427	* increment the count). So code holding cgroup_mutex can safely
				428	* assume that if the count is zero, it will stay zero. Similarly, if
				429	* a task holds cgroup_mutex on a cgroup with zero count, it
				430	* knows that the cgroup won't be removed, as cgroup_rmdir()
				431	* needs that mutex.
				432	*
				433	* The cgroup_common_file_write handler for operations that modify
				434	* the cgroup hierarchy holds cgroup_mutex across the entire operation,
				435	* single threading all such cgroup modifications across the system.
				436	*
				437	* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
				438	* (usually) take cgroup_mutex. These are the two most performance
				439	* critical pieces of code here. The exception occurs on cgroup_exit(),
				440	* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
				441	* is taken, and if the cgroup count is zero, a usermode call made
				442	* to /sbin/cgroup_release_agent with the name of the cgroup (path
				443	* relative to the root of cgroup file system) as the argument.
				444	*
				445	* A cgroup can only be deleted if both its 'count' of using tasks
				446	* is zero, and its list of 'children' cgroups is empty. Since all
				447	* tasks in the system use _some_ cgroup, and since there is always at
				448	* least one task in the system (init, pid == 1), therefore, top_cgroup
				449	* always has either children cgroups and/or using tasks. So we don't
				450	* need a special hack to ensure that top_cgroup cannot be deleted.
				451	*
				452	* The task_lock() exception
				453	*
				454	* The need for this exception arises from the action of
				455	* attach_task(), which overwrites one task's cgroup pointer with
				456	* another. It does so using cgroup_mutex, however there are
				457	* several performance critical places that need to reference
				458	* task->cgroup without the expense of grabbing a system global
				459	* mutex. Therefore except as noted below, when dereferencing or, as
				460	* in attach_task(), modifying a task's cgroup pointer we use
				461	* task_lock(), which acts on a spinlock (task->alloc_lock) already in
				462	* the task_struct routinely used for such matters.
				463	*
				464	* P.S. One more locking exception. RCU is used to guard the
				465	* update of a task's cgroup pointer by attach_task()
				466	*/
				467
				468	static DEFINE_MUTEX(cgroup_mutex);
				469
				470	/**
				471	* cgroup_lock - lock out any changes to cgroup structures
				472	*
				473	*/
				474
				475	void cgroup_lock(void)
				476	{
				477	mutex_lock(&cgroup_mutex);
				478	}
				479
				480	/**
				481	* cgroup_unlock - release lock on cgroup changes
				482	*
				483	* Undo the lock taken in a previous cgroup_lock() call.
				484	*/
				485
				486	void cgroup_unlock(void)
				487	{
				488	mutex_unlock(&cgroup_mutex);
				489	}
				490
				491	/*
				492	* A couple of forward declarations required, due to cyclic reference loop:
				493	* cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
				494	* cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
				495	* -> cgroup_mkdir.
				496	*/
				497
				498	static int cgroup_mkdir(struct inode dir, struct dentry dentry, int mode);
				499	static int cgroup_rmdir(struct inode unused_dir, struct dentry dentry);
				500	static int cgroup_populate_dir(struct cgroup *cont);
				501	static struct inode_operations cgroup_dir_inode_operations;
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	502	static struct file_operations proc_cgroupstats_operations;
				503
				504	static struct backing_dev_info cgroup_backing_dev_info = {
				505	.capabilities = BDI_CAP_NO_ACCT_DIRTY \| BDI_CAP_NO_WRITEBACK,
				506	};
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	507
				508	static struct inode cgroup_new_inode(mode_t mode, struct super_block sb)
				509	{
				510	struct inode *inode = new_inode(sb);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	511
				512	if (inode) {
				513	inode->i_mode = mode;
				514	inode->i_uid = current->fsuid;
				515	inode->i_gid = current->fsgid;
				516	inode->i_blocks = 0;
				517	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				518	inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
				519	}
				520	return inode;
				521	}
				522
				523	static void cgroup_diput(struct dentry dentry, struct inode inode)
				524	{
				525	/* is dentry a directory ? if so, kfree() associated cgroup */
				526	if (S_ISDIR(inode->i_mode)) {
				527	struct cgroup *cont = dentry->d_fsdata;
				528	BUG_ON(!(cgroup_is_removed(cont)));
				529	kfree(cont);
				530	}
				531	iput(inode);
				532	}
				533
				534	static void remove_dir(struct dentry *d)
				535	{
				536	struct dentry *parent = dget(d->d_parent);
				537
				538	d_delete(d);
				539	simple_rmdir(parent->d_inode, d);
				540	dput(parent);
				541	}
				542
				543	static void cgroup_clear_directory(struct dentry *dentry)
				544	{
				545	struct list_head *node;
				546
				547	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
				548	spin_lock(&dcache_lock);
				549	node = dentry->d_subdirs.next;
				550	while (node != &dentry->d_subdirs) {
				551	struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
				552	list_del_init(node);
				553	if (d->d_inode) {
				554	/* This should never be called on a cgroup
				555	* directory with child cgroups */
				556	BUG_ON(d->d_inode->i_mode & S_IFDIR);
				557	d = dget_locked(d);
				558	spin_unlock(&dcache_lock);
				559	d_delete(d);
				560	simple_unlink(dentry->d_inode, d);
				561	dput(d);
				562	spin_lock(&dcache_lock);
				563	}
				564	node = dentry->d_subdirs.next;
				565	}
				566	spin_unlock(&dcache_lock);
				567	}
				568
				569	/*
				570	* NOTE : the dentry must have been dget()'ed
				571	*/
				572	static void cgroup_d_remove_dir(struct dentry *dentry)
				573	{
				574	cgroup_clear_directory(dentry);
				575
				576	spin_lock(&dcache_lock);
				577	list_del_init(&dentry->d_u.d_child);
				578	spin_unlock(&dcache_lock);
				579	remove_dir(dentry);
				580	}
				581
				582	static int rebind_subsystems(struct cgroupfs_root *root,
				583	unsigned long final_bits)
				584	{
				585	unsigned long added_bits, removed_bits;
				586	struct cgroup *cont = &root->top_cgroup;
				587	int i;
				588
				589	removed_bits = root->actual_subsys_bits & ~final_bits;
				590	added_bits = final_bits & ~root->actual_subsys_bits;
				591	/* Check that any added subsystems are currently free */
				592	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				593	unsigned long long bit = 1ull << i;
				594	struct cgroup_subsys *ss = subsys[i];
				595	if (!(bit & added_bits))
				596	continue;
				597	if (ss->root != &rootnode) {
				598	/* Subsystem isn't free */
				599	return -EBUSY;
				600	}
				601	}
				602
				603	/* Currently we don't handle adding/removing subsystems when
				604	* any child cgroups exist. This is theoretically supportable
				605	* but involves complex error handling, so it's being left until
				606	* later */
				607	if (!list_empty(&cont->children))
				608	return -EBUSY;
				609
				610	/* Process each subsystem */
				611	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				612	struct cgroup_subsys *ss = subsys[i];
				613	unsigned long bit = 1UL << i;
				614	if (bit & added_bits) {
				615	/* We're binding this subsystem to this hierarchy */
				616	BUG_ON(cont->subsys[i]);
				617	BUG_ON(!dummytop->subsys[i]);
				618	BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
				619	cont->subsys[i] = dummytop->subsys[i];
				620	cont->subsys[i]->cgroup = cont;
				621	list_add(&ss->sibling, &root->subsys_list);
				622	rcu_assign_pointer(ss->root, root);
				623	if (ss->bind)
				624	ss->bind(ss, cont);
				625
				626	} else if (bit & removed_bits) {
				627	/* We're removing this subsystem */
				628	BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
				629	BUG_ON(cont->subsys[i]->cgroup != cont);
				630	if (ss->bind)
				631	ss->bind(ss, dummytop);
				632	dummytop->subsys[i]->cgroup = dummytop;
				633	cont->subsys[i] = NULL;
				634	rcu_assign_pointer(subsys[i]->root, &rootnode);
				635	list_del(&ss->sibling);
				636	} else if (bit & final_bits) {
				637	/* Subsystem state should already exist */
				638	BUG_ON(!cont->subsys[i]);
				639	} else {
				640	/* Subsystem state shouldn't exist */
				641	BUG_ON(cont->subsys[i]);
				642	}
				643	}
				644	root->subsys_bits = root->actual_subsys_bits = final_bits;
				645	synchronize_rcu();
				646
				647	return 0;
				648	}
				649
				650	static int cgroup_show_options(struct seq_file seq, struct vfsmount vfs)
				651	{
				652	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
				653	struct cgroup_subsys *ss;
				654
				655	mutex_lock(&cgroup_mutex);
				656	for_each_subsys(root, ss)
				657	seq_printf(seq, ",%s", ss->name);
				658	if (test_bit(ROOT_NOPREFIX, &root->flags))
				659	seq_puts(seq, ",noprefix");
				660	mutex_unlock(&cgroup_mutex);
				661	return 0;
				662	}
				663
				664	struct cgroup_sb_opts {
				665	unsigned long subsys_bits;
				666	unsigned long flags;
				667	};
				668
				669	/* Convert a hierarchy specifier into a bitmask of subsystems and
				670	* flags. */
				671	static int parse_cgroupfs_options(char *data,
				672	struct cgroup_sb_opts *opts)
				673	{
				674	char token, o = data ?: "all";
				675
				676	opts->subsys_bits = 0;
				677	opts->flags = 0;
				678
				679	while ((token = strsep(&o, ",")) != NULL) {
				680	if (!*token)
				681	return -EINVAL;
				682	if (!strcmp(token, "all")) {
				683	opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
				684	} else if (!strcmp(token, "noprefix")) {
				685	set_bit(ROOT_NOPREFIX, &opts->flags);
				686	} else {
				687	struct cgroup_subsys *ss;
				688	int i;
				689	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				690	ss = subsys[i];
				691	if (!strcmp(token, ss->name)) {
				692	set_bit(i, &opts->subsys_bits);
				693	break;
				694	}
				695	}
				696	if (i == CGROUP_SUBSYS_COUNT)
				697	return -ENOENT;
				698	}
				699	}
				700
				701	/* We can't have an empty hierarchy */
				702	if (!opts->subsys_bits)
				703	return -EINVAL;
				704
				705	return 0;
				706	}
				707
				708	static int cgroup_remount(struct super_block sb, int flags, char *data)
				709	{
				710	int ret = 0;
				711	struct cgroupfs_root *root = sb->s_fs_info;
				712	struct cgroup *cont = &root->top_cgroup;
				713	struct cgroup_sb_opts opts;
				714
				715	mutex_lock(&cont->dentry->d_inode->i_mutex);
				716	mutex_lock(&cgroup_mutex);
				717
				718	/* See what subsystems are wanted */
				719	ret = parse_cgroupfs_options(data, &opts);
				720	if (ret)
				721	goto out_unlock;
				722
				723	/* Don't allow flags to change at remount */
				724	if (opts.flags != root->flags) {
				725	ret = -EINVAL;
				726	goto out_unlock;
				727	}
				728
				729	ret = rebind_subsystems(root, opts.subsys_bits);
				730
				731	/* (re)populate subsystem files */
				732	if (!ret)
				733	cgroup_populate_dir(cont);
				734
				735	out_unlock:
				736	mutex_unlock(&cgroup_mutex);
				737	mutex_unlock(&cont->dentry->d_inode->i_mutex);
				738	return ret;
				739	}
				740
				741	static struct super_operations cgroup_ops = {
				742	.statfs = simple_statfs,
				743	.drop_inode = generic_delete_inode,
				744	.show_options = cgroup_show_options,
				745	.remount_fs = cgroup_remount,
				746	};
				747
				748	static void init_cgroup_root(struct cgroupfs_root *root)
				749	{
				750	struct cgroup *cont = &root->top_cgroup;
				751	INIT_LIST_HEAD(&root->subsys_list);
				752	INIT_LIST_HEAD(&root->root_list);
				753	root->number_of_cgroups = 1;
				754	cont->root = root;
				755	cont->top_cgroup = cont;
				756	INIT_LIST_HEAD(&cont->sibling);
				757	INIT_LIST_HEAD(&cont->children);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	758	INIT_LIST_HEAD(&cont->css_sets);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	759	}
				760
				761	static int cgroup_test_super(struct super_block sb, void data)
				762	{
				763	struct cgroupfs_root *new = data;
				764	struct cgroupfs_root *root = sb->s_fs_info;
				765
				766	/* First check subsystems */
				767	if (new->subsys_bits != root->subsys_bits)
				768	return 0;
				769
				770	/* Next check flags */
				771	if (new->flags != root->flags)
				772	return 0;
				773
				774	return 1;
				775	}
				776
				777	static int cgroup_set_super(struct super_block sb, void data)
				778	{
				779	int ret;
				780	struct cgroupfs_root *root = data;
				781
				782	ret = set_anon_super(sb, NULL);
				783	if (ret)
				784	return ret;
				785
				786	sb->s_fs_info = root;
				787	root->sb = sb;
				788
				789	sb->s_blocksize = PAGE_CACHE_SIZE;
				790	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
				791	sb->s_magic = CGROUP_SUPER_MAGIC;
				792	sb->s_op = &cgroup_ops;
				793
				794	return 0;
				795	}
				796
				797	static int cgroup_get_rootdir(struct super_block *sb)
				798	{
				799	struct inode *inode =
				800	cgroup_new_inode(S_IFDIR \| S_IRUGO \| S_IXUGO \| S_IWUSR, sb);
				801	struct dentry *dentry;
				802
				803	if (!inode)
				804	return -ENOMEM;
				805
				806	inode->i_op = &simple_dir_inode_operations;
				807	inode->i_fop = &simple_dir_operations;
				808	inode->i_op = &cgroup_dir_inode_operations;
				809	/* directories start off with i_nlink == 2 (for "." entry) */
				810	inc_nlink(inode);
				811	dentry = d_alloc_root(inode);
				812	if (!dentry) {
				813	iput(inode);
				814	return -ENOMEM;
				815	}
				816	sb->s_root = dentry;
				817	return 0;
				818	}
				819
				820	static int cgroup_get_sb(struct file_system_type *fs_type,
				821	int flags, const char *unused_dev_name,
				822	void data, struct vfsmount mnt)
				823	{
				824	struct cgroup_sb_opts opts;
				825	int ret = 0;
				826	struct super_block *sb;
				827	struct cgroupfs_root *root;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	828	struct list_head tmp_cg_links, *l;
				829	INIT_LIST_HEAD(&tmp_cg_links);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	830
				831	/* First find the desired set of subsystems */
				832	ret = parse_cgroupfs_options(data, &opts);
				833	if (ret)
				834	return ret;
				835
				836	root = kzalloc(sizeof(*root), GFP_KERNEL);
				837	if (!root)
				838	return -ENOMEM;
				839
				840	init_cgroup_root(root);
				841	root->subsys_bits = opts.subsys_bits;
				842	root->flags = opts.flags;
				843
				844	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
				845
				846	if (IS_ERR(sb)) {
				847	kfree(root);
				848	return PTR_ERR(sb);
				849	}
				850
				851	if (sb->s_fs_info != root) {
				852	/* Reusing an existing superblock */
				853	BUG_ON(sb->s_root == NULL);
				854	kfree(root);
				855	root = NULL;
				856	} else {
				857	/* New superblock */
				858	struct cgroup *cont = &root->top_cgroup;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	859	struct inode *inode;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	860
				861	BUG_ON(sb->s_root != NULL);
				862
				863	ret = cgroup_get_rootdir(sb);
				864	if (ret)
				865	goto drop_new_super;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	866	inode = sb->s_root->d_inode;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	867
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	868	mutex_lock(&inode->i_mutex);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	869	mutex_lock(&cgroup_mutex);
				870
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	871	/*
				872	* We're accessing css_set_count without locking
				873	* css_set_lock here, but that's OK - it can only be
				874	* increased by someone holding cgroup_lock, and
				875	* that's us. The worst that can happen is that we
				876	* have some link structures left over
				877	*/
				878	ret = allocate_cg_links(css_set_count, &tmp_cg_links);
				879	if (ret) {
				880	mutex_unlock(&cgroup_mutex);
				881	mutex_unlock(&inode->i_mutex);
				882	goto drop_new_super;
				883	}
				884
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	885	ret = rebind_subsystems(root, root->subsys_bits);
				886	if (ret == -EBUSY) {
				887	mutex_unlock(&cgroup_mutex);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	888	mutex_unlock(&inode->i_mutex);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	889	goto drop_new_super;
				890	}
				891
				892	/* EBUSY should be the only error here */
				893	BUG_ON(ret);
				894
				895	list_add(&root->root_list, &roots);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	896	root_count++;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	897
				898	sb->s_root->d_fsdata = &root->top_cgroup;
				899	root->top_cgroup.dentry = sb->s_root;
				900
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	901	/* Link the top cgroup in this hierarchy into all
				902	* the css_set objects */
				903	write_lock(&css_set_lock);
				904	l = &init_css_set.list;
				905	do {
				906	struct css_set *cg;
				907	struct cg_cgroup_link *link;
				908	cg = list_entry(l, struct css_set, list);
				909	BUG_ON(list_empty(&tmp_cg_links));
				910	link = list_entry(tmp_cg_links.next,
				911	struct cg_cgroup_link,
				912	cont_link_list);
				913	list_del(&link->cont_link_list);
				914	link->cg = cg;
				915	list_add(&link->cont_link_list,
				916	&root->top_cgroup.css_sets);
				917	list_add(&link->cg_link_list, &cg->cg_links);
				918	l = l->next;
				919	} while (l != &init_css_set.list);
				920	write_unlock(&css_set_lock);
				921
				922	free_cg_links(&tmp_cg_links);
				923
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	924	BUG_ON(!list_empty(&cont->sibling));
				925	BUG_ON(!list_empty(&cont->children));
				926	BUG_ON(root->number_of_cgroups != 1);
				927
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	928	cgroup_populate_dir(cont);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	929	mutex_unlock(&inode->i_mutex);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	930	mutex_unlock(&cgroup_mutex);
				931	}
				932
				933	return simple_set_mnt(mnt, sb);
				934
				935	drop_new_super:
				936	up_write(&sb->s_umount);
				937	deactivate_super(sb);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	938	free_cg_links(&tmp_cg_links);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	939	return ret;
				940	}
				941
				942	static void cgroup_kill_sb(struct super_block *sb) {
				943	struct cgroupfs_root *root = sb->s_fs_info;
				944	struct cgroup *cont = &root->top_cgroup;
				945	int ret;
				946
				947	BUG_ON(!root);
				948
				949	BUG_ON(root->number_of_cgroups != 1);
				950	BUG_ON(!list_empty(&cont->children));
				951	BUG_ON(!list_empty(&cont->sibling));
				952
				953	mutex_lock(&cgroup_mutex);
				954
				955	/* Rebind all subsystems back to the default hierarchy */
				956	ret = rebind_subsystems(root, 0);
				957	/* Shouldn't be able to fail ... */
				958	BUG_ON(ret);
				959
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	960	/*
				961	* Release all the links from css_sets to this hierarchy's
				962	* root cgroup
				963	*/
				964	write_lock(&css_set_lock);
				965	while (!list_empty(&cont->css_sets)) {
				966	struct cg_cgroup_link *link;
				967	link = list_entry(cont->css_sets.next,
				968	struct cg_cgroup_link, cont_link_list);
				969	list_del(&link->cg_link_list);
				970	list_del(&link->cont_link_list);
				971	kfree(link);
				972	}
				973	write_unlock(&css_set_lock);
				974
				975	if (!list_empty(&root->root_list)) {
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	976	list_del(&root->root_list);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	977	root_count--;
				978	}
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	979	mutex_unlock(&cgroup_mutex);
				980
				981	kfree(root);
				982	kill_litter_super(sb);
				983	}
				984
				985	static struct file_system_type cgroup_fs_type = {
				986	.name = "cgroup",
				987	.get_sb = cgroup_get_sb,
				988	.kill_sb = cgroup_kill_sb,
				989	};
				990
				991	static inline struct cgroup __d_cont(struct dentry dentry)
				992	{
				993	return dentry->d_fsdata;
				994	}
				995
				996	static inline struct cftype __d_cft(struct dentry dentry)
				997	{
				998	return dentry->d_fsdata;
				999	}
				1000
				1001	/*
				1002	* Called with cgroup_mutex held. Writes path of cgroup into buf.
				1003	* Returns 0 on success, -errno on error.
				1004	*/
				1005	int cgroup_path(const struct cgroup cont, char buf, int buflen)
				1006	{
				1007	char *start;
				1008
				1009	if (cont == dummytop) {
				1010	/*
				1011	* Inactive subsystems have no dentry for their root
				1012	* cgroup
				1013	*/
				1014	strcpy(buf, "/");
				1015	return 0;
				1016	}
				1017
				1018	start = buf + buflen;
				1019
				1020	*--start = '\0';
				1021	for (;;) {
				1022	int len = cont->dentry->d_name.len;
				1023	if ((start -= len) < buf)
				1024	return -ENAMETOOLONG;
				1025	memcpy(start, cont->dentry->d_name.name, len);
				1026	cont = cont->parent;
				1027	if (!cont)
				1028	break;
				1029	if (!cont->parent)
				1030	continue;
				1031	if (--start < buf)
				1032	return -ENAMETOOLONG;
				1033	*start = '/';
				1034	}
				1035	memmove(buf, start, buf + buflen - start);
				1036	return 0;
				1037	}
				1038
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1039	/*
				1040	* Return the first subsystem attached to a cgroup's hierarchy, and
				1041	* its subsystem id.
				1042	*/
				1043
				1044	static void get_first_subsys(const struct cgroup *cont,
				1045	struct cgroup_subsys_state *css, int subsys_id)
				1046	{
				1047	const struct cgroupfs_root *root = cont->root;
				1048	const struct cgroup_subsys *test_ss;
				1049	BUG_ON(list_empty(&root->subsys_list));
				1050	test_ss = list_entry(root->subsys_list.next,
				1051	struct cgroup_subsys, sibling);
				1052	if (css) {
				1053	*css = cont->subsys[test_ss->subsys_id];
				1054	BUG_ON(!*css);
				1055	}
				1056	if (subsys_id)
				1057	*subsys_id = test_ss->subsys_id;
				1058	}
				1059
				1060	/*
				1061	* Attach task 'tsk' to cgroup 'cont'
				1062	*
				1063	* Call holding cgroup_mutex. May take task_lock of
				1064	* the task 'pid' during call.
				1065	*/
				1066	static int attach_task(struct cgroup cont, struct task_struct tsk)
				1067	{
				1068	int retval = 0;
				1069	struct cgroup_subsys *ss;
				1070	struct cgroup *oldcont;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1071	struct css_set *cg = tsk->cgroups;
				1072	struct css_set *newcg;
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1073	struct cgroupfs_root *root = cont->root;
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1074	int subsys_id;
				1075
				1076	get_first_subsys(cont, NULL, &subsys_id);
				1077
				1078	/* Nothing to do if the task is already in that cgroup */
				1079	oldcont = task_cgroup(tsk, subsys_id);
				1080	if (cont == oldcont)
				1081	return 0;
				1082
				1083	for_each_subsys(root, ss) {
				1084	if (ss->can_attach) {
				1085	retval = ss->can_attach(ss, cont, tsk);
				1086	if (retval) {
				1087	return retval;
				1088	}
				1089	}
				1090	}
				1091
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1092	/*
				1093	* Locate or allocate a new css_set for this task,
				1094	* based on its final set of cgroups
				1095	*/
				1096	newcg = find_css_set(cg, cont);
				1097	if (!newcg) {
				1098	return -ENOMEM;
				1099	}
				1100
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1101	task_lock(tsk);
				1102	if (tsk->flags & PF_EXITING) {
				1103	task_unlock(tsk);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1104	put_css_set(newcg);
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1105	return -ESRCH;
				1106	}
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1107	rcu_assign_pointer(tsk->cgroups, newcg);
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1108	task_unlock(tsk);
				1109
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1110	/* Update the css_set linked lists if we're using them */
				1111	write_lock(&css_set_lock);
				1112	if (!list_empty(&tsk->cg_list)) {
				1113	list_del(&tsk->cg_list);
				1114	list_add(&tsk->cg_list, &newcg->tasks);
				1115	}
				1116	write_unlock(&css_set_lock);
				1117
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1118	for_each_subsys(root, ss) {
				1119	if (ss->attach) {
				1120	ss->attach(ss, cont, oldcont, tsk);
				1121	}
				1122	}
				1123
				1124	synchronize_rcu();
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1125	put_css_set(cg);
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1126	return 0;
				1127	}
				1128
				1129	/*
				1130	* Attach task with pid 'pid' to cgroup 'cont'. Call with
				1131	* cgroup_mutex, may take task_lock of task
				1132	*/
				1133	static int attach_task_by_pid(struct cgroup cont, char pidbuf)
				1134	{
				1135	pid_t pid;
				1136	struct task_struct *tsk;
				1137	int ret;
				1138
				1139	if (sscanf(pidbuf, "%d", &pid) != 1)
				1140	return -EIO;
				1141
				1142	if (pid) {
				1143	rcu_read_lock();
				1144	tsk = find_task_by_pid(pid);
				1145	if (!tsk \|\| tsk->flags & PF_EXITING) {
				1146	rcu_read_unlock();
				1147	return -ESRCH;
				1148	}
				1149	get_task_struct(tsk);
				1150	rcu_read_unlock();
				1151
				1152	if ((current->euid) && (current->euid != tsk->uid)
				1153	&& (current->euid != tsk->suid)) {
				1154	put_task_struct(tsk);
				1155	return -EACCES;
				1156	}
				1157	} else {
				1158	tsk = current;
				1159	get_task_struct(tsk);
				1160	}
				1161
				1162	ret = attach_task(cont, tsk);
				1163	put_task_struct(tsk);
				1164	return ret;
				1165	}
				1166
/* Kinds of files and directories that appear in a cgroup filesystem */

enum cgroup_filetype {
	FILE_ROOT = 0,
	FILE_DIR = 1,
	FILE_TASKLIST = 2,
};
				1174
Paul Menage	355e0c4	2007-10-18 23:39:33 -0700	[diff] [blame]	1175	static ssize_t cgroup_write_uint(struct cgroup cont, struct cftype cft,
				1176	struct file *file,
				1177	const char __user *userbuf,
				1178	size_t nbytes, loff_t *unused_ppos)
				1179	{
				1180	char buffer[64];
				1181	int retval = 0;
				1182	u64 val;
				1183	char *end;
				1184
				1185	if (!nbytes)
				1186	return -EINVAL;
				1187	if (nbytes >= sizeof(buffer))
				1188	return -E2BIG;
				1189	if (copy_from_user(buffer, userbuf, nbytes))
				1190	return -EFAULT;
				1191
				1192	buffer[nbytes] = 0; /* nul-terminate */
				1193
				1194	/* strip newline if necessary */
				1195	if (nbytes && (buffer[nbytes-1] == '\n'))
				1196	buffer[nbytes-1] = 0;
				1197	val = simple_strtoull(buffer, &end, 0);
				1198	if (*end)
				1199	return -EINVAL;
				1200
				1201	/* Pass to subsystem */
				1202	retval = cft->write_uint(cont, cft, val);
				1203	if (!retval)
				1204	retval = nbytes;
				1205	return retval;
				1206	}
				1207
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1208	static ssize_t cgroup_common_file_write(struct cgroup *cont,
				1209	struct cftype *cft,
				1210	struct file *file,
				1211	const char __user *userbuf,
				1212	size_t nbytes, loff_t *unused_ppos)
				1213	{
				1214	enum cgroup_filetype type = cft->private;
				1215	char *buffer;
				1216	int retval = 0;
				1217
				1218	if (nbytes >= PATH_MAX)
				1219	return -E2BIG;
				1220
				1221	/* +1 for nul-terminator */
				1222	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
				1223	if (buffer == NULL)
				1224	return -ENOMEM;
				1225
				1226	if (copy_from_user(buffer, userbuf, nbytes)) {
				1227	retval = -EFAULT;
				1228	goto out1;
				1229	}
				1230	buffer[nbytes] = 0; /* nul-terminate */
				1231
				1232	mutex_lock(&cgroup_mutex);
				1233
				1234	if (cgroup_is_removed(cont)) {
				1235	retval = -ENODEV;
				1236	goto out2;
				1237	}
				1238
				1239	switch (type) {
				1240	case FILE_TASKLIST:
				1241	retval = attach_task_by_pid(cont, buffer);
				1242	break;
				1243	default:
				1244	retval = -EINVAL;
				1245	goto out2;
				1246	}
				1247
				1248	if (retval == 0)
				1249	retval = nbytes;
				1250	out2:
				1251	mutex_unlock(&cgroup_mutex);
				1252	out1:
				1253	kfree(buffer);
				1254	return retval;
				1255	}
				1256
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1257	static ssize_t cgroup_file_write(struct file file, const char __user buf,
				1258	size_t nbytes, loff_t *ppos)
				1259	{
				1260	struct cftype *cft = __d_cft(file->f_dentry);
				1261	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
				1262
				1263	if (!cft)
				1264	return -ENODEV;
Paul Menage	355e0c4	2007-10-18 23:39:33 -0700	[diff] [blame]	1265	if (cft->write)
				1266	return cft->write(cont, cft, file, buf, nbytes, ppos);
				1267	if (cft->write_uint)
				1268	return cgroup_write_uint(cont, cft, file, buf, nbytes, ppos);
				1269	return -EINVAL;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1270	}
				1271
				1272	static ssize_t cgroup_read_uint(struct cgroup cont, struct cftype cft,
				1273	struct file *file,
				1274	char __user *buf, size_t nbytes,
				1275	loff_t *ppos)
				1276	{
				1277	char tmp[64];
				1278	u64 val = cft->read_uint(cont, cft);
				1279	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
				1280
				1281	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
				1282	}
				1283
				1284	static ssize_t cgroup_file_read(struct file file, char __user buf,
				1285	size_t nbytes, loff_t *ppos)
				1286	{
				1287	struct cftype *cft = __d_cft(file->f_dentry);
				1288	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
				1289
				1290	if (!cft)
				1291	return -ENODEV;
				1292
				1293	if (cft->read)
				1294	return cft->read(cont, cft, file, buf, nbytes, ppos);
				1295	if (cft->read_uint)
				1296	return cgroup_read_uint(cont, cft, file, buf, nbytes, ppos);
				1297	return -EINVAL;
				1298	}
				1299
				1300	static int cgroup_file_open(struct inode inode, struct file file)
				1301	{
				1302	int err;
				1303	struct cftype *cft;
				1304
				1305	err = generic_file_open(inode, file);
				1306	if (err)
				1307	return err;
				1308
				1309	cft = __d_cft(file->f_dentry);
				1310	if (!cft)
				1311	return -ENODEV;
				1312	if (cft->open)
				1313	err = cft->open(inode, file);
				1314	else
				1315	err = 0;
				1316
				1317	return err;
				1318	}
				1319
				1320	static int cgroup_file_release(struct inode inode, struct file file)
				1321	{
				1322	struct cftype *cft = __d_cft(file->f_dentry);
				1323	if (cft->release)
				1324	return cft->release(inode, file);
				1325	return 0;
				1326	}
				1327
				1328	/*
				1329	* cgroup_rename - Only allow simple rename of directories in place.
				1330	*/
				1331	static int cgroup_rename(struct inode old_dir, struct dentry old_dentry,
				1332	struct inode new_dir, struct dentry new_dentry)
				1333	{
				1334	if (!S_ISDIR(old_dentry->d_inode->i_mode))
				1335	return -ENOTDIR;
				1336	if (new_dentry->d_inode)
				1337	return -EEXIST;
				1338	if (old_dir != new_dir)
				1339	return -EIO;
				1340	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
				1341	}
				1342
				1343	static struct file_operations cgroup_file_operations = {
				1344	.read = cgroup_file_read,
				1345	.write = cgroup_file_write,
				1346	.llseek = generic_file_llseek,
				1347	.open = cgroup_file_open,
				1348	.release = cgroup_file_release,
				1349	};
				1350
				1351	static struct inode_operations cgroup_dir_inode_operations = {
				1352	.lookup = simple_lookup,
				1353	.mkdir = cgroup_mkdir,
				1354	.rmdir = cgroup_rmdir,
				1355	.rename = cgroup_rename,
				1356	};
				1357
				1358	static int cgroup_create_file(struct dentry *dentry, int mode,
				1359	struct super_block *sb)
				1360	{
				1361	static struct dentry_operations cgroup_dops = {
				1362	.d_iput = cgroup_diput,
				1363	};
				1364
				1365	struct inode *inode;
				1366
				1367	if (!dentry)
				1368	return -ENOENT;
				1369	if (dentry->d_inode)
				1370	return -EEXIST;
				1371
				1372	inode = cgroup_new_inode(mode, sb);
				1373	if (!inode)
				1374	return -ENOMEM;
				1375
				1376	if (S_ISDIR(mode)) {
				1377	inode->i_op = &cgroup_dir_inode_operations;
				1378	inode->i_fop = &simple_dir_operations;
				1379
				1380	/* start off with i_nlink == 2 (for "." entry) */
				1381	inc_nlink(inode);
				1382
				1383	/* start with the directory inode held, so that we can
				1384	* populate it without racing with another mkdir */
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1385	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1386	} else if (S_ISREG(mode)) {
				1387	inode->i_size = 0;
				1388	inode->i_fop = &cgroup_file_operations;
				1389	}
				1390	dentry->d_op = &cgroup_dops;
				1391	d_instantiate(dentry, inode);
				1392	dget(dentry); /* Extra count - pin the dentry in core */
				1393	return 0;
				1394	}
				1395
				1396	/*
				1397	* cgroup_create_dir - create a directory for an object.
				1398	* cont: the cgroup we create the directory for.
				1399	* It must have a valid ->parent field
				1400	* And we are going to fill its ->dentry field.
				1401	* dentry: dentry of the new container
				1402	* mode: mode to set on new directory.
				1403	*/
				1404	static int cgroup_create_dir(struct cgroup cont, struct dentry dentry,
				1405	int mode)
				1406	{
				1407	struct dentry *parent;
				1408	int error = 0;
				1409
				1410	parent = cont->parent->dentry;
				1411	error = cgroup_create_file(dentry, S_IFDIR \| mode, cont->root->sb);
				1412	if (!error) {
				1413	dentry->d_fsdata = cont;
				1414	inc_nlink(parent->d_inode);
				1415	cont->dentry = dentry;
				1416	dget(dentry);
				1417	}
				1418	dput(dentry);
				1419
				1420	return error;
				1421	}
				1422
				1423	int cgroup_add_file(struct cgroup *cont,
				1424	struct cgroup_subsys *subsys,
				1425	const struct cftype *cft)
				1426	{
				1427	struct dentry *dir = cont->dentry;
				1428	struct dentry *dentry;
				1429	int error;
				1430
				1431	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
				1432	if (subsys && !test_bit(ROOT_NOPREFIX, &cont->root->flags)) {
				1433	strcpy(name, subsys->name);
				1434	strcat(name, ".");
				1435	}
				1436	strcat(name, cft->name);
				1437	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
				1438	dentry = lookup_one_len(name, dir, strlen(name));
				1439	if (!IS_ERR(dentry)) {
				1440	error = cgroup_create_file(dentry, 0644 \| S_IFREG,
				1441	cont->root->sb);
				1442	if (!error)
				1443	dentry->d_fsdata = (void *)cft;
				1444	dput(dentry);
				1445	} else
				1446	error = PTR_ERR(dentry);
				1447	return error;
				1448	}
				1449
				1450	int cgroup_add_files(struct cgroup *cont,
				1451	struct cgroup_subsys *subsys,
				1452	const struct cftype cft[],
				1453	int count)
				1454	{
				1455	int i, err;
				1456	for (i = 0; i < count; i++) {
				1457	err = cgroup_add_file(cont, subsys, &cft[i]);
				1458	if (err)
				1459	return err;
				1460	}
				1461	return 0;
				1462	}
				1463
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1464	/* Count the number of tasks in a cgroup. */
				1465
				1466	int cgroup_task_count(const struct cgroup *cont)
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1467	{
				1468	int count = 0;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1469	struct list_head *l;
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1470
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1471	read_lock(&css_set_lock);
				1472	l = cont->css_sets.next;
				1473	while (l != &cont->css_sets) {
				1474	struct cg_cgroup_link *link =
				1475	list_entry(l, struct cg_cgroup_link, cont_link_list);
				1476	count += atomic_read(&link->cg->ref.refcount);
				1477	l = l->next;
				1478	}
				1479	read_unlock(&css_set_lock);
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1480	return count;
				1481	}
				1482
				1483	/*
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1484	* Advance a list_head iterator. The iterator should be positioned at
				1485	* the start of a css_set
				1486	*/
				1487	static void cgroup_advance_iter(struct cgroup *cont,
				1488	struct cgroup_iter *it)
				1489	{
				1490	struct list_head *l = it->cg_link;
				1491	struct cg_cgroup_link *link;
				1492	struct css_set *cg;
				1493
				1494	/* Advance to the next non-empty css_set */
				1495	do {
				1496	l = l->next;
				1497	if (l == &cont->css_sets) {
				1498	it->cg_link = NULL;
				1499	return;
				1500	}
				1501	link = list_entry(l, struct cg_cgroup_link, cont_link_list);
				1502	cg = link->cg;
				1503	} while (list_empty(&cg->tasks));
				1504	it->cg_link = l;
				1505	it->task = cg->tasks.next;
				1506	}
				1507
				1508	void cgroup_iter_start(struct cgroup cont, struct cgroup_iter it)
				1509	{
				1510	/*
				1511	* The first time anyone tries to iterate across a cgroup,
				1512	* we need to enable the list linking each css_set to its
				1513	* tasks, and fix up all existing tasks.
				1514	*/
				1515	if (!use_task_css_set_links) {
				1516	struct task_struct p, g;
				1517	write_lock(&css_set_lock);
				1518	use_task_css_set_links = 1;
				1519	do_each_thread(g, p) {
				1520	task_lock(p);
				1521	if (list_empty(&p->cg_list))
				1522	list_add(&p->cg_list, &p->cgroups->tasks);
				1523	task_unlock(p);
				1524	} while_each_thread(g, p);
				1525	write_unlock(&css_set_lock);
				1526	}
				1527	read_lock(&css_set_lock);
				1528	it->cg_link = &cont->css_sets;
				1529	cgroup_advance_iter(cont, it);
				1530	}
				1531
				1532	struct task_struct cgroup_iter_next(struct cgroup cont,
				1533	struct cgroup_iter *it)
				1534	{
				1535	struct task_struct *res;
				1536	struct list_head *l = it->task;
				1537
				1538	/* If the iterator cg is NULL, we have no tasks */
				1539	if (!it->cg_link)
				1540	return NULL;
				1541	res = list_entry(l, struct task_struct, cg_list);
				1542	/* Advance iterator to find next entry */
				1543	l = l->next;
				1544	if (l == &res->cgroups->tasks) {
				1545	/* We reached the end of this task list - move on to
				1546	* the next cg_cgroup_link */
				1547	cgroup_advance_iter(cont, it);
				1548	} else {
				1549	it->task = l;
				1550	}
				1551	return res;
				1552	}
				1553
				1554	void cgroup_iter_end(struct cgroup cont, struct cgroup_iter it)
				1555	{
				1556	read_unlock(&css_set_lock);
				1557	}
				1558
				1559	/*
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1560	* Stuff for reading the 'tasks' file.
				1561	*
				1562	* Reading this file can return large amounts of data if a cgroup has
				1563	* lots of attached tasks. So it may need several calls to read(),
				1564	* but we cannot guarantee that the information we produce is correct
				1565	* unless we produce it entirely atomically.
				1566	*
				1567	* Upon tasks file open(), a struct ctr_struct is allocated, that
				1568	* will have a pointer to an array (also allocated here). The struct
				1569	* ctr_struct * is stored in file->private_data. Its resources will
				1570	* be freed by release() when the file is closed. The array is used
				1571	* to sprintf the PIDs and then used by read().
				1572	*/
				1573	struct ctr_struct {
				1574	char *buf;
				1575	int bufsz;
				1576	};
				1577
				1578	/*
				1579	* Load into 'pidarray' up to 'npids' of the tasks using cgroup
				1580	* 'cont'. Return actual number of pids loaded. No need to
				1581	* task_lock(p) when reading out p->cgroup, since we're in an RCU
				1582	* read section, so the css_set can't go away, and is
				1583	* immutable after creation.
				1584	*/
				1585	static int pid_array_load(pid_t pidarray, int npids, struct cgroup cont)
				1586	{
				1587	int n = 0;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1588	struct cgroup_iter it;
				1589	struct task_struct *tsk;
				1590	cgroup_iter_start(cont, &it);
				1591	while ((tsk = cgroup_iter_next(cont, &it))) {
				1592	if (unlikely(n == npids))
				1593	break;
				1594	pidarray[n++] = pid_nr(task_pid(tsk));
				1595	}
				1596	cgroup_iter_end(cont, &it);
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1597	return n;
				1598	}
				1599
				1600	static int cmppid(const void a, const void b)
				1601	{
				1602	return (pid_t )a - (pid_t )b;
				1603	}
				1604
				1605	/*
				1606	* Convert array 'a' of 'npids' pid_t's to a string of newline separated
				1607	* decimal pids in 'buf'. Don't write more than 'sz' chars, but return
				1608	* count 'cnt' of how many chars would be written if buf were large enough.
				1609	*/
				1610	static int pid_array_to_buf(char buf, int sz, pid_t a, int npids)
				1611	{
				1612	int cnt = 0;
				1613	int i;
				1614
				1615	for (i = 0; i < npids; i++)
				1616	cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
				1617	return cnt;
				1618	}
				1619
				1620	/*
				1621	* Handle an open on 'tasks' file. Prepare a buffer listing the
				1622	* process id's of tasks currently attached to the cgroup being opened.
				1623	*
				1624	* Does not require any specific cgroup mutexes, and does not take any.
				1625	*/
				1626	static int cgroup_tasks_open(struct inode unused, struct file file)
				1627	{
				1628	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
				1629	struct ctr_struct *ctr;
				1630	pid_t *pidarray;
				1631	int npids;
				1632	char c;
				1633
				1634	if (!(file->f_mode & FMODE_READ))
				1635	return 0;
				1636
				1637	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
				1638	if (!ctr)
				1639	goto err0;
				1640
				1641	/*
				1642	* If cgroup gets more users after we read count, we won't have
				1643	* enough space - tough. This race is indistinguishable to the
				1644	* caller from the case that the additional cgroup users didn't
				1645	* show up until sometime later on.
				1646	*/
				1647	npids = cgroup_task_count(cont);
				1648	if (npids) {
				1649	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
				1650	if (!pidarray)
				1651	goto err1;
				1652
				1653	npids = pid_array_load(pidarray, npids, cont);
				1654	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
				1655
				1656	/* Call pid_array_to_buf() twice, first just to get bufsz */
				1657	ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
				1658	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
				1659	if (!ctr->buf)
				1660	goto err2;
				1661	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
				1662
				1663	kfree(pidarray);
				1664	} else {
				1665	ctr->buf = 0;
				1666	ctr->bufsz = 0;
				1667	}
				1668	file->private_data = ctr;
				1669	return 0;
				1670
				1671	err2:
				1672	kfree(pidarray);
				1673	err1:
				1674	kfree(ctr);
				1675	err0:
				1676	return -ENOMEM;
				1677	}
				1678
				1679	static ssize_t cgroup_tasks_read(struct cgroup *cont,
				1680	struct cftype *cft,
				1681	struct file file, char __user buf,
				1682	size_t nbytes, loff_t *ppos)
				1683	{
				1684	struct ctr_struct *ctr = file->private_data;
				1685
				1686	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
				1687	}
				1688
				1689	static int cgroup_tasks_release(struct inode *unused_inode,
				1690	struct file *file)
				1691	{
				1692	struct ctr_struct *ctr;
				1693
				1694	if (file->f_mode & FMODE_READ) {
				1695	ctr = file->private_data;
				1696	kfree(ctr->buf);
				1697	kfree(ctr);
				1698	}
				1699	return 0;
				1700	}
				1701
				1702	/*
				1703	* for the common functions, 'private' gives the type of file
				1704	*/
				1705	static struct cftype cft_tasks = {
				1706	.name = "tasks",
				1707	.open = cgroup_tasks_open,
				1708	.read = cgroup_tasks_read,
				1709	.write = cgroup_common_file_write,
				1710	.release = cgroup_tasks_release,
				1711	.private = FILE_TASKLIST,
				1712	};
				1713
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1714	static int cgroup_populate_dir(struct cgroup *cont)
				1715	{
				1716	int err;
				1717	struct cgroup_subsys *ss;
				1718
				1719	/* First clear out any existing files */
				1720	cgroup_clear_directory(cont->dentry);
				1721
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1722	err = cgroup_add_file(cont, NULL, &cft_tasks);
				1723	if (err < 0)
				1724	return err;
				1725
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1726	for_each_subsys(cont->root, ss) {
				1727	if (ss->populate && (err = ss->populate(ss, cont)) < 0)
				1728	return err;
				1729	}
				1730
				1731	return 0;
				1732	}
				1733
				1734	static void init_cgroup_css(struct cgroup_subsys_state *css,
				1735	struct cgroup_subsys *ss,
				1736	struct cgroup *cont)
				1737	{
				1738	css->cgroup = cont;
				1739	atomic_set(&css->refcnt, 0);
				1740	css->flags = 0;
				1741	if (cont == dummytop)
				1742	set_bit(CSS_ROOT, &css->flags);
				1743	BUG_ON(cont->subsys[ss->subsys_id]);
				1744	cont->subsys[ss->subsys_id] = css;
				1745	}
				1746
				1747	/*
				1748	* cgroup_create - create a cgroup
				1749	* parent: cgroup that will be parent of the new cgroup.
				1750	* name: name of the new cgroup. Will be strcpy'ed.
				1751	* mode: mode to set on new inode
				1752	*
				1753	* Must be called with the mutex on the parent inode held
				1754	*/
				1755
				1756	static long cgroup_create(struct cgroup parent, struct dentry dentry,
				1757	int mode)
				1758	{
				1759	struct cgroup *cont;
				1760	struct cgroupfs_root *root = parent->root;
				1761	int err = 0;
				1762	struct cgroup_subsys *ss;
				1763	struct super_block *sb = root->sb;
				1764
				1765	cont = kzalloc(sizeof(*cont), GFP_KERNEL);
				1766	if (!cont)
				1767	return -ENOMEM;
				1768
				1769	/* Grab a reference on the superblock so the hierarchy doesn't
				1770	* get deleted on unmount if there are child cgroups. This
				1771	* can be done outside cgroup_mutex, since the sb can't
				1772	* disappear while someone has an open control file on the
				1773	* fs */
				1774	atomic_inc(&sb->s_active);
				1775
				1776	mutex_lock(&cgroup_mutex);
				1777
				1778	cont->flags = 0;
				1779	INIT_LIST_HEAD(&cont->sibling);
				1780	INIT_LIST_HEAD(&cont->children);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1781	INIT_LIST_HEAD(&cont->css_sets);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1782
				1783	cont->parent = parent;
				1784	cont->root = parent->root;
				1785	cont->top_cgroup = parent->top_cgroup;
				1786
				1787	for_each_subsys(root, ss) {
				1788	struct cgroup_subsys_state *css = ss->create(ss, cont);
				1789	if (IS_ERR(css)) {
				1790	err = PTR_ERR(css);
				1791	goto err_destroy;
				1792	}
				1793	init_cgroup_css(css, ss, cont);
				1794	}
				1795
				1796	list_add(&cont->sibling, &cont->parent->children);
				1797	root->number_of_cgroups++;
				1798
				1799	err = cgroup_create_dir(cont, dentry, mode);
				1800	if (err < 0)
				1801	goto err_remove;
				1802
				1803	/* The cgroup directory was pre-locked for us */
				1804	BUG_ON(!mutex_is_locked(&cont->dentry->d_inode->i_mutex));
				1805
				1806	err = cgroup_populate_dir(cont);
				1807	/* If err < 0, we have a half-filled directory - oh well ;) */
				1808
				1809	mutex_unlock(&cgroup_mutex);
				1810	mutex_unlock(&cont->dentry->d_inode->i_mutex);
				1811
				1812	return 0;
				1813
				1814	err_remove:
				1815
				1816	list_del(&cont->sibling);
				1817	root->number_of_cgroups--;
				1818
				1819	err_destroy:
				1820
				1821	for_each_subsys(root, ss) {
				1822	if (cont->subsys[ss->subsys_id])
				1823	ss->destroy(ss, cont);
				1824	}
				1825
				1826	mutex_unlock(&cgroup_mutex);
				1827
				1828	/* Release the reference count that we took on the superblock */
				1829	deactivate_super(sb);
				1830
				1831	kfree(cont);
				1832	return err;
				1833	}
				1834
				1835	static int cgroup_mkdir(struct inode dir, struct dentry dentry, int mode)
				1836	{
				1837	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
				1838
				1839	/* the vfs holds inode->i_mutex already */
				1840	return cgroup_create(c_parent, dentry, mode \| S_IFDIR);
				1841	}
				1842
				1843	static int cgroup_rmdir(struct inode unused_dir, struct dentry dentry)
				1844	{
				1845	struct cgroup *cont = dentry->d_fsdata;
				1846	struct dentry *d;
				1847	struct cgroup *parent;
				1848	struct cgroup_subsys *ss;
				1849	struct super_block *sb;
				1850	struct cgroupfs_root *root;
				1851	int css_busy = 0;
				1852
				1853	/* the vfs holds both inode->i_mutex already */
				1854
				1855	mutex_lock(&cgroup_mutex);
				1856	if (atomic_read(&cont->count) != 0) {
				1857	mutex_unlock(&cgroup_mutex);
				1858	return -EBUSY;
				1859	}
				1860	if (!list_empty(&cont->children)) {
				1861	mutex_unlock(&cgroup_mutex);
				1862	return -EBUSY;
				1863	}
				1864
				1865	parent = cont->parent;
				1866	root = cont->root;
				1867	sb = root->sb;
				1868
				1869	/* Check the reference count on each subsystem. Since we
				1870	* already established that there are no tasks in the
				1871	* cgroup, if the css refcount is also 0, then there should
				1872	* be no outstanding references, so the subsystem is safe to
				1873	* destroy */
				1874	for_each_subsys(root, ss) {
				1875	struct cgroup_subsys_state *css;
				1876	css = cont->subsys[ss->subsys_id];
				1877	if (atomic_read(&css->refcnt)) {
				1878	css_busy = 1;
				1879	break;
				1880	}
				1881	}
				1882	if (css_busy) {
				1883	mutex_unlock(&cgroup_mutex);
				1884	return -EBUSY;
				1885	}
				1886
				1887	for_each_subsys(root, ss) {
				1888	if (cont->subsys[ss->subsys_id])
				1889	ss->destroy(ss, cont);
				1890	}
				1891
				1892	set_bit(CONT_REMOVED, &cont->flags);
				1893	/* delete my sibling from parent->children */
				1894	list_del(&cont->sibling);
				1895	spin_lock(&cont->dentry->d_lock);
				1896	d = dget(cont->dentry);
				1897	cont->dentry = NULL;
				1898	spin_unlock(&d->d_lock);
				1899
				1900	cgroup_d_remove_dir(d);
				1901	dput(d);
				1902	root->number_of_cgroups--;
				1903
				1904	mutex_unlock(&cgroup_mutex);
				1905	/* Drop the active superblock reference that we took when we
				1906	* created the cgroup */
				1907	deactivate_super(sb);
				1908	return 0;
				1909	}
				1910
				1911	static void cgroup_init_subsys(struct cgroup_subsys *ss)
				1912	{
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1913	struct cgroup_subsys_state *css;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1914	struct list_head *l;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1915	printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
				1916
				1917	/* Create the top cgroup state for this subsystem */
				1918	ss->root = &rootnode;
				1919	css = ss->create(ss, dummytop);
				1920	/* We don't handle early failures gracefully */
				1921	BUG_ON(IS_ERR(css));
				1922	init_cgroup_css(css, ss, dummytop);
				1923
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1924	/* Update all cgroup groups to contain a subsys
				1925	* pointer to this state - since the subsystem is
				1926	* newly registered, all tasks and hence all cgroup
				1927	* groups are in the subsystem's top cgroup. */
				1928	write_lock(&css_set_lock);
				1929	l = &init_css_set.list;
				1930	do {
				1931	struct css_set *cg =
				1932	list_entry(l, struct css_set, list);
				1933	cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
				1934	l = l->next;
				1935	} while (l != &init_css_set.list);
				1936	write_unlock(&css_set_lock);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1937
				1938	/* If this subsystem requested that it be notified with fork
				1939	* events, we should send it one now for every process in the
				1940	* system */
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1941	if (ss->fork) {
				1942	struct task_struct g, p;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1943
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1944	read_lock(&tasklist_lock);
				1945	do_each_thread(g, p) {
				1946	ss->fork(ss, p);
				1947	} while_each_thread(g, p);
				1948	read_unlock(&tasklist_lock);
				1949	}
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1950
				1951	need_forkexit_callback \|= ss->fork \|\| ss->exit;
				1952
				1953	ss->active = 1;
				1954	}
				1955
				1956	/**
				1957	* cgroup_init_early - initialize cgroups at system boot, and
				1958	* initialize any subsystems that request early init.
				1959	*/
				1960	int __init cgroup_init_early(void)
				1961	{
				1962	int i;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1963	kref_init(&init_css_set.ref);
				1964	kref_get(&init_css_set.ref);
				1965	INIT_LIST_HEAD(&init_css_set.list);
				1966	INIT_LIST_HEAD(&init_css_set.cg_links);
				1967	INIT_LIST_HEAD(&init_css_set.tasks);
				1968	css_set_count = 1;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1969	init_cgroup_root(&rootnode);
				1970	list_add(&rootnode.root_list, &roots);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	1971	root_count = 1;
				1972	init_task.cgroups = &init_css_set;
				1973
				1974	init_css_set_link.cg = &init_css_set;
				1975	list_add(&init_css_set_link.cont_link_list,
				1976	&rootnode.top_cgroup.css_sets);
				1977	list_add(&init_css_set_link.cg_link_list,
				1978	&init_css_set.cg_links);
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1979
				1980	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				1981	struct cgroup_subsys *ss = subsys[i];
				1982
				1983	BUG_ON(!ss->name);
				1984	BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
				1985	BUG_ON(!ss->create);
				1986	BUG_ON(!ss->destroy);
				1987	if (ss->subsys_id != i) {
				1988	printk(KERN_ERR "Subsys %s id == %d\n",
				1989	ss->name, ss->subsys_id);
				1990	BUG();
				1991	}
				1992
				1993	if (ss->early_init)
				1994	cgroup_init_subsys(ss);
				1995	}
				1996	return 0;
				1997	}
				1998
				1999	/**
				2000	* cgroup_init - register cgroup filesystem and /proc file, and
				2001	* initialize any subsystems that didn't request early init.
				2002	*/
				2003	int __init cgroup_init(void)
				2004	{
				2005	int err;
				2006	int i;
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2007	struct proc_dir_entry *entry;
				2008
				2009	err = bdi_init(&cgroup_backing_dev_info);
				2010	if (err)
				2011	return err;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	2012
				2013	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				2014	struct cgroup_subsys *ss = subsys[i];
				2015	if (!ss->early_init)
				2016	cgroup_init_subsys(ss);
				2017	}
				2018
				2019	err = register_filesystem(&cgroup_fs_type);
				2020	if (err < 0)
				2021	goto out;
				2022
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2023	entry = create_proc_entry("cgroups", 0, NULL);
				2024	if (entry)
				2025	entry->proc_fops = &proc_cgroupstats_operations;
				2026
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	2027	out:
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2028	if (err)
				2029	bdi_destroy(&cgroup_backing_dev_info);
				2030
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	2031	return err;
				2032	}
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2033
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2034	/*
				2035	* proc_cgroup_show()
				2036	* - Print task's cgroup paths into seq_file, one line for each hierarchy
				2037	* - Used for /proc/<pid>/cgroup.
				2038	* - No need to task_lock(tsk) on this tsk->cgroup reference, as it
				2039	* doesn't really matter if tsk->cgroup changes after we read it,
				2040	* and we take cgroup_mutex, keeping attach_task() from changing it
				2041	* anyway. No need to check that tsk->cgroup != NULL, thanks to
				2042	* the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
				2043	* cgroup to top_cgroup.
				2044	*/
				2045
				2046	/* TODO: Use a proper seq_file iterator */
				2047	static int proc_cgroup_show(struct seq_file m, void v)
				2048	{
				2049	struct pid *pid;
				2050	struct task_struct *tsk;
				2051	char *buf;
				2052	int retval;
				2053	struct cgroupfs_root *root;
				2054
				2055	retval = -ENOMEM;
				2056	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
				2057	if (!buf)
				2058	goto out;
				2059
				2060	retval = -ESRCH;
				2061	pid = m->private;
				2062	tsk = get_pid_task(pid, PIDTYPE_PID);
				2063	if (!tsk)
				2064	goto out_free;
				2065
				2066	retval = 0;
				2067
				2068	mutex_lock(&cgroup_mutex);
				2069
				2070	for_each_root(root) {
				2071	struct cgroup_subsys *ss;
				2072	struct cgroup *cont;
				2073	int subsys_id;
				2074	int count = 0;
				2075
				2076	/* Skip this hierarchy if it has no active subsystems */
				2077	if (!root->actual_subsys_bits)
				2078	continue;
				2079	for_each_subsys(root, ss)
				2080	seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
				2081	seq_putc(m, ':');
				2082	get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
				2083	cont = task_cgroup(tsk, subsys_id);
				2084	retval = cgroup_path(cont, buf, PAGE_SIZE);
				2085	if (retval < 0)
				2086	goto out_unlock;
				2087	seq_puts(m, buf);
				2088	seq_putc(m, '\n');
				2089	}
				2090
				2091	out_unlock:
				2092	mutex_unlock(&cgroup_mutex);
				2093	put_task_struct(tsk);
				2094	out_free:
				2095	kfree(buf);
				2096	out:
				2097	return retval;
				2098	}
				2099
				2100	static int cgroup_open(struct inode inode, struct file file)
				2101	{
				2102	struct pid *pid = PROC_I(inode)->pid;
				2103	return single_open(file, proc_cgroup_show, pid);
				2104	}
				2105
				2106	struct file_operations proc_cgroup_operations = {
				2107	.open = cgroup_open,
				2108	.read = seq_read,
				2109	.llseek = seq_lseek,
				2110	.release = single_release,
				2111	};
				2112
				2113	/* Display information about each subsystem and each hierarchy */
				2114	static int proc_cgroupstats_show(struct seq_file m, void v)
				2115	{
				2116	int i;
				2117	struct cgroupfs_root *root;
				2118
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2119	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2120	mutex_lock(&cgroup_mutex);
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2121	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				2122	struct cgroup_subsys *ss = subsys[i];
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2123	seq_printf(m, "%s\t%lu\t%d\n",
				2124	ss->name, ss->root->subsys_bits,
				2125	ss->root->number_of_cgroups);
Paul Menage	a424316	2007-10-18 23:39:35 -0700	[diff] [blame]	2126	}
				2127	mutex_unlock(&cgroup_mutex);
				2128	return 0;
				2129	}
				2130
				2131	static int cgroupstats_open(struct inode inode, struct file file)
				2132	{
				2133	return single_open(file, proc_cgroupstats_show, 0);
				2134	}
				2135
				2136	static struct file_operations proc_cgroupstats_operations = {
				2137	.open = cgroupstats_open,
				2138	.read = seq_read,
				2139	.llseek = seq_lseek,
				2140	.release = single_release,
				2141	};
				2142
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2143	/**
				2144	* cgroup_fork - attach newly forked task to its parents cgroup.
				2145	* @tsk: pointer to task_struct of forking parent process.
				2146	*
				2147	* Description: A task inherits its parent's cgroup at fork().
				2148	*
				2149	* A pointer to the shared css_set was automatically copied in
				2150	* fork.c by dup_task_struct(). However, we ignore that copy, since
				2151	* it was not made under the protection of RCU or cgroup_mutex, so
				2152	* might no longer be a valid cgroup pointer. attach_task() might
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2153	* have already changed current->cgroups, allowing the previously
				2154	* referenced cgroup group to be removed and freed.
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2155	*
				2156	* At the point that cgroup_fork() is called, 'current' is the parent
				2157	* task, and the passed argument 'child' points to the child task.
				2158	*/
				2159	void cgroup_fork(struct task_struct *child)
				2160	{
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2161	task_lock(current);
				2162	child->cgroups = current->cgroups;
				2163	get_css_set(child->cgroups);
				2164	task_unlock(current);
				2165	INIT_LIST_HEAD(&child->cg_list);
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2166	}
				2167
				2168	/**
				2169	* cgroup_fork_callbacks - called on a new task very soon before
				2170	* adding it to the tasklist. No need to take any locks since no-one
				2171	* can be operating on this task
				2172	*/
				2173	void cgroup_fork_callbacks(struct task_struct *child)
				2174	{
				2175	if (need_forkexit_callback) {
				2176	int i;
				2177	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				2178	struct cgroup_subsys *ss = subsys[i];
				2179	if (ss->fork)
				2180	ss->fork(ss, child);
				2181	}
				2182	}
				2183	}
				2184
				2185	/**
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2186	* cgroup_post_fork - called on a new task after adding it to the
				2187	* task list. Adds the task to the list running through its css_set
				2188	* if necessary. Has to be after the task is visible on the task list
				2189	* in case we race with the first call to cgroup_iter_start() - to
				2190	* guarantee that the new task ends up on its list. */
				2191	void cgroup_post_fork(struct task_struct *child)
				2192	{
				2193	if (use_task_css_set_links) {
				2194	write_lock(&css_set_lock);
				2195	if (list_empty(&child->cg_list))
				2196	list_add(&child->cg_list, &child->cgroups->tasks);
				2197	write_unlock(&css_set_lock);
				2198	}
				2199	}
				2200	/**
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2201	* cgroup_exit - detach cgroup from exiting task
				2202	* @tsk: pointer to task_struct of exiting process
				2203	*
				2204	* Description: Detach cgroup from @tsk and release it.
				2205	*
				2206	* Note that cgroups marked notify_on_release force every task in
				2207	* them to take the global cgroup_mutex mutex when exiting.
				2208	* This could impact scaling on very large systems. Be reluctant to
				2209	* use notify_on_release cgroups where very high task exit scaling
				2210	* is required on large systems.
				2211	*
				2212	* the_top_cgroup_hack:
				2213	*
				2214	* Set the exiting tasks cgroup to the root cgroup (top_cgroup).
				2215	*
				2216	* We call cgroup_exit() while the task is still competent to
				2217	* handle notify_on_release(), then leave the task attached to the
				2218	* root cgroup in each hierarchy for the remainder of its exit.
				2219	*
				2220	* To do this properly, we would increment the reference count on
				2221	* top_cgroup, and near the very end of the kernel/exit.c do_exit()
				2222	* code we would add a second cgroup function call, to drop that
				2223	* reference. This would just create an unnecessary hot spot on
				2224	* the top_cgroup reference count, to no avail.
				2225	*
				2226	* Normally, holding a reference to a cgroup without bumping its
				2227	* count is unsafe. The cgroup could go away, or someone could
				2228	* attach us to a different cgroup, decrementing the count on
				2229	* the first cgroup that we never incremented. But in this case,
				2230	* top_cgroup isn't going away, and either task has PF_EXITING set,
				2231	* which wards off any attach_task() attempts, or task is a failed
				2232	* fork, never visible to attach_task.
				2233	*
				2234	*/
				2235	void cgroup_exit(struct task_struct *tsk, int run_callbacks)
				2236	{
				2237	int i;
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2238	struct css_set *cg;
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2239
				2240	if (run_callbacks && need_forkexit_callback) {
				2241	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				2242	struct cgroup_subsys *ss = subsys[i];
				2243	if (ss->exit)
				2244	ss->exit(ss, tsk);
				2245	}
				2246	}
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2247
				2248	/*
				2249	* Unlink from the css_set task list if necessary.
				2250	* Optimistically check cg_list before taking
				2251	* css_set_lock
				2252	*/
				2253	if (!list_empty(&tsk->cg_list)) {
				2254	write_lock(&css_set_lock);
				2255	if (!list_empty(&tsk->cg_list))
				2256	list_del(&tsk->cg_list);
				2257	write_unlock(&css_set_lock);
				2258	}
				2259
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2260	/* Reassign the task to the init_css_set. */
				2261	task_lock(tsk);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2262	cg = tsk->cgroups;
				2263	tsk->cgroups = &init_css_set;
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2264	task_unlock(tsk);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2265	if (cg)
				2266	put_css_set(cg);
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame]	2267	}
Paul Menage	697f416	2007-10-18 23:39:34 -0700	[diff] [blame]	2268
				2269	/**
				2270	* cgroup_clone - duplicate the current cgroup in the hierarchy
				2271	* that the given subsystem is attached to, and move this task into
				2272	* the new child
				2273	*/
				2274	int cgroup_clone(struct task_struct tsk, struct cgroup_subsys subsys)
				2275	{
				2276	struct dentry *dentry;
				2277	int ret = 0;
				2278	char nodename[MAX_CGROUP_TYPE_NAMELEN];
				2279	struct cgroup parent, child;
				2280	struct inode *inode;
				2281	struct css_set *cg;
				2282	struct cgroupfs_root *root;
				2283	struct cgroup_subsys *ss;
				2284
				2285	/* We shouldn't be called by an unregistered subsystem */
				2286	BUG_ON(!subsys->active);
				2287
				2288	/* First figure out what hierarchy and cgroup we're dealing
				2289	* with, and pin them so we can drop cgroup_mutex */
				2290	mutex_lock(&cgroup_mutex);
				2291	again:
				2292	root = subsys->root;
				2293	if (root == &rootnode) {
				2294	printk(KERN_INFO
				2295	"Not cloning cgroup for unused subsystem %s\n",
				2296	subsys->name);
				2297	mutex_unlock(&cgroup_mutex);
				2298	return 0;
				2299	}
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2300	cg = tsk->cgroups;
Paul Menage	697f416	2007-10-18 23:39:34 -0700	[diff] [blame]	2301	parent = task_cgroup(tsk, subsys->subsys_id);
				2302
				2303	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
				2304
				2305	/* Pin the hierarchy */
				2306	atomic_inc(&parent->root->sb->s_active);
				2307
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2308	/* Keep the cgroup alive */
				2309	get_css_set(cg);
Paul Menage	697f416	2007-10-18 23:39:34 -0700	[diff] [blame]	2310	mutex_unlock(&cgroup_mutex);
				2311
				2312	/* Now do the VFS work to create a cgroup */
				2313	inode = parent->dentry->d_inode;
				2314
				2315	/* Hold the parent directory mutex across this operation to
				2316	* stop anyone else deleting the new cgroup */
				2317	mutex_lock(&inode->i_mutex);
				2318	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
				2319	if (IS_ERR(dentry)) {
				2320	printk(KERN_INFO
				2321	"Couldn't allocate dentry for %s: %ld\n", nodename,
				2322	PTR_ERR(dentry));
				2323	ret = PTR_ERR(dentry);
				2324	goto out_release;
				2325	}
				2326
				2327	/* Create the cgroup directory, which also creates the cgroup */
				2328	ret = vfs_mkdir(inode, dentry, S_IFDIR \| 0755);
				2329	child = __d_cont(dentry);
				2330	dput(dentry);
				2331	if (ret) {
				2332	printk(KERN_INFO
				2333	"Failed to create cgroup %s: %d\n", nodename,
				2334	ret);
				2335	goto out_release;
				2336	}
				2337
				2338	if (!child) {
				2339	printk(KERN_INFO
				2340	"Couldn't find new cgroup %s\n", nodename);
				2341	ret = -ENOMEM;
				2342	goto out_release;
				2343	}
				2344
				2345	/* The cgroup now exists. Retake cgroup_mutex and check
				2346	* that we're still in the same state that we thought we
				2347	* were. */
				2348	mutex_lock(&cgroup_mutex);
				2349	if ((root != subsys->root) \|\|
				2350	(parent != task_cgroup(tsk, subsys->subsys_id))) {
				2351	/* Aargh, we raced ... */
				2352	mutex_unlock(&inode->i_mutex);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2353	put_css_set(cg);
Paul Menage	697f416	2007-10-18 23:39:34 -0700	[diff] [blame]	2354
				2355	deactivate_super(parent->root->sb);
				2356	/* The cgroup is still accessible in the VFS, but
				2357	* we're not going to try to rmdir() it at this
				2358	* point. */
				2359	printk(KERN_INFO
				2360	"Race in cgroup_clone() - leaking cgroup %s\n",
				2361	nodename);
				2362	goto again;
				2363	}
				2364
				2365	/* do any required auto-setup */
				2366	for_each_subsys(root, ss) {
				2367	if (ss->post_clone)
				2368	ss->post_clone(ss, child);
				2369	}
				2370
				2371	/* All seems fine. Finish by moving the task into the new cgroup */
				2372	ret = attach_task(child, tsk);
				2373	mutex_unlock(&cgroup_mutex);
				2374
				2375	out_release:
				2376	mutex_unlock(&inode->i_mutex);
Paul Menage	817929e	2007-10-18 23:39:36 -0700	[diff] [blame^]	2377	put_css_set(cg);
Paul Menage	697f416	2007-10-18 23:39:34 -0700	[diff] [blame]	2378	deactivate_super(parent->root->sb);
				2379	return ret;
				2380	}
				2381
				2382	/*
				2383	* See if "cont" is a descendant of the current task's cgroup in
				2384	* the appropriate hierarchy
				2385	*
				2386	* If we are sending in dummytop, then presumably we are creating
				2387	* the top cgroup in the subsystem.
				2388	*
				2389	* Called only by the ns (nsproxy) cgroup.
				2390	*/
				2391	int cgroup_is_descendant(const struct cgroup *cont)
				2392	{
				2393	int ret;
				2394	struct cgroup *target;
				2395	int subsys_id;
				2396
				2397	if (cont == dummytop)
				2398	return 1;
				2399
				2400	get_first_subsys(cont, NULL, &subsys_id);
				2401	target = task_cgroup(current, subsys_id);
				2402	while (cont != target && cont!= cont->top_cgroup)
				2403	cont = cont->parent;
				2404	ret = (cont == target);
				2405	return ret;
				2406	}