Blame - kernel/fork.c - android_kernel_htc_msm8960

blob: f42a17f88699bb4c1a07b4f93961c92e3c5d75a5 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/kernel/fork.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	/*
				8	* 'fork.c' contains the help-routines for the 'fork' system call
				9	* (see also entry.S and others).
				10	* Fork is rather simple, once you get the hang of it, but the memory
				11	* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
				12	*/
				13
				14	#include <linux/config.h>
				15	#include <linux/slab.h>
				16	#include <linux/init.h>
				17	#include <linux/unistd.h>
				18	#include <linux/smp_lock.h>
				19	#include <linux/module.h>
				20	#include <linux/vmalloc.h>
				21	#include <linux/completion.h>
				22	#include <linux/namespace.h>
				23	#include <linux/personality.h>
				24	#include <linux/mempolicy.h>
				25	#include <linux/sem.h>
				26	#include <linux/file.h>
				27	#include <linux/key.h>
				28	#include <linux/binfmts.h>
				29	#include <linux/mman.h>
				30	#include <linux/fs.h>
				31	#include <linux/cpu.h>
				32	#include <linux/cpuset.h>
				33	#include <linux/security.h>
				34	#include <linux/swap.h>
				35	#include <linux/syscalls.h>
				36	#include <linux/jiffies.h>
				37	#include <linux/futex.h>
				38	#include <linux/ptrace.h>
				39	#include <linux/mount.h>
				40	#include <linux/audit.h>
				41	#include <linux/profile.h>
				42	#include <linux/rmap.h>
				43	#include <linux/acct.h>
				44
				45	#include <asm/pgtable.h>
				46	#include <asm/pgalloc.h>
				47	#include <asm/uaccess.h>
				48	#include <asm/mmu_context.h>
				49	#include <asm/cacheflush.h>
				50	#include <asm/tlbflush.h>
				51
				52	/*
				53	* Protected counters by write_lock_irq(&tasklist_lock)
				54	*/
				55	unsigned long total_forks; /* Handle normal Linux uptimes. */
				56	int nr_threads; /* The idle threads do not count.. */
				57
				58	int max_threads; /* tunable limit on nr_threads */
				59
				60	DEFINE_PER_CPU(unsigned long, process_counts) = 0;
				61
				62	__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
				63
				64	EXPORT_SYMBOL(tasklist_lock);
				65
				66	int nr_processes(void)
				67	{
				68	int cpu;
				69	int total = 0;
				70
				71	for_each_online_cpu(cpu)
				72	total += per_cpu(process_counts, cpu);
				73
				74	return total;
				75	}
				76
				77	#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
				78	# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
				79	# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
				80	static kmem_cache_t *task_struct_cachep;
				81	#endif
				82
				83	/* SLAB cache for signal_struct structures (tsk->signal) */
				84	kmem_cache_t *signal_cachep;
				85
				86	/* SLAB cache for sighand_struct structures (tsk->sighand) */
				87	kmem_cache_t *sighand_cachep;
				88
				89	/* SLAB cache for files_struct structures (tsk->files) */
				90	kmem_cache_t *files_cachep;
				91
				92	/* SLAB cache for fs_struct structures (tsk->fs) */
				93	kmem_cache_t *fs_cachep;
				94
				95	/* SLAB cache for vm_area_struct structures */
				96	kmem_cache_t *vm_area_cachep;
				97
				98	/* SLAB cache for mm_struct structures (tsk->mm) */
				99	static kmem_cache_t *mm_cachep;
				100
				101	void free_task(struct task_struct *tsk)
				102	{
				103	free_thread_info(tsk->thread_info);
				104	free_task_struct(tsk);
				105	}
				106	EXPORT_SYMBOL(free_task);
				107
				108	void __put_task_struct(struct task_struct *tsk)
				109	{
				110	WARN_ON(!(tsk->exit_state & (EXIT_DEAD \| EXIT_ZOMBIE)));
				111	WARN_ON(atomic_read(&tsk->usage));
				112	WARN_ON(tsk == current);
				113
				114	if (unlikely(tsk->audit_context))
				115	audit_free(tsk);
				116	security_task_free(tsk);
				117	free_uid(tsk->user);
				118	put_group_info(tsk->group_info);
				119
				120	if (!profile_handoff_task(tsk))
				121	free_task(tsk);
				122	}
				123
				124	void __init fork_init(unsigned long mempages)
				125	{
				126	#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
				127	#ifndef ARCH_MIN_TASKALIGN
				128	#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
				129	#endif
				130	/* create a slab on which task_structs can be allocated */
				131	task_struct_cachep =
				132	kmem_cache_create("task_struct", sizeof(struct task_struct),
				133	ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
				134	#endif
				135
				136	/*
				137	* The default maximum number of threads is set to a safe
				138	* value: the thread structures can take up at most half
				139	* of memory.
				140	*/
				141	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
				142
				143	/*
				144	* we need to allow at least 20 threads to boot a system
				145	*/
				146	if(max_threads < 20)
				147	max_threads = 20;
				148
				149	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
				150	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
				151	init_task.signal->rlim[RLIMIT_SIGPENDING] =
				152	init_task.signal->rlim[RLIMIT_NPROC];
				153	}
				154
				155	static struct task_struct dup_task_struct(struct task_struct orig)
				156	{
				157	struct task_struct *tsk;
				158	struct thread_info *ti;
				159
				160	prepare_to_copy(orig);
				161
				162	tsk = alloc_task_struct();
				163	if (!tsk)
				164	return NULL;
				165
				166	ti = alloc_thread_info(tsk);
				167	if (!ti) {
				168	free_task_struct(tsk);
				169	return NULL;
				170	}
				171
				172	ti = orig->thread_info;
				173	tsk = orig;
				174	tsk->thread_info = ti;
				175	ti->task = tsk;
				176
				177	/* One for us, one for whoever does the "release_task()" (usually parent) */
				178	atomic_set(&tsk->usage,2);
				179	return tsk;
				180	}
				181
				182	#ifdef CONFIG_MMU
				183	static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
				184	{
				185	struct vm_area_struct * mpnt, tmp, *pprev;
				186	struct rb_node *rb_link, rb_parent;
				187	int retval;
				188	unsigned long charge;
				189	struct mempolicy *pol;
				190
				191	down_write(&oldmm->mmap_sem);
				192	flush_cache_mm(current->mm);
				193	mm->locked_vm = 0;
				194	mm->mmap = NULL;
				195	mm->mmap_cache = NULL;
				196	mm->free_area_cache = oldmm->mmap_base;
				197	mm->map_count = 0;
				198	set_mm_counter(mm, rss, 0);
				199	set_mm_counter(mm, anon_rss, 0);
				200	cpus_clear(mm->cpu_vm_mask);
				201	mm->mm_rb = RB_ROOT;
				202	rb_link = &mm->mm_rb.rb_node;
				203	rb_parent = NULL;
				204	pprev = &mm->mmap;
				205
				206	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
				207	struct file *file;
				208
				209	if (mpnt->vm_flags & VM_DONTCOPY) {
				210	__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
				211	-vma_pages(mpnt));
				212	continue;
				213	}
				214	charge = 0;
				215	if (mpnt->vm_flags & VM_ACCOUNT) {
				216	unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
				217	if (security_vm_enough_memory(len))
				218	goto fail_nomem;
				219	charge = len;
				220	}
				221	tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
				222	if (!tmp)
				223	goto fail_nomem;
				224	tmp = mpnt;
				225	pol = mpol_copy(vma_policy(mpnt));
				226	retval = PTR_ERR(pol);
				227	if (IS_ERR(pol))
				228	goto fail_nomem_policy;
				229	vma_set_policy(tmp, pol);
				230	tmp->vm_flags &= ~VM_LOCKED;
				231	tmp->vm_mm = mm;
				232	tmp->vm_next = NULL;
				233	anon_vma_link(tmp);
				234	file = tmp->vm_file;
				235	if (file) {
				236	struct inode *inode = file->f_dentry->d_inode;
				237	get_file(file);
				238	if (tmp->vm_flags & VM_DENYWRITE)
				239	atomic_dec(&inode->i_writecount);
				240
				241	/* insert tmp into the share list, just after mpnt */
				242	spin_lock(&file->f_mapping->i_mmap_lock);
				243	tmp->vm_truncate_count = mpnt->vm_truncate_count;
				244	flush_dcache_mmap_lock(file->f_mapping);
				245	vma_prio_tree_add(tmp, mpnt);
				246	flush_dcache_mmap_unlock(file->f_mapping);
				247	spin_unlock(&file->f_mapping->i_mmap_lock);
				248	}
				249
				250	/*
				251	* Link in the new vma and copy the page table entries:
				252	* link in first so that swapoff can see swap entries,
				253	* and try_to_unmap_one's find_vma find the new vma.
				254	*/
				255	spin_lock(&mm->page_table_lock);
				256	*pprev = tmp;
				257	pprev = &tmp->vm_next;
				258
				259	__vma_link_rb(mm, tmp, rb_link, rb_parent);
				260	rb_link = &tmp->vm_rb.rb_right;
				261	rb_parent = &tmp->vm_rb;
				262
				263	mm->map_count++;
				264	retval = copy_page_range(mm, current->mm, tmp);
				265	spin_unlock(&mm->page_table_lock);
				266
				267	if (tmp->vm_ops && tmp->vm_ops->open)
				268	tmp->vm_ops->open(tmp);
				269
				270	if (retval)
				271	goto out;
				272	}
				273	retval = 0;
				274
				275	out:
				276	flush_tlb_mm(current->mm);
				277	up_write(&oldmm->mmap_sem);
				278	return retval;
				279	fail_nomem_policy:
				280	kmem_cache_free(vm_area_cachep, tmp);
				281	fail_nomem:
				282	retval = -ENOMEM;
				283	vm_unacct_memory(charge);
				284	goto out;
				285	}
				286
				287	static inline int mm_alloc_pgd(struct mm_struct * mm)
				288	{
				289	mm->pgd = pgd_alloc(mm);
				290	if (unlikely(!mm->pgd))
				291	return -ENOMEM;
				292	return 0;
				293	}
				294
				295	static inline void mm_free_pgd(struct mm_struct * mm)
				296	{
				297	pgd_free(mm->pgd);
				298	}
				299	#else
				300	#define dup_mmap(mm, oldmm) (0)
				301	#define mm_alloc_pgd(mm) (0)
				302	#define mm_free_pgd(mm)
				303	#endif /* CONFIG_MMU */
				304
				305	__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
				306
				307	#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
				308	#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
				309
				310	#include <linux/init_task.h>
				311
				312	static struct mm_struct * mm_init(struct mm_struct * mm)
				313	{
				314	atomic_set(&mm->mm_users, 1);
				315	atomic_set(&mm->mm_count, 1);
				316	init_rwsem(&mm->mmap_sem);
				317	INIT_LIST_HEAD(&mm->mmlist);
				318	mm->core_waiters = 0;
				319	mm->nr_ptes = 0;
				320	spin_lock_init(&mm->page_table_lock);
				321	rwlock_init(&mm->ioctx_list_lock);
				322	mm->ioctx_list = NULL;
				323	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
				324	mm->free_area_cache = TASK_UNMAPPED_BASE;
				325
				326	if (likely(!mm_alloc_pgd(mm))) {
				327	mm->def_flags = 0;
				328	return mm;
				329	}
				330	free_mm(mm);
				331	return NULL;
				332	}
				333
				334	/*
				335	* Allocate and initialize an mm_struct.
				336	*/
				337	struct mm_struct * mm_alloc(void)
				338	{
				339	struct mm_struct * mm;
				340
				341	mm = allocate_mm();
				342	if (mm) {
				343	memset(mm, 0, sizeof(*mm));
				344	mm = mm_init(mm);
				345	}
				346	return mm;
				347	}
				348
				349	/*
				350	* Called when the last reference to the mm
				351	* is dropped: either by a lazy thread or by
				352	* mmput. Free the page directory and the mm.
				353	*/
				354	void fastcall __mmdrop(struct mm_struct *mm)
				355	{
				356	BUG_ON(mm == &init_mm);
				357	mm_free_pgd(mm);
				358	destroy_context(mm);
				359	free_mm(mm);
				360	}
				361
				362	/*
				363	* Decrement the use count and release all resources for an mm.
				364	*/
				365	void mmput(struct mm_struct *mm)
				366	{
				367	if (atomic_dec_and_test(&mm->mm_users)) {
				368	exit_aio(mm);
				369	exit_mmap(mm);
				370	if (!list_empty(&mm->mmlist)) {
				371	spin_lock(&mmlist_lock);
				372	list_del(&mm->mmlist);
				373	spin_unlock(&mmlist_lock);
				374	}
				375	put_swap_token(mm);
				376	mmdrop(mm);
				377	}
				378	}
				379	EXPORT_SYMBOL_GPL(mmput);
				380
				381	/**
				382	* get_task_mm - acquire a reference to the task's mm
				383	*
				384	* Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
				385	* this kernel workthread has transiently adopted a user mm with use_mm,
				386	* to do its AIO) is not set and if so returns a reference to it, after
				387	* bumping up the use count. User must release the mm via mmput()
				388	* after use. Typically used by /proc and ptrace.
				389	*/
				390	struct mm_struct get_task_mm(struct task_struct task)
				391	{
				392	struct mm_struct *mm;
				393
				394	task_lock(task);
				395	mm = task->mm;
				396	if (mm) {
				397	if (task->flags & PF_BORROWED_MM)
				398	mm = NULL;
				399	else
				400	atomic_inc(&mm->mm_users);
				401	}
				402	task_unlock(task);
				403	return mm;
				404	}
				405	EXPORT_SYMBOL_GPL(get_task_mm);
				406
				407	/* Please note the differences between mmput and mm_release.
				408	* mmput is called whenever we stop holding onto a mm_struct,
				409	* error success whatever.
				410	*
				411	* mm_release is called after a mm_struct has been removed
				412	* from the current process.
				413	*
				414	* This difference is important for error handling, when we
				415	* only half set up a mm_struct for a new process and need to restore
				416	* the old one. Because we mmput the new mm_struct before
				417	* restoring the old one. . .
				418	* Eric Biederman 10 January 1998
				419	*/
				420	void mm_release(struct task_struct tsk, struct mm_struct mm)
				421	{
				422	struct completion *vfork_done = tsk->vfork_done;
				423
				424	/* Get rid of any cached register state */
				425	deactivate_mm(tsk, mm);
				426
				427	/* notify parent sleeping on vfork() */
				428	if (vfork_done) {
				429	tsk->vfork_done = NULL;
				430	complete(vfork_done);
				431	}
				432	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
				433	u32 __user * tidptr = tsk->clear_child_tid;
				434	tsk->clear_child_tid = NULL;
				435
				436	/*
				437	* We don't check the error code - if userspace has
				438	* not set up a proper pointer then tough luck.
				439	*/
				440	put_user(0, tidptr);
				441	sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
				442	}
				443	}
				444
				445	static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
				446	{
				447	struct mm_struct * mm, *oldmm;
				448	int retval;
				449
				450	tsk->min_flt = tsk->maj_flt = 0;
				451	tsk->nvcsw = tsk->nivcsw = 0;
				452
				453	tsk->mm = NULL;
				454	tsk->active_mm = NULL;
				455
				456	/*
				457	* Are we cloning a kernel thread?
				458	*
				459	* We need to steal a active VM for that..
				460	*/
				461	oldmm = current->mm;
				462	if (!oldmm)
				463	return 0;
				464
				465	if (clone_flags & CLONE_VM) {
				466	atomic_inc(&oldmm->mm_users);
				467	mm = oldmm;
				468	/*
				469	* There are cases where the PTL is held to ensure no
				470	* new threads start up in user mode using an mm, which
				471	* allows optimizing out ipis; the tlb_gather_mmu code
				472	* is an example.
				473	*/
				474	spin_unlock_wait(&oldmm->page_table_lock);
				475	goto good_mm;
				476	}
				477
				478	retval = -ENOMEM;
				479	mm = allocate_mm();
				480	if (!mm)
				481	goto fail_nomem;
				482
				483	/* Copy the current MM stuff.. */
				484	memcpy(mm, oldmm, sizeof(*mm));
				485	if (!mm_init(mm))
				486	goto fail_nomem;
				487
				488	if (init_new_context(tsk,mm))
				489	goto fail_nocontext;
				490
				491	retval = dup_mmap(mm, oldmm);
				492	if (retval)
				493	goto free_pt;
				494
				495	mm->hiwater_rss = get_mm_counter(mm,rss);
				496	mm->hiwater_vm = mm->total_vm;
				497
				498	good_mm:
				499	tsk->mm = mm;
				500	tsk->active_mm = mm;
				501	return 0;
				502
				503	free_pt:
				504	mmput(mm);
				505	fail_nomem:
				506	return retval;
				507
				508	fail_nocontext:
				509	/*
				510	* If init_new_context() failed, we cannot use mmput() to free the mm
				511	* because it calls destroy_context()
				512	*/
				513	mm_free_pgd(mm);
				514	free_mm(mm);
				515	return retval;
				516	}
				517
				518	static inline struct fs_struct __copy_fs_struct(struct fs_struct old)
				519	{
				520	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
				521	/* We don't need to lock fs - think why ;-) */
				522	if (fs) {
				523	atomic_set(&fs->count, 1);
				524	rwlock_init(&fs->lock);
				525	fs->umask = old->umask;
				526	read_lock(&old->lock);
				527	fs->rootmnt = mntget(old->rootmnt);
				528	fs->root = dget(old->root);
				529	fs->pwdmnt = mntget(old->pwdmnt);
				530	fs->pwd = dget(old->pwd);
				531	if (old->altroot) {
				532	fs->altrootmnt = mntget(old->altrootmnt);
				533	fs->altroot = dget(old->altroot);
				534	} else {
				535	fs->altrootmnt = NULL;
				536	fs->altroot = NULL;
				537	}
				538	read_unlock(&old->lock);
				539	}
				540	return fs;
				541	}
				542
				543	struct fs_struct copy_fs_struct(struct fs_struct old)
				544	{
				545	return __copy_fs_struct(old);
				546	}
				547
				548	EXPORT_SYMBOL_GPL(copy_fs_struct);
				549
				550	static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
				551	{
				552	if (clone_flags & CLONE_FS) {
				553	atomic_inc(&current->fs->count);
				554	return 0;
				555	}
				556	tsk->fs = __copy_fs_struct(current->fs);
				557	if (!tsk->fs)
				558	return -ENOMEM;
				559	return 0;
				560	}
				561
				562	static int count_open_files(struct files_struct *files, int size)
				563	{
				564	int i;
				565
				566	/* Find the last open fd */
				567	for (i = size/(8*sizeof(long)); i > 0; ) {
				568	if (files->open_fds->fds_bits[--i])
				569	break;
				570	}
				571	i = (i+1) * 8 * sizeof(long);
				572	return i;
				573	}
				574
				575	static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
				576	{
				577	struct files_struct oldf, newf;
				578	struct file old_fds, new_fds;
				579	int open_files, size, i, error = 0, expand;
				580
				581	/*
				582	* A background process may not have any files ...
				583	*/
				584	oldf = current->files;
				585	if (!oldf)
				586	goto out;
				587
				588	if (clone_flags & CLONE_FILES) {
				589	atomic_inc(&oldf->count);
				590	goto out;
				591	}
				592
				593	/*
				594	* Note: we may be using current for both targets (See exec.c)
				595	* This works because we cache current->files (old) as oldf. Don't
				596	* break this.
				597	*/
				598	tsk->files = NULL;
				599	error = -ENOMEM;
				600	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
				601	if (!newf)
				602	goto out;
				603
				604	atomic_set(&newf->count, 1);
				605
				606	spin_lock_init(&newf->file_lock);
				607	newf->next_fd = 0;
				608	newf->max_fds = NR_OPEN_DEFAULT;
				609	newf->max_fdset = __FD_SETSIZE;
				610	newf->close_on_exec = &newf->close_on_exec_init;
				611	newf->open_fds = &newf->open_fds_init;
				612	newf->fd = &newf->fd_array[0];
				613
				614	spin_lock(&oldf->file_lock);
				615
				616	open_files = count_open_files(oldf, oldf->max_fdset);
				617	expand = 0;
				618
				619	/*
				620	* Check whether we need to allocate a larger fd array or fd set.
				621	* Note: we're not a clone task, so the open count won't change.
				622	*/
				623	if (open_files > newf->max_fdset) {
				624	newf->max_fdset = 0;
				625	expand = 1;
				626	}
				627	if (open_files > newf->max_fds) {
				628	newf->max_fds = 0;
				629	expand = 1;
				630	}
				631
				632	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
				633	if (expand) {
				634	spin_unlock(&oldf->file_lock);
				635	spin_lock(&newf->file_lock);
				636	error = expand_files(newf, open_files-1);
				637	spin_unlock(&newf->file_lock);
				638	if (error < 0)
				639	goto out_release;
				640	spin_lock(&oldf->file_lock);
				641	}
				642
				643	old_fds = oldf->fd;
				644	new_fds = newf->fd;
				645
				646	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
				647	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
				648
				649	for (i = open_files; i != 0; i--) {
				650	struct file f = old_fds++;
				651	if (f) {
				652	get_file(f);
				653	} else {
				654	/*
				655	* The fd may be claimed in the fd bitmap but not yet
				656	* instantiated in the files array if a sibling thread
				657	* is partway through open(). So make sure that this
				658	* fd is available to the new process.
				659	*/
				660	FD_CLR(open_files - i, newf->open_fds);
				661	}
				662	*new_fds++ = f;
				663	}
				664	spin_unlock(&oldf->file_lock);
				665
				666	/* compute the remainder to be cleared */
				667	size = (newf->max_fds - open_files) * sizeof(struct file *);
				668
				669	/* This is long word aligned thus could use a optimized version */
				670	memset(new_fds, 0, size);
				671
				672	if (newf->max_fdset > open_files) {
				673	int left = (newf->max_fdset-open_files)/8;
				674	int start = open_files / (8 * sizeof(unsigned long));
				675
				676	memset(&newf->open_fds->fds_bits[start], 0, left);
				677	memset(&newf->close_on_exec->fds_bits[start], 0, left);
				678	}
				679
				680	tsk->files = newf;
				681	error = 0;
				682	out:
				683	return error;
				684
				685	out_release:
				686	free_fdset (newf->close_on_exec, newf->max_fdset);
				687	free_fdset (newf->open_fds, newf->max_fdset);
				688	free_fd_array(newf->fd, newf->max_fds);
				689	kmem_cache_free(files_cachep, newf);
				690	goto out;
				691	}
				692
				693	/*
				694	* Helper to unshare the files of the current task.
				695	* We don't want to expose copy_files internals to
				696	* the exec layer of the kernel.
				697	*/
				698
				699	int unshare_files(void)
				700	{
				701	struct files_struct *files = current->files;
				702	int rc;
				703
				704	if(!files)
				705	BUG();
				706
				707	/* This can race but the race causes us to copy when we don't
				708	need to and drop the copy */
				709	if(atomic_read(&files->count) == 1)
				710	{
				711	atomic_inc(&files->count);
				712	return 0;
				713	}
				714	rc = copy_files(0, current);
				715	if(rc)
				716	current->files = files;
				717	return rc;
				718	}
				719
				720	EXPORT_SYMBOL(unshare_files);
				721
				722	static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
				723	{
				724	struct sighand_struct *sig;
				725
				726	if (clone_flags & (CLONE_SIGHAND \| CLONE_THREAD)) {
				727	atomic_inc(&current->sighand->count);
				728	return 0;
				729	}
				730	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
				731	tsk->sighand = sig;
				732	if (!sig)
				733	return -ENOMEM;
				734	spin_lock_init(&sig->siglock);
				735	atomic_set(&sig->count, 1);
				736	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
				737	return 0;
				738	}
				739
				740	static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
				741	{
				742	struct signal_struct *sig;
				743	int ret;
				744
				745	if (clone_flags & CLONE_THREAD) {
				746	atomic_inc(&current->signal->count);
				747	atomic_inc(&current->signal->live);
				748	return 0;
				749	}
				750	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
				751	tsk->signal = sig;
				752	if (!sig)
				753	return -ENOMEM;
				754
				755	ret = copy_thread_group_keys(tsk);
				756	if (ret < 0) {
				757	kmem_cache_free(signal_cachep, sig);
				758	return ret;
				759	}
				760
				761	atomic_set(&sig->count, 1);
				762	atomic_set(&sig->live, 1);
				763	init_waitqueue_head(&sig->wait_chldexit);
				764	sig->flags = 0;
				765	sig->group_exit_code = 0;
				766	sig->group_exit_task = NULL;
				767	sig->group_stop_count = 0;
				768	sig->curr_target = NULL;
				769	init_sigpending(&sig->shared_pending);
				770	INIT_LIST_HEAD(&sig->posix_timers);
				771
				772	sig->it_real_value = sig->it_real_incr = 0;
				773	sig->real_timer.function = it_real_fn;
				774	sig->real_timer.data = (unsigned long) tsk;
				775	init_timer(&sig->real_timer);
				776
				777	sig->it_virt_expires = cputime_zero;
				778	sig->it_virt_incr = cputime_zero;
				779	sig->it_prof_expires = cputime_zero;
				780	sig->it_prof_incr = cputime_zero;
				781
				782	sig->tty = current->signal->tty;
				783	sig->pgrp = process_group(current);
				784	sig->session = current->signal->session;
				785	sig->leader = 0; /* session leadership doesn't inherit */
				786	sig->tty_old_pgrp = 0;
				787
				788	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
				789	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
				790	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
				791	sig->sched_time = 0;
				792	INIT_LIST_HEAD(&sig->cpu_timers[0]);
				793	INIT_LIST_HEAD(&sig->cpu_timers[1]);
				794	INIT_LIST_HEAD(&sig->cpu_timers[2]);
				795
				796	task_lock(current->group_leader);
				797	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
				798	task_unlock(current->group_leader);
				799
				800	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
				801	/*
				802	* New sole thread in the process gets an expiry time
				803	* of the whole CPU time limit.
				804	*/
				805	tsk->it_prof_expires =
				806	secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
				807	}
				808
				809	return 0;
				810	}
				811
				812	static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
				813	{
				814	unsigned long new_flags = p->flags;
				815
				816	new_flags &= ~PF_SUPERPRIV;
				817	new_flags \|= PF_FORKNOEXEC;
				818	if (!(clone_flags & CLONE_PTRACE))
				819	p->ptrace = 0;
				820	p->flags = new_flags;
				821	}
				822
				823	asmlinkage long sys_set_tid_address(int __user *tidptr)
				824	{
				825	current->clear_child_tid = tidptr;
				826
				827	return current->pid;
				828	}
				829
				830	/*
				831	* This creates a new process as a copy of the old one,
				832	* but does not actually start it yet.
				833	*
				834	* It copies the registers, and all the appropriate
				835	* parts of the process environment (as per the clone
				836	* flags). The actual kick-off is left to the caller.
				837	*/
				838	static task_t *copy_process(unsigned long clone_flags,
				839	unsigned long stack_start,
				840	struct pt_regs *regs,
				841	unsigned long stack_size,
				842	int __user *parent_tidptr,
				843	int __user *child_tidptr,
				844	int pid)
				845	{
				846	int retval;
				847	struct task_struct *p = NULL;
				848
				849	if ((clone_flags & (CLONE_NEWNS\|CLONE_FS)) == (CLONE_NEWNS\|CLONE_FS))
				850	return ERR_PTR(-EINVAL);
				851
				852	/*
				853	* Thread groups must share signals as well, and detached threads
				854	* can only be started up within the thread group.
				855	*/
				856	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
				857	return ERR_PTR(-EINVAL);
				858
				859	/*
				860	* Shared signal handlers imply shared VM. By way of the above,
				861	* thread groups also imply shared VM. Blocking this case allows
				862	* for various simplifications in other code.
				863	*/
				864	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
				865	return ERR_PTR(-EINVAL);
				866
				867	retval = security_task_create(clone_flags);
				868	if (retval)
				869	goto fork_out;
				870
				871	retval = -ENOMEM;
				872	p = dup_task_struct(current);
				873	if (!p)
				874	goto fork_out;
				875
				876	retval = -EAGAIN;
				877	if (atomic_read(&p->user->processes) >=
				878	p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
				879	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
				880	p->user != &root_user)
				881	goto bad_fork_free;
				882	}
				883
				884	atomic_inc(&p->user->__count);
				885	atomic_inc(&p->user->processes);
				886	get_group_info(p->group_info);
				887
				888	/*
				889	* If multiple threads are within copy_process(), then this check
				890	* triggers too late. This doesn't hurt, the check is only there
				891	* to stop root fork bombs.
				892	*/
				893	if (nr_threads >= max_threads)
				894	goto bad_fork_cleanup_count;
				895
				896	if (!try_module_get(p->thread_info->exec_domain->module))
				897	goto bad_fork_cleanup_count;
				898
				899	if (p->binfmt && !try_module_get(p->binfmt->module))
				900	goto bad_fork_cleanup_put_domain;
				901
				902	p->did_exec = 0;
				903	copy_flags(clone_flags, p);
				904	p->pid = pid;
				905	retval = -EFAULT;
				906	if (clone_flags & CLONE_PARENT_SETTID)
				907	if (put_user(p->pid, parent_tidptr))
				908	goto bad_fork_cleanup;
				909
				910	p->proc_dentry = NULL;
				911
				912	INIT_LIST_HEAD(&p->children);
				913	INIT_LIST_HEAD(&p->sibling);
				914	p->vfork_done = NULL;
				915	spin_lock_init(&p->alloc_lock);
				916	spin_lock_init(&p->proc_lock);
				917
				918	clear_tsk_thread_flag(p, TIF_SIGPENDING);
				919	init_sigpending(&p->pending);
				920
				921	p->utime = cputime_zero;
				922	p->stime = cputime_zero;
				923	p->sched_time = 0;
				924	p->rchar = 0; /* I/O counter: bytes read */
				925	p->wchar = 0; /* I/O counter: bytes written */
				926	p->syscr = 0; /* I/O counter: read syscalls */
				927	p->syscw = 0; /* I/O counter: write syscalls */
				928	acct_clear_integrals(p);
				929
				930	p->it_virt_expires = cputime_zero;
				931	p->it_prof_expires = cputime_zero;
				932	p->it_sched_expires = 0;
				933	INIT_LIST_HEAD(&p->cpu_timers[0]);
				934	INIT_LIST_HEAD(&p->cpu_timers[1]);
				935	INIT_LIST_HEAD(&p->cpu_timers[2]);
				936
				937	p->lock_depth = -1; /* -1 = no lock */
				938	do_posix_clock_monotonic_gettime(&p->start_time);
				939	p->security = NULL;
				940	p->io_context = NULL;
				941	p->io_wait = NULL;
				942	p->audit_context = NULL;
				943	#ifdef CONFIG_NUMA
				944	p->mempolicy = mpol_copy(p->mempolicy);
				945	if (IS_ERR(p->mempolicy)) {
				946	retval = PTR_ERR(p->mempolicy);
				947	p->mempolicy = NULL;
				948	goto bad_fork_cleanup;
				949	}
				950	#endif
				951
				952	p->tgid = p->pid;
				953	if (clone_flags & CLONE_THREAD)
				954	p->tgid = current->tgid;
				955
				956	if ((retval = security_task_alloc(p)))
				957	goto bad_fork_cleanup_policy;
				958	if ((retval = audit_alloc(p)))
				959	goto bad_fork_cleanup_security;
				960	/* copy all the process information */
				961	if ((retval = copy_semundo(clone_flags, p)))
				962	goto bad_fork_cleanup_audit;
				963	if ((retval = copy_files(clone_flags, p)))
				964	goto bad_fork_cleanup_semundo;
				965	if ((retval = copy_fs(clone_flags, p)))
				966	goto bad_fork_cleanup_files;
				967	if ((retval = copy_sighand(clone_flags, p)))
				968	goto bad_fork_cleanup_fs;
				969	if ((retval = copy_signal(clone_flags, p)))
				970	goto bad_fork_cleanup_sighand;
				971	if ((retval = copy_mm(clone_flags, p)))
				972	goto bad_fork_cleanup_signal;
				973	if ((retval = copy_keys(clone_flags, p)))
				974	goto bad_fork_cleanup_mm;
				975	if ((retval = copy_namespace(clone_flags, p)))
				976	goto bad_fork_cleanup_keys;
				977	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
				978	if (retval)
				979	goto bad_fork_cleanup_namespace;
				980
				981	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
				982	/*
				983	* Clear TID on mm_release()?
				984	*/
				985	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
				986
				987	/*
				988	* Syscall tracing should be turned off in the child regardless
				989	* of CLONE_PTRACE.
				990	*/
				991	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
				992
				993	/* Our parent execution domain becomes current domain
				994	These must match for thread signalling to apply */
				995
				996	p->parent_exec_id = p->self_exec_id;
				997
				998	/* ok, now we should be set up.. */
				999	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
				1000	p->pdeath_signal = 0;
				1001	p->exit_state = 0;
				1002
				1003	/* Perform scheduler related setup */
				1004	sched_fork(p);
				1005
				1006	/*
				1007	* Ok, make it visible to the rest of the system.
				1008	* We dont wake it up yet.
				1009	*/
				1010	p->group_leader = p;
				1011	INIT_LIST_HEAD(&p->ptrace_children);
				1012	INIT_LIST_HEAD(&p->ptrace_list);
				1013
				1014	/* Need tasklist lock for parent etc handling! */
				1015	write_lock_irq(&tasklist_lock);
				1016
				1017	/*
				1018	* The task hasn't been attached yet, so cpus_allowed mask cannot
				1019	* have changed. The cpus_allowed mask of the parent may have
				1020	* changed after it was copied first time, and it may then move to
				1021	* another CPU - so we re-copy it here and set the child's CPU to
				1022	* the parent's CPU. This avoids alot of nasty races.
				1023	*/
				1024	p->cpus_allowed = current->cpus_allowed;
				1025	set_task_cpu(p, smp_processor_id());
				1026
				1027	/*
				1028	* Check for pending SIGKILL! The new thread should not be allowed
				1029	* to slip out of an OOM kill. (or normal SIGKILL.)
				1030	*/
				1031	if (sigismember(&current->pending.signal, SIGKILL)) {
				1032	write_unlock_irq(&tasklist_lock);
				1033	retval = -EINTR;
				1034	goto bad_fork_cleanup_namespace;
				1035	}
				1036
				1037	/* CLONE_PARENT re-uses the old parent */
				1038	if (clone_flags & (CLONE_PARENT\|CLONE_THREAD))
				1039	p->real_parent = current->real_parent;
				1040	else
				1041	p->real_parent = current;
				1042	p->parent = p->real_parent;
				1043
				1044	if (clone_flags & CLONE_THREAD) {
				1045	spin_lock(&current->sighand->siglock);
				1046	/*
				1047	* Important: if an exit-all has been started then
				1048	* do not create this new thread - the whole thread
				1049	* group is supposed to exit anyway.
				1050	*/
				1051	if (current->signal->flags & SIGNAL_GROUP_EXIT) {
				1052	spin_unlock(&current->sighand->siglock);
				1053	write_unlock_irq(&tasklist_lock);
				1054	retval = -EAGAIN;
				1055	goto bad_fork_cleanup_namespace;
				1056	}
				1057	p->group_leader = current->group_leader;
				1058
				1059	if (current->signal->group_stop_count > 0) {
				1060	/*
				1061	* There is an all-stop in progress for the group.
				1062	* We ourselves will stop as soon as we check signals.
				1063	* Make the new thread part of that group stop too.
				1064	*/
				1065	current->signal->group_stop_count++;
				1066	set_tsk_thread_flag(p, TIF_SIGPENDING);
				1067	}
				1068
				1069	if (!cputime_eq(current->signal->it_virt_expires,
				1070	cputime_zero) \|\|
				1071	!cputime_eq(current->signal->it_prof_expires,
				1072	cputime_zero) \|\|
				1073	current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY \|\|
				1074	!list_empty(&current->signal->cpu_timers[0]) \|\|
				1075	!list_empty(&current->signal->cpu_timers[1]) \|\|
				1076	!list_empty(&current->signal->cpu_timers[2])) {
				1077	/*
				1078	* Have child wake up on its first tick to check
				1079	* for process CPU timers.
				1080	*/
				1081	p->it_prof_expires = jiffies_to_cputime(1);
				1082	}
				1083
				1084	spin_unlock(&current->sighand->siglock);
				1085	}
				1086
				1087	SET_LINKS(p);
				1088	if (unlikely(p->ptrace & PT_PTRACED))
				1089	__ptrace_link(p, current->parent);
				1090
				1091	cpuset_fork(p);
				1092
				1093	attach_pid(p, PIDTYPE_PID, p->pid);
				1094	attach_pid(p, PIDTYPE_TGID, p->tgid);
				1095	if (thread_group_leader(p)) {
				1096	attach_pid(p, PIDTYPE_PGID, process_group(p));
				1097	attach_pid(p, PIDTYPE_SID, p->signal->session);
				1098	if (p->pid)
				1099	__get_cpu_var(process_counts)++;
				1100	}
				1101
				1102	nr_threads++;
				1103	total_forks++;
				1104	write_unlock_irq(&tasklist_lock);
				1105	retval = 0;
				1106
				1107	fork_out:
				1108	if (retval)
				1109	return ERR_PTR(retval);
				1110	return p;
				1111
				1112	bad_fork_cleanup_namespace:
				1113	exit_namespace(p);
				1114	bad_fork_cleanup_keys:
				1115	exit_keys(p);
				1116	bad_fork_cleanup_mm:
				1117	if (p->mm)
				1118	mmput(p->mm);
				1119	bad_fork_cleanup_signal:
				1120	exit_signal(p);
				1121	bad_fork_cleanup_sighand:
				1122	exit_sighand(p);
				1123	bad_fork_cleanup_fs:
				1124	exit_fs(p); /* blocking */
				1125	bad_fork_cleanup_files:
				1126	exit_files(p); /* blocking */
				1127	bad_fork_cleanup_semundo:
				1128	exit_sem(p);
				1129	bad_fork_cleanup_audit:
				1130	audit_free(p);
				1131	bad_fork_cleanup_security:
				1132	security_task_free(p);
				1133	bad_fork_cleanup_policy:
				1134	#ifdef CONFIG_NUMA
				1135	mpol_free(p->mempolicy);
				1136	#endif
				1137	bad_fork_cleanup:
				1138	if (p->binfmt)
				1139	module_put(p->binfmt->module);
				1140	bad_fork_cleanup_put_domain:
				1141	module_put(p->thread_info->exec_domain->module);
				1142	bad_fork_cleanup_count:
				1143	put_group_info(p->group_info);
				1144	atomic_dec(&p->user->processes);
				1145	free_uid(p->user);
				1146	bad_fork_free:
				1147	free_task(p);
				1148	goto fork_out;
				1149	}
				1150
				1151	struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
				1152	{
				1153	memset(regs, 0, sizeof(struct pt_regs));
				1154	return regs;
				1155	}
				1156
				1157	task_t * __devinit fork_idle(int cpu)
				1158	{
				1159	task_t *task;
				1160	struct pt_regs regs;
				1161
				1162	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
				1163	if (!task)
				1164	return ERR_PTR(-ENOMEM);
				1165	init_idle(task, cpu);
				1166	unhash_process(task);
				1167	return task;
				1168	}
				1169
				1170	static inline int fork_traceflag (unsigned clone_flags)
				1171	{
				1172	if (clone_flags & CLONE_UNTRACED)
				1173	return 0;
				1174	else if (clone_flags & CLONE_VFORK) {
				1175	if (current->ptrace & PT_TRACE_VFORK)
				1176	return PTRACE_EVENT_VFORK;
				1177	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
				1178	if (current->ptrace & PT_TRACE_CLONE)
				1179	return PTRACE_EVENT_CLONE;
				1180	} else if (current->ptrace & PT_TRACE_FORK)
				1181	return PTRACE_EVENT_FORK;
				1182
				1183	return 0;
				1184	}
				1185
				1186	/*
				1187	* Ok, this is the main fork-routine.
				1188	*
				1189	* It copies the process, and if successful kick-starts
				1190	* it and waits for it to finish using the VM if required.
				1191	*/
				1192	long do_fork(unsigned long clone_flags,
				1193	unsigned long stack_start,
				1194	struct pt_regs *regs,
				1195	unsigned long stack_size,
				1196	int __user *parent_tidptr,
				1197	int __user *child_tidptr)
				1198	{
				1199	struct task_struct *p;
				1200	int trace = 0;
				1201	long pid = alloc_pidmap();
				1202
				1203	if (pid < 0)
				1204	return -EAGAIN;
				1205	if (unlikely(current->ptrace)) {
				1206	trace = fork_traceflag (clone_flags);
				1207	if (trace)
				1208	clone_flags \|= CLONE_PTRACE;
				1209	}
				1210
				1211	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
				1212	/*
				1213	* Do this prior waking up the new thread - the thread pointer
				1214	* might get invalid after that point, if the thread exits quickly.
				1215	*/
				1216	if (!IS_ERR(p)) {
				1217	struct completion vfork;
				1218
				1219	if (clone_flags & CLONE_VFORK) {
				1220	p->vfork_done = &vfork;
				1221	init_completion(&vfork);
				1222	}
				1223
				1224	if ((p->ptrace & PT_PTRACED) \|\| (clone_flags & CLONE_STOPPED)) {
				1225	/*
				1226	* We'll start up with an immediate SIGSTOP.
				1227	*/
				1228	sigaddset(&p->pending.signal, SIGSTOP);
				1229	set_tsk_thread_flag(p, TIF_SIGPENDING);
				1230	}
				1231
				1232	if (!(clone_flags & CLONE_STOPPED))
				1233	wake_up_new_task(p, clone_flags);
				1234	else
				1235	p->state = TASK_STOPPED;
				1236
				1237	if (unlikely (trace)) {
				1238	current->ptrace_message = pid;
				1239	ptrace_notify ((trace << 8) \| SIGTRAP);
				1240	}
				1241
				1242	if (clone_flags & CLONE_VFORK) {
				1243	wait_for_completion(&vfork);
				1244	if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
				1245	ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) \| SIGTRAP);
				1246	}
				1247	} else {
				1248	free_pidmap(pid);
				1249	pid = PTR_ERR(p);
				1250	}
				1251	return pid;
				1252	}
				1253
				1254	void __init proc_caches_init(void)
				1255	{
				1256	sighand_cachep = kmem_cache_create("sighand_cache",
				1257	sizeof(struct sighand_struct), 0,
				1258	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1259	signal_cachep = kmem_cache_create("signal_cache",
				1260	sizeof(struct signal_struct), 0,
				1261	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1262	files_cachep = kmem_cache_create("files_cache",
				1263	sizeof(struct files_struct), 0,
				1264	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1265	fs_cachep = kmem_cache_create("fs_cache",
				1266	sizeof(struct fs_struct), 0,
				1267	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1268	vm_area_cachep = kmem_cache_create("vm_area_struct",
				1269	sizeof(struct vm_area_struct), 0,
				1270	SLAB_PANIC, NULL, NULL);
				1271	mm_cachep = kmem_cache_create("mm_struct",
				1272	sizeof(struct mm_struct), 0,
				1273	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL, NULL);
				1274	}