/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>

#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
	return ACCESS_ONCE(*ptep);
#else
	/*
	 * With get_user_pages_fast, we walk down the pagetables without taking
	 * any locks. For this we would like to load the pointers atomically,
	 * but that is not possible (without expensive cmpxchg8b) on PAE. What
	 * we do have is the guarantee that a pte will only either go from not
	 * present to present, or present to not present or both -- it will not
	 * switch to a completely different present page without a TLB flush in
	 * between; something that we are blocking by holding interrupts off.
	 *
	 * Setting ptes from not present to present goes:
	 * ptep->pte_high = h;
	 * smp_wmb();
	 * ptep->pte_low = l;
	 *
	 * And present to not present goes:
	 * ptep->pte_low = 0;
	 * smp_wmb();
	 * ptep->pte_high = 0;
	 *
	 * We must ensure here that the load of pte_low sees l iff pte_high
	 * sees h. We load pte_high *after* loading pte_low, which ensures we
	 * don't see an older value of pte_high. *Then* we recheck pte_low,
	 * which ensures that we haven't picked up a changed pte high. We might
	 * have got rubbish values from pte_low and pte_high, but we are
	 * guaranteed that pte_low will not have the present bit set *unless*
	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
	 * we're safe.
	 *
	 * gup_get_pte should not be used or copied outside gup.c without being
	 * very careful -- it does not atomically load the pte or anything that
	 * is likely to be useful for you.
	 */
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
#endif
}
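
/*
 * Illustrative sketch (not part of the original file): the writer-side
 * ordering that the retry loop above pairs with. The helper names are
 * hypothetical; the real stores are done by the PAE pte accessors
 * (native_set_pte() and friends).
 *
 *	static void sketch_set_pte(pte_t *ptep, u32 h, u32 l)
 *	{
 *		ptep->pte_high = h;	// publish the high word first ...
 *		smp_wmb();
 *		ptep->pte_low = l;	// ... the present bit lands last
 *	}
 *
 *	static void sketch_clear_pte(pte_t *ptep)
 *	{
 *		ptep->pte_low = 0;	// clear the present bit first ...
 *		smp_wmb();
 *		ptep->pte_high = 0;	// ... then the high word
 *	}
 *
 * With these orderings, a torn read in gup_get_pte() can only yield a pte
 * whose present bit is clear, and such a pte is rejected by every caller
 * before the pfn is used.
 */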

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t *ptep;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	ptep = pte_offset_map(&pmd, addr);
	do {
		pte_t pte = gup_get_pte(ptep);
		struct page *page;

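		/*
		 * A single compare checks two things at once: every bit
		 * in mask must be set, and _PAGE_SPECIAL must be clear,
		 * since a special pte has no struct page to take a
		 * reference on.
		 */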
		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
			pte_unmap(ptep);
			return 0;
		}
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
		get_page(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);
	pte_unmap(ptep - 1);

	return 1;
}

static inline void get_head_page_multiple(struct page *page, int nr)
{
	VM_BUG_ON(page != compound_head(page));
	VM_BUG_ON(page_count(page) == 0);
	atomic_add(nr, &page->_count);
}

static inline void get_huge_page_tail(struct page *page)
{
	/*
	 * __split_huge_page_refcount() cannot run
	 * from under us.
	 */
	VM_BUG_ON(atomic_read(&page->_count) < 0);
	atomic_inc(&page->_count);
}

static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t pte = *(pte_t *)&pmd;
	struct page *head, *page;
	int refs;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_flags(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		if (PageTail(page))
			get_huge_page_tail(page);
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}
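
/*
 * Worked example for the offset arithmetic above (illustrative, assuming
 * 2MB huge pages, i.e. PMD_SIZE == 1UL << 21, and PAGE_SHIFT == 12):
 * for addr == 0x40123000,
 *
 *	(addr & ~PMD_MASK) >> PAGE_SHIFT == 0x123000 >> 12 == 0x123
 *
 * so the walk starts 0x123 pages into the compound page rather than at
 * its head, and 'refs' counts how many references get_head_page_multiple()
 * must add to the head page in one go.
 */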

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = *pmdp;

		next = pmd_addr_end(addr, end);
		/*
		 * The pmd_trans_splitting() check below explains why
		 * pmdp_splitting_flush has to flush the tlb, to stop
		 * this gup-fast code from running while we set the
		 * splitting bit in the pmd. Returning zero will take
		 * the slow path that will call wait_split_huge_page()
		 * if the pmd is still in splitting state. gup-fast
		 * can't because it has irqs disabled and
		 * wait_split_huge_page() would never return as the
		 * tlb flush IPI wouldn't run.
		 */
		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
			return 0;
		if (unlikely(pmd_large(pmd))) {
			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				return 0;
		}
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t pte = *(pte_t *)&pud;
	struct page *head, *page;
	int refs;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_flags(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&pgd, addr);
	do {
		pud_t pud = *pudp;

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (unlikely(pud_large(pud))) {
			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				return 0;
		}
	} while (pudp++, addr = next, addr != end);

	return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			  struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	unsigned long flags;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;
	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					(void __user *)start, len)))
		return 0;

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_save(flags);
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			break;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	local_irq_restore(flags);

	return nr;
}
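
/*
 * Usage sketch (illustrative, not part of the original file): because
 * __get_user_pages_fast() takes no locks and never sleeps, it can be
 * called with interrupts disabled or from other atomic contexts. A
 * hypothetical caller handles the short-count case itself:
 *
 *	struct page *page;
 *
 *	if (__get_user_pages_fast(addr, 1, 0, &page) == 1) {
 *		// ... access the page ...
 *		put_page(page);
 *	} else {
 *		// ... bail out, or retry from process context ...
 *	}
 */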

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;

	end = start + len;
	if (end < start)
		goto slow_irqon;

#ifdef CONFIG_X86_64
	if (end >> __VIRTUAL_MASK_SHIFT)
		goto slow_irqon;
#endif

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_disable();
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			goto slow;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			goto slow;
	} while (pgdp++, addr = next, addr != end);
	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	{
		int ret;

slow:
		local_irq_enable();
slow_irqon:
		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, start,
			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
		up_read(&mm->mmap_sem);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}
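
/*
 * Usage sketch (illustrative, not part of the original file): pin a user
 * buffer for writing, use the pages, then drop the references taken here.
 * pin_user_buffer() is a hypothetical helper, not a kernel API.
 *
 *	int pin_user_buffer(unsigned long start, int nr_pages,
 *			    struct page **pages)
 *	{
 *		int i, got;
 *
 *		got = get_user_pages_fast(start, nr_pages, 1, pages);
 *		if (got < 0)
 *			return got;	// nothing pinned: -errno
 *
 *		// ... operate on pages[0] .. pages[got - 1] ...
 *
 *		for (i = 0; i < got; i++)
 *			put_page(pages[i]);
 *
 *		return got;	// may be fewer than nr_pages
 *	}
 */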