/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

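/* Convert a count of pages into a kilobyte value for reporting. */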
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 */
void show_mem(void)
{
	struct zone *zone;

	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
	       " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	       " pagecache:%lu swap:%lu\n",
	       (global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE)),
	       (global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE)),
	       global_page_state(NR_FILE_DIRTY),
	       global_page_state(NR_WRITEBACK),
	       global_page_state(NR_UNSTABLE_NFS),
	       global_page_state(NR_FREE_PAGES),
	       (global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE)),
	       global_page_state(NR_FILE_MAPPED),
	       global_page_state(NR_PAGETABLE),
	       global_page_state(NR_BOUNCE),
	       global_page_state(NR_FILE_PAGES),
	       nr_swap_pages);

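	/*
	 * For each populated zone, total up the buddy free lists and
	 * report the free memory plus the largest free block available.
	 */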
	for_each_zone(zone) {
		unsigned long flags, order, total = 0, largest_order = -1;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			int nr = zone->free_area[order].nr_free;
			total += nr << order;
			if (nr)
				largest_order = order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		pr_err("Node %d %7s: %lukB (largest %luKb)\n",
		       zone_to_nid(zone), zone->name,
		       K(total), largest_order ? K(1UL) << largest_order : 0);
	}
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * This appears conservative since it is only called
	 * from __set_fixmap.
	 */
	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
}

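/* Map the given fixmap slot to the specified physical address. */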
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}

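/*
 * With highmem page tables (CONFIG_HIGHPTE), the page holding an L2
 * page table must be kmap'ed before its PTEs can be touched.  The L2
 * table may begin at a nonzero offset within that page, given by the
 * low bits of the pmd's page-table frame number.
 */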
#if defined(CONFIG_HIGHPTE)
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
{
	pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
		((pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK);
	return &pte[pte_index(address)];
}
#endif

/*
 * List of all pgd's, needed so that pageattr.c can invalidate entries
 * in both cached and uncached pgd's. This is essentially codepath-based
 * locking against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

static inline void pgd_list_add(pgd_t *pgd)
{
	list_add(pgd_to_list(pgd), &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	list_del(pgd_to_list(pgd));
}

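/* Kernel mappings occupy the pgd slots from pgd_index(PAGE_OFFSET) upward. */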
#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

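/*
 * Initialize a new pgd: zero the user portion, copy the kernel mappings
 * from swapper_pg_dir, and link the pgd onto pgd_list under pgd_lock.
 */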
static void pgd_ctor(pgd_t *pgd)
{
	unsigned long flags;

	memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
	spin_lock_irqsave(&pgd_lock, flags);

#ifndef __tilegx__
	/*
	 * Check that the user interrupt vector has no L2.
	 * It never should for the swapper, and new page tables
	 * should always start with an empty user interrupt vector.
	 */
	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
			swapper_pg_dir + KERNEL_PGD_INDEX_START,
			KERNEL_PGD_PTRS);

	pgd_list_add(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

static void pgd_dtor(pgd_t *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

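/* Allocate and free top-level page directories from the pgd_cache slab. */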
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	if (pgd)
		pgd_ctor(pgd);
	return pgd;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_dtor(pgd);
	kmem_cache_free(pgd_cache, pgd);
}


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)

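/*
 * On tile, a user L2 page table spans 2^L2_USER_PGTABLE_ORDER pages,
 * so PTE pages are allocated and freed at that order rather than one
 * page at a time.
 */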
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
	struct page *p;

#ifdef CONFIG_HIGHPTE
	flags |= __GFP_HIGHMEM;
#endif

	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
	if (p == NULL)
		return NULL;

	pgtable_page_ctor(p);
	return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
	pgtable_page_dtor(p);
	__free_pages(p, L2_USER_PGTABLE_ORDER);
}

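/*
 * Release an L2 page table via the mmu_gather: in fast mode the pages
 * are freed immediately, otherwise they are batched into tlb->pages
 * and the batch is flushed early if it fills up.
 */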
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
		    unsigned long address)
{
	int i;

	pgtable_page_dtor(pte);
	tlb->need_flush = 1;
	if (tlb_fast_mode(tlb)) {
		struct page *pte_pages[L2_USER_PGTABLE_PAGES];
		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
			pte_pages[i] = pte + i;
		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
		return;
	}
	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
		tlb->pages[tlb->nr++] = pte + i;
		if (tlb->nr >= FREE_PTE_NR)
			tlb_flush_mmu(tlb, 0, 0);
	}
}

#ifndef __tilegx__

/*
 * FIXME: needs to be atomic vs hypervisor writes.  For now we make the
 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
	u8 *tmp = (u8 *)ptep;
	u8 second_byte = tmp[1];
	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
		return 0;
	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	return 1;
}

/*
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
	u32 *tmp = (u32 *)ptep;
	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

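/*
 * Walk the page tables of "mm" (or the kernel tables if mm is NULL)
 * and return a pointer to the PTE mapping "addr", or NULL if there is
 * no valid mapping.  For a huge page the pmd entry itself is returned,
 * cast to a pte pointer.
 */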
pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_addr_invalid(addr))
		return NULL;

	pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_huge_page(*pmd))
		return (pte_t *)pmd;
	if (!pmd_present(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}

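/*
 * Point a remote-L3-cached pgprot at a particular cpu by encoding that
 * cpu's (x, y) mesh coordinates as the PTE's LOTAR value.
 */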
pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
	unsigned int width = smp_width;
	int x = cpu % width;
	int y = cpu / width;
	BUG_ON(y >= smp_height);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	BUG_ON(cpu < 0 || cpu >= NR_CPUS);
	BUG_ON(!cpu_is_valid_lotar(cpu));
	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
}

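/* Inverse of set_remote_cache_cpu(): recover the cpu from the PTE's LOTAR. */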
int get_remote_cache_cpu(pgprot_t prot)
{
	HV_LOTAR lotar = hv_pte_get_lotar(prot);
	int x = HV_LOTAR_X(lotar);
	int y = HV_LOTAR_Y(lotar);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	return x + y * smp_width;
}

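/*
 * Install a PTE, first adjusting its home to match the backing page,
 * and (on 32-bit tile) writing the two words in an order that never
 * exposes a half-formed "present" entry.
 */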
void set_pte_order(pte_t *ptep, pte_t pte, int order)
{
	unsigned long pfn = pte_pfn(pte);
	struct page *page = pfn_to_page(pfn);

	/* Update the home of a PTE if necessary */
	pte = pte_set_home(pte, page_home(page));

#ifdef __tilegx__
	*ptep = pte;
#else
	/*
	 * When setting a PTE, write the high bits first, then write
	 * the low bits.  This sets the "present" bit only after the
	 * other bits are in place.  If a particular PTE update
	 * involves transitioning from one valid PTE to another, it
	 * may be necessary to call set_pte_order() more than once,
	 * transitioning via a suitable intermediate state.
	 * Note that this sequence also means that if we are transitioning
	 * from any migrating PTE to a non-migrating one, we will not
	 * see a half-updated PTE with the migrating bit off.
	 */
#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
# error Must write the present and migrating bits last
#endif
	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	barrier();
	((u32 *)ptep)[0] = (u32)(pte_val(pte));
#endif
}

/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
	return mm->context.priority_cached;
}

/*
 * Add a priority mapping to an mm_context and
 * notify the hypervisor if this is the first one.
 */
void start_mm_caching(struct mm_struct *mm)
{
	if (!mm_is_priority_cached(mm)) {
		mm->context.priority_cached = -1U;
		hv_set_caching(-1U);
	}
}

/*
 * Validate and return the priority_cached flag.  We know if it's zero
 * that we don't need to scan, since we immediately set it non-zero
 * when we first consider a MAP_CACHE_PRIORITY mapping.
 *
 * We only _try_ to acquire the mmap_sem semaphore; if we can't acquire
 * it, we don't worry about it and don't clear the "priority_cached"
 * field, since we're in an interrupt context (servicing switch_mm).
 * Presumably we'll come back later and have more luck and clear the
 * value then; for now we just keep the cache marked for priority.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
		struct vm_area_struct *vm;
		for (vm = mm->mmap; vm; vm = vm->vm_next) {
			if (hv_pte_get_cached_priority(vm->vm_page_prot))
				break;
		}
		if (vm == NULL)
			mm->context.priority_cached = 0;
		up_write(&mm->mmap_sem);
	}
	return mm->context.priority_cached;
}

/* Set caching correctly for an mm that we are switching to. */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (!mm_is_priority_cached(next)) {
		/*
		 * If the new mm doesn't use priority caching, just see if we
		 * need the hv_set_caching(), or can assume it's already zero.
		 */
		if (mm_is_priority_cached(prev))
			hv_set_caching(0);
	} else {
		hv_set_caching(update_priority_cached(next));
	}
}

#if CHIP_HAS_MMIO()

/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
			   pgprot_t home)
{
	void *addr;
	struct vm_struct *area;
	unsigned long offset, last_addr;
	pgprot_t pgprot;

	/* Don't allow wraparound or zero size */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr)
		return NULL;

	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
	pgprot = PAGE_KERNEL;
	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

	/*
	 * Mappings have to be page-aligned
	 */
	offset = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr+1) - phys_addr;

	/*
	 * Ok, go for it..
	 */
	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
	if (!area)
		return NULL;
	area->phys_addr = phys_addr;
	addr = area->addr;
	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
			       phys_addr, pgprot)) {
		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
		return NULL;
	}
	return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);

/* Map a PCI MMIO bus address into VA space. */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
	panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
	volatile void __iomem *addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr_in);
#if 1
	vunmap((void * __force)addr);
#else
	/* x86 uses this complicated flow instead of vunmap().  Is
	 * there any particular reason we should do the same? */
	struct vm_struct *p, *o;

	/* Use the vm area unlocked, assuming the caller
	   ensures there isn't another iounmap for the same address
	   in parallel. Reuse of the virtual address is prevented by
	   leaving it in the global lists until we're done with it.
	   cpa takes care of the direct mappings. */
	read_lock(&vmlist_lock);
	for (p = vmlist; p; p = p->next) {
		if (p->addr == addr)
			break;
	}
	read_unlock(&vmlist_lock);

	if (!p) {
		pr_err("iounmap: bad address %p\n", addr);
		dump_stack();
		return;
	}

	/* Finally remove it */
	o = remove_vm_area((void *)addr);
	BUG_ON(p != o || o == NULL);
	kfree(p);
#endif
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */