/*
 *  linux/arch/i386/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 */

#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/efi.h>
#include <linux/memory_hotplug.h>
#include <linux/initrd.h>
#include <linux/cpumask.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>

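/* Amount of kernel virtual address space reserved for vmalloc: 128 MB by default. */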
unsigned int __VMALLOC_RESERVE = 128 << 20;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;

static noinline int do_test_wp_bit(void);

/*
 * Creates a middle page table and puts a pointer to it in the
 * given global directory entry. This only returns the pgd entry
 * in non-PAE compilation mode, since the middle layer is folded.
 */
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
	pud_t *pud;
	pmd_t *pmd_table;

#ifdef CONFIG_X86_PAE
	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
	pud = pud_offset(pgd, 0);
	if (pmd_table != pmd_offset(pud, 0))
		BUG();
#else
	pud = pud_offset(pgd, 0);
	pmd_table = pmd_offset(pud, 0);
#endif

	return pmd_table;
}

/*
 * Create a page table and place a pointer to it in a middle page
 * directory entry.
 */
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
	if (pmd_none(*pmd)) {
		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
		if (page_table != pte_offset_kernel(pmd, 0))
			BUG();

		return page_table;
	}

	return pte_offset_kernel(pmd, 0);
}

/*
 * This function initializes a certain range of kernel virtual memory
 * with new bootmem page tables, wherever page tables are missing in
 * the given range.
 *
 * NOTE: The pagetables are allocated contiguously in physical memory,
 * so we can cache the place of the first one and move around without
 * checking the pgd every time.
 */
static void __init page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int pgd_idx, pmd_idx;
	unsigned long vaddr;

	vaddr = start;
	pgd_idx = pgd_index(vaddr);
	pmd_idx = pmd_index(vaddr);
	pgd = pgd_base + pgd_idx;

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
		if (pgd_none(*pgd))
			one_md_table_init(pgd);
		pud = pud_offset(pgd, vaddr);
		pmd = pmd_offset(pud, vaddr);
		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
			if (pmd_none(*pmd))
				one_page_table_init(pmd);

			vaddr += PMD_SIZE;
		}
		pmd_idx = 0;
	}
}

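/*
 * Returns 1 if the given virtual address lies within the kernel image
 * (from PAGE_OFFSET up to the end of the init sections).
 */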
static inline int is_kernel_text(unsigned long addr)
{
	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
		return 1;
	return 0;
}

/*
 * This maps the physical memory to kernel virtual address space, a total
 * of max_low_pfn pages, by creating page tables starting from address
 * PAGE_OFFSET.
 */
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
	unsigned long pfn;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int pgd_idx, pmd_idx, pte_ofs;

	pgd_idx = pgd_index(PAGE_OFFSET);
	pgd = pgd_base + pgd_idx;
	pfn = 0;

	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);
		if (pfn >= max_low_pfn)
			continue;
		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;

			/* Map with big pages if possible, otherwise create normal page tables. */
			if (cpu_has_pse) {
				unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;

				if (is_kernel_text(address) || is_kernel_text(address2))
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
				else
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
				pfn += PTRS_PER_PTE;
			} else {
				pte = one_page_table_init(pmd);

				/* Advance address along with pfn so the text check sees each page. */
				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
					if (is_kernel_text(address))
						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
					else
						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
				}
			}
		}
	}
}

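/*
 * Pages in the physical range 0x70000000-0x7003FFFF can trigger the
 * Pentium Pro RAM bug checked for by ppro_with_ram_bug(); such pages
 * must be kept reserved on affected processors.
 */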
static inline int page_kills_ppro(unsigned long pagenr)
{
	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
		return 1;
	return 0;
}

extern int is_available_memory(efi_memory_desc_t *);

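/*
 * Returns 1 if the given page frame number is usable RAM, according to
 * the EFI memory map when booted via EFI, or the e820 map otherwise.
 */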
int page_is_ram(unsigned long pagenr)
{
	int i;
	unsigned long addr, end;

	if (efi_enabled) {
		efi_memory_desc_t *md;
		void *p;

		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
			md = p;
			if (!is_available_memory(md))
				continue;
			addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
			end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;

			if ((pagenr >= addr) && (pagenr < end))
				return 1;
		}
		return 0;
	}

	for (i = 0; i < e820.nr_map; i++) {
		if (e820.map[i].type != E820_RAM)	/* not usable memory */
			continue;
		/*
		 * !!!FIXME!!! Some BIOSen report areas as RAM that
		 * are not. Notably the 640k->1MB area. We need a sanity
		 * check here.
		 */
		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
		if ((pagenr >= addr) && (pagenr < end))
			return 1;
	}
	return 0;
}

#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;

#define kmap_get_fixmap_pte(vaddr)					\
	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))

static void __init kmap_init(void)
{
	unsigned long kmap_vstart;

	/* cache the first kmap pte */
	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);

	kmap_prot = PAGE_KERNEL;
}

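/*
 * Build the page tables for the persistent kmap area starting at
 * PKMAP_BASE and cache a pointer to its pte page in pkmap_page_table.
 */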
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long vaddr;

	vaddr = PKMAP_BASE;
	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

	pgd = swapper_pg_dir + pgd_index(vaddr);
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	pte = pte_offset_kernel(pmd, vaddr);
	pkmap_page_table = pte;
}

static void __meminit free_new_highpage(struct page *page)
{
	init_page_count(page);
	__free_page(page);
	totalhigh_pages++;
}

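/*
 * Give a boot-time highmem page to the allocator, unless it is not RAM
 * or would trip the Pentium Pro RAM bug; in that case keep it reserved.
 */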
void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
		ClearPageReserved(page);
		free_new_highpage(page);
	} else
		SetPageReserved(page);
}

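/*
 * Give a hot-added highmem page to the allocator and update the
 * global page accounting.
 */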
static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
{
	free_new_highpage(page);
	totalram_pages++;
#ifdef CONFIG_FLATMEM
	max_mapnr = max(pfn, max_mapnr);
#endif
	num_physpages++;
	return 0;
}

/*
 * Not currently handling the NUMA case.
 * Assume a single node, and that all memory added
 * dynamically and onlined here goes into HIGHMEM.
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	add_one_highpage_hotplug(page, page_to_pfn(page));
}

#ifdef CONFIG_NUMA
extern void set_highmem_pages_init(int);
#else
static void __init set_highmem_pages_init(int bad_ppro)
{
	int pfn;
	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
		add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
	totalram_pages += totalhigh_pages;
}
#endif /* CONFIG_NUMA */

#else
#define kmap_init() do { } while (0)
#define permanent_kmaps_init(pgd_base) do { } while (0)
#define set_highmem_pages_init(bad_ppro) do { } while (0)
#endif /* CONFIG_HIGHMEM */

unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
EXPORT_SYMBOL(__PAGE_KERNEL);
unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;

#ifdef CONFIG_NUMA
extern void __init remap_numa_kva(void);
#else
#define remap_numa_kva() do {} while (0)
#endif

static void __init pagetable_init(void)
{
	unsigned long vaddr;
	pgd_t *pgd_base = swapper_pg_dir;

#ifdef CONFIG_X86_PAE
	int i;
	/* Init entries of the first-level page table to the zero page */
	for (i = 0; i < PTRS_PER_PGD; i++)
		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
#endif

	/* Enable PSE if available */
	if (cpu_has_pse)
		set_in_cr4(X86_CR4_PSE);

	/* Enable PGE if available */
	if (cpu_has_pge) {
		set_in_cr4(X86_CR4_PGE);
		__PAGE_KERNEL |= _PAGE_GLOBAL;
		__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
	}

	kernel_physical_mapping_init(pgd_base);
	remap_numa_kva();

	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	page_table_range_init(vaddr, 0, pgd_base);

	permanent_kmaps_init(pgd_base);

#ifdef CONFIG_X86_PAE
	/*
	 * Add low memory identity-mappings - SMP needs it when
	 * starting up on an AP from real-mode. In the non-PAE
	 * case we already have these mappings through head.S.
	 * All user-space mappings are explicitly cleared after
	 * SMP startup.
	 */
	set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
#endif
}

#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
/*
 * Swap suspend & friends need this for resume because things like the
 * intel-agp driver might have split up a kernel 4MB mapping.
 */
char __nosavedata swsusp_pg_dir[PAGE_SIZE]
	__attribute__ ((aligned (PAGE_SIZE)));

static inline void save_pg_dir(void)
{
	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
}
#else
static inline void save_pg_dir(void)
{
}
#endif

void zap_low_mappings(void)
{
	int i;

	save_pg_dir();

	/*
	 * Zap initial low-memory mappings.
	 *
	 * Note that "pgd_clear()" doesn't do it for
	 * us, because pgd_clear() is a no-op on i386.
	 */
	for (i = 0; i < USER_PTRS_PER_PGD; i++)
#ifdef CONFIG_X86_PAE
		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
		set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
	flush_tlb_all();
}

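/*
 * "noexec" handling: disable_nx records a "noexec=off" request, and
 * __supported_pte_mask is the mask of pte bits the CPU supports;
 * _PAGE_NX is added to it once NX support is detected and enabled.
 */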
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;

/*
 * noexec = on|off
 *
 * Control non-executable mappings.
 *
 * on      Enable
 * off     Disable
 */
void __init noexec_setup(const char *str)
{
	if (!strncmp(str, "on", 2) && cpu_has_nx) {
		__supported_pte_mask |= _PAGE_NX;
		disable_nx = 0;
	} else if (!strncmp(str, "off", 3)) {
		disable_nx = 1;
		__supported_pte_mask &= ~_PAGE_NX;
	}
}

int nx_enabled = 0;
#ifdef CONFIG_X86_PAE

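/*
 * Detect NX (no-execute) support via the CPUID 0x80000001 extended
 * feature flags (EDX bit 20) and, unless "noexec=off" was given, turn
 * it on in the EFER MSR.
 */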
static void __init set_nx(void)
{
	unsigned int v[4], l, h;

	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
		if ((v[3] & (1 << 20)) && !disable_nx) {
			rdmsr(MSR_EFER, l, h);
			l |= EFER_NX;
			wrmsr(MSR_EFER, l, h);
			nx_enabled = 1;
			__supported_pte_mask |= _PAGE_NX;
		}
	}
}

/*
 * Enables/disables executability of a given kernel page and
 * returns the previous setting.
 */
int __init set_kernel_exec(unsigned long vaddr, int enable)
{
	pte_t *pte;
	int ret = 1;

	if (!nx_enabled)
		goto out;

	pte = lookup_address(vaddr);
	BUG_ON(!pte);

	if (!pte_exec_kernel(*pte))
		ret = 0;

	if (enable)
		pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
	else
		pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
	__flush_tlb_all();
out:
	return ret;
}

#endif

/*
 * paging_init() sets up the page tables - note that the first 8MB are
 * already mapped by head.S.
 *
 * This routine also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 */
void __init paging_init(void)
{
#ifdef CONFIG_X86_PAE
	set_nx();
	if (nx_enabled)
		printk("NX (Execute Disable) protection: active\n");
#endif

	pagetable_init();

	load_cr3(swapper_pg_dir);

#ifdef CONFIG_X86_PAE
	/*
	 * We will bail out later - printk doesn't work right now so
	 * the user would just see a hanging kernel.
	 */
	if (cpu_has_pae)
		set_in_cr4(X86_CR4_PAE);
#endif
	__flush_tlb_all();

	kmap_init();
}

/*
 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
 * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
 * used to involve black magic jumps to work around some nasty CPU bugs,
 * but fortunately the switch to using exceptions got rid of all that.
 */
static void __init test_wp_bit(void)
{
	printk("Checking if this processor honours the WP bit even in supervisor mode... ");

	/* Any page-aligned address will do, the test is non-destructive */
	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
	boot_cpu_data.wp_works_ok = do_test_wp_bit();
	clear_fixmap(FIX_WP_TEST);

	if (!boot_cpu_data.wp_works_ok) {
		printk("No.\n");
#ifdef CONFIG_X86_WP_WORKS_OK
		panic("This kernel doesn't support CPUs with broken WP. Recompile it for a 386!");
#endif
	} else {
		printk("Ok.\n");
	}
}

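/*
 * Derive num_physpages (and, on flat memory models, max_mapnr) from
 * the highest page frame number that is actually populated.
 */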
static void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_HIGHMEM
	num_physpages = highend_pfn;
#else
	num_physpages = max_low_pfn;
#endif
#ifdef CONFIG_FLATMEM
	max_mapnr = num_physpages;
#endif
}

static struct kcore_list kcore_mem, kcore_vmalloc;

void __init mem_init(void)
{
	extern int ppro_with_ram_bug(void);
	int codesize, reservedpages, datasize, initsize;
	int tmp;
	int bad_ppro;

#ifdef CONFIG_FLATMEM
	if (!mem_map)
		BUG();
#endif

	bad_ppro = ppro_with_ram_bug();

#ifdef CONFIG_HIGHMEM
	/* check that fixmap and pkmap do not overlap */
	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
		printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
		printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
				PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
		BUG();
	}
#endif

	set_max_mapnr_init();

#ifdef CONFIG_HIGHMEM
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif

	/* this will put all low memory onto the freelists */
	totalram_pages += free_all_bootmem();

	reservedpages = 0;
	for (tmp = 0; tmp < max_low_pfn; tmp++)
		/*
		 * Only count reserved RAM pages
		 */
		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
			reservedpages++;

	set_highmem_pages_init(bad_ppro);

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10,
		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
	       );

#ifdef CONFIG_X86_PAE
	if (!cpu_has_pae)
		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();

	/*
	 * Subtle. SMP does its boot stuff late (because it has to
	 * fork idle threads) - but it also needs low mappings for the
	 * protected-mode entry to work. We zap these entries only after
	 * the WP-bit has been tested.
	 */
#ifndef CONFIG_SMP
	zap_low_mappings();
#endif
}

/*
 * This is for the non-NUMA, single-node SMP system case.
 * Specifically, in the case of x86, we will always add
 * memory to highmem for now.
 */
#ifdef CONFIG_MEMORY_HOTPLUG
#ifndef CONFIG_NEED_MULTIPLE_NODES
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdata = &contig_page_data;
	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(zone, start_pfn, nr_pages);
}

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
#endif
#endif

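/*
 * Slab caches for pgd and pmd pages; the pmd cache is only needed when
 * PAE is enabled (PTRS_PER_PMD > 1).
 */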
kmem_cache_t *pgd_cache;
kmem_cache_t *pmd_cache;

void __init pgtable_cache_init(void)
{
	if (PTRS_PER_PMD > 1) {
		pmd_cache = kmem_cache_create("pmd",
					PTRS_PER_PMD*sizeof(pmd_t),
					PTRS_PER_PMD*sizeof(pmd_t),
					0,
					pmd_ctor,
					NULL);
		if (!pmd_cache)
			panic("pgtable_cache_init(): cannot create pmd cache");
	}
	pgd_cache = kmem_cache_create("pgd",
				PTRS_PER_PGD*sizeof(pgd_t),
				PTRS_PER_PGD*sizeof(pgd_t),
				0,
				pgd_ctor,
				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
	if (!pgd_cache)
		panic("pgtable_cache_init(): cannot create pgd cache");
}

/*
 * This function cannot be __init, since exceptions don't work in that
 * section.  Put this after the callers, so that it cannot be inlined.
 */
static noinline int do_test_wp_bit(void)
{
	char tmp_reg;
	int flag;

	__asm__ __volatile__(
		"	movb %0,%1	\n"
		"1:	movb %1,%0	\n"
		"	xorl %2,%2	\n"
		"2:			\n"
		".section __ex_table,\"a\"\n"
		"	.align 4	\n"
		"	.long 1b,2b	\n"
		".previous		\n"
		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
		 "=q" (tmp_reg),
		 "=r" (flag)
		:"2" (1)
		:"memory");

	return flag;
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long addr = (unsigned long)__start_rodata;

	for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
		change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);

	printk("Write protecting the kernel read-only data: %uk\n",
			(__end_rodata - __start_rodata) >> 10);

	/*
	 * change_page_attr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif

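/*
 * Free the pages in [begin, end), filling them with a poison pattern
 * first so that stale references to freed init memory stand out.
 */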
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif