/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/node.h>
#include <linux/cpu.h>
#include <linux/ioport.h>
#include <linux/kexec.h>
#include <linux/pci.h>
#include <linux/initrd.h>
#include <linux/io.h>
#include <linux/highmem.h>
#include <linux/smp.h>
#include <linux/timex.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <hv/hypervisor.h>
#include <arch/interrupts.h>

/* <linux/smp.h> doesn't provide this definition. */
#ifndef CONFIG_SMP
#define setup_max_cpus 1
#endif

static inline int ABS(int x) { return x >= 0 ? x : -x; }

/* Chip information */
char chip_model[64] __write_once;

struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/* We only create bootmem data on node 0. */
static bootmem_data_t __initdata node0_bdata;

/* Information on the NUMA nodes that we compute early */
unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES];
unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES];
unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
unsigned long __initdata node_free_pfn[MAX_NUMNODES];

#ifdef CONFIG_HIGHMEM
/* Page frame index of end of lowmem on each controller. */
unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];

/* Number of pages that can be mapped into lowmem. */
static unsigned long __initdata mappable_physpages;
#endif

/* Data on which physical memory controller corresponds to which NUMA node */
int node_controller[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = -1 };

#ifdef CONFIG_HIGHMEM
/* Map information from VAs to PAs */
unsigned long pbase_map[1 << (32 - HPAGE_SHIFT)]
	__write_once __attribute__((aligned(L2_CACHE_BYTES)));
EXPORT_SYMBOL(pbase_map);

/* Map information from PAs to VAs */
void *vbase_map[NR_PA_HIGHBIT_VALUES]
	__write_once __attribute__((aligned(L2_CACHE_BYTES)));
EXPORT_SYMBOL(vbase_map);
#endif

/* Node number as a function of the high PA bits */
int highbits_to_node[NR_PA_HIGHBIT_VALUES] __write_once;
EXPORT_SYMBOL(highbits_to_node);

static unsigned int __initdata maxmem_pfn = -1U;
static unsigned int __initdata maxnodemem_pfn[MAX_NUMNODES] = {
	[0 ... MAX_NUMNODES-1] = -1U
};
static nodemask_t __initdata isolnodes;

#ifdef CONFIG_PCI
enum { DEFAULT_PCI_RESERVE_MB = 64 };
static unsigned int __initdata pci_reserve_mb = DEFAULT_PCI_RESERVE_MB;
unsigned long __initdata pci_reserve_start_pfn = -1U;
unsigned long __initdata pci_reserve_end_pfn = -1U;
#endif

static int __init setup_maxmem(char *str)
{
	long maxmem_mb;
	if (str == NULL || strict_strtol(str, 0, &maxmem_mb) != 0 ||
	    maxmem_mb == 0)
		return -EINVAL;

	maxmem_pfn = (maxmem_mb >> (HPAGE_SHIFT - 20)) <<
		(HPAGE_SHIFT - PAGE_SHIFT);
	printk("Forcing RAM used to no more than %dMB\n",
	       maxmem_pfn >> (20 - PAGE_SHIFT));
	return 0;
}
early_param("maxmem", setup_maxmem);
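
/*
 * Worked example (illustrative sizes, not a statement about any
 * particular chip): with 16MB huge pages (HPAGE_SHIFT == 24) and 4KB
 * pages (PAGE_SHIFT == 12), "maxmem=100" gives 100 >> 4 == 6 huge
 * pages, i.e. maxmem_pfn == 6 << 12 == 24576 small-page PFNs, and the
 * printk above reports 96MB: the limit is rounded down to a whole
 * number of huge pages.
 */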

static int __init setup_maxnodemem(char *str)
{
	char *endp;
	long maxnodemem_mb, node;

	node = str ? simple_strtoul(str, &endp, 0) : INT_MAX;
	if (node >= MAX_NUMNODES || *endp != ':' ||
	    strict_strtol(endp+1, 0, &maxnodemem_mb) != 0)
		return -EINVAL;

	maxnodemem_pfn[node] = (maxnodemem_mb >> (HPAGE_SHIFT - 20)) <<
		(HPAGE_SHIFT - PAGE_SHIFT);
	printk("Forcing RAM used on node %ld to no more than %dMB\n",
	       node, maxnodemem_pfn[node] >> (20 - PAGE_SHIFT));
	return 0;
}
early_param("maxnodemem", setup_maxnodemem);
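
/*
 * The argument format is "<node>:<MB>", e.g. "maxnodemem=1:1024" to
 * cap node 1 at 1 GB; a size of 0 disables the node entirely (see
 * the maxnodemem_pfn[] handling in setup_memory() below).
 */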

static int __init setup_isolnodes(char *str)
{
	char buf[MAX_NUMNODES * 5];
	if (str == NULL || nodelist_parse(str, isolnodes) != 0)
		return -EINVAL;

	nodelist_scnprintf(buf, sizeof(buf), isolnodes);
	printk("Set isolnodes value to '%s'\n", buf);
	return 0;
}
early_param("isolnodes", setup_isolnodes);
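
/*
 * The argument is a standard nodelist, e.g. "isolnodes=1" or
 * "isolnodes=1-3".  Isolated nodes contribute no lowmem and are left
 * out of the default NUMA assignments (see setup_pa_va_mapping() and
 * setup_numa_mapping() below).
 */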

#ifdef CONFIG_PCI
static int __init setup_pci_reserve(char *str)
{
	unsigned long mb;

	if (str == NULL || strict_strtoul(str, 0, &mb) != 0 ||
	    mb > 3 * 1024)
		return -EINVAL;

	pci_reserve_mb = mb;
	printk("Reserving %dMB for PCIE root complex mappings\n",
	       pci_reserve_mb);
	return 0;
}
early_param("pci_reserve", setup_pci_reserve);
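
/*
 * E.g. "pci_reserve=128" reserves 128MB just below 4GB for PCIE
 * mappings instead of the default 64MB; requests above 3072MB are
 * rejected.
 */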
#endif

#ifndef __tilegx__
/*
 * vmalloc=size forces the vmalloc area to be exactly 'size' bytes.
 * This can be used to increase (or decrease) the vmalloc area.
 */
static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;

	VMALLOC_RESERVE = (memparse(arg, &arg) + PGDIR_SIZE - 1) & PGDIR_MASK;

	/* See validate_va() for more on this test. */
	if ((long)_VMALLOC_START >= 0)
		early_panic("\"vmalloc=%#lx\" value too large: maximum %#lx\n",
			    VMALLOC_RESERVE, _VMALLOC_END - 0x80000000UL);

	return 0;
}
early_param("vmalloc", parse_vmalloc);
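
/*
 * memparse() accepts the usual size suffixes, so e.g. "vmalloc=512M"
 * requests a 512MB vmalloc area; the value is rounded up to a
 * multiple of PGDIR_SIZE.
 */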
#endif

#ifdef CONFIG_HIGHMEM
/*
 * Determine for each controller where its lowmem is mapped and how
 * much of it is mapped there.  On controller zero, the first few
 * megabytes are mapped at 0xfd000000 as code, so in principle we
 * could start our data mappings higher up, but for now we don't
 * bother, to avoid additional confusion.
 *
 * One question is whether, on systems with more than 768 MB and
 * controllers of different sizes, to map in a proportionate amount of
 * each one, or to try to map the same amount from each controller.
 * (E.g. if we have three controllers with 256MB, 1GB, and 256MB
 * respectively, do we map 256MB from each, or do we map 128MB, 512MB,
 * and 128MB respectively?)  For now we use a proportionate solution
 * like the latter.
 *
 * The VA/PA mapping demands that we align our decisions at 16 MB
 * boundaries so that we can rapidly convert VA to PA.
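 *
 * Given that alignment, a lowmem VA then converts to a PA roughly as
 *   pa = ((u64)pbase_map[va >> HPAGE_SHIFT] << PAGE_SHIFT) +
 *        (va & (HPAGE_SIZE - 1)),
 * since pbase_map[] records the first PFN of each mapped huge page.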
 */
static void *__init setup_pa_va_mapping(void)
{
	unsigned long curr_pages = 0;
	unsigned long vaddr = PAGE_OFFSET;
	nodemask_t highonlynodes = isolnodes;
	int i, j;

	memset(pbase_map, -1, sizeof(pbase_map));
	memset(vbase_map, -1, sizeof(vbase_map));

	/* Node zero cannot be isolated for LOWMEM purposes. */
	node_clear(0, highonlynodes);

	/* Count up the number of pages on non-highonlynodes controllers. */
	mappable_physpages = 0;
	for_each_online_node(i) {
		if (!node_isset(i, highonlynodes))
			mappable_physpages +=
				node_end_pfn[i] - node_start_pfn[i];
	}

	for_each_online_node(i) {
		unsigned long start = node_start_pfn[i];
		unsigned long end = node_end_pfn[i];
		unsigned long size = end - start;
		unsigned long vaddr_end;

		if (node_isset(i, highonlynodes)) {
			/* Mark this controller as having no lowmem. */
			node_lowmem_end_pfn[i] = start;
			continue;
		}

		curr_pages += size;
		if (mappable_physpages > MAXMEM_PFN) {
			vaddr_end = PAGE_OFFSET +
				(((u64)curr_pages * MAXMEM_PFN /
				  mappable_physpages)
				 << PAGE_SHIFT);
		} else {
			vaddr_end = PAGE_OFFSET + (curr_pages << PAGE_SHIFT);
		}
		for (j = 0; vaddr < vaddr_end; vaddr += HPAGE_SIZE, ++j) {
			unsigned long this_pfn =
				start + (j << HUGETLB_PAGE_ORDER);
			pbase_map[vaddr >> HPAGE_SHIFT] = this_pfn;
			if (vbase_map[__pfn_to_highbits(this_pfn)] ==
			    (void *)-1)
				vbase_map[__pfn_to_highbits(this_pfn)] =
					(void *)(vaddr & HPAGE_MASK);
		}
		node_lowmem_end_pfn[i] = start + (j << HUGETLB_PAGE_ORDER);
		BUG_ON(node_lowmem_end_pfn[i] > end);
	}

	/* Return highest address of any mapped memory. */
	return (void *)vaddr;
}
#endif /* CONFIG_HIGHMEM */

/*
 * Register our most important memory mappings with the debug stub.
 *
 * This is up to 4 mappings for lowmem, one mapping per memory
 * controller, plus one for our text segment.
 */
void __cpuinit store_permanent_mappings(void)
{
	int i;

	for_each_online_node(i) {
		HV_PhysAddr pa = ((HV_PhysAddr)node_start_pfn[i]) << PAGE_SHIFT;
#ifdef CONFIG_HIGHMEM
		HV_PhysAddr high_mapped_pa = node_lowmem_end_pfn[i];
#else
		HV_PhysAddr high_mapped_pa = node_end_pfn[i];
#endif

		unsigned long pages = high_mapped_pa - node_start_pfn[i];
		HV_VirtAddr addr = (HV_VirtAddr) __va(pa);
		hv_store_mapping(addr, pages << PAGE_SHIFT, pa);
	}

	hv_store_mapping((HV_VirtAddr)_stext,
			 (uint32_t)(_einittext - _stext), 0);
}

/*
 * Use hv_inquire_physical() to populate node_{start,end}_pfn[]
 * and node_online_map, doing suitable sanity-checking.
 * Also set min_low_pfn, max_low_pfn, and max_pfn.
 */
static void __init setup_memory(void)
{
	int i, j;
	int highbits_seen[NR_PA_HIGHBIT_VALUES] = { 0 };
#ifdef CONFIG_HIGHMEM
	long highmem_pages;
#endif
#ifndef __tilegx__
	int cap;
#endif
#if defined(CONFIG_HIGHMEM) || defined(__tilegx__)
	long lowmem_pages;
#endif

	/* We are using a char to hold the cpu_2_node[] mapping */
	BUG_ON(MAX_NUMNODES > 127);

	/* Discover the ranges of memory available to us */
	for (i = 0; ; ++i) {
		unsigned long start, size, end, highbits;
		HV_PhysAddrRange range = hv_inquire_physical(i);
		if (range.size == 0)
			break;
#ifdef CONFIG_FLATMEM
		if (i > 0) {
			printk("Can't use discontiguous PAs: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
#endif
#ifndef __tilegx__
		if ((unsigned long)range.start) {
			printk("Range not at 4GB multiple: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
#endif
		if ((range.start & (HPAGE_SIZE-1)) != 0 ||
		    (range.size & (HPAGE_SIZE-1)) != 0) {
			unsigned long long start_pa = range.start;
			unsigned long long size = range.size;
			range.start = (start_pa + HPAGE_SIZE - 1) & HPAGE_MASK;
			range.size -= (range.start - start_pa);
			range.size &= HPAGE_MASK;
			printk("Range not hugepage-aligned: %#llx..%#llx:"
			       " now %#llx-%#llx\n",
			       start_pa, start_pa + size,
			       range.start, range.start + range.size);
		}
		highbits = __pa_to_highbits(range.start);
		if (highbits >= NR_PA_HIGHBIT_VALUES) {
			printk("PA high bits too high: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
		if (highbits_seen[highbits]) {
			printk("Range overlaps in high bits: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
		highbits_seen[highbits] = 1;
		/* Check the node index before it is used to index arrays. */
		if (i >= MAX_NUMNODES) {
			printk("Too many PA nodes (#%d): %#llx...%#llx\n",
			       i, range.start, range.start + range.size);
			continue;
		}
		if (PFN_DOWN(range.size) > maxnodemem_pfn[i]) {
			int size = maxnodemem_pfn[i];
			if (size > 0) {
				printk("Maxnodemem reduced node %d to"
				       " %d pages\n", i, size);
				range.size = (HV_PhysAddr)size << PAGE_SHIFT;
			} else {
				printk("Maxnodemem disabled node %d\n", i);
				continue;
			}
		}
		if (num_physpages + PFN_DOWN(range.size) > maxmem_pfn) {
			int size = maxmem_pfn - num_physpages;
			if (size > 0) {
				printk("Maxmem reduced node %d to %d pages\n",
				       i, size);
				range.size = (HV_PhysAddr)size << PAGE_SHIFT;
			} else {
				printk("Maxmem disabled node %d\n", i);
				continue;
			}
		}

		start = range.start >> PAGE_SHIFT;
		size = range.size >> PAGE_SHIFT;
		end = start + size;

#ifndef __tilegx__
		if (((HV_PhysAddr)end << PAGE_SHIFT) !=
		    (range.start + range.size)) {
			printk("PAs too high to represent: %#llx..%#llx\n",
			       range.start, range.start + range.size);
			continue;
		}
#endif
#ifdef CONFIG_PCI
		/*
		 * Blocks that overlap the pci reserved region must
		 * have enough space to hold the maximum percpu data
		 * region at the top of the range.  If there isn't
		 * enough space above the reserved region, just
		 * truncate the node.
		 */
		if (start <= pci_reserve_start_pfn &&
		    end > pci_reserve_start_pfn) {
			unsigned int per_cpu_size =
				__per_cpu_end - __per_cpu_start;
			unsigned int percpu_pages =
				NR_CPUS * PFN_UP(per_cpu_size);
			if (end < pci_reserve_end_pfn + percpu_pages) {
				end = pci_reserve_start_pfn;
				printk("PCI mapping region reduced node %d to"
				       " %ld pages\n", i, end - start);
			}
		}
#endif

		for (j = __pfn_to_highbits(start);
		     j <= __pfn_to_highbits(end - 1); j++)
			highbits_to_node[j] = i;

		node_start_pfn[i] = start;
		node_end_pfn[i] = end;
		node_controller[i] = range.controller;
		num_physpages += size;
		max_pfn = end;

		/* Mark node as online */
		node_set(i, node_online_map);
		node_set(i, node_possible_map);
	}

#ifndef __tilegx__
	/*
	 * For 4KB pages, mem_map "struct page" data is 1% of the size
	 * of the physical memory, so can be quite big (640 MB for
	 * four 16G zones).  These structures must be mapped in
	 * lowmem, and since we currently cap out at about 768 MB,
	 * it's impractical to try to use this much address space.
	 * For now, arbitrarily cap the amount of physical memory
	 * we're willing to use at 8 million pages (32GB of 4KB pages).
	 */
	cap = 8 * 1024 * 1024;	/* 8 million pages */
	if (num_physpages > cap) {
		int num_nodes = num_online_nodes();
		int cap_each = cap / num_nodes;
		unsigned long dropped_pages = 0;
		for (i = 0; i < num_nodes; ++i) {
			int size = node_end_pfn[i] - node_start_pfn[i];
			if (size > cap_each) {
				dropped_pages += (size - cap_each);
				node_end_pfn[i] = node_start_pfn[i] + cap_each;
			}
		}
		num_physpages -= dropped_pages;
		printk(KERN_WARNING "Only using %ldMB memory;"
		       " ignoring %ldMB.\n",
		       num_physpages >> (20 - PAGE_SHIFT),
		       dropped_pages >> (20 - PAGE_SHIFT));
		printk(KERN_WARNING "Consider using a larger page size.\n");
	}
#endif

	/* Heap starts just above the last loaded address. */
	min_low_pfn = PFN_UP((unsigned long)_end - PAGE_OFFSET);

#ifdef CONFIG_HIGHMEM
	/* Find where we map lowmem from each controller. */
	high_memory = setup_pa_va_mapping();

	/* Set max_low_pfn based on what node 0 can directly address. */
	max_low_pfn = node_lowmem_end_pfn[0];

	lowmem_pages = (mappable_physpages > MAXMEM_PFN) ?
		MAXMEM_PFN : mappable_physpages;
	highmem_pages = (long) (num_physpages - lowmem_pages);

	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
	       pages_to_mb(highmem_pages > 0 ? highmem_pages : 0));
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
	       pages_to_mb(lowmem_pages));
#else
	/* Set max_low_pfn based on what node 0 can directly address. */
	max_low_pfn = node_end_pfn[0];

#ifndef __tilegx__
	if (node_end_pfn[0] > MAXMEM_PFN) {
		printk(KERN_WARNING "Only using %ldMB LOWMEM.\n",
		       MAXMEM>>20);
		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
		max_low_pfn = MAXMEM_PFN;
		max_pfn = MAXMEM_PFN;
		num_physpages = MAXMEM_PFN;
		node_end_pfn[0] = MAXMEM_PFN;
	} else {
		printk(KERN_NOTICE "%ldMB memory available.\n",
		       pages_to_mb(node_end_pfn[0]));
	}
	for (i = 1; i < MAX_NUMNODES; ++i) {
		node_start_pfn[i] = 0;
		node_end_pfn[i] = 0;
	}
	high_memory = pfn_to_kaddr(node_end_pfn[0]);
#else
	lowmem_pages = 0;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		int pages = node_end_pfn[i] - node_start_pfn[i];
		lowmem_pages += pages;
		if (pages)
			high_memory = pfn_to_kaddr(node_end_pfn[i]);
	}
	printk(KERN_NOTICE "%ldMB memory available.\n",
	       pages_to_mb(lowmem_pages));
#endif
#endif
}

static void __init setup_bootmem_allocator(void)
{
	unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn;

	/* Provide a node 0 bdata. */
	NODE_DATA(0)->bdata = &node0_bdata;

#ifdef CONFIG_PCI
	/* Don't let boot memory alias the PCI region. */
	last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn);
#else
	last_alloc_pfn = max_low_pfn;
#endif

	/*
	 * Initialize the boot-time allocator (with low memory only):
	 * The first argument says where to put the bitmap, and the
	 * second says where the end of allocatable memory is.
	 */
	bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn);

	/*
	 * Let the bootmem allocator use all the space we've given it
	 * except for its own bitmap.
	 */
	first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size);
	if (first_alloc_pfn >= last_alloc_pfn)
		early_panic("Not enough memory on controller 0 for bootmem\n");

	free_bootmem(PFN_PHYS(first_alloc_pfn),
		     PFN_PHYS(last_alloc_pfn - first_alloc_pfn));

#ifdef CONFIG_KEXEC
	if (crashk_res.start != crashk_res.end)
		reserve_bootmem(crashk_res.start,
				crashk_res.end - crashk_res.start + 1, 0);
#endif
}

void *__init alloc_remap(int nid, unsigned long size)
{
	int pages = node_end_pfn[nid] - node_start_pfn[nid];
	void *map = pfn_to_kaddr(node_memmap_pfn[nid]);
	BUG_ON(size != pages * sizeof(struct page));
	memset(map, 0, size);
	return map;
}

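/*
 * Size to reserve per cpu: the static per-cpu data rounded up to a
 * full page, plus (when modules are enabled) extra room for dynamic
 * per-cpu allocations.
 */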
static int __init percpu_size(void)
{
	int size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
#ifdef CONFIG_MODULES
	if (size < PERCPU_ENOUGH_ROOM)
		size = PERCPU_ENOUGH_ROOM;
#endif
	/* In several places we assume the per-cpu data fits on a huge page. */
	BUG_ON(kdata_huge && size > HPAGE_SIZE);
	return size;
}

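/*
 * Allocate "size" bytes of page-aligned bootmem, optionally at the
 * exact physical address "goal" (zero means anywhere), and return
 * the corresponding page frame number.
 */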
static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
{
	void *kva = __alloc_bootmem(size, PAGE_SIZE, goal);
	unsigned long pfn = kaddr_to_pfn(kva);
	BUG_ON(goal && PFN_PHYS(pfn) != goal);
	return pfn;
}

static void __init zone_sizes_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
	unsigned long node_percpu[MAX_NUMNODES] = { 0 };
	int size = percpu_size();
	int num_cpus = smp_height * smp_width;
	int i;

	for (i = 0; i < num_cpus; ++i)
		node_percpu[cpu_to_node(i)] += size;

	for_each_online_node(i) {
		unsigned long start = node_start_pfn[i];
		unsigned long end = node_end_pfn[i];
#ifdef CONFIG_HIGHMEM
		unsigned long lowmem_end = node_lowmem_end_pfn[i];
#else
		unsigned long lowmem_end = end;
#endif
		int memmap_size = (end - start) * sizeof(struct page);
		node_free_pfn[i] = start;

		/*
		 * Set aside pages for per-cpu data and the mem_map array.
		 *
		 * Since the per-cpu data requires special homecaching,
		 * if we are in kdata_huge mode, we put it at the end of
		 * the lowmem region.  If we're not in kdata_huge mode,
		 * we take the per-cpu pages from the bottom of the
		 * controller, since that avoids fragmenting a huge page
		 * that users might want.  We always take the memmap
		 * from the bottom of the controller, since with
		 * kdata_huge that lets it be under a huge TLB entry.
		 *
		 * If the user has requested isolnodes for a controller,
		 * though, there'll be no lowmem, so we just alloc_bootmem
		 * the memmap.  There will be no percpu memory either.
		 */
		if (__pfn_to_highbits(start) == 0) {
			/* In low PAs, allocate via bootmem. */
			unsigned long goal = 0;
			node_memmap_pfn[i] =
				alloc_bootmem_pfn(memmap_size, goal);
			if (kdata_huge)
				goal = PFN_PHYS(lowmem_end) - node_percpu[i];
			if (node_percpu[i])
				node_percpu_pfn[i] =
					alloc_bootmem_pfn(node_percpu[i], goal);
		} else if (node_isset(i, isolnodes)) {
			node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0);
			BUG_ON(node_percpu[i] != 0);
		} else {
			/* In high PAs, just reserve some pages. */
			node_memmap_pfn[i] = node_free_pfn[i];
			node_free_pfn[i] += PFN_UP(memmap_size);
			if (!kdata_huge) {
				node_percpu_pfn[i] = node_free_pfn[i];
				node_free_pfn[i] += PFN_UP(node_percpu[i]);
			} else {
				node_percpu_pfn[i] =
					lowmem_end - PFN_UP(node_percpu[i]);
			}
		}

#ifdef CONFIG_HIGHMEM
		if (start > lowmem_end) {
			zones_size[ZONE_NORMAL] = 0;
			zones_size[ZONE_HIGHMEM] = end - start;
		} else {
			zones_size[ZONE_NORMAL] = lowmem_end - start;
			zones_size[ZONE_HIGHMEM] = end - lowmem_end;
		}
#else
		zones_size[ZONE_NORMAL] = end - start;
#endif

		/*
		 * Everyone shares node 0's bootmem allocator, but
		 * we use alloc_remap(), above, to put the actual
		 * struct page array on the individual controllers,
		 * which is most of the data that we actually care about.
		 * We can't place bootmem allocators on the other
		 * controllers since the bootmem allocator can only
		 * operate on 32-bit physical addresses.
		 */
		NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;

		free_area_init_node(i, zones_size, start, NULL);
		printk(KERN_DEBUG " DMA zone: %ld per-cpu pages\n",
		       PFN_UP(node_percpu[i]));

		/* Track the type of memory on each node */
		if (zones_size[ZONE_NORMAL])
			node_set_state(i, N_NORMAL_MEMORY);
#ifdef CONFIG_HIGHMEM
		if (end != start)
			node_set_state(i, N_HIGH_MEMORY);
#endif

		node_set_online(i);
	}
}

#ifdef CONFIG_NUMA

/* which logical CPUs are on which nodes */
struct cpumask node_2_cpu_mask[MAX_NUMNODES] __write_once;
EXPORT_SYMBOL(node_2_cpu_mask);

/* which node each logical CPU is on */
char cpu_2_node[NR_CPUS] __write_once __attribute__((aligned(L2_CACHE_BYTES)));
EXPORT_SYMBOL(cpu_2_node);

/* Return cpu_to_node() except for cpus not yet assigned, which return -1 */
static int __init cpu_to_bound_node(int cpu, struct cpumask *unbound_cpus)
{
	if (!cpu_possible(cpu) || cpumask_test_cpu(cpu, unbound_cpus))
		return -1;
	else
		return cpu_to_node(cpu);
}

/* Return number of immediately-adjacent tiles sharing the same NUMA node. */
static int __init node_neighbors(int node, int cpu,
				 struct cpumask *unbound_cpus)
{
	int neighbors = 0;
	int w = smp_width;
	int h = smp_height;
	int x = cpu % w;
	int y = cpu / w;
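	/*
	 * Cpus are numbered row-major across the smp_width x smp_height
	 * grid, so the west/east neighbors of this cpu are cpu-1/cpu+1
	 * and the north/south neighbors are cpu-w/cpu+w.
	 */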
	if (x > 0 && cpu_to_bound_node(cpu-1, unbound_cpus) == node)
		++neighbors;
	if (x < w-1 && cpu_to_bound_node(cpu+1, unbound_cpus) == node)
		++neighbors;
	if (y > 0 && cpu_to_bound_node(cpu-w, unbound_cpus) == node)
		++neighbors;
	if (y < h-1 && cpu_to_bound_node(cpu+w, unbound_cpus) == node)
		++neighbors;
	return neighbors;
}

static void __init setup_numa_mapping(void)
{
	int distance[MAX_NUMNODES][NR_CPUS];
	HV_Coord coord;
	int cpu, node, cpus, i, x, y;
	int num_nodes = num_online_nodes();
	struct cpumask unbound_cpus;
	nodemask_t default_nodes;

	cpumask_clear(&unbound_cpus);

	/* Get set of nodes we will use for defaults */
	nodes_andnot(default_nodes, node_online_map, isolnodes);
	if (nodes_empty(default_nodes)) {
		BUG_ON(!node_isset(0, node_online_map));
		printk("Forcing NUMA node zero available as a default node\n");
		node_set(0, default_nodes);
	}

	/* Populate the distance[] array */
	memset(distance, -1, sizeof(distance));
	cpu = 0;
	for (coord.y = 0; coord.y < smp_height; ++coord.y) {
		for (coord.x = 0; coord.x < smp_width;
		     ++coord.x, ++cpu) {
			BUG_ON(cpu >= nr_cpu_ids);
			if (!cpu_possible(cpu)) {
				cpu_2_node[cpu] = -1;
				continue;
			}
			for_each_node_mask(node, default_nodes) {
				HV_MemoryControllerInfo info =
					hv_inquire_memory_controller(
						coord, node_controller[node]);
				distance[node][cpu] =
					ABS(info.coord.x) + ABS(info.coord.y);
			}
			cpumask_set_cpu(cpu, &unbound_cpus);
		}
	}
	cpus = cpu;

	/*
	 * Round-robin through the NUMA nodes until all the cpus are
	 * assigned.  We could be more clever here (e.g. create four
	 * sorted linked lists on the same set of cpu nodes, and pull
	 * off them in round-robin sequence, removing from all four
	 * lists each time) but given the relatively small numbers
	 * involved, O(n^2) seems OK for a one-time cost.
	 */
	node = first_node(default_nodes);
	while (!cpumask_empty(&unbound_cpus)) {
		int best_cpu = -1;
		int best_distance = INT_MAX;
		for (cpu = 0; cpu < cpus; ++cpu) {
			if (cpumask_test_cpu(cpu, &unbound_cpus)) {
				/*
				 * Compute metric, which is how much
				 * closer the cpu is to this memory
				 * controller than the others, shifted
				 * up, and then the number of
				 * neighbors already in the node as an
				 * epsilon adjustment to try to keep
				 * the nodes compact.
				 */
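				/*
				 * Illustrative (made-up) numbers: with
				 * two default nodes and distances of 3
				 * to this node and 7 to the other,
				 * d = 3*2 - 7 = -1, then -8 after the
				 * "*= 8" below; the smallest d wins.
				 */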
				int d = distance[node][cpu] * num_nodes;
				for_each_node_mask(i, default_nodes) {
					if (i != node)
						d -= distance[i][cpu];
				}
				d *= 8;  /* allow space for epsilon */
				d -= node_neighbors(node, cpu, &unbound_cpus);
				if (d < best_distance) {
					best_cpu = cpu;
					best_distance = d;
				}
			}
		}
		BUG_ON(best_cpu < 0);
		cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
		cpu_2_node[best_cpu] = node;
		cpumask_clear_cpu(best_cpu, &unbound_cpus);
		node = next_node(node, default_nodes);
		if (node == MAX_NUMNODES)
			node = first_node(default_nodes);
	}

	/* Print out node assignments and set defaults for disabled cpus */
	cpu = 0;
	for (y = 0; y < smp_height; ++y) {
		printk(KERN_DEBUG "NUMA cpu-to-node row %d:", y);
		for (x = 0; x < smp_width; ++x, ++cpu) {
			if (cpu_to_node(cpu) < 0) {
				printk(" -");
				cpu_2_node[cpu] = first_node(default_nodes);
			} else {
				printk(" %d", cpu_to_node(cpu));
			}
		}
		printk("\n");
	}
}

static struct cpu cpu_devices[NR_CPUS];

static int __init topology_init(void)
{
	int i;

	for_each_online_node(i)
		register_one_node(i);

	for_each_present_cpu(i)
		register_cpu(&cpu_devices[i], i);

	return 0;
}

subsys_initcall(topology_init);

#else /* !CONFIG_NUMA */

#define setup_numa_mapping() do { } while (0)

#endif /* CONFIG_NUMA */

/**
 * setup_mpls() - Allow the user-space code to access various SPRs.
 *
 * Also called from online_secondary().
 */
void __cpuinit setup_mpls(void)
{
	/* Allow asynchronous TLB interrupts. */
#if CHIP_HAS_TILE_DMA()
	raw_local_irq_unmask(INT_DMATLB_MISS);
	raw_local_irq_unmask(INT_DMATLB_ACCESS);
#endif
#if CHIP_HAS_SN_PROC()
	raw_local_irq_unmask(INT_SNITLB_MISS);
#endif

	/*
	 * Allow user access to many generic SPRs, like the cycle
	 * counter, PASS/FAIL/DONE, INTERRUPT_CRITICAL_SECTION, etc.
	 */
	__insn_mtspr(SPR_MPL_WORLD_ACCESS_SET_0, 1);

#if CHIP_HAS_SN()
	/* Static network is not restricted. */
	__insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1);
#endif
#if CHIP_HAS_SN_PROC()
	__insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1);
	__insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1);
#endif

	/*
	 * Set the MPL for interrupt control 0 to user level.
	 * This includes access to the SYSTEM_SAVE and EX_CONTEXT SPRs,
	 * as well as the PL 0 interrupt mask.
	 */
	__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
}

static int __initdata set_initramfs_file;
static char __initdata initramfs_file[128] = "initramfs.cpio.gz";

static int __init setup_initramfs_file(char *str)
{
	if (str == NULL)
		return -EINVAL;
	strncpy(initramfs_file, str, sizeof(initramfs_file) - 1);
	set_initramfs_file = 1;

	return 0;
}
early_param("initramfs_file", setup_initramfs_file);

/*
 * We look for an additional "initramfs.cpio.gz" file in the hvfs.
 * If there is one, we allocate some memory for it and it will be
 * unpacked to the initramfs after any built-in initramfs_data.
 */
static void __init load_hv_initrd(void)
{
	HV_FS_StatInfo stat;
	int fd, rc;
	void *initrd;

	fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
	if (fd == HV_ENOENT) {
		if (set_initramfs_file)
			printk("No such hvfs initramfs file '%s'\n",
			       initramfs_file);
		return;
	}
	BUG_ON(fd < 0);
	stat = hv_fs_fstat(fd);
	BUG_ON(stat.size < 0);
	if (stat.flags & HV_FS_ISDIR) {
		printk("Ignoring hvfs file '%s': it's a directory.\n",
		       initramfs_file);
		return;
	}
	initrd = alloc_bootmem_pages(stat.size);
	rc = hv_fs_pread(fd, (HV_VirtAddr) initrd, stat.size, 0);
	if (rc != stat.size) {
		printk("Error reading %d bytes from hvfs file '%s': %d\n",
		       stat.size, initramfs_file, rc);
		free_bootmem((unsigned long) initrd, stat.size);
		return;
	}
	initrd_start = (unsigned long) initrd;
	initrd_end = initrd_start + stat.size;
}

void __init free_initrd_mem(unsigned long begin, unsigned long end)
{
	free_bootmem(begin, end - begin);
}

static void __init validate_hv(void)
{
	/*
	 * It may already be too late, but let's check our built-in
	 * configuration against what the hypervisor is providing.
	 */
	unsigned long glue_size = hv_sysconf(HV_SYSCONF_GLUE_SIZE);
	int hv_page_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL);
	int hv_hpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE);
	HV_ASIDRange asid_range;

#ifndef CONFIG_SMP
	HV_Topology topology = hv_inquire_topology();
	BUG_ON(topology.coord.x != 0 || topology.coord.y != 0);
	if (topology.width != 1 || topology.height != 1) {
		printk("Warning: booting UP kernel on %dx%d grid;"
		       " will ignore all but first tile.\n",
		       topology.width, topology.height);
	}
#endif

	if (PAGE_OFFSET + HV_GLUE_START_CPA + glue_size > (unsigned long)_text)
		early_panic("Hypervisor glue size %ld is too big!\n",
			    glue_size);
	if (hv_page_size != PAGE_SIZE)
		early_panic("Hypervisor page size %#x != our %#lx\n",
			    hv_page_size, PAGE_SIZE);
	if (hv_hpage_size != HPAGE_SIZE)
		early_panic("Hypervisor huge page size %#x != our %#lx\n",
			    hv_hpage_size, HPAGE_SIZE);

#ifdef CONFIG_SMP
	/*
	 * Some hypervisor APIs take a pointer to a bitmap array
	 * whose size is at least the number of cpus on the chip.
	 * We use a struct cpumask for this, so it must be big enough.
	 */
	if ((smp_height * smp_width) > nr_cpu_ids)
		early_panic("Hypervisor %d x %d grid too big for Linux"
			    " NR_CPUS %d\n", smp_height, smp_width,
			    nr_cpu_ids);
#endif

	/*
	 * Check that we're using allowed ASIDs, and initialize the
	 * various asid variables to their appropriate initial states.
	 */
	asid_range = hv_inquire_asid(0);
	__get_cpu_var(current_asid) = min_asid = asid_range.start;
	max_asid = asid_range.start + asid_range.size - 1;

	if (hv_confstr(HV_CONFSTR_CHIP_MODEL, (HV_VirtAddr)chip_model,
		       sizeof(chip_model)) < 0) {
		printk("Warning: HV_CONFSTR_CHIP_MODEL not available\n");
		strlcpy(chip_model, "unknown", sizeof(chip_model));
	}
}

static void __init validate_va(void)
{
#ifndef __tilegx__  /* FIXME: GX: probably some validation relevant here */
	/*
	 * Similarly, make sure we're only using allowed VAs.
	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
	 * and 0 .. KERNEL_HIGH_VADDR.
	 * In addition, make sure we CAN'T use the end of memory, since
	 * we use the last chunk of each pgd for the pgd_list.
	 */
	int i, fc_fd_ok = 0;
	unsigned long max_va = 0;
	unsigned long list_va =
		((PGD_LIST_OFFSET / sizeof(pgd_t)) << PGDIR_SHIFT);

	for (i = 0; ; ++i) {
		HV_VirtAddrRange range = hv_inquire_virtual(i);
		if (range.size == 0)
			break;
		if (range.start <= MEM_USER_INTRPT &&
		    range.start + range.size >= MEM_HV_INTRPT)
			fc_fd_ok = 1;
		if (range.start == 0)
			max_va = range.size;
		BUG_ON(range.start + range.size > list_va);
	}
	if (!fc_fd_ok)
		early_panic("Hypervisor not configured for VAs 0xfc/0xfd\n");
	if (max_va == 0)
		early_panic("Hypervisor not configured for low VAs\n");
	if (max_va < KERNEL_HIGH_VADDR)
		early_panic("Hypervisor max VA %#lx smaller than %#lx\n",
			    max_va, KERNEL_HIGH_VADDR);

	/* Kernel PCs must have their high bit set; see intvec.S. */
	if ((long)VMALLOC_START >= 0)
		early_panic(
			"Linux VMALLOC region below the 2GB line (%#lx)!\n"
			"Reconfigure the kernel with fewer NR_HUGE_VMAPS\n"
			"or smaller VMALLOC_RESERVE.\n",
			VMALLOC_START);
#endif
}

/*
 * cpu_lotar_map lists all the cpus that are valid for the supervisor
 * to cache data on at a page level, i.e. what cpus can be placed in
 * the LOTAR field of a PTE.  It is equivalent to the set of possible
 * cpus plus any other cpus that are willing to share their cache.
 * It is set by hv_inquire_tiles(HV_INQ_TILES_LOTAR).
 */
struct cpumask __write_once cpu_lotar_map;
EXPORT_SYMBOL(cpu_lotar_map);

#if CHIP_HAS_CBOX_HOME_MAP()
/*
 * hash_for_home_map lists all the tiles that hash-for-home data
 * will be cached on.  Note that this may include tiles that are not
 * valid for this supervisor to use otherwise (e.g. if a hypervisor
 * device is being shared between multiple supervisors).
 * It is set by hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE).
 */
struct cpumask hash_for_home_map;
EXPORT_SYMBOL(hash_for_home_map);
#endif

/*
 * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
 * flush on our behalf.  It is set to cpu_possible_map OR'ed with
 * hash_for_home_map, and it is what should be passed to
 * hv_flush_remote() to flush all caches.  Note that if there are
 * dedicated hypervisor driver tiles that have authorized use of their
 * cache, those tiles will only appear in cpu_lotar_map, NOT in
 * cpu_cacheable_map, as they are a special case.
 */
struct cpumask __write_once cpu_cacheable_map;
EXPORT_SYMBOL(cpu_cacheable_map);

static __initdata struct cpumask disabled_map;

static int __init disabled_cpus(char *str)
{
	int boot_cpu = smp_processor_id();

	if (str == NULL || cpulist_parse_crop(str, &disabled_map) != 0)
		return -EINVAL;
	if (cpumask_test_cpu(boot_cpu, &disabled_map)) {
		printk("disabled_cpus: can't disable boot cpu %d\n", boot_cpu);
		cpumask_clear_cpu(boot_cpu, &disabled_map);
	}
	return 0;
}

early_param("disabled_cpus", disabled_cpus);
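
/*
 * E.g. "disabled_cpus=2,4-7" removes cpus 2 and 4 through 7 from the
 * possible set; the boot cpu itself can never be disabled.
 */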

void __init print_disabled_cpus(void)
{
	if (!cpumask_empty(&disabled_map)) {
		char buf[100];
		cpulist_scnprintf(buf, sizeof(buf), &disabled_map);
		printk(KERN_INFO "CPUs not available for Linux: %s\n", buf);
	}
}

static void __init setup_cpu_maps(void)
{
	struct cpumask hv_disabled_map, cpu_possible_init;
	int boot_cpu = smp_processor_id();
	int cpus, i, rc;

	/* Learn which cpus are allowed by the hypervisor. */
	rc = hv_inquire_tiles(HV_INQ_TILES_AVAIL,
			      (HV_VirtAddr) cpumask_bits(&cpu_possible_init),
			      sizeof(cpu_cacheable_map));
	if (rc < 0)
		early_panic("hv_inquire_tiles(AVAIL) failed: rc %d\n", rc);
	if (!cpumask_test_cpu(boot_cpu, &cpu_possible_init))
		early_panic("Boot CPU %d disabled by hypervisor!\n", boot_cpu);

	/* Compute the cpus disabled by the hvconfig file. */
	cpumask_complement(&hv_disabled_map, &cpu_possible_init);

	/* Include them with the cpus disabled by "disabled_cpus". */
	cpumask_or(&disabled_map, &disabled_map, &hv_disabled_map);

	/*
	 * Disable every cpu after "setup_max_cpus".  But don't mark
	 * as disabled the cpus that are outside of our initial rectangle,
	 * since that turns out to be confusing.
	 */
	cpus = 1;			    /* this cpu */
	cpumask_set_cpu(boot_cpu, &disabled_map);   /* ignore this cpu */
	for (i = 0; cpus < setup_max_cpus; ++i)
		if (!cpumask_test_cpu(i, &disabled_map))
			++cpus;
	for (; i < smp_height * smp_width; ++i)
		cpumask_set_cpu(i, &disabled_map);
	cpumask_clear_cpu(boot_cpu, &disabled_map); /* reset this cpu */
	for (i = smp_height * smp_width; i < NR_CPUS; ++i)
		cpumask_clear_cpu(i, &disabled_map);

	/*
	 * Setup cpu_possible map as every cpu allocated to us, minus
	 * the results of any "disabled_cpus" settings.
	 */
	cpumask_andnot(&cpu_possible_init, &cpu_possible_init, &disabled_map);
	init_cpu_possible(&cpu_possible_init);

	/* Learn which cpus are valid for LOTAR caching. */
	rc = hv_inquire_tiles(HV_INQ_TILES_LOTAR,
			      (HV_VirtAddr) cpumask_bits(&cpu_lotar_map),
			      sizeof(cpu_lotar_map));
	if (rc < 0) {
		printk("warning: no HV_INQ_TILES_LOTAR; using AVAIL\n");
		cpu_lotar_map = cpu_possible_map;
	}

#if CHIP_HAS_CBOX_HOME_MAP()
	/* Retrieve set of CPUs used for hash-for-home caching */
	rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE,
			      (HV_VirtAddr) hash_for_home_map.bits,
			      sizeof(hash_for_home_map));
	if (rc < 0)
		early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc);
	cpumask_or(&cpu_cacheable_map, &cpu_possible_map, &hash_for_home_map);
#else
	cpu_cacheable_map = cpu_possible_map;
#endif
}


static int __init dataplane(char *str)
{
	printk("WARNING: dataplane support disabled in this kernel\n");
	return 0;
}

early_param("dataplane", dataplane);

#ifdef CONFIG_CMDLINE_BOOL
static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
#endif

void __init setup_arch(char **cmdline_p)
{
	int len;

#if defined(CONFIG_CMDLINE_BOOL) && defined(CONFIG_CMDLINE_OVERRIDE)
	len = hv_get_command_line((HV_VirtAddr) boot_command_line,
				  COMMAND_LINE_SIZE);
	if (boot_command_line[0])
		printk("WARNING: ignoring dynamic command line \"%s\"\n",
		       boot_command_line);
	strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
	char *hv_cmdline;
#if defined(CONFIG_CMDLINE_BOOL)
	if (builtin_cmdline[0]) {
		int builtin_len = strlcpy(boot_command_line, builtin_cmdline,
					  COMMAND_LINE_SIZE);
		if (builtin_len < COMMAND_LINE_SIZE-1)
			boot_command_line[builtin_len++] = ' ';
		hv_cmdline = &boot_command_line[builtin_len];
		len = COMMAND_LINE_SIZE - builtin_len;
	} else
#endif
	{
		hv_cmdline = boot_command_line;
		len = COMMAND_LINE_SIZE;
	}
	len = hv_get_command_line((HV_VirtAddr) hv_cmdline, len);
	if (len < 0 || len > COMMAND_LINE_SIZE)
		early_panic("hv_get_command_line failed: %d\n", len);
#endif

	*cmdline_p = boot_command_line;

	/* Set disabled_map and setup_max_cpus very early */
	parse_early_param();

	/* Make sure the kernel is compatible with the hypervisor. */
	validate_hv();
	validate_va();

	setup_cpu_maps();


#ifdef CONFIG_PCI
	/*
	 * Initialize the PCI structures.  This is done before memory
	 * setup so that we know whether or not a pci_reserve region
	 * is necessary.
	 */
	if (tile_pci_init() == 0)
		pci_reserve_mb = 0;

	/* PCI systems reserve a region just below 4GB for mapping iomem. */
	pci_reserve_end_pfn = (1 << (32 - PAGE_SHIFT));
	pci_reserve_start_pfn = pci_reserve_end_pfn -
		(pci_reserve_mb << (20 - PAGE_SHIFT));
#endif

	init_mm.start_code = (unsigned long) _text;
	init_mm.end_code = (unsigned long) _etext;
	init_mm.end_data = (unsigned long) _edata;
	init_mm.brk = (unsigned long) _end;

	setup_memory();
	store_permanent_mappings();
	setup_bootmem_allocator();

	/*
	 * NOTE: before this point _nobody_ is allowed to allocate
	 * any memory using the bootmem allocator.
	 */

	paging_init();
	setup_numa_mapping();
	zone_sizes_init();
	set_page_homes();
	setup_mpls();
	setup_clock();
	load_hv_initrd();
}


/*
 * Set up per-cpu memory.
 */

unsigned long __per_cpu_offset[NR_CPUS] __write_once;
EXPORT_SYMBOL(__per_cpu_offset);

static size_t __initdata pfn_offset[MAX_NUMNODES] = { 0 };
static unsigned long __initdata percpu_pfn[NR_CPUS] = { 0 };

/*
 * As the percpu code allocates pages, we return the pages from the
 * end of the node for the specified cpu.
 */
static void *__init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	int nid = cpu_to_node(cpu);
	unsigned long pfn = node_percpu_pfn[nid] + pfn_offset[nid];

	BUG_ON(size % PAGE_SIZE != 0);
	pfn_offset[nid] += size / PAGE_SIZE;
	if (percpu_pfn[cpu] == 0)
		percpu_pfn[cpu] = pfn;
	return pfn_to_kaddr(pfn);
}

/*
 * Pages reserved for percpu memory are not freeable, and in any case we are
 * on a short path to panic() in setup_per_cpu_areas() at this point anyway.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
}

/*
 * Set up vmalloc page tables using bootmem for the percpu code.
 */
static void __init pcpu_fc_populate_pte(unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	BUG_ON(pgd_addr_invalid(addr));

	pgd = swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	BUG_ON(!pud_present(*pud));
	pmd = pmd_offset(pud, addr);
	if (pmd_present(*pmd)) {
		BUG_ON(pmd_huge_page(*pmd));
	} else {
		pte = __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE,
				      HV_PAGE_TABLE_ALIGN, 0);
		pmd_populate_kernel(&init_mm, pmd, pte);
	}
}

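/*
 * Build the first percpu chunk from the pages reserved per node, then
 * re-home each cpu's copy so it is cached on that cpu: flush the data
 * out of our cache, rewrite both the vmalloc and lowmem PTEs to make
 * the owning cpu the page's home, and flush the TLB.
 */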
void __init setup_per_cpu_areas(void)
{
	struct page *pg;
	unsigned long delta, pfn, lowmem_va;
	unsigned long size = percpu_size();
	char *ptr;
	int rc, cpu, i;

	rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_fc_alloc,
				   pcpu_fc_free, pcpu_fc_populate_pte);
	if (rc < 0)
		panic("Cannot initialize percpu area (err=%d)", rc);

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];

		/* finv the copy out of cache so we can change homecache */
		ptr = pcpu_base_addr + pcpu_unit_offsets[cpu];
		__finv_buffer(ptr, size);
		pfn = percpu_pfn[cpu];

		/* Rewrite the page tables to cache on that cpu */
		pg = pfn_to_page(pfn);
		for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) {

			/* Update the vmalloc mapping and page home. */
			pte_t *ptep =
				virt_to_pte(NULL, (unsigned long)ptr + i);
			pte_t pte = *ptep;
			BUG_ON(pfn != pte_pfn(pte));
			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
			pte = set_remote_cache_cpu(pte, cpu);
			set_pte(ptep, pte);

			/* Update the lowmem mapping for consistency. */
			lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
			ptep = virt_to_pte(NULL, lowmem_va);
			if (pte_huge(*ptep)) {
				printk(KERN_DEBUG "early shatter of huge page"
				       " at %#lx\n", lowmem_va);
				shatter_pmd((pmd_t *)ptep);
				ptep = virt_to_pte(NULL, lowmem_va);
				BUG_ON(pte_huge(*ptep));
			}
			BUG_ON(pfn != pte_pfn(*ptep));
			set_pte(ptep, pte);
		}
	}

	/* Set our thread pointer appropriately. */
	set_my_cpu_offset(__per_cpu_offset[smp_processor_id()]);

	/* Make sure the finv's have completed. */
	mb_incoherent();

	/* Flush the TLB so we reference it properly from here on out. */
	local_flush_tlb_all();
}

static struct resource data_resource = {
	.name = "Kernel data",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

static struct resource code_resource = {
	.name = "Kernel code",
	.start = 0,
	.end = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

/*
 * We reserve all resources above 4GB so that PCI won't try to put
 * mappings above 4GB; the standard allows that for some devices but
 * the probing code truncates values to 32 bits.
 */
#ifdef CONFIG_PCI
static struct resource* __init
insert_non_bus_resource(void)
{
	struct resource *res =
		kzalloc(sizeof(struct resource), GFP_ATOMIC);
	res->name = "Non-Bus Physical Address Space";
	res->start = (1ULL << 32);
	res->end = -1LL;
	res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
	if (insert_resource(&iomem_resource, res)) {
		kfree(res);
		return NULL;
	}
	return res;
}
#endif

static struct resource* __init
insert_ram_resource(u64 start_pfn, u64 end_pfn)
{
	struct resource *res =
		kzalloc(sizeof(struct resource), GFP_ATOMIC);
	res->name = "System RAM";
	res->start = start_pfn << PAGE_SHIFT;
	res->end = (end_pfn << PAGE_SHIFT) - 1;
	res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
	if (insert_resource(&iomem_resource, res)) {
		kfree(res);
		return NULL;
	}
	return res;
}

/*
 * Request address space for all standard resources
 *
 * If the system includes PCI root complex drivers, we need to create
 * a window just below 4GB where PCI BARs can be mapped.
 */
static int __init request_standard_resources(void)
{
	int i;
	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };

	iomem_resource.end = -1LL;
#ifdef CONFIG_PCI
	insert_non_bus_resource();
#endif

	for_each_online_node(i) {
		u64 start_pfn = node_start_pfn[i];
		u64 end_pfn = node_end_pfn[i];

#ifdef CONFIG_PCI
		if (start_pfn <= pci_reserve_start_pfn &&
		    end_pfn > pci_reserve_start_pfn) {
			if (end_pfn > pci_reserve_end_pfn)
				insert_ram_resource(pci_reserve_end_pfn,
						    end_pfn);
			end_pfn = pci_reserve_start_pfn;
		}
#endif
		insert_ram_resource(start_pfn, end_pfn);
	}

	code_resource.start = __pa(_text - CODE_DELTA);
	code_resource.end = __pa(_etext - CODE_DELTA)-1;
	data_resource.start = __pa(_sdata);
	data_resource.end = __pa(_end)-1;

	insert_resource(&iomem_resource, &code_resource);
	insert_resource(&iomem_resource, &data_resource);

#ifdef CONFIG_KEXEC
	insert_resource(&iomem_resource, &crashk_res);
#endif

	return 0;
}

subsys_initcall(request_standard_resources);