/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

int acpi_numa __initdata;

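/*
 * RESERVE_HOTADD: reserve SRAT hot-add areas at boot time only when ACPI
 * memory hotplug is configured but the generic memory hotplug code is not
 * (presumably because nothing else would then keep the hot-add ranges free
 * until the memory actually arrives).
 */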
#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
		&& !defined(CONFIG_MEMORY_HOTPLUG)
#define RESERVE_HOTADD 1
#endif

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;
#ifndef RESERVE_HOTADD
#define hotadd_percent 0	/* Ignore all settings */
#endif

/* Nodes that are too small confuse the VM badly. Usually they are the
   result of BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

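/* Map an ACPI proximity domain (PXM) to a logical node id. */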
static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

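/*
 * Return the already parsed node whose range overlaps [start, end),
 * or -1 if the new range conflicts with nothing.
 */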
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
	for_each_node_mask(i, nodes_parsed) {
		struct bootnode *nd = &nodes[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

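/*
 * Clamp node i's range so it lies inside [start, end). Skipped entirely
 * once a hot-add area has been found.
 */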
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

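/* Throw away all SRAT derived state and mark ACPI NUMA as unusable. */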
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes_add[i].end = 0;
}

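/* True if NUMA is disabled on the command line or the SRAT was rejected. */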
static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOSes fill in 10 (the local distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick sanity checks here and only use the SLIT if they pass.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->localities;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
	int pxm, node;
	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
		bad_srat();
		return;
	}
	if (pa->flags.enabled == 0)
		return;
	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
}

#ifdef RESERVE_HOTADD
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	if (mem < 0)
		return 0;
	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory, up to
		   hotadd_percent. It would be better to spread the limit out
		   over multiple hotplug areas, but that is too complicated
		   right now. */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely foolproof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

/*
 * It is fine to add this area to the nodes data; it will be used later.
 * This code supports one contiguous hot-add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd, write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes. */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node])) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	found_add_area = 1;
	if (nd->start == nd->end) {
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
	}

	if ((nd->end >> PAGE_SHIFT) > end_pfn)
		end_pfn = nd->end >> PAGE_SHIFT;

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return 0;
}
#endif

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
		bad_srat();
		return;
	}
	if (ma->flags.enabled == 0)
		return;
	if (ma->flags.hot_pluggable && hotadd_percent == 0)
		return;
	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
		       nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);

#ifdef RESERVE_HOTADD
	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
#endif
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= e820_hole_size(s, e);
		pxmram -= nodes_add[i].end - nodes_add[i].start;
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = end_pfn - e820_hole_size(0, end_pfn);
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

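/* Forget everything that was parsed for this node. */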
static void unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Finally register nodes */
	for_each_node_mask(i, nodes_parsed)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, nodes_parsed)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

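/*
 * Reserve the recorded hot-add range of a node in its bootmem allocator
 * so the not-yet-present memory is never handed to the page allocator;
 * the cost printed below is the struct page array needed to cover it.
 */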
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start);
	}
}

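/*
 * SLIT distance between nodes a and b. Without a valid SLIT, fall back
 * to 10 for the local node and 20 for everything else.
 */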
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->localities * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);