/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const unsigned int *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes.
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}

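/*
 * Fake NUMA support, driven by the "numa=fake=<boundary>[,...]" command
 * line option (stashed in 'cmdline' by early_numa() below). The option is
 * a comma-separated list of ascending memory boundaries, each parsed by
 * memparse(); whenever the memory scan crosses the next boundary, a new
 * fake node is created. Illustrative example (not from the original
 * source): "numa=fake=1G,4G" places memory below 1G in node 0, [1G, 4G)
 * in node 1, and the remainder in node 2.
 */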
static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id if we have started creating NUMA nodes.
	 * We want to continue from where we left off the last time.
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

/*
 * get_node_active_region - Return active region containing pfn
 * Active range returned is empty if none found.
 * @pfn: The page to return the region for
 * @node_ar: Returned set to the active region containing @pfn
 */
static void __init get_node_active_region(unsigned long pfn,
					  struct node_active_region *node_ar)
{
	unsigned long start_pfn, end_pfn;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		if (pfn >= start_pfn && pfn < end_pfn) {
			node_ar->nid = nid;
			node_ar->start_pfn = start_pfn;
			node_ar->end_pfn = end_pfn;
			break;
		}
	}
}

static void map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}
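
/*
 * The ibm,associativity property is a counted list: the first cell holds
 * the number of domain cells that follow. That is why
 * associativity_to_nid() below checks associativity[0] against
 * min_common_depth before indexing into the array.
 */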

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const u32 *of_get_usable_memory(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;
	return prop;
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return distance;

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}

static void initialize_distance_lookup_table(int nid,
		const unsigned int *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		distance_lookup_table[nid][i] =
			associativity[distance_ref_points[i]];
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const unsigned int *associativity)
{
	int nid = -1;

	if (min_common_depth == -1)
		goto out;

	if (associativity[0] >= min_common_depth)
		nid = associativity[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;

	if (nid > 0 && associativity[0] >= distance_ref_points_depth)
		initialize_distance_lookup_table(nid, associativity);

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *chosen;
	struct device_node *root;
	const char *vec5;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
			"ibm,associativity-reference-points",
			&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

#define VEC5_AFFINITY_BYTE	5
#define VEC5_AFFINITY		0x80

	if (firmware_has_feature(FW_FEATURE_OPAL))
		form1_affinity = 1;
	else {
		chosen = of_find_node_by_path("/chosen");
		if (chosen) {
			vec5 = of_get_property(chosen,
					"ibm,architecture-vec-5", NULL);
			if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
				dbg("Using form 1 affinity\n");
				form1_affinity = 1;
			}

			of_node_put(chosen);
		}
	}

	if (form1_affinity) {
		depth = distance_ref_points[0];
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = distance_ref_points[1];
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}
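
/*
 * Example: with n = 2 and the cells { 0x00000001, 0x00000000 },
 * read_n_cells() returns 0x100000000 and advances *buf past both cells.
 */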

/*
 * Read the next memblock list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a count N followed by
 * N memblock list entries. Each memblock list entry contains information
 * as laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything.
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32 n_arrays;
	u32 array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}
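
/*
 * Worked example for of_drconf_to_nid_single(): with array_sz = 4 and
 * min_common_depth = 2, an LMB with aa_index = 3 takes its node id from
 * aa->arrays[3 * 4 + 2 - 1], i.e. the second cell of the fourth
 * associativity array.
 */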

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
				       unsigned long action,
				       void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}
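
/*
 * Example: if memblock_end_of_DRAM() is 4GB, a 2GB region starting at
 * 3GB is truncated to 1GB, and a region starting at or above 4GB is
 * discarded (returned size 0).
 */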

/*
 * Reads the counter for a given entry in the
 * linux,drconf-usable-memory property.
 */
static inline int __init read_usm_ranges(const u32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in the linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *uninitialized_var(dm), *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa = { .arrays = NULL };

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) tuples */
				continue;
		}
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(
				((base + size) >> PAGE_SHIFT),
				&nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				memblock_set_node(base, sz, nid);
		} while (--ranges);
	}
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
				"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		memblock_set_node(start, size, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn), nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", nr_cpu_ids - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < memblock_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, satisfying the memblock or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the virtual address of the memory.
 */
static void __init *careful_zallocation(int nid, unsigned long size,
					unsigned long align,
					unsigned long end_pfn)
{
	void *ret;
	int new_nid;
	unsigned long ret_paddr;

	ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret_paddr)
		ret_paddr = __memblock_alloc_base(size, align,
						  memblock_end_of_DRAM());

	if (!ret_paddr)
		panic("numa.c: cannot allocate %lu bytes for node %d",
		      size, nid);

	ret = __va(ret_paddr);

	/*
	 * We initialize the nodes in numeric order: 0, 1, 2...
	 * and hand over control from the MEMBLOCK allocator to the
	 * bootmem allocator. If this function is called for
	 * node 5, then we know that all nodes <5 are using the
	 * bootmem allocator instead of the MEMBLOCK allocator.
	 *
	 * So, check the nid from which this allocation came
	 * and double check to see if we need to use bootmem
	 * instead of the MEMBLOCK. We don't free the MEMBLOCK memory
	 * since it would be useless.
	 */
	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
					   size, align, 0);

		dbg("alloc_bootmem %p %lx\n", ret, size);
	}

	memset(ret, 0, size);
	return ret;
}

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

static void __init mark_reserved_regions_for_nid(int nid)
{
	struct pglist_data *node = NODE_DATA(nid);
	struct memblock_region *reg;

	for_each_memblock(reserved, reg) {
		unsigned long physbase = reg->base;
		unsigned long size = reg->size;
		unsigned long start_pfn = physbase >> PAGE_SHIFT;
		unsigned long end_pfn = PFN_UP(physbase + size);
		struct node_active_region node_ar;
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		/*
		 * Check to make sure that this memblock.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
		       node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- physbase;
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().
			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 * get next active region that contains this
			 * reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}

void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around. Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}

	init_bootmem_done = 1;

	/*
	 * Now bootmem is initialised we can create the node to cpumask
	 * lookup tables and setup the cpu callback to populate them.
	 */
	setup_node_to_cpumask_map();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int drconf_cell_cnt, rc;
	unsigned long lmb_size;
	struct assoc_arrays aa;
	int nid = -1;

	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
	if (!drconf_cell_cnt)
		return -1;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return -1;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return -1;

	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < drmem.base_addr)
		    || (scn_addr >= (drmem.base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = -1;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid, found = 0;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;

	if (NODE_DATA(nid)->node_spanned_pages)
		return nid;

	for_each_online_node(nid) {
		if (NODE_DATA(nid)->node_spanned_pages) {
			found = 1;
			break;
		}
	}

	BUG_ON(!found);
	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	unsigned int drconf_cell_cnt = 0;
	u64 lmb_size = 0;
	const u32 *dm = 0;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
		lmb_size = of_get_lmb_size(memory);
		of_node_put(memory);
	}
	return lmb_size * drconf_cell_cnt;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static void set_topology_timer(void);

/*
 * Store the current values of the associativity change counters
 * provided by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu, nr_cpus = 0;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	cpumask_clear(changes);

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_set_cpu(cpu, changes);
			nr_cpus++;
		}
	}

	return nr_cpus;
}

/*
 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
 * the complete property we have to add the length in the first cell.
 */
#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)

/*
 * Convert the associativity domain numbers returned from the hypervisor
 * to the sequence they would appear in the ibm,associativity property.
 */
static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
{
	int i, nr_assoc_doms = 0;
	const u16 *field = (const u16 *) packed;

#define VPHN_FIELD_UNUSED	(0xffff)
#define VPHN_FIELD_MSB		(0x8000)
#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)
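	/*
	 * Field encoding examples (big-endian ppc64): 0xffff marks an
	 * unused field; a field with the MSB set, e.g. 0x8002, carries a
	 * single 15-bit domain value (here 2); a field with the MSB clear
	 * is the upper half of a 32-bit value spanning two fields, so
	 * 0x0001 0x0002 unpack as 0x00010002.
	 */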

	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
		if (*field == VPHN_FIELD_UNUSED) {
			/* All significant fields processed, and remaining
			 * fields contain the reserved value of all 1's.
			 * Just store them.
			 */
			unpacked[i] = *((u32 *)field);
			field += 2;
		} else if (*field & VPHN_FIELD_MSB) {
			/* Data is in the lower 15 bits of this field */
			unpacked[i] = *field & VPHN_FIELD_MASK;
			field++;
			nr_assoc_doms++;
		} else {
			/* Data is in the lower 15 bits of this field
			 * concatenated with the next 16 bit field
			 */
			unpacked[i] = *((u32 *)field);
			field += 2;
			nr_assoc_doms++;
		}
	}

	/* The first cell contains the length of the property */
	unpacked[0] = nr_assoc_doms;

	return nr_assoc_doms;
}

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
{
	long rc;
	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
	u64 flags = 1;
	int hwcpu = get_hard_smp_processor_id(cpu);

	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
	vphn_unpack_associativity(retbuf, associativity);

	return rc;
}

static long vphn_get_associativity(unsigned long cpu,
				   unsigned int *associativity)
{
	long rc;

	rc = hcall_vphn(cpu, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
	}

	return rc;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 */
int arch_update_cpu_topology(void)
{
	int cpu, nid, old_nid, changed = 0;
	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
	struct device *dev;

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		vphn_get_associativity(cpu, associativity);
		nid = associativity_to_nid(associativity);

		if (nid < 0 || !node_online(nid))
			nid = first_online_node;

		old_nid = numa_cpu_lookup_table[cpu];

		/* Disable hotplug while we update the cpu
		 * masks and sysfs.
		 */
		get_online_cpus();
		unregister_cpu_under_node(cpu, old_nid);
		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, nid);
		register_cpu_under_node(cpu, nid);
		put_online_cpus();

		dev = get_cpu_device(cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		changed = 1;
	}

	return changed;
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(unsigned long ignored)
{
	if (!vphn_enabled)
		return;
	if (update_cpu_associativity_changes_mask() > 0)
		topology_schedule_update();
	set_topology_timer();
}
static struct timer_list topology_timer =
	TIMER_INITIALIZER(topology_timer_fn, 0, 0);

static void set_topology_timer(void)
{
	topology_timer.data = 0;
	topology_timer.expires = jiffies + 60 * HZ;
	add_timer(&topology_timer);
}

/*
 * Start polling for VPHN associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	/* Disabled until races with load balancing are fixed */
	if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
	    get_lppaca()->shared_proc) {
		vphn_enabled = 1;
		setup_cpu_associativity_change_counters();
		init_timer_deferrable(&topology_timer);
		set_topology_timer();
		rc = 1;
	}

	return rc;
}
__initcall(start_topology_update);

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	vphn_enabled = 0;
	return del_timer_sync(&topology_timer);
}
#endif /* CONFIG_PPC_SPLPAR */