Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Some of the code in this file has been gleaned from the 64 bit |
| 3 | * discontigmem support code base. |
| 4 | * |
| 5 | * Copyright (C) 2002, IBM Corp. |
| 6 | * |
| 7 | * All rights reserved. |
| 8 | * |
| 9 | * This program is free software; you can redistribute it and/or modify |
| 10 | * it under the terms of the GNU General Public License as published by |
| 11 | * the Free Software Foundation; either version 2 of the License, or |
| 12 | * (at your option) any later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but |
| 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or |
| 17 | * NON INFRINGEMENT. See the GNU General Public License for more |
| 18 | * details. |
| 19 | * |
| 20 | * You should have received a copy of the GNU General Public License |
| 21 | * along with this program; if not, write to the Free Software |
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 23 | * |
| 24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> |
| 25 | */ |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 26 | #include <linux/mm.h> |
| 27 | #include <linux/bootmem.h> |
| 28 | #include <linux/mmzone.h> |
| 29 | #include <linux/acpi.h> |
| 30 | #include <linux/nodemask.h> |
| 31 | #include <asm/srat.h> |
| 32 | #include <asm/topology.h> |
| 33 | |
| 34 | /* |
| 35 | * proximity macros and definitions |
| 36 | */ |
| 37 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ |
| 38 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ |
| 39 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) |
| 40 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 41 | /* bitmap length; _PXM is at most 255 */ |
| 42 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) |
| 43 | static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ |
| 44 | |
| 45 | #define MAX_CHUNKS_PER_NODE 4 |
| 46 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) |
| 47 | struct node_memory_chunk_s { |
| 48 | unsigned long start_pfn; |
| 49 | unsigned long end_pfn; |
| 50 | u8 pxm; // proximity domain of node |
| 51 | u8 nid; // which cnode contains this chunk? |
| 52 | u8 bank; // which mem bank on this node |
| 53 | }; |
| 54 | static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; |
| 55 | |
| 56 | static int num_memory_chunks; /* total number of memory chunks */ |
| 57 | static int zholes_size_init; |
| 58 | static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; |
| 59 | |
| 60 | extern void * boot_ioremap(unsigned long, unsigned long); |
| 61 | |
| 62 | /* Identify CPU proximity domains */ |
| 63 | static void __init parse_cpu_affinity_structure(char *p) |
| 64 | { |
| 65 | struct acpi_table_processor_affinity *cpu_affinity = |
| 66 | (struct acpi_table_processor_affinity *) p; |
| 67 | |
| 68 | if (!cpu_affinity->flags.enabled) |
| 69 | return; /* empty entry */ |
| 70 | |
| 71 | /* mark this node as "seen" in node bitmap */ |
| 72 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); |
| 73 | |
| 74 | printk("CPU 0x%02X in proximity domain 0x%02X\n", |
| 75 | cpu_affinity->apic_id, cpu_affinity->proximity_domain); |
| 76 | } |
| 77 | |
| 78 | /* |
| 79 | * Identify memory proximity domains and hot-remove capabilities. |
| 80 | * Fill node memory chunk list structure. |
| 81 | */ |
| 82 | static void __init parse_memory_affinity_structure (char *sratp) |
| 83 | { |
| 84 | unsigned long long paddr, size; |
| 85 | unsigned long start_pfn, end_pfn; |
| 86 | u8 pxm; |
| 87 | struct node_memory_chunk_s *p, *q, *pend; |
| 88 | struct acpi_table_memory_affinity *memory_affinity = |
| 89 | (struct acpi_table_memory_affinity *) sratp; |
| 90 | |
| 91 | if (!memory_affinity->flags.enabled) |
| 92 | return; /* empty entry */ |
| 93 | |
| 94 | /* mark this node as "seen" in node bitmap */ |
| 95 | BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain); |
| 96 | |
| 97 | /* calculate info for memory chunk structure */ |
| 98 | paddr = memory_affinity->base_addr_hi; |
| 99 | paddr = (paddr << 32) | memory_affinity->base_addr_lo; |
| 100 | size = memory_affinity->length_hi; |
| 101 | size = (size << 32) | memory_affinity->length_lo; |
| 102 | |
| 103 | start_pfn = paddr >> PAGE_SHIFT; |
| 104 | end_pfn = (paddr + size) >> PAGE_SHIFT; |
| 105 | |
| 106 | pxm = memory_affinity->proximity_domain; |
| 107 | |
| 108 | if (num_memory_chunks >= MAXCHUNKS) { |
| 109 | printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", |
| 110 | size/(1024*1024), paddr); |
| 111 | return; |
| 112 | } |
| 113 | |
| 114 | /* Insertion sort based on base address */ |
| 115 | pend = &node_memory_chunk[num_memory_chunks]; |
| 116 | for (p = &node_memory_chunk[0]; p < pend; p++) { |
| 117 | if (start_pfn < p->start_pfn) |
| 118 | break; |
| 119 | } |
| 120 | if (p < pend) { |
| 121 | for (q = pend; q >= p; q--) |
| 122 | *(q + 1) = *q; |
| 123 | } |
| 124 | p->start_pfn = start_pfn; |
| 125 | p->end_pfn = end_pfn; |
| 126 | p->pxm = pxm; |
| 127 | |
| 128 | num_memory_chunks++; |
| 129 | |
| 130 | printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", |
| 131 | start_pfn, end_pfn, |
| 132 | memory_affinity->memory_type, |
| 133 | memory_affinity->proximity_domain, |
| 134 | (memory_affinity->flags.hot_pluggable ? |
| 135 | "enabled and removable" : "enabled" ) ); |
| 136 | } |
| 137 | |
Andi Kleen | fed6441 | 2005-11-05 17:25:53 +0100 | [diff] [blame] | 138 | #if MAX_NR_ZONES != 4 |
| 139 | #error "MAX_NR_ZONES != 4, chunk_to_zone requires review" |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 140 | #endif |
| 141 | /* Take a chunk of pages from page frame cstart to cend and count the number |
| 142 | * of pages in each zone, returned via zones[]. |
| 143 | */ |
| 144 | static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, |
| 145 | unsigned long *zones) |
| 146 | { |
| 147 | unsigned long max_dma; |
| 148 | extern unsigned long max_low_pfn; |
| 149 | |
| 150 | int z; |
| 151 | unsigned long rend; |
| 152 | |
| 153 | /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide |
| 154 | * similarly scoped information and should be handled in a consistant |
| 155 | * manner. |
| 156 | */ |
| 157 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; |
| 158 | |
| 159 | /* Split the hole into the zones in which it falls. Repeatedly |
| 160 | * take the segment in which the remaining hole starts, round it |
| 161 | * to the end of that zone. |
| 162 | */ |
| 163 | memset(zones, 0, MAX_NR_ZONES * sizeof(long)); |
| 164 | while (cstart < cend) { |
| 165 | if (cstart < max_dma) { |
| 166 | z = ZONE_DMA; |
| 167 | rend = (cend < max_dma)? cend : max_dma; |
| 168 | |
| 169 | } else if (cstart < max_low_pfn) { |
| 170 | z = ZONE_NORMAL; |
| 171 | rend = (cend < max_low_pfn)? cend : max_low_pfn; |
| 172 | |
| 173 | } else { |
| 174 | z = ZONE_HIGHMEM; |
| 175 | rend = cend; |
| 176 | } |
| 177 | zones[z] += rend - cstart; |
| 178 | cstart = rend; |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | /* |
| 183 | * The SRAT table always lists ascending addresses, so can always |
| 184 | * assume that the first "start" address that you see is the real |
| 185 | * start of the node, and that the current "end" address is after |
| 186 | * the previous one. |
| 187 | */ |
| 188 | static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) |
| 189 | { |
| 190 | /* |
| 191 | * Only add present memory as told by the e820. |
| 192 | * There is no guarantee from the SRAT that the memory it |
| 193 | * enumerates is present at boot time because it represents |
| 194 | * *possible* memory hotplug areas the same as normal RAM. |
| 195 | */ |
| 196 | if (memory_chunk->start_pfn >= max_pfn) { |
| 197 | printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", |
| 198 | memory_chunk->start_pfn, memory_chunk->end_pfn); |
| 199 | return; |
| 200 | } |
| 201 | if (memory_chunk->nid != nid) |
| 202 | return; |
| 203 | |
| 204 | if (!node_has_online_mem(nid)) |
| 205 | node_start_pfn[nid] = memory_chunk->start_pfn; |
| 206 | |
| 207 | if (node_start_pfn[nid] > memory_chunk->start_pfn) |
| 208 | node_start_pfn[nid] = memory_chunk->start_pfn; |
| 209 | |
| 210 | if (node_end_pfn[nid] < memory_chunk->end_pfn) |
| 211 | node_end_pfn[nid] = memory_chunk->end_pfn; |
| 212 | } |
| 213 | |
| 214 | /* Parse the ACPI Static Resource Affinity Table */ |
| 215 | static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) |
| 216 | { |
| 217 | u8 *start, *end, *p; |
| 218 | int i, j, nid; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 219 | |
| 220 | start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ |
| 221 | p = start; |
| 222 | end = (u8 *)sratp + sratp->header.length; |
| 223 | |
| 224 | memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ |
| 225 | memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); |
| 226 | memset(zholes_size, 0, sizeof(zholes_size)); |
| 227 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 228 | num_memory_chunks = 0; |
| 229 | while (p < end) { |
| 230 | switch (*p) { |
| 231 | case ACPI_SRAT_PROCESSOR_AFFINITY: |
| 232 | parse_cpu_affinity_structure(p); |
| 233 | break; |
| 234 | case ACPI_SRAT_MEMORY_AFFINITY: |
| 235 | parse_memory_affinity_structure(p); |
| 236 | break; |
| 237 | default: |
| 238 | printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); |
| 239 | break; |
| 240 | } |
| 241 | p += p[1]; |
| 242 | if (p[1] == 0) { |
| 243 | printk("acpi20_parse_srat: Entry length value is zero;" |
| 244 | " can't parse any further!\n"); |
| 245 | break; |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | if (num_memory_chunks == 0) { |
| 250 | printk("could not finy any ACPI SRAT memory areas.\n"); |
| 251 | goto out_fail; |
| 252 | } |
| 253 | |
| 254 | /* Calculate total number of nodes in system from PXM bitmap and create |
| 255 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem |
| 256 | * to specify the range of _PXM values.) |
| 257 | */ |
| 258 | /* |
| 259 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain |
| 260 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically |
| 261 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES |
| 262 | * approaches MAX_PXM_DOMAINS for i386. |
| 263 | */ |
| 264 | nodes_clear(node_online_map); |
| 265 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { |
| 266 | if (BMAP_TEST(pxm_bitmap, i)) { |
Yasunori Goto | 762834e | 2006-06-23 02:03:19 -0700 | [diff] [blame] | 267 | int nid = acpi_map_pxm_to_node(i); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 268 | node_set_online(nid); |
| 269 | } |
| 270 | } |
| 271 | BUG_ON(num_online_nodes() == 0); |
| 272 | |
| 273 | /* set cnode id in memory chunk structure */ |
| 274 | for (i = 0; i < num_memory_chunks; i++) |
Yasunori Goto | 762834e | 2006-06-23 02:03:19 -0700 | [diff] [blame] | 275 | node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 276 | |
| 277 | printk("pxm bitmap: "); |
| 278 | for (i = 0; i < sizeof(pxm_bitmap); i++) { |
| 279 | printk("%02X ", pxm_bitmap[i]); |
| 280 | } |
| 281 | printk("\n"); |
| 282 | printk("Number of logical nodes in system = %d\n", num_online_nodes()); |
| 283 | printk("Number of memory chunks in system = %d\n", num_memory_chunks); |
| 284 | |
| 285 | for (j = 0; j < num_memory_chunks; j++){ |
| 286 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; |
| 287 | printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", |
| 288 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); |
| 289 | node_read_chunk(chunk->nid, chunk); |
| 290 | } |
| 291 | |
| 292 | for_each_online_node(nid) { |
| 293 | unsigned long start = node_start_pfn[nid]; |
| 294 | unsigned long end = node_end_pfn[nid]; |
| 295 | |
| 296 | memory_present(nid, start, end); |
| 297 | node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); |
| 298 | } |
| 299 | return 1; |
| 300 | out_fail: |
| 301 | return 0; |
| 302 | } |
| 303 | |
| 304 | int __init get_memcfg_from_srat(void) |
| 305 | { |
| 306 | struct acpi_table_header *header = NULL; |
| 307 | struct acpi_table_rsdp *rsdp = NULL; |
| 308 | struct acpi_table_rsdt *rsdt = NULL; |
| 309 | struct acpi_pointer *rsdp_address = NULL; |
| 310 | struct acpi_table_rsdt saved_rsdt; |
| 311 | int tables = 0; |
| 312 | int i = 0; |
| 313 | |
Magnus Damm | 5d35704 | 2005-10-30 14:59:48 -0800 | [diff] [blame] | 314 | if (ACPI_FAILURE(acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, |
| 315 | rsdp_address))) { |
| 316 | printk("%s: System description tables not found\n", |
| 317 | __FUNCTION__); |
| 318 | goto out_err; |
| 319 | } |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 320 | |
| 321 | if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) { |
| 322 | printk("%s: assigning address to rsdp\n", __FUNCTION__); |
| 323 | rsdp = (struct acpi_table_rsdp *) |
| 324 | (u32)rsdp_address->pointer.physical; |
| 325 | } else { |
| 326 | printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__); |
| 327 | goto out_err; |
| 328 | } |
| 329 | if (!rsdp) { |
| 330 | printk("%s: Didn't find ACPI root!\n", __FUNCTION__); |
| 331 | goto out_err; |
| 332 | } |
| 333 | |
| 334 | printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, |
| 335 | rsdp->oem_id); |
| 336 | |
| 337 | if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) { |
| 338 | printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__); |
| 339 | goto out_err; |
| 340 | } |
| 341 | |
| 342 | rsdt = (struct acpi_table_rsdt *) |
| 343 | boot_ioremap(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt)); |
| 344 | |
| 345 | if (!rsdt) { |
| 346 | printk(KERN_WARNING |
| 347 | "%s: ACPI: Invalid root system description tables (RSDT)\n", |
| 348 | __FUNCTION__); |
| 349 | goto out_err; |
| 350 | } |
| 351 | |
| 352 | header = & rsdt->header; |
| 353 | |
| 354 | if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) { |
| 355 | printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); |
| 356 | goto out_err; |
| 357 | } |
| 358 | |
| 359 | /* |
| 360 | * The number of tables is computed by taking the |
| 361 | * size of all entries (header size minus total |
| 362 | * size of RSDT) divided by the size of each entry |
| 363 | * (4-byte table pointers). |
| 364 | */ |
| 365 | tables = (header->length - sizeof(struct acpi_table_header)) / 4; |
| 366 | |
| 367 | if (!tables) |
| 368 | goto out_err; |
| 369 | |
| 370 | memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); |
| 371 | |
| 372 | if (saved_rsdt.header.length > sizeof(saved_rsdt)) { |
| 373 | printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", |
| 374 | saved_rsdt.header.length); |
| 375 | goto out_err; |
| 376 | } |
| 377 | |
| 378 | printk("Begin SRAT table scan....\n"); |
| 379 | |
| 380 | for (i = 0; i < tables; i++) { |
| 381 | /* Map in header, then map in full table length. */ |
| 382 | header = (struct acpi_table_header *) |
| 383 | boot_ioremap(saved_rsdt.entry[i], sizeof(struct acpi_table_header)); |
| 384 | if (!header) |
| 385 | break; |
| 386 | header = (struct acpi_table_header *) |
| 387 | boot_ioremap(saved_rsdt.entry[i], header->length); |
| 388 | if (!header) |
| 389 | break; |
| 390 | |
| 391 | if (strncmp((char *) &header->signature, "SRAT", 4)) |
| 392 | continue; |
| 393 | |
| 394 | /* we've found the srat table. don't need to look at any more tables */ |
| 395 | return acpi20_parse_srat((struct acpi_table_srat *)header); |
| 396 | } |
| 397 | out_err: |
| 398 | printk("failed to get NUMA memory information from SRAT table\n"); |
| 399 | return 0; |
| 400 | } |
| 401 | |
| 402 | /* For each node run the memory list to determine whether there are |
| 403 | * any memory holes. For each hole determine which ZONE they fall |
| 404 | * into. |
| 405 | * |
| 406 | * NOTE#1: this requires knowledge of the zone boundries and so |
| 407 | * _cannot_ be performed before those are calculated in setup_memory. |
| 408 | * |
| 409 | * NOTE#2: we rely on the fact that the memory chunks are ordered by |
| 410 | * start pfn number during setup. |
| 411 | */ |
| 412 | static void __init get_zholes_init(void) |
| 413 | { |
| 414 | int nid; |
| 415 | int c; |
| 416 | int first; |
| 417 | unsigned long end = 0; |
| 418 | |
| 419 | for_each_online_node(nid) { |
| 420 | first = 1; |
| 421 | for (c = 0; c < num_memory_chunks; c++){ |
| 422 | if (node_memory_chunk[c].nid == nid) { |
| 423 | if (first) { |
| 424 | end = node_memory_chunk[c].end_pfn; |
| 425 | first = 0; |
| 426 | |
| 427 | } else { |
| 428 | /* Record any gap between this chunk |
| 429 | * and the previous chunk on this node |
| 430 | * against the zones it spans. |
| 431 | */ |
| 432 | chunk_to_zones(end, |
| 433 | node_memory_chunk[c].start_pfn, |
| 434 | &zholes_size[nid * MAX_NR_ZONES]); |
| 435 | } |
| 436 | } |
| 437 | } |
| 438 | } |
| 439 | } |
| 440 | |
| 441 | unsigned long * __init get_zholes_size(int nid) |
| 442 | { |
| 443 | if (!zholes_size_init) { |
| 444 | zholes_size_init++; |
| 445 | get_zholes_init(); |
| 446 | } |
| 447 | if (nid >= MAX_NUMNODES || !node_online(nid)) |
| 448 | printk("%s: nid = %d is invalid/offline. num_online_nodes = %d", |
| 449 | __FUNCTION__, nid, num_online_nodes()); |
| 450 | return &zholes_size[nid * MAX_NR_ZONES]; |
| 451 | } |