[PATCH] x86_64: Speed up numa_node_id by putting it directly into the PDA

Not go from the CPU number to an mapping array.
Mode number is often used now in fast paths.

This also adds a generic numa_node_id to all the topology includes

Suggested by Eric Dumazet

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index f27731a..99cfa75 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -823,7 +823,7 @@
  		if (!node_online(node))
  			node = nearby_node(apicid);
  	}
-  	cpu_to_node[cpu] = node;
+	numa_set_node(cpu, node);
 
   	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
   			cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
@@ -975,7 +975,7 @@
 	node = apicid_to_node[hard_smp_processor_id()];
 	if (node == NUMA_NO_NODE)
 		node = 0;
-	cpu_to_node[cpu] = node;
+	numa_set_node(cpu, node);
 
 	if (acpi_numa > 0)
 		printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 18e86e2..4bf6458 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -156,7 +156,7 @@
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
-		cpu_to_node[i] = rr;
+ 		numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
@@ -242,7 +242,7 @@
 	nodes_clear(node_online_map);
 	node_set_online(0);
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_to_node[i] = 0;
+		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
@@ -252,6 +252,12 @@
 	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 } 
 
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	cpu_pda[cpu].nodenumber = node;
+	cpu_to_node[cpu] = node;
+}
+
 unsigned long __init numa_free_all_bootmem(void) 
 { 
 	int i;
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 4b2e844..c7aa08a 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -203,7 +203,7 @@
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
 		if (!node_isset(cpu_to_node[i], nodes_parsed))
-			cpu_to_node[i] = NUMA_NO_NODE; 
+			numa_set_node(i, NUMA_NO_NODE);
 	}
 	numa_init_array();
 	return 0;
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index bcf55c3..d51e56f 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -17,6 +17,8 @@
 extern void numa_init_array(void);
 extern int numa_off;
 
+extern void numa_set_node(int cpu, int node);
+
 extern unsigned char apicid_to_node[256];
 
 #define NUMA_NO_NODE 0xff
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index bbf89aa..8733ccf 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -15,6 +15,7 @@
         int irqcount;		    /* Irq nesting counter. Starts with -1 */  	
 	int cpunumber;		    /* Logical CPU number */
 	char *irqstackptr;	/* top of irqstack */
+	int nodenumber;		    /* number of current node */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
 	struct mm_struct *active_mm;
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 1c603cd..d39ebd5 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -28,6 +28,8 @@
 #define pcibus_to_node(bus)		((long)(bus->sysdata))	
 #define pcibus_to_cpumask(bus)		node_to_cpumask(pcibus_to_node(bus));
 
+#define numa_node_id()			read_pda(nodenumber)
+
 /* sched_domains SD_NODE_INIT for x86_64 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 57fc99c..f3cffc3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -435,7 +435,9 @@
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
+#ifndef numa_node_id
 #define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
+#endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES