x86: merge 64 and 32 SMP percpu handling

Now that pda is allocated as part of percpu, percpu doesn't need to be
accessed through pda.  Unify x86_64 SMP percpu access with x86_32 SMP
one.  Other than the segment register, operand size and the base of
percpu symbols, they behave identical now.

This patch replaces now unnecessary pda->data_offset with a dummy
field which is necessary to keep stack_canary at its place.  This
patch also moves per_cpu_offset initialization out of init_gdt() into
setup_per_cpu_areas().  Note that this change also necessitates
explicit per_cpu_offset initializations in voyager_smp.c.

With this change, x86_OP_percpu()'s are as efficient on x86_64 as on
x86_32 and also x86_64 can use assembly PER_CPU macros.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 63d4628..be1ff34 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -125,14 +125,14 @@
 #endif
 }
 
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
+#ifdef CONFIG_X86_64
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+	[0] = (unsigned long)__per_cpu_load,
+};
+#else
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
 #endif
+EXPORT_SYMBOL(__per_cpu_offset);
 
 /*
  * Great future plan:
@@ -178,6 +178,7 @@
 #endif
 
 		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
+		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 #ifdef CONFIG_X86_64
 		cpu_pda(cpu) = (void *)ptr;
 
@@ -190,7 +191,7 @@
 		else
 			memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
 #endif
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 
 		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}