sparc64: Store per-cpu offset in trap_block[]

Surprisingly this actually makes LOAD_PER_CPU_BASE() a little
more efficient.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index 91bf4c7..f8f2105 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -641,28 +641,6 @@
 	/* Not reached... */
 
 1:
-	/* If we boot on a non-zero cpu, all of the per-cpu
-	 * variable references we make before setting up the
-	 * per-cpu areas will use a bogus offset.  Put a
-	 * compensating factor into __per_cpu_base to handle
-	 * this cleanly.
-	 *
-	 * What the per-cpu code calculates is:
-	 *
-	 *	__per_cpu_base + (cpu << __per_cpu_shift)
-	 *
-	 * These two variables are zero initially, so to
-	 * make it all cancel out to zero we need to put
-	 * "0 - (cpu << 0)" into __per_cpu_base so that the
-	 * above formula evaluates to zero.
-	 *
-	 * We cannot even perform a printk() until this stuff
-	 * is setup as that calls cpu_clock() which uses
-	 * per-cpu variables.
-	 */
-	sub	%g0, %o0, %o1
-	sethi	%hi(__per_cpu_base), %o2
-	stx	%o1, [%o2 + %lo(__per_cpu_base)]
 #else
 	mov	0, %o0
 #endif
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 4226d0e..b20f253 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1371,23 +1371,17 @@
 {
 }
 
-unsigned long __per_cpu_base __read_mostly;
-unsigned long __per_cpu_shift __read_mostly;
-
-EXPORT_SYMBOL(__per_cpu_base);
-EXPORT_SYMBOL(__per_cpu_shift);
-
 void __init real_setup_per_cpu_areas(void)
 {
-	unsigned long paddr, goal, size, i;
+	unsigned long base, shift, paddr, goal, size, i;
 	char *ptr;
 
 	/* Copy section for each CPU (we discard the original) */
 	goal = PERCPU_ENOUGH_ROOM;
 
-	__per_cpu_shift = PAGE_SHIFT;
+	shift = PAGE_SHIFT;
 	for (size = PAGE_SIZE; size < goal; size <<= 1UL)
-		__per_cpu_shift++;
+		shift++;
 
 	paddr = lmb_alloc(size * NR_CPUS, PAGE_SIZE);
 	if (!paddr) {
@@ -1396,10 +1390,12 @@
 	}
 
 	ptr = __va(paddr);
-	__per_cpu_base = ptr - __per_cpu_start;
+	base = ptr - __per_cpu_start;
 
-	for (i = 0; i < NR_CPUS; i++, ptr += size)
+	for (i = 0; i < NR_CPUS; i++, ptr += size) {
+		__per_cpu_offset(i) = base + (i * size);
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+	}
 
 	/* Setup %g5 for the boot cpu.  */
 	__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index d809c4e..d073aab 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2509,6 +2509,7 @@
 }
 
 struct trap_per_cpu trap_block[NR_CPUS];
+EXPORT_SYMBOL(trap_block);
 
 /* This can get invoked before sched_init() so play it super safe
  * and use hard_smp_processor_id().
@@ -2592,7 +2593,9 @@
 	    (TRAP_PER_CPU_RESUM_QMASK !=
 	     offsetof(struct trap_per_cpu, resum_qmask)) ||
 	    (TRAP_PER_CPU_NONRESUM_QMASK !=
-	     offsetof(struct trap_per_cpu, nonresum_qmask)))
+	     offsetof(struct trap_per_cpu, nonresum_qmask)) ||
+	    (TRAP_PER_CPU_PER_CPU_BASE !=
+	     offsetof(struct trap_per_cpu, __per_cpu_base)))
 		trap_per_cpu_offsets_are_bolixed_dave();
 
 	if ((TSB_CONFIG_TSB !=