[SPARC64]: Fix TLB context allocation with SMT-style shared TLBs.

The context allocation scheme we use depends upon there being a
one-to-one mapping from CPU to physical TLB for correctness.  Chips
like Niagara, where multiple hardware strands share a single TLB,
break this assumption.

So what we do now is notify all CPUs with a cross call whenever the
context version number changes; if necessary, this makes each CPU
allocate a valid context for the address space it is running in at
the time.
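For reference, the version check that drives the reallocation is
conceptually the following (a minimal sketch; the bit width and names
below are illustrative, the real test is the CTX_VALID() macro in the
sparc64 mmu_context.h header):

	/* Sketch, not the exact kernel code: a context value packs a
	 * generation ("version") in its high bits and a context number
	 * in its low bits.  A context is only usable while its version
	 * bits agree with those of the global tlb_context_cache.
	 */
	#define CTX_NR_BITS		13	/* illustrative width */
	#define CTX_VERSION_MASK	(~0UL << CTX_NR_BITS)

	static inline int ctx_valid(unsigned long ctx_val,
				    unsigned long tlb_context_cache)
	{
		/* Valid iff the version bits match. */
		return !((ctx_val ^ tlb_context_cache) & CTX_VERSION_MASK);
	}

When the version number bumps, every context allocated under the old
version fails this test, and the cross call gives each strand the
chance to notice and run get_new_mmu_context() for its active_mm.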

Stress tested with make -j1024, make -j2048, and make -j4096 kernel
builds on a 32-strand, 8-core T2000 with 16GB of RAM.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 0cd9b16..1ce9408 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -885,26 +885,44 @@
 	put_cpu();
 }
 
+static void __smp_receive_signal_mask(cpumask_t mask)
+{
+	smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask);
+}
+
 void smp_receive_signal(int cpu)
 {
 	cpumask_t mask = cpumask_of_cpu(cpu);
 
-	if (cpu_online(cpu)) {
-		u64 data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
-
-		if (tlb_type == spitfire)
-			spitfire_xcall_deliver(data0, 0, 0, mask);
-		else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-			cheetah_xcall_deliver(data0, 0, 0, mask);
-		else if (tlb_type == hypervisor)
-			hypervisor_xcall_deliver(data0, 0, 0, mask);
-	}
+	if (cpu_online(cpu))
+		__smp_receive_signal_mask(mask);
 }
 
 void smp_receive_signal_client(int irq, struct pt_regs *regs)
 {
-	/* Just return, rtrap takes care of the rest. */
+	struct mm_struct *mm;
+
 	clear_softint(1 << irq);
+
+	/* See if we need to allocate a new TLB context because
+	 * the version of the one we are using is now out of date.
+	 */
+	mm = current->active_mm;
+	if (likely(mm)) {
+		if (unlikely(!CTX_VALID(mm->context))) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&mm->context.lock, flags);
+			get_new_mmu_context(mm);
+			load_secondary_context(mm);
+			spin_unlock_irqrestore(&mm->context.lock, flags);
+		}
+	}
+}
+
+void smp_new_mmu_context_version(void)
+{
+	__smp_receive_signal_mask(cpu_online_map);
 }
 
 void smp_report_regs(void)
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 16f0db3..ccf083a 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -629,17 +629,20 @@
  * let the user have CTX 0 (nucleus) or we ever use a CTX
  * version of zero (and thus NO_CONTEXT would not be caught
  * by version mis-match tests in mmu_context.h).
+ *
+ * Always invoked with interrupts disabled.
  */
 void get_new_mmu_context(struct mm_struct *mm)
 {
 	unsigned long ctx, new_ctx;
 	unsigned long orig_pgsz_bits;
-	
+	int new_version;
 
 	spin_lock(&ctx_alloc_lock);
 	orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
 	ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
 	new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
+	new_version = 0;
 	if (new_ctx >= (1 << CTX_NR_BITS)) {
 		new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
 		if (new_ctx >= ctx) {
@@ -662,6 +665,7 @@
 				mmu_context_bmap[i + 2] = 0;
 				mmu_context_bmap[i + 3] = 0;
 			}
+			new_version = 1;
 			goto out;
 		}
 	}
@@ -671,6 +675,9 @@
 	tlb_context_cache = new_ctx;
 	mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
 	spin_unlock(&ctx_alloc_lock);
+
+	if (unlikely(new_version))
+		smp_new_mmu_context_version();
 }
 
 void sparc_ultra_dump_itlb(void)