[SPARC64]: Eliminate NR_CPUS limitations.

Cheetah systems can have cpuids as large as 1023, although physical
systems don't have that many cpus.

Only three limitations existed in the kernel preventing arbitrary
NR_CPUS values:

1) dcache dirty cpu state stored in page->flags on
   D-cache aliasing platforms.  With some build time
   calculations and some build-time BUG checks on
   page->flags layout, this one was easily solved.

2) The cheetah XCALL delivery code could only handle
   a cpumask with up to 32 cpus set.  Some simple looping
   logic clears that up too.

3) thread_info->cpu was a u8, easily changed to a u16.

There are a few spots in the kernel that still put NR_CPUS
sized arrays on the kernel stack, but that's not a sparc64
specific problem.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h
index 2ebf7f2..98252cd 100644
--- a/include/asm-sparc64/thread_info.h
+++ b/include/asm-sparc64/thread_info.h
@@ -38,8 +38,8 @@
 	/* D$ line 1 */
 	struct task_struct	*task;
 	unsigned long		flags;
-	__u8			cpu;
 	__u8			fpsaved[7];
+	__u8			pad;
 	unsigned long		ksp;
 
 	/* D$ line 2 */
@@ -49,7 +49,7 @@
 	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	__u8			new_child;
 	__u8			syscall_noerror;
-	__u16			__pad;
+	__u16			cpu;
 
 	unsigned long		*utraps;
 
@@ -83,8 +83,7 @@
 #define TI_CURRENT_DS	(TI_FLAGS + TI_FLAG_BYTE_CURRENT_DS)
 #define TI_FPDEPTH	(TI_FLAGS + TI_FLAG_BYTE_FPDEPTH)
 #define TI_WSAVED	(TI_FLAGS + TI_FLAG_BYTE_WSAVED)
-#define TI_CPU		0x00000010
-#define TI_FPSAVED	0x00000011
+#define TI_FPSAVED	0x00000010
 #define TI_KSP		0x00000018
 #define TI_FAULT_ADDR	0x00000020
 #define TI_KREGS	0x00000028
@@ -92,6 +91,7 @@
 #define TI_PRE_COUNT	0x00000038
 #define TI_NEW_CHILD	0x0000003c
 #define TI_SYS_NOERROR	0x0000003d
+#define TI_CPU		0x0000003e
 #define TI_UTRAPS	0x00000040
 #define TI_REG_WINDOW	0x00000048
 #define TI_RWIN_SPTRS	0x000003c8