[POWERPC] Fix MMIO ops to provide expected barrier behaviour

This changes the writeX family of functions to have a sync instruction
before the MMIO store rather than after, because the generally expected
behaviour is that the device receiving the MMIO store can be guaranteed
to see the effects of any preceding writes to normal memory.

To preserve ordering between writeX and readX, and to preserve ordering
between preceding stores and the readX, the readX family of functions
has had a sync added before the load.

Although writeX followed by spin_unlock is not officially guaranteed
to keep the writeX inside the spin-locked region unless an mmiowb()
is used, there are currently drivers that depend on the previous
behaviour on powerpc, which was that the mmiowb wasn't actually required.
Therefore we have a per-cpu flag that is set by writeX, cleared by
__raw_spin_lock and mmiowb, and tested by __raw_spin_unlock.  If it is
set, __raw_spin_unlock does a sync and clears it.

This changes both 32-bit and 64-bit readX/writeX.  32-bit already has a
sync in __raw_spin_unlock (since lwsync doesn't exist on 32-bit), and thus
doesn't need the per-cpu flag.

Tested on G5 (PPC970) and POWER5.

Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index fc23040..f770805 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -17,15 +17,6 @@
 
 	.text
 
-#ifdef CONFIG_PPC64
-#define IN_SYNC		twi	0,r5,0; isync
-#define EIEIO_32
-#define SYNC_64		sync
-#else /* CONFIG_PPC32 */
-#define IN_SYNC
-#define EIEIO_32	eieio
-#define SYNC_64
-#endif
 /*
  * Returns (address we are running at) - (address we were linked at)
  * for use before the text and data are mapped to KERNELBASE.
@@ -70,6 +61,7 @@
  * The *_ns versions don't do byte-swapping.
  */
 _GLOBAL(_insb)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,1
@@ -78,7 +70,8 @@
 	eieio
 	stbu	r5,1(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 _GLOBAL(_outsb)
@@ -86,14 +79,15 @@
 	mtctr	r5
 	subi	r4,r4,1
 	blelr-
+	sync
 00:	lbzu	r5,1(r4)
 	stb	r5,0(r3)
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 _GLOBAL(_insw)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,2
@@ -102,7 +96,8 @@
 	eieio
 	sthu	r5,2(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 _GLOBAL(_outsw)
@@ -110,14 +105,15 @@
 	mtctr	r5
 	subi	r4,r4,2
 	blelr-
+	sync
 00:	lhzu	r5,2(r4)
-	EIEIO_32
 	sthbrx	r5,0,r3
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 _GLOBAL(_insl)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,4
@@ -126,7 +122,8 @@
 	eieio
 	stwu	r5,4(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 _GLOBAL(_outsl)
@@ -134,17 +131,18 @@
 	mtctr	r5
 	subi	r4,r4,4
 	blelr-
+	sync
 00:	lwzu	r5,4(r4)
 	stwbrx	r5,0,r3
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 #ifdef CONFIG_PPC32
 _GLOBAL(__ide_mm_insw)
 #endif
 _GLOBAL(_insw_ns)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,2
@@ -153,7 +151,8 @@
 	eieio
 	sthu	r5,2(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 #ifdef CONFIG_PPC32
@@ -164,17 +163,18 @@
 	mtctr	r5
 	subi	r4,r4,2
 	blelr-
+	sync
 00:	lhzu	r5,2(r4)
 	sth	r5,0(r3)
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr
 
 #ifdef CONFIG_PPC32
 _GLOBAL(__ide_mm_insl)
 #endif
 _GLOBAL(_insl_ns)
+	sync
 	cmpwi	0,r5,0
 	mtctr	r5
 	subi	r4,r4,4
@@ -183,7 +183,8 @@
 	eieio
 	stwu	r5,4(r4)
 	bdnz	00b
-	IN_SYNC
+	twi	0,r5,0
+	isync
 	blr
 
 #ifdef CONFIG_PPC32
@@ -194,10 +195,10 @@
 	mtctr	r5
 	subi	r4,r4,4
 	blelr-
+	sync
 00:	lwzu	r5,4(r4)
 	stw	r5,0(r3)
-	EIEIO_32
 	bdnz	00b
-	SYNC_64
+	sync
 	blr