[S390] Inline assembly cleanup.

Major cleanup of all s390 inline assemblies. They now have a common
coding style. Quite a few have been shortened, mainly by using register
asm variables. Use of the EX_TABLE macro helps as well. The atomic ops,
bit ops and locking inlines now use the Q-constraint if a newer gcc
is used. That results in slightly better code.

Thanks to Christian Borntraeger for proof reading the changes.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h
index 1604004..ccbafe4 100644
--- a/include/asm-s390/system.h
+++ b/include/asm-s390/system.h
@@ -23,74 +23,68 @@
 
 extern struct task_struct *__switch_to(void *, void *);
 
-#ifdef __s390x__
-#define __FLAG_SHIFT 56
-#else /* ! __s390x__ */
-#define __FLAG_SHIFT 24
-#endif /* ! __s390x__ */
-
 static inline void save_fp_regs(s390_fp_regs *fpregs)
 {
-	asm volatile (
-		"   std   0,8(%1)\n"
-		"   std   2,24(%1)\n"
-		"   std   4,40(%1)\n"
-		"   std   6,56(%1)"
-		: "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory" );
+	asm volatile(
+		"	std	0,8(%1)\n"
+		"	std	2,24(%1)\n"
+		"	std	4,40(%1)\n"
+		"	std	6,56(%1)"
+		: "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory");
 	if (!MACHINE_HAS_IEEE)
 		return;
 	asm volatile(
-		"   stfpc 0(%1)\n"
-		"   std   1,16(%1)\n"
-		"   std   3,32(%1)\n"
-		"   std   5,48(%1)\n"
-		"   std   7,64(%1)\n"
-		"   std   8,72(%1)\n"
-		"   std   9,80(%1)\n"
-		"   std   10,88(%1)\n"
-		"   std   11,96(%1)\n"
-		"   std   12,104(%1)\n"
-		"   std   13,112(%1)\n"
-		"   std   14,120(%1)\n"
-		"   std   15,128(%1)\n"
-		: "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory" );
+		"	stfpc	0(%1)\n"
+		"	std	1,16(%1)\n"
+		"	std	3,32(%1)\n"
+		"	std	5,48(%1)\n"
+		"	std	7,64(%1)\n"
+		"	std	8,72(%1)\n"
+		"	std	9,80(%1)\n"
+		"	std	10,88(%1)\n"
+		"	std	11,96(%1)\n"
+		"	std	12,104(%1)\n"
+		"	std	13,112(%1)\n"
+		"	std	14,120(%1)\n"
+		"	std	15,128(%1)\n"
+		: "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory");
 }
 
 static inline void restore_fp_regs(s390_fp_regs *fpregs)
 {
-	asm volatile (
-		"   ld    0,8(%0)\n"
-		"   ld    2,24(%0)\n"
-		"   ld    4,40(%0)\n"
-		"   ld    6,56(%0)"
-		: : "a" (fpregs), "m" (*fpregs) );
+	asm volatile(
+		"	ld	0,8(%0)\n"
+		"	ld	2,24(%0)\n"
+		"	ld	4,40(%0)\n"
+		"	ld	6,56(%0)"
+		: : "a" (fpregs), "m" (*fpregs));
 	if (!MACHINE_HAS_IEEE)
 		return;
 	asm volatile(
-		"   lfpc  0(%0)\n"
-		"   ld    1,16(%0)\n"
-		"   ld    3,32(%0)\n"
-		"   ld    5,48(%0)\n"
-		"   ld    7,64(%0)\n"
-		"   ld    8,72(%0)\n"
-		"   ld    9,80(%0)\n"
-		"   ld    10,88(%0)\n"
-		"   ld    11,96(%0)\n"
-		"   ld    12,104(%0)\n"
-		"   ld    13,112(%0)\n"
-		"   ld    14,120(%0)\n"
-		"   ld    15,128(%0)\n"
-		: : "a" (fpregs), "m" (*fpregs) );
+		"	lfpc	0(%0)\n"
+		"	ld	1,16(%0)\n"
+		"	ld	3,32(%0)\n"
+		"	ld	5,48(%0)\n"
+		"	ld	7,64(%0)\n"
+		"	ld	8,72(%0)\n"
+		"	ld	9,80(%0)\n"
+		"	ld	10,88(%0)\n"
+		"	ld	11,96(%0)\n"
+		"	ld	12,104(%0)\n"
+		"	ld	13,112(%0)\n"
+		"	ld	14,120(%0)\n"
+		"	ld	15,128(%0)\n"
+		: : "a" (fpregs), "m" (*fpregs));
 }
 
 static inline void save_access_regs(unsigned int *acrs)
 {
-	asm volatile ("stam 0,15,0(%0)" : : "a" (acrs) : "memory" );
+	asm volatile("stam 0,15,0(%0)" : : "a" (acrs) : "memory");
 }
 
 static inline void restore_access_regs(unsigned int *acrs)
 {
-	asm volatile ("lam 0,15,0(%0)" : : "a" (acrs) );
+	asm volatile("lam 0,15,0(%0)" : : "a" (acrs));
 }
 
 #define switch_to(prev,next,last) do {					     \
@@ -126,7 +120,7 @@
 	account_vtime(prev);						     \
 } while (0)
 
-#define nop() __asm__ __volatile__ ("nop")
+#define nop() asm volatile("nop")
 
 #define xchg(ptr,x)							  \
 ({									  \
@@ -147,15 +141,15 @@
 		shift = (3 ^ (addr & 3)) << 3;
 		addr ^= addr & 3;
 		asm volatile(
-			"    l   %0,0(%4)\n"
-			"0:  lr  0,%0\n"
-			"    nr  0,%3\n"
-			"    or  0,%2\n"
-			"    cs  %0,0,0(%4)\n"
-			"    jl  0b\n"
+			"	l	%0,0(%4)\n"
+			"0:	lr	0,%0\n"
+			"	nr	0,%3\n"
+			"	or	0,%2\n"
+			"	cs	%0,0,0(%4)\n"
+			"	jl	0b\n"
 			: "=&d" (old), "=m" (*(int *) addr)
 			: "d" (x << shift), "d" (~(255 << shift)), "a" (addr),
-			  "m" (*(int *) addr) : "memory", "cc", "0" );
+			  "m" (*(int *) addr) : "memory", "cc", "0");
 		x = old >> shift;
 		break;
 	case 2:
@@ -163,36 +157,36 @@
 		shift = (2 ^ (addr & 2)) << 3;
 		addr ^= addr & 2;
 		asm volatile(
-			"    l   %0,0(%4)\n"
-			"0:  lr  0,%0\n"
-			"    nr  0,%3\n"
-			"    or  0,%2\n"
-			"    cs  %0,0,0(%4)\n"
-			"    jl  0b\n"
+			"	l	%0,0(%4)\n"
+			"0:	lr	0,%0\n"
+			"	nr	0,%3\n"
+			"	or	0,%2\n"
+			"	cs	%0,0,0(%4)\n"
+			"	jl	0b\n"
 			: "=&d" (old), "=m" (*(int *) addr)
 			: "d" (x << shift), "d" (~(65535 << shift)), "a" (addr),
-			  "m" (*(int *) addr) : "memory", "cc", "0" );
+			  "m" (*(int *) addr) : "memory", "cc", "0");
 		x = old >> shift;
 		break;
 	case 4:
-		asm volatile (
-			"    l   %0,0(%3)\n"
-			"0:  cs  %0,%2,0(%3)\n"
-			"    jl  0b\n"
+		asm volatile(
+			"	l	%0,0(%3)\n"
+			"0:	cs	%0,%2,0(%3)\n"
+			"	jl	0b\n"
 			: "=&d" (old), "=m" (*(int *) ptr)
 			: "d" (x), "a" (ptr), "m" (*(int *) ptr)
-			: "memory", "cc" );
+			: "memory", "cc");
 		x = old;
 		break;
 #ifdef __s390x__
 	case 8:
-		asm volatile (
-			"    lg  %0,0(%3)\n"
-			"0:  csg %0,%2,0(%3)\n"
-			"    jl  0b\n"
+		asm volatile(
+			"	lg	%0,0(%3)\n"
+			"0:	csg	%0,%2,0(%3)\n"
+			"	jl	0b\n"
 			: "=&d" (old), "=m" (*(long *) ptr)
 			: "d" (x), "a" (ptr), "m" (*(long *) ptr)
-			: "memory", "cc" );
+			: "memory", "cc");
 		x = old;
 		break;
 #endif /* __s390x__ */
@@ -224,55 +218,55 @@
 		shift = (3 ^ (addr & 3)) << 3;
 		addr ^= addr & 3;
 		asm volatile(
-			"    l   %0,0(%4)\n"
-			"0:  nr  %0,%5\n"
-                        "    lr  %1,%0\n"
-			"    or  %0,%2\n"
-			"    or  %1,%3\n"
-			"    cs  %0,%1,0(%4)\n"
-			"    jnl 1f\n"
-			"    xr  %1,%0\n"
-			"    nr  %1,%5\n"
-			"    jnz 0b\n"
+			"	l	%0,0(%4)\n"
+			"0:	nr	%0,%5\n"
+			"	lr	%1,%0\n"
+			"	or	%0,%2\n"
+			"	or	%1,%3\n"
+			"	cs	%0,%1,0(%4)\n"
+			"	jnl	1f\n"
+			"	xr	%1,%0\n"
+			"	nr	%1,%5\n"
+			"	jnz	0b\n"
 			"1:"
 			: "=&d" (prev), "=&d" (tmp)
 			: "d" (old << shift), "d" (new << shift), "a" (ptr),
 			  "d" (~(255 << shift))
-			: "memory", "cc" );
+			: "memory", "cc");
 		return prev >> shift;
 	case 2:
 		addr = (unsigned long) ptr;
 		shift = (2 ^ (addr & 2)) << 3;
 		addr ^= addr & 2;
 		asm volatile(
-			"    l   %0,0(%4)\n"
-			"0:  nr  %0,%5\n"
-                        "    lr  %1,%0\n"
-			"    or  %0,%2\n"
-			"    or  %1,%3\n"
-			"    cs  %0,%1,0(%4)\n"
-			"    jnl 1f\n"
-			"    xr  %1,%0\n"
-			"    nr  %1,%5\n"
-			"    jnz 0b\n"
+			"	l	%0,0(%4)\n"
+			"0:	nr	%0,%5\n"
+			"	lr	%1,%0\n"
+			"	or	%0,%2\n"
+			"	or	%1,%3\n"
+			"	cs	%0,%1,0(%4)\n"
+			"	jnl	1f\n"
+			"	xr	%1,%0\n"
+			"	nr	%1,%5\n"
+			"	jnz	0b\n"
 			"1:"
 			: "=&d" (prev), "=&d" (tmp)
 			: "d" (old << shift), "d" (new << shift), "a" (ptr),
 			  "d" (~(65535 << shift))
-			: "memory", "cc" );
+			: "memory", "cc");
 		return prev >> shift;
 	case 4:
-		asm volatile (
-			"    cs  %0,%2,0(%3)\n"
+		asm volatile(
+			"	cs	%0,%2,0(%3)\n"
 			: "=&d" (prev) : "0" (old), "d" (new), "a" (ptr)
-			: "memory", "cc" );
+			: "memory", "cc");
 		return prev;
 #ifdef __s390x__
 	case 8:
-		asm volatile (
-			"    csg %0,%2,0(%3)\n"
+		asm volatile(
+			"	csg	%0,%2,0(%3)\n"
 			: "=&d" (prev) : "0" (old), "d" (new), "a" (ptr)
-			: "memory", "cc" );
+			: "memory", "cc");
 		return prev;
 #endif /* __s390x__ */
         }
@@ -289,8 +283,8 @@
  * all memory ops have completed wrt other CPU's ( see 7-15 POP  DJB ).
  */
 
-#define eieio()  __asm__ __volatile__ ( "bcr 15,0" : : : "memory" ) 
-# define SYNC_OTHER_CORES(x)   eieio() 
+#define eieio()	asm volatile("bcr 15,0" : : : "memory")
+#define SYNC_OTHER_CORES(x)   eieio()
 #define mb()    eieio()
 #define rmb()   eieio()
 #define wmb()   eieio()
@@ -307,117 +301,56 @@
 
 #ifdef __s390x__
 
-#define __ctl_load(array, low, high) ({ \
-	typedef struct { char _[sizeof(array)]; } addrtype; \
-	__asm__ __volatile__ ( \
-		"   bras  1,0f\n" \
-                "   lctlg 0,0,0(%0)\n" \
-		"0: ex    %1,0(1)" \
-		: : "a" (&array), "a" (((low)<<4)+(high)), \
-		    "m" (*(addrtype *)(array)) : "1" ); \
+#define __ctl_load(array, low, high) ({				\
+	typedef struct { char _[sizeof(array)]; } addrtype;	\
+	asm volatile(						\
+		"	lctlg	%1,%2,0(%0)\n"			\
+		: : "a" (&array), "i" (low), "i" (high),	\
+		    "m" (*(addrtype *)(array)));		\
 	})
 
-#define __ctl_store(array, low, high) ({ \
-	typedef struct { char _[sizeof(array)]; } addrtype; \
-	__asm__ __volatile__ ( \
-		"   bras  1,0f\n" \
-		"   stctg 0,0,0(%1)\n" \
-		"0: ex    %2,0(1)" \
-		: "=m" (*(addrtype *)(array)) \
-		: "a" (&array), "a" (((low)<<4)+(high)) : "1" ); \
+#define __ctl_store(array, low, high) ({			\
+	typedef struct { char _[sizeof(array)]; } addrtype;	\
+	asm volatile(						\
+		"	stctg	%2,%3,0(%1)\n"			\
+		: "=m" (*(addrtype *)(array))			\
+		: "a" (&array), "i" (low), "i" (high));		\
 	})
 
-#define __ctl_set_bit(cr, bit) ({ \
-        __u8 __dummy[24]; \
-        __asm__ __volatile__ ( \
-                "    bras  1,0f\n"       /* skip indirect insns */ \
-                "    stctg 0,0,0(%1)\n" \
-                "    lctlg 0,0,0(%1)\n" \
-                "0:  ex    %2,0(1)\n"    /* execute stctl */ \
-                "    lg    0,0(%1)\n" \
-                "    ogr   0,%3\n"       /* set the bit */ \
-                "    stg   0,0(%1)\n" \
-                "1:  ex    %2,6(1)"      /* execute lctl */ \
-                : "=m" (__dummy) \
-		: "a" ((((unsigned long) &__dummy) + 7) & ~7UL), \
-		  "a" (cr*17), "a" (1L<<(bit)) \
-                : "cc", "0", "1" ); \
-        })
-
-#define __ctl_clear_bit(cr, bit) ({ \
-        __u8 __dummy[16]; \
-        __asm__ __volatile__ ( \
-                "    bras  1,0f\n"       /* skip indirect insns */ \
-                "    stctg 0,0,0(%1)\n" \
-                "    lctlg 0,0,0(%1)\n" \
-                "0:  ex    %2,0(1)\n"    /* execute stctl */ \
-                "    lg    0,0(%1)\n" \
-                "    ngr   0,%3\n"       /* set the bit */ \
-                "    stg   0,0(%1)\n" \
-                "1:  ex    %2,6(1)"      /* execute lctl */ \
-                : "=m" (__dummy) \
-		: "a" ((((unsigned long) &__dummy) + 7) & ~7UL), \
-		  "a" (cr*17), "a" (~(1L<<(bit))) \
-                : "cc", "0", "1" ); \
-        })
-
 #else /* __s390x__ */
 
-#define __ctl_load(array, low, high) ({ \
-	typedef struct { char _[sizeof(array)]; } addrtype; \
-	__asm__ __volatile__ ( \
-		"   bras  1,0f\n" \
-                "   lctl 0,0,0(%0)\n" \
-		"0: ex    %1,0(1)" \
-		: : "a" (&array), "a" (((low)<<4)+(high)), \
-		    "m" (*(addrtype *)(array)) : "1" ); \
+#define __ctl_load(array, low, high) ({				\
+	typedef struct { char _[sizeof(array)]; } addrtype;	\
+	asm volatile(						\
+		"	lctl	%1,%2,0(%0)\n"			\
+		: : "a" (&array), "i" (low), "i" (high),	\
+		    "m" (*(addrtype *)(array)));		\
+})
+
+#define __ctl_store(array, low, high) ({			\
+	typedef struct { char _[sizeof(array)]; } addrtype;	\
+	asm volatile(						\
+		"	stctl	%2,%3,0(%1)\n"			\
+		: "=m" (*(addrtype *)(array))			\
+		: "a" (&array), "i" (low), "i" (high));		\
 	})
 
-#define __ctl_store(array, low, high) ({ \
-	typedef struct { char _[sizeof(array)]; } addrtype; \
-	__asm__ __volatile__ ( \
-		"   bras  1,0f\n" \
-		"   stctl 0,0,0(%1)\n" \
-		"0: ex    %2,0(1)" \
-		: "=m" (*(addrtype *)(array)) \
-		: "a" (&array), "a" (((low)<<4)+(high)): "1" ); \
-	})
-
-#define __ctl_set_bit(cr, bit) ({ \
-        __u8 __dummy[16]; \
-        __asm__ __volatile__ ( \
-                "    bras  1,0f\n"       /* skip indirect insns */ \
-                "    stctl 0,0,0(%1)\n" \
-                "    lctl  0,0,0(%1)\n" \
-                "0:  ex    %2,0(1)\n"    /* execute stctl */ \
-                "    l     0,0(%1)\n" \
-                "    or    0,%3\n"       /* set the bit */ \
-                "    st    0,0(%1)\n" \
-                "1:  ex    %2,4(1)"      /* execute lctl */ \
-                : "=m" (__dummy) \
-		: "a" ((((unsigned long) &__dummy) + 7) & ~7UL), \
-		  "a" (cr*17), "a" (1<<(bit)) \
-                : "cc", "0", "1" ); \
-        })
-
-#define __ctl_clear_bit(cr, bit) ({ \
-        __u8 __dummy[16]; \
-        __asm__ __volatile__ ( \
-                "    bras  1,0f\n"       /* skip indirect insns */ \
-                "    stctl 0,0,0(%1)\n" \
-                "    lctl  0,0,0(%1)\n" \
-                "0:  ex    %2,0(1)\n"    /* execute stctl */ \
-                "    l     0,0(%1)\n" \
-                "    nr    0,%3\n"       /* set the bit */ \
-                "    st    0,0(%1)\n" \
-                "1:  ex    %2,4(1)"      /* execute lctl */ \
-                : "=m" (__dummy) \
-		: "a" ((((unsigned long) &__dummy) + 7) & ~7UL), \
-		  "a" (cr*17), "a" (~(1<<(bit))) \
-                : "cc", "0", "1" ); \
-        })
 #endif /* __s390x__ */
 
+#define __ctl_set_bit(cr, bit) ({	\
+	unsigned long __dummy;		\
+	__ctl_store(__dummy, cr, cr);	\
+	__dummy |= 1UL << (bit);	\
+	__ctl_load(__dummy, cr, cr);	\
+})
+
+#define __ctl_clear_bit(cr, bit) ({	\
+	unsigned long __dummy;		\
+	__ctl_store(__dummy, cr, cr);	\
+	__dummy &= ~(1UL << (bit));	\
+	__ctl_load(__dummy, cr, cr);	\
+})
+
 #include <linux/irqflags.h>
 
 /*
@@ -427,8 +360,7 @@
 static inline void
 __set_psw_mask(unsigned long mask)
 {
-	local_save_flags(mask);
-	__load_psw_mask(mask);
+	__load_psw_mask(mask | (__raw_local_irq_stosm(0x00) & ~(-1UL >> 8)));
 }
 
 #define local_mcck_enable()  __set_psw_mask(PSW_KERNEL_BITS)