[S390] Inline assembly cleanup.

Major cleanup of all s390 inline assemblies. They now have a common
coding style. Quite a few have been shortened, mainly by using register
asm variables. Use of the EX_TABLE macro helps as well. The atomic ops,
bit ops and locking inlines now use the Q-constraint if a newer gcc
is used. That results in slightly better code.
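
The register asm variable and EX_TABLE techniques are both visible in
the cpcmd() and diag308() hunks below. The Q-constraint change affects
the atomic and bit-op headers, which are not part of this excerpt; the
pattern is roughly the following sketch of a compare-and-swap loop
(not lifted verbatim from the patch, and the gcc version cut-off shown
is an assumption). With a new enough gcc, "Q" lets the compiler
address the counter directly via base register plus short
displacement, so the extra "a" (ptr) address operand of the old
variant is no longer needed:

	#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2)
	/* sketch: apply op_string (e.g. "ar") to an int counter
	 * member of *ptr using a compare-and-swap retry loop */
	#define __CS_LOOP(ptr, op_val, op_string) ({		\
		int old_val, new_val;				\
		asm volatile(					\
			"	l	%0,%2\n"		\
			"0:	lr	%1,%0\n"		\
			op_string "	%1,%3\n"		\
			"	cs	%0,%1,%2\n"		\
			"	jl	0b"			\
			: "=&d" (old_val), "=&d" (new_val),	\
			  "=Q" ((ptr)->counter)			\
			: "d" (op_val), "Q" ((ptr)->counter)	\
			: "cc", "memory");			\
		new_val;					\
	})
	#else
	/* older gcc: keep the previous variant with "a" (ptr)
	   and "m" ((ptr)->counter) operands */
	#endif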

Thanks to Christian Borntraeger for proofreading the changes.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 91b2884..c46e3d4 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -544,10 +544,7 @@
 		current->ptrace &= ~PT_DTRACE;
 		task_unlock(current);
 		current->thread.fp_regs.fpc=0;
-		__asm__ __volatile__
-		        ("sr  0,0\n\t"
-		         "sfpc 0,0\n\t"
-			 : : :"0");
+		asm volatile("sfpc %0,0" : : "d" (0));
 	}
         putname(filename);
 out:
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index 4ef44e5..1eae74e 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -25,11 +25,8 @@
  */
 int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 {
-	const int mask = 0x40000000L;
-	unsigned long flags;
-	int return_code;
-	int return_len;
-	int cmdlen;
+	unsigned long flags, cmdlen;
+	int return_code, return_len;
 
 	spin_lock_irqsave(&cpcmd_lock, flags);
 	cmdlen = strlen(cmd);
@@ -38,64 +35,44 @@
 	ASCEBC(cpcmd_buf, cmdlen);
 
 	if (response != NULL && rlen > 0) {
+		register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
+		register unsigned long reg3 asm ("3") = (addr_t) response;
+		register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L;
+		register unsigned long reg5 asm ("5") = rlen;
+
 		memset(response, 0, rlen);
+		asm volatile(
 #ifndef CONFIG_64BIT
-		asm volatile (	"lra	2,0(%2)\n"
-				"lr	4,%3\n"
-				"o	4,%6\n"
-				"lra	3,0(%4)\n"
-				"lr	5,%5\n"
-				"diag	2,4,0x8\n"
-				"brc	8, 1f\n"
-				"ar	5, %5\n"
-				"1: \n"
-				"lr	%0,4\n"
-				"lr	%1,5\n"
-				: "=d" (return_code), "=d" (return_len)
-				: "a" (cpcmd_buf), "d" (cmdlen),
-				"a" (response), "d" (rlen), "m" (mask)
-				: "cc", "2", "3", "4", "5" );
+			"	diag	%2,%0,0x8\n"
+			"	brc	8,1f\n"
+			"	ar	%1,%4\n"
 #else /* CONFIG_64BIT */
-                asm volatile (	"lrag	2,0(%2)\n"
-				"lgr	4,%3\n"
-				"o	4,%6\n"
-				"lrag	3,0(%4)\n"
-				"lgr	5,%5\n"
-				"sam31\n"
-				"diag	2,4,0x8\n"
-				"sam64\n"
-				"brc	8, 1f\n"
-				"agr	5, %5\n"
-				"1: \n"
-				"lgr	%0,4\n"
-				"lgr	%1,5\n"
-				: "=d" (return_code), "=d" (return_len)
-				: "a" (cpcmd_buf), "d" (cmdlen),
-				"a" (response), "d" (rlen), "m" (mask)
-				: "cc", "2", "3", "4", "5" );
+			"	sam31\n"
+			"	diag	%2,%0,0x8\n"
+			"	sam64\n"
+			"	brc	8,1f\n"
+			"	agr	%1,%4\n"
 #endif /* CONFIG_64BIT */
+			"1:\n"
+			: "+d" (reg4), "+d" (reg5)
+			: "d" (reg2), "d" (reg3), "d" (rlen) : "cc");
+		return_code = (int) reg4;
+		return_len = (int) reg5;
                 EBCASC(response, rlen);
         } else {
+		register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
+		register unsigned long reg3 asm ("3") = cmdlen;
 		return_len = 0;
+		asm volatile(
 #ifndef CONFIG_64BIT
-                asm volatile (	"lra	2,0(%1)\n"
-				"lr	3,%2\n"
-				"diag	2,3,0x8\n"
-				"lr	%0,3\n"
-				: "=d" (return_code)
-				: "a" (cpcmd_buf), "d" (cmdlen)
-				: "2", "3"  );
+			"	diag	%1,%0,0x8\n"
 #else /* CONFIG_64BIT */
-                asm volatile (	"lrag	2,0(%1)\n"
-				"lgr	3,%2\n"
-				"sam31\n"
-				"diag	2,3,0x8\n"
-				"sam64\n"
-				"lgr	%0,3\n"
-				: "=d" (return_code)
-				: "a" (cpcmd_buf), "d" (cmdlen)
-				: "2", "3" );
+			"	sam31\n"
+			"	diag	%1,%0,0x8\n"
+			"	sam64\n"
 #endif /* CONFIG_64BIT */
+			: "+d" (reg3) : "d" (reg2) : "cc");
+		return_code = (int) reg3;
         }
 	spin_unlock_irqrestore(&cpcmd_lock, flags);
 	if (response_code != NULL)
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 6555cc4..1f5e782 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -120,24 +120,15 @@
 
 static int diag308(unsigned long subcode, void *addr)
 {
-	register unsigned long _addr asm("0") = (unsigned long)addr;
+	register unsigned long _addr asm("0") = (unsigned long) addr;
 	register unsigned long _rc asm("1") = 0;
 
-	asm volatile (
-		"   diag %0,%2,0x308\n"
-		"0: \n"
-		".section __ex_table,\"a\"\n"
-#ifdef CONFIG_64BIT
-		"   .align 8\n"
-		"   .quad 0b, 0b\n"
-#else
-		"   .align 4\n"
-		"   .long 0b, 0b\n"
-#endif
-		".previous\n"
+	asm volatile(
+		"	diag	%0,%2,0x308\n"
+		"0:\n"
+		EX_TABLE(0b,0b)
 		: "+d" (_addr), "+d" (_rc)
-		: "d" (subcode) : "cc", "memory" );
-
+		: "d" (subcode) : "cc", "memory");
 	return _rc;
 }
 
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index d3cbfa3..6603fbb 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -45,7 +45,7 @@
 #include <asm/irq.h>
 #include <asm/timer.h>
 
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
 
 /*
  * Return saved PC of a blocked thread. used in kernel/sched.
@@ -177,7 +177,8 @@
 
 extern void kernel_thread_starter(void);
 
-__asm__(".align 4\n"
+asm(
+	".align 4\n"
 	"kernel_thread_starter:\n"
 	"    la    2,0(10)\n"
 	"    basr  14,9\n"
diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c
index 8dfb690..191303f 100644
--- a/arch/s390/kernel/semaphore.c
+++ b/arch/s390/kernel/semaphore.c
@@ -26,17 +26,17 @@
 {
 	int old_val, new_val;
 
-        __asm__ __volatile__("   l     %0,0(%3)\n"
-                             "0: ltr   %1,%0\n"
-			     "   jhe   1f\n"
-			     "   lhi   %1,0\n"
-			     "1: ar    %1,%4\n"
-                             "   cs    %0,%1,0(%3)\n"
-                             "   jl    0b\n"
-                             : "=&d" (old_val), "=&d" (new_val),
-			       "=m" (sem->count)
-			     : "a" (&sem->count), "d" (incr), "m" (sem->count)
-			     : "cc" );
+	asm volatile(
+		"	l	%0,0(%3)\n"
+		"0:	ltr	%1,%0\n"
+		"	jhe	1f\n"
+		"	lhi	%1,0\n"
+		"1:	ar	%1,%4\n"
+		"	cs	%0,%1,0(%3)\n"
+		"	jl	0b\n"
+		: "=&d" (old_val), "=&d" (new_val), "=m" (sem->count)
+		: "a" (&sem->count), "d" (incr), "m" (sem->count)
+		: "cc");
 	return old_val;
 }
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index e3d9325..a21cfbb 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -101,7 +101,7 @@
         /*
          * Store processor id in lowcore (used e.g. in timer_interrupt)
          */
-        asm volatile ("stidp %0": "=m" (S390_lowcore.cpu_data.cpu_id));
+	asm volatile("stidp %0": "=m" (S390_lowcore.cpu_data.cpu_id));
         S390_lowcore.cpu_data.cpu_addr = addr;
 
         /*
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b2e6f4c..a8e6199 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -63,7 +63,7 @@
 static void smp_ext_bitcall_others(ec_bit_sig);
 
 /*
-5B * Structure and data for smp_call_function(). This is designed to minimise
+ * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
 static DEFINE_SPINLOCK(call_lock);
@@ -418,59 +418,49 @@
 /*
  * parameter area for the set/clear control bit callbacks
  */
-typedef struct
-{
-	__u16 start_ctl;
-	__u16 end_ctl;
+struct ec_creg_mask_parms {
 	unsigned long orvals[16];
 	unsigned long andvals[16];
-} ec_creg_mask_parms;
+};
 
 /*
  * callback for setting/clearing control bits
  */
 void smp_ctl_bit_callback(void *info) {
-	ec_creg_mask_parms *pp;
+	struct ec_creg_mask_parms *pp = info;
 	unsigned long cregs[16];
 	int i;
 	
-	pp = (ec_creg_mask_parms *) info;
-	__ctl_store(cregs[pp->start_ctl], pp->start_ctl, pp->end_ctl);
-	for (i = pp->start_ctl; i <= pp->end_ctl; i++)
+	__ctl_store(cregs, 0, 15);
+	for (i = 0; i <= 15; i++)
 		cregs[i] = (cregs[i] & pp->andvals[i]) | pp->orvals[i];
-	__ctl_load(cregs[pp->start_ctl], pp->start_ctl, pp->end_ctl);
+	__ctl_load(cregs, 0, 15);
 }
 
 /*
  * Set a bit in a control register of all cpus
  */
-void smp_ctl_set_bit(int cr, int bit) {
-        ec_creg_mask_parms parms;
+void smp_ctl_set_bit(int cr, int bit)
+{
+	struct ec_creg_mask_parms parms;
 
-	parms.start_ctl = cr;
-	parms.end_ctl = cr;
+	memset(&parms.orvals, 0, sizeof(parms.orvals));
+	memset(&parms.andvals, 0xff, sizeof(parms.andvals));
 	parms.orvals[cr] = 1 << bit;
-	parms.andvals[cr] = -1L;
-	preempt_disable();
-	smp_call_function(smp_ctl_bit_callback, &parms, 0, 1);
-        __ctl_set_bit(cr, bit);
-	preempt_enable();
+	on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1);
 }
 
 /*
  * Clear a bit in a control register of all cpus
  */
-void smp_ctl_clear_bit(int cr, int bit) {
-        ec_creg_mask_parms parms;
+void smp_ctl_clear_bit(int cr, int bit)
+{
+	struct ec_creg_mask_parms parms;
 
-	parms.start_ctl = cr;
-	parms.end_ctl = cr;
-	parms.orvals[cr] = 0;
+	memset(&parms.orvals, 0, sizeof(parms.orvals));
+	memset(&parms.andvals, 0xff, sizeof(parms.andvals));
 	parms.andvals[cr] = ~(1L << bit);
-	preempt_disable();
-	smp_call_function(smp_ctl_bit_callback, &parms, 0, 1);
-        __ctl_clear_bit(cr, bit);
-	preempt_enable();
+	on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1);
 }
 
 /*
@@ -650,9 +640,9 @@
 	sf->gprs[9] = (unsigned long) sf;
 	cpu_lowcore->save_area[15] = (unsigned long) sf;
 	__ctl_store(cpu_lowcore->cregs_save_area[0], 0, 15);
-	__asm__ __volatile__("stam  0,15,0(%0)"
-			     : : "a" (&cpu_lowcore->access_regs_save_area)
-			     : "memory");
+	asm volatile(
+		"	stam	0,15,0(%0)"
+		: : "a" (&cpu_lowcore->access_regs_save_area) : "memory");
 	cpu_lowcore->percpu_offset = __per_cpu_offset[cpu];
         cpu_lowcore->current_task = (unsigned long) idle;
         cpu_lowcore->cpu_data.cpu_nr = cpu;
@@ -708,7 +698,7 @@
 __cpu_disable(void)
 {
 	unsigned long flags;
-	ec_creg_mask_parms cr_parms;
+	struct ec_creg_mask_parms cr_parms;
 	int cpu = smp_processor_id();
 
 	spin_lock_irqsave(&smp_reserve_lock, flags);
@@ -724,30 +714,21 @@
 		pfault_fini();
 #endif
 
-	/* disable all external interrupts */
+	memset(&cr_parms.orvals, 0, sizeof(cr_parms.orvals));
+	memset(&cr_parms.andvals, 0xff, sizeof(cr_parms.andvals));
 
-	cr_parms.start_ctl = 0;
-	cr_parms.end_ctl = 0;
+	/* disable all external interrupts */
 	cr_parms.orvals[0] = 0;
 	cr_parms.andvals[0] = ~(1<<15 | 1<<14 | 1<<13 | 1<<12 |
 				1<<11 | 1<<10 | 1<< 6 | 1<< 4);
-	smp_ctl_bit_callback(&cr_parms);
-
 	/* disable all I/O interrupts */
-
-	cr_parms.start_ctl = 6;
-	cr_parms.end_ctl = 6;
 	cr_parms.orvals[6] = 0;
 	cr_parms.andvals[6] = ~(1<<31 | 1<<30 | 1<<29 | 1<<28 |
 				1<<27 | 1<<26 | 1<<25 | 1<<24);
-	smp_ctl_bit_callback(&cr_parms);
-
 	/* disable most machine checks */
-
-	cr_parms.start_ctl = 14;
-	cr_parms.end_ctl = 14;
 	cr_parms.orvals[14] = 0;
 	cr_parms.andvals[14] = ~(1<<28 | 1<<27 | 1<<26 | 1<<25 | 1<<24);
+
 	smp_ctl_bit_callback(&cr_parms);
 
 	spin_unlock_irqrestore(&smp_reserve_lock, flags);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 74e6178..1981c61 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -351,10 +351,12 @@
 	int cc;
 
         /* kick the TOD clock */
-        asm volatile ("STCK 0(%1)\n\t"
-                      "IPM  %0\n\t"
-                      "SRL  %0,28" : "=r" (cc) : "a" (&init_timer_cc) 
-				   : "memory", "cc");
+	asm volatile(
+		"	stck	0(%2)\n"
+		"	ipm	%0\n"
+		"	srl	%0,28"
+		: "=d" (cc), "=m" (init_timer_cc)
+		: "a" (&init_timer_cc) : "cc");
         switch (cc) {
         case 0: /* clock in set state: all is fine */
                 break;
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index c4982c9..3eb4fab 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -597,8 +597,7 @@
 		local_irq_enable();
 
 	if (MACHINE_HAS_IEEE)
-		__asm__ volatile ("stfpc %0\n\t" 
-				  : "=m" (current->thread.fp_regs.fpc));
+		asm volatile("stfpc %0" : "=m" (current->thread.fp_regs.fpc));
 
 #ifdef CONFIG_MATHEMU
         else if (regs->psw.mask & PSW_MASK_PSTATE) {