powerpc: Improve resolution of VDSO clock_gettime

Currently the clock_gettime implementation in the VDSO produces a
result with microsecond resolution for the cases that are handled
without a system call, i.e. CLOCK_REALTIME and CLOCK_MONOTONIC.  The
nanoseconds field of the result is obtained by computing a
microseconds value and multiplying by 1000.
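
Sketched in C (a model only; old_xsec_to_timespec is an illustrative
name, not an actual VDSO symbol), the old conversion was:

	#include <time.h>

	/* Old fast path, modelled in C: xsec counts 2^-20 second units;
	 * nanoseconds are derived from a microseconds value, so tv_nsec
	 * always comes out a multiple of 1000.
	 */
	static void old_xsec_to_timespec(unsigned long long xsec,
					 struct timespec *tp)
	{
		unsigned long frac = xsec & ((1UL << 20) - 1);

		tp->tv_sec  = xsec >> 20;
		tp->tv_nsec = ((frac * 1000000ULL) >> 20) * 1000;
	}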

This changes the code in the VDSO to do the computation for
clock_gettime with nanosecond resolution.  That means that the
resolution of the result will ultimately depend on the timebase
frequency.
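
For example, with a 512MHz timebase (to pick a round number), the
result can now resolve steps of 1/512000000 s, i.e. roughly 2ns,
where previously tv_nsec was always a multiple of 1000ns.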

Because the timestamp in the VDSO datapage (stamp_xsec, the real time
corresponding to the timebase count in tb_orig_stamp) is in units of
2^-20 seconds, it doesn't have sufficient resolution for computing a
result with nanosecond resolution.  Therefore this adds a copy of
xtime to the VDSO datapage and updates it in update_gtod() along with
the other time-related fields.
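
Schematically (the C side isn't in the hunk below), the kernel half
of this boils down to:

	/* in the vdso datapage */
	struct timespec stamp_xtime;	/* xtime as of tb_orig_stamp */

	/* in update_gtod(), inside the tb_update_count critical section */
	vdso_data->stamp_xtime = xtime;

plus a STAMP_XTIME entry in asm-offsets.c, which is the offset the
assembler below loads through.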

Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index 72ca26d..ee038d4 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -16,6 +16,13 @@
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
 
+/* Offset for the low 32-bit part of a field of long type */
+#ifdef CONFIG_PPC64
+#define LOPART	4
+#else
+#define LOPART	0
+#endif
+
 	.text
 /*
  * Exact prototype of gettimeofday
@@ -90,101 +97,53 @@
 
 	mflr	r12			/* r12 saves lr */
   .cfi_register lr,r12
-	mr	r10,r3			/* r10 saves id */
 	mr	r11,r4			/* r11 saves tp */
 	bl	__get_datapage@local	/* get data page */
 	mr	r9,r3			/* datapage ptr in r9 */
-	beq	cr1,50f			/* if monotonic -> jump there */
 
-	/*
-	 * CLOCK_REALTIME
-	 */
-
-	bl	__do_get_xsec@local	/* get xsec from tb & kernel */
-	bne-	98f			/* out of line -> do syscall */
-
-	/* seconds are xsec >> 20 */
-	rlwinm	r5,r4,12,20,31
-	rlwimi	r5,r3,12,0,19
-	stw	r5,TSPC32_TV_SEC(r11)
-
-	/* get remaining xsec and convert to nsec. we scale
-	 * up remaining xsec by 12 bits and get the top 32 bits
-	 * of the multiplication, then we multiply by 1000
-	 */
-	rlwinm	r5,r4,12,0,19
-	lis	r6,1000000@h
-	ori	r6,r6,1000000@l
-	mulhwu	r5,r5,r6
-	mulli	r5,r5,1000
-	stw	r5,TSPC32_TV_NSEC(r11)
-	mtlr	r12
-	crclr	cr0*4+so
-	li	r3,0
-	blr
+50:	bl	__do_get_tspec@local	/* get sec/nsec from tb & kernel */
+	bne	cr1,80f			/* not monotonic -> all done */
 
 	/*
 	 * CLOCK_MONOTONIC
 	 */
 
-50:	bl	__do_get_xsec@local	/* get xsec from tb & kernel */
-	bne-	98f			/* out of line -> do syscall */
-
-	/* seconds are xsec >> 20 */
-	rlwinm	r6,r4,12,20,31
-	rlwimi	r6,r3,12,0,19
-
-	/* get remaining xsec and convert to nsec. we scale
-	 * up remaining xsec by 12 bits and get the top 32 bits
-	 * of the multiplication, then we multiply by 1000
-	 */
-	rlwinm	r7,r4,12,0,19
-	lis	r5,1000000@h
-	ori	r5,r5,1000000@l
-	mulhwu	r7,r7,r5
-	mulli	r7,r7,1000
-
 	/* now we must fixup using wall to monotonic. We need to snapshot
 	 * that value and do the counter trick again. Fortunately, we still
-	 * have the counter value in r8 that was returned by __do_get_xsec.
-	 * At this point, r6,r7 contain our sec/nsec values, r3,r4 and r5
-	 * can be used
+	 * have the counter value in r8 that was returned by __do_get_tspec.
+	 * At this point, r3,r4 contain our sec/nsec values, r5 and r6
+	 * can be used, r7 contains NSEC_PER_SEC.
 	 */
 
-	lwz	r3,WTOM_CLOCK_SEC(r9)
-	lwz	r4,WTOM_CLOCK_NSEC(r9)
+	lwz	r5,WTOM_CLOCK_SEC(r9)
+	lwz	r6,WTOM_CLOCK_NSEC(r9)
 
-	/* We now have our result in r3,r4. We create a fake dependency
-	 * on that result and re-check the counter
+	/* We now have our offset in r5,r6. We create a fake dependency
+	 * on that value and re-check the counter
 	 */
-	or	r5,r4,r3
-	xor	r0,r5,r5
+	or	r0,r6,r5
+	xor	r0,r0,r0
 	add	r9,r9,r0
-#ifdef CONFIG_PPC64
-	lwz	r0,(CFG_TB_UPDATE_COUNT+4)(r9)
-#else
-	lwz	r0,(CFG_TB_UPDATE_COUNT)(r9)
-#endif
+	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
         cmpl    cr0,r8,r0		/* check if updated */
 	bne-	50b
 
-	/* Calculate and store result. Note that this mimmics the C code,
+	/* Calculate and store result. Note that this mimics the C code,
 	 * which may cause funny results if nsec goes negative... is that
 	 * possible at all ?
 	 */
-	add	r3,r3,r6
-	add	r4,r4,r7
-	lis	r5,NSEC_PER_SEC@h
-	ori	r5,r5,NSEC_PER_SEC@l
-	cmpl	cr0,r4,r5
-	cmpli	cr1,r4,0
+	add	r3,r3,r5
+	add	r4,r4,r6
+	cmpw	cr0,r4,r7
+	cmpwi	cr1,r4,0
 	blt	1f
-	subf	r4,r5,r4
+	subf	r4,r7,r4
 	addi	r3,r3,1
-1:	bge	cr1,1f
+1:	bge	cr1,80f
 	addi	r3,r3,-1
-	add	r4,r4,r5
-1:	stw	r3,TSPC32_TV_SEC(r11)
+	add	r4,r4,r7
+
+80:	stw	r3,TSPC32_TV_SEC(r11)
 	stw	r4,TSPC32_TV_NSEC(r11)
 
 	mtlr	r12
@@ -195,10 +154,6 @@
 	/*
 	 * syscall fallback
 	 */
-98:
-	mtlr	r12
-	mr	r3,r10
-	mr	r4,r11
 99:
 	li	r0,__NR_clock_gettime
 	sc
@@ -254,11 +209,7 @@
 	/* Check for update count & load values. We use the low
 	 * order 32 bits of the update count
 	 */
-#ifdef CONFIG_PPC64
-1:	lwz	r8,(CFG_TB_UPDATE_COUNT+4)(r9)
-#else
-1:	lwz	r8,(CFG_TB_UPDATE_COUNT)(r9)
-#endif
+1:	lwz	r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
 	andi.	r0,r8,1			/* pending update ? loop */
 	bne-	1b
 	xor	r0,r8,r8		/* create dependency */
@@ -305,11 +256,7 @@
 	or	r6,r4,r3
 	xor	r0,r6,r6
 	add	r9,r9,r0
-#ifdef CONFIG_PPC64
-	lwz	r0,(CFG_TB_UPDATE_COUNT+4)(r9)
-#else
-	lwz	r0,(CFG_TB_UPDATE_COUNT)(r9)
-#endif
+	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
         cmpl    cr0,r8,r0		/* check if updated */
 	bne-	1b
 
@@ -322,3 +269,98 @@
 	 */
 3:	blr
   .cfi_endproc
+
+/*
+ * This is the core of clock_gettime(): it returns the current
+ * time in seconds and nanoseconds in r3 and r4.
+ * It expects the datapage ptr in r9 and doesn't clobber it.
+ * It clobbers r0, r5, r6, r10 and returns NSEC_PER_SEC in r7.
+ * On return, r8 contains the counter value that can be reused.
+ * This clobbers cr0 but not any other cr field.
+ */
+__do_get_tspec:
+  .cfi_startproc
+	/* Check for update count & load values. We use the low
+	 * order 32 bits of the update count
+	 */
+1:	lwz	r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
+	andi.	r0,r8,1			/* pending update ? loop */
+	bne-	1b
+	xor	r0,r8,r8		/* create dependency */
+	add	r9,r9,r0
+
+	/* Load orig stamp (offset to TB) */
+	lwz	r5,CFG_TB_ORIG_STAMP(r9)
+	lwz	r6,(CFG_TB_ORIG_STAMP+4)(r9)
+
+	/* Get a stable TB value */
+2:	mftbu	r3
+	mftbl	r4
+	mftbu	r0
+	cmpl	cr0,r3,r0
+	bne-	2b
+
+	/* Subtract tb orig stamp and shift left 12 bits.  The rlwimi.
+	 * also sets cr0, telling us whether the high 32 bits of the
+	 * shifted difference are zero, so the branch to 4: below can
+	 * skip the high-part multiplies.
+	 */
+	subfc	r7,r6,r4
+	subfe	r0,r5,r3
+	slwi	r0,r0,12
+	rlwimi.	r0,r7,12,20,31
+	slwi	r7,r7,12
+
+	/* Load the 64-bit tb_to_xs scale factor and multiply it by the
+	 * shifted TB delta, 32 bits at a time; the top 64 bits of the
+	 * 128-bit product accumulate in r3 (with a carry in CA) and r4.
+	 */
+	lwz	r5,CFG_TB_TO_XS(r9)	/* load values */
+	lwz	r6,(CFG_TB_TO_XS+4)(r9)
+	mulhwu	r3,r7,r6
+	mullw	r10,r7,r5
+	mulhwu	r4,r7,r5
+	addc	r10,r3,r10
+	li	r3,0
+
+	beq+	4f			/* skip high part computation if 0 */
+	mulhwu	r3,r0,r5
+	mullw	r7,r0,r5
+	mulhwu	r5,r0,r6
+	mullw	r6,r0,r6
+	adde	r4,r4,r7
+	addze	r3,r3
+	addc	r4,r4,r5
+	addze	r3,r3
+	addc	r10,r10,r6
+
+4:	addze	r4,r4			/* add in carry */
+	lis	r7,NSEC_PER_SEC@h
+	ori	r7,r7,NSEC_PER_SEC@l
+	mulhwu	r4,r4,r7		/* convert to nanoseconds */
+
+	/* At this point, we have seconds & nanoseconds since the xtime
+	 * stamp in r3+CA and r4.  Load & add the xtime stamp.
+	 */
+#ifdef CONFIG_PPC64
+	lwz	r5,STAMP_XTIME+TSPC64_TV_SEC+LOPART(r9)
+	lwz	r6,STAMP_XTIME+TSPC64_TV_NSEC+LOPART(r9)
+#else
+	lwz	r5,STAMP_XTIME+TSPC32_TV_SEC(r9)
+	lwz	r6,STAMP_XTIME+TSPC32_TV_NSEC(r9)
+#endif
+	add	r4,r4,r6
+	adde	r3,r3,r5
+
+	/* We now have our result in r3,r4. We create a fake dependency
+	 * on that result and re-check the counter
+	 */
+	or	r6,r4,r3
+	xor	r0,r6,r6
+	add	r9,r9,r0
+	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
+	cmpl	cr0,r8,r0		/* check if updated */
+	bne-	1b
+
+	/* check for nanosecond overflow and adjust if necessary */
+	cmpw	r4,r7
+	bltlr				/* all done if no overflow */
+	subf	r4,r7,r4		/* adjust if overflow */
+	addi	r3,r3,1
+
+	blr
+  .cfi_endproc
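
For reference, __do_get_tspec's arithmetic corresponds to the C model
below (a sketch only: GCC's unsigned __int128 and an illustrative
get_tb() stand in for the 32-bit limb multiplies and the mftbu/mftbl
loop above, so it is not buildable for 32-bit targets as written):

	#include <time.h>

	#define NSEC_PER_SEC	1000000000ULL

	struct vdso_data_model {		/* only what the sketch needs */
		unsigned long long tb_update_count; /* odd while updating */
		unsigned long long tb_orig_stamp;   /* TB at last update */
		unsigned long long tb_to_xs;	    /* TB->xsec, 2^-64 scale */
		struct timespec	   stamp_xtime;	    /* xtime at last update */
	};

	extern unsigned long long get_tb(void);	/* illustrative */

	static void do_get_tspec_model(volatile struct vdso_data_model *v,
				       struct timespec *tp)
	{
		unsigned long long count, delta, sec_fp;

		do {
			/* wait out a pending update, snapshot the count */
			while ((count = v->tb_update_count) & 1)
				;
			delta = (get_tb() - v->tb_orig_stamp) << 12;

			/* the high 64 bits of delta * tb_to_xs give the time
			 * since tb_orig_stamp as 32.32 fixed-point seconds */
			sec_fp = (unsigned long long)
				(((unsigned __int128)delta * v->tb_to_xs) >> 64);

			tp->tv_sec  = (sec_fp >> 32) + v->stamp_xtime.tv_sec;
			tp->tv_nsec = (((sec_fp & 0xffffffffULL)
					* NSEC_PER_SEC) >> 32)
				      + v->stamp_xtime.tv_nsec;
		} while (v->tb_update_count != count);

		/* both nsec terms are < 1s, so one carry at most */
		if (tp->tv_nsec >= NSEC_PER_SEC) {
			tp->tv_nsec -= NSEC_PER_SEC;
			tp->tv_sec++;
		}
	}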