powerpc: Improve resolution of VDSO clock_gettime

Currently the clock_gettime implementation in the VDSO produces a
result with microsecond resolution for the cases that are handled
without a system call, i.e. CLOCK_REALTIME and CLOCK_MONOTONIC.  The
nanoseconds field of the result is obtained by computing a
microseconds value and multiplying by 1000.

This changes the code in the VDSO to do the computation for
clock_gettime with nanosecond resolution.  That means that the
resolution of the result will ultimately depend on the timebase
frequency.

Because the timestamp in the VDSO datapage (stamp_xsec, the real time
corresponding to the timebase count in tb_orig_stamp) is in units of
2^-20 seconds, it doesn't have sufficient resolution for computing a
result with nanosecond resolution.  Therefore this adds a copy of
xtime to the VDSO datapage and updates it in update_gtod() along with
the other time-related fields.

Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index c6401f9..262cd58 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -75,90 +75,49 @@
 
 	mflr	r12			/* r12 saves lr */
   .cfi_register lr,r12
-	mr	r10,r3			/* r10 saves id */
 	mr	r11,r4			/* r11 saves tp */
 	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
-	beq	cr1,50f			/* if monotonic -> jump there */
-
-	/*
-	 * CLOCK_REALTIME
-	 */
-
-	bl	V_LOCAL_FUNC(__do_get_xsec)	/* get xsec from tb & kernel */
-
-	lis     r7,15			/* r7 = 1000000 = USEC_PER_SEC */
-	ori     r7,r7,16960
-	rldicl  r5,r4,44,20		/* r5 = sec = xsec / XSEC_PER_SEC */
-	rldicr  r6,r5,20,43		/* r6 = sec * XSEC_PER_SEC */
-	std	r5,TSPC64_TV_SEC(r11)	/* store sec in tv */
-	subf	r0,r6,r4		/* r0 = xsec = (xsec - r6) */
-	mulld   r0,r0,r7		/* usec = (xsec * USEC_PER_SEC) /
-					 * XSEC_PER_SEC
-					 */
-	rldicl  r0,r0,44,20
-	mulli	r0,r0,1000		/* nsec = usec * 1000 */
-	std	r0,TSPC64_TV_NSEC(r11)	/* store nsec in tp */
-
-	mtlr	r12
-	crclr	cr0*4+so
-	li	r3,0
-	blr
+50:	bl	V_LOCAL_FUNC(__do_get_tspec)	/* get time from tb & kernel */
+	bne	cr1,80f			/* if not monotonic, all done */
 
 	/*
 	 * CLOCK_MONOTONIC
 	 */
 
-50:	bl	V_LOCAL_FUNC(__do_get_xsec)	/* get xsec from tb & kernel */
-
-	lis     r7,15			/* r7 = 1000000 = USEC_PER_SEC */
-	ori     r7,r7,16960
-	rldicl  r5,r4,44,20		/* r5 = sec = xsec / XSEC_PER_SEC */
-	rldicr  r6,r5,20,43		/* r6 = sec * XSEC_PER_SEC */
-	subf	r0,r6,r4		/* r0 = xsec = (xsec - r6) */
-	mulld   r0,r0,r7		/* usec = (xsec * USEC_PER_SEC) /
-					 * XSEC_PER_SEC
-					 */
-	rldicl  r6,r0,44,20
-	mulli	r6,r6,1000		/* nsec = usec * 1000 */
-
 	/* now we must fixup using wall to monotonic. We need to snapshot
 	 * that value and do the counter trick again. Fortunately, we still
-	 * have the counter value in r8 that was returned by __do_get_xsec.
-	 * At this point, r5,r6 contain our sec/nsec values.
-	 * can be used
+	 * have the counter value in r8 that was returned by __do_get_tspec.
+	 * At this point, r4,r5 contain our sec/nsec values.
 	 */
 
-	lwa	r4,WTOM_CLOCK_SEC(r3)
-	lwa	r7,WTOM_CLOCK_NSEC(r3)
+	lwa	r6,WTOM_CLOCK_SEC(r3)
+	lwa	r9,WTOM_CLOCK_NSEC(r3)
 
-	/* We now have our result in r4,r7. We create a fake dependency
+	/* We now have our result in r6,r9. We create a fake dependency
 	 * on that result and re-check the counter
 	 */
-	or	r9,r4,r7
-	xor	r0,r9,r9
+	or	r0,r6,r9
+	xor	r0,r0,r0
 	add	r3,r3,r0
 	ld	r0,CFG_TB_UPDATE_COUNT(r3)
         cmpld   cr0,r0,r8		/* check if updated */
 	bne-	50b
 
-	/* Calculate and store result. Note that this mimmics the C code,
-	 * which may cause funny results if nsec goes negative... is that
-	 * possible at all ?
+	/* Add wall->monotonic offset and check for overflow or underflow.
 	 */
-	add	r4,r4,r5
-	add	r7,r7,r6
-	lis	r9,NSEC_PER_SEC@h
-	ori	r9,r9,NSEC_PER_SEC@l
-	cmpl	cr0,r7,r9
-	cmpli	cr1,r7,0
+	add	r4,r4,r6
+	add	r5,r5,r9
+	cmpd	cr0,r5,r7
+	cmpdi	cr1,r5,0
 	blt	1f
-	subf	r7,r9,r7
+	subf	r5,r7,r5
 	addi	r4,r4,1
-1:	bge	cr1,1f
+1:	bge	cr1,80f
 	addi	r4,r4,-1
-	add	r7,r7,r9
-1:	std	r4,TSPC64_TV_SEC(r11)
-	std	r7,TSPC64_TV_NSEC(r11)
+	add	r5,r5,r7
+
+80:	std	r4,TSPC64_TV_SEC(r11)
+	std	r5,TSPC64_TV_NSEC(r11)
 
 	mtlr	r12
 	crclr	cr0*4+so
@@ -168,10 +127,6 @@
 	/*
 	 * syscall fallback
 	 */
-98:
-	mtlr	r12
-	mr	r3,r10
-	mr	r4,r11
 99:
 	li	r0,__NR_clock_gettime
 	sc
@@ -253,3 +208,59 @@
 	blr
   .cfi_endproc
 V_FUNCTION_END(__do_get_xsec)
+
+/*
+ * This is the core of clock_gettime(), it returns the current
+ * time in seconds and nanoseconds in r4 and r5.
+ * It expects the datapage ptr in r3 and doesn't clobber it.
+ * It clobbers r0 and r6 and returns NSEC_PER_SEC in r7.
+ * On return, r8 contains the counter value that can be reused.
+ * This clobbers cr0 but not any other cr field.
+ */
+V_FUNCTION_BEGIN(__do_get_tspec)
+  .cfi_startproc
+	/* check for update count & load values */
+1:	ld	r8,CFG_TB_UPDATE_COUNT(r3)
+	andi.	r0,r8,1			/* pending update ? loop */
+	bne-	1b
+	xor	r0,r8,r8		/* create dependency */
+	add	r3,r3,r0
+
+	/* Get TB & offset it. We use the MFTB macro which will generate
+	 * workaround code for Cell.
+	 */
+	MFTB(r7)
+	ld	r9,CFG_TB_ORIG_STAMP(r3)
+	subf	r7,r9,r7
+
+	/* Scale result */
+	ld	r5,CFG_TB_TO_XS(r3)
+	sldi	r7,r7,12		/* compute time since stamp_xtime */
+	mulhdu	r6,r7,r5		/* in units of 2^-32 seconds */
+
+	/* Add stamp since epoch */
+	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
+	ld	r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
+	or	r0,r4,r5
+	or	r0,r0,r6
+	xor	r0,r0,r0
+	add	r3,r3,r0
+	ld	r0,CFG_TB_UPDATE_COUNT(r3)
+	cmpld   r0,r8			/* check if updated */
+	bne-	1b			/* reload if so */
+
+	/* convert to seconds & nanoseconds and add to stamp */
+	lis	r7,NSEC_PER_SEC@h
+	ori	r7,r7,NSEC_PER_SEC@l
+	mulhwu	r0,r6,r7		/* compute nanoseconds and */
+	srdi	r6,r6,32		/* seconds since stamp_xtime */
+	clrldi	r0,r0,32
+	add	r5,r5,r0		/* add nanoseconds together */
+	cmpd	r5,r7			/* overflow? */
+	add	r4,r4,r6
+	bltlr				/* all done if no overflow */
+	subf	r5,r7,r5		/* if overflow, adjust */
+	addi	r4,r4,1
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__do_get_tspec)