[PATCH] x86_64: On Intel CPUs don't do an additional CPU sync before RDTSC

RDTSC serialization using CPUID is not needed on Intel platforms, where
RDTSC is already synchronous. Mark Intel CPUs with X86_FEATURE_SYNC_RDTSC
and replace the open-coded sync_core()/rdtscll() sequences with a
get_cycles_sync() helper that only serializes when the CPU actually needs
it. This increases gettimeofday() performance.
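
get_cycles_sync() itself is defined in include/asm-x86_64/timex.h (not in
the hunks below) and avoids a runtime branch by patching the CPUID out at
boot via the alternatives mechanism. As a rough illustration only, a
branch-based sketch of the equivalent logic, assuming the usual
boot_cpu_has()/sync_core()/rdtscll() helpers:

	static inline unsigned long get_cycles_sync(void)
	{
		unsigned long t;

		/* CPUID is a serializing instruction; skip it on CPUs
		 * that already guarantee an in-order RDTSC. */
		if (!boot_cpu_has(X86_FEATURE_SYNC_RDTSC))
			sync_core();	/* executes CPUID */
		rdtscll(t);		/* read the 64-bit TSC */
		return t;
	}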

Cc: vojtech@suse.cz
Cc: rohit.seth@intel.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 8090a0a..46bf556 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1042,6 +1042,7 @@
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+	set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
  	c->x86_max_cores = intel_num_cpu_cores(c);
 
 	srat_detect_node();
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index f8d4b69..25bc58a 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -71,14 +71,6 @@
 struct timespec __xtime __section_xtime;
 struct timezone __sys_tz __section_sys_tz;
 
-static inline void rdtscll_sync(unsigned long *tsc)
-{
-#ifdef CONFIG_SMP
-	sync_core();
-#endif
-	rdtscll(*tsc);
-}
-
 /*
  * do_gettimeoffset() returns microseconds since last timer interrupt was
  * triggered by hardware. A memory read of HPET is slower than a register read
@@ -93,7 +85,7 @@
 {
 	unsigned long t;
 	unsigned long x;
-	rdtscll_sync(&t);
+	t = get_cycles_sync();
 	if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
 	x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
 	return x;
@@ -309,8 +301,7 @@
 			last_offset = vxtime.last_tsc;
 			base = monotonic_base;
 		} while (read_seqretry(&xtime_lock, seq));
-		sync_core();
-		rdtscll(this_offset);
+		this_offset = get_cycles_sync();
 		offset = (this_offset - last_offset)*1000/cpu_khz; 
 		return base + offset;
 	}
@@ -391,7 +382,7 @@
 		delay = LATCH - 1 - delay;
 	}
 
-	rdtscll_sync(&tsc);
+	tsc = get_cycles_sync();
 
 	if (vxtime.mode == VXTIME_HPET) {
 		if (offset - vxtime.last > hpet_tick) {
@@ -700,8 +691,7 @@
 	do {
 		local_irq_disable();
 		hpet_now = hpet_readl(HPET_COUNTER);
-		sync_core();
-		rdtscl(tsc_now);
+		tsc_now = get_cycles_sync();
 		local_irq_restore(flags);
 	} while ((tsc_now - tsc_start) < TICK_COUNT &&
 		 (hpet_now - hpet_start) < TICK_COUNT);
@@ -731,11 +721,9 @@
 	outb(0xb0, 0x43);
 	outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
 	outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-	rdtscll(start);
-	sync_core();
+	start = get_cycles_sync();
 	while ((inb(0x61) & 0x20) == 0);
-	sync_core();
-	rdtscll(end);
+	end = get_cycles_sync();
 
 	spin_unlock_irqrestore(&i8253_lock, flags);
 	
@@ -939,7 +927,7 @@
 	vxtime.mode = VXTIME_TSC;
 	vxtime.quot = (1000000L << 32) / vxtime_hz;
 	vxtime.tsc_quot = (1000L << 32) / cpu_khz;
-	rdtscll_sync(&vxtime.last_tsc);
+	vxtime.last_tsc = get_cycles_sync();
 	setup_irq(0, &irq0);
 
 	set_cyc2ns_scale(cpu_khz);
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 70a0bd1..9e51047 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -66,8 +66,7 @@
 			(__jiffies - __wall_jiffies) * (1000000 / HZ);
 
 		if (__vxtime.mode != VXTIME_HPET) {
-			sync_core();
-			rdtscll(t);
+			t = get_cycles_sync();
 			if (t < __vxtime.last_tsc)
 				t = __vxtime.last_tsc;
 			usec += ((t - __vxtime.last_tsc) *