/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hint.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */
19
Ingo Molnar2b7d0392008-11-12 13:17:38 +010020/* Disable profiling for userspace code: */
Steven Rostedt2ed84ee2008-11-12 15:24:24 -050021#define DISABLE_BRANCH_PROFILING
Steven Rostedt1f0d69a2008-11-12 00:14:39 -050022
Linus Torvalds1da177e2005-04-16 15:20:36 -070023#include <linux/time.h>
24#include <linux/init.h>
25#include <linux/kernel.h>
26#include <linux/timer.h>
27#include <linux/seqlock.h>
28#include <linux/jiffies.h>
29#include <linux/sysctl.h>
john stultz7460ed22007-02-16 01:28:21 -080030#include <linux/clocksource.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020031#include <linux/getcpu.h>
Andi Kleen8c131af2006-11-14 16:57:46 +010032#include <linux/cpu.h>
33#include <linux/smp.h>
34#include <linux/notifier.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070035
36#include <asm/vsyscall.h>
37#include <asm/pgtable.h>
38#include <asm/page.h>
john stultz7460ed22007-02-16 01:28:21 -080039#include <asm/unistd.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070040#include <asm/fixmap.h>
41#include <asm/errno.h>
42#include <asm/io.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020043#include <asm/segment.h>
44#include <asm/desc.h>
45#include <asm/topology.h>
Andi Kleen2aae9502007-07-21 17:10:01 +020046#include <asm/vgtod.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070047
/* Place a vsyscall function into its fixed .vsyscall_<nr> section; the
 * linker script maps these sections to the well-known user-visible
 * addresses.  "unused" silences warnings (the kernel never calls them
 * directly), "notrace" keeps ftrace out of user-executed code. */
#define __vsyscall(nr) \
	__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
/* Registers listed as clobbered by the "syscall" instruction below. */
#define __syscall_clobber "r11","cx","memory"

/* Selects how vgetcpu() obtains cpu/node (VGETCPU_RDTSCP vs. GDT limit). */
DEFINE_VVAR(int, vgetcpu_mode);
/* Timekeeping state shared read-only with userspace via the vvar page;
 * readers use .lock (a seqlock) to get a consistent snapshot. */
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,	/* vsyscall fast paths enabled by default */
};
Linus Torvalds1da177e2005-04-16 15:20:36 -070058
Tony Breeds2c622142007-10-18 03:04:57 -070059void update_vsyscall_tz(void)
60{
61 unsigned long flags;
62
63 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
64 /* sys_tz has changed */
65 vsyscall_gtod_data.sys_tz = sys_tz;
66 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
67}
68
/* Copy the timekeeping state the vsyscalls need into the vvar page.
 * Called by the timekeeping core with the new wall time, the
 * wall->monotonic offset, the current clocksource and the (possibly
 * NTP-adjusted) mult factor.  All stores happen inside the seqlock
 * write section so userspace readers see a consistent snapshot. */
void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
		     struct clocksource *clock, u32 mult)
{
	unsigned long flags;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	/* note: caller-supplied mult is used, not clock->mult */
	vsyscall_gtod_data.clock.mult = mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
	vsyscall_gtod_data.wall_to_monotonic = *wtm;
	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
87
/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once.
 */
/* Copy the cached timezone to userspace-provided *tz.  Deliberately
 * lockless (see the RED-PEN note); sys_tz changes are rare. */
static __always_inline void do_get_tz(struct timezone * tz)
{
	*tz = VVAR(vsyscall_gtod_data).sys_tz;
}
95
/* Fallback: issue the real gettimeofday system call.  Used when the
 * fast vsyscall path is disabled or the clocksource has no vread. */
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	int ret;
	/* syscall nr in rax ("0" ties to ret's register), args in rdi/rsi */
	asm volatile("syscall"
		: "=a" (ret)
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
	return ret;
}
105
/* Fallback: real time(2) system call for when the vsyscall is disabled. */
static __always_inline long time_syscall(long *t)
{
	long secs;
	asm volatile("syscall"
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}
114
/* Fast gettimeofday: read the clocksource via its userspace-callable
 * vread hook and convert cycles to a timeval without entering the
 * kernel.  Falls back to the real syscall if vsyscalls are disabled or
 * no vread is available. */
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
	unsigned seq;
	unsigned long mult, shift, nsec;
	cycle_t (*vread)(void);
	/* Seqlock read loop: snapshot all conversion parameters
	 * consistently, retrying if an update raced with us. */
	do {
		seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);

		vread = VVAR(vsyscall_gtod_data).clock.vread;
		if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
			     !vread)) {
			/* slow path: real syscall fills in *tv */
			gettimeofday(tv,NULL);
			return;
		}

		now = vread();
		base = VVAR(vsyscall_gtod_data).clock.cycle_last;
		mask = VVAR(vsyscall_gtod_data).clock.mask;
		mult = VVAR(vsyscall_gtod_data).clock.mult;
		shift = VVAR(vsyscall_gtod_data).clock.shift;

		tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
		nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
	nsec += (cycle_delta * mult) >> shift;

	/* normalize: carry whole seconds out of the nanosecond count */
	while (nsec >= NSEC_PER_SEC) {
		tv->tv_sec += 1;
		nsec -= NSEC_PER_SEC;
	}
	tv->tv_usec = nsec / NSEC_PER_USEC;
}
152
Andi Kleen2e8ad432005-09-12 18:49:24 +0200153int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155 if (tv)
156 do_vgettimeofday(tv);
157 if (tz)
158 do_get_tz(tz);
159 return 0;
160}
161
162/* This will break when the xtime seconds get inaccurate, but that is
163 * unlikely */
Andi Kleen2e8ad432005-09-12 18:49:24 +0200164time_t __vsyscall(1) vtime(time_t *t)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165{
John Stultz8c736262010-07-13 17:56:18 -0700166 unsigned seq;
Eric Dumazet272a3712007-05-02 19:27:11 +0200167 time_t result;
Andy Lutomirski8c49d9a2011-05-23 09:31:24 -0400168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169 return time_syscall(t);
john stultzd0aff6e2007-05-21 14:31:52 +0200170
John Stultz8c736262010-07-13 17:56:18 -0700171 do {
Andy Lutomirski8c49d9a2011-05-23 09:31:24 -0400172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
John Stultz8c736262010-07-13 17:56:18 -0700173
Andy Lutomirski8c49d9a2011-05-23 09:31:24 -0400174 result = VVAR(vsyscall_gtod_data).wall_time_sec;
John Stultz8c736262010-07-13 17:56:18 -0700175
Andy Lutomirski8c49d9a2011-05-23 09:31:24 -0400176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
John Stultz8c736262010-07-13 17:56:18 -0700177
Eric Dumazet272a3712007-05-02 19:27:11 +0200178 if (t)
179 *t = result;
180 return result;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181}
182
/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to a two element sized long array.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
		/* cache hit: reuse value computed earlier this jiffy */
		p = tcache->blob[1];
	} else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		native_read_tscp(&p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	/* p encodes cpu in the low 12 bits, node above them — the
	 * layout written by vsyscall_set_cpu() */
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}
224
/* vsyscall slot 3 is unused; keep it returning -ENOSYS as a placeholder
 * (see the header comment: older kernels won't return -ENOSYS for new
 * slots, so this one must). */
static long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}
229
#ifdef CONFIG_SYSCTL
/* kernel.vsyscall64: writing 0 disables the fast vsyscall paths
 * (useful e.g. under UML — see the file header comment). */
static ctl_table kernel_table2[] = {
	{ .procname = "vsyscall64",
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
	  .mode = 0644,
	  .proc_handler = proc_dointvec },
	{}
};

/* Root table hooking kernel_table2 under /proc/sys/kernel. */
static ctl_table kernel_root_table2[] = {
	{ .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};
#endif
245
/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
/* Publish this CPU's (cpu, node) pair where vgetcpu() can read it from
 * userspace: the TSC_AUX MSR (for RDTSCP) and the per-cpu GDT entry's
 * segment limit (for the lsl fallback). */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	/* with RDTSCP, vgetcpu reads (node << 12) | cpu from TSC_AUX */
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = 0x0f40000000000ULL;		/* descriptor access/flag bits */
	d |= cpu;			/* limit[11:0] = cpu */
	d |= (node & 0xf) << 12;	/* limit[15:12] = node low nibble */
	/* limit[19:16] lives at descriptor bits 48-51 = node high bits */
	d |= (node >> 4) << 48;
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
267
/* Per-cpu init helper, run on the target CPU via on_each_cpu() /
 * smp_call_function_single(). */
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	vsyscall_set_cpu(raw_smp_processor_id());
}
273
274static int __cpuinit
275cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
276{
277 long cpu = (long)arg;
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700278 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
Jens Axboe8691e5a2008-06-06 11:18:06 +0200279 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
Andi Kleen8c131af2006-11-14 16:57:46 +0100280 return NOTIFY_DONE;
281}
282
/* Map the vsyscall page at its fixed, userspace-visible address. */
void __init map_vsyscall(void)
{
	/* start of the .vsyscall_0 output section (linker script symbol) */
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}
291
/* Boot-time setup: sanity-check that the linker placed each vsyscall at
 * its ABI-mandated address, register the sysctl, initialize the per-cpu
 * data on all CPUs, and arrange the same for future hotplugged CPUs. */
static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
		VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 1);
	/* notifier priority > KVM */
	hotcpu_notifier(cpu_vsyscall_notifier, 30);
	return 0;
}

__initcall(vsyscall_init);