blob: 06c34949bfdc9d09e90a219765aeb711c67e85a4 [file] [log] [blame]
/*
 *  linux/arch/x86_64/kernel/vsyscall.c
 *
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hints.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at most 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
john stultz7460ed22007-02-16 01:28:21 -080029#include <linux/clocksource.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020030#include <linux/getcpu.h>
Andi Kleen8c131af2006-11-14 16:57:46 +010031#include <linux/cpu.h>
32#include <linux/smp.h>
33#include <linux/notifier.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034
35#include <asm/vsyscall.h>
36#include <asm/pgtable.h>
37#include <asm/page.h>
john stultz7460ed22007-02-16 01:28:21 -080038#include <asm/unistd.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039#include <asm/fixmap.h>
40#include <asm/errno.h>
41#include <asm/io.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020042#include <asm/segment.h>
43#include <asm/desc.h>
44#include <asm/topology.h>
Andi Kleen2aae9502007-07-21 17:10:01 +020045#include <asm/vgtod.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070046
/*
 * Place a function in the numbered vsyscall section ".vsyscall_<nr>".
 * The "unused" attribute keeps the compiler from warning about it.
 */
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))

/* Clobber list shared by the inline "syscall" fallbacks in this file. */
#define __syscall_clobber "r11","rcx","memory"

/*
 * Translate the address of a symbol that lives in the vsyscall page into
 * a physical address.
 *
 * The empty asm launders the pointer into a plain integer so that gcc
 * cannot reason about the symbol's address.  The offset of the symbol
 * inside the vsyscall area is then added to the physical address of
 * __vsyscall_0.
 *
 * The base that is subtracted must be the *virtual address* of the first
 * vsyscall page, i.e. VSYSCALL_ADDR(0).  VSYSCALL_FIRST_PAGE is only the
 * fixmap index of that page (it is what __set_fixmap()/__fix_to_virt()
 * take) and must not be used as an address here.
 */
#define __pa_vsymbol(x)							\
	({unsigned long v;						\
	extern char __vsyscall_0;					\
	  asm("" : "=r" (v) : "0" (x));					\
	  ((v - VSYSCALL_ADDR(0)) + __pa_symbol(&__vsyscall_0)); })
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
Eric Dumazetc8118c62007-05-02 19:27:11 +020055/*
56 * vsyscall_gtod_data contains data that is :
57 * - readonly from vsyscalls
58 * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
59 * Try to keep this structure as small as possible to avoid cache line ping pongs
60 */
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020061int __vgetcpu_mode __section_vgetcpu_mode;
Linus Torvalds1da177e2005-04-16 15:20:36 -070062
Andi Kleen2aae9502007-07-21 17:10:01 +020063struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
Linus Torvalds1da177e2005-04-16 15:20:36 -070064{
john stultz7460ed22007-02-16 01:28:21 -080065 .lock = SEQLOCK_UNLOCKED,
66 .sysctl_enabled = 1,
67};
Linus Torvalds1da177e2005-04-16 15:20:36 -070068
john stultz7460ed22007-02-16 01:28:21 -080069void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
70{
71 unsigned long flags;
72
73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
74 /* copy vsyscall data */
Eric Dumazetc8118c62007-05-02 19:27:11 +020075 vsyscall_gtod_data.clock.vread = clock->vread;
76 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
77 vsyscall_gtod_data.clock.mask = clock->mask;
78 vsyscall_gtod_data.clock.mult = clock->mult;
79 vsyscall_gtod_data.clock.shift = clock->shift;
80 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
81 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
john stultz7460ed22007-02-16 01:28:21 -080082 vsyscall_gtod_data.sys_tz = sys_tz;
Andi Kleen2aae9502007-07-21 17:10:01 +020083 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
john stultz7460ed22007-02-16 01:28:21 -080085 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -070086}
87
john stultz7460ed22007-02-16 01:28:21 -080088/* RED-PEN may want to readd seq locking, but then the variable should be
89 * write-once.
90 */
Andi Kleen2c8bc942006-01-11 22:45:30 +010091static __always_inline void do_get_tz(struct timezone * tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -070092{
john stultz7460ed22007-02-16 01:28:21 -080093 *tz = __vsyscall_gtod_data.sys_tz;
Linus Torvalds1da177e2005-04-16 15:20:36 -070094}
95
Andi Kleen2c8bc942006-01-11 22:45:30 +010096static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -070097{
98 int ret;
99 asm volatile("vsysc2: syscall"
100 : "=a" (ret)
john stultz7460ed22007-02-16 01:28:21 -0800101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103 return ret;
104}
105
Andi Kleen2c8bc942006-01-11 22:45:30 +0100106static __always_inline long time_syscall(long *t)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107{
108 long secs;
109 asm volatile("vsysc1: syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114
john stultz7460ed22007-02-16 01:28:21 -0800115static __always_inline void do_vgettimeofday(struct timeval * tv)
116{
117 cycle_t now, base, mask, cycle_delta;
Eric Dumazetc8118c62007-05-02 19:27:11 +0200118 unsigned seq;
119 unsigned long mult, shift, nsec;
john stultz7460ed22007-02-16 01:28:21 -0800120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
123
124 vread = __vsyscall_gtod_data.clock.vread;
125 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
Al Viro89952d12007-03-14 09:17:59 +0000126 gettimeofday(tv,NULL);
john stultz7460ed22007-02-16 01:28:21 -0800127 return;
128 }
129 now = vread();
130 base = __vsyscall_gtod_data.clock.cycle_last;
131 mask = __vsyscall_gtod_data.clock.mask;
132 mult = __vsyscall_gtod_data.clock.mult;
133 shift = __vsyscall_gtod_data.clock.shift;
134
Eric Dumazetc8118c62007-05-02 19:27:11 +0200135 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
136 nsec = __vsyscall_gtod_data.wall_time_nsec;
john stultz7460ed22007-02-16 01:28:21 -0800137 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
138
139 /* calculate interval: */
140 cycle_delta = (now - base) & mask;
141 /* convert to nsecs: */
Eric Dumazetc8118c62007-05-02 19:27:11 +0200142 nsec += (cycle_delta * mult) >> shift;
john stultz7460ed22007-02-16 01:28:21 -0800143
Eric Dumazetc8118c62007-05-02 19:27:11 +0200144 while (nsec >= NSEC_PER_SEC) {
john stultz7460ed22007-02-16 01:28:21 -0800145 tv->tv_sec += 1;
Eric Dumazetc8118c62007-05-02 19:27:11 +0200146 nsec -= NSEC_PER_SEC;
john stultz7460ed22007-02-16 01:28:21 -0800147 }
Eric Dumazetc8118c62007-05-02 19:27:11 +0200148 tv->tv_usec = nsec / NSEC_PER_USEC;
john stultz7460ed22007-02-16 01:28:21 -0800149}
150
Andi Kleen2e8ad432005-09-12 18:49:24 +0200151int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153 if (tv)
154 do_vgettimeofday(tv);
155 if (tz)
156 do_get_tz(tz);
157 return 0;
158}
159
160/* This will break when the xtime seconds get inaccurate, but that is
161 * unlikely */
Andi Kleen2e8ad432005-09-12 18:49:24 +0200162time_t __vsyscall(1) vtime(time_t *t)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163{
john stultzd0aff6e2007-05-21 14:31:52 +0200164 struct timeval tv;
Eric Dumazet272a3712007-05-02 19:27:11 +0200165 time_t result;
john stultz7460ed22007-02-16 01:28:21 -0800166 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700167 return time_syscall(t);
john stultzd0aff6e2007-05-21 14:31:52 +0200168
169 vgettimeofday(&tv, 0);
170 result = tv.tv_sec;
Eric Dumazet272a3712007-05-02 19:27:11 +0200171 if (t)
172 *t = result;
173 return result;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700174}
175
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200176/* Fast way to get current CPU and node.
177 This helps to do per node and per CPU caches in user space.
178 The result is not guaranteed without CPU affinity, but usually
179 works out because the scheduler tries to keep a thread on the same
180 CPU.
181
182 tcache must point to a two element sized long array.
183 All arguments can be NULL. */
184long __vsyscall(2)
185vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186{
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200187 unsigned int dummy, p;
188 unsigned long j = 0;
189
190 /* Fast cache - only recompute value once per jiffies and avoid
191 relatively costly rdtscp/cpuid otherwise.
192 This works because the scheduler usually keeps the process
193 on the same CPU and this syscall doesn't guarantee its
194 results anyways.
195 We do this here because otherwise user space would do it on
196 its own in a likely inferior way (no access to jiffies).
197 If you don't like it pass NULL. */
Andi Kleen34596dc2006-09-30 01:47:55 +0200198 if (tcache && tcache->blob[0] == (j = __jiffies)) {
199 p = tcache->blob[1];
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200200 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
201 /* Load per CPU data from RDTSCP */
202 rdtscp(dummy, dummy, p);
203 } else {
204 /* Load per CPU data from GDT */
205 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
206 }
207 if (tcache) {
Andi Kleen34596dc2006-09-30 01:47:55 +0200208 tcache->blob[0] = j;
209 tcache->blob[1] = p;
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200210 }
211 if (cpu)
212 *cpu = p & 0xfff;
213 if (node)
214 *node = p >> 12;
215 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216}
217
Andi Kleen2e8ad432005-09-12 18:49:24 +0200218long __vsyscall(3) venosys_1(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700219{
220 return -ENOSYS;
221}
222
223#ifdef CONFIG_SYSCTL
224
225#define SYSCALL 0x050f
226#define NOP2 0x9090
227
228/*
229 * NOP out syscall in vsyscall page when not needed.
230 */
231static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
232 void __user *buffer, size_t *lenp, loff_t *ppos)
233{
234 extern u16 vsysc1, vsysc2;
Andi Kleen131cfd72006-09-26 10:52:33 +0200235 u16 __iomem *map1;
236 u16 __iomem *map2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
238 if (!write)
239 return ret;
240 /* gcc has some trouble with __va(__pa()), so just do it this
241 way. */
Vivek Goyal0dbf7022007-05-02 19:27:07 +0200242 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700243 if (!map1)
244 return -ENOMEM;
Vivek Goyal0dbf7022007-05-02 19:27:07 +0200245 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700246 if (!map2) {
247 ret = -ENOMEM;
248 goto out;
249 }
john stultz7460ed22007-02-16 01:28:21 -0800250 if (!vsyscall_gtod_data.sysctl_enabled) {
Andi Kleen131cfd72006-09-26 10:52:33 +0200251 writew(SYSCALL, map1);
252 writew(SYSCALL, map2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253 } else {
Andi Kleen131cfd72006-09-26 10:52:33 +0200254 writew(NOP2, map1);
255 writew(NOP2, map2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700256 }
257 iounmap(map2);
258out:
259 iounmap(map1);
260 return ret;
261}
262
263static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
264 void __user *oldval, size_t __user *oldlenp,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -0800265 void __user *newval, size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266{
267 return -ENOSYS;
268}
269
270static ctl_table kernel_table2[] = {
271 { .ctl_name = 99, .procname = "vsyscall64",
john stultz7460ed22007-02-16 01:28:21 -0800272 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
273 .mode = 0644,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700274 .strategy = vsyscall_sysctl_nostrat,
275 .proc_handler = vsyscall_sysctl_change },
Eric W. Biederman7a44d372007-02-14 00:33:50 -0800276 {}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277};
278
279static ctl_table kernel_root_table2[] = {
280 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
281 .child = kernel_table2 },
Eric W. Biederman7a44d372007-02-14 00:33:50 -0800282 {}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283};
284
285#endif
286
Andi Kleen8c131af2006-11-14 16:57:46 +0100287/* Assume __initcall executes before all user space. Hopefully kmod
288 doesn't violate that. We'll find out if it does. */
289static void __cpuinit vsyscall_set_cpu(int cpu)
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200290{
291 unsigned long *d;
292 unsigned long node = 0;
293#ifdef CONFIG_NUMA
294 node = cpu_to_node[cpu];
295#endif
Andi Kleen8c131af2006-11-14 16:57:46 +0100296 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
297 write_rdtscp_aux((node << 12) | cpu);
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200298
299 /* Store cpu number in limit so that it can be loaded quickly
300 in user space in vgetcpu.
301 12 bits for the CPU and 8 bits for the node. */
302 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
303 *d = 0x0f40000000000ULL;
304 *d |= cpu;
305 *d |= (node & 0xf) << 12;
306 *d |= (node >> 4) << 48;
307}
308
Andi Kleen8c131af2006-11-14 16:57:46 +0100309static void __cpuinit cpu_vsyscall_init(void *arg)
310{
311 /* preemption should be already off */
312 vsyscall_set_cpu(raw_smp_processor_id());
313}
314
315static int __cpuinit
316cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
317{
318 long cpu = (long)arg;
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700319 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
Andi Kleen8c131af2006-11-14 16:57:46 +0100320 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
321 return NOTIFY_DONE;
322}
323
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324static void __init map_vsyscall(void)
325{
326 extern char __vsyscall_0;
327 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
328
Ernie Petrides103efcd2006-12-07 02:14:09 +0100329 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700330 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
331}
332
333static int __init vsyscall_init(void)
334{
335 BUG_ON(((unsigned long) &vgettimeofday !=
336 VSYSCALL_ADDR(__NR_vgettimeofday)));
337 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
338 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200339 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340 map_vsyscall();
Andi Kleenf3c5f5e2005-05-16 21:53:33 -0700341#ifdef CONFIG_SYSCTL
Eric W. Biederman0b4d4142007-02-14 00:34:09 -0800342 register_sysctl_table(kernel_root_table2);
Andi Kleenf3c5f5e2005-05-16 21:53:33 -0700343#endif
Andi Kleen8c131af2006-11-14 16:57:46 +0100344 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
345 hotcpu_notifier(cpu_vsyscall_notifier, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700346 return 0;
347}
348
349__initcall(vsyscall_init);