blob: 5684e5a1282b99608de1d9e6b6b72aeac53b6b43 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 *
5 * Thanks to hpa@transmeta.com for some useful hint.
6 * Special thanks to Ingo Molnar for his early experience with
7 * a different vsyscall implementation for Linux/IA32 and for the name.
8 *
9 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
10 * at virtual address -10Mbyte+1024bytes etc... There are at max 4
11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
12 * jumping out of line if necessary. We cannot add more with this
13 * mechanism because older kernels won't return -ENOSYS.
14 * If we want more than four we need a vDSO.
15 *
16 * Note: the concept clashes with user mode linux. If you use UML and
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */
19
20#include <linux/time.h>
21#include <linux/init.h>
22#include <linux/kernel.h>
23#include <linux/timer.h>
24#include <linux/seqlock.h>
25#include <linux/jiffies.h>
26#include <linux/sysctl.h>
john stultz7460ed22007-02-16 01:28:21 -080027#include <linux/clocksource.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020028#include <linux/getcpu.h>
Andi Kleen8c131af2006-11-14 16:57:46 +010029#include <linux/cpu.h>
30#include <linux/smp.h>
31#include <linux/notifier.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032
33#include <asm/vsyscall.h>
34#include <asm/pgtable.h>
35#include <asm/page.h>
john stultz7460ed22007-02-16 01:28:21 -080036#include <asm/unistd.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070037#include <asm/fixmap.h>
38#include <asm/errno.h>
39#include <asm/io.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020040#include <asm/segment.h>
41#include <asm/desc.h>
42#include <asm/topology.h>
Andi Kleen2aae9502007-07-21 17:10:01 +020043#include <asm/vgtod.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070044
/* Place a function in the linker section reserved for vsyscall slot 'nr'. */
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))

/* Registers/memory clobbered by the inline 'syscall' instructions below. */
#define __syscall_clobber "r11","rcx","memory"

/*
 * Physical address of a symbol that lives in the vsyscall page.
 *
 * 'x' is the symbol's address inside the user-visible vsyscall mapping.
 * The offset from the start of that mapping is added to the physical
 * address of __vsyscall_0, the start of the vsyscall page in the kernel
 * image.
 *
 * The mapping base is VSYSCALL_ADDR(0). VSYSCALL_FIRST_PAGE is only the
 * fixmap *index* of that page (it is what gets handed to __set_fixmap()
 * and __fix_to_virt()), so it must not be subtracted from an address.
 *
 * The empty asm copies the address through a register so that the
 * compiler cannot fold the arithmetic on the symbol at compile time.
 */
#define __pa_vsymbol(x)							\
	({								\
		unsigned long v;					\
		extern char __vsyscall_0;				\
		asm("" : "=r" (v) : "0" (x));				\
		((v - VSYSCALL_ADDR(0)) + __pa_symbol(&__vsyscall_0));	\
	})
Linus Torvalds1da177e2005-04-16 15:20:36 -070052
/*
 * vsyscall_gtod_data contains data that is:
 * - read-only from vsyscalls
 * - written by the timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020059int __vgetcpu_mode __section_vgetcpu_mode;
Linus Torvalds1da177e2005-04-16 15:20:36 -070060
Andi Kleen2aae9502007-07-21 17:10:01 +020061struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
Linus Torvalds1da177e2005-04-16 15:20:36 -070062{
john stultz7460ed22007-02-16 01:28:21 -080063 .lock = SEQLOCK_UNLOCKED,
64 .sysctl_enabled = 1,
65};
Linus Torvalds1da177e2005-04-16 15:20:36 -070066
john stultz7460ed22007-02-16 01:28:21 -080067void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
68{
69 unsigned long flags;
70
71 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
72 /* copy vsyscall data */
Eric Dumazetc8118c62007-05-02 19:27:11 +020073 vsyscall_gtod_data.clock.vread = clock->vread;
74 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
75 vsyscall_gtod_data.clock.mask = clock->mask;
76 vsyscall_gtod_data.clock.mult = clock->mult;
77 vsyscall_gtod_data.clock.shift = clock->shift;
78 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
79 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
john stultz7460ed22007-02-16 01:28:21 -080080 vsyscall_gtod_data.sys_tz = sys_tz;
Andi Kleen2aae9502007-07-21 17:10:01 +020081 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
82 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
john stultz7460ed22007-02-16 01:28:21 -080083 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -070084}
85
john stultz7460ed22007-02-16 01:28:21 -080086/* RED-PEN may want to readd seq locking, but then the variable should be
87 * write-once.
88 */
Andi Kleen2c8bc942006-01-11 22:45:30 +010089static __always_inline void do_get_tz(struct timezone * tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -070090{
john stultz7460ed22007-02-16 01:28:21 -080091 *tz = __vsyscall_gtod_data.sys_tz;
Linus Torvalds1da177e2005-04-16 15:20:36 -070092}
93
Andi Kleen2c8bc942006-01-11 22:45:30 +010094static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -070095{
96 int ret;
97 asm volatile("vsysc2: syscall"
98 : "=a" (ret)
john stultz7460ed22007-02-16 01:28:21 -080099 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
100 : __syscall_clobber );
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101 return ret;
102}
103
Andi Kleen2c8bc942006-01-11 22:45:30 +0100104static __always_inline long time_syscall(long *t)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105{
106 long secs;
107 asm volatile("vsysc1: syscall"
108 : "=a" (secs)
109 : "0" (__NR_time),"D" (t) : __syscall_clobber);
110 return secs;
111}
112
john stultz7460ed22007-02-16 01:28:21 -0800113static __always_inline void do_vgettimeofday(struct timeval * tv)
114{
115 cycle_t now, base, mask, cycle_delta;
Eric Dumazetc8118c62007-05-02 19:27:11 +0200116 unsigned seq;
117 unsigned long mult, shift, nsec;
john stultz7460ed22007-02-16 01:28:21 -0800118 cycle_t (*vread)(void);
119 do {
120 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
121
122 vread = __vsyscall_gtod_data.clock.vread;
123 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
Al Viro89952d12007-03-14 09:17:59 +0000124 gettimeofday(tv,NULL);
john stultz7460ed22007-02-16 01:28:21 -0800125 return;
126 }
127 now = vread();
128 base = __vsyscall_gtod_data.clock.cycle_last;
129 mask = __vsyscall_gtod_data.clock.mask;
130 mult = __vsyscall_gtod_data.clock.mult;
131 shift = __vsyscall_gtod_data.clock.shift;
132
Eric Dumazetc8118c62007-05-02 19:27:11 +0200133 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
134 nsec = __vsyscall_gtod_data.wall_time_nsec;
john stultz7460ed22007-02-16 01:28:21 -0800135 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
136
137 /* calculate interval: */
138 cycle_delta = (now - base) & mask;
139 /* convert to nsecs: */
Eric Dumazetc8118c62007-05-02 19:27:11 +0200140 nsec += (cycle_delta * mult) >> shift;
john stultz7460ed22007-02-16 01:28:21 -0800141
Eric Dumazetc8118c62007-05-02 19:27:11 +0200142 while (nsec >= NSEC_PER_SEC) {
john stultz7460ed22007-02-16 01:28:21 -0800143 tv->tv_sec += 1;
Eric Dumazetc8118c62007-05-02 19:27:11 +0200144 nsec -= NSEC_PER_SEC;
john stultz7460ed22007-02-16 01:28:21 -0800145 }
Eric Dumazetc8118c62007-05-02 19:27:11 +0200146 tv->tv_usec = nsec / NSEC_PER_USEC;
john stultz7460ed22007-02-16 01:28:21 -0800147}
148
Andi Kleen2e8ad432005-09-12 18:49:24 +0200149int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151 if (tv)
152 do_vgettimeofday(tv);
153 if (tz)
154 do_get_tz(tz);
155 return 0;
156}
157
158/* This will break when the xtime seconds get inaccurate, but that is
159 * unlikely */
Andi Kleen2e8ad432005-09-12 18:49:24 +0200160time_t __vsyscall(1) vtime(time_t *t)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161{
john stultzd0aff6e2007-05-21 14:31:52 +0200162 struct timeval tv;
Eric Dumazet272a3712007-05-02 19:27:11 +0200163 time_t result;
john stultz7460ed22007-02-16 01:28:21 -0800164 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165 return time_syscall(t);
john stultzd0aff6e2007-05-21 14:31:52 +0200166
167 vgettimeofday(&tv, 0);
168 result = tv.tv_sec;
Eric Dumazet272a3712007-05-02 19:27:11 +0200169 if (t)
170 *t = result;
171 return result;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700172}
173
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200174/* Fast way to get current CPU and node.
175 This helps to do per node and per CPU caches in user space.
176 The result is not guaranteed without CPU affinity, but usually
177 works out because the scheduler tries to keep a thread on the same
178 CPU.
179
180 tcache must point to a two element sized long array.
181 All arguments can be NULL. */
182long __vsyscall(2)
183vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700184{
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200185 unsigned int dummy, p;
186 unsigned long j = 0;
187
188 /* Fast cache - only recompute value once per jiffies and avoid
189 relatively costly rdtscp/cpuid otherwise.
190 This works because the scheduler usually keeps the process
191 on the same CPU and this syscall doesn't guarantee its
192 results anyways.
193 We do this here because otherwise user space would do it on
194 its own in a likely inferior way (no access to jiffies).
195 If you don't like it pass NULL. */
Andi Kleen34596dc2006-09-30 01:47:55 +0200196 if (tcache && tcache->blob[0] == (j = __jiffies)) {
197 p = tcache->blob[1];
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200198 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
199 /* Load per CPU data from RDTSCP */
200 rdtscp(dummy, dummy, p);
201 } else {
202 /* Load per CPU data from GDT */
203 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
204 }
205 if (tcache) {
Andi Kleen34596dc2006-09-30 01:47:55 +0200206 tcache->blob[0] = j;
207 tcache->blob[1] = p;
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200208 }
209 if (cpu)
210 *cpu = p & 0xfff;
211 if (node)
212 *node = p >> 12;
213 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700214}
215
Andi Kleen2e8ad432005-09-12 18:49:24 +0200216long __vsyscall(3) venosys_1(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217{
218 return -ENOSYS;
219}
220
221#ifdef CONFIG_SYSCTL
222
223#define SYSCALL 0x050f
224#define NOP2 0x9090
225
226/*
227 * NOP out syscall in vsyscall page when not needed.
228 */
229static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
230 void __user *buffer, size_t *lenp, loff_t *ppos)
231{
232 extern u16 vsysc1, vsysc2;
Andi Kleen131cfd72006-09-26 10:52:33 +0200233 u16 __iomem *map1;
234 u16 __iomem *map2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
236 if (!write)
237 return ret;
238 /* gcc has some trouble with __va(__pa()), so just do it this
239 way. */
Vivek Goyal0dbf7022007-05-02 19:27:07 +0200240 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241 if (!map1)
242 return -ENOMEM;
Vivek Goyal0dbf7022007-05-02 19:27:07 +0200243 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700244 if (!map2) {
245 ret = -ENOMEM;
246 goto out;
247 }
john stultz7460ed22007-02-16 01:28:21 -0800248 if (!vsyscall_gtod_data.sysctl_enabled) {
Andi Kleen131cfd72006-09-26 10:52:33 +0200249 writew(SYSCALL, map1);
250 writew(SYSCALL, map2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251 } else {
Andi Kleen131cfd72006-09-26 10:52:33 +0200252 writew(NOP2, map1);
253 writew(NOP2, map2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254 }
255 iounmap(map2);
256out:
257 iounmap(map1);
258 return ret;
259}
260
261static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
262 void __user *oldval, size_t __user *oldlenp,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -0800263 void __user *newval, size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700264{
265 return -ENOSYS;
266}
267
268static ctl_table kernel_table2[] = {
269 { .ctl_name = 99, .procname = "vsyscall64",
john stultz7460ed22007-02-16 01:28:21 -0800270 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
271 .mode = 0644,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272 .strategy = vsyscall_sysctl_nostrat,
273 .proc_handler = vsyscall_sysctl_change },
Eric W. Biederman7a44d372007-02-14 00:33:50 -0800274 {}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275};
276
277static ctl_table kernel_root_table2[] = {
278 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
279 .child = kernel_table2 },
Eric W. Biederman7a44d372007-02-14 00:33:50 -0800280 {}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281};
282
283#endif
284
Andi Kleen8c131af2006-11-14 16:57:46 +0100285/* Assume __initcall executes before all user space. Hopefully kmod
286 doesn't violate that. We'll find out if it does. */
287static void __cpuinit vsyscall_set_cpu(int cpu)
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200288{
289 unsigned long *d;
290 unsigned long node = 0;
291#ifdef CONFIG_NUMA
Mike Travis98c9e272007-10-17 18:04:39 +0200292 node = cpu_to_node(cpu);
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200293#endif
Andi Kleen8c131af2006-11-14 16:57:46 +0100294 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
295 write_rdtscp_aux((node << 12) | cpu);
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200296
297 /* Store cpu number in limit so that it can be loaded quickly
298 in user space in vgetcpu.
299 12 bits for the CPU and 8 bits for the node. */
300 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
301 *d = 0x0f40000000000ULL;
302 *d |= cpu;
303 *d |= (node & 0xf) << 12;
304 *d |= (node >> 4) << 48;
305}
306
Andi Kleen8c131af2006-11-14 16:57:46 +0100307static void __cpuinit cpu_vsyscall_init(void *arg)
308{
309 /* preemption should be already off */
310 vsyscall_set_cpu(raw_smp_processor_id());
311}
312
313static int __cpuinit
314cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
315{
316 long cpu = (long)arg;
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700317 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
Andi Kleen8c131af2006-11-14 16:57:46 +0100318 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
319 return NOTIFY_DONE;
320}
321
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322static void __init map_vsyscall(void)
323{
324 extern char __vsyscall_0;
325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
326
Ernie Petrides103efcd2006-12-07 02:14:09 +0100327 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
329}
330
331static int __init vsyscall_init(void)
332{
333 BUG_ON(((unsigned long) &vgettimeofday !=
334 VSYSCALL_ADDR(__NR_vgettimeofday)));
335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338 map_vsyscall();
Andi Kleenf3c5f5e2005-05-16 21:53:33 -0700339#ifdef CONFIG_SYSCTL
Eric W. Biederman0b4d4142007-02-14 00:34:09 -0800340 register_sysctl_table(kernel_root_table2);
Andi Kleenf3c5f5e2005-05-16 21:53:33 -0700341#endif
Andi Kleen8c131af2006-11-14 16:57:46 +0100342 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
343 hotcpu_notifier(cpu_vsyscall_notifier, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344 return 0;
345}
346
347__initcall(vsyscall_init);