blob: dda7dff9cef7e624be6239ca463789e09eedd200 [file] [log] [blame]
/*
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
 *
 * Thanks to hpa@transmeta.com for some useful hint.
 * Special thanks to Ingo Molnar for his early experience with
 * a different vsyscall implementation for Linux/IA32 and for the name.
 *
 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 * at virtual address -10Mbyte+1024bytes etc... There are at max 4
 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 * jumping out of line if necessary. We cannot add more with this
 * mechanism because older kernels won't return -ENOSYS.
 *
 * Note: the concept clashes with user mode linux. UML users should
 * use the vDSO.
 */
20
Ingo Molnar2b7d0392008-11-12 13:17:38 +010021/* Disable profiling for userspace code: */
Steven Rostedt2ed84ee2008-11-12 15:24:24 -050022#define DISABLE_BRANCH_PROFILING
Steven Rostedt1f0d69a2008-11-12 00:14:39 -050023
Linus Torvalds1da177e2005-04-16 15:20:36 -070024#include <linux/time.h>
25#include <linux/init.h>
26#include <linux/kernel.h>
27#include <linux/timer.h>
28#include <linux/seqlock.h>
29#include <linux/jiffies.h>
30#include <linux/sysctl.h>
john stultz7460ed22007-02-16 01:28:21 -080031#include <linux/clocksource.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020032#include <linux/getcpu.h>
Andi Kleen8c131af2006-11-14 16:57:46 +010033#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/notifier.h>
Andy Lutomirski5cec93c2011-06-05 13:50:24 -040036#include <linux/syscalls.h>
37#include <linux/ratelimit.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070038
39#include <asm/vsyscall.h>
40#include <asm/pgtable.h>
Andy Lutomirskic9712942011-07-13 09:24:09 -040041#include <asm/compat.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070042#include <asm/page.h>
john stultz7460ed22007-02-16 01:28:21 -080043#include <asm/unistd.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070044#include <asm/fixmap.h>
45#include <asm/errno.h>
46#include <asm/io.h>
Vojtech Pavlikc08c8202006-09-26 10:52:28 +020047#include <asm/segment.h>
48#include <asm/desc.h>
49#include <asm/topology.h>
Andi Kleen2aae9502007-07-21 17:10:01 +020050#include <asm/vgtod.h>
Andy Lutomirski5cec93c2011-06-05 13:50:24 -040051#include <asm/traps.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070052
Andy Lutomirski8c49d9a2011-05-23 09:31:24 -040053DEFINE_VVAR(int, vgetcpu_mode);
54DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
Linus Torvalds1da177e2005-04-16 15:20:36 -070055{
Eric Dumazetc4dbe542011-05-24 14:08:08 +020056 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
john stultz7460ed22007-02-16 01:28:21 -080057};
Linus Torvalds1da177e2005-04-16 15:20:36 -070058
Tony Breeds2c622142007-10-18 03:04:57 -070059void update_vsyscall_tz(void)
60{
61 unsigned long flags;
62
63 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
64 /* sys_tz has changed */
65 vsyscall_gtod_data.sys_tz = sys_tz;
66 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
67}
68
John Stultz76158562010-07-13 17:56:23 -070069void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
70 struct clocksource *clock, u32 mult)
john stultz7460ed22007-02-16 01:28:21 -080071{
72 unsigned long flags;
73
74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
Andy Lutomirski5cec93c2011-06-05 13:50:24 -040075
john stultz7460ed22007-02-16 01:28:21 -080076 /* copy vsyscall data */
Andy Lutomirski98d0ac32011-07-14 06:47:22 -040077 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
Andy Lutomirski5cec93c2011-06-05 13:50:24 -040078 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
79 vsyscall_gtod_data.clock.mask = clock->mask;
80 vsyscall_gtod_data.clock.mult = mult;
81 vsyscall_gtod_data.clock.shift = clock->shift;
82 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84 vsyscall_gtod_data.wall_to_monotonic = *wtm;
85 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
86
john stultz7460ed22007-02-16 01:28:21 -080087 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -070088}
89
Andy Lutomirski5cec93c2011-06-05 13:50:24 -040090static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
91 const char *message)
92{
93 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
94 struct task_struct *tsk;
95
96 if (!show_unhandled_signals || !__ratelimit(&rs))
97 return;
98
99 tsk = current;
100
Andy Lutomirskic9712942011-07-13 09:24:09 -0400101 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400102 level, tsk->comm, task_pid_nr(tsk),
Andy Lutomirskic9712942011-07-13 09:24:09 -0400103 message, regs->ip - 2, regs->cs,
104 regs->sp, regs->ax, regs->si, regs->di);
105}
106
107static int addr_to_vsyscall_nr(unsigned long addr)
108{
109 int nr;
110
111 if ((addr & ~0xC00UL) != VSYSCALL_START)
112 return -EINVAL;
113
114 nr = (addr & 0xC00UL) >> 10;
115 if (nr >= 3)
116 return -EINVAL;
117
118 return nr;
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400119}
120
121void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
122{
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400123 struct task_struct *tsk;
124 unsigned long caller;
125 int vsyscall_nr;
126 long ret;
127
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400128 local_irq_enable();
129
130 /*
Andy Lutomirskic9712942011-07-13 09:24:09 -0400131 * Real 64-bit user mode code has cs == __USER_CS. Anything else
132 * is bogus.
133 */
134 if (regs->cs != __USER_CS) {
135 /*
136 * If we trapped from kernel mode, we might as well OOPS now
137 * instead of returning to some random address and OOPSing
138 * then.
139 */
140 BUG_ON(!user_mode(regs));
141
142 /* Compat mode and non-compat 32-bit CS should both segfault. */
143 warn_bad_vsyscall(KERN_WARNING, regs,
144 "illegal int 0xcc from 32-bit mode");
145 goto sigsegv;
146 }
147
148 /*
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400149 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
150 * and int 0xcc is two bytes long.
151 */
Andy Lutomirskic9712942011-07-13 09:24:09 -0400152 vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
153 if (vsyscall_nr < 0) {
154 warn_bad_vsyscall(KERN_WARNING, regs,
155 "illegal int 0xcc (exploit attempt?)");
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400156 goto sigsegv;
157 }
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400158
159 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
160 warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
161 goto sigsegv;
162 }
163
164 tsk = current;
165 if (seccomp_mode(&tsk->seccomp))
166 do_exit(SIGKILL);
167
168 switch (vsyscall_nr) {
169 case 0:
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400170 ret = sys_gettimeofday(
171 (struct timeval __user *)regs->di,
172 (struct timezone __user *)regs->si);
173 break;
174
175 case 1:
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400176 ret = sys_time((time_t __user *)regs->di);
177 break;
178
179 case 2:
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400180 ret = sys_getcpu((unsigned __user *)regs->di,
181 (unsigned __user *)regs->si,
182 0);
183 break;
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400184 }
185
186 if (ret == -EFAULT) {
187 /*
188 * Bad news -- userspace fed a bad pointer to a vsyscall.
189 *
190 * With a real vsyscall, that would have caused SIGSEGV.
191 * To make writing reliable exploits using the emulated
192 * vsyscalls harder, generate SIGSEGV here as well.
193 */
194 warn_bad_vsyscall(KERN_INFO, regs,
195 "vsyscall fault (exploit attempt?)");
196 goto sigsegv;
197 }
198
199 regs->ax = ret;
200
201 /* Emulate a ret instruction. */
202 regs->ip = caller;
203 regs->sp += 8;
204
205 local_irq_disable();
206 return;
207
208sigsegv:
209 regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
210 force_sig(SIGSEGV, current);
Andy Lutomirskic9712942011-07-13 09:24:09 -0400211 local_irq_disable();
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400212}
213
214/*
215 * Assume __initcall executes before all user space. Hopefully kmod
216 * doesn't violate that. We'll find out if it does.
john stultz7460ed22007-02-16 01:28:21 -0800217 */
Andi Kleen8c131af2006-11-14 16:57:46 +0100218static void __cpuinit vsyscall_set_cpu(int cpu)
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200219{
Jeremy Fitzhardingefc8b8a62008-06-25 00:19:01 -0400220 unsigned long d;
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200221 unsigned long node = 0;
222#ifdef CONFIG_NUMA
Mike Travis98c9e272007-10-17 18:04:39 +0200223 node = cpu_to_node(cpu);
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200224#endif
Mike Travis92cb7612007-10-19 20:35:04 +0200225 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
Andi Kleen8c131af2006-11-14 16:57:46 +0100226 write_rdtscp_aux((node << 12) | cpu);
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200227
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400228 /*
229 * Store cpu number in limit so that it can be loaded quickly
230 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
231 */
Jeremy Fitzhardingefc8b8a62008-06-25 00:19:01 -0400232 d = 0x0f40000000000ULL;
233 d |= cpu;
234 d |= (node & 0xf) << 12;
235 d |= (node >> 4) << 48;
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400236
Jeremy Fitzhardingefc8b8a62008-06-25 00:19:01 -0400237 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
Vojtech Pavlikc08c8202006-09-26 10:52:28 +0200238}
239
Andi Kleen8c131af2006-11-14 16:57:46 +0100240static void __cpuinit cpu_vsyscall_init(void *arg)
241{
242 /* preemption should be already off */
243 vsyscall_set_cpu(raw_smp_processor_id());
244}
245
246static int __cpuinit
247cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
248{
249 long cpu = (long)arg;
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400250
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700251 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
Jens Axboe8691e5a2008-06-06 11:18:06 +0200252 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400253
Andi Kleen8c131af2006-11-14 16:57:46 +0100254 return NOTIFY_DONE;
255}
256
Ingo Molnare4026442008-01-30 13:32:39 +0100257void __init map_vsyscall(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258{
259 extern char __vsyscall_0;
260 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
Andy Lutomirski9fd67b42011-06-05 13:50:19 -0400261 extern char __vvar_page;
262 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263
Ernie Petrides103efcd2006-12-07 02:14:09 +0100264 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700265 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
Andy Lutomirski9fd67b42011-06-05 13:50:19 -0400266 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400267 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268}
269
270static int __init vsyscall_init(void)
271{
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400272 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
273
Jens Axboe15c8b6c2008-05-09 09:39:44 +0200274 on_each_cpu(cpu_vsyscall_init, NULL, 1);
Sheng Yangbe43f832009-12-18 16:48:45 +0800275 /* notifier priority > KVM */
276 hotcpu_notifier(cpu_vsyscall_notifier, 30);
Andy Lutomirski5cec93c2011-06-05 13:50:24 -0400277
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 return 0;
279}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280__initcall(vsyscall_init);