seccomp: Make syscall skipping and nr changes more consistent
This fixes two issues that could cause incompatibility between
kernel versions:
- If a tracer uses SECCOMP_RET_TRACE to select a syscall number
higher than the largest known syscall, emulate the unknown
vsyscall by returning -ENOSYS. (This is unlikely to make a
noticeable difference on x86-64 due to the way the system call
entry works.)
- On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy.
This updates the documentation accordingly.
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Will Drewry <wad@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b3..b2e58a2 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -136,19 +136,6 @@
return nr;
}
-#ifdef CONFIG_SECCOMP
-static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
-{
- if (!seccomp_mode(&tsk->seccomp))
- return 0;
- task_pt_regs(tsk)->orig_ax = syscall_nr;
- task_pt_regs(tsk)->ax = syscall_nr;
- return __secure_computing(syscall_nr);
-}
-#else
-#define vsyscall_seccomp(_tsk, _nr) 0
-#endif
-
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
@@ -181,10 +168,9 @@
{
struct task_struct *tsk;
unsigned long caller;
- int vsyscall_nr;
+ int vsyscall_nr, syscall_nr, tmp;
int prev_sig_on_uaccess_error;
long ret;
- int skip;
/*
* No point in checking CS -- the only way to get here is a user mode
@@ -216,6 +202,64 @@
}
tsk = current;
+
+ /*
+ * Check for access_ok violations and find the syscall nr.
+ *
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, NULL means "don't write anything" not "write it at
+ * address 0".
+ */
+ switch (vsyscall_nr) {
+ case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_gettimeofday;
+ break;
+
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_time;
+ break;
+
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_getcpu;
+ break;
+ }
+
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing(syscall_nr);
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ do_exit(SIGSYS);
+ }
+ if (tmp)
+ goto do_ret; /* skip requested */
+
/*
* With a real vsyscall, page faults cause SIGSEGV. We want to
* preserve that behavior to make writing exploits harder.
@@ -223,49 +267,19 @@
prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
current_thread_info()->sig_on_uaccess_error = 1;
- /*
- * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
- * 64-bit, so we don't need to special-case it here. For all the
- * vsyscalls, NULL means "don't write anything" not "write it at
- * address 0".
- */
ret = -EFAULT;
- skip = 0;
switch (vsyscall_nr) {
case 0:
- skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
- if (skip)
- break;
-
- if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
- !write_ok_or_segv(regs->si, sizeof(struct timezone)))
- break;
-
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
- skip = vsyscall_seccomp(tsk, __NR_time);
- if (skip)
- break;
-
- if (!write_ok_or_segv(regs->di, sizeof(time_t)))
- break;
-
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
- skip = vsyscall_seccomp(tsk, __NR_getcpu);
- if (skip)
- break;
-
- if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
- !write_ok_or_segv(regs->si, sizeof(unsigned)))
- break;
-
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
NULL);
@@ -274,12 +288,7 @@
current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
- if (skip) {
- if ((long)regs->ax <= 0L) /* seccomp errno emulation */
- goto do_ret;
- goto done; /* seccomp trace/trap */
- }
-
+check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
@@ -302,7 +311,6 @@
/* Emulate a ret instruction. */
regs->ip = caller;
regs->sp += 8;
-done:
return true;
sigsegv: