linux/arch/x86/kernel/vsyscall_64.c
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
 *
 *  Thanks to hpa@transmeta.com for some useful hints.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at most 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *
 *  Note: the concept clashes with User Mode Linux.  UML users should
 *  use the vDSO.
 */

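/*
 * Illustrative sketch, not part of this file: a legacy binary reaches
 * these entries by calling their fixed addresses directly, e.g.
 *
 *	time_t t = ((time_t (*)(time_t *))0xffffffffff600400UL)(NULL);
 *
 * where 0xffffffffff600400 (-10Mbyte + 1024) is the time() slot.  New
 * code should call through the vDSO instead.
 */
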
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/topology.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

DEFINE_VVAR(int, vgetcpu_mode);
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);

static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;

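/*
 * Parse the "vsyscall=" boot parameter: "emulate" (the default) traps
 * and emulates the calls, "native" keeps the page executable, and
 * "none" disables the mechanism so that any use of it faults.
 */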
static int __init vsyscall_setup(char *str)
{
        if (str) {
                if (!strcmp("emulate", str))
                        vsyscall_mode = EMULATE;
                else if (!strcmp("native", str))
                        vsyscall_mode = NATIVE;
                else if (!strcmp("none", str))
                        vsyscall_mode = NONE;
                else
                        return -EINVAL;

                return 0;
        }

        return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);

void update_vsyscall_tz(void)
{
        vsyscall_gtod_data.sys_tz = sys_tz;
}

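/*
 * Called by the timekeeping core to publish a new time snapshot into the
 * vvar page.  Userspace readers (the vDSO and vsyscall page) retry on the
 * seqcount, so all fields are updated inside one write-side critical
 * section.
 */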
void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
                        struct clocksource *clock, u32 mult)
{
        struct timespec monotonic;

        write_seqcount_begin(&vsyscall_gtod_data.seq);

        /* copy vsyscall data */
        vsyscall_gtod_data.clock.vclock_mode    = clock->archdata.vclock_mode;
        vsyscall_gtod_data.clock.cycle_last     = clock->cycle_last;
        vsyscall_gtod_data.clock.mask           = clock->mask;
        vsyscall_gtod_data.clock.mult           = mult;
        vsyscall_gtod_data.clock.shift          = clock->shift;

        vsyscall_gtod_data.wall_time_sec        = wall_time->tv_sec;
        vsyscall_gtod_data.wall_time_nsec       = wall_time->tv_nsec;

        monotonic = timespec_add(*wall_time, *wtm);
        vsyscall_gtod_data.monotonic_time_sec   = monotonic.tv_sec;
        vsyscall_gtod_data.monotonic_time_nsec  = monotonic.tv_nsec;

        vsyscall_gtod_data.wall_time_coarse     = __current_kernel_time();
        vsyscall_gtod_data.monotonic_time_coarse =
                timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);

        write_seqcount_end(&vsyscall_gtod_data.seq);
}

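/*
 * Ratelimited diagnostic for suspicious or faulting vsyscall usage,
 * gated on show_unhandled_signals.
 */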
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
                              const char *message)
{
        if (!show_unhandled_signals)
                return;

        pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
                              level, current->comm, task_pid_nr(current),
                              message, regs->ip, regs->cs,
                              regs->sp, regs->ax, regs->si, regs->di);
}

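/*
 * Each vsyscall occupies a 1024-byte slot, so bits 10-11 of a correctly
 * aligned entry address select the vsyscall number: for example,
 * VSYSCALL_START + 0x800 decodes to nr 2 (getcpu).  Only 0-2 are valid.
 */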
static int addr_to_vsyscall_nr(unsigned long addr)
{
        int nr;

        if ((addr & ~0xC00UL) != VSYSCALL_START)
                return -EINVAL;

        nr = (addr & 0xC00UL) >> 10;
        if (nr >= 3)
                return -EINVAL;

        return nr;
}

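/*
 * Run seccomp as if the equivalent real syscall had been issued:
 * orig_ax/ax are set up so that filters and tracers observe the syscall
 * number rather than the faulting vsyscall address.
 */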
#ifdef CONFIG_SECCOMP
static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
{
        if (!seccomp_mode(&tsk->seccomp))
                return 0;
        task_pt_regs(tsk)->orig_ax = syscall_nr;
        task_pt_regs(tsk)->ax = syscall_nr;
        return __secure_computing(syscall_nr);
}
#else
#define vsyscall_seccomp(_tsk, _nr) 0
#endif

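/*
 * Check that a user pointer is writable; if not, deliver the SIGSEGV
 * that the hardware fault of a real (native) vsyscall would have
 * produced.
 */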
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
        /*
         * XXX: if access_ok, get_user, and put_user handled
         * sig_on_uaccess_error, this could go away.
         */

        if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
                siginfo_t info;
                struct thread_struct *thread = &current->thread;

                thread->error_code      = 6;  /* user fault, no page, write */
                thread->cr2             = ptr;
                thread->trap_nr         = X86_TRAP_PF;

                memset(&info, 0, sizeof(info));
                info.si_signo           = SIGSEGV;
                info.si_errno           = 0;
                info.si_code            = SEGV_MAPERR;
                info.si_addr            = (void __user *)ptr;

                force_sig_info(SIGSEGV, &info, current);
                return false;
        } else {
                return true;
        }
}

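/*
 * Called from the page fault handler when user code attempts to execute
 * from the vsyscall page and faults (EMULATE and NONE modes).  Returns
 * true if the fault was fully handled here.
 */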
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
        struct task_struct *tsk;
        unsigned long caller;
        int vsyscall_nr;
        int prev_sig_on_uaccess_error;
        long ret;
        int skip;

        /*
         * No point in checking CS -- the only way to get here is a user mode
         * trap to a high address, which means that we're in 64-bit user code.
         */

        WARN_ON_ONCE(address != regs->ip);

        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
                return false;
        }

        vsyscall_nr = addr_to_vsyscall_nr(address);

        trace_emulate_vsyscall(vsyscall_nr);

        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }

        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }

        tsk = current;
        /*
         * With a real vsyscall, page faults cause SIGSEGV.  We want to
         * preserve that behavior to make writing exploits harder.
         */
        prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
        current_thread_info()->sig_on_uaccess_error = 1;

        /*
         * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
         * 64-bit, so we don't need to special-case it here.  For all the
         * vsyscalls, NULL means "don't write anything" not "write it at
         * address 0".
         */
        ret = -EFAULT;
        skip = 0;
        switch (vsyscall_nr) {
        case 0:
                skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
                if (skip)
                        break;

                if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
                    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
                        break;

                ret = sys_gettimeofday(
                        (struct timeval __user *)regs->di,
                        (struct timezone __user *)regs->si);
                break;

        case 1:
                skip = vsyscall_seccomp(tsk, __NR_time);
                if (skip)
                        break;

                if (!write_ok_or_segv(regs->di, sizeof(time_t)))
                        break;

                ret = sys_time((time_t __user *)regs->di);
                break;

        case 2:
                skip = vsyscall_seccomp(tsk, __NR_getcpu);
                if (skip)
                        break;

                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
                    !write_ok_or_segv(regs->si, sizeof(unsigned)))
                        break;

                ret = sys_getcpu((unsigned __user *)regs->di,
                                 (unsigned __user *)regs->si,
                                 NULL);
                break;
        }

        current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;

        if (skip) {
                if ((long)regs->ax <= 0L) /* seccomp errno emulation */
                        goto do_ret;
                goto done; /* seccomp trace/trap */
        }

        if (ret == -EFAULT) {
                /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");

                /*
                 * If we failed to generate a signal for any reason,
                 * generate one here.  (This should be impossible.)
                 */
                if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
                                 !sigismember(&tsk->pending.signal, SIGSEGV)))
                        goto sigsegv;

                return true;  /* Don't emulate the ret. */
        }

        regs->ax = ret;

do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;
done:
        return true;

sigsegv:
        force_sig(SIGSEGV, current);
        return true;
}

/*
 * Assume __initcall executes before all user space. Hopefully kmod
 * doesn't violate that. We'll find out if it does.
 */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
        unsigned long d;
        unsigned long node = 0;
#ifdef CONFIG_NUMA
        node = cpu_to_node(cpu);
#endif
        if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
                write_rdtscp_aux((node << 12) | cpu);

        /*
         * Store cpu number in limit so that it can be loaded quickly
         * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
         */
        d = 0x0f40000000000ULL;
        d |= cpu;
        d |= (node & 0xf) << 12;
        d |= (node >> 4) << 48;

        write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
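
/*
 * User-space side, for reference: the vDSO's vgetcpu() reads this limit
 * back with LSL (or uses RDTSCP where available), roughly:
 *
 *	unsigned long p;
 *	asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
 *	cpu  = p & 0xfff;
 *	node = p >> 12;
 */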

static void __cpuinit cpu_vsyscall_init(void *arg)
{
        /* preemption should be already off */
        vsyscall_set_cpu(raw_smp_processor_id());
}

static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
        long cpu = (long)arg;

        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
                smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);

        return NOTIFY_DONE;
}

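/*
 * Map the vsyscall and vvar fixmap pages.  In NATIVE mode the vsyscall
 * page is executable; in EMULATE (and NONE) mode it is mapped
 * non-executable, so any call into it faults and is handled by
 * emulate_vsyscall().
 */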
void __init map_vsyscall(void)
{
        extern char __vsyscall_page;
        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
        extern char __vvar_page;
        unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);

        __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
                     vsyscall_mode == NATIVE
                     ? PAGE_KERNEL_VSYSCALL
                     : PAGE_KERNEL_VVAR);
        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
                     (unsigned long)VSYSCALL_START);

        __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
        BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
                     (unsigned long)VVAR_ADDRESS);
}

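/*
 * Boot-time init: program the per-cpu GDT entry on every online CPU and
 * keep doing so as further CPUs come online.
 */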
static int __init vsyscall_init(void)
{
        BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));

        on_each_cpu(cpu_vsyscall_init, NULL, 1);
        /* notifier priority > KVM */
        hotcpu_notifier(cpu_vsyscall_notifier, 30);

        return 0;
}
__initcall(vsyscall_init);