linux/arch/x86/kernel/vsyscall_64.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
   3 *  Copyright 2003 Andi Kleen, SuSE Labs.
   4 *
   5 *  Thanks to hpa@transmeta.com for some useful hint.
   6 *  Special thanks to Ingo Molnar for his early experience with
   7 *  a different vsyscall implementation for Linux/IA32 and for the name.
   8 *
   9 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
  10 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
  11 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
  12 *  jumping out of line if necessary. We cannot add more with this
  13 *  mechanism because older kernels won't return -ENOSYS.
  14 *  If we want more than four we need a vDSO.
  15 *
  16 *  Note: the concept clashes with user mode linux. If you use UML and
  17 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  18 */
  19
  20#include <linux/time.h>
  21#include <linux/init.h>
  22#include <linux/kernel.h>
  23#include <linux/timer.h>
  24#include <linux/seqlock.h>
  25#include <linux/jiffies.h>
  26#include <linux/sysctl.h>
  27#include <linux/clocksource.h>
  28#include <linux/getcpu.h>
  29#include <linux/cpu.h>
  30#include <linux/smp.h>
  31#include <linux/notifier.h>
  32
  33#include <asm/vsyscall.h>
  34#include <asm/pgtable.h>
  35#include <asm/page.h>
  36#include <asm/unistd.h>
  37#include <asm/fixmap.h>
  38#include <asm/errno.h>
  39#include <asm/io.h>
  40#include <asm/segment.h>
  41#include <asm/desc.h>
  42#include <asm/topology.h>
  43#include <asm/vgtod.h>
  44
  45#define __vsyscall(nr) \
  46                __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
  47#define __syscall_clobber "r11","cx","memory"
  48
/*
 * vsyscall_gtod_data contains data that is:
 * - read-only from vsyscalls
 * - written by the timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line
 * ping-pong.
 */
  55int __vgetcpu_mode __section_vgetcpu_mode;
  56
  57struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
  58{
  59        .lock = SEQLOCK_UNLOCKED,
  60        .sysctl_enabled = 1,
  61};
  62
  63void update_vsyscall_tz(void)
  64{
  65        unsigned long flags;
  66
  67        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
  68        /* sys_tz has changed */
  69        vsyscall_gtod_data.sys_tz = sys_tz;
  70        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
  71}
  72
  73void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
  74{
  75        unsigned long flags;
  76
  77        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
  78        /* copy vsyscall data */
  79        vsyscall_gtod_data.clock.vread = clock->vread;
  80        vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
  81        vsyscall_gtod_data.clock.mask = clock->mask;
  82        vsyscall_gtod_data.clock.mult = clock->mult;
  83        vsyscall_gtod_data.clock.shift = clock->shift;
  84        vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
  85        vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
  86        vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
  87        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
  88}
  89
  90/* RED-PEN may want to readd seq locking, but then the variable should be
  91 * write-once.
  92 */
  93static __always_inline void do_get_tz(struct timezone * tz)
  94{
  95        *tz = __vsyscall_gtod_data.sys_tz;
  96}
  97
  98static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
  99{
 100        int ret;
 101        asm volatile("syscall"
 102                : "=a" (ret)
 103                : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
 104                : __syscall_clobber );
 105        return ret;
 106}
 107
 108static __always_inline long time_syscall(long *t)
 109{
 110        long secs;
 111        asm volatile("syscall"
 112                : "=a" (secs)
 113                : "0" (__NR_time),"D" (t) : __syscall_clobber);
 114        return secs;
 115}
 116
 117static __always_inline void do_vgettimeofday(struct timeval * tv)
 118{
 119        cycle_t now, base, mask, cycle_delta;
 120        unsigned seq;
 121        unsigned long mult, shift, nsec;
 122        cycle_t (*vread)(void);
 123        do {
 124                seq = read_seqbegin(&__vsyscall_gtod_data.lock);
 125
 126                vread = __vsyscall_gtod_data.clock.vread;
 127                if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
 128                        gettimeofday(tv,NULL);
 129                        return;
 130                }
 131                now = vread();
 132                base = __vsyscall_gtod_data.clock.cycle_last;
 133                mask = __vsyscall_gtod_data.clock.mask;
 134                mult = __vsyscall_gtod_data.clock.mult;
 135                shift = __vsyscall_gtod_data.clock.shift;
 136
 137                tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
 138                nsec = __vsyscall_gtod_data.wall_time_nsec;
 139        } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
 140
 141        /* calculate interval: */
 142        cycle_delta = (now - base) & mask;
 143        /* convert to nsecs: */
 144        nsec += (cycle_delta * mult) >> shift;
 145
 146        while (nsec >= NSEC_PER_SEC) {
 147                tv->tv_sec += 1;
 148                nsec -= NSEC_PER_SEC;
 149        }
 150        tv->tv_usec = nsec / NSEC_PER_USEC;
 151}
 152
 153int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
 154{
 155        if (tv)
 156                do_vgettimeofday(tv);
 157        if (tz)
 158                do_get_tz(tz);
 159        return 0;
 160}
 161
 162/* This will break when the xtime seconds get inaccurate, but that is
 163 * unlikely */
 164time_t __vsyscall(1) vtime(time_t *t)
 165{
 166        struct timeval tv;
 167        time_t result;
 168        if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
 169                return time_syscall(t);
 170
 171        vgettimeofday(&tv, NULL);
 172        result = tv.tv_sec;
 173        if (t)
 174                *t = result;
 175        return result;
 176}
 177
 178/* Fast way to get current CPU and node.
 179   This helps to do per node and per CPU caches in user space.
 180   The result is not guaranteed without CPU affinity, but usually
 181   works out because the scheduler tries to keep a thread on the same
 182   CPU.
 183
 184   tcache must point to a two element sized long array.
 185   All arguments can be NULL. */
 186long __vsyscall(2)
 187vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 188{
 189        unsigned int p;
 190        unsigned long j = 0;
 191
 192        /* Fast cache - only recompute value once per jiffies and avoid
 193           relatively costly rdtscp/cpuid otherwise.
 194           This works because the scheduler usually keeps the process
 195           on the same CPU and this syscall doesn't guarantee its
 196           results anyways.
 197           We do this here because otherwise user space would do it on
 198           its own in a likely inferior way (no access to jiffies).
 199           If you don't like it pass NULL. */
 200        if (tcache && tcache->blob[0] == (j = __jiffies)) {
 201                p = tcache->blob[1];
 202        } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
 203                /* Load per CPU data from RDTSCP */
 204                native_read_tscp(&p);
 205        } else {
 206                /* Load per CPU data from GDT */
 207                asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
 208        }
 209        if (tcache) {
 210                tcache->blob[0] = j;
 211                tcache->blob[1] = p;
 212        }
 213        if (cpu)
 214                *cpu = p & 0xfff;
 215        if (node)
 216                *node = p >> 12;
 217        return 0;
 218}
 219
 220static long __vsyscall(3) venosys_1(void)
 221{
 222        return -ENOSYS;
 223}
 224
 225#ifdef CONFIG_SYSCTL
 226
 227static int
 228vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
 229                       void __user *buffer, size_t *lenp, loff_t *ppos)
 230{
 231        return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
 232}
 233
 234static ctl_table kernel_table2[] = {
 235        { .procname = "vsyscall64",
 236          .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
 237          .mode = 0644,
 238          .proc_handler = vsyscall_sysctl_change },
 239        {}
 240};
 241
 242static ctl_table kernel_root_table2[] = {
 243        { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
 244          .child = kernel_table2 },
 245        {}
 246};
 247#endif
 248
 249/* Assume __initcall executes before all user space. Hopefully kmod
 250   doesn't violate that. We'll find out if it does. */
 251static void __cpuinit vsyscall_set_cpu(int cpu)
 252{
 253        unsigned long d;
 254        unsigned long node = 0;
 255#ifdef CONFIG_NUMA
 256        node = cpu_to_node(cpu);
 257#endif
 258        if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
 259                write_rdtscp_aux((node << 12) | cpu);
 260
 261        /* Store cpu number in limit so that it can be loaded quickly
 262           in user space in vgetcpu.
 263           12 bits for the CPU and 8 bits for the node. */
 264        d = 0x0f40000000000ULL;
 265        d |= cpu;
 266        d |= (node & 0xf) << 12;
 267        d |= (node >> 4) << 48;
 268        write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 269}
 270
 271static void __cpuinit cpu_vsyscall_init(void *arg)
 272{
 273        /* preemption should be already off */
 274        vsyscall_set_cpu(raw_smp_processor_id());
 275}
 276
 277static int __cpuinit
 278cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 279{
 280        long cpu = (long)arg;
 281        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
 282                smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
 283        return NOTIFY_DONE;
 284}
 285
 286void __init map_vsyscall(void)
 287{
 288        extern char __vsyscall_0;
 289        unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
 290
 291        /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 292        __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
 293}
 294
 295static int __init vsyscall_init(void)
 296{
 297        BUG_ON(((unsigned long) &vgettimeofday !=
 298                        VSYSCALL_ADDR(__NR_vgettimeofday)));
 299        BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 300        BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
 301        BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
 302#ifdef CONFIG_SYSCTL
 303        register_sysctl_table(kernel_root_table2);
 304#endif
 305        on_each_cpu(cpu_vsyscall_init, NULL, 1);
 306        hotcpu_notifier(cpu_vsyscall_notifier, 0);
 307        return 0;
 308}
 309
 310__initcall(vsyscall_init);
 311
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.