linux/arch/x86/xen/time.c
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP      100000
#define NS_PER_TICK     (1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
/* return a consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
        u64 ret;

        if (BITS_PER_LONG < 64) {
                const u32 *p32 = (const u32 *)p;
                u32 h, l;

                /*
                 * Read high then low, and then make sure high is
                 * still the same; this will only loop if low wraps
                 * and carries into high.
                 * XXX some clean way to make this endian-proof?
                 */
                do {
                        h = p32[1];
                        barrier();
                        l = p32[0];
                        barrier();
                } while (p32[1] != h);

                ret = (((u64)h) << 32) | l;
        } else
                ret = *p;

        return ret;
}
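
/*
 * One possible endian-proof variant of get64() (an illustrative
 * sketch, not part of the original file): choose which 32-bit half
 * is the high word from <asm/byteorder.h> instead of hard-coding the
 * little-endian layout.  The name get64_any_endian is made up for
 * this example.
 */
#if 0
#include <asm/byteorder.h>

static u64 get64_any_endian(const u64 *p)
{
        const u32 *p32 = (const u32 *)p;
#ifdef __BIG_ENDIAN
        const int hi = 0, lo = 1;       /* high word stored first */
#else
        const int hi = 1, lo = 0;       /* high word stored second */
#endif
        u32 h, l;

        do {
                h = p32[hi];
                barrier();
                l = p32[lo];
                barrier();
        } while (p32[hi] != h);

        return ((u64)h << 32) | l;
}
#endif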

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
        u64 state_time;
        struct vcpu_runstate_info *state;

        BUG_ON(preemptible());

        state = &__get_cpu_var(runstate);

        /*
         * The runstate info is always updated by the hypervisor on
         * the current CPU, so there's no need to use anything
         * stronger than a compiler barrier when fetching it.
         */
        do {
                state_time = get64(&state->state_entry_time);
                barrier();
                *res = *state;
                barrier();
        } while (get64(&state->state_entry_time) != state_time);
}
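
/*
 * The loop above is a lockless "read, then re-check the version"
 * snapshot: state_entry_time changes whenever the hypervisor rewrites
 * the runstate, so re-reading it detects a torn copy.  A generic
 * sketch of the same pattern follows (illustrative only; struct
 * sample and its fields are made up for this example).
 */
#if 0
struct sample {
        u64 version;    /* rewritten by the writer on each update */
        u64 value;
};

static u64 read_stable(const struct sample *s)
{
        u64 ver, v;

        do {
                ver = get64(&s->version);
                barrier();
                v = s->value;
                barrier();
        } while (get64(&s->version) != ver);

        return v;
}
#endif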

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
        return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

static void setup_runstate_info(int cpu)
{
        struct vcpu_register_runstate_memory_area area;

        area.addr.v = &per_cpu(runstate, cpu);

        if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                               cpu, &area))
                BUG();
}

static void do_stolen_accounting(void)
{
        struct vcpu_runstate_info state;
        struct vcpu_runstate_info *snap;
        s64 blocked, runnable, offline, stolen;
        cputime_t ticks;

        get_runstate_snapshot(&state);

        WARN_ON(state.state != RUNSTATE_running);

        snap = &__get_cpu_var(runstate_snapshot);

        /* work out how much time the VCPU has not been runn*ing*  */
        blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
        runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
        offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

        *snap = state;

        /* Add the appropriate number of ticks of stolen time,
           including any left-overs from last time.  Passing NULL to
           account_steal_time accounts the time as stolen. */
        stolen = runnable + offline + __get_cpu_var(residual_stolen);

        if (stolen < 0)
                stolen = 0;

        ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
        __get_cpu_var(residual_stolen) = stolen;
        account_steal_time(NULL, ticks);

        /* Add the appropriate number of ticks of blocked time,
           including any left-overs from last time.  Passing idle to
           account_steal_time accounts the time as idle/wait. */
        blocked += __get_cpu_var(residual_blocked);

        if (blocked < 0)
                blocked = 0;

        ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
        __get_cpu_var(residual_blocked) = blocked;
        account_steal_time(idle_task(smp_processor_id()), ticks);
}
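
/*
 * A worked example of the tick conversion above (a sketch with
 * made-up numbers, not part of the original file): with HZ == 250,
 * NS_PER_TICK is 4,000,000ns.
 */
#if 0
static void stolen_accounting_example(void)
{
        u64 rem = 9500000ULL + 500000ULL;  /* new stolen ns + residual */
        u64 ticks;

        /* 10,000,000 / 4,000,000: two whole ticks, 2,000,000ns left */
        ticks = iter_div_u64_rem(rem, 4000000, &rem);

        /* ticks == 2 are accounted now; rem == 2000000 goes back into
           residual_stolen, so sub-tick stolen time is deferred to the
           next timer interrupt rather than lost. */
}
#endif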

/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
        u64 xen_khz = 1000000ULL << 32;
        const struct pvclock_vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;

        do_div(xen_khz, info->tsc_to_system_mul);
        if (info->tsc_shift < 0)
                xen_khz <<= -info->tsc_shift;
        else
                xen_khz >>= info->tsc_shift;

        return xen_khz;
}
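
/*
 * Worked example of the inversion above (illustrative numbers):
 * pvclock converts TSC deltas to nanoseconds roughly as
 *
 *      ns = ((tsc_delta << tsc_shift) * tsc_to_system_mul) >> 32
 *
 * so with tsc_to_system_mul == 0x80000000 (0.5 in 32.32 fixed point)
 * and tsc_shift == 0, each TSC tick is half a nanosecond:
 *
 *      xen_khz = (1000000 << 32) / 0x80000000 = 2000000
 *
 * i.e. a 2GHz TSC.
 */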

cycle_t xen_clocksource_read(void)
{
        struct pvclock_vcpu_time_info *src;
        cycle_t ret;

        src = &get_cpu_var(xen_vcpu)->time;
        ret = pvclock_clocksource_read(src);
        put_cpu_var(xen_vcpu);
        return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
        struct shared_info *s = HYPERVISOR_shared_info;
        struct pvclock_wall_clock *wall_clock = &(s->wc);
        struct pvclock_vcpu_time_info *vcpu_time;

        vcpu_time = &get_cpu_var(xen_vcpu)->time;
        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
        put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
        struct timespec ts;

        xen_read_wallclock(&ts);
        return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
        /* do nothing for domU */
        return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
        .name = "xen",
        .rating = 400,
        .read = xen_clocksource_read,
        .mask = ~0,
        .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
        .shift = XEN_SHIFT,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
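
/*
 * A quick check of the mult/shift pair above (illustrative): the
 * clocksource core converts cycles to time as
 *
 *      ns = (cycles * mult) >> shift
 *
 * and with mult == 1 << 22, shift == 22 this reduces to ns == cycles,
 * which is why xen_clocksource_read() can return nanoseconds directly.
 */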

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, a 100Hz
   tick, delivered whenever the vcpu is running, shares the same
   event channel.  We don't care about or use this tick, but it will
   cause the core time code to think the timer fired too soon, and
   will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/
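
/*
 * Minimal sketch of how the newer interface is detected (illustrative
 * only; the real probe lives in xen_time_init() below): a hypervisor
 * that accepts VCPUOP_stop_periodic_timer supports the vcpu_op
 * interface.  The function name is made up for this example.
 */
#if 0
static void probe_vcpuop_example(void)
{
        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
                               smp_processor_id(), NULL) == 0)
                xen_clockevent = &xen_vcpuop_clockevent;
}
#endif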


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to the kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
        return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
                                 struct clock_event_device *evt)
{
        switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
                /* unsupported */
                WARN_ON(1);
                break;

        case CLOCK_EVT_MODE_ONESHOT:
        case CLOCK_EVT_MODE_RESUME:
                break;

        case CLOCK_EVT_MODE_UNUSED:
        case CLOCK_EVT_MODE_SHUTDOWN:
                HYPERVISOR_set_timer_op(0);  /* cancel timeout */
                break;
        }
}

static int xen_timerop_set_next_event(unsigned long delta,
                                      struct clock_event_device *evt)
{
        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
                BUG();

        /* We may have missed the deadline, but there's no real way of
           knowing for sure.  If the event was in the past, then we'll
           get an immediate interrupt. */

        return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
        .name = "xen",
        .features = CLOCK_EVT_FEAT_ONESHOT,

        .max_delta_ns = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,

        .mult = 1,
        .shift = 0,
        .rating = 500,

        .set_mode = xen_timerop_set_mode,
        .set_next_event = xen_timerop_set_next_event,
};



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
                                struct clock_event_device *evt)
{
        int cpu = smp_processor_id();

        switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
                WARN_ON(1);     /* unsupported */
                break;

        case CLOCK_EVT_MODE_ONESHOT:
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
                        BUG();
                break;

        case CLOCK_EVT_MODE_UNUSED:
        case CLOCK_EVT_MODE_SHUTDOWN:
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
                    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
                        BUG();
                break;
        case CLOCK_EVT_MODE_RESUME:
                break;
        }
}

static int xen_vcpuop_set_next_event(unsigned long delta,
                                     struct clock_event_device *evt)
{
        int cpu = smp_processor_id();
        struct vcpu_set_singleshot_timer single;
        int ret;

        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

        single.timeout_abs_ns = get_abs_timeout(delta);
        single.flags = VCPU_SSHOTTMR_future;

        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

        BUG_ON(ret != 0 && ret != -ETIME);

        return ret;
}
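
/*
 * Sketch of how a caller might handle the -ETIME above (illustrative,
 * not part of the original file): VCPU_SSHOTTMR_future makes Xen
 * reject timeouts already in the past, so one reasonable response is
 * to retry with a slightly larger delta, much as the tick core does
 * when a clockevent reports a missed deadline.  The function name is
 * made up for this example.
 */
#if 0
static void program_with_retry(struct clock_event_device *evt,
                               unsigned long delta)
{
        while (evt->set_next_event(delta, evt) == -ETIME)
                delta += TIMER_SLOP;    /* push the deadline forward */
}
#endif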

static const struct clock_event_device xen_vcpuop_clockevent = {
        .name = "xen",
        .features = CLOCK_EVT_FEAT_ONESHOT,

        .max_delta_ns = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,

        .mult = 1,
        .shift = 0,
        .rating = 500,

        .set_mode = xen_vcpuop_set_mode,
        .set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
        &xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
        struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
        irqreturn_t ret;

        ret = IRQ_NONE;
        if (evt->event_handler) {
                evt->event_handler(evt);
                ret = IRQ_HANDLED;
        }

        do_stolen_accounting();

        return ret;
}

void xen_setup_timer(int cpu)
{
        const char *name;
        struct clock_event_device *evt;
        int irq;

        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

        name = kasprintf(GFP_KERNEL, "timer%d", cpu);
        if (!name)
                name = "<timer kasprintf failed>";

        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
                                      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
                                      name, NULL);

        evt = &per_cpu(xen_clock_events, cpu);
        memcpy(evt, xen_clockevent, sizeof(*evt));

        evt->cpumask = cpumask_of_cpu(cpu);
        evt->irq = irq;

        setup_runstate_info(cpu);
}

void xen_setup_cpu_clockevents(void)
{
        BUG_ON(preemptible());

        clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

void xen_timer_resume(void)
{
        int cpu;

        if (xen_clockevent != &xen_vcpuop_clockevent)
                return;

        for_each_online_cpu(cpu) {
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
                        BUG();
        }
}

__init void xen_time_init(void)
{
        int cpu = smp_processor_id();

        clocksource_register(&xen_clocksource);

        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
                /* Successfully turned off 100Hz tick, so we have the
                   vcpuop-based timer interface */
                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
                xen_clockevent = &xen_vcpuop_clockevent;
        }

        /* Set initial system time with full resolution */
        xen_read_wallclock(&xtime);
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);

        setup_force_cpu_cap(X86_FEATURE_TSC);

        xen_setup_timer(cpu);
        xen_setup_cpu_clockevents();
}