linux/arch/x86/xen/time.c
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP      100000
#define NS_PER_TICK     (1000000000LL / HZ)

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of a 64-bit time/counter value */
static u64 get64(const u64 *p)
{
        u64 ret;

        if (BITS_PER_LONG < 64) {
                u32 *p32 = (u32 *)p;
                u32 h, l;

                /*
                 * Read high then low, and then make sure high is
                 * still the same; this will only loop if low wraps
                 * and carries into high.
                 * XXX some clean way to make this endian-proof?
                 */
                do {
                        h = p32[1];
                        barrier();
                        l = p32[0];
                        barrier();
                } while (p32[1] != h);

                ret = (((u64)h) << 32) | l;
        } else
                ret = *p;

        return ret;
}
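
/*
 * Example with hypothetical values: if the counter moves from
 * 0x00000001ffffffff to 0x0000000200000000 between the reads of the
 * two halves, a single-pass read could return 0x0000000100000000,
 * off by 2^32 ns.  Re-reading the high word after the low word
 * notices the carry (1 != 2) and retries.
 */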

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
        u64 state_time;
        struct vcpu_runstate_info *state;

        BUG_ON(preemptible());

        state = &__get_cpu_var(runstate);

        /*
         * The runstate info is always updated by the hypervisor on
         * the current CPU, so there's no need to use anything
         * stronger than a compiler barrier when fetching it.
         */
        do {
                state_time = get64(&state->state_entry_time);
                barrier();
                *res = *state;
                barrier();
        } while (get64(&state->state_entry_time) != state_time);
}

/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
        return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}

static void setup_runstate_info(int cpu)
{
        struct vcpu_register_runstate_memory_area area;

        area.addr.v = &per_cpu(runstate, cpu);

        if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                               cpu, &area))
                BUG();
}

static void do_stolen_accounting(void)
{
        struct vcpu_runstate_info state;
        struct vcpu_runstate_info *snap;
        s64 blocked, runnable, offline, stolen;
        cputime_t ticks;

        get_runstate_snapshot(&state);

        WARN_ON(state.state != RUNSTATE_running);

        snap = &__get_cpu_var(runstate_snapshot);

        /* work out how much time the VCPU has not been runn*ing*  */
        blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
        runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
        offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

        *snap = state;

        /* Add the appropriate number of ticks of stolen time,
           including any left-overs from last time.  Passing NULL to
           account_steal_time accounts the time as stolen. */
        stolen = runnable + offline + __get_cpu_var(residual_stolen);

        if (stolen < 0)
                stolen = 0;

        ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
        __get_cpu_var(residual_stolen) = stolen;
        account_steal_time(NULL, ticks);

        /* Add the appropriate number of ticks of blocked time,
           including any left-overs from last time.  Passing idle to
           account_steal_time accounts the time as idle/wait. */
        blocked += __get_cpu_var(residual_blocked);

        if (blocked < 0)
                blocked = 0;

        ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
        __get_cpu_var(residual_blocked) = blocked;
        account_steal_time(idle_task(smp_processor_id()), ticks);
}
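
/*
 * Worked example with hypothetical numbers: at HZ=250, NS_PER_TICK is
 * 4,000,000.  If this interval saw 9,500,000ns of runnable+offline
 * time and 600,000ns was carried over in residual_stolen, stolen is
 * 10,100,000ns; iter_div_u64_rem() turns that into 2 ticks of steal
 * time and leaves 2,100,000ns in residual_stolen for next time.
 */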

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 * states.
 */
unsigned long long xen_sched_clock(void)
{
        struct vcpu_runstate_info state;
        cycle_t now;
        u64 ret;
        s64 offset;

        /*
         * Ideally sched_clock should be called on a per-cpu basis
         * anyway, so preempt should already be disabled, but that's
         * not current practice at the moment.
         */
        preempt_disable();

        now = xen_clocksource_read();

        get_runstate_snapshot(&state);

        WARN_ON(state.state != RUNSTATE_running);

        offset = now - state.state_entry_time;
        if (offset < 0)
                offset = 0;

        ret = state.time[RUNSTATE_blocked] +
                state.time[RUNSTATE_running] +
                offset;

        preempt_enable();

        return ret;
}
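
/*
 * In other words, assuming the vcpu is currently in RUNSTATE_running:
 *
 *   sched_clock = time[RUNSTATE_running] + time[RUNSTATE_blocked]
 *                 + (xen_clocksource_read() - state_entry_time)
 *
 * so time spent runnable-but-preempted or offline never shows up in
 * the scheduler's clock.
 */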


/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
        struct pvclock_vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;

        return pvclock_tsc_khz(info);
}

cycle_t xen_clocksource_read(void)
{
        struct pvclock_vcpu_time_info *src;
        cycle_t ret;

        src = &get_cpu_var(xen_vcpu)->time;
        ret = pvclock_clocksource_read(src);
        put_cpu_var(xen_vcpu);
        return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
        struct shared_info *s = HYPERVISOR_shared_info;
        struct pvclock_wall_clock *wall_clock = &(s->wc);
        struct pvclock_vcpu_time_info *vcpu_time;

        vcpu_time = &get_cpu_var(xen_vcpu)->time;
        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
        put_cpu_var(xen_vcpu);
}

unsigned long xen_get_wallclock(void)
{
        struct timespec ts;

        xen_read_wallclock(&ts);
        return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
        /* do nothing for domU */
        return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
        .name = "xen",
        .rating = 400,
        .read = xen_clocksource_read,
        .mask = ~0,
        .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
        .shift = XEN_SHIFT,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
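
/*
 * With mult == 1 << XEN_SHIFT and shift == XEN_SHIFT, the generic
 * cycle-to-nanosecond conversion (cycles * mult) >> shift reduces to
 * the identity, i.e. the "cycles" returned by xen_clocksource_read()
 * are already nanoseconds.
 */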

/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, sharing the
   same event channel is a 100Hz tick which is delivered while the
   vcpu is running.  We don't care about or use this tick, but it will
   cause the core time code to think the timer fired too soon, and
   will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (ie, at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
        return xen_clocksource_read() + delta;
}
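
/*
 * Worked example with hypothetical values: if the Xen clocksource
 * currently reads 1,000,000,000ns and the core asks for an event
 * 250,000ns from now, the hypervisor is programmed with the absolute
 * timeout 1,000,250,000ns in its own timebase.
 */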

static void xen_timerop_set_mode(enum clock_event_mode mode,
                                 struct clock_event_device *evt)
{
        switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
                /* unsupported */
                WARN_ON(1);
                break;

        case CLOCK_EVT_MODE_ONESHOT:
        case CLOCK_EVT_MODE_RESUME:
                break;

        case CLOCK_EVT_MODE_UNUSED:
        case CLOCK_EVT_MODE_SHUTDOWN:
                HYPERVISOR_set_timer_op(0);  /* cancel timeout */
                break;
        }
}

static int xen_timerop_set_next_event(unsigned long delta,
                                      struct clock_event_device *evt)
{
        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
                BUG();

        /* We may have missed the deadline, but there's no real way of
           knowing for sure.  If the event was in the past, then we'll
           get an immediate interrupt. */

        return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
        .name = "xen",
        .features = CLOCK_EVT_FEAT_ONESHOT,

        .max_delta_ns = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,

        .mult = 1,
        .shift = 0,
        .rating = 500,

        .set_mode = xen_timerop_set_mode,
        .set_next_event = xen_timerop_set_next_event,
};
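
/*
 * Like the clocksource, this clockevent works directly in nanoseconds:
 * mult == 1 and shift == 0 make the generic delta conversion the
 * identity, so max_delta_ns of 0xffffffff allows timeouts of roughly
 * 4.29 seconds, and min_delta_ns keeps requested events at least
 * TIMER_SLOP (100us) in the future.
 */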



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
                                struct clock_event_device *evt)
{
        int cpu = smp_processor_id();

        switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
                WARN_ON(1);     /* unsupported */
                break;

        case CLOCK_EVT_MODE_ONESHOT:
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
                        BUG();
                break;

        case CLOCK_EVT_MODE_UNUSED:
        case CLOCK_EVT_MODE_SHUTDOWN:
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
                    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
                        BUG();
                break;
        case CLOCK_EVT_MODE_RESUME:
                break;
        }
}

static int xen_vcpuop_set_next_event(unsigned long delta,
                                     struct clock_event_device *evt)
{
        int cpu = smp_processor_id();
        struct vcpu_set_singleshot_timer single;
        int ret;

        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

        single.timeout_abs_ns = get_abs_timeout(delta);
        single.flags = VCPU_SSHOTTMR_future;

        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

        BUG_ON(ret != 0 && ret != -ETIME);

        return ret;
}
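
/*
 * VCPU_SSHOTTMR_future asks Xen to refuse a timeout that is already in
 * the past and return -ETIME instead of firing immediately.  Passing
 * that error back from set_next_event lets the clockevents core pick a
 * later expiry and retry, rather than waiting for an event that will
 * never be delivered.
 */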

static const struct clock_event_device xen_vcpuop_clockevent = {
        .name = "xen",
        .features = CLOCK_EVT_FEAT_ONESHOT,

        .max_delta_ns = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,

        .mult = 1,
        .shift = 0,
        .rating = 500,

        .set_mode = xen_vcpuop_set_mode,
        .set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
        &xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
        struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
        irqreturn_t ret;

        ret = IRQ_NONE;
        if (evt->event_handler) {
                evt->event_handler(evt);
                ret = IRQ_HANDLED;
        }

        do_stolen_accounting();

        return ret;
}

void xen_setup_timer(int cpu)
{
        const char *name;
        struct clock_event_device *evt;
        int irq;

        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

        name = kasprintf(GFP_KERNEL, "timer%d", cpu);
        if (!name)
                name = "<timer kasprintf failed>";

        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
                                      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
                                      name, NULL);

        evt = &per_cpu(xen_clock_events, cpu);
        memcpy(evt, xen_clockevent, sizeof(*evt));

        evt->cpumask = cpumask_of_cpu(cpu);
        evt->irq = irq;

        setup_runstate_info(cpu);
}

void xen_teardown_timer(int cpu)
{
        struct clock_event_device *evt;
        BUG_ON(cpu == 0);
        evt = &per_cpu(xen_clock_events, cpu);
        unbind_from_irqhandler(evt->irq, NULL);
}

void xen_setup_cpu_clockevents(void)
{
        BUG_ON(preemptible());

        clockevents_register_device(&__get_cpu_var(xen_clock_events));
}
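
/*
 * Rough usage sketch: for each CPU, xen_setup_timer(cpu) binds
 * VIRQ_TIMER and fills the per-cpu clock_event_device from the chosen
 * template, and xen_setup_cpu_clockevents() then registers that device
 * from the CPU it belongs to.  xen_time_init() below does this for the
 * boot CPU; secondary CPUs are assumed to be handled the same way by
 * the Xen SMP bringup code.
 */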

void xen_timer_resume(void)
{
        int cpu;

        if (xen_clockevent != &xen_vcpuop_clockevent)
                return;

        for_each_online_cpu(cpu) {
                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
                        BUG();
        }
}

__init void xen_time_init(void)
{
        int cpu = smp_processor_id();

        clocksource_register(&xen_clocksource);

        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
                /* Successfully turned off 100Hz tick, so we have the
                   vcpuop-based timer interface */
                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
                xen_clockevent = &xen_vcpuop_clockevent;
        }

        /* Set initial system time with full resolution */
        xen_read_wallclock(&xtime);
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);

        setup_force_cpu_cap(X86_FEATURE_TSC);

        xen_setup_timer(cpu);
        xen_setup_cpu_clockevents();
}