linux/arch/x86/xen/time.c
<<
>>
Prefs
   1/*
   2 * Xen time implementation.
   3 *
   4 * This is implemented in terms of a clocksource driver which uses
   5 * the hypervisor clock as a nanosecond timebase, and a clockevent
   6 * driver which uses the hypervisor's timer mechanism.
   7 *
   8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
   9 */
  10#include <linux/kernel.h>
  11#include <linux/interrupt.h>
  12#include <linux/clocksource.h>
  13#include <linux/clockchips.h>
  14#include <linux/kernel_stat.h>
  15#include <linux/math64.h>
  16
  17#include <asm/pvclock.h>
  18#include <asm/xen/hypervisor.h>
  19#include <asm/xen/hypercall.h>
  20
  21#include <xen/events.h>
  22#include <xen/interface/xen.h>
  23#include <xen/interface/vcpu.h>
  24
  25#include "xen-ops.h"
  26
  27#define XEN_SHIFT 22
  28
  29/* Xen may fire a timer up to this many ns early */
  30#define TIMER_SLOP      100000
  31#define NS_PER_TICK     (1000000000LL / HZ)
  32
  33/* runstate info updated by Xen */
  34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
  35
  36/* snapshots of runstate info */
  37static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
  38
  39/* unused ns of stolen and blocked time */
  40static DEFINE_PER_CPU(u64, residual_stolen);
  41static DEFINE_PER_CPU(u64, residual_blocked);
  42
  43/* return an consistent snapshot of 64-bit time/counter value */
  44static u64 get64(const u64 *p)
  45{
  46        u64 ret;
  47
  48        if (BITS_PER_LONG < 64) {
  49                u32 *p32 = (u32 *)p;
  50                u32 h, l;
  51
  52                /*
  53                 * Read high then low, and then make sure high is
  54                 * still the same; this will only loop if low wraps
  55                 * and carries into high.
  56                 * XXX some clean way to make this endian-proof?
  57                 */
  58                do {
  59                        h = p32[1];
  60                        barrier();
  61                        l = p32[0];
  62                        barrier();
  63                } while (p32[1] != h);
  64
  65                ret = (((u64)h) << 32) | l;
  66        } else
  67                ret = *p;
  68
  69        return ret;
  70}
  71
  72/*
  73 * Runstate accounting
  74 */
  75static void get_runstate_snapshot(struct vcpu_runstate_info *res)
  76{
  77        u64 state_time;
  78        struct vcpu_runstate_info *state;
  79
  80        BUG_ON(preemptible());
  81
  82        state = &__get_cpu_var(runstate);
  83
  84        /*
  85         * The runstate info is always updated by the hypervisor on
  86         * the current CPU, so there's no need to use anything
  87         * stronger than a compiler barrier when fetching it.
  88         */
  89        do {
  90                state_time = get64(&state->state_entry_time);
  91                barrier();
  92                *res = *state;
  93                barrier();
  94        } while (get64(&state->state_entry_time) != state_time);
  95}
  96
  97/* return true when a vcpu could run but has no real cpu to run on */
  98bool xen_vcpu_stolen(int vcpu)
  99{
 100        return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
 101}
 102
 103static void setup_runstate_info(int cpu)
 104{
 105        struct vcpu_register_runstate_memory_area area;
 106
 107        area.addr.v = &per_cpu(runstate, cpu);
 108
 109        if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
 110                               cpu, &area))
 111                BUG();
 112}
 113
 114static void do_stolen_accounting(void)
 115{
 116        struct vcpu_runstate_info state;
 117        struct vcpu_runstate_info *snap;
 118        s64 blocked, runnable, offline, stolen;
 119        cputime_t ticks;
 120
 121        get_runstate_snapshot(&state);
 122
 123        WARN_ON(state.state != RUNSTATE_running);
 124
 125        snap = &__get_cpu_var(runstate_snapshot);
 126
 127        /* work out how much time the VCPU has not been runn*ing*  */
 128        blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
 129        runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
 130        offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
 131
 132        *snap = state;
 133
 134        /* Add the appropriate number of ticks of stolen time,
 135           including any left-overs from last time. */
 136        stolen = runnable + offline + __get_cpu_var(residual_stolen);
 137
 138        if (stolen < 0)
 139                stolen = 0;
 140
 141        ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
 142        __get_cpu_var(residual_stolen) = stolen;
 143        account_steal_ticks(ticks);
 144
 145        /* Add the appropriate number of ticks of blocked time,
 146           including any left-overs from last time. */
 147        blocked += __get_cpu_var(residual_blocked);
 148
 149        if (blocked < 0)
 150                blocked = 0;
 151
 152        ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
 153        __get_cpu_var(residual_blocked) = blocked;
 154        account_idle_ticks(ticks);
 155}
 156
 157/*
 158 * Xen sched_clock implementation.  Returns the number of unstolen
 159 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 160 * states.
 161 */
 162unsigned long long xen_sched_clock(void)
 163{
 164        struct vcpu_runstate_info state;
 165        cycle_t now;
 166        u64 ret;
 167        s64 offset;
 168
 169        /*
 170         * Ideally sched_clock should be called on a per-cpu basis
 171         * anyway, so preempt should already be disabled, but that's
 172         * not current practice at the moment.
 173         */
 174        preempt_disable();
 175
 176        now = xen_clocksource_read();
 177
 178        get_runstate_snapshot(&state);
 179
 180        WARN_ON(state.state != RUNSTATE_running);
 181
 182        offset = now - state.state_entry_time;
 183        if (offset < 0)
 184                offset = 0;
 185
 186        ret = state.time[RUNSTATE_blocked] +
 187                state.time[RUNSTATE_running] +
 188                offset;
 189
 190        preempt_enable();
 191
 192        return ret;
 193}
 194
 195
 196/* Get the TSC speed from Xen */
 197unsigned long xen_tsc_khz(void)
 198{
 199        struct pvclock_vcpu_time_info *info =
 200                &HYPERVISOR_shared_info->vcpu_info[0].time;
 201
 202        return pvclock_tsc_khz(info);
 203}
 204
 205cycle_t xen_clocksource_read(void)
 206{
 207        struct pvclock_vcpu_time_info *src;
 208        cycle_t ret;
 209
 210        src = &get_cpu_var(xen_vcpu)->time;
 211        ret = pvclock_clocksource_read(src);
 212        put_cpu_var(xen_vcpu);
 213        return ret;
 214}
 215
 216static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
 217{
 218        return xen_clocksource_read();
 219}
 220
 221static void xen_read_wallclock(struct timespec *ts)
 222{
 223        struct shared_info *s = HYPERVISOR_shared_info;
 224        struct pvclock_wall_clock *wall_clock = &(s->wc);
 225        struct pvclock_vcpu_time_info *vcpu_time;
 226
 227        vcpu_time = &get_cpu_var(xen_vcpu)->time;
 228        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
 229        put_cpu_var(xen_vcpu);
 230}
 231
 232unsigned long xen_get_wallclock(void)
 233{
 234        struct timespec ts;
 235
 236        xen_read_wallclock(&ts);
 237        return ts.tv_sec;
 238}
 239
 240int xen_set_wallclock(unsigned long now)
 241{
 242        /* do nothing for domU */
 243        return -1;
 244}
 245
 246static struct clocksource xen_clocksource __read_mostly = {
 247        .name = "xen",
 248        .rating = 400,
 249        .read = xen_clocksource_get_cycles,
 250        .mask = ~0,
 251        .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
 252        .shift = XEN_SHIFT,
 253        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 254};
 255
 256/*
 257   Xen clockevent implementation
 258
 259   Xen has two clockevent implementations:
 260
 261   The old timer_op one works with all released versions of Xen prior
 262   to version 3.0.4.  This version of the hypervisor provides a
 263   single-shot timer with nanosecond resolution.  However, sharing the
 264   same event channel is a 100Hz tick which is delivered while the
 265   vcpu is running.  We don't care about or use this tick, but it will
 266   cause the core time code to think the timer fired too soon, and
 267   will end up resetting it each time.  It could be filtered, but
 268   doing so has complications when the ktime clocksource is not yet
 269   the xen clocksource (ie, at boot time).
 270
 271   The new vcpu_op-based timer interface allows the tick timer period
 272   to be changed or turned off.  The tick timer is not useful as a
 273   periodic timer because events are only delivered to running vcpus.
 274   The one-shot timer can report when a timeout is in the past, so
 275   set_next_event is capable of returning -ETIME when appropriate.
 276   This interface is used when available.
 277*/
 278
 279
 280/*
 281  Get a hypervisor absolute time.  In theory we could maintain an
 282  offset between the kernel's time and the hypervisor's time, and
 283  apply that to a kernel's absolute timeout.  Unfortunately the
 284  hypervisor and kernel times can drift even if the kernel is using
 285  the Xen clocksource, because ntp can warp the kernel's clocksource.
 286*/
 287static s64 get_abs_timeout(unsigned long delta)
 288{
 289        return xen_clocksource_read() + delta;
 290}
 291
 292static void xen_timerop_set_mode(enum clock_event_mode mode,
 293                                 struct clock_event_device *evt)
 294{
 295        switch (mode) {
 296        case CLOCK_EVT_MODE_PERIODIC:
 297                /* unsupported */
 298                WARN_ON(1);
 299                break;
 300
 301        case CLOCK_EVT_MODE_ONESHOT:
 302        case CLOCK_EVT_MODE_RESUME:
 303                break;
 304
 305        case CLOCK_EVT_MODE_UNUSED:
 306        case CLOCK_EVT_MODE_SHUTDOWN:
 307                HYPERVISOR_set_timer_op(0);  /* cancel timeout */
 308                break;
 309        }
 310}
 311
 312static int xen_timerop_set_next_event(unsigned long delta,
 313                                      struct clock_event_device *evt)
 314{
 315        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
 316
 317        if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 318                BUG();
 319
 320        /* We may have missed the deadline, but there's no real way of
 321           knowing for sure.  If the event was in the past, then we'll
 322           get an immediate interrupt. */
 323
 324        return 0;
 325}
 326
 327static const struct clock_event_device xen_timerop_clockevent = {
 328        .name = "xen",
 329        .features = CLOCK_EVT_FEAT_ONESHOT,
 330
 331        .max_delta_ns = 0xffffffff,
 332        .min_delta_ns = TIMER_SLOP,
 333
 334        .mult = 1,
 335        .shift = 0,
 336        .rating = 500,
 337
 338        .set_mode = xen_timerop_set_mode,
 339        .set_next_event = xen_timerop_set_next_event,
 340};
 341
 342
 343
 344static void xen_vcpuop_set_mode(enum clock_event_mode mode,
 345                                struct clock_event_device *evt)
 346{
 347        int cpu = smp_processor_id();
 348
 349        switch (mode) {
 350        case CLOCK_EVT_MODE_PERIODIC:
 351                WARN_ON(1);     /* unsupported */
 352                break;
 353
 354        case CLOCK_EVT_MODE_ONESHOT:
 355                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 356                        BUG();
 357                break;
 358
 359        case CLOCK_EVT_MODE_UNUSED:
 360        case CLOCK_EVT_MODE_SHUTDOWN:
 361                if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
 362                    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 363                        BUG();
 364                break;
 365        case CLOCK_EVT_MODE_RESUME:
 366                break;
 367        }
 368}
 369
 370static int xen_vcpuop_set_next_event(unsigned long delta,
 371                                     struct clock_event_device *evt)
 372{
 373        int cpu = smp_processor_id();
 374        struct vcpu_set_singleshot_timer single;
 375        int ret;
 376
 377        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
 378
 379        single.timeout_abs_ns = get_abs_timeout(delta);
 380        single.flags = VCPU_SSHOTTMR_future;
 381
 382        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
 383
 384        BUG_ON(ret != 0 && ret != -ETIME);
 385
 386        return ret;
 387}
 388
 389static const struct clock_event_device xen_vcpuop_clockevent = {
 390        .name = "xen",
 391        .features = CLOCK_EVT_FEAT_ONESHOT,
 392
 393        .max_delta_ns = 0xffffffff,
 394        .min_delta_ns = TIMER_SLOP,
 395
 396        .mult = 1,
 397        .shift = 0,
 398        .rating = 500,
 399
 400        .set_mode = xen_vcpuop_set_mode,
 401        .set_next_event = xen_vcpuop_set_next_event,
 402};
 403
 404static const struct clock_event_device *xen_clockevent =
 405        &xen_timerop_clockevent;
 406static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
 407
 408static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 409{
 410        struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
 411        irqreturn_t ret;
 412
 413        ret = IRQ_NONE;
 414        if (evt->event_handler) {
 415                evt->event_handler(evt);
 416                ret = IRQ_HANDLED;
 417        }
 418
 419        do_stolen_accounting();
 420
 421        return ret;
 422}
 423
 424void xen_setup_timer(int cpu)
 425{
 426        const char *name;
 427        struct clock_event_device *evt;
 428        int irq;
 429
 430        printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
 431
 432        name = kasprintf(GFP_KERNEL, "timer%d", cpu);
 433        if (!name)
 434                name = "<timer kasprintf failed>";
 435
 436        irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
 437                                      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
 438                                      name, NULL);
 439
 440        evt = &per_cpu(xen_clock_events, cpu);
 441        memcpy(evt, xen_clockevent, sizeof(*evt));
 442
 443        evt->cpumask = cpumask_of(cpu);
 444        evt->irq = irq;
 445
 446        setup_runstate_info(cpu);
 447}
 448
 449void xen_teardown_timer(int cpu)
 450{
 451        struct clock_event_device *evt;
 452        BUG_ON(cpu == 0);
 453        evt = &per_cpu(xen_clock_events, cpu);
 454        unbind_from_irqhandler(evt->irq, NULL);
 455}
 456
 457void xen_setup_cpu_clockevents(void)
 458{
 459        BUG_ON(preemptible());
 460
 461        clockevents_register_device(&__get_cpu_var(xen_clock_events));
 462}
 463
 464void xen_timer_resume(void)
 465{
 466        int cpu;
 467
 468        if (xen_clockevent != &xen_vcpuop_clockevent)
 469                return;
 470
 471        for_each_online_cpu(cpu) {
 472                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 473                        BUG();
 474        }
 475}
 476
 477__init void xen_time_init(void)
 478{
 479        int cpu = smp_processor_id();
 480
 481        clocksource_register(&xen_clocksource);
 482
 483        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
 484                /* Successfully turned off 100Hz tick, so we have the
 485                   vcpuop-based timer interface */
 486                printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 487                xen_clockevent = &xen_vcpuop_clockevent;
 488        }
 489
 490        /* Set initial system time with full resolution */
 491        xen_read_wallclock(&xtime);
 492        set_normalized_timespec(&wall_to_monotonic,
 493                                -xtime.tv_sec, -xtime.tv_nsec);
 494
 495        setup_force_cpu_cap(X86_FEATURE_TSC);
 496
 497        xen_setup_timer(cpu);
 498        xen_setup_cpu_clockevents();
 499}
 500
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.