linux/arch/x86/kvm/x86.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * derived from drivers/kvm/kvm_main.c
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright (C) 2008 Qumranet, Inc.
   8 * Copyright IBM Corporation, 2008
   9 *
  10 * Authors:
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *   Amit Shah    <amit.shah@qumranet.com>
  14 *   Ben-Ami Yassour <benami@il.ibm.com>
  15 *
  16 * This work is licensed under the terms of the GNU GPL, version 2.  See
  17 * the COPYING file in the top-level directory.
  18 *
  19 */
  20
  21#include <linux/kvm_host.h>
  22#include "irq.h"
  23#include "mmu.h"
  24#include "i8254.h"
  25#include "tss.h"
  26#include "kvm_cache_regs.h"
  27#include "x86.h"
  28
  29#include <linux/clocksource.h>
  30#include <linux/interrupt.h>
  31#include <linux/kvm.h>
  32#include <linux/fs.h>
  33#include <linux/vmalloc.h>
  34#include <linux/module.h>
  35#include <linux/mman.h>
  36#include <linux/highmem.h>
  37#include <linux/iommu.h>
  38#include <linux/intel-iommu.h>
  39#include <linux/cpufreq.h>
  40#include <linux/user-return-notifier.h>
  41#include <linux/srcu.h>
  42#include <linux/slab.h>
  43#include <linux/perf_event.h>
  44#include <trace/events/kvm.h>
  45
  46#define CREATE_TRACE_POINTS
  47#include "trace.h"
  48
  49#include <asm/debugreg.h>
  50#include <asm/uaccess.h>
  51#include <asm/msr.h>
  52#include <asm/desc.h>
  53#include <asm/mtrr.h>
  54#include <asm/mce.h>
  55
  56#define MAX_IO_MSRS 256
  57#define CR0_RESERVED_BITS                                               \
  58        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  59                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  60                          | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  61#define CR4_RESERVED_BITS                                               \
  62        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  63                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  64                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  65                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  66
  67#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  68
  69#define KVM_MAX_MCE_BANKS 32
  70#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
  71
  72/* EFER defaults:
   73 * - enable syscall by default because it is emulated by KVM
   74 * - enable LME and LMA by default on 64-bit KVM
  75 */
  76#ifdef CONFIG_X86_64
  77static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  78#else
  79static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  80#endif
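/*
 * For reference: the 64-bit mask 0xfffffffffffffafe leaves bits 0 (SCE),
 * 8 (LME) and 10 (LMA) writable by the guest; the 32-bit mask only leaves
 * bit 0 (SCE) writable.  Further bits (e.g. EFER.NX, bit 11) can be made
 * writable later via kvm_enable_efer_bits().
 */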
  81
  82#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  83#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  84
  85static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  86static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  87                                    struct kvm_cpuid_entry2 __user *entries);
  88
  89struct kvm_x86_ops *kvm_x86_ops;
  90EXPORT_SYMBOL_GPL(kvm_x86_ops);
  91
  92int ignore_msrs = 0;
  93module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
  94
  95#define KVM_NR_SHARED_MSRS 16
  96
  97struct kvm_shared_msrs_global {
  98        int nr;
  99        u32 msrs[KVM_NR_SHARED_MSRS];
 100};
 101
 102struct kvm_shared_msrs {
 103        struct user_return_notifier urn;
 104        bool registered;
 105        struct kvm_shared_msr_values {
 106                u64 host;
 107                u64 curr;
 108        } values[KVM_NR_SHARED_MSRS];
 109};
 110
 111static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 112static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
 113
 114struct kvm_stats_debugfs_item debugfs_entries[] = {
 115        { "pf_fixed", VCPU_STAT(pf_fixed) },
 116        { "pf_guest", VCPU_STAT(pf_guest) },
 117        { "tlb_flush", VCPU_STAT(tlb_flush) },
 118        { "invlpg", VCPU_STAT(invlpg) },
 119        { "exits", VCPU_STAT(exits) },
 120        { "io_exits", VCPU_STAT(io_exits) },
 121        { "mmio_exits", VCPU_STAT(mmio_exits) },
 122        { "signal_exits", VCPU_STAT(signal_exits) },
 123        { "irq_window", VCPU_STAT(irq_window_exits) },
 124        { "nmi_window", VCPU_STAT(nmi_window_exits) },
 125        { "halt_exits", VCPU_STAT(halt_exits) },
 126        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
 127        { "hypercalls", VCPU_STAT(hypercalls) },
 128        { "request_irq", VCPU_STAT(request_irq_exits) },
 129        { "irq_exits", VCPU_STAT(irq_exits) },
 130        { "host_state_reload", VCPU_STAT(host_state_reload) },
 131        { "efer_reload", VCPU_STAT(efer_reload) },
 132        { "fpu_reload", VCPU_STAT(fpu_reload) },
 133        { "insn_emulation", VCPU_STAT(insn_emulation) },
 134        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 135        { "irq_injections", VCPU_STAT(irq_injections) },
 136        { "nmi_injections", VCPU_STAT(nmi_injections) },
 137        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 138        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 139        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 140        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 141        { "mmu_flooded", VM_STAT(mmu_flooded) },
 142        { "mmu_recycled", VM_STAT(mmu_recycled) },
 143        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 144        { "mmu_unsync", VM_STAT(mmu_unsync) },
 145        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 146        { "largepages", VM_STAT(lpages) },
 147        { NULL }
 148};
 149
 150static void kvm_on_user_return(struct user_return_notifier *urn)
 151{
 152        unsigned slot;
 153        struct kvm_shared_msrs *locals
 154                = container_of(urn, struct kvm_shared_msrs, urn);
 155        struct kvm_shared_msr_values *values;
 156
 157        for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
 158                values = &locals->values[slot];
 159                if (values->host != values->curr) {
 160                        wrmsrl(shared_msrs_global.msrs[slot], values->host);
 161                        values->curr = values->host;
 162                }
 163        }
 164        locals->registered = false;
 165        user_return_notifier_unregister(urn);
 166}
 167
 168static void shared_msr_update(unsigned slot, u32 msr)
 169{
 170        struct kvm_shared_msrs *smsr;
 171        u64 value;
 172
 173        smsr = &__get_cpu_var(shared_msrs);
  174        /* read only, and nobody should be modifying it at this point,
  175         * so no lock is needed */
 176        if (slot >= shared_msrs_global.nr) {
 177                printk(KERN_ERR "kvm: invalid MSR slot!");
 178                return;
 179        }
 180        rdmsrl_safe(msr, &value);
 181        smsr->values[slot].host = value;
 182        smsr->values[slot].curr = value;
 183}
 184
 185void kvm_define_shared_msr(unsigned slot, u32 msr)
 186{
 187        if (slot >= shared_msrs_global.nr)
 188                shared_msrs_global.nr = slot + 1;
 189        shared_msrs_global.msrs[slot] = msr;
  190        /* make sure shared_msrs_global has been updated before it is read */
 191        smp_wmb();
 192}
 193EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
 194
 195static void kvm_shared_msr_cpu_online(void)
 196{
 197        unsigned i;
 198
 199        for (i = 0; i < shared_msrs_global.nr; ++i)
 200                shared_msr_update(i, shared_msrs_global.msrs[i]);
 201}
 202
 203void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
 204{
 205        struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 206
 207        if (((value ^ smsr->values[slot].curr) & mask) == 0)
 208                return;
 209        smsr->values[slot].curr = value;
 210        wrmsrl(shared_msrs_global.msrs[slot], value);
 211        if (!smsr->registered) {
 212                smsr->urn.on_user_return = kvm_on_user_return;
 213                user_return_notifier_register(&smsr->urn);
 214                smsr->registered = true;
 215        }
 216}
 217EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
 218
 219static void drop_user_return_notifiers(void *ignore)
 220{
 221        struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 222
 223        if (smsr->registered)
 224                kvm_on_user_return(&smsr->urn);
 225}
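/*
 * Usage sketch for the shared-MSR machinery above (illustrative only; the
 * real call sites live in the vendor modules, e.g. vmx.c/svm.c, and the slot
 * number and mask shown here are made up):
 *
 *	kvm_define_shared_msr(0, MSR_SYSCALL_MASK);	// once, at setup time
 *	...
 *	kvm_set_shared_msr(0, guest_val, ~0ULL);	// on the vcpu-run path
 *
 * The first kvm_set_shared_msr() on a cpu registers a user-return notifier;
 * kvm_on_user_return() later restores the host values when the task returns
 * to userspace, so the host MSRs need not be restored on every vmexit.
 */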
 226
  227u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
  228{
  229        /* Both the in-kernel and userspace irqchip cases read the same
  230         * cached value, so there is no need to branch on
  231         * irqchip_in_kernel() here. */
  232        return vcpu->arch.apic_base;
  233}
 234EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 235
 236void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 237{
 238        /* TODO: reserve bits check */
 239        if (irqchip_in_kernel(vcpu->kvm))
 240                kvm_lapic_set_base(vcpu, data);
 241        else
 242                vcpu->arch.apic_base = data;
 243}
 244EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 245
 246#define EXCPT_BENIGN            0
 247#define EXCPT_CONTRIBUTORY      1
 248#define EXCPT_PF                2
 249
 250static int exception_class(int vector)
 251{
 252        switch (vector) {
 253        case PF_VECTOR:
 254                return EXCPT_PF;
 255        case DE_VECTOR:
 256        case TS_VECTOR:
 257        case NP_VECTOR:
 258        case SS_VECTOR:
 259        case GP_VECTOR:
 260                return EXCPT_CONTRIBUTORY;
 261        default:
 262                break;
 263        }
 264        return EXCPT_BENIGN;
 265}
 266
 267static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 268                unsigned nr, bool has_error, u32 error_code,
 269                bool reinject)
 270{
 271        u32 prev_nr;
 272        int class1, class2;
 273
 274        if (!vcpu->arch.exception.pending) {
 275        queue:
 276                vcpu->arch.exception.pending = true;
 277                vcpu->arch.exception.has_error_code = has_error;
 278                vcpu->arch.exception.nr = nr;
 279                vcpu->arch.exception.error_code = error_code;
 280                vcpu->arch.exception.reinject = reinject;
 281                return;
 282        }
 283
  284        /* an exception is already pending; see how the new one combines with it */
 285        prev_nr = vcpu->arch.exception.nr;
 286        if (prev_nr == DF_VECTOR) {
 287                /* triple fault -> shutdown */
 288                set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 289                return;
 290        }
 291        class1 = exception_class(prev_nr);
 292        class2 = exception_class(nr);
 293        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 294                || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
 295                /* generate double fault per SDM Table 5-5 */
 296                vcpu->arch.exception.pending = true;
 297                vcpu->arch.exception.has_error_code = true;
 298                vcpu->arch.exception.nr = DF_VECTOR;
 299                vcpu->arch.exception.error_code = 0;
 300        } else
  301                /* replace previous exception with a new one in the hope
  302                   that instruction re-execution will regenerate the lost
  303                   exception */
 304                goto queue;
 305}
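/*
 * Concretely, per the SDM table referenced above: two contributory
 * exceptions (#DE, #TS, #NP, #SS, #GP) merge into a #DF, as does a pending
 * #PF followed by another #PF or by a contributory exception.  A pending
 * #DF followed by anything escalates to a triple fault
 * (KVM_REQ_TRIPLE_FAULT).  Every other combination simply replaces the
 * pending exception via the "queue:" path above.
 */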
 306
 307void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 308{
 309        kvm_multiple_exception(vcpu, nr, false, 0, false);
 310}
 311EXPORT_SYMBOL_GPL(kvm_queue_exception);
 312
 313void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 314{
 315        kvm_multiple_exception(vcpu, nr, false, 0, true);
 316}
 317EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 318
 319void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 320                           u32 error_code)
 321{
 322        ++vcpu->stat.pf_guest;
 323        vcpu->arch.cr2 = addr;
 324        kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 325}
 326
 327void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 328{
 329        vcpu->arch.nmi_pending = 1;
 330}
 331EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 332
 333void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 334{
 335        kvm_multiple_exception(vcpu, nr, true, error_code, false);
 336}
 337EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 338
 339void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 340{
 341        kvm_multiple_exception(vcpu, nr, true, error_code, true);
 342}
 343EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 344
 345/*
  346 * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
  347 * a #GP and return false.
 348 */
 349bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 350{
 351        if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 352                return true;
 353        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 354        return false;
 355}
 356EXPORT_SYMBOL_GPL(kvm_require_cpl);
 357
 358/*
  359 * Load the PAE pdptrs.  Return true if they are all valid.
 360 */
 361int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 362{
 363        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 364        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 365        int i;
 366        int ret;
 367        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 368
 369        ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 370                                  offset * sizeof(u64), sizeof(pdpte));
 371        if (ret < 0) {
 372                ret = 0;
 373                goto out;
 374        }
 375        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 376                if (is_present_gpte(pdpte[i]) &&
 377                    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 378                        ret = 0;
 379                        goto out;
 380                }
 381        }
 382        ret = 1;
 383
 384        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 385        __set_bit(VCPU_EXREG_PDPTR,
 386                  (unsigned long *)&vcpu->arch.regs_avail);
 387        __set_bit(VCPU_EXREG_PDPTR,
 388                  (unsigned long *)&vcpu->arch.regs_dirty);
 389out:
 390
 391        return ret;
 392}
 393EXPORT_SYMBOL_GPL(load_pdptrs);
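/*
 * In PAE mode CR3 points at a 32-byte-aligned table of four 64-bit PDPTEs,
 * which is what the arithmetic above extracts: "offset" is (cr3 & 0xfe0)
 * expressed in u64 units.  For example, cr3 = 0x12345fa0 gives offset 0x1f4,
 * i.e. a 32-byte read at guest-physical 0x12345fa0.  Reserved bits are only
 * checked for present entries; a non-present PDPTE is accepted as valid.
 */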
 394
 395static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 396{
 397        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 398        bool changed = true;
 399        int r;
 400
 401        if (is_long_mode(vcpu) || !is_pae(vcpu))
 402                return false;
 403
 404        if (!test_bit(VCPU_EXREG_PDPTR,
 405                      (unsigned long *)&vcpu->arch.regs_avail))
 406                return true;
 407
 408        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 409        if (r < 0)
 410                goto out;
 411        changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 412out:
 413
 414        return changed;
 415}
 416
 417void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 418{
 419        cr0 |= X86_CR0_ET;
 420
 421#ifdef CONFIG_X86_64
 422        if (cr0 & 0xffffffff00000000UL) {
 423                kvm_inject_gp(vcpu, 0);
 424                return;
 425        }
 426#endif
 427
 428        cr0 &= ~CR0_RESERVED_BITS;
 429
 430        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 431                kvm_inject_gp(vcpu, 0);
 432                return;
 433        }
 434
 435        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 436                kvm_inject_gp(vcpu, 0);
 437                return;
 438        }
 439
 440        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 441#ifdef CONFIG_X86_64
 442                if ((vcpu->arch.efer & EFER_LME)) {
 443                        int cs_db, cs_l;
 444
 445                        if (!is_pae(vcpu)) {
 446                                kvm_inject_gp(vcpu, 0);
 447                                return;
 448                        }
 449                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 450                        if (cs_l) {
 451                                kvm_inject_gp(vcpu, 0);
 452                                return;
 453
 454                        }
 455                } else
 456#endif
 457                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 458                        kvm_inject_gp(vcpu, 0);
 459                        return;
 460                }
 461
 462        }
 463
 464        kvm_x86_ops->set_cr0(vcpu, cr0);
 465
 466        kvm_mmu_reset_context(vcpu);
 467        return;
 468}
 469EXPORT_SYMBOL_GPL(kvm_set_cr0);
 470
 471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 472{
 473        kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 474}
 475EXPORT_SYMBOL_GPL(kvm_lmsw);
 476
 477void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 478{
 479        unsigned long old_cr4 = kvm_read_cr4(vcpu);
 480        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 481
 482        if (cr4 & CR4_RESERVED_BITS) {
 483                kvm_inject_gp(vcpu, 0);
 484                return;
 485        }
 486
 487        if (is_long_mode(vcpu)) {
 488                if (!(cr4 & X86_CR4_PAE)) {
 489                        kvm_inject_gp(vcpu, 0);
 490                        return;
 491                }
 492        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 493                   && ((cr4 ^ old_cr4) & pdptr_bits)
 494                   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 495                kvm_inject_gp(vcpu, 0);
 496                return;
 497        }
 498
 499        if (cr4 & X86_CR4_VMXE) {
 500                kvm_inject_gp(vcpu, 0);
 501                return;
 502        }
 503        kvm_x86_ops->set_cr4(vcpu, cr4);
 504        vcpu->arch.cr4 = cr4;
 505        kvm_mmu_reset_context(vcpu);
 506}
 507EXPORT_SYMBOL_GPL(kvm_set_cr4);
 508
 509void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 510{
 511        if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 512                kvm_mmu_sync_roots(vcpu);
 513                kvm_mmu_flush_tlb(vcpu);
 514                return;
 515        }
 516
 517        if (is_long_mode(vcpu)) {
 518                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 519                        kvm_inject_gp(vcpu, 0);
 520                        return;
 521                }
 522        } else {
 523                if (is_pae(vcpu)) {
 524                        if (cr3 & CR3_PAE_RESERVED_BITS) {
 525                                kvm_inject_gp(vcpu, 0);
 526                                return;
 527                        }
 528                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 529                                kvm_inject_gp(vcpu, 0);
 530                                return;
 531                        }
 532                }
 533                /*
 534                 * We don't check reserved bits in nonpae mode, because
 535                 * this isn't enforced, and VMware depends on this.
 536                 */
 537        }
 538
 539        /*
 540         * Does the new cr3 value map to physical memory? (Note, we
 541         * catch an invalid cr3 even in real-mode, because it would
 542         * cause trouble later on when we turn on paging anyway.)
 543         *
 544         * A real CPU would silently accept an invalid cr3 and would
 545         * attempt to use it - with largely undefined (and often hard
 546         * to debug) behavior on the guest side.
 547         */
 548        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 549                kvm_inject_gp(vcpu, 0);
 550        else {
 551                vcpu->arch.cr3 = cr3;
 552                vcpu->arch.mmu.new_cr3(vcpu);
 553        }
 554}
 555EXPORT_SYMBOL_GPL(kvm_set_cr3);
 556
 557void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 558{
 559        if (cr8 & CR8_RESERVED_BITS) {
 560                kvm_inject_gp(vcpu, 0);
 561                return;
 562        }
 563        if (irqchip_in_kernel(vcpu->kvm))
 564                kvm_lapic_set_tpr(vcpu, cr8);
 565        else
 566                vcpu->arch.cr8 = cr8;
 567}
 568EXPORT_SYMBOL_GPL(kvm_set_cr8);
 569
 570unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 571{
 572        if (irqchip_in_kernel(vcpu->kvm))
 573                return kvm_lapic_get_cr8(vcpu);
 574        else
 575                return vcpu->arch.cr8;
 576}
 577EXPORT_SYMBOL_GPL(kvm_get_cr8);
 578
 579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 580{
 581        switch (dr) {
 582        case 0 ... 3:
 583                vcpu->arch.db[dr] = val;
 584                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 585                        vcpu->arch.eff_db[dr] = val;
 586                break;
 587        case 4:
 588                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 589                        kvm_queue_exception(vcpu, UD_VECTOR);
 590                        return 1;
 591                }
 592                /* fall through */
 593        case 6:
 594                if (val & 0xffffffff00000000ULL) {
 595                        kvm_inject_gp(vcpu, 0);
 596                        return 1;
 597                }
 598                vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 599                break;
 600        case 5:
 601                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 602                        kvm_queue_exception(vcpu, UD_VECTOR);
 603                        return 1;
 604                }
 605                /* fall through */
 606        default: /* 7 */
 607                if (val & 0xffffffff00000000ULL) {
 608                        kvm_inject_gp(vcpu, 0);
 609                        return 1;
 610                }
 611                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 612                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 613                        kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
 614                        vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
 615                }
 616                break;
 617        }
 618
 619        return 0;
 620}
 621EXPORT_SYMBOL_GPL(kvm_set_dr);
 622
 623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 624{
 625        switch (dr) {
 626        case 0 ... 3:
 627                *val = vcpu->arch.db[dr];
 628                break;
 629        case 4:
 630                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 631                        kvm_queue_exception(vcpu, UD_VECTOR);
 632                        return 1;
 633                }
 634                /* fall through */
 635        case 6:
 636                *val = vcpu->arch.dr6;
 637                break;
 638        case 5:
 639                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
 640                        kvm_queue_exception(vcpu, UD_VECTOR);
 641                        return 1;
 642                }
 643                /* fall through */
 644        default: /* 7 */
 645                *val = vcpu->arch.dr7;
 646                break;
 647        }
 648
 649        return 0;
 650}
 651EXPORT_SYMBOL_GPL(kvm_get_dr);
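/*
 * Note on the DR4/DR5 cases in the two functions above: architecturally,
 * DR4 and DR5 alias DR6 and DR7 while CR4.DE is clear, and accessing them
 * raises #UD once CR4.DE is set.  The fall-throughs implement exactly that.
 */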
 652
 653static inline u32 bit(int bitno)
 654{
 655        return 1 << (bitno & 31);
 656}
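/*
 * bit() turns an X86_FEATURE_* constant into a mask within a single 32-bit
 * CPUID register: the constants encode "32 * word + bit", so masking with 31
 * keeps just the bit index.  It is used against feat->ecx/feat->edx in
 * set_efer() below, e.g. bit(X86_FEATURE_SVM).
 */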
 657
 658/*
  659 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
  660 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  661 *
  662 * This list is modified at module load time to reflect the
  663 * capabilities of the host cpu. The capability test skips MSRs that are
  664 * kvm-specific; those are put at the beginning of the list.
 665 */
 666
 667#define KVM_SAVE_MSRS_BEGIN     7
 668static u32 msrs_to_save[] = {
 669        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 670        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 671        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 672        HV_X64_MSR_APIC_ASSIST_PAGE,
 673        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 674        MSR_K6_STAR,
 675#ifdef CONFIG_X86_64
 676        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 677#endif
 678        MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 679};
 680
 681static unsigned num_msrs_to_save;
 682
 683static u32 emulated_msrs[] = {
 684        MSR_IA32_MISC_ENABLE,
 685};
 686
 687static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 688{
 689        if (efer & efer_reserved_bits)
 690                return 1;
 691
 692        if (is_paging(vcpu)
 693            && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
 694                return 1;
 695
 696        if (efer & EFER_FFXSR) {
 697                struct kvm_cpuid_entry2 *feat;
 698
 699                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 700                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
 701                        return 1;
 702        }
 703
 704        if (efer & EFER_SVME) {
 705                struct kvm_cpuid_entry2 *feat;
 706
 707                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 708                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
 709                        return 1;
 710        }
 711
 712        efer &= ~EFER_LMA;
 713        efer |= vcpu->arch.efer & EFER_LMA;
 714
 715        kvm_x86_ops->set_efer(vcpu, efer);
 716
 717        vcpu->arch.efer = efer;
 718
 719        vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 720        kvm_mmu_reset_context(vcpu);
 721
 722        return 0;
 723}
 724
 725void kvm_enable_efer_bits(u64 mask)
 726{
 727       efer_reserved_bits &= ~mask;
 728}
 729EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 730
 731
 732/*
  733 * Writes msr value into the appropriate "register".
 734 * Returns 0 on success, non-0 otherwise.
 735 * Assumes vcpu_load() was already called.
 736 */
 737int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 738{
 739        return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 740}
 741
 742/*
 743 * Adapt set_msr() to msr_io()'s calling convention
 744 */
 745static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 746{
 747        return kvm_set_msr(vcpu, index, *data);
 748}
 749
 750static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 751{
 752        int version;
 753        int r;
 754        struct pvclock_wall_clock wc;
 755        struct timespec boot;
 756
 757        if (!wall_clock)
 758                return;
 759
 760        r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
 761        if (r)
 762                return;
 763
 764        if (version & 1)
 765                ++version;  /* first time write, random junk */
 766
 767        ++version;
 768
 769        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 770
 771        /*
 772         * The guest calculates current wall clock time by adding
 773         * system time (updated by kvm_write_guest_time below) to the
 774         * wall clock specified here.  guest system time equals host
 775         * system time for us, thus we must fill in host boot time here.
 776         */
 777        getboottime(&boot);
 778
 779        wc.sec = boot.tv_sec;
 780        wc.nsec = boot.tv_nsec;
 781        wc.version = version;
 782
 783        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 784
 785        version++;
 786        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 787}
 788
 789static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 790{
 791        uint32_t quotient, remainder;
 792
 793        /* Don't try to replace with do_div(), this one calculates
 794         * "(dividend << 32) / divisor" */
 795        __asm__ ( "divl %4"
 796                  : "=a" (quotient), "=d" (remainder)
 797                  : "0" (0), "1" (dividend), "r" (divisor) );
 798        return quotient;
 799}
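/*
 * For example, div_frac(1000000000, 2000000000) evaluates
 * (10^9 << 32) / (2 * 10^9) = 2^31 = 0x80000000, i.e. 0.5 in the 32-bit
 * fixed-point fraction format that the pvclock multiplier uses.
 */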
 800
 801static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 802{
 803        uint64_t nsecs = 1000000000LL;
 804        int32_t  shift = 0;
 805        uint64_t tps64;
 806        uint32_t tps32;
 807
 808        tps64 = tsc_khz * 1000LL;
 809        while (tps64 > nsecs*2) {
 810                tps64 >>= 1;
 811                shift--;
 812        }
 813
 814        tps32 = (uint32_t)tps64;
 815        while (tps32 <= (uint32_t)nsecs) {
 816                tps32 <<= 1;
 817                shift++;
 818        }
 819
 820        hv_clock->tsc_shift = shift;
 821        hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 822
 823        pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 824                 __func__, tsc_khz, hv_clock->tsc_shift,
 825                 hv_clock->tsc_to_system_mul);
 826}
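/*
 * Worked example for a hypothetical 2 GHz TSC: tsc_khz = 2000000, so
 * tps64 = 2 * 10^9.  Neither loop runs, leaving tsc_shift = 0 and
 * tsc_to_system_mul = div_frac(10^9, 2 * 10^9) = 0x80000000.  The guest's
 * pvclock code then converts a TSC delta roughly as
 * ((delta << tsc_shift) * tsc_to_system_mul) >> 32, which here is delta / 2:
 * two TSC ticks per nanosecond, as expected for 2 GHz.
 */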
 827
 828static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 829
 830static void kvm_write_guest_time(struct kvm_vcpu *v)
 831{
 832        struct timespec ts;
 833        unsigned long flags;
 834        struct kvm_vcpu_arch *vcpu = &v->arch;
 835        void *shared_kaddr;
 836        unsigned long this_tsc_khz;
 837
  838        if (!vcpu->time_page)
 839                return;
 840
 841        this_tsc_khz = get_cpu_var(cpu_tsc_khz);
 842        if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
 843                kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
 844                vcpu->hv_clock_tsc_khz = this_tsc_khz;
 845        }
 846        put_cpu_var(cpu_tsc_khz);
 847
 848        /* Keep irq disabled to prevent changes to the clock */
 849        local_irq_save(flags);
 850        kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
 851        ktime_get_ts(&ts);
 852        monotonic_to_bootbased(&ts);
 853        local_irq_restore(flags);
 854
 855        /* With all the info we got, fill in the values */
 856
 857        vcpu->hv_clock.system_time = ts.tv_nsec +
 858                                     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 859
 860        vcpu->hv_clock.flags = 0;
 861
 862        /*
 863         * The interface expects us to write an even number signaling that the
 864         * update is finished. Since the guest won't see the intermediate
 865         * state, we just increase by 2 at the end.
 866         */
 867        vcpu->hv_clock.version += 2;
 868
 869        shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 870
 871        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 872               sizeof(vcpu->hv_clock));
 873
 874        kunmap_atomic(shared_kaddr, KM_USER0);
 875
 876        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 877}
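/*
 * The version field acts like a seqlock.  A guest reader of the pvclock
 * structure is expected to do something along these lines (a sketch, not
 * code that lives in this file):
 *
 *	do {
 *		ver = hv_clock->version;
 *		rmb();
 *		... copy tsc_timestamp, system_time, mul, shift ...
 *		rmb();
 *	} while ((ver & 1) || ver != hv_clock->version);
 *
 * which is why only even version numbers are ever published and the update
 * above bumps the count by 2.
 */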
 878
 879static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 880{
 881        struct kvm_vcpu_arch *vcpu = &v->arch;
 882
 883        if (!vcpu->time_page)
 884                return 0;
 885        set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
 886        return 1;
 887}
 888
 889static bool msr_mtrr_valid(unsigned msr)
 890{
 891        switch (msr) {
 892        case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 893        case MSR_MTRRfix64K_00000:
 894        case MSR_MTRRfix16K_80000:
 895        case MSR_MTRRfix16K_A0000:
 896        case MSR_MTRRfix4K_C0000:
 897        case MSR_MTRRfix4K_C8000:
 898        case MSR_MTRRfix4K_D0000:
 899        case MSR_MTRRfix4K_D8000:
 900        case MSR_MTRRfix4K_E0000:
 901        case MSR_MTRRfix4K_E8000:
 902        case MSR_MTRRfix4K_F0000:
 903        case MSR_MTRRfix4K_F8000:
 904        case MSR_MTRRdefType:
 905        case MSR_IA32_CR_PAT:
 906                return true;
 907        case 0x2f8:
 908                return true;
 909        }
 910        return false;
 911}
 912
 913static bool valid_pat_type(unsigned t)
 914{
 915        return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
 916}
 917
 918static bool valid_mtrr_type(unsigned t)
 919{
 920        return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
 921}
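/*
 * The masks above encode the architecturally valid memory types: PAT accepts
 * 0 (UC), 1 (WC), 4 (WT), 5 (WP), 6 (WB) and 7 (UC-), hence 0xf3; MTRRs
 * accept the same set minus UC- (type 7), hence 0x73.
 */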
 922
 923static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 924{
 925        int i;
 926
 927        if (!msr_mtrr_valid(msr))
 928                return false;
 929
 930        if (msr == MSR_IA32_CR_PAT) {
 931                for (i = 0; i < 8; i++)
 932                        if (!valid_pat_type((data >> (i * 8)) & 0xff))
 933                                return false;
 934                return true;
 935        } else if (msr == MSR_MTRRdefType) {
 936                if (data & ~0xcff)
 937                        return false;
 938                return valid_mtrr_type(data & 0xff);
 939        } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
 940                for (i = 0; i < 8 ; i++)
 941                        if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
 942                                return false;
 943                return true;
 944        }
 945
 946        /* variable MTRRs */
 947        return valid_mtrr_type(data & 0xff);
 948}
 949
 950static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 951{
 952        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 953
 954        if (!mtrr_valid(vcpu, msr, data))
 955                return 1;
 956
 957        if (msr == MSR_MTRRdefType) {
 958                vcpu->arch.mtrr_state.def_type = data;
 959                vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 960        } else if (msr == MSR_MTRRfix64K_00000)
 961                p[0] = data;
 962        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 963                p[1 + msr - MSR_MTRRfix16K_80000] = data;
 964        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 965                p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 966        else if (msr == MSR_IA32_CR_PAT)
 967                vcpu->arch.pat = data;
 968        else {  /* Variable MTRRs */
 969                int idx, is_mtrr_mask;
 970                u64 *pt;
 971
 972                idx = (msr - 0x200) / 2;
 973                is_mtrr_mask = msr - 0x200 - 2 * idx;
 974                if (!is_mtrr_mask)
 975                        pt =
 976                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 977                else
 978                        pt =
 979                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 980                *pt = data;
 981        }
 982
 983        kvm_mmu_reset_context(vcpu);
 984        return 0;
 985}
 986
 987static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 988{
 989        u64 mcg_cap = vcpu->arch.mcg_cap;
 990        unsigned bank_num = mcg_cap & 0xff;
 991
 992        switch (msr) {
 993        case MSR_IA32_MCG_STATUS:
 994                vcpu->arch.mcg_status = data;
 995                break;
 996        case MSR_IA32_MCG_CTL:
 997                if (!(mcg_cap & MCG_CTL_P))
 998                        return 1;
 999                if (data != 0 && data != ~(u64)0)
1000                        return -1;
1001                vcpu->arch.mcg_ctl = data;
1002                break;
1003        default:
1004                if (msr >= MSR_IA32_MC0_CTL &&
1005                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1006                        u32 offset = msr - MSR_IA32_MC0_CTL;
 1007                        /* Only 0 or all 1s can be written to IA32_MCi_CTL.
 1008                         * Some Linux kernels, though, clear bit 10 in bank 4
 1009                         * to work around a BIOS/GART TBL issue on AMD K8s;
 1010                         * ignore this to avoid an uncaught #GP in the guest.
 1011                         */
1012                        if ((offset & 0x3) == 0 &&
1013                            data != 0 && (data | (1 << 10)) != ~(u64)0)
1014                                return -1;
1015                        vcpu->arch.mce_banks[offset] = data;
1016                        break;
1017                }
1018                return 1;
1019        }
1020        return 0;
1021}
1022
1023static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1024{
1025        struct kvm *kvm = vcpu->kvm;
1026        int lm = is_long_mode(vcpu);
1027        u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1028                : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1029        u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1030                : kvm->arch.xen_hvm_config.blob_size_32;
1031        u32 page_num = data & ~PAGE_MASK;
1032        u64 page_addr = data & PAGE_MASK;
1033        u8 *page;
1034        int r;
1035
1036        r = -E2BIG;
1037        if (page_num >= blob_size)
1038                goto out;
1039        r = -ENOMEM;
1040        page = kzalloc(PAGE_SIZE, GFP_KERNEL);
1041        if (!page)
1042                goto out;
1043        r = -EFAULT;
1044        if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
1045                goto out_free;
1046        if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1047                goto out_free;
1048        r = 0;
1049out_free:
1050        kfree(page);
1051out:
1052        return r;
1053}
1054
1055static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1056{
1057        return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1058}
1059
1060static bool kvm_hv_msr_partition_wide(u32 msr)
1061{
1062        bool r = false;
1063        switch (msr) {
1064        case HV_X64_MSR_GUEST_OS_ID:
1065        case HV_X64_MSR_HYPERCALL:
1066                r = true;
1067                break;
1068        }
1069
1070        return r;
1071}
1072
1073static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1074{
1075        struct kvm *kvm = vcpu->kvm;
1076
1077        switch (msr) {
1078        case HV_X64_MSR_GUEST_OS_ID:
1079                kvm->arch.hv_guest_os_id = data;
1080                /* setting guest os id to zero disables hypercall page */
1081                if (!kvm->arch.hv_guest_os_id)
1082                        kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1083                break;
1084        case HV_X64_MSR_HYPERCALL: {
1085                u64 gfn;
1086                unsigned long addr;
1087                u8 instructions[4];
1088
 1089                /* if the guest os id is not set, the hypercall should remain disabled */
1090                if (!kvm->arch.hv_guest_os_id)
1091                        break;
1092                if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1093                        kvm->arch.hv_hypercall = data;
1094                        break;
1095                }
1096                gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1097                addr = gfn_to_hva(kvm, gfn);
1098                if (kvm_is_error_hva(addr))
1099                        return 1;
1100                kvm_x86_ops->patch_hypercall(vcpu, instructions);
1101                ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1102                if (copy_to_user((void __user *)addr, instructions, 4))
1103                        return 1;
1104                kvm->arch.hv_hypercall = data;
1105                break;
1106        }
1107        default:
1108                pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1109                          "data 0x%llx\n", msr, data);
1110                return 1;
1111        }
1112        return 0;
1113}
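/*
 * For HV_X64_MSR_HYPERCALL above, kvm_x86_ops->patch_hypercall() fills the
 * first bytes with the vendor-specific VMCALL/VMMCALL encoding and the code
 * appends a ret (0xc3), so a Hyper-V guest can simply call into the page it
 * mapped through this MSR.
 */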
1114
1115static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1116{
1117        switch (msr) {
1118        case HV_X64_MSR_APIC_ASSIST_PAGE: {
1119                unsigned long addr;
1120
1121                if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1122                        vcpu->arch.hv_vapic = data;
1123                        break;
1124                }
1125                addr = gfn_to_hva(vcpu->kvm, data >>
1126                                  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1127                if (kvm_is_error_hva(addr))
1128                        return 1;
1129                if (clear_user((void __user *)addr, PAGE_SIZE))
1130                        return 1;
1131                vcpu->arch.hv_vapic = data;
1132                break;
1133        }
1134        case HV_X64_MSR_EOI:
1135                return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1136        case HV_X64_MSR_ICR:
1137                return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1138        case HV_X64_MSR_TPR:
1139                return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1140        default:
1141                pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1142                          "data 0x%llx\n", msr, data);
1143                return 1;
1144        }
1145
1146        return 0;
1147}
1148
1149int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1150{
1151        switch (msr) {
1152        case MSR_EFER:
1153                return set_efer(vcpu, data);
1154        case MSR_K7_HWCR:
1155                data &= ~(u64)0x40;     /* ignore flush filter disable */
1156                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
1157                if (data != 0) {
1158                        pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1159                                data);
1160                        return 1;
1161                }
1162                break;
1163        case MSR_FAM10H_MMIO_CONF_BASE:
1164                if (data != 0) {
1165                        pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1166                                "0x%llx\n", data);
1167                        return 1;
1168                }
1169                break;
1170        case MSR_AMD64_NB_CFG:
1171                break;
1172        case MSR_IA32_DEBUGCTLMSR:
1173                if (!data) {
1174                        /* We support the non-activated case already */
1175                        break;
1176                } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
1177                        /* Values other than LBR and BTF are vendor-specific,
1178                           thus reserved and should throw a #GP */
1179                        return 1;
1180                }
1181                pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1182                        __func__, data);
1183                break;
1184        case MSR_IA32_UCODE_REV:
1185        case MSR_IA32_UCODE_WRITE:
1186        case MSR_VM_HSAVE_PA:
1187        case MSR_AMD64_PATCH_LOADER:
1188                break;
1189        case 0x200 ... 0x2ff:
1190                return set_msr_mtrr(vcpu, msr, data);
1191        case MSR_IA32_APICBASE:
1192                kvm_set_apic_base(vcpu, data);
1193                break;
1194        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1195                return kvm_x2apic_msr_write(vcpu, msr, data);
1196        case MSR_IA32_MISC_ENABLE:
1197                vcpu->arch.ia32_misc_enable_msr = data;
1198                break;
1199        case MSR_KVM_WALL_CLOCK_NEW:
1200        case MSR_KVM_WALL_CLOCK:
1201                vcpu->kvm->arch.wall_clock = data;
1202                kvm_write_wall_clock(vcpu->kvm, data);
1203                break;
1204        case MSR_KVM_SYSTEM_TIME_NEW:
1205        case MSR_KVM_SYSTEM_TIME: {
1206                if (vcpu->arch.time_page) {
1207                        kvm_release_page_dirty(vcpu->arch.time_page);
1208                        vcpu->arch.time_page = NULL;
1209                }
1210
1211                vcpu->arch.time = data;
1212
1213                /* we verify if the enable bit is set... */
1214                if (!(data & 1))
1215                        break;
1216
1217                /* ...but clean it before doing the actual write */
1218                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
1219
1220                vcpu->arch.time_page =
1221                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1222
1223                if (is_error_page(vcpu->arch.time_page)) {
1224                        kvm_release_page_clean(vcpu->arch.time_page);
1225                        vcpu->arch.time_page = NULL;
1226                }
1227
1228                kvm_request_guest_time_update(vcpu);
1229                break;
1230        }
1231        case MSR_IA32_MCG_CTL:
1232        case MSR_IA32_MCG_STATUS:
1233        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1234                return set_msr_mce(vcpu, msr, data);
1235
1236        /* Performance counters are not protected by a CPUID bit,
1237         * so we should check all of them in the generic path for the sake of
1238         * cross vendor migration.
1239         * Writing a zero into the event select MSRs disables them,
1240         * which we perfectly emulate ;-). Any other value should be at least
1241         * reported, some guests depend on them.
1242         */
1243        case MSR_P6_EVNTSEL0:
1244        case MSR_P6_EVNTSEL1:
1245        case MSR_K7_EVNTSEL0:
1246        case MSR_K7_EVNTSEL1:
1247        case MSR_K7_EVNTSEL2:
1248        case MSR_K7_EVNTSEL3:
1249                if (data != 0)
1250                        pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1251                                "0x%x data 0x%llx\n", msr, data);
1252                break;
1253        /* at least RHEL 4 unconditionally writes to the perfctr registers,
1254         * so we ignore writes to make it happy.
1255         */
1256        case MSR_P6_PERFCTR0:
1257        case MSR_P6_PERFCTR1:
1258        case MSR_K7_PERFCTR0:
1259        case MSR_K7_PERFCTR1:
1260        case MSR_K7_PERFCTR2:
1261        case MSR_K7_PERFCTR3:
1262                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1263                        "0x%x data 0x%llx\n", msr, data);
1264                break;
1265        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1266                if (kvm_hv_msr_partition_wide(msr)) {
1267                        int r;
1268                        mutex_lock(&vcpu->kvm->lock);
1269                        r = set_msr_hyperv_pw(vcpu, msr, data);
1270                        mutex_unlock(&vcpu->kvm->lock);
1271                        return r;
1272                } else
1273                        return set_msr_hyperv(vcpu, msr, data);
1274                break;
1275        default:
1276                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1277                        return xen_hvm_config(vcpu, data);
1278                if (!ignore_msrs) {
1279                        pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1280                                msr, data);
1281                        return 1;
1282                } else {
1283                        pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1284                                msr, data);
1285                        break;
1286                }
1287        }
1288        return 0;
1289}
1290EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1291
1292
1293/*
1294 * Reads an msr value (of 'msr_index') into 'pdata'.
1295 * Returns 0 on success, non-0 otherwise.
1296 * Assumes vcpu_load() was already called.
1297 */
1298int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1299{
1300        return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1301}
1302
1303static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1304{
1305        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1306
1307        if (!msr_mtrr_valid(msr))
1308                return 1;
1309
1310        if (msr == MSR_MTRRdefType)
1311                *pdata = vcpu->arch.mtrr_state.def_type +
1312                         (vcpu->arch.mtrr_state.enabled << 10);
1313        else if (msr == MSR_MTRRfix64K_00000)
1314                *pdata = p[0];
1315        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1316                *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1317        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1318                *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1319        else if (msr == MSR_IA32_CR_PAT)
1320                *pdata = vcpu->arch.pat;
1321        else {  /* Variable MTRRs */
1322                int idx, is_mtrr_mask;
1323                u64 *pt;
1324
1325                idx = (msr - 0x200) / 2;
1326                is_mtrr_mask = msr - 0x200 - 2 * idx;
1327                if (!is_mtrr_mask)
1328                        pt =
1329                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1330                else
1331                        pt =
1332                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1333                *pdata = *pt;
1334        }
1335
1336        return 0;
1337}
1338
1339static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1340{
1341        u64 data;
1342        u64 mcg_cap = vcpu->arch.mcg_cap;
1343        unsigned bank_num = mcg_cap & 0xff;
1344
1345        switch (msr) {
1346        case MSR_IA32_P5_MC_ADDR:
1347        case MSR_IA32_P5_MC_TYPE:
1348                data = 0;
1349                break;
1350        case MSR_IA32_MCG_CAP:
1351                data = vcpu->arch.mcg_cap;
1352                break;
1353        case MSR_IA32_MCG_CTL:
1354                if (!(mcg_cap & MCG_CTL_P))
1355                        return 1;
1356                data = vcpu->arch.mcg_ctl;
1357                break;
1358        case MSR_IA32_MCG_STATUS:
1359                data = vcpu->arch.mcg_status;
1360                break;
1361        default:
1362                if (msr >= MSR_IA32_MC0_CTL &&
1363                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1364                        u32 offset = msr - MSR_IA32_MC0_CTL;
1365                        data = vcpu->arch.mce_banks[offset];
1366                        break;
1367                }
1368                return 1;
1369        }
1370        *pdata = data;
1371        return 0;
1372}
1373
1374static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1375{
1376        u64 data = 0;
1377        struct kvm *kvm = vcpu->kvm;
1378
1379        switch (msr) {
1380        case HV_X64_MSR_GUEST_OS_ID:
1381                data = kvm->arch.hv_guest_os_id;
1382                break;
1383        case HV_X64_MSR_HYPERCALL:
1384                data = kvm->arch.hv_hypercall;
1385                break;
1386        default:
1387                pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1388                return 1;
1389        }
1390
1391        *pdata = data;
1392        return 0;
1393}
1394
1395static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1396{
1397        u64 data = 0;
1398
1399        switch (msr) {
1400        case HV_X64_MSR_VP_INDEX: {
1401                int r;
1402                struct kvm_vcpu *v;
1403                kvm_for_each_vcpu(r, v, vcpu->kvm)
1404                        if (v == vcpu)
1405                                data = r;
1406                break;
1407        }
1408        case HV_X64_MSR_EOI:
1409                return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1410        case HV_X64_MSR_ICR:
1411                return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1412        case HV_X64_MSR_TPR:
1413                return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1414        default:
1415                pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1416                return 1;
1417        }
1418        *pdata = data;
1419        return 0;
1420}
1421
1422int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1423{
1424        u64 data;
1425
1426        switch (msr) {
1427        case MSR_IA32_PLATFORM_ID:
1428        case MSR_IA32_UCODE_REV:
1429        case MSR_IA32_EBL_CR_POWERON:
1430        case MSR_IA32_DEBUGCTLMSR:
1431        case MSR_IA32_LASTBRANCHFROMIP:
1432        case MSR_IA32_LASTBRANCHTOIP:
1433        case MSR_IA32_LASTINTFROMIP:
1434        case MSR_IA32_LASTINTTOIP:
1435        case MSR_K8_SYSCFG:
1436        case MSR_K7_HWCR:
1437        case MSR_VM_HSAVE_PA:
1438        case MSR_P6_PERFCTR0:
1439        case MSR_P6_PERFCTR1:
1440        case MSR_P6_EVNTSEL0:
1441        case MSR_P6_EVNTSEL1:
1442        case MSR_K7_EVNTSEL0:
1443        case MSR_K7_PERFCTR0:
1444        case MSR_K8_INT_PENDING_MSG:
1445        case MSR_AMD64_NB_CFG:
1446        case MSR_FAM10H_MMIO_CONF_BASE:
1447                data = 0;
1448                break;
1449        case MSR_MTRRcap:
1450                data = 0x500 | KVM_NR_VAR_MTRR;
1451                break;
1452        case 0x200 ... 0x2ff:
1453                return get_msr_mtrr(vcpu, msr, pdata);
1454        case 0xcd: /* fsb frequency */
1455                data = 3;
1456                break;
1457        case MSR_IA32_APICBASE:
1458                data = kvm_get_apic_base(vcpu);
1459                break;
1460        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1461                return kvm_x2apic_msr_read(vcpu, msr, pdata);
1462                break;
1463        case MSR_IA32_MISC_ENABLE:
1464                data = vcpu->arch.ia32_misc_enable_msr;
1465                break;
1466        case MSR_IA32_PERF_STATUS:
1467                /* TSC increment by tick */
1468                data = 1000ULL;
1469                /* CPU multiplier */
1470                data |= (((uint64_t)4ULL) << 40);
1471                break;
1472        case MSR_EFER:
1473                data = vcpu->arch.efer;
1474                break;
1475        case MSR_KVM_WALL_CLOCK:
1476        case MSR_KVM_WALL_CLOCK_NEW:
1477                data = vcpu->kvm->arch.wall_clock;
1478                break;
1479        case MSR_KVM_SYSTEM_TIME:
1480        case MSR_KVM_SYSTEM_TIME_NEW:
1481                data = vcpu->arch.time;
1482                break;
1483        case MSR_IA32_P5_MC_ADDR:
1484        case MSR_IA32_P5_MC_TYPE:
1485        case MSR_IA32_MCG_CAP:
1486        case MSR_IA32_MCG_CTL:
1487        case MSR_IA32_MCG_STATUS:
1488        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1489                return get_msr_mce(vcpu, msr, pdata);
1490        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1491                if (kvm_hv_msr_partition_wide(msr)) {
1492                        int r;
1493                        mutex_lock(&vcpu->kvm->lock);
1494                        r = get_msr_hyperv_pw(vcpu, msr, pdata);
1495                        mutex_unlock(&vcpu->kvm->lock);
1496                        return r;
1497                } else
1498                        return get_msr_hyperv(vcpu, msr, pdata);
1499                break;
1500        default:
1501                if (!ignore_msrs) {
1502                        pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1503                        return 1;
1504                } else {
1505                        pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1506                        data = 0;
1507                }
1508                break;
1509        }
1510        *pdata = data;
1511        return 0;
1512}
1513EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1514
1515/*
1516 * Read or write a bunch of msrs. All parameters are kernel addresses.
1517 *
1518 * @return number of msrs set successfully.
1519 */
1520static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1521                    struct kvm_msr_entry *entries,
1522                    int (*do_msr)(struct kvm_vcpu *vcpu,
1523                                  unsigned index, u64 *data))
1524{
1525        int i, idx;
1526
1527        vcpu_load(vcpu);
1528
1529        idx = srcu_read_lock(&vcpu->kvm->srcu);
1530        for (i = 0; i < msrs->nmsrs; ++i)
1531                if (do_msr(vcpu, entries[i].index, &entries[i].data))
1532                        break;
1533        srcu_read_unlock(&vcpu->kvm->srcu, idx);
1534
1535        vcpu_put(vcpu);
1536
1537        return i;
1538}
1539
1540/*
1541 * Read or write a bunch of msrs. Parameters are user addresses.
1542 *
1543 * @return number of msrs set successfully.
1544 */
1545static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1546                  int (*do_msr)(struct kvm_vcpu *vcpu,
1547                                unsigned index, u64 *data),
1548                  int writeback)
1549{
1550        struct kvm_msrs msrs;
1551        struct kvm_msr_entry *entries;
1552        int r, n;
1553        unsigned size;
1554
1555        r = -EFAULT;
1556        if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1557                goto out;
1558
1559        r = -E2BIG;
1560        if (msrs.nmsrs >= MAX_IO_MSRS)
1561                goto out;
1562
1563        r = -ENOMEM;
1564        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1565        entries = kmalloc(size, GFP_KERNEL);
1566        if (!entries)
1567                goto out;
1568
1569        r = -EFAULT;
1570        if (copy_from_user(entries, user_msrs->entries, size))
1571                goto out_free;
1572
1573        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1574        if (r < 0)
1575                goto out_free;
1576
1577        r = -EFAULT;
1578        if (writeback && copy_to_user(user_msrs->entries, entries, size))
1579                goto out_free;
1580
1581        r = n;
1582
1583out_free:
1584        kfree(entries);
1585out:
1586        return r;
1587}
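
/*
 * Illustrative userspace sketch (not part of this file): KVM_GET_MSRS and
 * KVM_SET_MSRS land in msr_io() above.  Userspace passes a struct kvm_msrs
 * header immediately followed by nmsrs kvm_msr_entry records; the ioctl
 * return value is the number of entries processed.
 *
 *        struct {
 *                struct kvm_msrs hdr;
 *                struct kvm_msr_entry entries[1];
 *        } req = {
 *                .hdr.nmsrs = 1,
 *                .entries[0].index = MSR_EFER,
 *        };
 *        int processed = ioctl(vcpu_fd, KVM_GET_MSRS, &req);
 *
 * On success processed == 1 and req.entries[0].data holds the MSR value.
 */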
1588
1589int kvm_dev_ioctl_check_extension(long ext)
1590{
1591        int r;
1592
1593        switch (ext) {
1594        case KVM_CAP_IRQCHIP:
1595        case KVM_CAP_HLT:
1596        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1597        case KVM_CAP_SET_TSS_ADDR:
1598        case KVM_CAP_EXT_CPUID:
1599        case KVM_CAP_CLOCKSOURCE:
1600        case KVM_CAP_PIT:
1601        case KVM_CAP_NOP_IO_DELAY:
1602        case KVM_CAP_MP_STATE:
1603        case KVM_CAP_SYNC_MMU:
1604        case KVM_CAP_REINJECT_CONTROL:
1605        case KVM_CAP_IRQ_INJECT_STATUS:
1606        case KVM_CAP_ASSIGN_DEV_IRQ:
1607        case KVM_CAP_IRQFD:
1608        case KVM_CAP_IOEVENTFD:
1609        case KVM_CAP_PIT2:
1610        case KVM_CAP_PIT_STATE2:
1611        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1612        case KVM_CAP_XEN_HVM:
1613        case KVM_CAP_ADJUST_CLOCK:
1614        case KVM_CAP_VCPU_EVENTS:
1615        case KVM_CAP_HYPERV:
1616        case KVM_CAP_HYPERV_VAPIC:
1617        case KVM_CAP_HYPERV_SPIN:
1618        case KVM_CAP_PCI_SEGMENT:
1619        case KVM_CAP_DEBUGREGS:
1620        case KVM_CAP_X86_ROBUST_SINGLESTEP:
1621                r = 1;
1622                break;
1623        case KVM_CAP_COALESCED_MMIO:
1624                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1625                break;
1626        case KVM_CAP_VAPIC:
1627                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1628                break;
1629        case KVM_CAP_NR_VCPUS:
1630                r = KVM_MAX_VCPUS;
1631                break;
1632        case KVM_CAP_NR_MEMSLOTS:
1633                r = KVM_MEMORY_SLOTS;
1634                break;
1635        case KVM_CAP_PV_MMU:    /* obsolete */
1636                r = 0;
1637                break;
1638        case KVM_CAP_IOMMU:
1639                r = iommu_found();
1640                break;
1641        case KVM_CAP_MCE:
1642                r = KVM_MAX_MCE_BANKS;
1643                break;
1644        default:
1645                r = 0;
1646                break;
1647        }
1648        return r;
1649
1650}
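
/*
 * Illustrative userspace sketch (not part of this file): the switch above
 * is reached through the KVM_CHECK_EXTENSION ioctl on the /dev/kvm fd:
 *
 *        int kvm_fd = open("/dev/kvm", O_RDWR);
 *        int has_irqchip = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
 *
 * A positive return value means the capability is present; for caps such
 * as KVM_CAP_NR_VCPUS the returned value itself is meaningful.
 */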
1651
1652long kvm_arch_dev_ioctl(struct file *filp,
1653                        unsigned int ioctl, unsigned long arg)
1654{
1655        void __user *argp = (void __user *)arg;
1656        long r;
1657
1658        switch (ioctl) {
1659        case KVM_GET_MSR_INDEX_LIST: {
1660                struct kvm_msr_list __user *user_msr_list = argp;
1661                struct kvm_msr_list msr_list;
1662                unsigned n;
1663
1664                r = -EFAULT;
1665                if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1666                        goto out;
1667                n = msr_list.nmsrs;
1668                msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1669                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1670                        goto out;
1671                r = -E2BIG;
1672                if (n < msr_list.nmsrs)
1673                        goto out;
1674                r = -EFAULT;
1675                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1676                                 num_msrs_to_save * sizeof(u32)))
1677                        goto out;
1678                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1679                                 &emulated_msrs,
1680                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1681                        goto out;
1682                r = 0;
1683                break;
1684        }
1685        case KVM_GET_SUPPORTED_CPUID: {
1686                struct kvm_cpuid2 __user *cpuid_arg = argp;
1687                struct kvm_cpuid2 cpuid;
1688
1689                r = -EFAULT;
1690                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1691                        goto out;
1692                r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1693                                                      cpuid_arg->entries);
1694                if (r)
1695                        goto out;
1696
1697                r = -EFAULT;
1698                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1699                        goto out;
1700                r = 0;
1701                break;
1702        }
1703        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1704                u64 mce_cap;
1705
1706                mce_cap = KVM_MCE_CAP_SUPPORTED;
1707                r = -EFAULT;
1708                if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1709                        goto out;
1710                r = 0;
1711                break;
1712        }
1713        default:
1714                r = -EINVAL;
1715        }
1716out:
1717        return r;
1718}
1719
1720void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1721{
1722        kvm_x86_ops->vcpu_load(vcpu, cpu);
1723        if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1724                unsigned long khz = cpufreq_quick_get(cpu);
1725                if (!khz)
1726                        khz = tsc_khz;
1727                per_cpu(cpu_tsc_khz, cpu) = khz;
1728        }
1729        kvm_request_guest_time_update(vcpu);
1730}
1731
1732void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1733{
1734        kvm_put_guest_fpu(vcpu);
1735        kvm_x86_ops->vcpu_put(vcpu);
1736}
1737
1738static int is_efer_nx(void)
1739{
1740        unsigned long long efer = 0;
1741
1742        rdmsrl_safe(MSR_EFER, &efer);
1743        return efer & EFER_NX;
1744}
1745
1746static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1747{
1748        int i;
1749        struct kvm_cpuid_entry2 *e, *entry;
1750
1751        entry = NULL;
1752        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1753                e = &vcpu->arch.cpuid_entries[i];
1754                if (e->function == 0x80000001) {
1755                        entry = e;
1756                        break;
1757                }
1758        }
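        /*
         * Bit 20 of CPUID.80000001H:EDX is the NX (no-execute) feature
         * flag; hide it from the guest when the host runs with EFER.NX
         * clear.
         */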
1759        if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1760                entry->edx &= ~(1 << 20);
1761                printk(KERN_INFO "kvm: guest NX capability removed\n");
1762        }
1763}
1764
1765/* legacy KVM_SET_CPUID: an old userspace fills a new kernel module with struct kvm_cpuid_entry, converted here to kvm_cpuid_entry2 */
1766static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1767                                    struct kvm_cpuid *cpuid,
1768                                    struct kvm_cpuid_entry __user *entries)
1769{
1770        int r, i;
1771        struct kvm_cpuid_entry *cpuid_entries;
1772
1773        r = -E2BIG;
1774        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1775                goto out;
1776        r = -ENOMEM;
1777        cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1778        if (!cpuid_entries)
1779                goto out;
1780        r = -EFAULT;
1781        if (copy_from_user(cpuid_entries, entries,
1782                           cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1783                goto out_free;
1784        vcpu_load(vcpu);
1785        for (i = 0; i < cpuid->nent; i++) {
1786                vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1787                vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1788                vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1789                vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1790                vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1791                vcpu->arch.cpuid_entries[i].index = 0;
1792                vcpu->arch.cpuid_entries[i].flags = 0;
1793                vcpu->arch.cpuid_entries[i].padding[0] = 0;
1794                vcpu->arch.cpuid_entries[i].padding[1] = 0;
1795                vcpu->arch.cpuid_entries[i].padding[2] = 0;
1796        }
1797        vcpu->arch.cpuid_nent = cpuid->nent;
1798        cpuid_fix_nx_cap(vcpu);
1799        r = 0;
1800        kvm_apic_set_version(vcpu);
1801        kvm_x86_ops->cpuid_update(vcpu);
1802        vcpu_put(vcpu);
1803
1804out_free:
1805        vfree(cpuid_entries);
1806out:
1807        return r;
1808}
1809
1810static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1811                                     struct kvm_cpuid2 *cpuid,
1812                                     struct kvm_cpuid_entry2 __user *entries)
1813{
1814        int r;
1815
1816        r = -E2BIG;
1817        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1818                goto out;
1819        r = -EFAULT;
1820        if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1821                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1822                goto out;
1823        vcpu_load(vcpu);
1824        vcpu->arch.cpuid_nent = cpuid->nent;
1825        kvm_apic_set_version(vcpu);
1826        kvm_x86_ops->cpuid_update(vcpu);
1827        vcpu_put(vcpu);
1828        return 0;
1829
1830out:
1831        return r;
1832}
1833
1834static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1835                                     struct kvm_cpuid2 *cpuid,
1836                                     struct kvm_cpuid_entry2 __user *entries)
1837{
1838        int r;
1839
1840        vcpu_load(vcpu);
1841        r = -E2BIG;
1842        if (cpuid->nent < vcpu->arch.cpuid_nent)
1843                goto out;
1844        r = -EFAULT;
1845        if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1846                         vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1847                goto out;
1848        r = 0;
1849
1850out:
1851        cpuid->nent = vcpu->arch.cpuid_nent;
1852        vcpu_put(vcpu);
1853        return r;
1854}
1855
1856static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1857                           u32 index)
1858{
1859        entry->function = function;
1860        entry->index = index;
1861        cpuid_count(entry->function, entry->index,
1862                    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1863        entry->flags = 0;
1864}
1865
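/* F(x) expands to the mask bit for feature X86_FEATURE_x within its
 * 32-bit cpuid word. */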
1866#define F(x) bit(X86_FEATURE_##x)
1867
1868static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1869                         u32 index, int *nent, int maxnent)
1870{
1871        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1872#ifdef CONFIG_X86_64
1873        unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1874                                ? F(GBPAGES) : 0;
1875        unsigned f_lm = F(LM);
1876#else
1877        unsigned f_gbpages = 0;
1878        unsigned f_lm = 0;
1879#endif
1880        unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1881
1882        /* cpuid 1.edx */
1883        const u32 kvm_supported_word0_x86_features =
1884                F(FPU) | F(VME) | F(DE) | F(PSE) |
1885                F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1886                F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1887                F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1888                F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1889                0 /* Reserved, DS, ACPI */ | F(MMX) |
1890                F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1891                0 /* HTT, TM, Reserved, PBE */;
1892        /* cpuid 0x80000001.edx */
1893        const u32 kvm_supported_word1_x86_features =
1894                F(FPU) | F(VME) | F(DE) | F(PSE) |
1895                F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1896                F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1897                F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1898                F(PAT) | F(PSE36) | 0 /* Reserved */ |
1899                f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1900                F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1901                0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1902        /* cpuid 1.ecx */
1903        const u32 kvm_supported_word4_x86_features =
1904                F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1905                0 /* DS-CPL, VMX, SMX, EST */ |
1906                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1907                0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1908                0 /* Reserved, DCA */ | F(XMM4_1) |
1909                F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1910                0 /* Reserved, XSAVE, OSXSAVE */;
1911        /* cpuid 0x80000001.ecx */
1912        const u32 kvm_supported_word6_x86_features =
1913                F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1914                F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1915                F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1916                0 /* SKINIT */ | 0 /* WDT */;
1917
1918        /* all calls to cpuid_count() should be made on the same cpu */
1919        get_cpu();
1920        do_cpuid_1_ent(entry, function, index);
1921        ++*nent;
1922
1923        switch (function) {
1924        case 0:
1925                entry->eax = min(entry->eax, (u32)0xb);
1926                break;
1927        case 1:
1928                entry->edx &= kvm_supported_word0_x86_features;
1929                entry->ecx &= kvm_supported_word4_x86_features;
1930                /* we support x2apic emulation even if host does not support
1931                 * it since we emulate x2apic in software */
1932                entry->ecx |= F(X2APIC);
1933                break;
1934        /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1935         * may return different values. This forces us to get_cpu() before
1936         * issuing the first command, and also to emulate this annoying behavior
1937         * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1938        case 2: {
1939                int t, times = entry->eax & 0xff;
1940
1941                entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1942                entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1943                for (t = 1; t < times && *nent < maxnent; ++t) {
1944                        do_cpuid_1_ent(&entry[t], function, 0);
1945                        entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1946                        ++*nent;
1947                }
1948                break;
1949        }
1950        /* function 4 and 0xb have additional index. */
1951        case 4: {
1952                int i, cache_type;
1953
1954                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1955                /* read more entries until cache_type is zero */
1956                for (i = 1; *nent < maxnent; ++i) {
1957                        cache_type = entry[i - 1].eax & 0x1f;
1958                        if (!cache_type)
1959                                break;
1960                        do_cpuid_1_ent(&entry[i], function, i);
1961                        entry[i].flags |=
1962                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1963                        ++*nent;
1964                }
1965                break;
1966        }
1967        case 0xb: {
1968                int i, level_type;
1969
1970                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1971                /* read more entries until level_type is zero */
1972                for (i = 1; *nent < maxnent; ++i) {
1973                        level_type = entry[i - 1].ecx & 0xff00;
1974                        if (!level_type)
1975                                break;
1976                        do_cpuid_1_ent(&entry[i], function, i);
1977                        entry[i].flags |=
1978                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1979                        ++*nent;
1980                }
1981                break;
1982        }
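        /*
         * Hypervisor signature leaf (0x40000000): guests detect KVM by the
         * "KVMKVMKVM" signature returned in ebx, ecx and edx.
         */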
1983        case KVM_CPUID_SIGNATURE: {
1984                char signature[12] = "KVMKVMKVM\0\0";
1985                u32 *sigptr = (u32 *)signature;
1986                entry->eax = 0;
1987                entry->ebx = sigptr[0];
1988                entry->ecx = sigptr[1];
1989                entry->edx = sigptr[2];
1990                break;
1991        }
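        /*
         * Paravirtual feature leaf (0x40000001): eax advertises which KVM
         * paravirt features the guest may use; CLOCKSOURCE and CLOCKSOURCE2
         * correspond to the old and new kvmclock MSR pairs.
         */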
1992        case KVM_CPUID_FEATURES:
1993                entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
1994                             (1 << KVM_FEATURE_NOP_IO_DELAY) |
1995                             (1 << KVM_FEATURE_CLOCKSOURCE2) |
1996                             (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
1997                entry->ebx = 0;
1998                entry->ecx = 0;
1999                entry->edx = 0;
2000                break;
2001        case 0x80000000:
2002                entry->eax = min(entry->eax, 0x8000001a);
2003                break;
2004        case 0x80000001:
2005                entry->edx &= kvm_supported_word1_x86_features;
2006                entry->ecx &= kvm_supported_word6_x86_features;
2007                break;
2008        }
2009
2010        kvm_x86_ops->set_supported_cpuid(function, entry);
2011
2012        put_cpu();
2013}
2014
2015#undef F
2016
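/*
 * Build the table returned by KVM_GET_SUPPORTED_CPUID: standard leaves
 * 0..CPUID[0].EAX, extended leaves 0x80000000..CPUID[0x80000000].EAX, and
 * finally the KVM paravirt leaves (signature and features).
 */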
2017static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2018                                     struct kvm_cpuid_entry2 __user *entries)
2019{
2020        struct kvm_cpuid_entry2 *cpuid_entries;
2021        int limit, nent = 0, r = -E2BIG;
2022        u32 func;
2023
2024        if (cpuid->nent < 1)
2025                goto out;
2026        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2027                cpuid->nent = KVM_MAX_CPUID_ENTRIES;
2028        r = -ENOMEM;
2029        cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
2030        if (!cpuid_entries)
2031                goto out;
2032
2033        do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
2034        limit = cpuid_entries[0].eax;
2035        for (func = 1; func <= limit && nent < cpuid->nent; ++func)
2036                do_cpuid_ent(&cpuid_entries[nent], func, 0,
2037                             &nent, cpuid->nent);
2038        r = -E2BIG;
2039        if (nent >= cpuid->nent)
2040                goto out_free;
2041
2042        do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
2043        limit = cpuid_entries[nent - 1].eax;
2044        for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
2045                do_cpuid_ent(&cpuid_entries[nent], func, 0,
2046                             &nent, cpuid->nent);
2047
2050        r = -E2BIG;
2051        if (nent >= cpuid->nent)
2052                goto out_free;
2053
2054        do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2055                     cpuid->nent);
2056
2057        r = -E2BIG;
2058        if (nent >= cpuid->nent)
2059                goto out_free;
2060
2061        do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2062                     cpuid->nent);
2063
2064        r = -E2BIG;
2065        if (nent >= cpuid->nent)
2066                goto out_free;
2067
2068        r = -EFAULT;
2069        if (copy_to_user(entries, cpuid_entries,
2070                         nent * sizeof(struct kvm_cpuid_entry2)))
2071                goto out_free;
2072        cpuid->nent = nent;
2073        r = 0;
2074
2075out_free:
2076        vfree(cpuid_entries);
2077out:
2078        return r;
2079}
2080
2081static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2082                                    struct kvm_lapic_state *s)
2083{
2084        vcpu_load(vcpu);
2085        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2086        vcpu_put(vcpu);
2087
2088        return 0;
2089}
2090
2091static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2092                                    struct kvm_lapic_state *s)
2093{
2094        vcpu_load(vcpu);
2095        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2096        kvm_apic_post_state_restore(vcpu);
2097        update_cr8_intercept(vcpu);
2098        vcpu_put(vcpu);
2099
2100        return 0;
2101}
2102
2103static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2104                                    struct kvm_interrupt *irq)
2105{
2106        if (irq->irq < 0 || irq->irq >= 256)
2107                return -EINVAL;
2108        if (irqchip_in_kernel(vcpu->kvm))
2109                return -ENXIO;
2110        vcpu_load(vcpu);
2111
2112        kvm_queue_interrupt(vcpu, irq->irq, false);
2113
2114        vcpu_put(vcpu);
2115
2116        return 0;
2117}
2118
2119static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2120{
2121        vcpu_load(vcpu);
2122        kvm_inject_nmi(vcpu);
2123        vcpu_put(vcpu);
2124
2125        return 0;
2126}
2127
2128static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2129                                           struct kvm_tpr_access_ctl *tac)
2130{
2131        if (tac->flags)
2132                return -EINVAL;
2133        vcpu->arch.tpr_access_reporting = !!tac->enabled;
2134        return 0;
2135}
2136
2137static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2138                                        u64 mcg_cap)
2139{
2140        int r;
2141        unsigned bank_num = mcg_cap & 0xff, bank;
2142
2143        vcpu_load(vcpu);
2144        r = -EINVAL;
2145        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2146                goto out;
2147        if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2148                goto out;
2149        r = 0;
2150        vcpu->arch.mcg_cap = mcg_cap;
2151        /* Init IA32_MCG_CTL to all 1s */
2152        if (mcg_cap & MCG_CTL_P)
2153                vcpu->arch.mcg_ctl = ~(u64)0;
2154        /* Init IA32_MCi_CTL to all 1s */
2155        for (bank = 0; bank < bank_num; bank++)
2156                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2157out:
2158        vcpu_put(vcpu);
2159        return r;
2160}
2161
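/*
 * Each MCE bank occupies four u64 slots in vcpu->arch.mce_banks, in the
 * order CTL, STATUS, ADDR, MISC (mirroring the IA32_MCi_* MSR layout),
 * hence the bank * 4 indexing below.
 */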
2162static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2163                                      struct kvm_x86_mce *mce)
2164{
2165        u64 mcg_cap = vcpu->arch.mcg_cap;
2166        unsigned bank_num = mcg_cap & 0xff;
2167        u64 *banks = vcpu->arch.mce_banks;
2168
2169        if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2170                return -EINVAL;
2171        /*
2172         * if IA32_MCG_CTL is not all 1s, the uncorrected error
2173         * reporting is disabled
2174         */
2175        if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2176            vcpu->arch.mcg_ctl != ~(u64)0)
2177                return 0;
2178        banks += 4 * mce->bank;
2179        /*
2180         * if IA32_MCi_CTL is not all 1s, the uncorrected error
2181         * reporting is disabled for the bank
2182         */
2183        if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2184                return 0;
2185        if (mce->status & MCI_STATUS_UC) {
2186                if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2187                    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2188                        printk(KERN_DEBUG "kvm: set_mce: "
2189                               "injects mce exception while "
2190                               "previous one is in progress!\n");
2191                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
2192                        return 0;
2193                }
2194                if (banks[1] & MCI_STATUS_VAL)
2195                        mce->status |= MCI_STATUS_OVER;
2196                banks[2] = mce->addr;
2197                banks[3] = mce->misc;
2198                vcpu->arch.mcg_status = mce->mcg_status;
2199                banks[1] = mce->status;
2200                kvm_queue_exception(vcpu, MC_VECTOR);
2201        } else if (!(banks[1] & MCI_STATUS_VAL)
2202                   || !(banks[1] & MCI_STATUS_UC)) {
2203                if (banks[1] & MCI_STATUS_VAL)
2204                        mce->status |= MCI_STATUS_OVER;
2205                banks[2] = mce->addr;
2206                banks[3] = mce->misc;
2207                banks[1] = mce->status;
2208        } else
2209                banks[1] |= MCI_STATUS_OVER;
2210        return 0;
2211}
2212
2213static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2214                                               struct kvm_vcpu_events *events)
2215{
2216        vcpu_load(vcpu);
2217
2218        events->exception.injected =
2219                vcpu->arch.exception.pending &&
2220                !kvm_exception_is_soft(vcpu->arch.exception.nr);
2221        events->exception.nr = vcpu->arch.exception.nr;
2222        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2223        events->exception.error_code = vcpu->arch.exception.error_code;
2224
2225        events->interrupt.injected =
2226                vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2227        events->interrupt.nr = vcpu->arch.interrupt.nr;
2228        events->interrupt.soft = 0;
2229        events->interrupt.shadow =
2230                kvm_x86_ops->get_interrupt_shadow(vcpu,
2231                        KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2232
2233        events->nmi.injected = vcpu->arch.nmi_injected;
2234        events->nmi.pending = vcpu->arch.nmi_pending;
2235        events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2236
2237        events->sipi_vector = vcpu->arch.sipi_vector;
2238
2239        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2240                         | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241                         | KVM_VCPUEVENT_VALID_SHADOW);
2242
2243        vcpu_put(vcpu);
2244}
2245
2246static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2247                                              struct kvm_vcpu_events *events)
2248{
2249        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2250                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2251                              | KVM_VCPUEVENT_VALID_SHADOW))
2252                return -EINVAL;
2253
2254        vcpu_load(vcpu);
2255
2256        vcpu->arch.exception.pending = events->exception.injected;
2257        vcpu->arch.exception.nr = events->exception.nr;
2258        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2259        vcpu->arch.exception.error_code = events->exception.error_code;
2260
2261        vcpu->arch.interrupt.pending = events->interrupt.injected;
2262        vcpu->arch.interrupt.nr = events->interrupt.nr;
2263        vcpu->arch.interrupt.soft = events->interrupt.soft;
2264        if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2265                kvm_pic_clear_isr_ack(vcpu->kvm);
2266        if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2267                kvm_x86_ops->set_interrupt_shadow(vcpu,
2268                                                  events->interrupt.shadow);
2269
2270        vcpu->arch.nmi_injected = events->nmi.injected;
2271        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2272                vcpu->arch.nmi_pending = events->nmi.pending;
2273        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2274
2275        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2276                vcpu->arch.sipi_vector = events->sipi_vector;
2277
2278        vcpu_put(vcpu);
2279
2280        return 0;
2281}
2282
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284                                             struct kvm_debugregs *dbgregs)
2285{
2286        vcpu_load(vcpu);
2287
2288        memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289        dbgregs->dr6 = vcpu->arch.dr6;
2290        dbgregs->dr7 = vcpu->arch.dr7;
2291        dbgregs->flags = 0;
2292
2293        vcpu_put(vcpu);
2294}
2295
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2297                                            struct kvm_debugregs *dbgregs)
2298{
2299        if (dbgregs->flags)
2300                return -EINVAL;
2301
2302        vcpu_load(vcpu);
2303
2304        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305        vcpu->arch.dr6 = dbgregs->dr6;
2306        vcpu->arch.dr7 = dbgregs->dr7;
2307
2308        vcpu_put(vcpu);
2309
2310        return 0;
2311}
2312
2313long kvm_arch_vcpu_ioctl(struct file *filp,
2314                         unsigned int ioctl, unsigned long arg)
2315{
2316        struct kvm_vcpu *vcpu = filp->private_data;
2317        void __user *argp = (void __user *)arg;
2318        int r;
2319        struct kvm_lapic_state *lapic = NULL;
2320
2321        switch (ioctl) {
2322        case KVM_GET_LAPIC: {
2323                r = -EINVAL;
2324                if (!vcpu->arch.apic)
2325                        goto out;
2326                lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2327
2328                r = -ENOMEM;
2329                if (!lapic)
2330                        goto out;
2331                r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
2332                if (r)
2333                        goto out;
2334                r = -EFAULT;
2335                if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
2336                        goto out;
2337                r = 0;
2338                break;
2339        }
2340        case KVM_SET_LAPIC: {
2341                r = -EINVAL;
2342                if (!vcpu->arch.apic)
2343                        goto out;
2344                lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2345                r = -ENOMEM;
2346                if (!lapic)
2347                        goto out;
2348                r = -EFAULT;
2349                if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
2350                        goto out;
2351                r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
2352                if (r)
2353                        goto out;
2354                r = 0;
2355                break;
2356        }
2357        case KVM_INTERRUPT: {
2358                struct kvm_interrupt irq;
2359
2360                r = -EFAULT;
2361                if (copy_from_user(&irq, argp, sizeof irq))
2362                        goto out;
2363                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2364                if (r)
2365                        goto out;
2366                r = 0;
2367                break;
2368        }
2369        case KVM_NMI: {
2370                r = kvm_vcpu_ioctl_nmi(vcpu);
2371                if (r)
2372                        goto out;
2373                r = 0;
2374                break;
2375        }
2376        case KVM_SET_CPUID: {
2377                struct kvm_cpuid __user *cpuid_arg = argp;
2378                struct kvm_cpuid cpuid;
2379
2380                r = -EFAULT;
2381                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2382                        goto out;
2383                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2384                if (r)
2385                        goto out;
2386                break;
2387        }
2388        case KVM_SET_CPUID2: {
2389                struct kvm_cpuid2 __user *cpuid_arg = argp;
2390                struct kvm_cpuid2 cpuid;
2391
2392                r = -EFAULT;
2393                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2394                        goto out;
2395                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
2396                                              cpuid_arg->entries);
2397                if (r)
2398                        goto out;
2399                break;
2400        }
2401        case KVM_GET_CPUID2: {
2402                struct kvm_cpuid2 __user *cpuid_arg = argp;
2403                struct kvm_cpuid2 cpuid;
2404
2405                r = -EFAULT;
2406                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2407                        goto out;
2408                r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
2409                                              cpuid_arg->entries);
2410                if (r)
2411                        goto out;
2412                r = -EFAULT;
2413                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2414                        goto out;
2415                r = 0;
2416                break;
2417        }
2418        case KVM_GET_MSRS:
2419                r = msr_io(vcpu, argp, kvm_get_msr, 1);
2420                break;
2421        case KVM_SET_MSRS:
2422                r = msr_io(vcpu, argp, do_set_msr, 0);
2423                break;
2424        case KVM_TPR_ACCESS_REPORTING: {
2425                struct kvm_tpr_access_ctl tac;
2426
2427                r = -EFAULT;
2428                if (copy_from_user(&tac, argp, sizeof tac))
2429                        goto out;
2430                r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
2431                if (r)
2432                        goto out;
2433                r = -EFAULT;
2434                if (copy_to_user(argp, &tac, sizeof tac))
2435                        goto out;
2436                r = 0;
2437                break;
2438        }
2439        case KVM_SET_VAPIC_ADDR: {
2440                struct kvm_vapic_addr va;
2441
2442                r = -EINVAL;
2443                if (!irqchip_in_kernel(vcpu->kvm))
2444                        goto out;
2445                r = -EFAULT;
2446                if (copy_from_user(&va, argp, sizeof va))
2447                        goto out;
2448                r = 0;
2449                kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
2450                break;
2451        }
2452        case KVM_X86_SETUP_MCE: {
2453                u64 mcg_cap;
2454
2455                r = -EFAULT;
2456                if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
2457                        goto out;
2458                r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
2459                break;
2460        }
2461        case KVM_X86_SET_MCE: {
2462                struct kvm_x86_mce mce;
2463
2464                r = -EFAULT;
2465                if (copy_from_user(&mce, argp, sizeof mce))
2466                        goto out;
2467                vcpu_load(vcpu);
2468                r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469                vcpu_put(vcpu);
2470                break;
2471        }
2472        case KVM_GET_VCPU_EVENTS: {
2473                struct kvm_vcpu_events events;
2474
2475                kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2476
2477                r = -EFAULT;
2478                if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2479                        break;
2480                r = 0;
2481                break;
2482        }
2483        case KVM_SET_VCPU_EVENTS: {
2484                struct kvm_vcpu_events events;
2485
2486                r = -EFAULT;
2487                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2488                        break;
2489
2490                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2491                break;
2492        }
2493        case KVM_GET_DEBUGREGS: {
2494                struct kvm_debugregs dbgregs;
2495
2496                kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
2497
2498                r = -EFAULT;
2499                if (copy_to_user(argp, &dbgregs,
2500                                 sizeof(struct kvm_debugregs)))
2501                        break;
2502                r = 0;
2503                break;
2504        }
2505        case KVM_SET_DEBUGREGS: {
2506                struct kvm_debugregs dbgregs;
2507
2508                r = -EFAULT;
2509                if (copy_from_user(&dbgregs, argp,
2510                                   sizeof(struct kvm_debugregs)))
2511                        break;
2512
2513                r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514                break;
2515        }
2516        default:
2517                r = -EINVAL;
2518        }
2519out:
2520        kfree(lapic);
2521        return r;
2522}
2523
2524static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2525{
2526        int ret;
2527
2528        if (addr > (unsigned int)(-3 * PAGE_SIZE))
2529                return -EINVAL;
2530        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2531        return ret;
2532}
2533
2534static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
2535                                              u64 ident_addr)
2536{
2537        kvm->arch.ept_identity_map_addr = ident_addr;
2538        return 0;
2539}
2540
2541static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2542                                          u32 kvm_nr_mmu_pages)
2543{
2544        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2545                return -EINVAL;
2546
2547        mutex_lock(&kvm->slots_lock);
2548        spin_lock(&kvm->mmu_lock);
2549
2550        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2551        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2552
2553        spin_unlock(&kvm->mmu_lock);
2554        mutex_unlock(&kvm->slots_lock);
2555        return 0;
2556}
2557
2558static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2559{
2560        return kvm->arch.n_alloc_mmu_pages;
2561}
2562
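/*
 * unalias_gfn_instantiation() skips aliases marked KVM_ALIAS_INVALID so
 * that no new translations are instantiated through an alias that is
 * being torn down; unalias_gfn() below translates through all aliases.
 */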
2563gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2564{
2565        int i;
2566        struct kvm_mem_alias *alias;
2567        struct kvm_mem_aliases *aliases;
2568
2569        aliases = kvm_aliases(kvm);
2570
2571        for (i = 0; i < aliases->naliases; ++i) {
2572                alias = &aliases->aliases[i];
2573                if (alias->flags & KVM_ALIAS_INVALID)
2574                        continue;
2575                if (gfn >= alias->base_gfn
2576                    && gfn < alias->base_gfn + alias->npages)
2577                        return alias->target_gfn + gfn - alias->base_gfn;
2578        }
2579        return gfn;
2580}
2581
2582gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2583{
2584        int i;
2585        struct kvm_mem_alias *alias;
2586        struct kvm_mem_aliases *aliases;
2587
2588        aliases = kvm_aliases(kvm);
2589
2590        for (i = 0; i < aliases->naliases; ++i) {
2591                alias = &aliases->aliases[i];
2592                if (gfn >= alias->base_gfn
2593                    && gfn < alias->base_gfn + alias->npages)
2594                        return alias->target_gfn + gfn - alias->base_gfn;
2595        }
2596        return gfn;
2597}
2598
2599/*
2600 * Set a new alias region.  Aliases map a portion of physical memory into
2601 * another portion.  This is useful for memory windows, for example the PC
2602 * VGA region.
2603 */
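/*
 * The alias table is updated in two RCU publish steps: first a copy with
 * the affected slot marked KVM_ALIAS_INVALID is installed and the MMU is
 * zapped, so stale gfn translations disappear; then a second copy with
 * the new mapping is published.
 */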
2604static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2605                                         struct kvm_memory_alias *alias)
2606{
2607        int r, n;
2608        struct kvm_mem_alias *p;
2609        struct kvm_mem_aliases *aliases, *old_aliases;
2610
2611        r = -EINVAL;
2612        /* General sanity checks */
2613        if (alias->memory_size & (PAGE_SIZE - 1))
2614                goto out;
2615        if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2616                goto out;
2617        if (alias->slot >= KVM_ALIAS_SLOTS)
2618                goto out;
2619        if (alias->guest_phys_addr + alias->memory_size
2620            < alias->guest_phys_addr)
2621                goto out;
2622        if (alias->target_phys_addr + alias->memory_size
2623            < alias->target_phys_addr)
2624                goto out;
2625
2626        r = -ENOMEM;
2627        aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2628        if (!aliases)
2629                goto out;
2630
2631        mutex_lock(&kvm->slots_lock);
2632
2633        /* invalidate any gfn reference in case of deletion/shrinking */
2634        memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2635        aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2636        old_aliases = kvm->arch.aliases;
2637        rcu_assign_pointer(kvm->arch.aliases, aliases);
2638        synchronize_srcu_expedited(&kvm->srcu);
2639        kvm_mmu_zap_all(kvm);
2640        kfree(old_aliases);
2641
2642        r = -ENOMEM;
2643        aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2644        if (!aliases)
2645                goto out_unlock;
2646
2647        memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2648
2649        p = &aliases->aliases[alias->slot];
2650        p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2651        p->npages = alias->memory_size >> PAGE_SHIFT;
2652        p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2653        p->flags &= ~(KVM_ALIAS_INVALID);
2654
2655        for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2656                if (aliases->aliases[n - 1].npages)
2657                        break;
2658        aliases->naliases = n;
2659
2660        old_aliases = kvm->arch.aliases;
2661        rcu_assign_pointer(kvm->arch.aliases, aliases);
2662        synchronize_srcu_expedited(&kvm->srcu);
2663        kfree(old_aliases);
2664        r = 0;
2665
2666out_unlock:
2667        mutex_unlock(&kvm->slots_lock);
2668out:
2669        return r;
2670}
2671
2672static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2673{
2674        int r;
2675
2676        r = 0;
2677        switch (chip->chip_id) {
2678        case KVM_IRQCHIP_PIC_MASTER:
2679                memcpy(&chip->chip.pic,
2680                        &pic_irqchip(kvm)->pics[0],
2681                        sizeof(struct kvm_pic_state));
2682                break;
2683        case KVM_IRQCHIP_PIC_SLAVE:
2684                memcpy(&chip->chip.pic,
2685                        &pic_irqchip(kvm)->pics[1],
2686                        sizeof(struct kvm_pic_state));
2687                break;
2688        case KVM_IRQCHIP_IOAPIC:
2689                r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2690                break;
2691        default:
2692                r = -EINVAL;
2693                break;
2694        }
2695        return r;
2696}
2697
2698static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2699{
2700        int r;
2701
2702        r = 0;
2703        switch (chip->chip_id) {
2704        case KVM_IRQCHIP_PIC_MASTER:
2705                raw_spin_lock(&pic_irqchip(kvm)->lock);
2706                memcpy(&pic_irqchip(kvm)->pics[0],
2707                        &chip->chip.pic,
2708                        sizeof(struct kvm_pic_state));
2709                raw_spin_unlock(&pic_irqchip(kvm)->lock);
2710                break;
2711        case KVM_IRQCHIP_PIC_SLAVE:
2712                raw_spin_lock(&pic_irqchip(kvm)->lock);
2713                memcpy(&pic_irqchip(kvm)->pics[1],
2714                        &chip->chip.pic,
2715                        sizeof(struct kvm_pic_state));
2716                raw_spin_unlock(&pic_irqchip(kvm)->lock);
2717                break;
2718        case KVM_IRQCHIP_IOAPIC:
2719                r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2720                break;
2721        default:
2722                r = -EINVAL;
2723                break;
2724        }
2725        kvm_pic_update_irq(pic_irqchip(kvm));
2726        return r;
2727}
2728
2729static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2730{
2731        int r = 0;
2732
2733        mutex_lock(&kvm->arch.vpit->pit_state.lock);
2734        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2735        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2736        return r;
2737}
2738
2739static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2740{
2741        int r = 0;
2742
2743        mutex_lock(&kvm->arch.vpit->pit_state.lock);
2744        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
2745        kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2746        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2747        return r;
2748}
2749
2750static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2751{
2752        int r = 0;
2753
2754        mutex_lock(&kvm->arch.vpit->pit_state.lock);
2755        memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2756                sizeof(ps->channels));
2757        ps->flags = kvm->arch.vpit->pit_state.flags;
2758        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2759        return r;
2760}
2761
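/*
 * When userspace flips KVM_PIT_FLAGS_HPET_LEGACY from clear to set,
 * channel 0's count is reloaded via kvm_pit_load_count() with the start
 * flag set, presumably so the PIT timer is (re)armed for HPET legacy
 * replacement mode.
 */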
2762static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2763{
2764        int r = 0, start = 0;
2765        u32 prev_legacy, cur_legacy;
2766        mutex_lock(&kvm->arch.vpit->pit_state.lock);
2767        prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2768        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2769        if (!prev_legacy && cur_legacy)
2770                start = 1;
2771        memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2772               sizeof(kvm->arch.vpit->pit_state.channels));
2773        kvm->arch.vpit->pit_state.flags = ps->flags;
2774        kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2775        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2776        return r;
2777}
2778
2779static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2780                                 struct kvm_reinject_control *control)
2781{
2782        if (!kvm->arch.vpit)
2783                return -ENXIO;
2784        mutex_lock(&kvm->arch.vpit->pit_state.lock);
2785        kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2786        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2787        return 0;
2788}
2789
2790/*
2791 * Get (and clear) the dirty memory log for a memory slot.
2792 */
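/*
 * "Clearing" is done by publishing a new memslots copy whose dirty_bitmap
 * points at a freshly zeroed buffer; after synchronize_srcu_expedited()
 * the old bitmap is copied out to userspace and freed.
 */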
2793int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2794                                      struct kvm_dirty_log *log)
2795{
2796        int r, i;
2797        struct kvm_memory_slot *memslot;
2798        unsigned long n;
2799        unsigned long is_dirty = 0;
2800        unsigned long *dirty_bitmap = NULL;
2801
2802        mutex_lock(&kvm->slots_lock);
2803
2804        r = -EINVAL;
2805        if (log->slot >= KVM_MEMORY_SLOTS)
2806                goto out;
2807
2808        memslot = &kvm->memslots->memslots[log->slot];
2809        r = -ENOENT;
2810        if (!memslot->dirty_bitmap)
2811                goto out;
2812
2813        n = kvm_dirty_bitmap_bytes(memslot);
2814
2815        r = -ENOMEM;
2816        dirty_bitmap = vmalloc(n);
2817        if (!dirty_bitmap)
2818                goto out;
2819        memset(dirty_bitmap, 0, n);
2820
2821        for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2822                is_dirty = memslot->dirty_bitmap[i];
2823
2824        /* If nothing is dirty, don't bother messing with page tables. */
2825        if (is_dirty) {
2826                struct kvm_memslots *slots, *old_slots;
2827
2828                spin_lock(&kvm->mmu_lock);
2829                kvm_mmu_slot_remove_write_access(kvm, log->slot);
2830                spin_unlock(&kvm->mmu_lock);
2831
2832                slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2833                if (!slots)
2834                        goto out_free;
2835
2836                memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2837                slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2838
2839                old_slots = kvm->memslots;
2840                rcu_assign_pointer(kvm->memslots, slots);
2841                synchronize_srcu_expedited(&kvm->srcu);
2842                dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2843                kfree(old_slots);
2844        }
2845
2846        r = 0;
2847        if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2848                r = -EFAULT;
2849out_free:
2850        vfree(dirty_bitmap);
2851out:
2852        mutex_unlock(&kvm->slots_lock);
2853        return r;
2854}
2855
2856long kvm_arch_vm_ioctl(struct file *filp,
2857                       unsigned int ioctl, unsigned long arg)
2858{
2859        struct kvm *kvm = filp->private_data;
2860        void __user *argp = (void __user *)arg;
2861        int r = -ENOTTY;
2862        /*
2863         * This union makes it completely explicit to gcc-3.x
2864         * that these two variables' stack usage should be
2865         * combined, not added together.
2866         */
2867        union {
2868                struct kvm_pit_state ps;
2869                struct kvm_pit_state2 ps2;
2870                struct kvm_memory_alias alias;
2871                struct kvm_pit_config pit_config;
2872        } u;
2873
2874        switch (ioctl) {
2875        case KVM_SET_TSS_ADDR:
2876                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
2877                if (r < 0)
2878                        goto out;
2879                break;
2880        case KVM_SET_IDENTITY_MAP_ADDR: {
2881                u64 ident_addr;
2882
2883                r = -EFAULT;
2884                if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2885                        goto out;
2886                r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2887                if (r < 0)
2888                        goto out;
2889                break;
2890        }
2891        case KVM_SET_MEMORY_REGION: {
2892                struct kvm_memory_region kvm_mem;
2893                struct kvm_userspace_memory_region kvm_userspace_mem;
2894
2895                r = -EFAULT;
2896                if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2897                        goto out;
2898                kvm_userspace_mem.slot = kvm_mem.slot;
2899                kvm_userspace_mem.flags = kvm_mem.flags;
2900                kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2901                kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2902                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2903                if (r)
2904                        goto out;
2905                break;
2906        }
2907        case KVM_SET_NR_MMU_PAGES:
2908                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2909                if (r)
2910                        goto out;
2911                break;
2912        case KVM_GET_NR_MMU_PAGES:
2913                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2914                break;
2915        case KVM_SET_MEMORY_ALIAS:
2916                r = -EFAULT;
2917                if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2918                        goto out;
2919                r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2920                if (r)
2921                        goto out;
2922                break;
2923        case KVM_CREATE_IRQCHIP: {
2924                struct kvm_pic *vpic;
2925
2926                mutex_lock(&kvm->lock);
2927                r = -EEXIST;
2928                if (kvm->arch.vpic)
2929                        goto create_irqchip_unlock;
2930                r = -ENOMEM;
2931                vpic = kvm_create_pic(kvm);
2932                if (vpic) {
2933                        r = kvm_ioapic_init(kvm);
2934                        if (r) {
2935                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
2936                                                          &vpic->dev);
2937                                kfree(vpic);
2938                                goto create_irqchip_unlock;
2939                        }
2940                } else
2941                        goto create_irqchip_unlock;
2942                smp_wmb();
2943                kvm->arch.vpic = vpic;
2944                smp_wmb();
2945                r = kvm_setup_default_irq_routing(kvm);
2946                if (r) {
2947                        mutex_lock(&kvm->irq_lock);
2948                        kvm_ioapic_destroy(kvm);
2949                        kvm_destroy_pic(kvm);
2950                        mutex_unlock(&kvm->irq_lock);
2951                }
2952        create_irqchip_unlock:
2953                mutex_unlock(&kvm->lock);
2954                break;
2955        }
2956        case KVM_CREATE_PIT:
2957                u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2958                goto create_pit;
2959        case KVM_CREATE_PIT2:
2960                r = -EFAULT;
2961                if (copy_from_user(&u.pit_config, argp,
2962                                   sizeof(struct kvm_pit_config)))
2963                        goto out;
2964        create_pit:
2965                mutex_lock(&kvm->slots_lock);
2966                r = -EEXIST;
2967                if (kvm->arch.vpit)
2968                        goto create_pit_unlock;
2969                r = -ENOMEM;
2970                kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
2971                if (kvm->arch.vpit)
2972                        r = 0;
2973        create_pit_unlock:
2974                mutex_unlock(&kvm->slots_lock);
2975                break;
2976        case KVM_IRQ_LINE_STATUS:
2977        case KVM_IRQ_LINE: {
2978                struct kvm_irq_level irq_event;
2979
2980                r = -EFAULT;
2981                if (copy_from_user(&irq_event, argp, sizeof irq_event))
2982                        goto out;
2983                r = -ENXIO;
2984                if (irqchip_in_kernel(kvm)) {
2985                        __s32 status;
2986                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2987                                        irq_event.irq, irq_event.level);
2988                        if (ioctl == KVM_IRQ_LINE_STATUS) {
2989                                r = -EFAULT;
2990                                irq_event.status = status;
2991                                if (copy_to_user(argp, &irq_event,
2992                                                        sizeof irq_event))
2993                                        goto out;
2994                        }
2995                        r = 0;
2996                }
2997                break;
2998        }
2999        case KVM_GET_IRQCHIP: {
3000                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3001                struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
3002
3003                r = -ENOMEM;
3004                if (!chip)
3005                        goto out;
3006                r = -EFAULT;
3007                if (copy_from_user(chip, argp, sizeof *chip))
3008                        goto get_irqchip_out;
3009                r = -ENXIO;
3010                if (!irqchip_in_kernel(kvm))
3011                        goto get_irqchip_out;
3012                r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3013                if (r)
3014                        goto get_irqchip_out;
3015                r = -EFAULT;
3016                if (copy_to_user(argp, chip, sizeof *chip))
3017                        goto get_irqchip_out;
3018                r = 0;
3019        get_irqchip_out:
3020                kfree(chip);
3021                if (r)
3022                        goto out;
3023                break;
3024        }
3025        case KVM_SET_IRQCHIP: {
3026                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3027                struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
3028
3029                r = -ENOMEM;
3030                if (!chip)
3031                        goto out;
3032                r = -EFAULT;
3033                if (copy_from_user(chip, argp, sizeof *chip))
3034                        goto set_irqchip_out;
3035                r = -ENXIO;
3036                if (!irqchip_in_kernel(kvm))
3037                        goto set_irqchip_out;
3038                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3039                if (r)
3040                        goto set_irqchip_out;
3041                r = 0;
3042        set_irqchip_out:
3043                kfree(chip);
3044                if (r)
3045                        goto out;
3046                break;
3047        }
3048        case KVM_GET_PIT: {
3049                r = -EFAULT;
3050                if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3051                        goto out;
3052                r = -ENXIO;
3053                if (!kvm->arch.vpit)
3054                        goto out;
3055                r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3056                if (r)
3057                        goto out;
3058                r = -EFAULT;
3059                if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3060                        goto out;
3061                r = 0;
3062                break;
3063        }
3064        case KVM_SET_PIT: {
3065                r = -EFAULT;
3066                if (copy_from_user(&u.ps, argp, sizeof u.ps))
3067                        goto out;
3068                r = -ENXIO;
3069                if (!kvm->arch.vpit)
3070                        goto out;
3071                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3072                if (r)
3073                        goto out;
3074                r = 0;
3075                break;
3076        }
3077        case KVM_GET_PIT2: {
3078                r = -ENXIO;
3079                if (!kvm->arch.vpit)
3080                        goto out;
3081                r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
3082                if (r)
3083                        goto out;
3084                r = -EFAULT;
3085                if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
3086                        goto out;
3087                r = 0;
3088                break;
3089        }
3090        case KVM_SET_PIT2: {
3091                r = -EFAULT;
3092                if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
3093                        goto out;
3094                r = -ENXIO;
3095                if (!kvm->arch.vpit)
3096                        goto out;
3097                r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3098                if (r)
3099                        goto out;
3100                r = 0;
3101                break;
3102        }
3103        case KVM_REINJECT_CONTROL: {
3104                struct kvm_reinject_control control;
3105                r = -EFAULT;
3106                if (copy_from_user(&control, argp, sizeof(control)))
3107                        goto out;
3108                r = kvm_vm_ioctl_reinject(kvm, &control);
3109                if (r)
3110                        goto out;
3111                r = 0;
3112                break;
3113        }
3114        case KVM_XEN_HVM_CONFIG: {
3115                r = -EFAULT;
3116                if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
3117                                   sizeof(struct kvm_xen_hvm_config)))
3118                        goto out;
3119                r = -EINVAL;
3120                if (kvm->arch.xen_hvm_config.flags)
3121                        goto out;
3122                r = 0;
3123                break;
3124        }
3125        case KVM_SET_CLOCK: {
3126                struct timespec now;
3127                struct kvm_clock_data user_ns;
3128                u64 now_ns;
3129                s64 delta;
3130
3131                r = -EFAULT;
3132                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
3133                        goto out;
3134
3135                r = -EINVAL;
3136                if (user_ns.flags)
3137                        goto out;
3138
3139                r = 0;
3140                ktime_get_ts(&now);
3141                now_ns = timespec_to_ns(&now);
3142                delta = user_ns.clock - now_ns;
3143                kvm->arch.kvmclock_offset = delta;
3144                break;
3145        }
3146        case KVM_GET_CLOCK: {
3147                struct timespec now;
3148                struct kvm_clock_data user_ns;
3149                u64 now_ns;
3150
3151                ktime_get_ts(&now);
3152                now_ns = timespec_to_ns(&now);
3153                user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3154                user_ns.flags = 0;
3155
3156                r = -EFAULT;
3157                if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
3158                        goto out;
3159                r = 0;
3160                break;
3161        }
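        /*
         * Note on the two clock ioctls above: KVM_SET_CLOCK stores the
         * signed delta between the clock value supplied by userspace and
         * the host's monotonic clock in kvm->arch.kvmclock_offset, and
         * KVM_GET_CLOCK returns host monotonic time plus that offset.
         * Illustrative userspace usage (vm_fd and guest_ns are
         * hypothetical, error handling omitted):
         *
         *      struct kvm_clock_data data = { .clock = guest_ns };
         *      ioctl(vm_fd, KVM_SET_CLOCK, &data);
         *      ioctl(vm_fd, KVM_GET_CLOCK, &data);
         */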
3162
3163        default:
3164                ;
3165        }
3166out:
3167        return r;
3168}
3169
3170static void kvm_init_msr_list(void)
3171{
3172        u32 dummy[2];
3173        unsigned i, j;
3174
3175        /* skip the first MSRs in the list; they are KVM-specific */
3176        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3177                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
3178                        continue;
3179                if (j < i)
3180                        msrs_to_save[j] = msrs_to_save[i];
3181                j++;
3182        }
3183        num_msrs_to_save = j;
3184}
3185
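/*
 * MMIO dispatch helpers: an access is first offered to the vcpu's in-kernel
 * local APIC (if present); anything the APIC does not claim is forwarded to
 * the devices registered on KVM_MMIO_BUS.
 */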
3186static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3187                           const void *v)
3188{
3189        if (vcpu->arch.apic &&
3190            !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
3191                return 0;
3192
3193        return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3194}
3195
3196static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3197{
3198        if (vcpu->arch.apic &&
3199            !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
3200                return 0;
3201
3202        return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3203}
3204
3205static void kvm_set_segment(struct kvm_vcpu *vcpu,
3206                        struct kvm_segment *var, int seg)
3207{
3208        kvm_x86_ops->set_segment(vcpu, var, seg);
3209}
3210
3211void kvm_get_segment(struct kvm_vcpu *vcpu,
3212                     struct kvm_segment *var, int seg)
3213{
3214        kvm_x86_ops->get_segment(vcpu, var, seg);
3215}
3216
3217gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3218{
3219        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3220        return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3221}
3222
3223gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3224{
3225        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3226        access |= PFERR_FETCH_MASK;
3227        return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3228}
3229
3230gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3231{
3232        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3233        access |= PFERR_WRITE_MASK;
3234        return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3235}
3236
3237/* used to access any guest-mapped memory without checking CPL */
3238gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3239{
3240        return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
3241}
3242
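/*
 * Copy 'bytes' of guest-virtual memory into 'val', splitting the access at
 * page boundaries.  Each page is translated with gva_to_gpa() using the
 * caller-supplied PFERR_* access mask; an unmapped GVA propagates a fault,
 * and a failing kvm_read_guest() is reported as unhandleable.
 */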
3243static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3244                                      struct kvm_vcpu *vcpu, u32 access,
3245                                      u32 *error)
3246{
3247        void *data = val;
3248        int r = X86EMUL_CONTINUE;
3249
3250        while (bytes) {
3251                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
3252                unsigned offset = addr & (PAGE_SIZE-1);
3253                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3254                int ret;
3255
3256                if (gpa == UNMAPPED_GVA) {
3257                        r = X86EMUL_PROPAGATE_FAULT;
3258                        goto out;
3259                }
3260                ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3261                if (ret < 0) {
3262                        r = X86EMUL_UNHANDLEABLE;
3263                        goto out;
3264                }
3265
3266                bytes -= toread;
3267                data += toread;
3268                addr += toread;
3269        }
3270out:
3271        return r;
3272}
3273
3274/* used for instruction fetching */
3275static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3276                                struct kvm_vcpu *vcpu, u32 *error)
3277{
3278        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3279        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3280                                          access | PFERR_FETCH_MASK, error);
3281}
3282
3283static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3284                               struct kvm_vcpu *vcpu, u32 *error)
3285{
3286        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3287        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3288                                          error);
3289}
3290
3291static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3292                               struct kvm_vcpu *vcpu, u32 *error)
3293{
3294        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3295}
3296
3297static int kvm_write_guest_virt_system(gva_t addr, void *val,
3298                                       unsigned int bytes,
3299                                       struct kvm_vcpu *vcpu,
3300                                       u32 *error)
3301{
3302        void *data = val;
3303        int r = X86EMUL_CONTINUE;
3304
3305        while (bytes) {
3306                gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
3307                                                       PFERR_WRITE_MASK, error);
3308                unsigned offset = addr & (PAGE_SIZE-1);
3309                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3310                int ret;
3311
3312                if (gpa == UNMAPPED_GVA) {
3313                        r = X86EMUL_PROPAGATE_FAULT;
3314                        goto out;
3315                }
3316                ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3317                if (ret < 0) {
3318                        r = X86EMUL_UNHANDLEABLE;
3319                        goto out;
3320                }
3321
3322                bytes -= towrite;
3323                data += towrite;
3324                addr += towrite;
3325        }
3326out:
3327        return r;
3328}
3329
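/*
 * Emulated read path: a read just completed by userspace is satisfied from
 * vcpu->mmio_data; otherwise the GVA is translated and read through
 * kvm_read_guest_virt().  Accesses to the APIC page or to unbacked
 * addresses go to the in-kernel MMIO bus, and if no device claims them the
 * MMIO state is recorded so the exit can be handed to userspace.
 */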
3330static int emulator_read_emulated(unsigned long addr,
3331                                  void *val,
3332                                  unsigned int bytes,
3333                                  struct kvm_vcpu *vcpu)
3334{
3335        gpa_t                 gpa;
3336        u32 error_code;
3337
3338        if (vcpu->mmio_read_completed) {
3339                memcpy(val, vcpu->mmio_data, bytes);
3340                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
3341                               vcpu->mmio_phys_addr, *(u64 *)val);
3342                vcpu->mmio_read_completed = 0;
3343                return X86EMUL_CONTINUE;
3344        }
3345
3346        gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
3347
3348        if (gpa == UNMAPPED_GVA) {
3349                kvm_inject_page_fault(vcpu, addr, error_code);
3350                return X86EMUL_PROPAGATE_FAULT;
3351        }
3352
3353        /* For APIC access vmexit */
3354        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3355                goto mmio;
3356
3357        if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
3358                                == X86EMUL_CONTINUE)
3359                return X86EMUL_CONTINUE;
3360
3361mmio:
3362        /*
3363         * Is this MMIO handled locally?
3364         */
3365        if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
3366                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
3367                return X86EMUL_CONTINUE;
3368        }
3369
3370        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3371
3372        vcpu->mmio_needed = 1;
3373        vcpu->mmio_phys_addr = gpa;
3374        vcpu->mmio_size = bytes;
3375        vcpu->mmio_is_write = 0;
3376
3377        return X86EMUL_UNHANDLEABLE;
3378}
3379
3380int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3381                          const void *val, int bytes)
3382{
3383        int ret;
3384
3385        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
3386        if (ret < 0)
3387                return 0;
3388        kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
3389        return 1;
3390}
3391
3392static int emulator_write_emulated_onepage(unsigned long addr,
3393                                           const void *val,
3394                                           unsigned int bytes,
3395                                           struct kvm_vcpu *vcpu)
3396{
3397        gpa_t                 gpa;
3398        u32 error_code;
3399
3400        gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
3401
3402        if (gpa == UNMAPPED_GVA) {
3403                kvm_inject_page_fault(vcpu, addr, error_code);
3404                return X86EMUL_PROPAGATE_FAULT;
3405        }
3406
3407        /* For APIC access vmexit */
3408        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3409                goto mmio;
3410
3411        if (emulator_write_phys(vcpu, gpa, val, bytes))
3412                return X86EMUL_CONTINUE;
3413
3414mmio:
3415        trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
3416        /*
3417         * Is this MMIO handled locally?
3418         */
3419        if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
3420                return X86EMUL_CONTINUE;
3421
3422        vcpu->mmio_needed = 1;
3423        vcpu->mmio_phys_addr = gpa;
3424        vcpu->mmio_size = bytes;
3425        vcpu->mmio_is_write = 1;
3426        memcpy(vcpu->mmio_data, val, bytes);
3427
3428        return X86EMUL_CONTINUE;
3429}
3430
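/*
 * A write that crosses a page boundary is split into two single-page
 * writes, since the two pages may translate to unrelated physical
 * addresses.
 */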
3431int emulator_write_emulated(unsigned long addr,
3432                            const void *val,
3433                            unsigned int bytes,
3434                            struct kvm_vcpu *vcpu)
3435{
3436        /* Crossing a page boundary? */
3437        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3438                int rc, now;
3439
3440                now = -addr & ~PAGE_MASK;
3441                rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
3442                if (rc != X86EMUL_CONTINUE)
3443                        return rc;
3444                addr += now;
3445                val += now;
3446                bytes -= now;
3447        }
3448        return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
3449}
3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3451
3452#define CMPXCHG_TYPE(t, ptr, old, new) \
3453        (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
3454
3455#ifdef CONFIG_X86_64
3456#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
3457#else
3458#  define CMPXCHG64(ptr, old, new) \
3459        (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3460#endif
3461
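/*
 * Atomic cmpxchg emulation: only power-of-two sizes up to 8 bytes that stay
 * within one page (and do not hit the APIC page) are performed atomically,
 * by kmapping the guest page and using a host cmpxchg via the macros above.
 * Anything else falls back to a plain emulated write, which is not atomic,
 * hence the printk_once() warning below.
 */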
3462static int emulator_cmpxchg_emulated(unsigned long addr,
3463                                     const void *old,
3464                                     const void *new,
3465                                     unsigned int bytes,
3466                                     struct kvm_vcpu *vcpu)
3467{
3468        gpa_t gpa;
3469        struct page *page;
3470        char *kaddr;
3471        bool exchanged;
3472
3473        /* a guest cmpxchg8b has to be emulated atomically */
3474        if (bytes > 8 || (bytes & (bytes - 1)))
3475                goto emul_write;
3476
3477        gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
3478
3479        if (gpa == UNMAPPED_GVA ||
3480            (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3481                goto emul_write;
3482
3483        if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
3484                goto emul_write;
3485
3486        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3487
3488        kaddr = kmap_atomic(page, KM_USER0);
3489        kaddr += offset_in_page(gpa);
3490        switch (bytes) {
3491        case 1:
3492                exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
3493                break;
3494        case 2:
3495                exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
3496                break;
3497        case 4:
3498                exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
3499                break;
3500        case 8:
3501                exchanged = CMPXCHG64(kaddr, old, new);
3502                break;
3503        default:
3504                BUG();
3505        }
3506        kunmap_atomic(kaddr, KM_USER0);
3507        kvm_release_page_dirty(page);
3508
3509        if (!exchanged)
3510                return X86EMUL_CMPXCHG_FAILED;
3511
3512        kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
3513
3514        return X86EMUL_CONTINUE;
3515
3516emul_write:
3517        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3518
3519        return emulator_write_emulated(addr, new, bytes, vcpu);
3520}
3521
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3523{
3524        /* TODO: String I/O for in-kernel devices */
3525        int r;
3526
3527        if (vcpu->arch.pio.in)
3528                r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3529                                    vcpu->arch.pio.size, pd);
3530        else
3531                r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3532                                     vcpu->arch.pio.port, vcpu->arch.pio.size,
3533                                     pd);
3534        return r;
3535}
3536
3537
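/*
 * PIO emulation: kernel_pio() first offers the access to in-kernel devices
 * on KVM_PIO_BUS.  If none handles it, the port, direction, size and count
 * are stashed in vcpu->arch.pio and a KVM_EXIT_IO exit is set up so that
 * userspace can complete the transfer through the shared pio_data page.
 */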
3538static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3539                             unsigned int count, struct kvm_vcpu *vcpu)
3540{
3541        if (vcpu->arch.pio.count)
3542                goto data_avail;
3543
3544        trace_kvm_pio(1, port, size, 1);
3545
3546        vcpu->arch.pio.port = port;
3547        vcpu->arch.pio.in = 1;
3548        vcpu->arch.pio.count = count;
3549        vcpu->arch.pio.size = size;
3550
3551        if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3552        data_avail:
3553                memcpy(val, vcpu->arch.pio_data, size * count);
3554                vcpu->arch.pio.count = 0;
3555                return 1;
3556        }
3557
3558        vcpu->run->exit_reason = KVM_EXIT_IO;
3559        vcpu->run->io.direction = KVM_EXIT_IO_IN;
3560        vcpu->run->io.size = size;
3561        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3562        vcpu->run->io.count = count;
3563        vcpu->run->io.port = port;
3564
3565        return 0;
3566}
3567
3568static int emulator_pio_out_emulated(int size, unsigned short port,
3569                              const void *val, unsigned int count,
3570                              struct kvm_vcpu *vcpu)
3571{
3572        trace_kvm_pio(0, port, size, 1);
3573
3574        vcpu->arch.pio.port = port;
3575        vcpu->arch.pio.in = 0;
3576        vcpu->arch.pio.count = count;
3577        vcpu->arch.pio.size = size;
3578
3579        memcpy(vcpu->arch.pio_data, val, size * count);
3580
3581        if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3582                vcpu->arch.pio.count = 0;
3583                return 1;
3584        }
3585
3586        vcpu->run->exit_reason = KVM_EXIT_IO;
3587        vcpu->run->io.direction = KVM_EXIT_IO_OUT;
3588        vcpu->run->io.size = size;
3589        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3590        vcpu->run->io.count = count;
3591        vcpu->run->io.port = port;
3592
3593        return 0;
3594}
3595
3596static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3597{
3598        return kvm_x86_ops->get_segment_base(vcpu, seg);
3599}
3600
3601int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
3602{
3603        kvm_mmu_invlpg(vcpu, address);
3604        return X86EMUL_CONTINUE;
3605}
3606
3607int emulate_clts(struct kvm_vcpu *vcpu)
3608{
3609        kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3610        kvm_x86_ops->fpu_activate(vcpu);
3611        return X86EMUL_CONTINUE;
3612}
3613
3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3615{
3616        return kvm_get_dr(ctxt->vcpu, dr, dest);
3617}
3618
3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3620{
3621        unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
3622
3623        return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3624}
3625
3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3627{
3628        u8 opcodes[4];
3629        unsigned long rip = kvm_rip_read(vcpu);
3630        unsigned long rip_linear;
3631
3632        if (!printk_ratelimit())
3633                return;
3634
3635        rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
3636
3637        kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3638
3639        printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3640               context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
3641}
3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3643
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{
3646        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3647}
3648
3649static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3650{
3651        unsigned long value;
3652
3653        switch (cr) {
3654        case 0:
3655                value = kvm_read_cr0(vcpu);
3656                break;
3657        case 2:
3658                value = vcpu->arch.cr2;
3659                break;
3660        case 3:
3661                value = vcpu->arch.cr3;
3662                break;
3663        case 4:
3664                value = kvm_read_cr4(vcpu);
3665                break;
3666        case 8:
3667                value = kvm_get_cr8(vcpu);
3668                break;
3669        default:
3670                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3671                return 0;
3672        }
3673
3674        return value;
3675}
3676
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{
3679        switch (cr) {
3680        case 0:
3681                kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682                break;
3683        case 2:
3684                vcpu->arch.cr2 = val;
3685                break;
3686        case 3:
3687                kvm_set_cr3(vcpu, val);
3688                break;
3689        case 4:
3690                kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691                break;
3692        case 8:
3693                kvm_set_cr8(vcpu, val & 0xfUL);
3694                break;
3695        default:
3696                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3697        }
3698}
3699
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu)
3701{
3702        return kvm_x86_ops->get_cpl(vcpu);
3703}
3704
3705static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3706{
3707        kvm_x86_ops->get_gdt(vcpu, dt);
3708}
3709
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711                                           struct kvm_vcpu *vcpu)
3712{
3713        struct kvm_segment var;
3714
3715        kvm_get_segment(vcpu, &var, seg);
3716
3717        if (var.unusable)
3718                return false;
3719
3720        if (var.g)
3721                var.limit >>= 12;
3722        set_desc_limit(desc, var.limit);
3723        set_desc_base(desc, (unsigned long)var.base);
3724        desc->type = var.type;
3725        desc->s = var.s;
3726        desc->dpl = var.dpl;
3727        desc->p = var.present;
3728        desc->avl = var.avl;
3729        desc->l = var.l;
3730        desc->d = var.db;
3731        desc->g = var.g;
3732
3733        return true;
3734}
3735
3736static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3737                                           struct kvm_vcpu *vcpu)
3738{
3739        struct kvm_segment var;
3740
3741        /* needed to preserve selector */
3742        kvm_get_segment(vcpu, &var, seg);
3743
3744        var.base = get_desc_base(desc);
3745        var.limit = get_desc_limit(desc);
3746        if (desc->g)
3747                var.limit = (var.limit << 12) | 0xfff;
3748        var.type = desc->type;
3749        var.present = desc->p;
3750        var.dpl = desc->dpl;
3751        var.db = desc->d;
3752        var.s = desc->s;
3753        var.l = desc->l;
3754        var.g = desc->g;
3755        var.avl = desc->avl;
3756        var.present = desc->p;
3757        var.unusable = !var.present;
3758        var.padding = 0;
3759
3760        kvm_set_segment(vcpu, &var, seg);
3761        return;
3762}
3763
3764static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
3765{
3766        struct kvm_segment kvm_seg;
3767
3768        kvm_get_segment(vcpu, &kvm_seg, seg);
3769        return kvm_seg.selector;
3770}
3771
3772static void emulator_set_segment_selector(u16 sel, int seg,
3773                                          struct kvm_vcpu *vcpu)
3774{
3775        struct kvm_segment kvm_seg;
3776
3777        kvm_get_segment(vcpu, &kvm_seg, seg);
3778        kvm_seg.selector = sel;
3779        kvm_set_segment(vcpu, &kvm_seg, seg);
3780}
3781
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784        kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3787static struct x86_emulate_ops emulate_ops = {
3788        .read_std            = kvm_read_guest_virt_system,
3789        .write_std           = kvm_write_guest_virt_system,
3790        .fetch               = kvm_fetch_guest_virt,
3791        .read_emulated       = emulator_read_emulated,
3792        .write_emulated      = emulator_write_emulated,
3793        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
3794        .pio_in_emulated     = emulator_pio_in_emulated,
3795        .pio_out_emulated    = emulator_pio_out_emulated,
3796        .get_cached_descriptor = emulator_get_cached_descriptor,
3797        .set_cached_descriptor = emulator_set_cached_descriptor,
3798        .get_segment_selector = emulator_get_segment_selector,
3799        .set_segment_selector = emulator_set_segment_selector,
3800        .get_gdt             = emulator_get_gdt,
3801        .get_cr              = emulator_get_cr,
3802        .set_cr              = emulator_set_cr,
3803        .cpl                 = emulator_get_cpl,
3804        .set_rflags          = emulator_set_rflags,
3805};
3806
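/*
 * Force RAX, RSP and RIP into the register cache and mark every register
 * dirty, so that the emulator's direct ->regs accesses (see the TODO in
 * emulate_instruction() below) see current values and anything it modifies
 * is written back.
 */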
3807static void cache_all_regs(struct kvm_vcpu *vcpu)
3808{
3809        kvm_register_read(vcpu, VCPU_REGS_RAX);
3810        kvm_register_read(vcpu, VCPU_REGS_RSP);
3811        kvm_register_read(vcpu, VCPU_REGS_RIP);
3812        vcpu->arch.regs_dirty = ~0;
3813}
3814
3815int emulate_instruction(struct kvm_vcpu *vcpu,
3816                        unsigned long cr2,
3817                        u16 error_code,
3818                        int emulation_type)
3819{
3820        int r, shadow_mask;
3821        struct decode_cache *c;
3822        struct kvm_run *run = vcpu->run;
3823
3824        kvm_clear_exception_queue(vcpu);
3825        vcpu->arch.mmio_fault_cr2 = cr2;
3826        /*
3827         * TODO: fix emulate.c to use guest_read/write_register
3828         * instead of direct ->regs accesses; this can save a few
3829         * hundred cycles on Intel for instructions that don't
3830         * read/change RSP, for example.
3831         */
3832        cache_all_regs(vcpu);
3833
3834        vcpu->mmio_is_write = 0;
3835
3836        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3837                int cs_db, cs_l;
3838                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3839
3840                vcpu->arch.emulate_ctxt.vcpu = vcpu;
3841                vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3842                vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3843                vcpu->arch.emulate_ctxt.mode =
3844                        (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3845                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3846                        ? X86EMUL_MODE_VM86 : cs_l
3847                        ? X86EMUL_MODE_PROT64 : cs_db
3848                        ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3849
3850                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851                trace_kvm_emulate_insn_start(vcpu);
3852
3853                /* Only allow emulation of specific instructions on #UD
3854                 * (namely VMMCALL, sysenter, sysexit, syscall) */
3855                c = &vcpu->arch.emulate_ctxt.decode;
3856                if (emulation_type & EMULTYPE_TRAP_UD) {
3857                        if (!c->twobyte)
3858                                return EMULATE_FAIL;
3859                        switch (c->b) {
3860                        case 0x01: /* VMMCALL */
3861                                if (c->modrm_mod != 3 || c->modrm_rm != 1)
3862                                        return EMULATE_FAIL;
3863                                break;
3864                        case 0x34: /* sysenter */
3865                        case 0x35: /* sysexit */
3866                                if (c->modrm_mod != 0 || c->modrm_rm != 0)
3867                                        return EMULATE_FAIL;
3868                                break;
3869                        case 0x05: /* syscall */
3870                                if (c->modrm_mod != 0 || c->modrm_rm != 0)
3871                                        return EMULATE_FAIL;
3872                                break;
3873                        default:
3874                                return EMULATE_FAIL;
3875                        }
3876
3877                        if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
3878                                return EMULATE_FAIL;
3879                }
3880
3881                ++vcpu->stat.insn_emulation;
3882                if (r) {
3883                        ++vcpu->stat.insn_emulation_fail;
3884                        trace_kvm_emulate_insn_failed(vcpu);
3885                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3886                                return EMULATE_DONE;
3887                        return EMULATE_FAIL;
3888                }
3889        }
3890
3891        if (emulation_type & EMULTYPE_SKIP) {
3892                kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
3893                return EMULATE_DONE;
3894        }
3895
3896restart:
3897        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3898        shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3899
3900        if (r == 0)
3901                kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3902
3903        if (vcpu->arch.pio.count) {
3904                if (!vcpu->arch.pio.in)
3905                        vcpu->arch.pio.count = 0;
3906                return EMULATE_DO_MMIO;
3907        }
3908
3909        if (r || vcpu->mmio_is_write) {
3910                run->exit_reason = KVM_EXIT_MMIO;
3911                run->mmio.phys_addr = vcpu->mmio_phys_addr;
3912                memcpy(run->mmio.data, vcpu->mmio_data, 8);
3913                run->mmio.len = vcpu->mmio_size;
3914                run->mmio.is_write = vcpu->mmio_is_write;
3915        }
3916
3917        if (r) {
3918                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3919                        goto done;
3920                if (!vcpu->mmio_needed) {
3921                        ++vcpu->stat.insn_emulation_fail;
3922                        trace_kvm_emulate_insn_failed(vcpu);
3923                        kvm_report_emulation_failure(vcpu, "mmio");
3924                        return EMULATE_FAIL;
3925                }
3926                return EMULATE_DO_MMIO;
3927        }
3928
3929        if (vcpu->mmio_is_write) {
3930                vcpu->mmio_needed = 0;
3931                return EMULATE_DO_MMIO;
3932        }
3933
3934done:
3935        if (vcpu->arch.exception.pending)
3936                vcpu->arch.emulate_ctxt.restart = false;
3937
3938        if (vcpu->arch.emulate_ctxt.restart)
3939                goto restart;
3940
3941        return EMULATE_DONE;
3942}
3943EXPORT_SYMBOL_GPL(emulate_instruction);
3944
3945int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
3946{
3947        unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3948        int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
3949        /* do not return to emulator after return from userspace */
3950        vcpu->arch.pio.count = 0;
3951        return ret;
3952}
3953EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
3954
3955static void bounce_off(void *info)
3956{
3957        /* nothing */
3958}
3959
3960static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3961                                     void *data)
3962{
3963        struct cpufreq_freqs *freq = data;
3964        struct kvm *kvm;
3965        struct kvm_vcpu *vcpu;
3966        int i, send_ipi = 0;
3967
3968        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3969                return 0;
3970        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3971                return 0;
3972        per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3973
3974        spin_lock(&kvm_lock);
3975        list_for_each_entry(kvm, &vm_list, vm_list) {
3976                kvm_for_each_vcpu(i, vcpu, kvm) {
3977                        if (vcpu->cpu != freq->cpu)
3978                                continue;
3979                        if (!kvm_request_guest_time_update(vcpu))
3980                                continue;
3981                        if (vcpu->cpu != smp_processor_id())
3982                                send_ipi++;
3983                }
3984        }
3985        spin_unlock(&kvm_lock);
3986
3987        if (freq->old < freq->new && send_ipi) {
3988                /*
3989                 * We upscale the frequency.  Must make sure the guest
3990                 * doesn't see old kvmclock values while running with
3991                 * the new frequency, otherwise we risk that the guest
3992                 * sees time go backwards.
3993                 *
3994                 * In case we update the frequency for another cpu
3995                 * (which might be in guest context) send an interrupt
3996                 * to kick the cpu out of guest context.  Next time
3997                 * guest context is entered kvmclock will be updated,
3998                 * so the guest will not see stale values.
3999                 */
4000                smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
4001        }
4002        return 0;
4003}
4004
4005static struct notifier_block kvmclock_cpufreq_notifier_block = {
4006        .notifier_call  = kvmclock_cpufreq_notifier
4007};
4008
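/*
 * Seed the per-cpu TSC frequency used by kvmclock: hosts without a constant
 * TSC register the cpufreq transition notifier and query cpufreq_get() for
 * each online CPU, while hosts with a constant TSC simply use tsc_khz.
 */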
4009static void kvm_timer_init(void)
4010{
4011        int cpu;
4012
4013        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4014                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4015                                          CPUFREQ_TRANSITION_NOTIFIER);
4016                for_each_online_cpu(cpu) {
4017                        unsigned long khz = cpufreq_get(cpu);
4018                        if (!khz)
4019                                khz = tsc_khz;
4020                        per_cpu(cpu_tsc_khz, cpu) = khz;
4021                }
4022        } else {
4023                for_each_possible_cpu(cpu)
4024                        per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
4025        }
4026}
4027
4028static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
4029
4030static int kvm_is_in_guest(void)
4031{
4032        return percpu_read(current_vcpu) != NULL;
4033}
4034
4035static int kvm_is_user_mode(void)
4036{
4037        int user_mode = 3;
4038
4039        if (percpu_read(current_vcpu))
4040                user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
4041
4042        return user_mode != 0;
4043}
4044
4045static unsigned long kvm_get_guest_ip(void)
4046{
4047        unsigned long ip = 0;
4048
4049        if (percpu_read(current_vcpu))
4050                ip = kvm_rip_read(percpu_read(current_vcpu));
4051
4052        return ip;
4053}
4054
4055static struct perf_guest_info_callbacks kvm_guest_cbs = {
4056        .is_in_guest            = kvm_is_in_guest,
4057        .is_user_mode           = kvm_is_user_mode,
4058        .get_guest_ip           = kvm_get_guest_ip,
4059};
4060
4061void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
4062{
4063        percpu_write(current_vcpu, vcpu);
4064}
4065EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
4066
4067void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
4068{
4069        percpu_write(current_vcpu, NULL);
4070}
4071EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
4072
4073int kvm_arch_init(void *opaque)
4074{
4075        int r;
4076        struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
4077
4078        if (kvm_x86_ops) {
4079                printk(KERN_ERR "kvm: already loaded the other module\n");
4080                r = -EEXIST;
4081                goto out;
4082        }
4083
4084        if (!ops->cpu_has_kvm_support()) {
4085                printk(KERN_ERR "kvm: no hardware support\n");
4086                r = -EOPNOTSUPP;
4087                goto out;
4088        }
4089        if (ops->disabled_by_bios()) {
4090                printk(KERN_ERR "kvm: disabled by bios\n");
4091                r = -EOPNOTSUPP;
4092                goto out;
4093        }
4094
4095        r = kvm_mmu_module_init();
4096        if (r)
4097                goto out;
4098
4099        kvm_init_msr_list();
4100
4101        kvm_x86_ops = ops;
4102        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4103        kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4104        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4105                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
4106
4107        kvm_timer_init();
4108
4109        perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110
4111        return 0;
4112
4113out:
4114        return r;
4115}
4116
4117void kvm_arch_exit(void)
4118{
4119        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
4120
4121        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4122                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4123                                            CPUFREQ_TRANSITION_NOTIFIER);
4124        kvm_x86_ops = NULL;
4125        kvm_mmu_module_exit();
4126}
4127
4128int kvm_emulate_halt(struct kvm_vcpu *vcpu)
4129{
4130        ++vcpu->stat.halt_exits;
4131        if (irqchip_in_kernel(vcpu->kvm)) {
4132                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
4133                return 1;
4134        } else {
4135                vcpu->run->exit_reason = KVM_EXIT_HLT;
4136                return 0;
4137        }
4138}
4139EXPORT_SYMBOL_GPL(kvm_emulate_halt);
4140
4141static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
4142                           unsigned long a1)
4143{
4144        if (is_long_mode(vcpu))
4145                return a0;
4146        else
4147                return a0 | ((gpa_t)a1 << 32);
4148}
4149
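/*
 * Hyper-V hypercall entry.  As decoded below, 32-bit callers pass the
 * 64-bit parameter, input GPA and output GPA in EDX:EAX, EBX:ECX and
 * EDI:ESI respectively; 64-bit callers use RCX, RDX and R8.  Within the
 * parameter, bits 0-15 are the call code, bit 16 the 'fast' flag,
 * bits 32-43 the rep count and bits 48-59 the rep start index.
 */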
4150int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
4151{
4152        u64 param, ingpa, outgpa, ret;
4153        uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
4154        bool fast, longmode;
4155        int cs_db, cs_l;
4156
4157        /*
4158         * a hypercall generates #UD from non-zero CPL or in real mode,
4159         * per the Hyper-V spec
4160         */
4161        if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
4162                kvm_queue_exception(vcpu, UD_VECTOR);
4163                return 0;
4164        }
4165
4166        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4167        longmode = is_long_mode(vcpu) && cs_l == 1;
4168
4169        if (!longmode) {
4170                param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
4171                        (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
4172                ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
4173                        (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
4174                outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
4175                        (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
4176        }
4177#ifdef CONFIG_X86_64
4178        else {
4179                param = kvm_register_read(vcpu, VCPU_REGS_RCX);
4180                ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
4181                outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
4182        }
4183#endif
4184
4185        code = param & 0xffff;
4186        fast = (param >> 16) & 0x1;
4187        rep_cnt = (param >> 32) & 0xfff;
4188        rep_idx = (param >> 48) & 0xfff;
4189
4190        trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
4191
4192        switch (code) {
4193        case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
4194                kvm_vcpu_on_spin(vcpu);
4195                break;
4196        default:
4197                res = HV_STATUS_INVALID_HYPERCALL_CODE;
4198                break;
4199        }
4200
4201        ret = res | (((u64)rep_done & 0xfff) << 32);
4202        if (longmode) {
4203                kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
4204        } else {
4205                kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
4206                kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
4207        }
4208
4209        return 1;
4210}
4211
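/*
 * KVM's native hypercall ABI, as read below: the hypercall number comes
 * from RAX and up to four arguments from RBX, RCX, RDX and RSI; the return
 * value is written back to RAX.  Outside long mode all values are
 * truncated to 32 bits, and a caller above CPL 0 gets -KVM_EPERM.
 */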
4212int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
4213{
4214        unsigned long nr, a0, a1, a2, a3, ret;
4215        int r = 1;
4216
4217        if (kvm_hv_hypercall_enabled(vcpu->kvm))
4218                return kvm_hv_hypercall(vcpu);
4219
4220        nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
4221        a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
4222        a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
4223        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
4224        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
4225
4226        trace_kvm_hypercall(nr, a0, a1, a2, a3);
4227
4228        if (!is_long_mode(vcpu)) {
4229                nr &= 0xFFFFFFFF;
4230                a0 &= 0xFFFFFFFF;
4231                a1 &= 0xFFFFFFFF;
4232                a2 &= 0xFFFFFFFF;
4233                a3 &= 0xFFFFFFFF;
4234        }
4235
4236        if (kvm_x86_ops->get_cpl(vcpu) != 0) {
4237                ret = -KVM_EPERM;
4238                goto out;
4239        }
4240
4241        switch (nr) {
4242        case KVM_HC_VAPIC_POLL_IRQ:
4243                ret = 0;
4244                break;
4245        case KVM_HC_MMU_OP:
4246                r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
4247                break;
4248        default:
4249                ret = -KVM_ENOSYS;
4250                break;
4251        }
4252out:
4253        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
4254        ++vcpu->stat.hypercalls;
4255        return r;
4256}
4257EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
4258
4259int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4260{
4261        char instruction[3];
4262        unsigned long rip = kvm_rip_read(vcpu);
4263
4264        /*
4265         * Blow out the MMU so that no other VCPU has an active mapping;
4266         * this ensures that the updated hypercall appears atomically
4267         * across all VCPUs.
4268         */
4269        kvm_mmu_zap_all(vcpu->kvm);
4270
4271        kvm_x86_ops->patch_hypercall(vcpu, instruction);
4272
4273        return emulator_write_emulated(rip, instruction, 3, vcpu);
4274}
4275
4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4277{
4278        struct desc_ptr dt = { limit, base };
4279
4280        kvm_x86_ops->set_gdt(vcpu, &dt);
4281}
4282
4283void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4284{
4285        struct desc_ptr dt = { limit, base };
4286
4287        kvm_x86_ops->set_idt(vcpu, &dt);
4288}
4289
4290static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
4291{
4292        struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
4293        int j, nent = vcpu->arch.cpuid_nent;
4294
4295        e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
4296        /* when no next entry is found, the current entry[i] is reselected */
4297        for (j = i + 1; ; j = (j + 1) % nent) {
4298                struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
4299                if (ej->function == e->function) {
4300                        ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
4301                        return j;
4302                }
4303        }
4304        return 0; /* silence gcc, even though control never reaches here */
4305}
4306
4307/* find an entry with matching function, matching index (if needed), and that
4308 * should be read next (if it's stateful) */
4309static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
4310        u32 function, u32 index)
4311{
4312        if (e->function != function)
4313                return 0;
4314        if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
4315                return 0;
4316        if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
4317            !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
4318                return 0;
4319        return 1;
4320}
4321
4322struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
4323                                              u32 function, u32 index)
4324{
4325        int i;
4326        struct kvm_cpuid_entry2 *best = NULL;
4327
4328        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
4329                struct kvm_cpuid_entry2 *e;
4330
4331                e = &vcpu->arch.cpuid_entries[i];
4332                if (is_matching_cpuid_entry(e, function, index)) {
4333                        if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
4334                                move_to_next_stateful_cpuid_entry(vcpu, i);
4335                        best = e;
4336                        break;
4337                }
4338                /*
4339                 * Both basic or both extended?
4340                 */
4341                if (((e->function ^ function) & 0x80000000) == 0)
4342                        if (!best || e->function > best->function)
4343                                best = e;
4344        }
4345        return best;
4346}
4347EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
4348
4349int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
4350{
4351        struct kvm_cpuid_entry2 *best;
4352
4353        best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
4354        if (!best || best->eax < 0x80000008)
4355                goto not_found;
4356        best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4357        if (best)
4358                return best->eax & 0xff;
4359not_found:
4360        return 36;
4361}
4362
4363void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4364{
4365        u32 function, index;
4366        struct kvm_cpuid_entry2 *best;
4367
4368        function = kvm_register_read(vcpu, VCPU_REGS_RAX);
4369        index = kvm_register_read(vcpu, VCPU_REGS_RCX);
4370        kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
4371        kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
4372        kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
4373        kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
4374        best = kvm_find_cpuid_entry(vcpu, function, index);
4375        if (best) {
4376                kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
4377                kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
4378                kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
4379                kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
4380        }
4381        kvm_x86_ops->skip_emulated_instruction(vcpu);
4382        trace_kvm_cpuid(function,
4383                        kvm_register_read(vcpu, VCPU_REGS_RAX),
4384                        kvm_register_read(vcpu, VCPU_REGS_RBX),
4385                        kvm_register_read(vcpu, VCPU_REGS_RCX),
4386                        kvm_register_read(vcpu, VCPU_REGS_RDX));
4387}
4388EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
4389
4390/*
4391 * Check if userspace requested an interrupt window, and that the
4392 * interrupt window is open.
4393 *
4394 * No need to exit to userspace if we already have an interrupt queued.
4395 */
4396static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
4397{
4398        return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
4399                vcpu->run->request_interrupt_window &&
4400                kvm_arch_interrupt_allowed(vcpu));
4401}
4402
4403static void post_kvm_run_save(struct kvm_vcpu *vcpu)
4404{
4405        struct kvm_run *kvm_run = vcpu->run;
4406
4407        kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
4408        kvm_run->cr8 = kvm_get_cr8(vcpu);
4409        kvm_run->apic_base = kvm_get_apic_base(vcpu);
4410        if (irqchip_in_kernel(vcpu->kvm))
4411                kvm_run->ready_for_interrupt_injection = 1;
4412        else
4413                kvm_run->ready_for_interrupt_injection =
4414                        kvm_arch_interrupt_allowed(vcpu) &&
4415                        !kvm_cpu_has_interrupt(vcpu) &&
4416                        !kvm_event_needs_reinjection(vcpu);
4417}
4418
4419static void vapic_enter(struct kvm_vcpu *vcpu)
4420{
4421        struct kvm_lapic *apic = vcpu->arch.apic;
4422        struct page *page;
4423
4424        if (!apic || !apic->vapic_addr)
4425                return;
4426
4427        page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
4428
4429        vcpu->arch.apic->vapic_page = page;
4430}
4431
4432static void vapic_exit(struct kvm_vcpu *vcpu)
4433{
4434        struct kvm_lapic *apic = vcpu->arch.apic;
4435        int idx;
4436
4437        if (!apic || !apic->vapic_addr)
4438                return;
4439
4440        idx = srcu_read_lock(&vcpu->kvm->srcu);
4441        kvm_release_page_dirty(apic->vapic_page);
4442        mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
4443        srcu_read_unlock(&vcpu->kvm->srcu, idx);
4444}
4445
4446static void update_cr8_intercept(struct kvm_vcpu *vcpu)
4447{
4448        int max_irr, tpr;
4449
4450        if (!kvm_x86_ops->update_cr8_intercept)
4451                return;
4452
4453        if (!vcpu->arch.apic)
4454                return;
4455
4456        if (!vcpu->arch.apic->vapic_addr)
4457                max_irr = kvm_lapic_find_highest_irr(vcpu);
4458        else
4459                max_irr = -1;
4460
4461        if (max_irr != -1)
4462                max_irr >>= 4;
4463
4464        tpr = kvm_lapic_get_cr8(vcpu);
4465
4466        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
4467}
4468
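/*
 * Event injection priority: a previously queued exception is reinjected
 * first, then an already-injected NMI or interrupt is replayed.  Only when
 * nothing is left over from the last entry is a new NMI or external
 * interrupt injected, and then only if the corresponding window is open.
 */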
4469static void inject_pending_event(struct kvm_vcpu *vcpu)
4470{
4471        /* try to reinject previous events if any */
4472        if (vcpu->arch.exception.pending) {
4473                trace_kvm_inj_exception(vcpu->arch.exception.nr,
4474                                        vcpu->arch.exception.has_error_code,
4475                                        vcpu->arch.exception.error_code);
4476                kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
4477                                          vcpu->arch.exception.has_error_code,
4478                                          vcpu->arch.exception.error_code,
4479                                          vcpu->arch.exception.reinject);
4480                return;
4481        }
4482
4483        if (vcpu->arch.nmi_injected) {
4484                kvm_x86_ops->set_nmi(vcpu);
4485                return;
4486        }
4487
4488        if (vcpu->arch.interrupt.pending) {
4489                kvm_x86_ops->set_irq(vcpu);
4490                return;
4491        }
4492
4493        /* try to inject new event if pending */
4494        if (vcpu->arch.nmi_pending) {
4495                if (kvm_x86_ops->nmi_allowed(vcpu)) {
4496                        vcpu->arch.nmi_pending = false;
4497                        vcpu->arch.nmi_injected = true;
4498                        kvm_x86_ops->set_nmi(vcpu);
4499                }
4500        } else if (kvm_cpu_has_interrupt(vcpu)) {
4501                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
4502                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
4503                                            false);
4504                        kvm_x86_ops->set_irq(vcpu);
4505                }
4506        }
4507}
4508
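/*
 * One guest entry/exit cycle: process pending vcpu->requests, disable
 * preemption and interrupts, re-check for new requests or signals (bailing
 * out with KVM_REQ_KICK set if any arrived), inject pending events, run the
 * guest, restore host debug registers if necessary, and finally hand the
 * exit reason to kvm_x86_ops->handle_exit().
 */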
4509static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4510{
4511        int r;
4512        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4513                vcpu->run->request_interrupt_window;
4514
4515        if (vcpu->requests)
4516                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
4517                        kvm_mmu_unload(vcpu);
4518
4519        r = kvm_mmu_reload(vcpu);
4520        if (unlikely(r))
4521                goto out;
4522
4523        if (vcpu->requests) {
4524                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
4525                        __kvm_migrate_timers(vcpu);
4526                if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
4527                        kvm_write_guest_time(vcpu);
4528                if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
4529                        kvm_mmu_sync_roots(vcpu);
4530                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
4531                        kvm_x86_ops->tlb_flush(vcpu);
4532                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
4533                                       &vcpu->requests)) {
4534                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
4535                        r = 0;
4536                        goto out;
4537                }
4538                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
4539                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4540                        r = 0;
4541                        goto out;
4542                }
4543                if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
4544                        vcpu->fpu_active = 0;
4545                        kvm_x86_ops->fpu_deactivate(vcpu);
4546                }
4547        }
4548
4549        preempt_disable();
4550
4551        kvm_x86_ops->prepare_guest_switch(vcpu);
4552        if (vcpu->fpu_active)
4553                kvm_load_guest_fpu(vcpu);
4554
4555        local_irq_disable();
4556
4557        clear_bit(KVM_REQ_KICK, &vcpu->requests);
4558        smp_mb__after_clear_bit();
4559
4560        if (vcpu->requests || need_resched() || signal_pending(current)) {
4561                set_bit(KVM_REQ_KICK, &vcpu->requests);
4562                local_irq_enable();
4563                preempt_enable();
4564                r = 1;
4565                goto out;
4566        }
4567
4568        inject_pending_event(vcpu);
4569
4570        /* enable NMI/IRQ window open exits if needed */
4571        if (vcpu->arch.nmi_pending)
4572                kvm_x86_ops->enable_nmi_window(vcpu);
4573        else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
4574                kvm_x86_ops->enable_irq_window(vcpu);
4575
4576        if (kvm_lapic_enabled(vcpu)) {
4577                update_cr8_intercept(vcpu);
4578                kvm_lapic_sync_to_vapic(vcpu);
4579        }
4580
4581        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4582
4583        kvm_guest_enter();
4584
4585        if (unlikely(vcpu->arch.switch_db_regs)) {
4586                set_debugreg(0, 7);
4587                set_debugreg(vcpu->arch.eff_db[0], 0);
4588                set_debugreg(vcpu->arch.eff_db[1], 1);
4589                set_debugreg(vcpu->arch.eff_db[2], 2);
4590                set_debugreg(vcpu->arch.eff_db[3], 3);
4591        }
4592
4593        trace_kvm_entry(vcpu->vcpu_id);
4594        kvm_x86_ops->run(vcpu);
4595
4596        /*
4597         * If the guest has used debug registers, at least dr7
4598         * will be disabled while returning to the host.
4599         * If we don't have active breakpoints in the host, we don't
4600         * care about the messed up debug address registers. But if
4601         * we have some of them active, restore the old state.
4602         */
4603        if (hw_breakpoint_active())
4604                hw_breakpoint_restore();
4605
4606        set_bit(KVM_REQ_KICK, &vcpu->requests);
4607        local_irq_enable();
4608
4609        ++vcpu->stat.exits;
4610
4611        /*
4612         * We must have an instruction between local_irq_enable() and
4613         * kvm_guest_exit(), so the timer interrupt isn't delayed by
4614         * the interrupt shadow.  The stat.exits increment will do nicely.
4615         * But we need to prevent reordering, hence this barrier():
4616         */
4617        barrier();
4618
4619        kvm_guest_exit();
4620
4621        preempt_enable();
4622
4623        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4624
4625        /*
4626         * Profile KVM exit RIPs:
4627         */
4628        if (unlikely(prof_on == KVM_PROFILING)) {
4629                unsigned long rip = kvm_rip_read(vcpu);
4630                profile_hit(KVM_PROFILING, (void *)rip);
4631        }
4632
4633
4634        kvm_lapic_sync_from_vapic(vcpu);
4635
4636        r = kvm_x86_ops->handle_exit(vcpu);
4637out:
4638        return r;
4639}
4640
4641
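/*
 * Outer vcpu run loop: handle a pending SIPI, then alternate between
 * vcpu_enter_guest() while the vcpu is runnable and kvm_vcpu_block() while it
 * is halted, until an exit to userspace (r <= 0) or a pending signal ends the
 * loop.
 */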
4642static int __vcpu_run(struct kvm_vcpu *vcpu)
4643{
4644        int r;
4645        struct kvm *kvm = vcpu->kvm;
4646
4647        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
4648                pr_debug("vcpu %d received sipi with vector # %x\n",
4649                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
4650                kvm_lapic_reset(vcpu);
4651                r = kvm_arch_vcpu_reset(vcpu);
4652                if (r)
4653                        return r;
4654                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4655        }
4656
4657        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4658        vapic_enter(vcpu);
4659
4660        r = 1;
4661        while (r > 0) {
4662                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
4663                        r = vcpu_enter_guest(vcpu);
4664                else {
4665                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4666                        kvm_vcpu_block(vcpu);
4667                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4668                        if (test_and_clear_bit(KVM_REQ_UNHALT,
4669                                               &vcpu->requests)) {
4670                                switch (vcpu->arch.mp_state) {
4671                                case KVM_MP_STATE_HALTED:
4672                                        vcpu->arch.mp_state =
4673                                                KVM_MP_STATE_RUNNABLE;  /* fall through */
4674                                case KVM_MP_STATE_RUNNABLE:
4675                                        break;
4676                                case KVM_MP_STATE_SIPI_RECEIVED:
4677                                default:
4678                                        r = -EINTR;
4679                                        break;
4680                                }
4681                        }
4682                }
4683
4684                if (r <= 0)
4685                        break;
4686
4687                clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
4688                if (kvm_cpu_has_pending_timer(vcpu))
4689                        kvm_inject_pending_timer_irqs(vcpu);
4690
4691                if (dm_request_for_irq_injection(vcpu)) {
4692                        r = -EINTR;
4693                        vcpu->run->exit_reason = KVM_EXIT_INTR;
4694                        ++vcpu->stat.request_irq_exits;
4695                }
4696                if (signal_pending(current)) {
4697                        r = -EINTR;
4698                        vcpu->run->exit_reason = KVM_EXIT_INTR;
4699                        ++vcpu->stat.signal_exits;
4700                }
4701                if (need_resched()) {
4702                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4703                        kvm_resched(vcpu);
4704                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4705                }
4706        }
4707
4708        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4709
4710        vapic_exit(vcpu);
4711
4712        return r;
4713}
4714
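/*
 * Back end of the KVM_RUN vcpu ioctl: apply the vcpu's signal mask, complete
 * any MMIO/PIO emulation that was interrupted by the previous exit, run the
 * guest via __vcpu_run() and save the exit state to the kvm_run area before
 * returning to userspace.
 */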
4715int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4716{
4717        int r;
4718        sigset_t sigsaved;
4719
4720        vcpu_load(vcpu);
4721
4722        if (vcpu->sigset_active)
4723                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4724
4725        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
4726                kvm_vcpu_block(vcpu);
4727                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
4728                r = -EAGAIN;
4729                goto out;
4730        }
4731
4732        /* re-sync apic's tpr */
4733        if (!irqchip_in_kernel(vcpu->kvm))
4734                kvm_set_cr8(vcpu, kvm_run->cr8);
4735
4736        if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4737            vcpu->arch.emulate_ctxt.restart) {
4738                if (vcpu->mmio_needed) {
4739                        memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4740                        vcpu->mmio_read_completed = 1;
4741                        vcpu->mmio_needed = 0;
4742                }
4743                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4744                r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4745                srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4746                if (r == EMULATE_DO_MMIO) {
4747                        r = 0;
4748                        goto out;
4749                }
4750        }
4751        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
4752                kvm_register_write(vcpu, VCPU_REGS_RAX,
4753                                     kvm_run->hypercall.ret);
4754
4755        r = __vcpu_run(vcpu);
4756
4757out:
4758        post_kvm_run_save(vcpu);
4759        if (vcpu->sigset_active)
4760                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4761
4762        vcpu_put(vcpu);
4763        return r;
4764}
4765
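/* KVM_GET_REGS: read out the general purpose registers, RIP and RFLAGS. */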
4766int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4767{
4768        vcpu_load(vcpu);
4769
4770        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4771        regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4772        regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4773        regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4774        regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4775        regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4776        regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4777        regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4778#ifdef CONFIG_X86_64
4779        regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
4780        regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
4781        regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
4782        regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
4783        regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
4784        regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
4785        regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
4786        regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
4787#endif
4788
4789        regs->rip = kvm_rip_read(vcpu);
4790        regs->rflags = kvm_get_rflags(vcpu);
4791
4792        vcpu_put(vcpu);
4793
4794        return 0;
4795}
4796
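/*
 * KVM_SET_REGS: load the general purpose registers, RIP and RFLAGS supplied
 * by userspace and drop any exception queued against the previous state.
 */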
4797int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4798{
4799        vcpu_load(vcpu);
4800
4801        kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4802        kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4803        kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
4804        kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
4805        kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
4806        kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
4807        kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
4808        kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
4809#ifdef CONFIG_X86_64
4810        kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
4811        kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
4812        kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
4813        kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
4814        kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
4815        kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
4816        kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
4817        kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
4818#endif
4819
4820        kvm_rip_write(vcpu, regs->rip);
4821        kvm_set_rflags(vcpu, regs->rflags);
4822
4823        vcpu->arch.exception.pending = false;
4824
4825        vcpu_put(vcpu);
4826
4827        return 0;
4828}
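
/*
 * Illustrative userspace sketch (not part of this file): the two handlers
 * above back the KVM_GET_REGS and KVM_SET_REGS vcpu ioctls, which a VMM
 * typically drives roughly as follows (vcpu_fd comes from KVM_CREATE_VCPU):
 *
 *	struct kvm_regs regs;
 *
 *	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
 *		err(1, "KVM_GET_REGS");
 *	regs.rip = 0x1000;	// hypothetical guest entry point
 *	regs.rflags = 0x2;	// RFLAGS bit 1 is reserved and must be set
 *	if (ioctl(vcpu_fd, KVM_SET_REGS, &regs) < 0)
 *		err(1, "KVM_SET_REGS");
 */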
4829
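/* Report the D/B (default operand size) and L (long mode) bits of CS. */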
4830void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4831{
4832        struct kvm_segment cs;
4833
4834        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
4835        *db = cs.db;
4836        *l = cs.l;
4837}
4838EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4839
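/*
 * KVM_GET_SREGS: copy the segment registers, descriptor tables, control
 * registers, EFER, APIC base and pending-interrupt bitmap to userspace.
 */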
4840int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4841                                  struct kvm_sregs *sregs)
4842{
4843        struct desc_ptr dt;
4844
4845        vcpu_load(vcpu);
4846
4847        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4848        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4849        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4850        kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4851        kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4852        kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4853
4854        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4855        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4856
4857        kvm_x86_ops->get_idt(vcpu, &dt);
4858        sregs->idt.limit = dt.size;
4859        sregs->idt.base = dt.address;
4860        kvm_x86_ops->get_gdt(vcpu, &dt);
4861        sregs->gdt.limit = dt.size;
4862        sregs->gdt.base = dt.address;
4863
4864        sregs->cr0 = kvm_read_cr0(vcpu);
4865        sregs->cr2 = vcpu->arch.cr2;
4866        sregs->cr3 = vcpu->arch.cr3;
4867        sregs->cr4 = kvm_read_cr4(vcpu);
4868        sregs->cr8 = kvm_get_cr8(vcpu);
4869        sregs->efer = vcpu->arch.efer;
4870        sregs->apic_base = kvm_get_apic_base(vcpu);
4871
4872        memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
4873
4874        if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
4875                set_bit(vcpu->arch.interrupt.nr,
4876                        (unsigned long *)sregs->interrupt_bitmap);
4877
4878        vcpu_put(vcpu);
4879
4880        return 0;
4881}
4882
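/*
 * KVM_GET_MP_STATE / KVM_SET_MP_STATE: query or override the vcpu's
 * multiprocessing state (runnable, halted, waiting for INIT/SIPI, ...).
 */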
4883int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4884                                    struct kvm_mp_state *mp_state)
4885{
4886        vcpu_load(vcpu);
4887        mp_state->mp_state = vcpu->arch.mp_state;
4888        vcpu_put(vcpu);
4889        return 0;
4890}
4891
4892int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4893                                    struct kvm_mp_state *mp_state)
4894{
4895        vcpu_load(vcpu);
4896        vcpu->arch.mp_state = mp_state->mp_state;
4897        vcpu_put(vcpu);
4898        return 0;
4899}
4900
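/*
 * Emulate a hardware task switch for the vendor exit handlers: build the
 * emulation context from the current vcpu state, let the x86 emulator perform
 * the switch, and write the resulting RFLAGS back on success.
 */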
4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4902                    bool has_error_code, u32 error_code)
4903{
4904        int cs_db, cs_l, ret;
4905        cache_all_regs(vcpu);
4906
4907        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4908
4909        vcpu->arch.emulate_ctxt.vcpu = vcpu;
4910        vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4911        vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4912        vcpu->arch.emulate_ctxt.mode =
4913                (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4914                (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4915                ? X86EMUL_MODE_VM86 : cs_l
4916                ? X86EMUL_MODE_PROT64 : cs_db
4917                ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4918
4919        ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4920                                   tss_selector, reason, has_error_code,
4921                                   error_code);
4922
4923        if (ret)
4924                return EMULATE_FAIL;
4925
4926        kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4927        return EMULATE_DONE;
4928}
4929EXPORT_SYMBOL_GPL(kvm_task_switch);
4930
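/*
 * KVM_SET_SREGS: load segment, descriptor table and control register state
 * from userspace, reloading the PDPTRs and resetting the MMU context when
 * paging-related state changes.
 */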
4931int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4932                                  struct kvm_sregs *sregs)
4933{
4934        int mmu_reset_needed = 0;
4935        int pending_vec, max_bits;
4936        struct desc_ptr dt;
4937
4938        vcpu_load(vcpu);
4939
4940        dt.size = sregs->idt.limit;
4941        dt.address = sregs->idt.base;
4942        kvm_x86_ops->set_idt(vcpu, &dt);
4943        dt.size = sregs->gdt.limit;
4944        dt.address = sregs->gdt.base;
4945        kvm_x86_ops->set_gdt(vcpu, &dt);
4946
4947        vcpu->arch.cr2 = sregs->cr2;
4948        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4949        vcpu->arch.cr3 = sregs->cr3;
4950
4951        kvm_set_cr8(vcpu, sregs->cr8);
4952
4953        mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
4954        kvm_x86_ops->set_efer(vcpu, sregs->efer);
4955        kvm_set_apic_base(vcpu, sregs->apic_base);
4956
4957        mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
4958        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4959        vcpu->arch.cr0 = sregs->cr0;
4960
4961        mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
4962        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4963        if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4964                load_pdptrs(vcpu, vcpu->arch.cr3);
4965                mmu_reset_needed = 1;
4966        }
4967
4968        if (mmu_reset_needed)
4969                kvm_mmu_reset_context(vcpu);
4970
4971        max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4972        pending_vec = find_first_bit(
4973                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4974        if (pending_vec < max_bits) {
4975                kvm_queue_interrupt(vcpu, pending_vec, false);
4976                pr_debug("Set back pending irq %d\n", pending_vec);
4977                if (irqchip_in_kernel(vcpu->kvm))
4978                        kvm_pic_clear_isr_ack(vcpu->kvm);
4979        }
4980
4981        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4982        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4983        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4984        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4985        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4986        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4987
4988        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4989        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4990
4991        update_cr8_intercept(vcpu);
4992
4993        /* Older userspace won't unhalt the vcpu on reset. */
4994        if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4995            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4996            !is_protmode(vcpu))
4997                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4998
4999        vcpu_put(vcpu);
5000
5001        return 0;
5002}
5003
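/*
 * KVM_SET_GUEST_DEBUG: install the host debugger's breakpoint and single-step
 * settings, optionally injecting a #DB or #BP exception into the guest first.
 */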
5004int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5005                                        struct kvm_guest_debug *dbg)
5006{
5007        unsigned long rflags;
5008        int i, r;
5009
5010        vcpu_load(vcpu);
5011
5012        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
5013                r = -EBUSY;
5014                if (vcpu->arch.exception.pending)
5015                        goto unlock_out;
5016                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5017                        kvm_queue_exception(vcpu, DB_VECTOR);
5018                else
5019                        kvm_queue_exception(vcpu, BP_VECTOR);
5020        }
5021
5022        /*
5023         * Read rflags before updating guest_debug so that trace flags
5024         * previously injected for single-stepping are still filtered out.
5025         */
5026        rflags = kvm_get_rflags(vcpu);
5027
5028        vcpu->guest_debug = dbg->control;
5029        if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
5030                vcpu->guest_debug = 0;
5031
5032        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5033                for (i = 0; i < KVM_NR_DB_REGS; ++i)
5034                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
5035                vcpu->arch.switch_db_regs =
5036                        (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
5037        } else {
5038                for (i = 0; i < KVM_NR_DB_REGS; i++)
5039                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
5040                vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5041        }
5042
5043        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5044                vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
5045                        get_segment_base(vcpu, VCPU_SREG_CS);
5046
5047        /*
5048         * Trigger an rflags update that will inject or remove the trace
5049         * flags.
5050         */
5051        kvm_set_rflags(vcpu, rflags);
5052
5053        kvm_x86_ops->set_guest_debug(vcpu, dbg);
5054
5055        r = 0;
5056
5057unlock_out:
5058        vcpu_put(vcpu);
5059
5060        return r;
5061}
5062
5063/*
5064 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
5065 * we have asm/x86/processor.h
5066 */
5067struct fxsave {
5068        u16     cwd;
5069        u16     swd;
5070        u16     twd;
5071        u16     fop;
5072        u64     rip;
5073        u64     rdp;
5074        u32     mxcsr;
5075        u32     mxcsr_mask;
5076        u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
5077#ifdef CONFIG_X86_64
5078        u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
5079#else
5080        u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
5081#endif
5082};
5083
5084/*
5085 * Translate a guest virtual address to a guest physical address.
5086 */
5087int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
5088                                    struct kvm_translation *tr)
5089{
5090        unsigned long vaddr = tr->linear_address;
5091        gpa_t gpa;
5092        int idx;
5093
5094        vcpu_load(vcpu);
5095        idx = srcu_read_lock(&vcpu->kvm->srcu);
5096        gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
5097        srcu_read_unlock(&vcpu->kvm->srcu, idx);
5098        tr->physical_address = gpa;
5099        tr->valid = gpa != UNMAPPED_GVA;
5100        tr->writeable = 1;
5101        tr->usermode = 0;
5102        vcpu_put(vcpu);
5103
5104        return 0;
5105}
5106
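/*
 * KVM_GET_FPU / KVM_SET_FPU: convert between the guest's fxsave image and the
 * kvm_fpu layout shared with userspace.
 */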
5107int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5108{
5109        struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
5110
5111        vcpu_load(vcpu);
5112
5113        memcpy(fpu->fpr, fxsave->st_space, 128);
5114        fpu->fcw = fxsave->cwd;
5115        fpu->fsw = fxsave->swd;
5116        fpu->ftwx = fxsave->twd;
5117        fpu->last_opcode = fxsave->fop;
5118        fpu->last_ip = fxsave->rip;
5119        fpu->last_dp = fxsave->rdp;
5120        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
5121
5122        vcpu_put(vcpu);
5123
5124        return 0;
5125}
5126
5127int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
5128{
5129        struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
5130
5131        vcpu_load(vcpu);
5132
5133        memcpy(fxsave->st_space, fpu->fpr, 128);
5134        fxsave->cwd = fpu->fcw;
5135        fxsave->swd = fpu->fsw;
5136        fxsave->twd = fpu->ftwx;
5137        fxsave->fop = fpu->last_opcode;
5138        fxsave->rip = fpu->last_ip;
5139        fxsave->rdp = fpu->last_dp;
5140        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
5141
5142        vcpu_put(vcpu);
5143
5144        return 0;
5145}
5146
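/*
 * Give the vcpu a pristine FPU state: reset the FPU, save the result as the
 * initial guest image and restore the host image, with preemption disabled
 * around the sequence.
 */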
5147void fx_init(struct kvm_vcpu *vcpu)
5148{
5149        unsigned after_mxcsr_mask;
5150
5151        /*
5152         * Touch the fpu the first time in a non-atomic context: if this is
5153         * the first fpu instruction, the exception handler will fire before
5154         * the instruction returns and will have to allocate memory with
5155         * GFP_KERNEL.
5156         */
5157        if (!used_math())
5158                kvm_fx_save(&vcpu->arch.host_fx_image);
5159
5160        /* Initialize guest FPU by resetting ours and saving into guest's */
5161        preempt_disable();
5162        kvm_fx_save(&vcpu->arch.host_fx_image);
5163        kvm_fx_finit();
5164        kvm_fx_save(&vcpu->arch.guest_fx_image);
5165        kvm_fx_restore(&vcpu->arch.host_fx_image);
5166        preempt_enable();
5167
5168        vcpu->arch.cr0 |= X86_CR0_ET;
5169        after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
5170        vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
5171        memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
5172               0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
5173}
5174EXPORT_SYMBOL_GPL(fx_init);
5175
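/* Save the host FPU state and load the guest's before entering the guest. */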
5176void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
5177{
5178        if (vcpu->guest_fpu_loaded)
5179                return;
5180
5181        vcpu->guest_fpu_loaded = 1;
5182        kvm_fx_save(&vcpu->arch.host_fx_image);
5183        kvm_fx_restore(&vcpu->arch.guest_fx_image);
5184        trace_kvm_fpu(1);
5185}
5186
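/*
 * Save the guest FPU state and restore the host's; also request that the
 * vendor code deactivate the FPU so the next guest use traps again.
 */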
5187void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5188{
5189        if (!vcpu->guest_fpu_loaded)
5190                return;
5191
5192        vcpu->guest_fpu_loaded = 0;
5193        kvm_fx_save(&vcpu->arch.guest_fx_image);
5194        kvm_fx_restore(&vcpu->arch.host_fx_image);
5195        ++vcpu->stat.fpu_reload;
5196        set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
5197        trace_kvm_fpu(0);
5198}
5199
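/* Release the kvmclock page, if any, then let the vendor code free the vcpu. */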
5200void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5201{
5202        if (vcpu->arch.time_page) {
5203                kvm_release_page_dirty(vcpu->arch.time_page);
5204                vcpu->arch.time_page = NULL;
5205        }
5206
5207        kvm_x86_ops->vcpu_free(vcpu);
5208}
5209
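/* vcpu allocation is delegated entirely to the vendor module (vmx or svm). */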
5210struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
5211                                                unsigned int id)
5212{
5213        return kvm_x86_ops->vcpu_create(kvm, id);
5214}
5215
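/*
 * First-time setup after vcpu creation: reset the vcpu and set up its MMU;
 * on failure, bail out through the free_vcpu error path.
 */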
5216int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
5217{
5218        int r;
5219
5220        /* We do fxsave: this must be aligned. */
5221        BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
5222
5223        vcpu->arch.mtrr_state.have_fixed = 1;
5224        vcpu_load(vcpu);
5225        r = kvm_arch_vcpu_reset(vcpu);
5226        if (r == 0)
5227                r = kvm_mmu_setup(vcpu);
5228        vcpu_put(vcpu);
5229        if (r < 0)
5230                goto free_vcpu;
5231
5232        return 0;
5233free_vcpu:
5234