linux/arch/x86/kvm/x86.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * derived from drivers/kvm/kvm_main.c
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 *
   8 * Authors:
   9 *   Avi Kivity   <avi@qumranet.com>
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *
  12 * This work is licensed under the terms of the GNU GPL, version 2.  See
  13 * the COPYING file in the top-level directory.
  14 *
  15 */
  16
  17#include <linux/kvm_host.h>
  18#include "irq.h"
  19#include "mmu.h"
  20#include "i8254.h"
  21#include "tss.h"
  22
  23#include <linux/clocksource.h>
  24#include <linux/kvm.h>
  25#include <linux/fs.h>
  26#include <linux/vmalloc.h>
  27#include <linux/module.h>
  28#include <linux/mman.h>
  29#include <linux/highmem.h>
  30
  31#include <asm/uaccess.h>
  32#include <asm/msr.h>
  33#include <asm/desc.h>
  34
  35#define MAX_IO_MSRS 256
  36#define CR0_RESERVED_BITS                                               \
  37        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  38                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  39                          | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  40#define CR4_RESERVED_BITS                                               \
  41        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  42                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  43                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  44                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  45
  46#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  47/* EFER defaults:
  48 * - enable syscall by default because it is emulated by KVM
  49 * - enable LME and LMA by default on 64-bit KVM
  50 */
  51#ifdef CONFIG_X86_64
  52static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  53#else
  54static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  55#endif
  56
  57#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  58#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  59
  60static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  61                                    struct kvm_cpuid_entry2 __user *entries);
  62
  63struct kvm_x86_ops *kvm_x86_ops;
  64
  65struct kvm_stats_debugfs_item debugfs_entries[] = {
  66        { "pf_fixed", VCPU_STAT(pf_fixed) },
  67        { "pf_guest", VCPU_STAT(pf_guest) },
  68        { "tlb_flush", VCPU_STAT(tlb_flush) },
  69        { "invlpg", VCPU_STAT(invlpg) },
  70        { "exits", VCPU_STAT(exits) },
  71        { "io_exits", VCPU_STAT(io_exits) },
  72        { "mmio_exits", VCPU_STAT(mmio_exits) },
  73        { "signal_exits", VCPU_STAT(signal_exits) },
  74        { "irq_window", VCPU_STAT(irq_window_exits) },
  75        { "nmi_window", VCPU_STAT(nmi_window_exits) },
  76        { "halt_exits", VCPU_STAT(halt_exits) },
  77        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  78        { "hypercalls", VCPU_STAT(hypercalls) },
  79        { "request_irq", VCPU_STAT(request_irq_exits) },
  80        { "irq_exits", VCPU_STAT(irq_exits) },
  81        { "host_state_reload", VCPU_STAT(host_state_reload) },
  82        { "efer_reload", VCPU_STAT(efer_reload) },
  83        { "fpu_reload", VCPU_STAT(fpu_reload) },
  84        { "insn_emulation", VCPU_STAT(insn_emulation) },
  85        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
  86        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
  87        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
  88        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
  89        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
  90        { "mmu_flooded", VM_STAT(mmu_flooded) },
  91        { "mmu_recycled", VM_STAT(mmu_recycled) },
  92        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
  93        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
  94        { "largepages", VM_STAT(lpages) },
  95        { NULL }
  96};
  97
  98
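    /*
     * Return the linear base address of the segment named by @selector,
     * read from the host GDT (or, for LDT selectors, from the LDT whose
     * own base is looked up recursively).
     */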
  99unsigned long segment_base(u16 selector)
 100{
 101        struct descriptor_table gdt;
 102        struct desc_struct *d;
 103        unsigned long table_base;
 104        unsigned long v;
 105
 106        if (selector == 0)
 107                return 0;
 108
 109        asm("sgdt %0" : "=m"(gdt));
 110        table_base = gdt.base;
 111
 112        if (selector & 4) {           /* from ldt */
 113                u16 ldt_selector;
 114
 115                asm("sldt %0" : "=g"(ldt_selector));
 116                table_base = segment_base(ldt_selector);
 117        }
 118        d = (struct desc_struct *)(table_base + (selector & ~7));
 119        v = d->base0 | ((unsigned long)d->base1 << 16) |
 120                ((unsigned long)d->base2 << 24);
 121#ifdef CONFIG_X86_64
 122        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 123                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 124#endif
 125        return v;
 126}
 127EXPORT_SYMBOL_GPL(segment_base);
 128
 129u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 130{
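            /*
             * Both branches currently return the cached value; the split
             * mirrors kvm_set_apic_base(), where the in-kernel LAPIC case
             * is handled differently.
             */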
 131        if (irqchip_in_kernel(vcpu->kvm))
 132                return vcpu->arch.apic_base;
 133        else
 134                return vcpu->arch.apic_base;
 135}
 136EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 137
 138void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 139{
 140        /* TODO: reserved bits check */
 141        if (irqchip_in_kernel(vcpu->kvm))
 142                kvm_lapic_set_base(vcpu, data);
 143        else
 144                vcpu->arch.apic_base = data;
 145}
 146EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 147
 148void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 149{
 150        WARN_ON(vcpu->arch.exception.pending);
 151        vcpu->arch.exception.pending = true;
 152        vcpu->arch.exception.has_error_code = false;
 153        vcpu->arch.exception.nr = nr;
 154}
 155EXPORT_SYMBOL_GPL(kvm_queue_exception);
 156
 157void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 158                           u32 error_code)
 159{
 160        ++vcpu->stat.pf_guest;
 161        if (vcpu->arch.exception.pending) {
 162                if (vcpu->arch.exception.nr == PF_VECTOR) {
 163                        printk(KERN_DEBUG "kvm: inject_page_fault:"
 164                                        " double fault 0x%lx\n", addr);
 165                        vcpu->arch.exception.nr = DF_VECTOR;
 166                        vcpu->arch.exception.error_code = 0;
 167                } else if (vcpu->arch.exception.nr == DF_VECTOR) {
 168                        /* triple fault -> shutdown */
 169                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 170                }
 171                return;
 172        }
 173        vcpu->arch.cr2 = addr;
 174        kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 175}
 176
 177void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 178{
 179        vcpu->arch.nmi_pending = 1;
 180}
 181EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 182
 183void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 184{
 185        WARN_ON(vcpu->arch.exception.pending);
 186        vcpu->arch.exception.pending = true;
 187        vcpu->arch.exception.has_error_code = true;
 188        vcpu->arch.exception.nr = nr;
 189        vcpu->arch.exception.error_code = error_code;
 190}
 191EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 192
 193static void __queue_exception(struct kvm_vcpu *vcpu)
 194{
 195        kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 196                                     vcpu->arch.exception.has_error_code,
 197                                     vcpu->arch.exception.error_code);
 198}
 199
 200/*
 201 * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
 202 * a #GP and return false.
 203 */
 204bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 205{
 206        if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 207                return true;
 208        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 209        return false;
 210}
 211EXPORT_SYMBOL_GPL(kvm_require_cpl);
 212
 213/*
 214 * Load the pae pdptrs.  Return true if they are all valid.
 215 */
 216int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 217{
 218        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 219        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 220        int i;
 221        int ret;
 222        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 223
 224        ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 225                                  offset * sizeof(u64), sizeof(pdpte));
 226        if (ret < 0) {
 227                ret = 0;
 228                goto out;
 229        }
 230        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 231                if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
 232                        ret = 0;
 233                        goto out;
 234                }
 235        }
 236        ret = 1;
 237
 238        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 239out:
 240
 241        return ret;
 242}
 243EXPORT_SYMBOL_GPL(load_pdptrs);
 244
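    /*
     * Return true if the guest's PDPTEs in memory differ from the copy
     * cached in vcpu->arch.pdptrs.  Only meaningful for PAE paging outside
     * long mode; other modes report "unchanged".
     */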
 245static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 246{
 247        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 248        bool changed = true;
 249        int r;
 250
 251        if (is_long_mode(vcpu) || !is_pae(vcpu))
 252                return false;
 253
 254        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 255        if (r < 0)
 256                goto out;
 257        changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 258out:
 259
 260        return changed;
 261}
 262
 263void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 264{
 265        if (cr0 & CR0_RESERVED_BITS) {
 266                printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 267                       cr0, vcpu->arch.cr0);
 268                kvm_inject_gp(vcpu, 0);
 269                return;
 270        }
 271
 272        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 273                printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 274                kvm_inject_gp(vcpu, 0);
 275                return;
 276        }
 277
 278        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 279                printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 280                       "and a clear PE flag\n");
 281                kvm_inject_gp(vcpu, 0);
 282                return;
 283        }
 284
 285        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 286#ifdef CONFIG_X86_64
 287                if ((vcpu->arch.shadow_efer & EFER_LME)) {
 288                        int cs_db, cs_l;
 289
 290                        if (!is_pae(vcpu)) {
 291                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
 292                                       "in long mode while PAE is disabled\n");
 293                                kvm_inject_gp(vcpu, 0);
 294                                return;
 295                        }
 296                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 297                        if (cs_l) {
 298                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
 299                                       "in long mode while CS.L == 1\n");
 300                                kvm_inject_gp(vcpu, 0);
 301                                return;
 302
 303                        }
 304                } else
 305#endif
 306                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 307                        printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 308                               "reserved bits\n");
 309                        kvm_inject_gp(vcpu, 0);
 310                        return;
 311                }
 312
 313        }
 314
 315        kvm_x86_ops->set_cr0(vcpu, cr0);
 316        vcpu->arch.cr0 = cr0;
 317
 318        kvm_mmu_reset_context(vcpu);
 319        return;
 320}
 321EXPORT_SYMBOL_GPL(kvm_set_cr0);
 322
 323void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 324{
 325        kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 326        KVMTRACE_1D(LMSW, vcpu,
 327                    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
 328                    handler);
 329}
 330EXPORT_SYMBOL_GPL(kvm_lmsw);
 331
 332void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 333{
 334        unsigned long old_cr4 = vcpu->arch.cr4;
 335        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 336
 337        if (cr4 & CR4_RESERVED_BITS) {
 338                printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 339                kvm_inject_gp(vcpu, 0);
 340                return;
 341        }
 342
 343        if (is_long_mode(vcpu)) {
 344                if (!(cr4 & X86_CR4_PAE)) {
 345                        printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 346                               "in long mode\n");
 347                        kvm_inject_gp(vcpu, 0);
 348                        return;
 349                }
 350        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 351                   && ((cr4 ^ old_cr4) & pdptr_bits)
 352                   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 353                printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 354                kvm_inject_gp(vcpu, 0);
 355                return;
 356        }
 357
 358        if (cr4 & X86_CR4_VMXE) {
 359                printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 360                kvm_inject_gp(vcpu, 0);
 361                return;
 362        }
 363        kvm_x86_ops->set_cr4(vcpu, cr4);
 364        vcpu->arch.cr4 = cr4;
 365        kvm_mmu_reset_context(vcpu);
 366}
 367EXPORT_SYMBOL_GPL(kvm_set_cr4);
 368
 369void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 370{
 371        if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 372                kvm_mmu_flush_tlb(vcpu);
 373                return;
 374        }
 375
 376        if (is_long_mode(vcpu)) {
 377                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 378                        printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 379                        kvm_inject_gp(vcpu, 0);
 380                        return;
 381                }
 382        } else {
 383                if (is_pae(vcpu)) {
 384                        if (cr3 & CR3_PAE_RESERVED_BITS) {
 385                                printk(KERN_DEBUG
 386                                       "set_cr3: #GP, reserved bits\n");
 387                                kvm_inject_gp(vcpu, 0);
 388                                return;
 389                        }
 390                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 391                                printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 392                                       "reserved bits\n");
 393                                kvm_inject_gp(vcpu, 0);
 394                                return;
 395                        }
 396                }
 397                /*
 398                 * We don't check reserved bits in nonpae mode, because
 399                 * this isn't enforced, and VMware depends on this.
 400                 */
 401        }
 402
 403        /*
 404         * Does the new cr3 value map to physical memory? (Note, we
 405         * catch an invalid cr3 even in real-mode, because it would
 406         * cause trouble later on when we turn on paging anyway.)
 407         *
 408         * A real CPU would silently accept an invalid cr3 and would
 409         * attempt to use it - with largely undefined (and often hard
 410         * to debug) behavior on the guest side.
 411         */
 412        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 413                kvm_inject_gp(vcpu, 0);
 414        else {
 415                vcpu->arch.cr3 = cr3;
 416                vcpu->arch.mmu.new_cr3(vcpu);
 417        }
 418}
 419EXPORT_SYMBOL_GPL(kvm_set_cr3);
 420
 421void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 422{
 423        if (cr8 & CR8_RESERVED_BITS) {
 424                printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 425                kvm_inject_gp(vcpu, 0);
 426                return;
 427        }
 428        if (irqchip_in_kernel(vcpu->kvm))
 429                kvm_lapic_set_tpr(vcpu, cr8);
 430        else
 431                vcpu->arch.cr8 = cr8;
 432}
 433EXPORT_SYMBOL_GPL(kvm_set_cr8);
 434
 435unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 436{
 437        if (irqchip_in_kernel(vcpu->kvm))
 438                return kvm_lapic_get_cr8(vcpu);
 439        else
 440                return vcpu->arch.cr8;
 441}
 442EXPORT_SYMBOL_GPL(kvm_get_cr8);
 443
 444/*
 445 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 446 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 447 *
 448 * This list is modified at module load time to reflect the
 449 * capabilities of the host cpu.
 450 */
 451static u32 msrs_to_save[] = {
 452        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 453        MSR_K6_STAR,
 454#ifdef CONFIG_X86_64
 455        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 456#endif
 457        MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 458        MSR_IA32_PERF_STATUS,
 459};
 460
 461static unsigned num_msrs_to_save;
 462
 463static u32 emulated_msrs[] = {
 464        MSR_IA32_MISC_ENABLE,
 465};
 466
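    /*
     * Validate and install a new EFER value: reject writes that touch
     * reserved bits or toggle EFER.LME while paging is enabled, and keep
     * the current EFER.LMA setting in the shadow copy.
     */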
 467static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 468{
 469        if (efer & efer_reserved_bits) {
 470                printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 471                       efer);
 472                kvm_inject_gp(vcpu, 0);
 473                return;
 474        }
 475
 476        if (is_paging(vcpu)
 477            && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 478                printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 479                kvm_inject_gp(vcpu, 0);
 480                return;
 481        }
 482
 483        kvm_x86_ops->set_efer(vcpu, efer);
 484
 485        efer &= ~EFER_LMA;
 486        efer |= vcpu->arch.shadow_efer & EFER_LMA;
 487
 488        vcpu->arch.shadow_efer = efer;
 489}
 490
 491void kvm_enable_efer_bits(u64 mask)
 492{
 493       efer_reserved_bits &= ~mask;
 494}
 495EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 496
 497
 498/*
 499 * Writes msr value into the appropriate "register".
 500 * Returns 0 on success, non-0 otherwise.
 501 * Assumes vcpu_load() was already called.
 502 */
 503int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 504{
 505        return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 506}
 507
 508/*
 509 * Adapt set_msr() to msr_io()'s calling convention
 510 */
 511static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 512{
 513        return kvm_set_msr(vcpu, index, *data);
 514}
 515
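    /*
     * Publish the host boot time at @wall_clock in guest memory.  The
     * version field is bumped to an odd value before the update and to an
     * even value afterwards, so the guest can detect a torn read.
     */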
 516static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 517{
 518        static int version;
 519        struct pvclock_wall_clock wc;
 520        struct timespec now, sys, boot;
 521
 522        if (!wall_clock)
 523                return;
 524
 525        version++;
 526
 527        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 528
 529        /*
 530         * The guest calculates current wall clock time by adding
 531         * system time (updated by kvm_write_guest_time below) to the
 532         * wall clock specified here.  guest system time equals host
 533         * system time for us, thus we must fill in host boot time here.
 534         */
 535        now = current_kernel_time();
 536        ktime_get_ts(&sys);
 537        boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 538
 539        wc.sec = boot.tv_sec;
 540        wc.nsec = boot.tv_nsec;
 541        wc.version = version;
 542
 543        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 544
 545        version++;
 546        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 547}
 548
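    /*
     * Return (dividend << 32) / divisor as a 32-bit value; used below to
     * build the fixed-point tsc_to_system_mul factor.
     */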
 549static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 550{
 551        uint32_t quotient, remainder;
 552
 553        /* Don't try to replace with do_div(), this one calculates
 554         * "(dividend << 32) / divisor" */
 555        __asm__ ( "divl %4"
 556                  : "=a" (quotient), "=d" (remainder)
 557                  : "0" (0), "1" (dividend), "r" (divisor) );
 558        return quotient;
 559}
 560
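    /*
     * Derive the pvclock scaling parameters for @tsc_khz: shift the TSC
     * rate into the range (1e9, 2e9] ticks per second, recording the
     * shift, then compute the 32-bit fixed-point multiplier that converts
     * shifted TSC ticks to nanoseconds.
     */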
 561static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 562{
 563        uint64_t nsecs = 1000000000LL;
 564        int32_t  shift = 0;
 565        uint64_t tps64;
 566        uint32_t tps32;
 567
 568        tps64 = tsc_khz * 1000LL;
 569        while (tps64 > nsecs*2) {
 570                tps64 >>= 1;
 571                shift--;
 572        }
 573
 574        tps32 = (uint32_t)tps64;
 575        while (tps32 <= (uint32_t)nsecs) {
 576                tps32 <<= 1;
 577                shift++;
 578        }
 579
 580        hv_clock->tsc_shift = shift;
 581        hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 582
 583        pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 584                 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
 585                 hv_clock->tsc_to_system_mul);
 586}
 587
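    /*
     * Copy the vcpu's pvclock data (TSC timestamp, system time and scaling
     * parameters) into the guest page registered via MSR_KVM_SYSTEM_TIME,
     * bumping the version by two so the guest sees a consistent snapshot.
     */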
 588static void kvm_write_guest_time(struct kvm_vcpu *v)
 589{
 590        struct timespec ts;
 591        unsigned long flags;
 592        struct kvm_vcpu_arch *vcpu = &v->arch;
 593        void *shared_kaddr;
 594
 595        if (!vcpu->time_page)
 596                return;
 597
 598        if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
 599                kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
 600                vcpu->hv_clock_tsc_khz = tsc_khz;
 601        }
 602
 603        /* Keep irq disabled to prevent changes to the clock */
 604        local_irq_save(flags);
 605        kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 606                          &vcpu->hv_clock.tsc_timestamp);
 607        ktime_get_ts(&ts);
 608        local_irq_restore(flags);
 609
 610        /* With all the info we got, fill in the values */
 611
 612        vcpu->hv_clock.system_time = ts.tv_nsec +
 613                                     (NSEC_PER_SEC * (u64)ts.tv_sec);
 614        /*
 615         * The interface expects us to write an even number signaling that the
 616         * update is finished. Since the guest won't see the intermediate
 617         * state, we just increase by 2 at the end.
 618         */
 619        vcpu->hv_clock.version += 2;
 620
 621        shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 622
 623        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 624               sizeof(vcpu->hv_clock));
 625
 626        kunmap_atomic(shared_kaddr, KM_USER0);
 627
 628        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 629}
 630
 631static bool msr_mtrr_valid(unsigned msr)
 632{
 633        switch (msr) {
 634        case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 635        case MSR_MTRRfix64K_00000:
 636        case MSR_MTRRfix16K_80000:
 637        case MSR_MTRRfix16K_A0000:
 638        case MSR_MTRRfix4K_C0000:
 639        case MSR_MTRRfix4K_C8000:
 640        case MSR_MTRRfix4K_D0000:
 641        case MSR_MTRRfix4K_D8000:
 642        case MSR_MTRRfix4K_E0000:
 643        case MSR_MTRRfix4K_E8000:
 644        case MSR_MTRRfix4K_F0000:
 645        case MSR_MTRRfix4K_F8000:
 646        case MSR_MTRRdefType:
 647        case MSR_IA32_CR_PAT:
 648                return true;
 649        case 0x2f8:
 650                return true;
 651        }
 652        return false;
 653}
 654
 655static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 656{
 657        if (!msr_mtrr_valid(msr))
 658                return 1;
 659
 660        vcpu->arch.mtrr[msr - 0x200] = data;
 661        return 0;
 662}
 663
 664int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 665{
 666        switch (msr) {
 667        case MSR_EFER:
 668                set_efer(vcpu, data);
 669                break;
 670        case MSR_IA32_MC0_STATUS:
 671                pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 672                       __func__, data);
 673                break;
 674        case MSR_IA32_MCG_STATUS:
 675                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 676                        __func__, data);
 677                break;
 678        case MSR_IA32_MCG_CTL:
 679                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 680                        __func__, data);
 681                break;
 682        case MSR_IA32_UCODE_REV:
 683        case MSR_IA32_UCODE_WRITE:
 684                break;
 685        case 0x200 ... 0x2ff:
 686                return set_msr_mtrr(vcpu, msr, data);
 687        case MSR_IA32_APICBASE:
 688                kvm_set_apic_base(vcpu, data);
 689                break;
 690        case MSR_IA32_MISC_ENABLE:
 691                vcpu->arch.ia32_misc_enable_msr = data;
 692                break;
 693        case MSR_KVM_WALL_CLOCK:
 694                vcpu->kvm->arch.wall_clock = data;
 695                kvm_write_wall_clock(vcpu->kvm, data);
 696                break;
 697        case MSR_KVM_SYSTEM_TIME: {
 698                if (vcpu->arch.time_page) {
 699                        kvm_release_page_dirty(vcpu->arch.time_page);
 700                        vcpu->arch.time_page = NULL;
 701                }
 702
 703                vcpu->arch.time = data;
 704
 705                /* we verify if the enable bit is set... */
 706                if (!(data & 1))
 707                        break;
 708
 709                /* ...but clean it before doing the actual write */
 710                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 711
 712                down_read(&current->mm->mmap_sem);
 713                vcpu->arch.time_page =
 714                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 715                up_read(&current->mm->mmap_sem);
 716
 717                if (is_error_page(vcpu->arch.time_page)) {
 718                        kvm_release_page_clean(vcpu->arch.time_page);
 719                        vcpu->arch.time_page = NULL;
 720                }
 721
 722                kvm_write_guest_time(vcpu);
 723                break;
 724        }
 725        default:
 726                pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 727                return 1;
 728        }
 729        return 0;
 730}
 731EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 732
 733
 734/*
 735 * Reads an msr value (of 'msr_index') into 'pdata'.
 736 * Returns 0 on success, non-0 otherwise.
 737 * Assumes vcpu_load() was already called.
 738 */
 739int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 740{
 741        return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 742}
 743
 744static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 745{
 746        if (!msr_mtrr_valid(msr))
 747                return 1;
 748
 749        *pdata = vcpu->arch.mtrr[msr - 0x200];
 750        return 0;
 751}
 752
 753int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 754{
 755        u64 data;
 756
 757        switch (msr) {
 758        case 0xc0010010: /* SYSCFG */
 759        case 0xc0010015: /* HWCR */
 760        case MSR_IA32_PLATFORM_ID:
 761        case MSR_IA32_P5_MC_ADDR:
 762        case MSR_IA32_P5_MC_TYPE:
 763        case MSR_IA32_MC0_CTL:
 764        case MSR_IA32_MCG_STATUS:
 765        case MSR_IA32_MCG_CAP:
 766        case MSR_IA32_MCG_CTL:
 767        case MSR_IA32_MC0_MISC:
 768        case MSR_IA32_MC0_MISC+4:
 769        case MSR_IA32_MC0_MISC+8:
 770        case MSR_IA32_MC0_MISC+12:
 771        case MSR_IA32_MC0_MISC+16:
 772        case MSR_IA32_MC0_MISC+20:
 773        case MSR_IA32_UCODE_REV:
 774        case MSR_IA32_EBL_CR_POWERON:
 775                data = 0;
 776                break;
 777        case MSR_MTRRcap:
 778                data = 0x500 | KVM_NR_VAR_MTRR;
 779                break;
 780        case 0x200 ... 0x2ff:
 781                return get_msr_mtrr(vcpu, msr, pdata);
 782        case 0xcd: /* fsb frequency */
 783                data = 3;
 784                break;
 785        case MSR_IA32_APICBASE:
 786                data = kvm_get_apic_base(vcpu);
 787                break;
 788        case MSR_IA32_MISC_ENABLE:
 789                data = vcpu->arch.ia32_misc_enable_msr;
 790                break;
 791        case MSR_IA32_PERF_STATUS:
 792                /* TSC increment by tick */
 793                data = 1000ULL;
 794                /* CPU multiplier */
 795                data |= (((uint64_t)4ULL) << 40);
 796                break;
 797        case MSR_EFER:
 798                data = vcpu->arch.shadow_efer;
 799                break;
 800        case MSR_KVM_WALL_CLOCK:
 801                data = vcpu->kvm->arch.wall_clock;
 802                break;
 803        case MSR_KVM_SYSTEM_TIME:
 804                data = vcpu->arch.time;
 805                break;
 806        default:
 807                pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 808                return 1;
 809        }
 810        *pdata = data;
 811        return 0;
 812}
 813EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 814
 815/*
 816 * Read or write a bunch of msrs. All parameters are kernel addresses.
 817 *
 818 * @return number of msrs set successfully.
 819 */
 820static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 821                    struct kvm_msr_entry *entries,
 822                    int (*do_msr)(struct kvm_vcpu *vcpu,
 823                                  unsigned index, u64 *data))
 824{
 825        int i;
 826
 827        vcpu_load(vcpu);
 828
 829        down_read(&vcpu->kvm->slots_lock);
 830        for (i = 0; i < msrs->nmsrs; ++i)
 831                if (do_msr(vcpu, entries[i].index, &entries[i].data))
 832                        break;
 833        up_read(&vcpu->kvm->slots_lock);
 834
 835        vcpu_put(vcpu);
 836
 837        return i;
 838}
 839
 840/*
 841 * Read or write a bunch of msrs. Parameters are user addresses.
 842 *
 843 * @return number of msrs set successfully.
 844 */
 845static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 846                  int (*do_msr)(struct kvm_vcpu *vcpu,
 847                                unsigned index, u64 *data),
 848                  int writeback)
 849{
 850        struct kvm_msrs msrs;
 851        struct kvm_msr_entry *entries;
 852        int r, n;
 853        unsigned size;
 854
 855        r = -EFAULT;
 856        if (copy_from_user(&msrs, user_msrs, sizeof msrs))
 857                goto out;
 858
 859        r = -E2BIG;
 860        if (msrs.nmsrs >= MAX_IO_MSRS)
 861                goto out;
 862
 863        r = -ENOMEM;
 864        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
 865        entries = vmalloc(size);
 866        if (!entries)
 867                goto out;
 868
 869        r = -EFAULT;
 870        if (copy_from_user(entries, user_msrs->entries, size))
 871                goto out_free;
 872
 873        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 874        if (r < 0)
 875                goto out_free;
 876
 877        r = -EFAULT;
 878        if (writeback && copy_to_user(user_msrs->entries, entries, size))
 879                goto out_free;
 880
 881        r = n;
 882
 883out_free:
 884        vfree(entries);
 885out:
 886        return r;
 887}
 888
 889int kvm_dev_ioctl_check_extension(long ext)
 890{
 891        int r;
 892
 893        switch (ext) {
 894        case KVM_CAP_IRQCHIP:
 895        case KVM_CAP_HLT:
 896        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 897        case KVM_CAP_USER_MEMORY:
 898        case KVM_CAP_SET_TSS_ADDR:
 899        case KVM_CAP_EXT_CPUID:
 900        case KVM_CAP_CLOCKSOURCE:
 901        case KVM_CAP_PIT:
 902        case KVM_CAP_NOP_IO_DELAY:
 903        case KVM_CAP_MP_STATE:
 904        case KVM_CAP_SYNC_MMU:
 905                r = 1;
 906                break;
 907        case KVM_CAP_COALESCED_MMIO:
 908                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 909                break;
 910        case KVM_CAP_VAPIC:
 911                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 912                break;
 913        case KVM_CAP_NR_VCPUS:
 914                r = KVM_MAX_VCPUS;
 915                break;
 916        case KVM_CAP_NR_MEMSLOTS:
 917                r = KVM_MEMORY_SLOTS;
 918                break;
 919        case KVM_CAP_PV_MMU:
 920                r = !tdp_enabled;
 921                break;
 922        default:
 923                r = 0;
 924                break;
 925        }
 926        return r;
 927
 928}
 929
 930long kvm_arch_dev_ioctl(struct file *filp,
 931                        unsigned int ioctl, unsigned long arg)
 932{
 933        void __user *argp = (void __user *)arg;
 934        long r;
 935
 936        switch (ioctl) {
 937        case KVM_GET_MSR_INDEX_LIST: {
 938                struct kvm_msr_list __user *user_msr_list = argp;
 939                struct kvm_msr_list msr_list;
 940                unsigned n;
 941
 942                r = -EFAULT;
 943                if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
 944                        goto out;
 945                n = msr_list.nmsrs;
 946                msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
 947                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
 948                        goto out;
 949                r = -E2BIG;
 950                if (n < num_msrs_to_save)
 951                        goto out;
 952                r = -EFAULT;
 953                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
 954                                 num_msrs_to_save * sizeof(u32)))
 955                        goto out;
 956                if (copy_to_user(user_msr_list->indices
 957                                 + num_msrs_to_save * sizeof(u32),
 958                                 &emulated_msrs,
 959                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
 960                        goto out;
 961                r = 0;
 962                break;
 963        }
 964        case KVM_GET_SUPPORTED_CPUID: {
 965                struct kvm_cpuid2 __user *cpuid_arg = argp;
 966                struct kvm_cpuid2 cpuid;
 967
 968                r = -EFAULT;
 969                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 970                        goto out;
 971                r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
 972                        cpuid_arg->entries);
 973                if (r)
 974                        goto out;
 975
 976                r = -EFAULT;
 977                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
 978                        goto out;
 979                r = 0;
 980                break;
 981        }
 982        default:
 983                r = -EINVAL;
 984        }
 985out:
 986        return r;
 987}
 988
 989void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 990{
 991        kvm_x86_ops->vcpu_load(vcpu, cpu);
 992        kvm_write_guest_time(vcpu);
 993}
 994
 995void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 996{
 997        kvm_x86_ops->vcpu_put(vcpu);
 998        kvm_put_guest_fpu(vcpu);
 999}
1000
1001static int is_efer_nx(void)
1002{
1003        unsigned long long efer = 0;
1004
1005        rdmsrl_safe(MSR_EFER, &efer);
1006        return efer & EFER_NX;
1007}
1008
1009static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1010{
1011        int i;
1012        struct kvm_cpuid_entry2 *e, *entry;
1013
1014        entry = NULL;
1015        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1016                e = &vcpu->arch.cpuid_entries[i];
1017                if (e->function == 0x80000001) {
1018                        entry = e;
1019                        break;
1020                }
1021        }
1022        if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1023                entry->edx &= ~(1 << 20);
1024                printk(KERN_INFO "kvm: guest NX capability removed\n");
1025        }
1026}
1027
1028/* legacy path: an old userspace passes struct kvm_cpuid entries to a newer kernel module */
1029static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1030                                    struct kvm_cpuid *cpuid,
1031                                    struct kvm_cpuid_entry __user *entries)
1032{
1033        int r, i;
1034        struct kvm_cpuid_entry *cpuid_entries;
1035
1036        r = -E2BIG;
1037        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1038                goto out;
1039        r = -ENOMEM;
1040        cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1041        if (!cpuid_entries)
1042                goto out;
1043        r = -EFAULT;
1044        if (copy_from_user(cpuid_entries, entries,
1045                           cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1046                goto out_free;
1047        for (i = 0; i < cpuid->nent; i++) {
1048                vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1049                vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1050                vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1051                vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1052                vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1053                vcpu->arch.cpuid_entries[i].index = 0;
1054                vcpu->arch.cpuid_entries[i].flags = 0;
1055                vcpu->arch.cpuid_entries[i].padding[0] = 0;
1056                vcpu->arch.cpuid_entries[i].padding[1] = 0;
1057                vcpu->arch.cpuid_entries[i].padding[2] = 0;
1058        }
1059        vcpu->arch.cpuid_nent = cpuid->nent;
1060        cpuid_fix_nx_cap(vcpu);
1061        r = 0;
1062
1063out_free:
1064        vfree(cpuid_entries);
1065out:
1066        return r;
1067}
1068
1069static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1070                                    struct kvm_cpuid2 *cpuid,
1071                                    struct kvm_cpuid_entry2 __user *entries)
1072{
1073        int r;
1074
1075        r = -E2BIG;
1076        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1077                goto out;
1078        r = -EFAULT;
1079        if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1080                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1081                goto out;
1082        vcpu->arch.cpuid_nent = cpuid->nent;
1083        return 0;
1084
1085out:
1086        return r;
1087}
1088
1089static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1090                                    struct kvm_cpuid2 *cpuid,
1091                                    struct kvm_cpuid_entry2 __user *entries)
1092{
1093        int r;
1094
1095        r = -E2BIG;
1096        if (cpuid->nent < vcpu->arch.cpuid_nent)
1097                goto out;
1098        r = -EFAULT;
1099        if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1100                           vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1101                goto out;
1102        return 0;
1103
1104out:
1105        cpuid->nent = vcpu->arch.cpuid_nent;
1106        return r;
1107}
1108
1109static inline u32 bit(int bitno)
1110{
1111        return 1 << (bitno & 31);
1112}
1113
1114static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1115                          u32 index)
1116{
1117        entry->function = function;
1118        entry->index = index;
1119        cpuid_count(entry->function, entry->index,
1120                &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1121        entry->flags = 0;
1122}
1123
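    /*
     * Fill @entry for (@function, @index) from host CPUID, masked by the
     * feature bits KVM can expose, and append extra sub-leaf entries for
     * the stateful/indexed functions (2, 4 and 0xb).
     */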
1124static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1125                         u32 index, int *nent, int maxnent)
1126{
1127        const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1128                bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1129                bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1130                bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1131                bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1132                bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1133                bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1134                bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1135                bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1136                bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1137        const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1138                bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1139                bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1140                bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1141                bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1142                bit(X86_FEATURE_PGE) |
1143                bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1144                bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1145                bit(X86_FEATURE_SYSCALL) |
1146                (bit(X86_FEATURE_NX) && is_efer_nx()) |
1147#ifdef CONFIG_X86_64
1148                bit(X86_FEATURE_LM) |
1149#endif
1150                bit(X86_FEATURE_MMXEXT) |
1151                bit(X86_FEATURE_3DNOWEXT) |
1152                bit(X86_FEATURE_3DNOW);
1153        const u32 kvm_supported_word3_x86_features =
1154                bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1155        const u32 kvm_supported_word6_x86_features =
1156                bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
1157
1158        /* all cpuid_count() calls for function 2 must be made on the same cpu */
1159        get_cpu();
1160        do_cpuid_1_ent(entry, function, index);
1161        ++*nent;
1162
1163        switch (function) {
1164        case 0:
1165                entry->eax = min(entry->eax, (u32)0xb);
1166                break;
1167        case 1:
1168                entry->edx &= kvm_supported_word0_x86_features;
1169                entry->ecx &= kvm_supported_word3_x86_features;
1170                break;
1171        /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1172         * may return different values. This forces us to get_cpu() before
1173         * issuing the first command, and also to emulate this annoying behavior
1174         * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1175        case 2: {
1176                int t, times = entry->eax & 0xff;
1177
1178                entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1179                for (t = 1; t < times && *nent < maxnent; ++t) {
1180                        do_cpuid_1_ent(&entry[t], function, 0);
1181                        entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1182                        ++*nent;
1183                }
1184                break;
1185        }
1186        /* functions 4 and 0xb have an additional index. */
1187        case 4: {
1188                int i, cache_type;
1189
1190                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1191                /* read more entries until cache_type is zero */
1192                for (i = 1; *nent < maxnent; ++i) {
1193                        cache_type = entry[i - 1].eax & 0x1f;
1194                        if (!cache_type)
1195                                break;
1196                        do_cpuid_1_ent(&entry[i], function, i);
1197                        entry[i].flags |=
1198                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1199                        ++*nent;
1200                }
1201                break;
1202        }
1203        case 0xb: {
1204                int i, level_type;
1205
1206                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1207                /* read more entries until level_type is zero */
1208                for (i = 1; *nent < maxnent; ++i) {
1209                        level_type = entry[i - 1].ecx & 0xff;
1210                        if (!level_type)
1211                                break;
1212                        do_cpuid_1_ent(&entry[i], function, i);
1213                        entry[i].flags |=
1214                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1215                        ++*nent;
1216                }
1217                break;
1218        }
1219        case 0x80000000:
1220                entry->eax = min(entry->eax, 0x8000001a);
1221                break;
1222        case 0x80000001:
1223                entry->edx &= kvm_supported_word1_x86_features;
1224                entry->ecx &= kvm_supported_word6_x86_features;
1225                break;
1226        }
1227        put_cpu();
1228}
1229
1230static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1231                                    struct kvm_cpuid_entry2 __user *entries)
1232{
1233        struct kvm_cpuid_entry2 *cpuid_entries;
1234        int limit, nent = 0, r = -E2BIG;
1235        u32 func;
1236
1237        if (cpuid->nent < 1)
1238                goto out;
1239        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1240                cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1241        r = -ENOMEM;
1242        cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1243        if (!cpuid_entries)
1244                goto out;
1245
1246        do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1247        limit = cpuid_entries[0].eax;
1248        for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1249                do_cpuid_ent(&cpuid_entries[nent], func, 0,
1250                                &nent, cpuid->nent);
1251        r = -E2BIG;
1252        if (nent >= cpuid->nent)
1253                goto out_free;
1254
1255        do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1256        limit = cpuid_entries[nent - 1].eax;
1257        for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1258                do_cpuid_ent(&cpuid_entries[nent], func, 0,
1259                               &nent, cpuid->nent);
1260        r = -EFAULT;
1261        if (copy_to_user(entries, cpuid_entries,
1262                        nent * sizeof(struct kvm_cpuid_entry2)))
1263                goto out_free;
1264        cpuid->nent = nent;
1265        r = 0;
1266
1267out_free:
1268        vfree(cpuid_entries);
1269out:
1270        return r;
1271}
1272
1273static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1274                                    struct kvm_lapic_state *s)
1275{
1276        vcpu_load(vcpu);
1277        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1278        vcpu_put(vcpu);
1279
1280        return 0;
1281}
1282
1283static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1284                                    struct kvm_lapic_state *s)
1285{
1286        vcpu_load(vcpu);
1287        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1288        kvm_apic_post_state_restore(vcpu);
1289        vcpu_put(vcpu);
1290
1291        return 0;
1292}
1293
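    /*
     * Queue an external interrupt raised by userspace; only allowed when
     * the interrupt controller is emulated in userspace (no in-kernel
     * irqchip).
     */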
1294static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1295                                    struct kvm_interrupt *irq)
1296{
1297        if (irq->irq < 0 || irq->irq >= 256)
1298                return -EINVAL;
1299        if (irqchip_in_kernel(vcpu->kvm))
1300                return -ENXIO;
1301        vcpu_load(vcpu);
1302
1303        set_bit(irq->irq, vcpu->arch.irq_pending);
1304        set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1305
1306        vcpu_put(vcpu);
1307
1308        return 0;
1309}
1310
1311static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1312                                           struct kvm_tpr_access_ctl *tac)
1313{
1314        if (tac->flags)
1315                return -EINVAL;
1316        vcpu->arch.tpr_access_reporting = !!tac->enabled;
1317        return 0;
1318}
1319
1320long kvm_arch_vcpu_ioctl(struct file *filp,
1321                         unsigned int ioctl, unsigned long arg)
1322{
1323        struct kvm_vcpu *vcpu = filp->private_data;
1324        void __user *argp = (void __user *)arg;
1325        int r;
1326        struct kvm_lapic_state *lapic = NULL;
1327
1328        switch (ioctl) {
1329        case KVM_GET_LAPIC: {
1330                lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1331
1332                r = -ENOMEM;
1333                if (!lapic)
1334                        goto out;
1335                r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1336                if (r)
1337                        goto out;
1338                r = -EFAULT;
1339                if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1340                        goto out;
1341                r = 0;
1342                break;
1343        }
1344        case KVM_SET_LAPIC: {
1345                lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1346                r = -ENOMEM;
1347                if (!lapic)
1348                        goto out;
1349                r = -EFAULT;
1350                if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1351                        goto out;
1352                r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1353                if (r)
1354                        goto out;
1355                r = 0;
1356                break;
1357        }
1358        case KVM_INTERRUPT: {
1359                struct kvm_interrupt irq;
1360
1361                r = -EFAULT;
1362                if (copy_from_user(&irq, argp, sizeof irq))
1363                        goto out;
1364                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1365                if (r)
1366                        goto out;
1367                r = 0;
1368                break;
1369        }
1370        case KVM_SET_CPUID: {
1371                struct kvm_cpuid __user *cpuid_arg = argp;
1372                struct kvm_cpuid cpuid;
1373
1374                r = -EFAULT;
1375                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1376                        goto out;
1377                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1378                if (r)
1379                        goto out;
1380                break;
1381        }
1382        case KVM_SET_CPUID2: {
1383                struct kvm_cpuid2 __user *cpuid_arg = argp;
1384                struct kvm_cpuid2 cpuid;
1385
1386                r = -EFAULT;
1387                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1388                        goto out;
1389                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1390                                cpuid_arg->entries);
1391                if (r)
1392                        goto out;
1393                break;
1394        }
1395        case KVM_GET_CPUID2: {
1396                struct kvm_cpuid2 __user *cpuid_arg = argp;
1397                struct kvm_cpuid2 cpuid;
1398
1399                r = -EFAULT;
1400                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1401                        goto out;
1402                r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1403                                cpuid_arg->entries);
1404                if (r)
1405                        goto out;
1406                r = -EFAULT;
1407                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1408                        goto out;
1409                r = 0;
1410                break;
1411        }
1412        case KVM_GET_MSRS:
1413                r = msr_io(vcpu, argp, kvm_get_msr, 1);
1414                break;
1415        case KVM_SET_MSRS:
1416                r = msr_io(vcpu, argp, do_set_msr, 0);
1417                break;
1418        case KVM_TPR_ACCESS_REPORTING: {
1419                struct kvm_tpr_access_ctl tac;
1420
1421                r = -EFAULT;
1422                if (copy_from_user(&tac, argp, sizeof tac))
1423                        goto out;
1424                r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1425                if (r)
1426                        goto out;
1427                r = -EFAULT;
1428                if (copy_to_user(argp, &tac, sizeof tac))
1429                        goto out;
1430                r = 0;
1431                break;
1432        }
1433        case KVM_SET_VAPIC_ADDR: {
1434                struct kvm_vapic_addr va;
1435
1436                r = -EINVAL;
1437                if (!irqchip_in_kernel(vcpu->kvm))
1438                        goto out;
1439                r = -EFAULT;
1440                if (copy_from_user(&va, argp, sizeof va))
1441                        goto out;
1442                r = 0;
1443                kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1444                break;
1445        }
1446        default:
1447                r = -EINVAL;
1448        }
1449out:
1450        if (lapic)
1451                kfree(lapic);
1452        return r;
1453}
1454
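    /*
     * Reject TSS addresses within the last three pages of the 32-bit
     * address space (the TSS occupies three pages); otherwise hand the
     * address to the vendor-specific set_tss_addr hook.
     */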
1455static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1456{
1457        int ret;
1458
1459        if (addr > (unsigned int)(-3 * PAGE_SIZE))
1460                return -1;
1461        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1462        return ret;
1463}
1464
1465static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1466                                          u32 kvm_nr_mmu_pages)
1467{
1468        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1469                return -EINVAL;
1470
1471        down_write(&kvm->slots_lock);
1472        spin_lock(&kvm->mmu_lock);
1473
1474        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1475        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1476
1477        spin_unlock(&kvm->mmu_lock);
1478        up_write(&kvm->slots_lock);
1479        return 0;
1480}
1481
1482static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1483{
1484        return kvm->arch.n_alloc_mmu_pages;
1485}
1486
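    /*
     * Translate @gfn through the VM's memory alias table; frames outside
     * every alias region are returned unchanged.
     */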
1487gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1488{
1489        int i;
1490        struct kvm_mem_alias *alias;
1491
1492        for (i = 0; i < kvm->arch.naliases; ++i) {
1493                alias = &kvm->arch.aliases[i];
1494                if (gfn >= alias->base_gfn
1495                    && gfn < alias->base_gfn + alias->npages)
1496                        return alias->target_gfn + gfn - alias->base_gfn;
1497        }
1498        return gfn;
1499}
1500
1501/*
1502 * Set a new alias region.  Aliases map a portion of physical memory into
1503 * another portion.  This is useful for memory windows, for example the PC
1504 * VGA region.
1505 */
1506static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1507                                         struct kvm_memory_alias *alias)
1508{
1509        int r, n;
1510        struct kvm_mem_alias *p;
1511
1512        r = -EINVAL;
1513        /* General sanity checks */
1514        if (alias->memory_size & (PAGE_SIZE - 1))
1515                goto out;
1516        if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1517                goto out;
1518        if (alias->slot >= KVM_ALIAS_SLOTS)
1519                goto out;
1520        if (alias->guest_phys_addr + alias->memory_size
1521            < alias->guest_phys_addr)
1522                goto out;
1523        if (alias->target_phys_addr + alias->memory_size
1524            < alias->target_phys_addr)
1525                goto out;
1526
1527        down_write(&kvm->slots_lock);
1528        spin_lock(&kvm->mmu_lock);
1529
1530        p = &kvm->arch.aliases[alias->slot];
1531        p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1532        p->npages = alias->memory_size >> PAGE_SHIFT;
1533        p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1534
1535        for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1536                if (kvm->arch.aliases[n - 1].npages)
1537                        break;
1538        kvm->arch.naliases = n;
1539
1540        spin_unlock(&kvm->mmu_lock);
1541        kvm_mmu_zap_all(kvm);
1542
1543        up_write(&kvm->slots_lock);
1544
1545        return 0;
1546
1547out:
1548        return r;
1549}
1550
1551static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1552{
1553        int r;
1554
1555        r = 0;
1556        switch (chip->chip_id) {
1557        case KVM_IRQCHIP_PIC_MASTER:
1558                memcpy(&chip->chip.pic,
1559                        &pic_irqchip(kvm)->pics[0],
1560                        sizeof(struct kvm_pic_state));
1561                break;
1562        case KVM_IRQCHIP_PIC_SLAVE:
1563                memcpy(&chip->chip.pic,
1564                        &pic_irqchip(kvm)->pics[1],
1565                        sizeof(struct kvm_pic_state));
1566                break;
1567        case KVM_IRQCHIP_IOAPIC:
1568                memcpy(&chip->chip.ioapic,
1569                        ioapic_irqchip(kvm),
1570                        sizeof(struct kvm_ioapic_state));
1571                break;
1572        default:
1573                r = -EINVAL;
1574                break;
1575        }
1576        return r;
1577}
1578
1579static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1580{
1581        int r;
1582
1583        r = 0;
1584        switch (chip->chip_id) {
1585        case KVM_IRQCHIP_PIC_MASTER:
1586                memcpy(&pic_irqchip(kvm)->pics[0],
1587                        &chip->chip.pic,
1588                        sizeof(struct kvm_pic_state));
1589                break;
1590        case KVM_IRQCHIP_PIC_SLAVE:
1591                memcpy(&pic_irqchip(kvm)->pics[1],
1592                        &chip->chip.pic,
1593                        sizeof(struct kvm_pic_state));
1594                break;
1595        case KVM_IRQCHIP_IOAPIC:
1596                memcpy(ioapic_irqchip(kvm),
1597                        &chip->chip.ioapic,
1598                        sizeof(struct kvm_ioapic_state));
1599                break;
1600        default:
1601                r = -EINVAL;
1602                break;
1603        }
1604        kvm_pic_update_irq(pic_irqchip(kvm));
1605        return r;
1606}
1607
1608static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1609{
1610        int r = 0;
1611
1612        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1613        return r;
1614}
1615
1616static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1617{
1618        int r = 0;
1619
1620        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1621        kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1622        return r;
1623}
1624
1625/*
1626 * Get (and clear) the dirty memory log for a memory slot.
1627 */
1628int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1629                                      struct kvm_dirty_log *log)
1630{
1631        int r;
1632        int n;
1633        struct kvm_memory_slot *memslot;
1634        int is_dirty = 0;
1635
1636        down_write(&kvm->slots_lock);
1637
1638        r = kvm_get_dirty_log(kvm, log, &is_dirty);
1639        if (r)
1640                goto out;
1641
1642        /* If nothing is dirty, don't bother messing with page tables. */
1643        if (is_dirty) {
1644                spin_lock(&kvm->mmu_lock);
1645                kvm_mmu_slot_remove_write_access(kvm, log->slot);
1646                spin_unlock(&kvm->mmu_lock);
1647                kvm_flush_remote_tlbs(kvm);
1648                memslot = &kvm->memslots[log->slot];
1649                n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1650                memset(memslot->dirty_bitmap, 0, n);
1651        }
1652        r = 0;
1653out:
1654        up_write(&kvm->slots_lock);
1655        return r;
1656}
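
/*
 * Userspace usage sketch (illustrative only: vm_fd, the slot number and the
 * bitmap size are assumptions; a 64-bit host with BITS_PER_LONG == 64 is
 * assumed).  The bitmap needs one bit per page of the slot, rounded up to a
 * multiple of BITS_PER_LONG as in the ALIGN() above:
 *
 *	unsigned long bitmap[(npages + 63) / 64];
 *	struct kvm_dirty_log log = {
 *		.slot		= 0,
 *		.dirty_bitmap	= bitmap,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0)
 *		perror("KVM_GET_DIRTY_LOG");
 */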
1657
1658long kvm_arch_vm_ioctl(struct file *filp,
1659                       unsigned int ioctl, unsigned long arg)
1660{
1661        struct kvm *kvm = filp->private_data;
1662        void __user *argp = (void __user *)arg;
1663        int r = -EINVAL;
1664        /*
1665         * This union makes it completely explicit to gcc-3.x
1666         * that these two variables' stack usage should be
1667         * combined, not added together.
1668         */
1669        union {
1670                struct kvm_pit_state ps;
1671                struct kvm_memory_alias alias;
1672        } u;
1673
1674        switch (ioctl) {
1675        case KVM_SET_TSS_ADDR:
1676                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1677                if (r < 0)
1678                        goto out;
1679                break;
1680        case KVM_SET_MEMORY_REGION: {
1681                struct kvm_memory_region kvm_mem;
1682                struct kvm_userspace_memory_region kvm_userspace_mem;
1683
1684                r = -EFAULT;
1685                if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1686                        goto out;
1687                kvm_userspace_mem.slot = kvm_mem.slot;
1688                kvm_userspace_mem.flags = kvm_mem.flags;
1689                kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1690                kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1691                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1692                if (r)
1693                        goto out;
1694                break;
1695        }
1696        case KVM_SET_NR_MMU_PAGES:
1697                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1698                if (r)
1699                        goto out;
1700                break;
1701        case KVM_GET_NR_MMU_PAGES:
1702                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1703                break;
1704        case KVM_SET_MEMORY_ALIAS:
1705                r = -EFAULT;
1706                if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1707                        goto out;
1708                r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1709                if (r)
1710                        goto out;
1711                break;
1712        case KVM_CREATE_IRQCHIP:
1713                r = -ENOMEM;
1714                kvm->arch.vpic = kvm_create_pic(kvm);
1715                if (kvm->arch.vpic) {
1716                        r = kvm_ioapic_init(kvm);
1717                        if (r) {
1718                                kfree(kvm->arch.vpic);
1719                                kvm->arch.vpic = NULL;
1720                                goto out;
1721                        }
1722                } else
1723                        goto out;
1724                break;
1725        case KVM_CREATE_PIT:
1726                r = -ENOMEM;
1727                kvm->arch.vpit = kvm_create_pit(kvm);
1728                if (kvm->arch.vpit)
1729                        r = 0;
1730                break;
1731        case KVM_IRQ_LINE: {
1732                struct kvm_irq_level irq_event;
1733
1734                r = -EFAULT;
1735                if (copy_from_user(&irq_event, argp, sizeof irq_event))
1736                        goto out;
1737                if (irqchip_in_kernel(kvm)) {
1738                        mutex_lock(&kvm->lock);
1739                        if (irq_event.irq < 16)
1740                                kvm_pic_set_irq(pic_irqchip(kvm),
1741                                        irq_event.irq,
1742                                        irq_event.level);
1743                        kvm_ioapic_set_irq(kvm->arch.vioapic,
1744                                        irq_event.irq,
1745                                        irq_event.level);
1746                        mutex_unlock(&kvm->lock);
1747                        r = 0;
1748                }
1749                break;
1750        }
1751        case KVM_GET_IRQCHIP: {
1752                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1753                struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1754
1755                r = -ENOMEM;
1756                if (!chip)
1757                        goto out;
1758                r = -EFAULT;
1759                if (copy_from_user(chip, argp, sizeof *chip))
1760                        goto get_irqchip_out;
1761                r = -ENXIO;
1762                if (!irqchip_in_kernel(kvm))
1763                        goto get_irqchip_out;
1764                r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1765                if (r)
1766                        goto get_irqchip_out;
1767                r = -EFAULT;
1768                if (copy_to_user(argp, chip, sizeof *chip))
1769                        goto get_irqchip_out;
1770                r = 0;
1771        get_irqchip_out:
1772                kfree(chip);
1773                if (r)
1774                        goto out;
1775                break;
1776        }
1777        case KVM_SET_IRQCHIP: {
1778                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1779                struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1780
1781                r = -ENOMEM;
1782                if (!chip)
1783                        goto out;
1784                r = -EFAULT;
1785                if (copy_from_user(chip, argp, sizeof *chip))
1786                        goto set_irqchip_out;
1787                r = -ENXIO;
1788                if (!irqchip_in_kernel(kvm))
1789                        goto set_irqchip_out;
1790                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1791                if (r)
1792                        goto set_irqchip_out;
1793                r = 0;
1794        set_irqchip_out:
1795                kfree(chip);
1796                if (r)
1797                        goto out;
1798                break;
1799        }
1800        case KVM_GET_PIT: {
1801                r = -EFAULT;
1802                if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1803                        goto out;
1804                r = -ENXIO;
1805                if (!kvm->arch.vpit)
1806                        goto out;
1807                r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1808                if (r)
1809                        goto out;
1810                r = -EFAULT;
1811                if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1812                        goto out;
1813                r = 0;
1814                break;
1815        }
1816        case KVM_SET_PIT: {
1817                r = -EFAULT;
1818                if (copy_from_user(&u.ps, argp, sizeof u.ps))
1819                        goto out;
1820                r = -ENXIO;
1821                if (!kvm->arch.vpit)
1822                        goto out;
1823                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1824                if (r)
1825                        goto out;
1826                r = 0;
1827                break;
1828        }
1829        default:
1830                ;
1831        }
1832out:
1833        return r;
1834}
1835
1836static void kvm_init_msr_list(void)
1837{
1838        u32 dummy[2];
1839        unsigned i, j;
1840
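        /*
         * Probe each MSR on the host: entries that fault in rdmsr_safe() are
         * dropped and the remaining ones are compacted to the front of
         * msrs_to_save.
         */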
1841        for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1842                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1843                        continue;
1844                if (j < i)
1845                        msrs_to_save[j] = msrs_to_save[i];
1846                j++;
1847        }
1848        num_msrs_to_save = j;
1849}
1850
1851/*
1852 * Only the apic needs an MMIO device hook, so take a shortcut here.
1853 */
1854static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1855                                                gpa_t addr, int len,
1856                                                int is_write)
1857{
1858        struct kvm_io_device *dev;
1859
1860        if (vcpu->arch.apic) {
1861                dev = &vcpu->arch.apic->dev;
1862                if (dev->in_range(dev, addr, len, is_write))
1863                        return dev;
1864        }
1865        return NULL;
1866}
1867
1868
1869static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1870                                                gpa_t addr, int len,
1871                                                int is_write)
1872{
1873        struct kvm_io_device *dev;
1874
1875        dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1876        if (dev == NULL)
1877                dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1878                                          is_write);
1879        return dev;
1880}
1881
1882int emulator_read_std(unsigned long addr,
1883                             void *val,
1884                             unsigned int bytes,
1885                             struct kvm_vcpu *vcpu)
1886{
1887        void *data = val;
1888        int r = X86EMUL_CONTINUE;
1889
1890        while (bytes) {
1891                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1892                unsigned offset = addr & (PAGE_SIZE-1);
1893                unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1894                int ret;
1895
1896                if (gpa == UNMAPPED_GVA) {
1897                        r = X86EMUL_PROPAGATE_FAULT;
1898                        goto out;
1899                }
1900                ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1901                if (ret < 0) {
1902                        r = X86EMUL_UNHANDLEABLE;
1903                        goto out;
1904                }
1905
1906                bytes -= tocopy;
1907                data += tocopy;
1908                addr += tocopy;
1909        }
1910out:
1911        return r;
1912}
1913EXPORT_SYMBOL_GPL(emulator_read_std);
1914
1915static int emulator_read_emulated(unsigned long addr,
1916                                  void *val,
1917                                  unsigned int bytes,
1918                                  struct kvm_vcpu *vcpu)
1919{
1920        struct kvm_io_device *mmio_dev;
1921        gpa_t                 gpa;
1922
1923        if (vcpu->mmio_read_completed) {
1924                memcpy(val, vcpu->mmio_data, bytes);
1925                vcpu->mmio_read_completed = 0;
1926                return X86EMUL_CONTINUE;
1927        }
1928
1929        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1930
1931        /* For APIC access vmexit */
1932        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1933                goto mmio;
1934
1935        if (emulator_read_std(addr, val, bytes, vcpu)
1936                        == X86EMUL_CONTINUE)
1937                return X86EMUL_CONTINUE;
1938        if (gpa == UNMAPPED_GVA)
1939                return X86EMUL_PROPAGATE_FAULT;
1940
1941mmio:
1942        /*
1943         * Is this MMIO handled locally?
1944         */
1945        mutex_lock(&vcpu->kvm->lock);
1946        mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
1947        if (mmio_dev) {
1948                kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1949                mutex_unlock(&vcpu->kvm->lock);
1950                return X86EMUL_CONTINUE;
1951        }
1952        mutex_unlock(&vcpu->kvm->lock);
1953
1954        vcpu->mmio_needed = 1;
1955        vcpu->mmio_phys_addr = gpa;
1956        vcpu->mmio_size = bytes;
1957        vcpu->mmio_is_write = 0;
1958
1959        return X86EMUL_UNHANDLEABLE;
1960}
1961
1962int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1963                          const void *val, int bytes)
1964{
1965        int ret;
1966
1967        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1968        if (ret < 0)
1969                return 0;
1970        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1971        return 1;
1972}
1973
1974static int emulator_write_emulated_onepage(unsigned long addr,
1975                                           const void *val,
1976                                           unsigned int bytes,
1977                                           struct kvm_vcpu *vcpu)
1978{
1979        struct kvm_io_device *mmio_dev;
1980        gpa_t                 gpa;
1981
1982        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1983
1984        if (gpa == UNMAPPED_GVA) {
1985                kvm_inject_page_fault(vcpu, addr, 2);
1986                return X86EMUL_PROPAGATE_FAULT;
1987        }
1988
1989        /* For APIC access vmexit */
1990        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1991                goto mmio;
1992
1993        if (emulator_write_phys(vcpu, gpa, val, bytes))
1994                return X86EMUL_CONTINUE;
1995
1996mmio:
1997        /*
1998         * Is this MMIO handled locally?
1999         */
2000        mutex_lock(&vcpu->kvm->lock);
2001        mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2002        if (mmio_dev) {
2003                kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2004                mutex_unlock(&vcpu->kvm->lock);
2005                return X86EMUL_CONTINUE;
2006        }
2007        mutex_unlock(&vcpu->kvm->lock);
2008
2009        vcpu->mmio_needed = 1;
2010        vcpu->mmio_phys_addr = gpa;
2011        vcpu->mmio_size = bytes;
2012        vcpu->mmio_is_write = 1;
2013        memcpy(vcpu->mmio_data, val, bytes);
2014
2015        return X86EMUL_CONTINUE;
2016}
2017
2018int emulator_write_emulated(unsigned long addr,
2019                                   const void *val,
2020                                   unsigned int bytes,
2021                                   struct kvm_vcpu *vcpu)
2022{
2023        /* Crossing a page boundary? */
2024        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2025                int rc, now;
2026
2027                now = -addr & ~PAGE_MASK;
2028                rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2029                if (rc != X86EMUL_CONTINUE)
2030                        return rc;
2031                addr += now;
2032                val += now;
2033                bytes -= now;
2034        }
2035        return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2036}
2037EXPORT_SYMBOL_GPL(emulator_write_emulated);
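
/*
 * Worked example for the page-boundary split above (addresses are made up):
 * a 4-byte write to gva 0x1ffe with 4K pages yields now = -0x1ffe & 0xfff = 2,
 * so 2 bytes go to 0x1ffe..0x1fff and the remaining 2 bytes to 0x2000, each
 * half going through emulator_write_emulated_onepage().
 */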
2038
2039static int emulator_cmpxchg_emulated(unsigned long addr,
2040                                     const void *old,
2041                                     const void *new,
2042                                     unsigned int bytes,
2043                                     struct kvm_vcpu *vcpu)
2044{
2045        static int reported;
2046
2047        if (!reported) {
2048                reported = 1;
2049                printk(KERN_WARNING "kvm: emulating exchange as write\n");
2050        }
2051#ifndef CONFIG_X86_64
2052        /* guest cmpxchg8b has to be emulated atomically */
2053        if (bytes == 8) {
2054                gpa_t gpa;
2055                struct page *page;
2056                char *kaddr;
2057                u64 val;
2058
2059                gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2060
2061                if (gpa == UNMAPPED_GVA ||
2062                   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2063                        goto emul_write;
2064
2065                if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2066                        goto emul_write;
2067
2068                val = *(u64 *)new;
2069
2070                down_read(&current->mm->mmap_sem);
2071                page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2072                up_read(&current->mm->mmap_sem);
2073
2074                kaddr = kmap_atomic(page, KM_USER0);
2075                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2076                kunmap_atomic(kaddr, KM_USER0);
2077                kvm_release_page_dirty(page);
2078        }
2079emul_write:
2080#endif
2081
2082        return emulator_write_emulated(addr, new, bytes, vcpu);
2083}
2084
2085static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2086{
2087        return kvm_x86_ops->get_segment_base(vcpu, seg);
2088}
2089
2090int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2091{
2092        return X86EMUL_CONTINUE;
2093}
2094
2095int emulate_clts(struct kvm_vcpu *vcpu)
2096{
2097        KVMTRACE_0D(CLTS, vcpu, handler);
2098        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2099        return X86EMUL_CONTINUE;
2100}
2101
2102int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2103{
2104        struct kvm_vcpu *vcpu = ctxt->vcpu;
2105
2106        switch (dr) {
2107        case 0 ... 3:
2108                *dest = kvm_x86_ops->get_dr(vcpu, dr);
2109                return X86EMUL_CONTINUE;
2110        default:
2111                pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2112                return X86EMUL_UNHANDLEABLE;
2113        }
2114}
2115
2116int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2117{
2118        unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2119        int exception;
2120
2121        kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2122        if (exception) {
2123                /* FIXME: better handling */
2124                return X86EMUL_UNHANDLEABLE;
2125        }
2126        return X86EMUL_CONTINUE;
2127}
2128
2129void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2130{
2131        u8 opcodes[4];
2132        unsigned long rip = vcpu->arch.rip;
2133        unsigned long rip_linear;
2134
2135        if (!printk_ratelimit())
2136                return;
2137
2138        rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2139
2140        emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2141
2142        printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2143               context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2144}
2145EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2146
2147static struct x86_emulate_ops emulate_ops = {
2148        .read_std            = emulator_read_std,
2149        .read_emulated       = emulator_read_emulated,
2150        .write_emulated      = emulator_write_emulated,
2151        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2152};
2153
2154int emulate_instruction(struct kvm_vcpu *vcpu,
2155                        struct kvm_run *run,
2156                        unsigned long cr2,
2157                        u16 error_code,
2158                        int emulation_type)
2159{
2160        int r;
2161        struct decode_cache *c;
2162
2163        vcpu->arch.mmio_fault_cr2 = cr2;
2164        kvm_x86_ops->cache_regs(vcpu);
2165
2166        vcpu->mmio_is_write = 0;
2167        vcpu->arch.pio.string = 0;
2168
2169        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2170                int cs_db, cs_l;
2171                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2172
2173                vcpu->arch.emulate_ctxt.vcpu = vcpu;
2174                vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
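                /*
                 * Pick the emulator mode from the guest state: EFLAGS.VM
                 * selects real/vm86 mode; otherwise CS.L selects 64-bit,
                 * CS.DB 32-bit, and neither selects 16-bit protected mode.
                 */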
2175                vcpu->arch.emulate_ctxt.mode =
2176                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2177                        ? X86EMUL_MODE_REAL : cs_l
2178                        ? X86EMUL_MODE_PROT64 : cs_db
2179                        ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2180
2181                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2182
2183                /* Reject instructions other than VMCALL/VMMCALL when
2184                 * trying to emulate an invalid opcode */
2185                c = &vcpu->arch.emulate_ctxt.decode;
2186                if ((emulation_type & EMULTYPE_TRAP_UD) &&
2187                    (!(c->twobyte && c->b == 0x01 &&
2188                      (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2189                       c->modrm_mod == 3 && c->modrm_rm == 1)))
2190                        return EMULATE_FAIL;
2191
2192                ++vcpu->stat.insn_emulation;
2193                if (r)  {
2194                        ++vcpu->stat.insn_emulation_fail;
2195                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2196                                return EMULATE_DONE;
2197                        return EMULATE_FAIL;
2198                }
2199        }
2200
2201        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2202
2203        if (vcpu->arch.pio.string)
2204                return EMULATE_DO_MMIO;
2205
2206        if ((r || vcpu->mmio_is_write) && run) {
2207                run->exit_reason = KVM_EXIT_MMIO;
2208                run->mmio.phys_addr = vcpu->mmio_phys_addr;
2209                memcpy(run->mmio.data, vcpu->mmio_data, 8);
2210                run->mmio.len = vcpu->mmio_size;
2211                run->mmio.is_write = vcpu->mmio_is_write;
2212        }
2213
2214        if (r) {
2215                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2216                        return EMULATE_DONE;
2217                if (!vcpu->mmio_needed) {
2218                        kvm_report_emulation_failure(vcpu, "mmio");
2219                        return EMULATE_FAIL;
2220                }
2221                return EMULATE_DO_MMIO;
2222        }
2223
2224        kvm_x86_ops->decache_regs(vcpu);
2225        kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2226
2227        if (vcpu->mmio_is_write) {
2228                vcpu->mmio_needed = 0;
2229                return EMULATE_DO_MMIO;
2230        }
2231
2232        return EMULATE_DONE;
2233}
2234EXPORT_SYMBOL_GPL(emulate_instruction);
2235
2236static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2237{
2238        int i;
2239
2240        for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2241                if (vcpu->arch.pio.guest_pages[i]) {
2242                        kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2243                        vcpu->arch.pio.guest_pages[i] = NULL;
2244                }
2245}
2246
2247static int pio_copy_data(struct kvm_vcpu *vcpu)
2248{
2249        void *p = vcpu->arch.pio_data;
2250        void *q;
2251        unsigned bytes;
2252        int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
2253
2254        q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
2255                 PAGE_KERNEL);
2256        if (!q) {
2257                free_pio_guest_pages(vcpu);
2258                return -ENOMEM;
2259        }
2260        q += vcpu->arch.pio.guest_page_offset;
2261        bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
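        /*
         * For IN/INS the pio_data page filled by the kernel or by userspace
         * is copied into guest memory; for OUT/OUTS the guest's data is
         * copied out to the pio_data page.
         */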
2262        if (vcpu->arch.pio.in)
2263                memcpy(q, p, bytes);
2264        else
2265                memcpy(p, q, bytes);
2266        q -= vcpu->arch.pio.guest_page_offset;
2267        vunmap(q);
2268        free_pio_guest_pages(vcpu);
2269        return 0;
2270}
2271
2272int complete_pio(struct kvm_vcpu *vcpu)
2273{
2274        struct kvm_pio_request *io = &vcpu->arch.pio;
2275        long delta;
2276        int r;
2277
2278        kvm_x86_ops->cache_regs(vcpu);
2279
2280        if (!io->string) {
2281                if (io->in)
2282                        memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
2283                               io->size);
2284        } else {
2285                if (io->in) {
2286                        r = pio_copy_data(vcpu);
2287                        if (r) {
2288                                kvm_x86_ops->cache_regs(vcpu);
2289                                return r;
2290                        }
2291                }
2292
2293                delta = 1;
2294                if (io->rep) {
2295                        delta *= io->cur_count;
2296                        /*
2297                         * The size of the register should really depend on
2298                         * current address size.
2299                         */
2300                        vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
2301                }
2302                if (io->down)
2303                        delta = -delta;
2304                delta *= io->size;
2305                if (io->in)
2306                        vcpu->arch.regs[VCPU_REGS_RDI] += delta;
2307                else
2308                        vcpu->arch.regs[VCPU_REGS_RSI] += delta;
2309        }
2310
2311        kvm_x86_ops->decache_regs(vcpu);
2312
2313        io->count -= io->cur_count;
2314        io->cur_count = 0;
2315
2316        return 0;
2317}
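
/*
 * Worked example (made-up values): a "rep insb" with io->size == 1,
 * io->cur_count == 5 and io->down == 0 completes by adding 5 to RDI,
 * subtracting 5 from RCX and clearing cur_count; a "rep outsb" would
 * adjust RSI instead.
 */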
2318
2319static void kernel_pio(struct kvm_io_device *pio_dev,
2320                       struct kvm_vcpu *vcpu,
2321                       void *pd)
2322{
2323        /* TODO: String I/O for in-kernel device */
2324
2325        mutex_lock(&vcpu->kvm->lock);
2326        if (vcpu->arch.pio.in)
2327                kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2328                                  vcpu->arch.pio.size,
2329                                  pd);
2330        else
2331                kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2332                                   vcpu->arch.pio.size,
2333                                   pd);
2334        mutex_unlock(&vcpu->kvm->lock);
2335}
2336
2337static void pio_string_write(struct kvm_io_device *pio_dev,
2338                             struct kvm_vcpu *vcpu)
2339{
2340        struct kvm_pio_request *io = &vcpu->arch.pio;
2341        void *pd = vcpu->arch.pio_data;
2342        int i;
2343
2344        mutex_lock(&vcpu->kvm->lock);
2345        for (i = 0; i < io->cur_count; i++) {
2346                kvm_iodevice_write(pio_dev, io->port,
2347                                   io->size,
2348                                   pd);
2349                pd += io->size;
2350        }
2351        mutex_unlock(&vcpu->kvm->lock);
2352}
2353
2354static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2355                                               gpa_t addr, int len,
2356                                               int is_write)
2357{
2358        return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2359}
2360
2361int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2362                  int size, unsigned port)
2363{
2364        struct kvm_io_device *pio_dev;
2365
2366        vcpu->run->exit_reason = KVM_EXIT_IO;
2367        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2368        vcpu->run->io.size = vcpu->arch.pio.size = size;
2369        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2370        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2371        vcpu->run->io.port = vcpu->arch.pio.port = port;
2372        vcpu->arch.pio.in = in;
2373        vcpu->arch.pio.string = 0;
2374        vcpu->arch.pio.down = 0;
2375        vcpu->arch.pio.guest_page_offset = 0;
2376        vcpu->arch.pio.rep = 0;
2377
2378        if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2379                KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2380                            handler);
2381        else
2382                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2383                            handler);
2384
2385        kvm_x86_ops->cache_regs(vcpu);
2386        memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2387
2388        kvm_x86_ops->skip_emulated_instruction(vcpu);
2389
2390        pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2391        if (pio_dev) {
2392                kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2393                complete_pio(vcpu);
2394                return 1;
2395        }
2396        return 0;
2397}
2398EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2399
2400int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2401                  int size, unsigned long count, int down,
2402                  gva_t address, int rep, unsigned port)
2403{
2404        unsigned now, in_page;
2405        int i, ret = 0;
2406        int nr_pages = 1;
2407        struct page *page;
2408        struct kvm_io_device *pio_dev;
2409
2410        vcpu->run->exit_reason = KVM_EXIT_IO;
2411        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2412        vcpu->run->io.size = vcpu->arch.pio.size = size;
2413        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2414        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2415        vcpu->run->io.port = vcpu->arch.pio.port = port;
2416        vcpu->arch.pio.in = in;
2417        vcpu->arch.pio.string = 1;
2418        vcpu->arch.pio.down = down;
2419        vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2420        vcpu->arch.pio.rep = rep;
2421
2422        if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2423                KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2424                            handler);
2425        else
2426                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2427                            handler);
2428
2429        if (!count) {
2430                kvm_x86_ops->skip_emulated_instruction(vcpu);
2431                return 1;
2432        }
2433
2434        if (!down)
2435                in_page = PAGE_SIZE - offset_in_page(address);
2436        else
2437                in_page = offset_in_page(address) + size;
2438        now = min(count, (unsigned long)in_page / size);
2439        if (!now) {
2440                /*
2441                 * String I/O straddles page boundary.  Pin two guest pages
2442                 * so that we satisfy atomicity constraints.  Do just one
2443                 * transaction to avoid complexity.
2444                 */
2445                nr_pages = 2;
2446                now = 1;
2447        }
2448        if (down) {
2449                /*
2450                 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2451                 */
2452                pr_unimpl(vcpu, "guest string pio down\n");
2453                kvm_inject_gp(vcpu, 0);
2454                return 1;
2455        }
2456        vcpu->run->io.count = now;
2457        vcpu->arch.pio.cur_count = now;
2458
2459        if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2460                kvm_x86_ops->skip_emulated_instruction(vcpu);
2461
2462        for (i = 0; i < nr_pages; ++i) {
2463                page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2464                vcpu->arch.pio.guest_pages[i] = page;
2465                if (!page) {
2466                        kvm_inject_gp(vcpu, 0);
2467                        free_pio_guest_pages(vcpu);
2468                        return 1;
2469                }
2470        }
2471
2472        pio_dev = vcpu_find_pio_dev(vcpu, port,
2473                                    vcpu->arch.pio.cur_count,
2474                                    !vcpu->arch.pio.in);
2475        if (!vcpu->arch.pio.in) {
2476                /* string PIO write */
2477                ret = pio_copy_data(vcpu);
2478                if (ret >= 0 && pio_dev) {
2479                        pio_string_write(pio_dev, vcpu);
2480                        complete_pio(vcpu);
2481                        if (vcpu->arch.pio.count == 0)
2482                                ret = 1;
2483                }
2484        } else if (pio_dev)
2485                pr_unimpl(vcpu, "no string pio read support yet, "
2486                       "port %x size %d count %ld\n",
2487                        port, size, count);
2488
2489        return ret;
2490}
2491EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2492
2493int kvm_arch_init(void *opaque)
2494{
2495        int r;
2496        struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2497
2498        if (kvm_x86_ops) {
2499                printk(KERN_ERR "kvm: already loaded the other module\n");
2500                r = -EEXIST;
2501                goto out;
2502        }
2503
2504        if (!ops->cpu_has_kvm_support()) {
2505                printk(KERN_ERR "kvm: no hardware support\n");
2506                r = -EOPNOTSUPP;
2507                goto out;
2508        }
2509        if (ops->disabled_by_bios()) {
2510                printk(KERN_ERR "kvm: disabled by bios\n");
2511                r = -EOPNOTSUPP;
2512                goto out;
2513        }
2514
2515        r = kvm_mmu_module_init();
2516        if (r)
2517                goto out;
2518
2519        kvm_init_msr_list();
2520
2521        kvm_x86_ops = ops;
2522        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2523        kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2524        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2525                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
2526        return 0;
2527
2528out:
2529        return r;
2530}
2531
2532void kvm_arch_exit(void)
2533{
2534        kvm_x86_ops = NULL;
2535        kvm_mmu_module_exit();
2536}
2537
2538int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2539{
2540        ++vcpu->stat.halt_exits;
2541        KVMTRACE_0D(HLT, vcpu, handler);
2542        if (irqchip_in_kernel(vcpu->kvm)) {
2543                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2544                up_read(&vcpu->kvm->slots_lock);
2545                kvm_vcpu_block(vcpu);
2546                down_read(&vcpu->kvm->slots_lock);
2547                if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2548                        return -EINTR;
2549                return 1;
2550        } else {
2551                vcpu->run->exit_reason = KVM_EXIT_HLT;
2552                return 0;
2553        }
2554}
2555EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2556
2557static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2558                           unsigned long a1)
2559{
2560        if (is_long_mode(vcpu))
2561                return a0;
2562        else
2563                return a0 | ((gpa_t)a1 << 32);
2564}
2565
2566int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2567{
2568        unsigned long nr, a0, a1, a2, a3, ret;
2569        int r = 1;
2570
2571        kvm_x86_ops->cache_regs(vcpu);
2572
2573        nr = vcpu->arch.regs[VCPU_REGS_RAX];
2574        a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2575        a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2576        a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2577        a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2578
2579        KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2580
2581        if (!is_long_mode(vcpu)) {
2582                nr &= 0xFFFFFFFF;
2583                a0 &= 0xFFFFFFFF;
2584                a1 &= 0xFFFFFFFF;
2585                a2 &= 0xFFFFFFFF;
2586                a3 &= 0xFFFFFFFF;
2587        }
2588
2589        if (kvm_x86_ops->get_cpl(vcpu) != 0) {
2590                ret = -KVM_EPERM;
2591                goto out;
2592        }
2593
2594        switch (nr) {
2595        case KVM_HC_VAPIC_POLL_IRQ:
2596                ret = 0;
2597                break;
2598        case KVM_HC_MMU_OP:
2599                r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2600                break;
2601        default:
2602                ret = -KVM_ENOSYS;
2603                break;
2604        }
2605out:
2606        vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2607        kvm_x86_ops->decache_regs(vcpu);
2608        ++vcpu->stat.hypercalls;
2609        return r;
2610}
2611EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
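
/*
 * Guest-side view (illustrative sketch, not a replacement for the
 * kvm_hypercall*() helpers in kvm_para.h): the hypercall number goes in RAX
 * and up to four arguments in RBX, RCX, RDX and RSI, with the return value
 * coming back in RAX.  The vmcall below is the Intel/VMX encoding;
 * kvm_fix_hypercall() below patches in the native instruction (e.g. vmmcall
 * on SVM) when the guest used the wrong one:
 *
 *	static inline long example_hypercall1(unsigned int nr, unsigned long p1)
 *	{
 *		long ret;
 *
 *		asm volatile("vmcall"
 *			     : "=a"(ret)
 *			     : "a"(nr), "b"(p1)
 *			     : "memory");
 *		return ret;
 *	}
 */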
2612
2613int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2614{
2615        char instruction[3];
2616        int ret = 0;
2617
2618
2619        /*
2620         * Blow out the MMU so that no other VCPU has an active mapping,
2621         * ensuring that the updated hypercall appears atomically across
2622         * all VCPUs.
2623         */
2624        kvm_mmu_zap_all(vcpu->kvm);
2625
2626        kvm_x86_ops->cache_regs(vcpu);
2627        kvm_x86_ops->patch_hypercall(vcpu, instruction);
2628        if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2629            != X86EMUL_CONTINUE)
2630                ret = -EFAULT;
2631
2632        return ret;
2633}
2634
2635static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2636{
2637        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2638}
2639
2640void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2641{
2642        struct descriptor_table dt = { limit, base };
2643
2644        kvm_x86_ops->set_gdt(vcpu, &dt);
2645}
2646
2647void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2648{
2649        struct descriptor_table dt = { limit, base };
2650
2651        kvm_x86_ops->set_idt(vcpu, &dt);
2652}
2653
2654void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2655                   unsigned long *rflags)
2656{
2657        kvm_lmsw(vcpu, msw);
2658        *rflags = kvm_x86_ops->get_rflags(vcpu);
2659}
2660
2661unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2662{
2663        unsigned long value;
2664
2665        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2666        switch (cr) {
2667        case 0:
2668                value = vcpu->arch.cr0;
2669                break;
2670        case 2:
2671                value = vcpu->arch.cr2;
2672                break;
2673        case 3:
2674                value = vcpu->arch.cr3;
2675                break;
2676        case 4:
2677                value = vcpu->arch.cr4;
2678                break;
2679        case 8:
2680                value = kvm_get_cr8(vcpu);
2681                break;
2682        default:
2683                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2684                return 0;
2685        }
2686        KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2687                    (u32)((u64)value >> 32), handler);
2688
2689        return value;
2690}
2691
2692void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2693                     unsigned long *rflags)
2694{
2695        KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2696                    (u32)((u64)val >> 32), handler);
2697
2698        switch (cr) {
2699        case 0:
2700                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2701                *rflags = kvm_x86_ops->get_rflags(vcpu);
2702                break;
2703        case 2:
2704                vcpu->arch.cr2 = val;
2705                break;
2706        case 3:
2707                kvm_set_cr3(vcpu, val);
2708                break;
2709        case 4:
2710                kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2711                break;
2712        case 8:
2713                kvm_set_cr8(vcpu, val & 0xfUL);
2714                break;
2715        default:
2716                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2717        }
2718}
2719
2720static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2721{
2722        struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2723        int j, nent = vcpu->arch.cpuid_nent;
2724
2725        e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2726        /* when no next entry is found, the current entry[i] is reselected */
2727        for (j = i + 1; ; j = (j + 1) % nent) {
2728                struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2729                if (ej->function == e->function) {
2730                        ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2731                        return j;
2732                }
2733        }
2734        return 0; /* silence gcc, even though control never reaches here */
2735}
2736
2737/* find an entry with matching function, matching index (if needed), and that
2738 * should be read next (if it's stateful) */
2739static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2740        u32 function, u32 index)
2741{
2742        if (e->function != function)
2743                return 0;
2744        if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2745                return 0;
2746        if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2747                !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2748                return 0;
2749        return 1;
2750}
2751
2752void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2753{
2754        int i;
2755        u32 function, index;
2756        struct kvm_cpuid_entry2 *e, *best;
2757
2758        kvm_x86_ops->cache_regs(vcpu);
2759        function = vcpu->arch.regs[VCPU_REGS_RAX];
2760        index = vcpu->arch.regs[VCPU_REGS_RCX];
2761        vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2762        vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2763        vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2764        vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2765        best = NULL;
2766        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2767                e = &vcpu->arch.cpuid_entries[i];
2768                if (is_matching_cpuid_entry(e, function, index)) {
2769                        if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2770                                move_to_next_stateful_cpuid_entry(vcpu, i);
2771                        best = e;
2772                        break;
2773                }
2774                /*
2775                 * Both basic or both extended?
2776                 */
2777                if (((e->function ^ function) & 0x80000000) == 0)
2778                        if (!best || e->function > best->function)
2779                                best = e;
2780        }
2781        if (best) {
2782                vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2783                vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2784                vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2785                vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2786        }
2787        kvm_x86_ops->decache_regs(vcpu);
2788        kvm_x86_ops->skip_emulated_instruction(vcpu);
2789        KVMTRACE_5D(CPUID, vcpu, function,
2790                    (u32)vcpu->arch.regs[VCPU_REGS_RAX],
2791                    (u32)vcpu->arch.regs[VCPU_REGS_RBX],
2792                    (u32)vcpu->arch.regs[VCPU_REGS_RCX],
2793                    (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
2794}
2795EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2796
2797/*
2798 * Check if userspace requested an interrupt window, and that the
2799 * interrupt window is open.
2800 *
2801 * No need to exit to userspace if we already have an interrupt queued.
2802 */
2803static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2804                                          struct kvm_run *kvm_run)
2805{
2806        return (!vcpu->arch.irq_summary &&
2807                kvm_run->request_interrupt_window &&
2808                vcpu->arch.interrupt_window_open &&
2809                (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2810}
2811
2812static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2813                              struct kvm_run *kvm_run)
2814{
2815        kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2816        kvm_run->cr8 = kvm_get_cr8(vcpu);
2817        kvm_run->apic_base = kvm_get_apic_base(vcpu);
2818        if (irqchip_in_kernel(vcpu->kvm))
2819                kvm_run->ready_for_interrupt_injection = 1;
2820        else
2821                kvm_run->ready_for_interrupt_injection =
2822                                        (vcpu->arch.interrupt_window_open &&
2823                                         vcpu->arch.irq_summary == 0);
2824}
2825
2826static void vapic_enter(struct kvm_vcpu *vcpu)
2827{
2828        struct kvm_lapic *apic = vcpu->arch.apic;
2829        struct page *page;
2830
2831        if (!apic || !apic->vapic_addr)
2832                return;
2833
2834        down_read(&current->mm->mmap_sem);
2835        page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2836        up_read(&current->mm->mmap_sem);
2837
2838        vcpu->arch.apic->vapic_page = page;
2839}
2840
2841static void vapic_exit(struct kvm_vcpu *vcpu)
2842{
2843        struct kvm_lapic *apic = vcpu->arch.apic;
2844
2845        if (!apic || !apic->vapic_addr)
2846                return;
2847
2848        down_read(&vcpu->kvm->slots_lock);
2849        kvm_release_page_dirty(apic->vapic_page);
2850        mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2851        up_read(&vcpu->kvm->slots_lock);
2852}
2853
2854static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2855{
2856        int r;
2857
2858        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2859                pr_debug("vcpu %d received sipi with vector # %x\n",
2860                       vcpu->vcpu_id, vcpu->arch.sipi_vector);
2861                kvm_lapic_reset(vcpu);
2862                r = kvm_x86_ops->vcpu_reset(vcpu);
2863                if (r)
2864                        return r;
2865                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2866        }
2867
2868        down_read(&vcpu->kvm->slots_lock);
2869        vapic_enter(vcpu);
2870
2871again:
2872        if (vcpu->requests)
2873                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2874                        kvm_mmu_unload(vcpu);
2875
2876        r = kvm_mmu_reload(vcpu);
2877        if (unlikely(r))
2878                goto out;
2879
2880        if (vcpu->requests) {
2881                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2882                        __kvm_migrate_timers(vcpu);
2883                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2884                        kvm_x86_ops->tlb_flush(vcpu);
2885                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2886                                       &vcpu->requests)) {
2887                        kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2888                        r = 0;
2889                        goto out;
2890                }
2891                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
2892                        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2893                        r = 0;
2894                        goto out;
2895                }
2896        }
2897
2898        clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2899        kvm_inject_pending_timer_irqs(vcpu);
2900
2901        preempt_disable();
2902
2903        kvm_x86_ops->prepare_guest_switch(vcpu);
2904        kvm_load_guest_fpu(vcpu);
2905
2906        local_irq_disable();
2907
2908        if (vcpu->requests || need_resched()) {
2909                local_irq_enable();
2910                preempt_enable();
2911                r = 1;
2912                goto out;
2913        }
2914
2915        if (signal_pending(current)) {
2916                local_irq_enable();
2917                preempt_enable();
2918                r = -EINTR;
2919                kvm_run->exit_reason = KVM_EXIT_INTR;
2920                ++vcpu->stat.signal_exits;
2921                goto out;
2922        }
2923
2924        if (vcpu->guest_debug.enabled)
2925                kvm_x86_ops->guest_debug_pre(vcpu);
2926
2927        vcpu->guest_mode = 1;
2928        /*
2929         * Make sure that guest_mode assignment won't happen after
2930         * testing the pending IRQ vector bitmap.
2931         */
2932        smp_wmb();
2933
2934        if (vcpu->arch.exception.pending)
2935                __queue_exception(vcpu);
2936        else if (irqchip_in_kernel(vcpu->kvm))
2937                kvm_x86_ops->inject_pending_irq(vcpu);
2938        else
2939                kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2940
2941        kvm_lapic_sync_to_vapic(vcpu);
2942
2943        up_read(&vcpu->kvm->slots_lock);
2944
2945        kvm_guest_enter();
2946
2947
2948        KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2949        kvm_x86_ops->run(vcpu, kvm_run);
2950
2951        vcpu->guest_mode = 0;
2952        local_irq_enable();
2953
2954        ++vcpu->stat.exits;
2955
2956        /*
2957         * We must have an instruction between local_irq_enable() and
2958         * kvm_guest_exit(), so the timer interrupt isn't delayed by
2959         * the interrupt shadow.  The stat.exits increment will do nicely.
2960         * But we need to prevent reordering, hence this barrier():
2961         */
2962        barrier();
2963
2964        kvm_guest_exit();
2965
2966        preempt_enable();
2967
2968        down_read(&vcpu->kvm->slots_lock);
2969
2970        /*
2971         * Profile KVM exit RIPs:
2972         */
2973        if (unlikely(prof_on == KVM_PROFILING)) {
2974                kvm_x86_ops->cache_regs(vcpu);
2975                profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2976        }
2977
2978        if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2979                vcpu->arch.exception.pending = false;
2980
2981        kvm_lapic_sync_from_vapic(vcpu);
2982
2983        r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2984
2985        if (r > 0) {
2986                if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2987                        r = -EINTR;
2988                        kvm_run->exit_reason = KVM_EXIT_INTR;
2989                        ++vcpu->stat.request_irq_exits;
2990                        goto out;
2991                }
2992                if (!need_resched())
2993                        goto again;
2994        }
2995
2996out:
2997        up_read(&vcpu->kvm->slots_lock);
2998        if (r > 0) {
2999                kvm_resched(vcpu);
3000                down_read(&vcpu->kvm->slots_lock);
3001                goto again;
3002        }
3003
3004        post_kvm_run_save(vcpu, kvm_run);
3005
3006        vapic_exit(vcpu);
3007
3008        return r;
3009}
3010
3011int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012{
3013        int r;
3014        sigset_t sigsaved;
3015
3016        vcpu_load(vcpu);
3017
3018        if (vcpu->sigset_active)
3019                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3020
3021        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3022                kvm_vcpu_block(vcpu);
3023                r = -EAGAIN;
3024                goto out;
3025        }
3026
3027        /* re-sync apic's tpr */
3028        if (!irqchip_in_kernel(vcpu->kvm))
3029                kvm_set_cr8(vcpu, kvm_run->cr8);
3030
3031        if (vcpu->arch.pio.cur_count) {
3032                r = complete_pio(vcpu);
3033                if (r)
3034                        goto out;
3035        }
3036#ifdef CONFIG_HAS_IOMEM
3037        if (vcpu->mmio_needed) {
3038                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3039                vcpu->mmio_read_completed = 1;
3040                vcpu->mmio_needed = 0;
3041
3042                down_read(&vcpu->kvm->slots_lock);
3043                r = emulate_instruction(vcpu, kvm_run,
3044                                        vcpu->arch.mmio_fault_cr2, 0,
3045                                        EMULTYPE_NO_DECODE);
3046                up_read(&vcpu->kvm->slots_lock);
3047                if (r == EMULATE_DO_MMIO) {
3048                        /*
3049                         * Read-modify-write.  Back to userspace.
3050                         */
3051                        r = 0;
3052                        goto out;
3053                }
3054        }
3055#endif
3056        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
3057                kvm_x86_ops->cache_regs(vcpu);
3058                vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
3059                kvm_x86_ops->decache_regs(vcpu);
3060        }
3061
3062        r = __vcpu_run(vcpu, kvm_run);
3063
3064out:
3065        if (vcpu->sigset_active)
3066                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3067
3068        vcpu_put(vcpu);
3069        return r;
3070}
3071
3072int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3073{
3074        vcpu_load(vcpu);
3075
3076        kvm_x86_ops->cache_regs(vcpu);
3077
3078        regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
3079        regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
3080        regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
3081        regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
3082        regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
3083        regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
3084        regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3085        regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
3086#ifdef CONFIG_X86_64
3087        regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
3088        regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
3089        regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
3090        regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
3091        regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
3092        regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
3093        regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
3094        regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
3095#endif
3096
3097        regs->rip = vcpu->arch.rip;
3098        regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3099
3100        /*
3101         * Don't leak debug flags in case they were set for guest debugging
3102         */
3103        if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
3104                regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3105
3106        vcpu_put(vcpu);
3107
3108        return 0;
3109}
3110
3111int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3112{
3113        vcpu_load(vcpu);
3114
3115        vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
3116        vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
3117        vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
3118        vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
3119        vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
3120        vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
3121        vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
3122        vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
3123#ifdef CONFIG_X86_64
3124        vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
3125        vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
3126        vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
3127        vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
3128        vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
3129        vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
3130        vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
3131        vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
3132#endif
3133
3134        vcpu->arch.rip = regs->rip;
3135        kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3136
3137        kvm_x86_ops->decache_regs(vcpu);
3138
3139        vcpu->arch.exception.pending = false;
3140
3141        vcpu_put(vcpu);
3142
3143        return 0;
3144}
3145
3146void kvm_get_segment(struct kvm_vcpu *vcpu,
3147                     struct kvm_segment *var, int seg)
3148{
3149        kvm_x86_ops->get_segment(vcpu, var, seg);
3150}
3151
3152void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3153{
3154        struct kvm_segment cs;
3155
3156        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3157        *db = cs.db;
3158        *l = cs.l;
3159}
3160EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3161
3162int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3163                                  struct kvm_sregs *sregs)
3164{
3165        struct descriptor_table dt;
3166        int pending_vec;
3167
3168        vcpu_load(vcpu);
3169
3170        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3171        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3172        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3173        kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3174        kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3175        kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3176
3177        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3178        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3179
3180        kvm_x86_ops->get_idt(vcpu, &dt);
3181        sregs->idt.limit = dt.limit;
3182        sregs->idt.base = dt.base;
3183        kvm_x86_ops->get_gdt(vcpu, &dt);
3184        sregs->gdt.limit = dt.limit;
3185        sregs->gdt.base = dt.base;
3186
3187        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3188        sregs->cr0 = vcpu->arch.cr0;
3189        sregs->cr2 = vcpu->arch.cr2;
3190        sregs->cr3 = vcpu->arch.cr3;
3191        sregs->cr4 = vcpu->arch.cr4;
3192        sregs->cr8 = kvm_get_cr8(vcpu);
3193        sregs->efer = vcpu->arch.shadow_efer;
3194        sregs->apic_base = kvm_get_apic_base(vcpu);
3195
3196        if (irqchip_in_kernel(vcpu->kvm)) {
3197                memset(sregs->interrupt_bitmap, 0,
3198                       sizeof sregs->interrupt_bitmap);
3199                pending_vec = kvm_x86_ops->get_irq(vcpu);
3200                if (pending_vec >= 0)
3201                        set_bit(pending_vec,
3202                                (unsigned long *)sregs->interrupt_bitmap);
3203        } else
3204                memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3205                       sizeof sregs->interrupt_bitmap);
3206
3207        vcpu_put(vcpu);
3208
3209        return 0;
3210}
3211
3212int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3213                                    struct kvm_mp_state *mp_state)
3214{
3215        vcpu_load(vcpu);
3216        mp_state->mp_state = vcpu->arch.mp_state;
3217        vcpu_put(vcpu);
3218        return 0;
3219}
3220
3221int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3222                                    struct kvm_mp_state *mp_state)
3223{
3224        vcpu_load(vcpu);
3225        vcpu->arch.mp_state = mp_state->mp_state;
3226        vcpu_put(vcpu);
3227        return 0;
3228}
3229
3230static void kvm_set_segment(struct kvm_vcpu *vcpu,
3231                        struct kvm_segment *var, int seg)
3232{
3233        kvm_x86_ops->set_segment(vcpu, var, seg);
3234}
3235
3236static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3237                                   struct kvm_segment *kvm_desct)
3238{
3239        kvm_desct->base = seg_desc->base0;
3240        kvm_desct->base |= seg_desc->base1 << 16;
3241        kvm_desct->base |= seg_desc->base2 << 24;
3242        kvm_desct->limit = seg_desc->limit0;
3243        kvm_desct->limit |= seg_desc->limit << 16;
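        /* with the granularity bit set, the limit is in units of 4K pages */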
3244        if (seg_desc->g) {
3245                kvm_desct->limit <<= 12;
3246                kvm_desct->limit |= 0xfff;
3247        }
3248        kvm_desct->selector = selector;
3249        kvm_desct->type = seg_desc->type;
3250        kvm_desct->present = seg_desc->p;
3251        kvm_desct->dpl = seg_desc->dpl;
3252        kvm_desct->db = seg_desc->d;
3253        kvm_desct->s = seg_desc->s;
3254        kvm_desct->l = seg_desc->l;
3255        kvm_desct->g = seg_desc->g;
3256        kvm_desct->avl = seg_desc->avl;
3257        if (!selector)
3258                kvm_desct->unusable = 1;
3259        else
3260                kvm_desct->unusable = 0;
3261        kvm_desct->padding = 0;
3262}
3263
3264static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3265                                           u16 selector,
3266                                           struct descriptor_table *dtable)
3267{
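        /* bit 2 of the selector is the table indicator: 1 = LDT, 0 = GDT */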
3268        if (selector & 1 << 2) {
3269                struct kvm_segment kvm_seg;
3270
3271                kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3272
3273                if (kvm_seg.unusable)
3274                        dtable->limit = 0;
3275                else
3276                        dtable->limit = kvm_seg.limit;
3277                dtable->base = kvm_seg.base;
3278        }
3279        else
3280                kvm_x86_ops->get_gdt(vcpu, dtable);
3281}
3282
3283/* allowed just for 8 byte segments */
3284static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3285                                         struct desc_struct *seg_desc)
3286{
3287        gpa_t gpa;
3288        struct descriptor_table dtable;
3289        u16 index = selector >> 3;
3290
3291        get_segment_descriptor_dtable(vcpu, selector, &dtable);
3292
3293        if (dtable.limit < index * 8 + 7) {
3294                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3295                return 1;
3296        }
3297        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3298        gpa += index * 8;
3299        return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3300}
3301
3302/* allowed just for 8 byte segments */
3303static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3304                                         struct desc_struct *seg_desc)
3305{
3306        gpa_t gpa;
3307        struct descriptor_table dtable;
3308        u16 index = selector >> 3;
3309
3310        get_segment_descriptor_dtable(vcpu, selector, &dtable);
3311
3312        if (dtable.limit < index * 8 + 7)
3313                return 1;
3314        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3315        gpa += index * 8;
3316        return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3317}
3318
3319static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3320                             struct desc_struct *seg_desc)
3321{
3322        u32 base_addr;
3323
3324        base_addr = seg_desc->base0;
3325        base_addr |= (seg_desc->base1 << 16);
3326        base_addr |= (seg_desc->base2 << 24);
3327
3328        return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3329}
3330
3331static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3332{
3333        struct kvm_segment kvm_seg;
3334
3335        kvm_get_segment(vcpu, &kvm_seg, seg);
3336        return kvm_seg.selector;
3337}
3338
3339static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3340                                                u16 selector,
3341                                                struct kvm_segment *kvm_seg)
3342{
3343        struct desc_struct seg_desc;
3344
3345        if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3346                return 1;
3347        seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3348        return 0;
3349}
3350
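/*
 * In real mode a segment register simply holds a paragraph number: the base
 * is selector << 4, the limit is 64K, and the segment is treated as a
 * present, writable data segment at DPL 3.
 */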
3351int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3352{
3353        struct kvm_segment segvar = {
3354                .base = selector << 4,
3355                .limit = 0xffff,
3356                .selector = selector,
3357                .type = 3,
3358                .present = 1,
3359                .dpl = 3,
3360                .db = 0,
3361                .s = 1,
3362                .l = 0,
3363                .g = 0,
3364                .avl = 0,
3365                .unusable = 0,
3366        };
3367        kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3368        return 0;
3369}
3370
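/*
 * type_bits is OR'ed into the descriptor type read from the guest table;
 * the TSS loaders below pass 1 for data/stack segments and 9 for CS, which
 * forces the accessed (and, for CS, code) type bits on.
 */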
3371int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3372                                int type_bits, int seg)
3373{
3374        struct kvm_segment kvm_seg;
3375
3376        if (!(vcpu->arch.cr0 & X86_CR0_PE))
3377                return kvm_load_realmode_segment(vcpu, selector, seg);
3378        if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3379                return 1;
3380        kvm_seg.type |= type_bits;
3381
3382        if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3383            seg != VCPU_SREG_LDTR)
3384                if (!kvm_seg.s)
3385                        kvm_seg.unusable = 1;
3386
3387        kvm_set_segment(vcpu, &kvm_seg, seg);
3388        return 0;
3389}
3390
3391static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3392                                struct tss_segment_32 *tss)
3393{
3394        tss->cr3 = vcpu->arch.cr3;
3395        tss->eip = vcpu->arch.rip;
3396        tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3397        tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
3398        tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3399        tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
3400        tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
3401        tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
3402        tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
3403        tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
3404        tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
3405
3406        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3407        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3408        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3409        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3410        tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3411        tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3412        tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3413        tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3414}
3415
3416static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3417                                  struct tss_segment_32 *tss)
3418{
3419        kvm_set_cr3(vcpu, tss->cr3);
3420
3421        vcpu->arch.rip = tss->eip;
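        /* bit 1 of EFLAGS is reserved and always set, hence the "| 2" */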
3422        kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3423
3424        vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
3425        vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
3426        vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
3427        vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
3428        vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
3429        vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
3430        vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3431        vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3432
3433        if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3434                return 1;
3435
3436        if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3437                return 1;
3438
3439        if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3440                return 1;
3441
3442        if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3443                return 1;
3444
3445        if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3446                return 1;
3447
3448        if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3449                return 1;
3450
3451        if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3452                return 1;
3453        return 0;
3454}
3455
3456static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3457                                struct tss_segment_16 *tss)
3458{
3459        tss->ip = vcpu->arch.rip;
3460        tss->flag = kvm_x86_ops->get_rflags(vcpu);
3461        tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
3462        tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
3463        tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
3464        tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
3465        tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
3466        tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
3467        tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
3468        tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
3469
3470        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3471        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3472        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3473        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3474        tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3475        tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3476}
3477
3478static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3479                                 struct tss_segment_16 *tss)
3480{
3481        vcpu->arch.rip = tss->ip;
3482        kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3483        vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
3484        vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
3485        vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
3486        vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
3487        vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
3488        vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
3489        vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3490        vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3491
3492        if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3493                return 1;
3494
3495        if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3496                return 1;
3497
3498        if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3499                return 1;
3500
3501        if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3502                return 1;
3503
3504        if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3505                return 1;
3506        return 0;
3507}
3508
3509static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3510                       u32 old_tss_base,
3511                       struct desc_struct *nseg_desc)
3512{
3513        struct tss_segment_16 tss_segment_16;
3514        int ret = 0;
3515
3516        if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3517                           sizeof tss_segment_16))
3518                goto out;
3519
3520        save_state_to_tss16(vcpu, &tss_segment_16);
3521
3522        if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3523                            sizeof tss_segment_16))
3524                goto out;
3525
3526        if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3527                           &tss_segment_16, sizeof tss_segment_16))
3528                goto out;
3529
3530        if (load_state_from_tss16(vcpu, &tss_segment_16))
3531                goto out;
3532
3533        ret = 1;
3534out:
3535        return ret;
3536}
3537
3538static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3539                       u32 old_tss_base,
3540                       struct desc_struct *nseg_desc)
3541{
3542        struct tss_segment_32 tss_segment_32;
3543        int ret = 0;
3544
3545        if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3546                           sizeof tss_segment_32))
3547                goto out;
3548
3549        save_state_to_tss32(vcpu, &tss_segment_32);
3550
3551        if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3552                            sizeof tss_segment_32))
3553                goto out;
3554
3555        if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3556                           &tss_segment_32, sizeof tss_segment_32))
3557                goto out;
3558
3559        if (load_state_from_tss32(vcpu, &tss_segment_32))
3560                goto out;
3561
3562        ret = 1;
3563out:
3564        return ret;
3565}
3566
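/*
 * Emulate a hardware task switch: validate the new TSS descriptor, save the
 * current register state into the outgoing TSS, load state from the incoming
 * TSS, maintain the busy and NT flags as the hardware would, set CR0.TS and
 * finally load TR with the new selector.
 */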
3567int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3568{
3569        struct kvm_segment tr_seg;
3570        struct desc_struct cseg_desc;
3571        struct desc_struct nseg_desc;
3572        int ret = 0;
3573        u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3574        u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3575
3576        old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3577
3578        /* FIXME: Handle errors. Failure to read either TSS or their
3579         * descriptors should generate a pagefault.
3580         */
3581        if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3582                goto out;
3583
3584        if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3585                goto out;
3586
3587        if (reason != TASK_SWITCH_IRET) {
3588                int cpl;
3589
3590                cpl = kvm_x86_ops->get_cpl(vcpu);
3591                if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3592                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3593                        return 1;
3594                }
3595        }
3596
3597        if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3598                kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3599                return 1;
3600        }
3601
3602        if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3603                cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
3604                save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3605        }
3606
3607        if (reason == TASK_SWITCH_IRET) {
3608                u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3609                kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3610        }
3611
3612        kvm_x86_ops->skip_emulated_instruction(vcpu);
3613        kvm_x86_ops->cache_regs(vcpu);
3614
3615        if (nseg_desc.type & 8)
3616                ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3617                                         &nseg_desc);
3618        else
3619                ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3620                                         &nseg_desc);
3621
3622        if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3623                u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3624                kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3625        }
3626
3627        if (reason != TASK_SWITCH_IRET) {
3628                nseg_desc.type |= (1 << 1);
3629                save_guest_segment_descriptor(vcpu, tss_selector,
3630                                              &nseg_desc);
3631        }
3632
3633        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3634        seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3635        tr_seg.type = 11;
3636        kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3637out:
3638        kvm_x86_ops->decache_regs(vcpu);
3639        return ret;
3640}
3641EXPORT_SYMBOL_GPL(kvm_task_switch);
3642
3643int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3644                                  struct kvm_sregs *sregs)
3645{
3646        int mmu_reset_needed = 0;
3647        int i, pending_vec, max_bits;
3648        struct descriptor_table dt;
3649
3650        vcpu_load(vcpu);
3651
3652        dt.limit = sregs->idt.limit;
3653        dt.base = sregs->idt.base;
3654        kvm_x86_ops->set_idt(vcpu, &dt);
3655        dt.limit = sregs->gdt.limit;
3656        dt.base = sregs->gdt.base;
3657        kvm_x86_ops->set_gdt(vcpu, &dt);
3658
3659        vcpu->arch.cr2 = sregs->cr2;
3660        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3661
3662        down_read(&vcpu->kvm->slots_lock);
3663        if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
3664                vcpu->arch.cr3 = sregs->cr3;
3665        else
3666                set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
3667        up_read(&vcpu->kvm->slots_lock);
3668
3669        kvm_set_cr8(vcpu, sregs->cr8);
3670
3671        mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
3672        kvm_x86_ops->set_efer(vcpu, sregs->efer);
3673        kvm_set_apic_base(vcpu, sregs->apic_base);
3674
3675        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3676
3677        mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
3678        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
3679        vcpu->arch.cr0 = sregs->cr0;
3680
3681        mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
3682        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
3683        if (!is_long_mode(vcpu) && is_pae(vcpu))
3684                load_pdptrs(vcpu, vcpu->arch.cr3);
3685
3686        if (mmu_reset_needed)
3687                kvm_mmu_reset_context(vcpu);
3688
3689        if (!irqchip_in_kernel(vcpu->kvm)) {
3690                memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
3691                       sizeof vcpu->arch.irq_pending);
3692                vcpu->arch.irq_summary = 0;
3693                for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
3694                        if (vcpu->arch.irq_pending[i])
3695                                __set_bit(i, &vcpu->arch.irq_summary);
3696        } else {
3697                max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3698                pending_vec = find_first_bit(
3699                        (const unsigned long *)sregs->interrupt_bitmap,
3700                        max_bits);
3701                /* Only a pending external irq is handled here */
3702                if (pending_vec < max_bits) {
3703                        kvm_x86_ops->set_irq(vcpu, pending_vec);
3704                        pr_debug("Set back pending irq %d\n",
3705                                 pending_vec);
3706                }
3707        }
3708
3709        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3710        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3711        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3712        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3713        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3714        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3715
3716        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3717        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3718
3719        vcpu_put(vcpu);
3720
3721        return 0;
3722}
3723
3724int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
3725                                    struct kvm_debug_guest *dbg)
3726{
3727        int r;
3728
3729        vcpu_load(vcpu);
3730
3731        r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3732
3733        vcpu_put(vcpu);
3734
3735        return r;
3736}
3737
3738/*
3739 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
3740 * we have asm/x86/processor.h
3741 */
3742struct fxsave {
3743        u16     cwd;
3744        u16     swd;
3745        u16     twd;
3746        u16     fop;
3747        u64     rip;
3748        u64     rdp;
3749        u32     mxcsr;
3750        u32     mxcsr_mask;
3751        u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
3752#ifdef CONFIG_X86_64
3753        u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
3754#else
3755        u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
3756#endif
3757};
3758
3759/*
3760 * Translate a guest virtual address to a guest physical address.
3761 */
3762int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3763                                    struct kvm_translation *tr)
3764{
3765        unsigned long vaddr = tr->linear_address;
3766        gpa_t gpa;
3767
3768        vcpu_load(vcpu);
3769        down_read(&vcpu->kvm->slots_lock);
3770        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
3771        up_read(&vcpu->kvm->slots_lock);
3772        tr->physical_address = gpa;
3773        tr->valid = gpa != UNMAPPED_GVA;
3774        tr->writeable = 1;
3775        tr->usermode = 0;
3776        vcpu_put(vcpu);
3777
3778        return 0;
3779}
3780
3781int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3782{
3783        struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3784
3785        vcpu_load(vcpu);
3786
3787        memcpy(fpu->fpr, fxsave->st_space, 128);
3788        fpu->fcw = fxsave->cwd;
3789        fpu->fsw = fxsave->swd;
3790        fpu->ftwx = fxsave->twd;
3791        fpu->last_opcode = fxsave->fop;
3792        fpu->last_ip = fxsave->rip;
3793        fpu->last_dp = fxsave->rdp;
3794        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
3795
3796        vcpu_put(vcpu);
3797
3798        return 0;
3799}
3800
3801int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3802{
3803        struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3804
3805        vcpu_load(vcpu);
3806
3807        memcpy(fxsave->st_space, fpu->fpr, 128);
3808        fxsave->cwd = fpu->fcw;
3809        fxsave->swd = fpu->fsw;
3810        fxsave->twd = fpu->ftwx;
3811        fxsave->fop = fpu->last_opcode;
3812        fxsave->rip = fpu->last_ip;
3813        fxsave->rdp = fpu->last_dp;
3814        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3815
3816        vcpu_put(vcpu);
3817
3818        return 0;
3819}
3820
3821void fx_init(struct kvm_vcpu *vcpu)
3822{
3823        unsigned after_mxcsr_mask;
3824
3825        /*
3826         * Touch the fpu the first time in a non-atomic context: if
3827         * this is the first fpu instruction, the exception handler
3828         * will fire before the instruction returns and it will have
3829         * to allocate ram with GFP_KERNEL.
3830         */
3831        if (!used_math())
3832                kvm_fx_save(&vcpu->arch.host_fx_image);
3833
3834        /* Initialize guest FPU by resetting ours and saving into guest's */
3835        preempt_disable();
3836        kvm_fx_save(&vcpu->arch.host_fx_image);
3837        kvm_fx_finit();
3838        kvm_fx_save(&vcpu->arch.guest_fx_image);
3839        kvm_fx_restore(&vcpu->arch.host_fx_image);
3840        preempt_enable();
3841
3842        vcpu->arch.cr0 |= X86_CR0_ET;
3843        after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
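        /* 0x1f80 is the power-on/reset value of MXCSR (all exceptions masked) */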
3844        vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3845        memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3846               0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3847}
3848EXPORT_SYMBOL_GPL(fx_init);
3849
3850void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3851{
3852        if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3853                return;
3854
3855        vcpu->guest_fpu_loaded = 1;
3856        kvm_fx_save(&vcpu->arch.host_fx_image);
3857        kvm_fx_restore(&vcpu->arch.guest_fx_image);
3858}
3859EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3860
3861void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3862{
3863        if (!vcpu->guest_fpu_loaded)
3864                return;
3865
3866        vcpu->guest_fpu_loaded = 0;
3867        kvm_fx_save(&vcpu->arch.guest_fx_image);
3868        kvm_fx_restore(&vcpu->arch.host_fx_image);
3869        ++vcpu->stat.fpu_reload;
3870}
3871EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3872
3873void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3874{
3875        kvm_x86_ops->vcpu_free(vcpu);
3876}
3877
3878struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3879                                                unsigned int id)
3880{
3881        return kvm_x86_ops->vcpu_create(kvm, id);
3882}
3883
3884int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3885{
3886        int r;
3887
3888        /* We do fxsave: this must be aligned. */
3889        BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3890
3891        vcpu_load(vcpu);
3892        r = kvm_arch_vcpu_reset(vcpu);
3893        if (r == 0)
3894                r = kvm_mmu_setup(vcpu);
3895        vcpu_put(vcpu);
3896        if (r < 0)
3897                goto free_vcpu;
3898
3899        return 0;
3900free_vcpu:
3901        kvm_x86_ops->vcpu_free(vcpu);
3902        return r;
3903}
3904
3905void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3906{
3907        vcpu_load(vcpu);
3908        kvm_mmu_unload(vcpu);
3909        vcpu_put(vcpu);
3910
3911        kvm_x86_ops->vcpu_free(vcpu);
3912}
3913
3914int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3915{
3916        return kvm_x86_ops->vcpu_reset(vcpu);
3917}
3918
3919void kvm_arch_hardware_enable(void *garbage)
3920{
3921        kvm_x86_ops->hardware_enable(garbage);
3922}
3923
3924void kvm_arch_hardware_disable(void *garbage)
3925{
3926        kvm_x86_ops->hardware_disable(garbage);
3927}
3928
3929int kvm_arch_hardware_setup(void)
3930{
3931        return kvm_x86_ops->hardware_setup();
3932}
3933
3934void kvm_arch_hardware_unsetup(void)
3935{
3936        kvm_x86_ops->hardware_unsetup();
3937}
3938
3939void kvm_arch_check_processor_compat(void *rtn)
3940{
3941        kvm_x86_ops->check_processor_compatibility(rtn);
3942}
3943
3944int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3945{
3946        struct page *page;
3947        struct kvm *kvm;
3948        int r;
3949
3950        BUG_ON(vcpu->kvm == NULL);
3951        kvm = vcpu->kvm;
3952
3953        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
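        /*
         * With a userspace irqchip every vcpu starts runnable; with the
         * in-kernel irqchip only the BSP does, the others wait for INIT/SIPI.
         */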
3954        if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3955                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3956        else
3957                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
3958
3959        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3960        if (!page) {
3961                r = -ENOMEM;
3962                goto fail;
3963        }
3964        vcpu->arch.pio_data = page_address(page);
3965
3966        r = kvm_mmu_create(vcpu);
3967        if (r < 0)
3968                goto fail_free_pio_data;
3969
3970        if (irqchip_in_kernel(kvm)) {
3971                r = kvm_create_lapic(vcpu);
3972                if (r < 0)
3973                        goto fail_mmu_destroy;
3974        }
3975
3976        return 0;
3977
3978fail_mmu_destroy:
3979        kvm_mmu_destroy(vcpu);
3980fail_free_pio_data:
3981        free_page((unsigned long)vcpu->arch.pio_data);
3982fail:
3983        return r;
3984}
3985
3986void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3987{
3988        kvm_free_lapic(vcpu);
3989        down_read(&vcpu->kvm->slots_lock);
3990        kvm_mmu_destroy(vcpu);
3991        up_read(&vcpu->kvm->slots_lock);
3992        free_page((unsigned long)vcpu->arch.pio_data);
3993}
3994
3995struct  kvm *kvm_arch_create_vm(void)
3996{
3997        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3998
3999        if (!kvm)
4000                return ERR_PTR(-ENOMEM);
4001
4002        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4003
4004        return kvm;
4005}
4006
4007static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4008{
4009        vcpu_load(vcpu);
4010        kvm_mmu_unload(vcpu);
4011        vcpu_put(vcpu);
4012}
4013
4014static void kvm_free_vcpus(struct kvm *kvm)
4015{
4016        unsigned int i;
4017
4018        /*
4019         * Unpin any mmu pages first.
4020         */
4021        for (i = 0; i < KVM_MAX_VCPUS; ++i)
4022                if (kvm->vcpus[i])
4023                        kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4024        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4025                if (kvm->vcpus[i]) {
4026                        kvm_arch_vcpu_free(kvm->vcpus[i]);
4027                        kvm->vcpus[i] = NULL;
4028                }
4029        }
4030
4031}
4032
4033void kvm_arch_destroy_vm(struct kvm *kvm)
4034{
4035        kvm_free_pit(kvm);
4036        kfree(kvm->arch.vpic);
4037        kfree(kvm->arch.vioapic);
4038        kvm_free_vcpus(kvm);
4039        kvm_free_physmem(kvm);
4040        if (kvm->arch.apic_access_page)
4041                put_page(kvm->arch.apic_access_page);
4042        if (kvm->arch.ept_identity_pagetable)
4043                put_page(kvm->arch.ept_identity_pagetable);
4044        kfree(kvm);
4045}
4046
4047int kvm_arch_set_memory_region(struct kvm *kvm,
4048                                struct kvm_userspace_memory_region *mem,
4049                                struct kvm_memory_slot old,
4050                                int user_alloc)
4051{
4052        int npages = mem->memory_size >> PAGE_SHIFT;
4053        struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4054
4055        /* To keep backward compatibility with older userspace,
4056         * x86 needs to handle the !user_alloc case.
4057         */
4058        if (!user_alloc) {
4059                if (npages && !old.rmap) {
4060                        unsigned long userspace_addr;
4061
4062                        down_write(&current->mm->mmap_sem);
4063                        userspace_addr = do_mmap(NULL, 0,
4064                                                 npages * PAGE_SIZE,
4065                                                 PROT_READ | PROT_WRITE,
4066                                                 MAP_PRIVATE | MAP_ANONYMOUS,
4067                                                 0);
4068                        up_write(&current->mm->mmap_sem);
4069
4070                        if (IS_ERR((void *)userspace_addr))
4071                                return PTR_ERR((void *)userspace_addr);
4072
4073                        /* set userspace_addr atomically for kvm_hva_to_rmapp */
4074                        spin_lock(&kvm->mmu_lock);
4075                        memslot->userspace_addr = userspace_addr;
4076                        spin_unlock(&kvm->mmu_lock);
4077                } else {
4078                        if (!old.user_alloc && old.rmap) {
4079                                int ret;
4080
4081                                down_write(&current->mm->mmap_sem);
4082                                ret = do_munmap(current->mm, old.userspace_addr,
4083                                                old.npages * PAGE_SIZE);
4084                                up_write(&current->mm->mmap_sem);
4085                                if (ret < 0)
4086                                        printk(KERN_WARNING
4087                                       "kvm_vm_ioctl_set_memory_region: "
4088                                       "failed to munmap memory\n");
4089                        }
4090                }
4091        }
4092
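        /*
         * Unless userspace pinned the shadow MMU page count, rescale it to
         * match the new memory configuration.
         */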
4093        spin_lock(&kvm->mmu_lock);
4094        if (!kvm->arch.n_requested_mmu_pages) {
4095                unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4096                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4097        }
4098
4099        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4100        spin_unlock(&kvm->mmu_lock);
4101        kvm_flush_remote_tlbs(kvm);
4102
4103        return 0;
4104}
4105
4106void kvm_arch_flush_shadow(struct kvm *kvm)
4107{
4108        kvm_mmu_zap_all(kvm);
4109        kvm_reload_remote_mmus(kvm);
4110}
4111
4112int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4113{
4114        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4115               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
4116}
4117
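/*
 * The IPI handler itself does nothing; the interrupt is only sent to force
 * the target cpu out of guest mode.
 */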
4118static void vcpu_kick_intr(void *info)
4119{
4120#ifdef DEBUG
4121        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
4122        printk(KERN_DEBUG "vcpu_kick_intr %p\n", vcpu);
4123#endif
4124}
4125
4126void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4127{
4128        int ipi_pcpu = vcpu->cpu;
4129        int cpu = get_cpu();
4130
4131        if (waitqueue_active(&vcpu->wq)) {
4132                wake_up_interruptible(&vcpu->wq);
4133                ++vcpu->stat.halt_wakeup;
4134        }
4135        /*
4136         * We may be called synchronously with irqs disabled in guest mode,
4137         * so there is no need to call smp_call_function_single() in that case.
4138         */
4139        if (vcpu->guest_mode && vcpu->cpu != cpu)
4140                smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4141        put_cpu();
4142}
4143