/* linux/arch/x86/kvm/x86.c */
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * derived from drivers/kvm/kvm_main.c
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright (C) 2008 Qumranet, Inc.
   8 * Copyright IBM Corporation, 2008
   9 *
  10 * Authors:
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *   Amit Shah    <amit.shah@qumranet.com>
  14 *   Ben-Ami Yassour <benami@il.ibm.com>
  15 *
  16 * This work is licensed under the terms of the GNU GPL, version 2.  See
  17 * the COPYING file in the top-level directory.
  18 *
  19 */
  20
  21#include <linux/kvm_host.h>
  22#include "irq.h"
  23#include "mmu.h"
  24#include "i8254.h"
  25#include "tss.h"
  26#include "kvm_cache_regs.h"
  27#include "x86.h"
  28
  29#include <linux/clocksource.h>
  30#include <linux/interrupt.h>
  31#include <linux/kvm.h>
  32#include <linux/fs.h>
  33#include <linux/vmalloc.h>
  34#include <linux/module.h>
  35#include <linux/mman.h>
  36#include <linux/highmem.h>
  37#include <linux/iommu.h>
  38#include <linux/intel-iommu.h>
  39#include <linux/cpufreq.h>
  40
  41#include <asm/uaccess.h>
  42#include <asm/msr.h>
  43#include <asm/desc.h>
  44#include <asm/mtrr.h>
  45
  46#define MAX_IO_MSRS 256
  47#define CR0_RESERVED_BITS                                               \
  48        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  49                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  50                          | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  51#define CR4_RESERVED_BITS                                               \
  52        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  53                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  54                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  55                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  56
  57#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  58/* EFER defaults:
  59 * - enable syscall per default because its emulated by KVM
  60 * - enable LME and LMA per default on 64 bit KVM
  61 */
  62#ifdef CONFIG_X86_64
  63static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  64#else
  65static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  66#endif
  67
  68#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  69#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  70
  71static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  72                                    struct kvm_cpuid_entry2 __user *entries);
  73struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
  74                                              u32 function, u32 index);
  75
  76struct kvm_x86_ops *kvm_x86_ops;
  77EXPORT_SYMBOL_GPL(kvm_x86_ops);
  78
  79struct kvm_stats_debugfs_item debugfs_entries[] = {
  80        { "pf_fixed", VCPU_STAT(pf_fixed) },
  81        { "pf_guest", VCPU_STAT(pf_guest) },
  82        { "tlb_flush", VCPU_STAT(tlb_flush) },
  83        { "invlpg", VCPU_STAT(invlpg) },
  84        { "exits", VCPU_STAT(exits) },
  85        { "io_exits", VCPU_STAT(io_exits) },
  86        { "mmio_exits", VCPU_STAT(mmio_exits) },
  87        { "signal_exits", VCPU_STAT(signal_exits) },
  88        { "irq_window", VCPU_STAT(irq_window_exits) },
  89        { "nmi_window", VCPU_STAT(nmi_window_exits) },
  90        { "halt_exits", VCPU_STAT(halt_exits) },
  91        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  92        { "hypercalls", VCPU_STAT(hypercalls) },
  93        { "request_irq", VCPU_STAT(request_irq_exits) },
  94        { "irq_exits", VCPU_STAT(irq_exits) },
  95        { "host_state_reload", VCPU_STAT(host_state_reload) },
  96        { "efer_reload", VCPU_STAT(efer_reload) },
  97        { "fpu_reload", VCPU_STAT(fpu_reload) },
  98        { "insn_emulation", VCPU_STAT(insn_emulation) },
  99        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 100        { "irq_injections", VCPU_STAT(irq_injections) },
 101        { "nmi_injections", VCPU_STAT(nmi_injections) },
 102        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 103        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 104        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 105        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 106        { "mmu_flooded", VM_STAT(mmu_flooded) },
 107        { "mmu_recycled", VM_STAT(mmu_recycled) },
 108        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 109        { "mmu_unsync", VM_STAT(mmu_unsync) },
 110        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 111        { "largepages", VM_STAT(lpages) },
 112        { NULL }
 113};
 114
 115unsigned long segment_base(u16 selector)
 116{
 117        struct descriptor_table gdt;
 118        struct desc_struct *d;
 119        unsigned long table_base;
 120        unsigned long v;
 121
 122        if (selector == 0)
 123                return 0;
 124
 125        asm("sgdt %0" : "=m"(gdt));
 126        table_base = gdt.base;
 127
 128        if (selector & 4) {           /* from ldt */
 129                u16 ldt_selector;
 130
 131                asm("sldt %0" : "=g"(ldt_selector));
 132                table_base = segment_base(ldt_selector);
 133        }
 134        d = (struct desc_struct *)(table_base + (selector & ~7));
 135        v = d->base0 | ((unsigned long)d->base1 << 16) |
 136                ((unsigned long)d->base2 << 24);
 137#ifdef CONFIG_X86_64
 138        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 139                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 140#endif
 141        return v;
 142}
 143EXPORT_SYMBOL_GPL(segment_base);
 144
 145u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 146{
 147        if (irqchip_in_kernel(vcpu->kvm))
 148                return vcpu->arch.apic_base;
 149        else
 150                return vcpu->arch.apic_base;
 151}
 152EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 153
 154void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 155{
 156        /* TODO: reserve bits check */
 157        if (irqchip_in_kernel(vcpu->kvm))
 158                kvm_lapic_set_base(vcpu, data);
 159        else
 160                vcpu->arch.apic_base = data;
 161}
 162EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 163
 164void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 165{
 166        WARN_ON(vcpu->arch.exception.pending);
 167        vcpu->arch.exception.pending = true;
 168        vcpu->arch.exception.has_error_code = false;
 169        vcpu->arch.exception.nr = nr;
 170}
 171EXPORT_SYMBOL_GPL(kvm_queue_exception);
 172
 173void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 174                           u32 error_code)
 175{
 176        ++vcpu->stat.pf_guest;
 177
 178        if (vcpu->arch.exception.pending) {
 179                if (vcpu->arch.exception.nr == PF_VECTOR) {
 180                        printk(KERN_DEBUG "kvm: inject_page_fault:"
 181                                        " double fault 0x%lx\n", addr);
 182                        vcpu->arch.exception.nr = DF_VECTOR;
 183                        vcpu->arch.exception.error_code = 0;
 184                } else if (vcpu->arch.exception.nr == DF_VECTOR) {
 185                        /* triple fault -> shutdown */
 186                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 187                }
 188                return;
 189        }
 190        vcpu->arch.cr2 = addr;
 191        kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 192}
 193
 194void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 195{
 196        vcpu->arch.nmi_pending = 1;
 197}
 198EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 199
 200void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 201{
 202        WARN_ON(vcpu->arch.exception.pending);
 203        vcpu->arch.exception.pending = true;
 204        vcpu->arch.exception.has_error_code = true;
 205        vcpu->arch.exception.nr = nr;
 206        vcpu->arch.exception.error_code = error_code;
 207}
 208EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 209
 210static void __queue_exception(struct kvm_vcpu *vcpu)
 211{
 212        kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 213                                     vcpu->arch.exception.has_error_code,
 214                                     vcpu->arch.exception.error_code);
 215}
 216
 217/*
 218 * Load the pae pdptrs.  Return true is they are all valid.
 219 */
 220int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 221{
 222        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 223        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 224        int i;
 225        int ret;
 226        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 227
 228        ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 229                                  offset * sizeof(u64), sizeof(pdpte));
 230        if (ret < 0) {
 231                ret = 0;
 232                goto out;
 233        }
 234        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 235                if (is_present_pte(pdpte[i]) &&
 236                    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 237                        ret = 0;
 238                        goto out;
 239                }
 240        }
 241        ret = 1;
 242
 243        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 244out:
 245
 246        return ret;
 247}
 248EXPORT_SYMBOL_GPL(load_pdptrs);
 249
 250static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 251{
 252        u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 253        bool changed = true;
 254        int r;
 255
 256        if (is_long_mode(vcpu) || !is_pae(vcpu))
 257                return false;
 258
 259        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 260        if (r < 0)
 261                goto out;
 262        changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 263out:
 264
 265        return changed;
 266}
 267
 268void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 269{
 270        if (cr0 & CR0_RESERVED_BITS) {
 271                printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 272                       cr0, vcpu->arch.cr0);
 273                kvm_inject_gp(vcpu, 0);
 274                return;
 275        }
 276
 277        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 278                printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 279                kvm_inject_gp(vcpu, 0);
 280                return;
 281        }
 282
 283        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 284                printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 285                       "and a clear PE flag\n");
 286                kvm_inject_gp(vcpu, 0);
 287                return;
 288        }
 289
 290        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 291#ifdef CONFIG_X86_64
 292                if ((vcpu->arch.shadow_efer & EFER_LME)) {
 293                        int cs_db, cs_l;
 294
 295                        if (!is_pae(vcpu)) {
 296                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
 297                                       "in long mode while PAE is disabled\n");
 298                                kvm_inject_gp(vcpu, 0);
 299                                return;
 300                        }
 301                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 302                        if (cs_l) {
 303                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
 304                                       "in long mode while CS.L == 1\n");
 305                                kvm_inject_gp(vcpu, 0);
 306                                return;
 307
 308                        }
 309                } else
 310#endif
 311                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 312                        printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 313                               "reserved bits\n");
 314                        kvm_inject_gp(vcpu, 0);
 315                        return;
 316                }
 317
 318        }
 319
 320        kvm_x86_ops->set_cr0(vcpu, cr0);
 321        vcpu->arch.cr0 = cr0;
 322
 323        kvm_mmu_reset_context(vcpu);
 324        return;
 325}
 326EXPORT_SYMBOL_GPL(kvm_set_cr0);
 327
 328void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 329{
 330        kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 331        KVMTRACE_1D(LMSW, vcpu,
 332                    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
 333                    handler);
 334}
 335EXPORT_SYMBOL_GPL(kvm_lmsw);
 336
 337void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 338{
 339        unsigned long old_cr4 = vcpu->arch.cr4;
 340        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 341
 342        if (cr4 & CR4_RESERVED_BITS) {
 343                printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 344                kvm_inject_gp(vcpu, 0);
 345                return;
 346        }
 347
 348        if (is_long_mode(vcpu)) {
 349                if (!(cr4 & X86_CR4_PAE)) {
 350                        printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 351                               "in long mode\n");
 352                        kvm_inject_gp(vcpu, 0);
 353                        return;
 354                }
 355        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 356                   && ((cr4 ^ old_cr4) & pdptr_bits)
 357                   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 358                printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 359                kvm_inject_gp(vcpu, 0);
 360                return;
 361        }
 362
 363        if (cr4 & X86_CR4_VMXE) {
 364                printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 365                kvm_inject_gp(vcpu, 0);
 366                return;
 367        }
 368        kvm_x86_ops->set_cr4(vcpu, cr4);
 369        vcpu->arch.cr4 = cr4;
 370        vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 371        kvm_mmu_reset_context(vcpu);
 372}
 373EXPORT_SYMBOL_GPL(kvm_set_cr4);
 374
 375void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 376{
 377        if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 378                kvm_mmu_sync_roots(vcpu);
 379                kvm_mmu_flush_tlb(vcpu);
 380                return;
 381        }
 382
 383        if (is_long_mode(vcpu)) {
 384                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 385                        printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 386                        kvm_inject_gp(vcpu, 0);
 387                        return;
 388                }
 389        } else {
 390                if (is_pae(vcpu)) {
 391                        if (cr3 & CR3_PAE_RESERVED_BITS) {
 392                                printk(KERN_DEBUG
 393                                       "set_cr3: #GP, reserved bits\n");
 394                                kvm_inject_gp(vcpu, 0);
 395                                return;
 396                        }
 397                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 398                                printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 399                                       "reserved bits\n");
 400                                kvm_inject_gp(vcpu, 0);
 401                                return;
 402                        }
 403                }
 404                /*
 405                 * We don't check reserved bits in nonpae mode, because
 406                 * this isn't enforced, and VMware depends on this.
 407                 */
 408        }
 409
 410        /*
 411         * Does the new cr3 value map to physical memory? (Note, we
 412         * catch an invalid cr3 even in real-mode, because it would
 413         * cause trouble later on when we turn on paging anyway.)
 414         *
 415         * A real CPU would silently accept an invalid cr3 and would
 416         * attempt to use it - with largely undefined (and often hard
 417         * to debug) behavior on the guest side.
 418         */
 419        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 420                kvm_inject_gp(vcpu, 0);
 421        else {
 422                vcpu->arch.cr3 = cr3;
 423                vcpu->arch.mmu.new_cr3(vcpu);
 424        }
 425}
 426EXPORT_SYMBOL_GPL(kvm_set_cr3);
 427
 428void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 429{
 430        if (cr8 & CR8_RESERVED_BITS) {
 431                printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 432                kvm_inject_gp(vcpu, 0);
 433                return;
 434        }
 435        if (irqchip_in_kernel(vcpu->kvm))
 436                kvm_lapic_set_tpr(vcpu, cr8);
 437        else
 438                vcpu->arch.cr8 = cr8;
 439}
 440EXPORT_SYMBOL_GPL(kvm_set_cr8);
 441
 442unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 443{
 444        if (irqchip_in_kernel(vcpu->kvm))
 445                return kvm_lapic_get_cr8(vcpu);
 446        else
 447                return vcpu->arch.cr8;
 448}
 449EXPORT_SYMBOL_GPL(kvm_get_cr8);
 450
 451static inline u32 bit(int bitno)
 452{
 453        return 1 << (bitno & 31);
 454}
 455
 456/*
 457 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 458 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 459 *
 460 * This list is modified at module load time to reflect the
 461 * capabilities of the host cpu.
 462 */
 463static u32 msrs_to_save[] = {
 464        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 465        MSR_K6_STAR,
 466#ifdef CONFIG_X86_64
 467        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 468#endif
 469        MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 470        MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 471};
 472
 473static unsigned num_msrs_to_save;
 474
 475static u32 emulated_msrs[] = {
 476        MSR_IA32_MISC_ENABLE,
 477};
 478
 479static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 480{
 481        if (efer & efer_reserved_bits) {
 482                printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 483                       efer);
 484                kvm_inject_gp(vcpu, 0);
 485                return;
 486        }
 487
 488        if (is_paging(vcpu)
 489            && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 490                printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 491                kvm_inject_gp(vcpu, 0);
 492                return;
 493        }
 494
 495        if (efer & EFER_FFXSR) {
 496                struct kvm_cpuid_entry2 *feat;
 497
 498                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 499                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
 500                        printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
 501                        kvm_inject_gp(vcpu, 0);
 502                        return;
 503                }
 504        }
 505
 506        if (efer & EFER_SVME) {
 507                struct kvm_cpuid_entry2 *feat;
 508
 509                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 510                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 511                        printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
 512                        kvm_inject_gp(vcpu, 0);
 513                        return;
 514                }
 515        }
 516
 517        kvm_x86_ops->set_efer(vcpu, efer);
 518
 519        efer &= ~EFER_LMA;
 520        efer |= vcpu->arch.shadow_efer & EFER_LMA;
 521
 522        vcpu->arch.shadow_efer = efer;
 523
 524        vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 525        kvm_mmu_reset_context(vcpu);
 526}
 527
 528void kvm_enable_efer_bits(u64 mask)
 529{
 530       efer_reserved_bits &= ~mask;
 531}
 532EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 533
 534
 535/*
 536 * Writes msr value into into the appropriate "register".
 537 * Returns 0 on success, non-0 otherwise.
 538 * Assumes vcpu_load() was already called.
 539 */
 540int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 541{
 542        return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 543}
 544
 545/*
 546 * Adapt set_msr() to msr_io()'s calling convention
 547 */
 548static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 549{
 550        return kvm_set_msr(vcpu, index, *data);
 551}
 552
 553static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 554{
 555        static int version;
 556        struct pvclock_wall_clock wc;
 557        struct timespec now, sys, boot;
 558
 559        if (!wall_clock)
 560                return;
 561
 562        version++;
 563
 564        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 565
 566        /*
 567         * The guest calculates current wall clock time by adding
 568         * system time (updated by kvm_write_guest_time below) to the
 569         * wall clock specified here.  guest system time equals host
 570         * system time for us, thus we must fill in host boot time here.
 571         */
 572        now = current_kernel_time();
 573        ktime_get_ts(&sys);
 574        boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 575
 576        wc.sec = boot.tv_sec;
 577        wc.nsec = boot.tv_nsec;
 578        wc.version = version;
 579
 580        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 581
 582        version++;
 583        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 584}
 585
 586static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 587{
 588        uint32_t quotient, remainder;
 589
 590        /* Don't try to replace with do_div(), this one calculates
 591         * "(dividend << 32) / divisor" */
 592        __asm__ ( "divl %4"
 593                  : "=a" (quotient), "=d" (remainder)
 594                  : "0" (0), "1" (dividend), "r" (divisor) );
 595        return quotient;
 596}
 597
 598static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 599{
 600        uint64_t nsecs = 1000000000LL;
 601        int32_t  shift = 0;
 602        uint64_t tps64;
 603        uint32_t tps32;
 604
 605        tps64 = tsc_khz * 1000LL;
 606        while (tps64 > nsecs*2) {
 607                tps64 >>= 1;
 608                shift--;
 609        }
 610
 611        tps32 = (uint32_t)tps64;
 612        while (tps32 <= (uint32_t)nsecs) {
 613                tps32 <<= 1;
 614                shift++;
 615        }
 616
 617        hv_clock->tsc_shift = shift;
 618        hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 619
 620        pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 621                 __func__, tsc_khz, hv_clock->tsc_shift,
 622                 hv_clock->tsc_to_system_mul);
 623}
 624
 625static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 626
 627static void kvm_write_guest_time(struct kvm_vcpu *v)
 628{
 629        struct timespec ts;
 630        unsigned long flags;
 631        struct kvm_vcpu_arch *vcpu = &v->arch;
 632        void *shared_kaddr;
 633        unsigned long this_tsc_khz;
 634
 635        if ((!vcpu->time_page))
 636                return;
 637
 638        this_tsc_khz = get_cpu_var(cpu_tsc_khz);
 639        if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
 640                kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
 641                vcpu->hv_clock_tsc_khz = this_tsc_khz;
 642        }
 643        put_cpu_var(cpu_tsc_khz);
 644
 645        /* Keep irq disabled to prevent changes to the clock */
 646        local_irq_save(flags);
 647        kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 648                          &vcpu->hv_clock.tsc_timestamp);
 649        ktime_get_ts(&ts);
 650        local_irq_restore(flags);
 651
 652        /* With all the info we got, fill in the values */
 653
 654        vcpu->hv_clock.system_time = ts.tv_nsec +
 655                                     (NSEC_PER_SEC * (u64)ts.tv_sec);
 656        /*
 657         * The interface expects us to write an even number signaling that the
 658         * update is finished. Since the guest won't see the intermediate
 659         * state, we just increase by 2 at the end.
 660         */
 661        vcpu->hv_clock.version += 2;
 662
 663        shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 664
 665        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 666               sizeof(vcpu->hv_clock));
 667
 668        kunmap_atomic(shared_kaddr, KM_USER0);
 669
 670        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 671}
 672
 673static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 674{
 675        struct kvm_vcpu_arch *vcpu = &v->arch;
 676
 677        if (!vcpu->time_page)
 678                return 0;
 679        set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
 680        return 1;
 681}
 682
 683static bool msr_mtrr_valid(unsigned msr)
 684{
 685        switch (msr) {
 686        case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 687        case MSR_MTRRfix64K_00000:
 688        case MSR_MTRRfix16K_80000:
 689        case MSR_MTRRfix16K_A0000:
 690        case MSR_MTRRfix4K_C0000:
 691        case MSR_MTRRfix4K_C8000:
 692        case MSR_MTRRfix4K_D0000:
 693        case MSR_MTRRfix4K_D8000:
 694        case MSR_MTRRfix4K_E0000:
 695        case MSR_MTRRfix4K_E8000:
 696        case MSR_MTRRfix4K_F0000:
 697        case MSR_MTRRfix4K_F8000:
 698        case MSR_MTRRdefType:
 699        case MSR_IA32_CR_PAT:
 700                return true;
 701        case 0x2f8:
 702                return true;
 703        }
 704        return false;
 705}
 706
 707static bool valid_pat_type(unsigned t)
 708{
 709        return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
 710}
 711
 712static bool valid_mtrr_type(unsigned t)
 713{
 714        return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
 715}
 716
 717static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 718{
 719        int i;
 720
 721        if (!msr_mtrr_valid(msr))
 722                return false;
 723
 724        if (msr == MSR_IA32_CR_PAT) {
 725                for (i = 0; i < 8; i++)
 726                        if (!valid_pat_type((data >> (i * 8)) & 0xff))
 727                                return false;
 728                return true;
 729        } else if (msr == MSR_MTRRdefType) {
 730                if (data & ~0xcff)
 731                        return false;
 732                return valid_mtrr_type(data & 0xff);
 733        } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
 734                for (i = 0; i < 8 ; i++)
 735                        if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
 736                                return false;
 737                return true;
 738        }
 739
 740        /* variable MTRRs */
 741        return valid_mtrr_type(data & 0xff);
 742}
 743
 744static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 745{
 746        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 747
 748        if (!mtrr_valid(vcpu, msr, data))
 749                return 1;
 750
 751        if (msr == MSR_MTRRdefType) {
 752                vcpu->arch.mtrr_state.def_type = data;
 753                vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 754        } else if (msr == MSR_MTRRfix64K_00000)
 755                p[0] = data;
 756        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 757                p[1 + msr - MSR_MTRRfix16K_80000] = data;
 758        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 759                p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 760        else if (msr == MSR_IA32_CR_PAT)
 761                vcpu->arch.pat = data;
 762        else {  /* Variable MTRRs */
 763                int idx, is_mtrr_mask;
 764                u64 *pt;
 765
 766                idx = (msr - 0x200) / 2;
 767                is_mtrr_mask = msr - 0x200 - 2 * idx;
 768                if (!is_mtrr_mask)
 769                        pt =
 770                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 771                else
 772                        pt =
 773                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 774                *pt = data;
 775        }
 776
 777        kvm_mmu_reset_context(vcpu);
 778        return 0;
 779}
 780
 781int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 782{
 783        switch (msr) {
 784        case MSR_EFER:
 785                set_efer(vcpu, data);
 786                break;
 787        case MSR_IA32_MC0_STATUS:
 788                pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 789                       __func__, data);
 790                break;
 791        case MSR_IA32_MCG_STATUS:
 792                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 793                        __func__, data);
 794                break;
 795        case MSR_IA32_MCG_CTL:
 796                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 797                        __func__, data);
 798                break;
 799        case MSR_IA32_DEBUGCTLMSR:
 800                if (!data) {
 801                        /* We support the non-activated case already */
 802                        break;
 803                } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
 804                        /* Values other than LBR and BTF are vendor-specific,
 805                           thus reserved and should throw a #GP */
 806                        return 1;
 807                }
 808                pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 809                        __func__, data);
 810                break;
 811        case MSR_IA32_UCODE_REV:
 812        case MSR_IA32_UCODE_WRITE:
 813        case MSR_VM_HSAVE_PA:
 814                break;
 815        case 0x200 ... 0x2ff:
 816                return set_msr_mtrr(vcpu, msr, data);
 817        case MSR_IA32_APICBASE:
 818                kvm_set_apic_base(vcpu, data);
 819                break;
 820        case MSR_IA32_MISC_ENABLE:
 821                vcpu->arch.ia32_misc_enable_msr = data;
 822                break;
 823        case MSR_KVM_WALL_CLOCK:
 824                vcpu->kvm->arch.wall_clock = data;
 825                kvm_write_wall_clock(vcpu->kvm, data);
 826                break;
 827        case MSR_KVM_SYSTEM_TIME: {
 828                if (vcpu->arch.time_page) {
 829                        kvm_release_page_dirty(vcpu->arch.time_page);
 830                        vcpu->arch.time_page = NULL;
 831                }
 832
 833                vcpu->arch.time = data;
 834
 835                /* we verify if the enable bit is set... */
 836                if (!(data & 1))
 837                        break;
 838
 839                /* ...but clean it before doing the actual write */
 840                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 841
 842                vcpu->arch.time_page =
 843                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 844
 845                if (is_error_page(vcpu->arch.time_page)) {
 846                        kvm_release_page_clean(vcpu->arch.time_page);
 847                        vcpu->arch.time_page = NULL;
 848                }
 849
 850                kvm_request_guest_time_update(vcpu);
 851                break;
 852        }
 853        default:
 854                pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 855                return 1;
 856        }
 857        return 0;
 858}
 859EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 860
 861
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 *
 * The read is handed to the get_msr hook of kvm_x86_ops and that hook's
 * return value is passed back unchanged.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
        return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
 871
 872static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 873{
 874        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 875
 876        if (!msr_mtrr_valid(msr))
 877                return 1;
 878
 879        if (msr == MSR_MTRRdefType)
 880                *pdata = vcpu->arch.mtrr_state.def_type +
 881                         (vcpu->arch.mtrr_state.enabled << 10);
 882        else if (msr == MSR_MTRRfix64K_00000)
 883                *pdata = p[0];
 884        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 885                *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
 886        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 887                *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
 888        else if (msr == MSR_IA32_CR_PAT)
 889                *pdata = vcpu->arch.pat;
 890        else {  /* Variable MTRRs */
 891                int idx, is_mtrr_mask;
 892                u64 *pt;
 893
 894                idx = (msr - 0x200) / 2;
 895                is_mtrr_mask = msr - 0x200 - 2 * idx;
 896                if (!is_mtrr_mask)
 897                        pt =
 898                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 899                else
 900                        pt =
 901                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 902                *pdata = *pt;
 903        }
 904
 905        return 0;
 906}
 907
 908int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 909{
 910        u64 data;
 911
 912        switch (msr) {
 913        case 0xc0010010: /* SYSCFG */
 914        case 0xc0010015: /* HWCR */
 915        case MSR_IA32_PLATFORM_ID:
 916        case MSR_IA32_P5_MC_ADDR:
 917        case MSR_IA32_P5_MC_TYPE:
 918        case MSR_IA32_MC0_CTL:
 919        case MSR_IA32_MCG_STATUS:
 920        case MSR_IA32_MCG_CAP:
 921        case MSR_IA32_MCG_CTL:
 922        case MSR_IA32_MC0_MISC:
 923        case MSR_IA32_MC0_MISC+4:
 924        case MSR_IA32_MC0_MISC+8:
 925        case MSR_IA32_MC0_MISC+12:
 926        case MSR_IA32_MC0_MISC+16:
 927        case MSR_IA32_MC0_MISC+20:
 928        case MSR_IA32_UCODE_REV:
 929        case MSR_IA32_EBL_CR_POWERON:
 930        case MSR_IA32_DEBUGCTLMSR:
 931        case MSR_IA32_LASTBRANCHFROMIP:
 932        case MSR_IA32_LASTBRANCHTOIP:
 933        case MSR_IA32_LASTINTFROMIP:
 934        case MSR_IA32_LASTINTTOIP:
 935        case MSR_VM_HSAVE_PA:
 936        case MSR_P6_EVNTSEL0:
 937        case MSR_P6_EVNTSEL1:
 938        case MSR_K7_EVNTSEL0:
 939                data = 0;
 940                break;
 941        case MSR_MTRRcap:
 942                data = 0x500 | KVM_NR_VAR_MTRR;
 943                break;
 944        case 0x200 ... 0x2ff:
 945                return get_msr_mtrr(vcpu, msr, pdata);
 946        case 0xcd: /* fsb frequency */
 947                data = 3;
 948                break;
 949        case MSR_IA32_APICBASE:
 950                data = kvm_get_apic_base(vcpu);
 951                break;
 952        case MSR_IA32_MISC_ENABLE:
 953                data = vcpu->arch.ia32_misc_enable_msr;
 954                break;
 955        case MSR_IA32_PERF_STATUS:
 956                /* TSC increment by tick */
 957                data = 1000ULL;
 958                /* CPU multiplier */
 959                data |= (((uint64_t)4ULL) << 40);
 960                break;
 961        case MSR_EFER:
 962                data = vcpu->arch.shadow_efer;
 963                break;
 964        case MSR_KVM_WALL_CLOCK:
 965                data = vcpu->kvm->arch.wall_clock;
 966                break;
 967        case MSR_KVM_SYSTEM_TIME:
 968                data = vcpu->arch.time;
 969                break;
 970        default:
 971                pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 972                return 1;
 973        }
 974        *pdata = data;
 975        return 0;
 976}
 977EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 978
 979/*
 980 * Read or write a bunch of msrs. All parameters are kernel addresses.
 981 *
 982 * @return number of msrs set successfully.
 983 */
 984static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 985                    struct kvm_msr_entry *entries,
 986                    int (*do_msr)(struct kvm_vcpu *vcpu,
 987                                  unsigned index, u64 *data))
 988{
 989        int i;
 990
 991        vcpu_load(vcpu);
 992
 993        down_read(&vcpu->kvm->slots_lock);
 994        for (i = 0; i < msrs->nmsrs; ++i)
 995                if (do_msr(vcpu, entries[i].index, &entries[i].data))
 996                        break;
 997        up_read(&vcpu->kvm->slots_lock);
 998
 999        vcpu_put(vcpu);
1000
1001        return i;
1002}
1003
1004/*
1005 * Read or write a bunch of msrs. Parameters are user addresses.
1006 *
1007 * @return number of msrs set successfully.
1008 */
1009static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1010                  int (*do_msr)(struct kvm_vcpu *vcpu,
1011                                unsigned index, u64 *data),
1012                  int writeback)
1013{
1014        struct kvm_msrs msrs;
1015        struct kvm_msr_entry *entries;
1016        int r, n;
1017        unsigned size;
1018
1019        r = -EFAULT;
1020        if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1021                goto out;
1022
1023        r = -E2BIG;
1024        if (msrs.nmsrs >= MAX_IO_MSRS)
1025                goto out;
1026
1027        r = -ENOMEM;
1028        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1029        entries = vmalloc(size);
1030        if (!entries)
1031                goto out;
1032
1033        r = -EFAULT;
1034        if (copy_from_user(entries, user_msrs->entries, size))
1035                goto out_free;
1036
1037        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1038        if (r < 0)
1039                goto out_free;
1040
1041        r = -EFAULT;
1042        if (writeback && copy_to_user(user_msrs->entries, entries, size))
1043                goto out_free;
1044
1045        r = n;
1046
1047out_free:
1048        vfree(entries);
1049out:
1050        return r;
1051}
1052
1053int kvm_dev_ioctl_check_extension(long ext)
1054{
1055        int r;
1056
1057        switch (ext) {
1058        case KVM_CAP_IRQCHIP:
1059        case KVM_CAP_HLT:
1060        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1061        case KVM_CAP_SET_TSS_ADDR:
1062        case KVM_CAP_EXT_CPUID:
1063        case KVM_CAP_CLOCKSOURCE:
1064        case KVM_CAP_PIT:
1065        case KVM_CAP_NOP_IO_DELAY:
1066        case KVM_CAP_MP_STATE:
1067        case KVM_CAP_SYNC_MMU:
1068        case KVM_CAP_REINJECT_CONTROL:
1069        case KVM_CAP_IRQ_INJECT_STATUS:
1070        case KVM_CAP_ASSIGN_DEV_IRQ:
1071                r = 1;
1072                break;
1073        case KVM_CAP_COALESCED_MMIO:
1074                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1075                break;
1076        case KVM_CAP_VAPIC:
1077                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1078                break;
1079        case KVM_CAP_NR_VCPUS:
1080                r = KVM_MAX_VCPUS;
1081                break;
1082        case KVM_CAP_NR_MEMSLOTS:
1083                r = KVM_MEMORY_SLOTS;
1084                break;
1085        case KVM_CAP_PV_MMU:
1086                r = !tdp_enabled;
1087                break;
1088        case KVM_CAP_IOMMU:
1089                r = iommu_found();
1090                break;
1091        default:
1092                r = 0;
1093                break;
1094        }
1095        return r;
1096
1097}
1098
1099long kvm_arch_dev_ioctl(struct file *filp,
1100                        unsigned int ioctl, unsigned long arg)
1101{
1102        void __user *argp = (void __user *)arg;
1103        long r;
1104
1105        switch (ioctl) {
1106        case KVM_GET_MSR_INDEX_LIST: {
1107                struct kvm_msr_list __user *user_msr_list = argp;
1108                struct kvm_msr_list msr_list;
1109                unsigned n;
1110
1111                r = -EFAULT;
1112                if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1113                        goto out;
1114                n = msr_list.nmsrs;
1115                msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1116                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1117                        goto out;
1118                r = -E2BIG;
1119                if (n < msr_list.nmsrs)
1120                        goto out;
1121                r = -EFAULT;
1122                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1123                                 num_msrs_to_save * sizeof(u32)))
1124                        goto out;
1125                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1126                                 &emulated_msrs,
1127                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1128                        goto out;
1129                r = 0;
1130                break;
1131        }
1132        case KVM_GET_SUPPORTED_CPUID: {
1133                struct kvm_cpuid2 __user *cpuid_arg = argp;
1134                struct kvm_cpuid2 cpuid;
1135
1136                r = -EFAULT;
1137                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1138                        goto out;
1139                r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1140                                                      cpuid_arg->entries);
1141                if (r)
1142                        goto out;
1143
1144                r = -EFAULT;
1145                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1146                        goto out;
1147                r = 0;
1148                break;
1149        }
1150        default:
1151                r = -EINVAL;
1152        }
1153out:
1154        return r;
1155}
1156
/*
 * Arch hook for loading 'vcpu' on 'cpu': calls the vcpu_load hook of
 * kvm_x86_ops and then requests a guest time update for the vcpu.
 */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        kvm_x86_ops->vcpu_load(vcpu, cpu);
        kvm_request_guest_time_update(vcpu);
}
1162
/*
 * Arch hook for putting 'vcpu': calls the vcpu_put hook of kvm_x86_ops
 * and then kvm_put_guest_fpu().
 */
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
        kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
}
1168
/*
 * Returns non-zero when EFER_NX is set in the value read from MSR_EFER.
 */
static int is_efer_nx(void)
{
        /*
         * Pre-zeroed and the return value of rdmsrl_safe() is ignored, so a
         * read that leaves 'efer' untouched reports NX as clear.
         */
        unsigned long long efer = 0;

        rdmsrl_safe(MSR_EFER, &efer);
        return efer & EFER_NX;
}
1176
1177static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1178{
1179        int i;
1180        struct kvm_cpuid_entry2 *e, *entry;
1181
1182        entry = NULL;
1183        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1184                e = &vcpu->arch.cpuid_entries[i];
1185                if (e->function == 0x80000001) {
1186                        entry = e;
1187                        break;
1188                }
1189        }
1190        if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1191                entry->edx &= ~(1 << 20);
1192                printk(KERN_INFO "kvm: guest NX capability removed\n");
1193        }
1194}
1195
1196/* when an old userspace process fills a new kernel module */
1197static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1198                                    struct kvm_cpuid *cpuid,
1199                                    struct kvm_cpuid_entry __user *entries)
1200{
1201        int r, i;
1202        struct kvm_cpuid_entry *cpuid_entries;
1203
1204        r = -E2BIG;
1205        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1206                goto out;
1207        r = -ENOMEM;
1208        cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1209        if (!cpuid_entries)
1210                goto out;
1211        r = -EFAULT;
1212        if (copy_from_user(cpuid_entries, entries,
1213                           cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1214                goto out_free;
1215        for (i = 0; i < cpuid->nent; i++) {
1216                vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1217                vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1218                vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1219                vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1220                vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1221                vcpu->arch.cpuid_entries[i].index = 0;
1222                vcpu->arch.cpuid_entries[i].flags = 0;
1223                vcpu->arch.cpuid_entries[i].padding[0] = 0;
1224                vcpu->arch.cpuid_entries[i].padding[1] = 0;
1225                vcpu->arch.cpuid_entries[i].padding[2] = 0;
1226        }
1227        vcpu->arch.cpuid_nent = cpuid->nent;
1228        cpuid_fix_nx_cap(vcpu);
1229        r = 0;
1230
1231out_free:
1232        vfree(cpuid_entries);
1233out:
1234        return r;
1235}
1236
1237static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1238                                     struct kvm_cpuid2 *cpuid,
1239                                     struct kvm_cpuid_entry2 __user *entries)
1240{
1241        int r;
1242
1243        r = -E2BIG;
1244        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1245                goto out;
1246        r = -EFAULT;
1247        if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1248                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1249                goto out;
1250        vcpu->arch.cpuid_nent = cpuid->nent;
1251        return 0;
1252
1253out:
1254        return r;
1255}
1256
1257static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1258                                     struct kvm_cpuid2 *cpuid,
1259                                     struct kvm_cpuid_entry2 __user *entries)
1260{
1261        int r;
1262
1263        r = -E2BIG;
1264        if (cpuid->nent < vcpu->arch.cpuid_nent)
1265                goto out;
1266        r = -EFAULT;
1267        if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1268                         vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1269                goto out;
1270        return 0;
1271
1272out:
1273        cpuid->nent = vcpu->arch.cpuid_nent;
1274        return r;
1275}
1276
1277static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1278                           u32 index)
1279{
1280        entry->function = function;
1281        entry->index = index;
1282        cpuid_count(entry->function, entry->index,
1283                    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1284        entry->flags = 0;
1285}
1286
/* Shorthand for the bit of an X86_FEATURE_* flag; only valid until #undef. */
#define F(x) bit(X86_FEATURE_##x)

/*
 * Fill 'entry' (and, for multi-entry leaves, the entries following it)
 * with the cpuid data for 'function'/'index', masked down by the
 * kvm_supported_* feature words below where applicable.
 *
 * '*nent' is incremented once for every entry written; the extra entries
 * produced for functions 2, 4 and 0xb are only written while '*nent' is
 * below 'maxnent'.  The first entry is written unconditionally, so the
 * caller must make sure there is room for it.
 */
static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                         u32 index, int *nent, int maxnent)
{
        /* NX is only offered when is_efer_nx() reports it as enabled */
        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
        /* LM is only offered by CONFIG_X86_64 builds */
#ifdef CONFIG_X86_64
        unsigned f_lm = F(LM);
#else
        unsigned f_lm = 0;
#endif

        /* cpuid 1.edx */
        const u32 kvm_supported_word0_x86_features =
                F(FPU) | F(VME) | F(DE) | F(PSE) |
                F(TSC) | F(MSR) | F(PAE) | F(MCE) |
                F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
                F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
                F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
                0 /* Reserved, DS, ACPI */ | F(MMX) |
                F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
                0 /* HTT, TM, Reserved, PBE */;
        /* cpuid 0x80000001.edx */
        const u32 kvm_supported_word1_x86_features =
                F(FPU) | F(VME) | F(DE) | F(PSE) |
                F(TSC) | F(MSR) | F(PAE) | F(MCE) |
                F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
                F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
                F(PAT) | F(PSE36) | 0 /* Reserved */ |
                f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
                F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
                0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
        /* cpuid 1.ecx */
        const u32 kvm_supported_word4_x86_features =
                F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
                0 /* DS-CPL, VMX, SMX, EST */ |
                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
                0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
                0 /* Reserved, DCA */ | F(XMM4_1) |
                F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
                0 /* Reserved, XSAVE, OSXSAVE */;
        /* cpuid 0x80000001.ecx */
        const u32 kvm_supported_word6_x86_features =
                F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
                F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
                F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
                0 /* SKINIT */ | 0 /* WDT */;

        /* all calls to cpuid_count() should be made on the same cpu */
        get_cpu();
        do_cpuid_1_ent(entry, function, index);
        ++*nent;

        switch (function) {
        case 0:
                /* cap the reported maximum standard function at 0xb */
                entry->eax = min(entry->eax, (u32)0xb);
                break;
        case 1:
                entry->edx &= kvm_supported_word0_x86_features;
                entry->ecx &= kvm_supported_word4_x86_features;
                break;
        /* function 2 entries are STATEFUL. That is, repeated cpuid commands
         * may return different values. This forces us to get_cpu() before
         * issuing the first command, and also to emulate this annoying behavior
         * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
        case 2: {
                /* the low byte of eax gives the number of reads to perform */
                int t, times = entry->eax & 0xff;

                entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
                entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
                for (t = 1; t < times && *nent < maxnent; ++t) {
                        do_cpuid_1_ent(&entry[t], function, 0);
                        entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
                        ++*nent;
                }
                break;
        }
        /* function 4 and 0xb have additional index. */
        case 4: {
                int i, cache_type;

                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until cache_type is zero */
                for (i = 1; *nent < maxnent; ++i) {
                        /* the stop condition is taken from the previous entry */
                        cache_type = entry[i - 1].eax & 0x1f;
                        if (!cache_type)
                                break;
                        do_cpuid_1_ent(&entry[i], function, i);
                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                        ++*nent;
                }
                break;
        }
        case 0xb: {
                int i, level_type;

                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until level_type is zero */
                for (i = 1; *nent < maxnent; ++i) {
                        /* the stop condition is taken from the previous entry */
                        level_type = entry[i - 1].ecx & 0xff00;
                        if (!level_type)
                                break;
                        do_cpuid_1_ent(&entry[i], function, i);
                        entry[i].flags |=
                               KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                        ++*nent;
                }
                break;
        }
        case 0x80000000:
                /* cap the reported maximum extended function at 0x8000001a */
                entry->eax = min(entry->eax, 0x8000001a);
                break;
        case 0x80000001:
                entry->edx &= kvm_supported_word1_x86_features;
                entry->ecx &= kvm_supported_word6_x86_features;
                break;
        }
        put_cpu();
}

#undef F
1409
1410static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1411                                     struct kvm_cpuid_entry2 __user *entries)
1412{
1413        struct kvm_cpuid_entry2 *cpuid_entries;
1414        int limit, nent = 0, r = -E2BIG;
1415        u32 func;
1416
1417        if (cpuid->nent < 1)
1418                goto out;
1419        r = -ENOMEM;
1420        cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1421        if (!cpuid_entries)
1422                goto out;
1423
1424        do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1425        limit = cpuid_entries[0].eax;
1426        for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1427                do_cpuid_ent(&cpuid_entries[nent], func, 0,
1428                             &nent, cpuid->nent);
1429        r = -E2BIG;
1430        if (nent >= cpuid->nent)
1431                goto out_free;
1432
1433        do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1434        limit = cpuid_entries[nent - 1].eax;
1435        for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1436                do_cpuid_ent(&cpuid_entries[nent], func, 0,
1437                             &nent, cpuid->nent);
1438        r = -EFAULT;
1439        if (copy_to_user(entries, cpuid_entries,
1440                         nent * sizeof(struct kvm_cpuid_entry2)))
1441                goto out_free;
1442        cpuid->nent = nent;
1443        r = 0;
1444
1445out_free:
1446        vfree(cpuid_entries);
1447out:
1448        return r;
1449}
1450
/*
 * Copy the vcpu's local APIC register page into 's' while the vcpu is
 * loaded.  Always returns 0.
 *
 * vcpu->arch.apic is dereferenced without a NULL check here.
 */
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
{
        vcpu_load(vcpu);
        /* the length copied is the size of the whole kvm_lapic_state */
        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
        vcpu_put(vcpu);

        return 0;
}
1460
/*
 * Overwrite the vcpu's local APIC register page with the contents of 's'
 * and call kvm_apic_post_state_restore(), all while the vcpu is loaded.
 * Always returns 0.
 *
 * vcpu->arch.apic is dereferenced without a NULL check here.
 */
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
{
        vcpu_load(vcpu);
        /* the length copied is the size of the whole kvm_lapic_state */
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
        kvm_apic_post_state_restore(vcpu);
        vcpu_put(vcpu);

        return 0;
}
1471
/*
 * Queue the interrupt vector irq->irq on the vcpu.
 *
 * Returns 0 on success, -EINVAL if the vector is outside 0..255, or
 * -ENXIO if irqchip_in_kernel() is true for the VM.
 */
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                                    struct kvm_interrupt *irq)
{
        if (irq->irq < 0 || irq->irq >= 256)
                return -EINVAL;
        if (irqchip_in_kernel(vcpu->kvm))
                return -ENXIO;
        vcpu_load(vcpu);

        kvm_queue_interrupt(vcpu, irq->irq, false);

        vcpu_put(vcpu);

        return 0;
}
1487
/*
 * Call kvm_inject_nmi() for the vcpu while it is loaded.
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);
        kvm_inject_nmi(vcpu);
        vcpu_put(vcpu);

        return 0;
}
1496
1497static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1498                                           struct kvm_tpr_access_ctl *tac)
1499{
1500        if (tac->flags)
1501                return -EINVAL;
1502        vcpu->arch.tpr_access_reporting = !!tac->enabled;
1503        return 0;
1504}
1505
1506long kvm_arch_vcpu_ioctl(struct file *filp,
1507                         unsigned int ioctl, unsigned long arg)
1508{
1509        struct kvm_vcpu *vcpu = filp->private_data;
1510        void __user *argp = (void __user *)arg;
1511        int r;
1512        struct kvm_lapic_state *lapic = NULL;
1513
1514        switch (ioctl) {
1515        case KVM_GET_LAPIC: {
1516                lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1517
1518                r = -ENOMEM;
1519                if (!lapic)
1520                        goto out;
1521                r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1522                if (r)
1523                        goto out;
1524                r = -EFAULT;
1525                if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1526                        goto out;
1527                r = 0;
1528                break;
1529        }
1530        case KVM_SET_LAPIC: {
1531                lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1532                r = -ENOMEM;
1533                if (!lapic)
1534                        goto out;
1535                r = -EFAULT;
1536                if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1537                        goto out;
1538                r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1539                if (r)
1540                        goto out;
1541                r = 0;
1542                break;
1543        }
1544        case KVM_INTERRUPT: {
1545                struct kvm_interrupt irq;
1546
1547                r = -EFAULT;
1548                if (copy_from_user(&irq, argp, sizeof irq))
1549                        goto out;
1550                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1551                if (r)
1552                        goto out;
1553                r = 0;
1554                break;
1555        }
1556        case KVM_NMI: {
1557                r = kvm_vcpu_ioctl_nmi(vcpu);
1558                if (r)
1559                        goto out;
1560                r = 0;
1561                break;
1562        }
1563        case KVM_SET_CPUID: {
1564                struct kvm_cpuid __user *cpuid_arg = argp;
1565                struct kvm_cpuid cpuid;
1566
1567                r = -EFAULT;
1568                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1569                        goto out;
1570                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1571                if (r)
1572                        goto out;
1573                break;
1574        }
1575        case KVM_SET_CPUID2: {
1576                struct kvm_cpuid2 __user *cpuid_arg = argp;
1577                struct kvm_cpuid2 cpuid;
1578
1579                r = -EFAULT;
1580                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1581                        goto out;
1582                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1583                                              cpuid_arg->entries);
1584                if (r)
1585                        goto out;
1586                break;
1587        }
1588        case KVM_GET_CPUID2: {
1589                struct kvm_cpuid2 __user *cpuid_arg = argp;
1590                struct kvm_cpuid2 cpuid;
1591
1592                r = -EFAULT;
1593                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1594                        goto out;
1595                r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1596                                              cpuid_arg->entries);
1597                if (r)
1598                        goto out;
1599                r = -EFAULT;
1600                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1601                        goto out;
1602                r = 0;
1603                break;
1604        }
1605        case KVM_GET_MSRS:
1606                r = msr_io(vcpu, argp, kvm_get_msr, 1);
1607                break;
1608        case KVM_SET_MSRS:
1609                r = msr_io(vcpu, argp, do_set_msr, 0);
1610                break;
1611        case KVM_TPR_ACCESS_REPORTING: {
1612                struct kvm_tpr_access_ctl tac;
1613
1614                r = -EFAULT;
1615                if (copy_from_user(&tac, argp, sizeof tac))
1616                        goto out;
1617                r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1618                if (r)
1619                        goto out;
1620                r = -EFAULT;
1621                if (copy_to_user(argp, &tac, sizeof tac))
1622                        goto out;
1623                r = 0;
1624                break;
1625        };
1626        case KVM_SET_VAPIC_ADDR: {
1627                struct kvm_vapic_addr va;
1628
1629                r = -EINVAL;
1630                if (!irqchip_in_kernel(vcpu->kvm))
1631                        goto out;
1632                r = -EFAULT;
1633                if (copy_from_user(&va, argp, sizeof va))
1634                        goto out;
1635                r = 0;
1636                kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1637                break;
1638        }
1639        default:
1640                r = -EINVAL;
1641        }
1642out:
1643        kfree(lapic);
1644        return r;
1645}
1646
1647static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1648{
1649        int ret;
1650
1651        if (addr > (unsigned int)(-3 * PAGE_SIZE))
1652                return -1;
1653        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1654        return ret;
1655}
1656
/*
 * Change the number of MMU pages for the VM and record the requested
 * value in kvm->arch.n_requested_mmu_pages.
 *
 * Returns -EINVAL if the request is below KVM_MIN_ALLOC_MMU_PAGES,
 * 0 otherwise.
 */
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
                                          u32 kvm_nr_mmu_pages)
{
        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
                return -EINVAL;

        /* slots_lock is taken for writing first, mmu_lock nested inside it */
        down_write(&kvm->slots_lock);
        spin_lock(&kvm->mmu_lock);

        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;

        spin_unlock(&kvm->mmu_lock);
        up_write(&kvm->slots_lock);
        return 0;
}
1673
/*
 * Returns kvm->arch.n_alloc_mmu_pages; the field is read without taking
 * any lock in this function.
 */
static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
        return kvm->arch.n_alloc_mmu_pages;
}
1678
1679gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1680{
1681        int i;
1682        struct kvm_mem_alias *alias;
1683
1684        for (i = 0; i < kvm->arch.naliases; ++i) {
1685                alias = &kvm->arch.aliases[i];
1686                if (gfn >= alias->base_gfn
1687                    && gfn < alias->base_gfn + alias->npages)
1688                        return alias->target_gfn + gfn - alias->base_gfn;
1689        }
1690        return gfn;
1691}
1692
1693/*
1694 * Set a new alias region.  Aliases map a portion of physical memory into
1695 * another portion.  This is useful for memory windows, for example the PC
1696 * VGA region.
1697 */
1698static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1699                                         struct kvm_memory_alias *alias)
1700{
1701        int r, n;
1702        struct kvm_mem_alias *p;
1703
1704        r = -EINVAL;
1705        /* General sanity checks */
1706        if (alias->memory_size & (PAGE_SIZE - 1))
1707                goto out;
1708        if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1709                goto out;
1710        if (alias->slot >= KVM_ALIAS_SLOTS)
1711                goto out;
1712        if (alias->guest_phys_addr + alias->memory_size
1713            < alias->guest_phys_addr)
1714                goto out;
1715        if (alias->target_phys_addr + alias->memory_size
1716            < alias->target_phys_addr)
1717                goto out;
1718
1719        down_write(&kvm->slots_lock);
1720        spin_lock(&kvm->mmu_lock);
1721
1722        p = &kvm->arch.aliases[alias->slot];
1723        p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1724        p->npages = alias->memory_size >> PAGE_SHIFT;
1725        p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1726
1727        for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1728                if (kvm->arch.aliases[n - 1].npages)
1729                        break;
1730        kvm->arch.naliases = n;
1731
1732        spin_unlock(&kvm->mmu_lock);
1733        kvm_mmu_zap_all(kvm);
1734
1735        up_write(&kvm->slots_lock);
1736
1737        return 0;
1738
1739out:
1740        return r;
1741}
1742
1743static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1744{
1745        int r;
1746
1747        r = 0;
1748        switch (chip->chip_id) {
1749        case KVM_IRQCHIP_PIC_MASTER:
1750                memcpy(&chip->chip.pic,
1751                        &pic_irqchip(kvm)->pics[0],
1752                        sizeof(struct kvm_pic_state));
1753                break;
1754        case KVM_IRQCHIP_PIC_SLAVE:
1755                memcpy(&chip->chip.pic,
1756                        &pic_irqchip(kvm)->pics[1],
1757                        sizeof(struct kvm_pic_state));
1758                break;
1759        case KVM_IRQCHIP_IOAPIC:
1760                memcpy(&chip->chip.ioapic,
1761                        ioapic_irqchip(kvm),
1762                        sizeof(struct kvm_ioapic_state));
1763                break;
1764        default:
1765                r = -EINVAL;
1766                break;
1767        }
1768        return r;
1769}
1770
1771static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1772{
1773        int r;
1774
1775        r = 0;
1776        switch (chip->chip_id) {
1777        case KVM_IRQCHIP_PIC_MASTER:
1778                memcpy(&pic_irqchip(kvm)->pics[0],
1779                        &chip->chip.pic,
1780                        sizeof(struct kvm_pic_state));
1781                break;
1782        case KVM_IRQCHIP_PIC_SLAVE:
1783                memcpy(&pic_irqchip(kvm)->pics[1],
1784                        &chip->chip.pic,
1785                        sizeof(struct kvm_pic_state));
1786                break;
1787        case KVM_IRQCHIP_IOAPIC:
1788                memcpy(ioapic_irqchip(kvm),
1789                        &chip->chip.ioapic,
1790                        sizeof(struct kvm_ioapic_state));
1791                break;
1792        default:
1793                r = -EINVAL;
1794                break;
1795        }
1796        kvm_pic_update_irq(pic_irqchip(kvm));
1797        return r;
1798}
1799
1800static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1801{
1802        int r = 0;
1803
1804        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1805        return r;
1806}
1807
1808static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1809{
1810        int r = 0;
1811
1812        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1813        kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1814        return r;
1815}
1816
1817static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1818                                 struct kvm_reinject_control *control)
1819{
1820        if (!kvm->arch.vpit)
1821                return -ENXIO;
1822        kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
1823        return 0;
1824}
1825
1826/*
1827 * Get (and clear) the dirty memory log for a memory slot.
1828 */
1829int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1830                                      struct kvm_dirty_log *log)
1831{
1832        int r;
1833        int n;
1834        struct kvm_memory_slot *memslot;
1835        int is_dirty = 0;
1836
1837        down_write(&kvm->slots_lock);
1838
1839        r = kvm_get_dirty_log(kvm, log, &is_dirty);
1840        if (r)
1841                goto out;
1842
1843        /* If nothing is dirty, don't bother messing with page tables. */
1844        if (is_dirty) {
1845                spin_lock(&kvm->mmu_lock);
1846                kvm_mmu_slot_remove_write_access(kvm, log->slot);
1847                spin_unlock(&kvm->mmu_lock);
1848                kvm_flush_remote_tlbs(kvm);
1849                memslot = &kvm->memslots[log->slot];
1850                n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1851                memset(memslot->dirty_bitmap, 0, n);
1852        }
1853        r = 0;
1854out:
1855        up_write(&kvm->slots_lock);
1856        return r;
1857}
1858
1859long kvm_arch_vm_ioctl(struct file *filp,
1860                       unsigned int ioctl, unsigned long arg)
1861{
1862        struct kvm *kvm = filp->private_data;
1863        void __user *argp = (void __user *)arg;
1864        int r = -EINVAL;
1865        /*
1866         * This union makes it completely explicit to gcc-3.x
1867         * that these two variables' stack usage should be
1868         * combined, not added together.
1869         */
1870        union {
1871                struct kvm_pit_state ps;
1872                struct kvm_memory_alias alias;
1873        } u;
1874
1875        switch (ioctl) {
1876        case KVM_SET_TSS_ADDR:
1877                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1878                if (r < 0)
1879                        goto out;
1880                break;
1881        case KVM_SET_MEMORY_REGION: {
1882                struct kvm_memory_region kvm_mem;
1883                struct kvm_userspace_memory_region kvm_userspace_mem;
1884
1885                r = -EFAULT;
1886                if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1887                        goto out;
1888                kvm_userspace_mem.slot = kvm_mem.slot;
1889                kvm_userspace_mem.flags = kvm_mem.flags;
1890                kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1891                kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1892                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1893                if (r)
1894                        goto out;
1895                break;
1896        }
1897        case KVM_SET_NR_MMU_PAGES:
1898                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1899                if (r)
1900                        goto out;
1901                break;
1902        case KVM_GET_NR_MMU_PAGES:
1903                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1904                break;
1905        case KVM_SET_MEMORY_ALIAS:
1906                r = -EFAULT;
1907                if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1908                        goto out;
1909                r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1910                if (r)
1911                        goto out;
1912                break;
1913        case KVM_CREATE_IRQCHIP:
1914                r = -ENOMEM;
1915                kvm->arch.vpic = kvm_create_pic(kvm);
1916                if (kvm->arch.vpic) {
1917                        r = kvm_ioapic_init(kvm);
1918                        if (r) {
1919                                kfree(kvm->arch.vpic);
1920                                kvm->arch.vpic = NULL;
1921                                goto out;
1922                        }
1923                } else
1924                        goto out;
1925                r = kvm_setup_default_irq_routing(kvm);
1926                if (r) {
1927                        kfree(kvm->arch.vpic);
1928                        kfree(kvm->arch.vioapic);
1929                        goto out;
1930                }
1931                break;
1932        case KVM_CREATE_PIT:
1933                mutex_lock(&kvm->lock);
1934                r = -EEXIST;
1935                if (kvm->arch.vpit)
1936                        goto create_pit_unlock;
1937                r = -ENOMEM;
1938                kvm->arch.vpit = kvm_create_pit(kvm);
1939                if (kvm->arch.vpit)
1940                        r = 0;
1941        create_pit_unlock:
1942                mutex_unlock(&kvm->lock);
1943                break;
1944        case KVM_IRQ_LINE_STATUS:
1945        case KVM_IRQ_LINE: {
1946                struct kvm_irq_level irq_event;
1947
1948                r = -EFAULT;
1949                if (copy_from_user(&irq_event, argp, sizeof irq_event))
1950                        goto out;
1951                if (irqchip_in_kernel(kvm)) {
1952                        __s32 status;
1953                        mutex_lock(&kvm->lock);
1954                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1955                                        irq_event.irq, irq_event.level);
1956                        mutex_unlock(&kvm->lock);
1957                        if (ioctl == KVM_IRQ_LINE_STATUS) {
1958                                irq_event.status = status;
1959                                if (copy_to_user(argp, &irq_event,
1960                                                        sizeof irq_event))
1961                                        goto out;
1962                        }
1963                        r = 0;
1964                }
1965                break;
1966        }
1967        case KVM_GET_IRQCHIP: {
1968                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1969                struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1970
1971                r = -ENOMEM;
1972                if (!chip)
1973                        goto out;
1974                r = -EFAULT;
1975                if (copy_from_user(chip, argp, sizeof *chip))
1976                        goto get_irqchip_out;
1977                r = -ENXIO;
1978                if (!irqchip_in_kernel(kvm))
1979                        goto get_irqchip_out;
1980                r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1981                if (r)
1982                        goto get_irqchip_out;
1983                r = -EFAULT;
1984                if (copy_to_user(argp, chip, sizeof *chip))
1985                        goto get_irqchip_out;
1986                r = 0;
1987        get_irqchip_out:
1988                kfree(chip);
1989                if (r)
1990                        goto out;
1991                break;
1992        }
1993        case KVM_SET_IRQCHIP: {
1994                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1995                struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1996
1997                r = -ENOMEM;
1998                if (!chip)
1999                        goto out;
2000                r = -EFAULT;
2001                if (copy_from_user(chip, argp, sizeof *chip))
2002                        goto set_irqchip_out;
2003                r = -ENXIO;
2004                if (!irqchip_in_kernel(kvm))
2005                        goto set_irqchip_out;
2006                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2007                if (r)
2008                        goto set_irqchip_out;
2009                r = 0;
2010        set_irqchip_out:
2011                kfree(chip);
2012                if (r)
2013                        goto out;
2014                break;
2015        }
2016        case KVM_GET_PIT: {
2017                r = -EFAULT;
2018                if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2019                        goto out;
2020                r = -ENXIO;
2021                if (!kvm->arch.vpit)
2022                        goto out;
2023                r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2024                if (r)
2025                        goto out;
2026                r = -EFAULT;
2027                if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2028                        goto out;
2029                r = 0;
2030                break;
2031        }
2032        case KVM_SET_PIT: {
2033                r = -EFAULT;
2034                if (copy_from_user(&u.ps, argp, sizeof u.ps))
2035                        goto out;
2036                r = -ENXIO;
2037                if (!kvm->arch.vpit)
2038                        goto out;
2039                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2040                if (r)
2041                        goto out;
2042                r = 0;
2043                break;
2044        }
2045        case KVM_REINJECT_CONTROL: {
2046                struct kvm_reinject_control control;
2047                r =  -EFAULT;
2048                if (copy_from_user(&control, argp, sizeof(control)))
2049                        goto out;
2050                r = kvm_vm_ioctl_reinject(kvm, &control);
2051                if (r)
2052                        goto out;
2053                r = 0;
2054                break;
2055        }
2056        default:
2057                ;
2058        }
2059out:
2060        return r;
2061}
2062
2063static void kvm_init_msr_list(void)
2064{
2065        u32 dummy[2];
2066        unsigned i, j;
2067
2068        for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2069                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2070                        continue;
2071                if (j < i)
2072                        msrs_to_save[j] = msrs_to_save[i];
2073                j++;
2074        }
2075        num_msrs_to_save = j;
2076}
2077
2078/*
2079 * Only apic need an MMIO device hook, so shortcut now..
2080 */
2081static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2082                                                gpa_t addr, int len,
2083                                                int is_write)
2084{
2085        struct kvm_io_device *dev;
2086
2087        if (vcpu->arch.apic) {
2088                dev = &vcpu->arch.apic->dev;
2089                if (dev->in_range(dev, addr, len, is_write))
2090                        return dev;
2091        }
2092        return NULL;
2093}
2094
2095
2096static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2097                                                gpa_t addr, int len,
2098                                                int is_write)
2099{
2100        struct kvm_io_device *dev;
2101
2102        dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
2103        if (dev == NULL)
2104                dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2105                                          is_write);
2106        return dev;
2107}
2108
2109static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2110                               struct kvm_vcpu *vcpu)
2111{
2112        void *data = val;
2113        int r = X86EMUL_CONTINUE;
2114
2115        while (bytes) {
2116                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2117                unsigned offset = addr & (PAGE_SIZE-1);
2118                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2119                int ret;
2120
2121                if (gpa == UNMAPPED_GVA) {
2122                        r = X86EMUL_PROPAGATE_FAULT;
2123                        goto out;
2124                }
2125                ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2126                if (ret < 0) {
2127                        r = X86EMUL_UNHANDLEABLE;
2128                        goto out;
2129                }
2130
2131                bytes -= toread;
2132                data += toread;
2133                addr += toread;
2134        }
2135out:
2136        return r;
2137}
2138
2139static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2140                                struct kvm_vcpu *vcpu)
2141{
2142        void *data = val;
2143        int r = X86EMUL_CONTINUE;
2144
2145        while (bytes) {
2146                gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2147                unsigned offset = addr & (PAGE_SIZE-1);
2148                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2149                int ret;
2150
2151                if (gpa == UNMAPPED_GVA) {
2152                        r = X86EMUL_PROPAGATE_FAULT;
2153                        goto out;
2154                }
2155                ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2156                if (ret < 0) {
2157                        r = X86EMUL_UNHANDLEABLE;
2158                        goto out;
2159                }
2160
2161                bytes -= towrite;
2162                data += towrite;
2163                addr += towrite;
2164        }
2165out:
2166        return r;
2167}
2168
2169
2170static int emulator_read_emulated(unsigned long addr,
2171                                  void *val,
2172                                  unsigned int bytes,
2173                                  struct kvm_vcpu *vcpu)
2174{
2175        struct kvm_io_device *mmio_dev;
2176        gpa_t                 gpa;
2177
2178        if (vcpu->mmio_read_completed) {
2179                memcpy(val, vcpu->mmio_data, bytes);
2180                vcpu->mmio_read_completed = 0;
2181                return X86EMUL_CONTINUE;
2182        }
2183
2184        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2185
2186        /* For APIC access vmexit */
2187        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2188                goto mmio;
2189
2190        if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2191                                == X86EMUL_CONTINUE)
2192                return X86EMUL_CONTINUE;
2193        if (gpa == UNMAPPED_GVA)
2194                return X86EMUL_PROPAGATE_FAULT;
2195
2196mmio:
2197        /*
2198         * Is this MMIO handled locally?
2199         */
2200        mutex_lock(&vcpu->kvm->lock);
2201        mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
2202        if (mmio_dev) {
2203                kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2204                mutex_unlock(&vcpu->kvm->lock);
2205                return X86EMUL_CONTINUE;
2206        }
2207        mutex_unlock(&vcpu->kvm->lock);
2208
2209        vcpu->mmio_needed = 1;
2210        vcpu->mmio_phys_addr = gpa;
2211        vcpu->mmio_size = bytes;
2212        vcpu->mmio_is_write = 0;
2213
2214        return X86EMUL_UNHANDLEABLE;
2215}
2216
2217int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2218                          const void *val, int bytes)
2219{
2220        int ret;
2221
2222        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2223        if (ret < 0)
2224                return 0;
2225        kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2226        return 1;
2227}
2228
2229static int emulator_write_emulated_onepage(unsigned long addr,
2230                                           const void *val,
2231                                           unsigned int bytes,
2232                                           struct kvm_vcpu *vcpu)
2233{
2234        struct kvm_io_device *mmio_dev;
2235        gpa_t                 gpa;
2236
2237        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2238
2239        if (gpa == UNMAPPED_GVA) {
2240                kvm_inject_page_fault(vcpu, addr, 2);
2241                return X86EMUL_PROPAGATE_FAULT;
2242        }
2243
2244        /* For APIC access vmexit */
2245        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2246                goto mmio;
2247
2248        if (emulator_write_phys(vcpu, gpa, val, bytes))
2249                return X86EMUL_CONTINUE;
2250
2251mmio:
2252        /*
2253         * Is this MMIO handled locally?
2254         */
2255        mutex_lock(&vcpu->kvm->lock);
2256        mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2257        if (mmio_dev) {
2258                kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2259                mutex_unlock(&vcpu->kvm->lock);
2260                return X86EMUL_CONTINUE;
2261        }
2262        mutex_unlock(&vcpu->kvm->lock);
2263
2264        vcpu->mmio_needed = 1;
2265        vcpu->mmio_phys_addr = gpa;
2266        vcpu->mmio_size = bytes;
2267        vcpu->mmio_is_write = 1;
2268        memcpy(vcpu->mmio_data, val, bytes);
2269
2270        return X86EMUL_CONTINUE;
2271}
2272
2273int emulator_write_emulated(unsigned long addr,
2274                                   const void *val,
2275                                   unsigned int bytes,
2276                                   struct kvm_vcpu *vcpu)
2277{
2278        /* Crossing a page boundary? */
2279        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2280                int rc, now;
2281
2282                now = -addr & ~PAGE_MASK;
2283                rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2284                if (rc != X86EMUL_CONTINUE)
2285                        return rc;
2286                addr += now;
2287                val += now;
2288                bytes -= now;
2289        }
2290        return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2291}
2292EXPORT_SYMBOL_GPL(emulator_write_emulated);
2293
2294static int emulator_cmpxchg_emulated(unsigned long addr,
2295                                     const void *old,
2296                                     const void *new,
2297                                     unsigned int bytes,
2298                                     struct kvm_vcpu *vcpu)
2299{
2300        static int reported;
2301
2302        if (!reported) {
2303                reported = 1;
2304                printk(KERN_WARNING "kvm: emulating exchange as write\n");
2305        }
2306#ifndef CONFIG_X86_64
2307        /* guests cmpxchg8b have to be emulated atomically */
2308        if (bytes == 8) {
2309                gpa_t gpa;
2310                struct page *page;
2311                char *kaddr;
2312                u64 val;
2313
2314                gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2315
2316                if (gpa == UNMAPPED_GVA ||
2317                   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2318                        goto emul_write;
2319
2320                if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2321                        goto emul_write;
2322
2323                val = *(u64 *)new;
2324
2325                page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2326
2327                kaddr = kmap_atomic(page, KM_USER0);
2328                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2329                kunmap_atomic(kaddr, KM_USER0);
2330                kvm_release_page_dirty(page);
2331        }
2332emul_write:
2333#endif
2334
2335        return emulator_write_emulated(addr, new, bytes, vcpu);
2336}
2337
2338static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2339{
2340        return kvm_x86_ops->get_segment_base(vcpu, seg);
2341}
2342
2343int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2344{
2345        kvm_mmu_invlpg(vcpu, address);
2346        return X86EMUL_CONTINUE;
2347}
2348
2349int emulate_clts(struct kvm_vcpu *vcpu)
2350{
2351        KVMTRACE_0D(CLTS, vcpu, handler);
2352        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2353        return X86EMUL_CONTINUE;
2354}
2355
2356int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2357{
2358        struct kvm_vcpu *vcpu = ctxt->vcpu;
2359
2360        switch (dr) {
2361        case 0 ... 3:
2362                *dest = kvm_x86_ops->get_dr(vcpu, dr);
2363                return X86EMUL_CONTINUE;
2364        default:
2365                pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2366                return X86EMUL_UNHANDLEABLE;
2367        }
2368}
2369
2370int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2371{
2372        unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2373        int exception;
2374
2375        kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2376        if (exception) {
2377                /* FIXME: better handling */
2378                return X86EMUL_UNHANDLEABLE;
2379        }
2380        return X86EMUL_CONTINUE;
2381}
2382
2383void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2384{
2385        u8 opcodes[4];
2386        unsigned long rip = kvm_rip_read(vcpu);
2387        unsigned long rip_linear;
2388
2389        if (!printk_ratelimit())
2390                return;
2391
2392        rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2393
2394        kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2395
2396        printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2397               context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2398}
2399EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2400
2401static struct x86_emulate_ops emulate_ops = {
2402        .read_std            = kvm_read_guest_virt,
2403        .read_emulated       = emulator_read_emulated,
2404        .write_emulated      = emulator_write_emulated,
2405        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2406};
2407
2408static void cache_all_regs(struct kvm_vcpu *vcpu)
2409{
2410        kvm_register_read(vcpu, VCPU_REGS_RAX);
2411        kvm_register_read(vcpu, VCPU_REGS_RSP);
2412        kvm_register_read(vcpu, VCPU_REGS_RIP);
2413        vcpu->arch.regs_dirty = ~0;
2414}
2415
2416int emulate_instruction(struct kvm_vcpu *vcpu,
2417                        struct kvm_run *run,
2418                        unsigned long cr2,
2419                        u16 error_code,
2420                        int emulation_type)
2421{
2422        int r, shadow_mask;
2423        struct decode_cache *c;
2424
2425        kvm_clear_exception_queue(vcpu);
2426        vcpu->arch.mmio_fault_cr2 = cr2;
2427        /*
2428         * TODO: fix x86_emulate.c to use guest_read/write_register
2429         * instead of direct ->regs accesses, can save hundred cycles
2430         * on Intel for instructions that don't read/change RSP, for
2431         * for example.
2432         */
2433        cache_all_regs(vcpu);
2434
2435        vcpu->mmio_is_write = 0;
2436        vcpu->arch.pio.string = 0;
2437
2438        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2439                int cs_db, cs_l;
2440                kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2441
2442                vcpu->arch.emulate_ctxt.vcpu = vcpu;
2443                vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2444                vcpu->arch.emulate_ctxt.mode =
2445                        (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2446                        ? X86EMUL_MODE_REAL : cs_l
2447                        ? X86EMUL_MODE_PROT64 : cs_db
2448                        ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2449
2450                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2451
2452                /* Reject the instructions other than VMCALL/VMMCALL when
2453                 * try to emulate invalid opcode */
2454                c = &vcpu->arch.emulate_ctxt.decode;
2455                if ((emulation_type & EMULTYPE_TRAP_UD) &&
2456                    (!(c->twobyte && c->b == 0x01 &&
2457                      (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2458                       c->modrm_mod == 3 && c->modrm_rm == 1)))
2459                        return EMULATE_FAIL;
2460
2461                ++vcpu->stat.insn_emulation;
2462                if (r)  {
2463                        ++vcpu->stat.insn_emulation_fail;
2464                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2465                                return EMULATE_DONE;
2466                        return EMULATE_FAIL;
2467                }
2468        }
2469
2470        if (emulation_type & EMULTYPE_SKIP) {
2471                kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2472                return EMULATE_DONE;
2473        }
2474
2475        r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2476        shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2477
2478        if (r == 0)
2479                kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2480
2481        if (vcpu->arch.pio.string)
2482                return EMULATE_DO_MMIO;
2483
2484        if ((r || vcpu->mmio_is_write) && run) {
2485                run->exit_reason = KVM_EXIT_MMIO;
2486                run->mmio.phys_addr = vcpu->mmio_phys_addr;
2487                memcpy(run->mmio.data, vcpu->mmio_data, 8);
2488                run->mmio.len = vcpu->mmio_size;
2489                run->mmio.is_write = vcpu->mmio_is_write;
2490        }
2491
2492        if (r) {
2493                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2494                        return EMULATE_DONE;
2495                if (!vcpu->mmio_needed) {
2496                        kvm_report_emulation_failure(vcpu, "mmio");
2497                        return EMULATE_FAIL;
2498                }
2499                return EMULATE_DO_MMIO;
2500        }
2501
2502        kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2503
2504        if (vcpu->mmio_is_write) {
2505                vcpu->mmio_needed = 0;
2506                return EMULATE_DO_MMIO;
2507        }
2508
2509        return EMULATE_DONE;
2510}
2511EXPORT_SYMBOL_GPL(emulate_instruction);
2512
2513static int pio_copy_data(struct kvm_vcpu *vcpu)
2514{
2515        void *p = vcpu->arch.pio_data;
2516        gva_t q = vcpu->arch.pio.guest_gva;
2517        unsigned bytes;
2518        int ret;
2519
2520        bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2521        if (vcpu->arch.pio.in)
2522                ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2523        else
2524                ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2525        return ret;
2526}
2527
2528int complete_pio(struct kvm_vcpu *vcpu)
2529{
2530        struct kvm_pio_request *io = &vcpu->arch.pio;
2531        long delta;
2532        int r;
2533        unsigned long val;
2534
2535        if (!io->string) {
2536                if (io->in) {
2537                        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2538                        memcpy(&val, vcpu->arch.pio_data, io->size);
2539                        kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2540                }
2541        } else {
2542                if (io->in) {
2543                        r = pio_copy_data(vcpu);
2544                        if (r)
2545                                return r;
2546                }
2547
2548                delta = 1;
2549                if (io->rep) {
2550                        delta *= io->cur_count;
2551                        /*
2552                         * The size of the register should really depend on
2553                         * current address size.
2554                         */
2555                        val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2556                        val -= delta;
2557                        kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2558                }
2559                if (io->down)
2560                        delta = -delta;
2561                delta *= io->size;
2562                if (io->in) {
2563                        val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2564                        val += delta;
2565                        kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2566                } else {
2567                        val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2568                        val += delta;
2569                        kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2570                }
2571        }
2572
2573        io->count -= io->cur_count;
2574        io->cur_count = 0;
2575
2576        return 0;
2577}
2578
2579static void kernel_pio(struct kvm_io_device *pio_dev,
2580                       struct kvm_vcpu *vcpu,
2581                       void *pd)
2582{
2583        /* TODO: String I/O for in kernel device */
2584
2585        mutex_lock(&vcpu->kvm->lock);
2586        if (vcpu->arch.pio.in)
2587                kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2588                                  vcpu->arch.pio.size,
2589                                  pd);
2590        else
2591                kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2592                                   vcpu->arch.pio.size,
2593                                   pd);
2594        mutex_unlock(&vcpu->kvm->lock);
2595}
2596
2597static void pio_string_write(struct kvm_io_device *pio_dev,
2598                             struct kvm_vcpu *vcpu)
2599{
2600        struct kvm_pio_request *io = &vcpu->arch.pio;
2601        void *pd = vcpu->arch.pio_data;
2602        int i;
2603
2604        mutex_lock(&vcpu->kvm->lock);
2605        for (i = 0; i < io->cur_count; i++) {
2606                kvm_iodevice_write(pio_dev, io->port,
2607                                   io->size,
2608                                   pd);
2609                pd += io->size;
2610        }
2611        mutex_unlock(&vcpu->kvm->lock);
2612}
2613
2614static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2615                                               gpa_t addr, int len,
2616                                               int is_write)
2617{
2618        return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2619}
2620
2621int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2622                  int size, unsigned port)
2623{
2624        struct kvm_io_device *pio_dev;
2625        unsigned long val;
2626
2627        vcpu->run->exit_reason = KVM_EXIT_IO;
2628        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2629        vcpu->run->io.size = vcpu->arch.pio.size = size;
2630        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2631        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2632        vcpu->run->io.port = vcpu->arch.pio.port = port;
2633        vcpu->arch.pio.in = in;
2634        vcpu->arch.pio.string = 0;
2635        vcpu->arch.pio.down = 0;
2636        vcpu->arch.pio.rep = 0;
2637
2638        if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2639                KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2640                            handler);
2641        else
2642                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2643                            handler);
2644
2645        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2646        memcpy(vcpu->arch.pio_data, &val, 4);
2647
2648        pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2649        if (pio_dev) {
2650                kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2651                complete_pio(vcpu);
2652                return 1;
2653        }
2654        return 0;
2655}
2656EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2657
2658int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2659                  int size, unsigned long count, int down,
2660                  gva_t address, int rep, unsigned port)
2661{
2662        unsigned now, in_page;
2663        int ret = 0;
2664        struct kvm_io_device *pio_dev;
2665
2666        vcpu->run->exit_reason = KVM_EXIT_IO;
2667        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2668        vcpu->run->io.size = vcpu->arch.pio.size = size;
2669        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2670        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2671        vcpu->run->io.port = vcpu->arch.pio.port = port;
2672        vcpu->arch.pio.in = in;
2673        vcpu->arch.pio.string = 1;
2674        vcpu->arch.pio.down = down;
2675        vcpu->arch.pio.rep = rep;
2676
2677        if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2678                KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2679                            handler);
2680        else
2681                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2682                            handler);
2683
2684        if (!count) {
2685                kvm_x86_ops->skip_emulated_instruction(vcpu);
2686                return 1;
2687        }
2688
2689        if (!down)
2690                in_page = PAGE_SIZE - offset_in_page(address);
2691        else
2692                in_page = offset_in_page(address) + size;
2693        now = min(count, (unsigned long)in_page / size);
2694        if (!now)
2695                now = 1;
2696        if (down) {
2697                /*
2698                 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2699                 */
2700                pr_unimpl(vcpu, "guest string pio down\n");
2701                kvm_inject_gp(vcpu, 0);
2702                return 1;
2703        }
2704        vcpu->run->io.count = now;
2705        vcpu->arch.pio.cur_count = now;
2706
2707        if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2708                kvm_x86_ops->skip_emulated_instruction(vcpu);
2709
2710        vcpu->arch.pio.guest_gva = address;
2711
2712        pio_dev = vcpu_find_pio_dev(vcpu, port,
2713                                    vcpu->arch.pio.cur_count,
2714                                    !vcpu->arch.pio.in);
2715        if (!vcpu->arch.pio.in) {
2716                /* string PIO write */
2717                ret = pio_copy_data(vcpu);
2718                if (ret == X86EMUL_PROPAGATE_FAULT) {
2719                        kvm_inject_gp(vcpu, 0);
2720                        return 1;
2721                }
2722                if (ret == 0 && pio_dev) {
2723                        pio_string_write(pio_dev, vcpu);
2724                        complete_pio(vcpu);
2725                        if (vcpu->arch.pio.count == 0)
2726                                ret = 1;
2727                }
2728        } else if (pio_dev)
2729                pr_unimpl(vcpu, "no string pio read support yet, "
2730                       "port %x size %d count %ld\n",
2731                        port, size, count);
2732
2733        return ret;
2734}
2735EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2736
/*
 * Intentionally empty cross-CPU callback.  It is passed to
 * smp_call_function_single() where only the interruption of the target
 * CPU matters, not any work done by the handler.
 */
static void bounce_off(void *info)
{
        (void)info;
}
2741
2742static unsigned int  ref_freq;
2743static unsigned long tsc_khz_ref;
2744
2745static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
2746                                     void *data)
2747{
2748        struct cpufreq_freqs *freq = data;
2749        struct kvm *kvm;
2750        struct kvm_vcpu *vcpu;
2751        int i, send_ipi = 0;
2752
2753        if (!ref_freq)
2754                ref_freq = freq->old;
2755
2756        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
2757                return 0;
2758        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
2759                return 0;
2760        per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
2761
2762        spin_lock(&kvm_lock);
2763        list_for_each_entry(kvm, &vm_list, vm_list) {
2764                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2765                        vcpu = kvm->vcpus[i];
2766                        if (!vcpu)
2767                                continue;
2768                        if (vcpu->cpu != freq->cpu)
2769                                continue;
2770                        if (!kvm_request_guest_time_update(vcpu))
2771                                continue;
2772                        if (vcpu->cpu != smp_processor_id())
2773                                send_ipi++;
2774                }
2775        }
2776        spin_unlock(&kvm_lock);
2777
2778        if (freq->old < freq->new && send_ipi) {
2779                /*
2780                 * We upscale the frequency.  Must make the guest
2781                 * doesn't see old kvmclock values while running with
2782                 * the new frequency, otherwise we risk the guest sees
2783                 * time go backwards.
2784                 *
2785                 * In case we update the frequency for another cpu
2786                 * (which might be in guest context) send an interrupt
2787                 * to kick the cpu out of guest context.  Next time
2788                 * guest context is entered kvmclock will be updated,
2789                 * so the guest will not see stale values.
2790                 */
2791                smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
2792        }
2793        return 0;
2794}
2795
2796static struct notifier_block kvmclock_cpufreq_notifier_block = {
2797        .notifier_call  = kvmclock_cpufreq_notifier
2798};
2799
2800int kvm_arch_init(void *opaque)
2801{
2802        int r, cpu;
2803        struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2804
2805        if (kvm_x86_ops) {
2806                printk(KERN_ERR "kvm: already loaded the other module\n");
2807                r = -EEXIST;
2808                goto out;
2809        }
2810
2811        if (!ops->cpu_has_kvm_support()) {
2812                printk(KERN_ERR "kvm: no hardware support\n");
2813                r = -EOPNOTSUPP;
2814                goto out;
2815        }
2816        if (ops->disabled_by_bios()) {
2817                printk(KERN_ERR "kvm: disabled by bios\n");
2818                r = -EOPNOTSUPP;
2819                goto out;
2820        }
2821
2822        r = kvm_mmu_module_init();
2823        if (r)
2824                goto out;
2825
2826        kvm_init_msr_list();
2827
2828        kvm_x86_ops = ops;
2829        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2830        kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2831        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2832                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
2833
2834        for_each_possible_cpu(cpu)
2835                per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
2836        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2837                tsc_khz_ref = tsc_khz;
2838                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
2839                                          CPUFREQ_TRANSITION_NOTIFIER);
2840        }
2841
2842        return 0;
2843
2844out:
2845        return r;
2846}
2847
2848void kvm_arch_exit(void)
2849{
2850        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
2851                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
2852                                            CPUFREQ_TRANSITION_NOTIFIER);
2853        kvm_x86_ops = NULL;
2854        kvm_mmu_module_exit();
2855}
2856
2857int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2858{
2859        ++vcpu->stat.halt_exits;
2860        KVMTRACE_0D(HLT, vcpu, handler);
2861        if (irqchip_in_kernel(vcpu->kvm)) {
2862                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2863                return 1;
2864        } else {
2865                vcpu->run->exit_reason = KVM_EXIT_HLT;
2866                return 0;
2867        }
2868}
2869EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2870
2871static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2872                           unsigned long a1)
2873{
2874        if (is_long_mode(vcpu))
2875                return a0;
2876        else
2877                return a0 | ((gpa_t)a1 << 32);
2878}
2879
2880int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2881{
2882        unsigned long nr, a0, a1, a2, a3, ret;
2883        int r = 1;
2884
2885        nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2886        a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2887        a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2888        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2889        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2890
2891        KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2892
2893        if (!is_long_mode(vcpu)) {
2894                nr &= 0xFFFFFFFF;
2895                a0 &= 0xFFFFFFFF;
2896                a1 &= 0xFFFFFFFF;
2897                a2 &= 0xFFFFFFFF;
2898                a3 &= 0xFFFFFFFF;
2899        }
2900
2901        switch (nr) {
2902        case KVM_HC_VAPIC_POLL_IRQ:
2903                ret = 0;
2904                break;
2905        case KVM_HC_MMU_OP:
2906                r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2907                break;
2908        default:
2909                ret = -KVM_ENOSYS;
2910                break;
2911        }
2912        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2913        ++vcpu->stat.hypercalls;
2914        return r;
2915}
2916EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2917
2918int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2919{
2920        char instruction[3];
2921        int ret = 0;
2922        unsigned long rip = kvm_rip_read(vcpu);
2923
2924
2925        /*
2926         * Blow out the MMU to ensure that no other VCPU has an active mapping
2927         * to ensure that the updated hypercall appears atomically across all
2928         * VCPUs.
2929         */
2930        kvm_mmu_zap_all(vcpu->kvm);
2931
2932        kvm_x86_ops->patch_hypercall(vcpu, instruction);
2933        if (emulator_write_emulated(rip, instruction, 3, vcpu)
2934            != X86EMUL_CONTINUE)
2935                ret = -EFAULT;
2936
2937        return ret;
2938}
2939
2940static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2941{
2942        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2943}
2944
2945void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2946{
2947        struct descriptor_table dt = { limit, base };
2948
2949        kvm_x86_ops->set_gdt(vcpu, &dt);
2950}
2951
2952void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2953{
2954        struct descriptor_table dt = { limit, base };
2955
2956        kvm_x86_ops->set_idt(vcpu, &dt);
2957}
2958
2959void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2960                   unsigned long *rflags)
2961{
2962        kvm_lmsw(vcpu, msw);
2963        *rflags = kvm_x86_ops->get_rflags(vcpu);
2964}
2965
2966unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2967{
2968        unsigned long value;
2969
2970        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2971        switch (cr) {
2972        case 0:
2973                value = vcpu->arch.cr0;
2974                break;
2975        case 2:
2976                value = vcpu->arch.cr2;
2977                break;
2978        case 3:
2979                value = vcpu->arch.cr3;
2980                break;
2981        case 4:
2982                value = vcpu->arch.cr4;
2983                break;
2984        case 8:
2985                value = kvm_get_cr8(vcpu);
2986                break;
2987        default:
2988                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2989                return 0;
2990        }
2991        KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2992                    (u32)((u64)value >> 32), handler);
2993
2994        return value;
2995}
2996
2997void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2998                     unsigned long *rflags)
2999{
3000        KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
3001                    (u32)((u64)val >> 32), handler);
3002
3003        switch (cr) {
3004        case 0:
3005                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3006                *rflags = kvm_x86_ops->get_rflags(vcpu);
3007                break;
3008        case 2:
3009                vcpu->arch.cr2 = val;
3010                break;
3011        case 3:
3012                kvm_set_cr3(vcpu, val);
3013                break;
3014        case 4:
3015                kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3016                break;
3017        case 8:
3018                kvm_set_cr8(vcpu, val & 0xfUL);
3019                break;
3020        default:
3021                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3022        }
3023}
3024
3025static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3026{
3027        struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3028        int j, nent = vcpu->arch.cpuid_nent;
3029
3030        e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3031        /* when no next entry is found, the current entry[i] is reselected */
3032        for (j = i + 1; ; j = (j + 1) % nent) {
3033                struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3034                if (ej->function == e->function) {
3035                        ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3036                        return j;
3037                }
3038        }
3039        return 0; /* silence gcc, even though control never reaches here */
3040}
3041
3042/* find an entry with matching function, matching index (if needed), and that
3043 * should be read next (if it's stateful) */
3044static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3045        u32 function, u32 index)
3046{
3047        if (e->function != function)
3048                return 0;
3049        if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3050                return 0;
3051        if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3052            !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3053                return 0;
3054        return 1;
3055}
3056
3057struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3058                                              u32 function, u32 index)
3059{
3060        int i;
3061        struct kvm_cpuid_entry2 *best = NULL;
3062
3063        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3064                struct kvm_cpuid_entry2 *e;
3065
3066                e = &vcpu->arch.cpuid_entries[i];
3067                if (is_matching_cpuid_entry(e, function, index)) {
3068                        if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3069                                move_to_next_stateful_cpuid_entry(vcpu, i);
3070                        best = e;
3071                        break;
3072                }
3073                /*
3074                 * Both basic or both extended?
3075                 */
3076                if (((e->function ^ function) & 0x80000000) == 0)
3077                        if (!best || e->function > best->function)
3078                                best = e;
3079        }
3080        return best;
3081}
3082
3083int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3084{
3085        struct kvm_cpuid_entry2 *best;
3086
3087        best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3088        if (best)
3089                return best->eax & 0xff;
3090        return 36;
3091}
3092
3093void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3094{
3095        u32 function, index;
3096        struct kvm_cpuid_entry2 *best;
3097
3098        function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3099        index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3100        kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3101        kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3102        kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3103        kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3104        best = kvm_find_cpuid_entry(vcpu, function, index);
3105        if (best) {
3106                kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3107                kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3108                kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3109                kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3110        }
3111        kvm_x86_ops->skip_emulated_instruction(vcpu);
3112        KVMTRACE_5D(CPUID, vcpu, function,
3113                    (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
3114                    (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
3115                    (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
3116                    (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
3117}
3118EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3119
3120/*
3121 * Check if userspace requested an interrupt window, and that the
3122 * interrupt window is open.
3123 *
3124 * No need to exit to userspace if we already have an interrupt queued.
3125 */
3126static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3127                                          struct kvm_run *kvm_run)
3128{
3129        return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3130                kvm_run->request_interrupt_window &&
3131                kvm_arch_interrupt_allowed(vcpu));
3132}
3133
3134static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3135                              struct kvm_run *kvm_run)
3136{
3137        kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3138        kvm_run->cr8 = kvm_get_cr8(vcpu);
3139        kvm_run->apic_base = kvm_get_apic_base(vcpu);
3140        if (irqchip_in_kernel(vcpu->kvm))
3141                kvm_run->ready_for_interrupt_injection = 1;
3142        else
3143                kvm_run->ready_for_interrupt_injection =
3144                        kvm_arch_interrupt_allowed(vcpu) &&
3145                        !kvm_cpu_has_interrupt(vcpu) &&
3146                        !kvm_event_needs_reinjection(vcpu);
3147}
3148
3149static void vapic_enter(struct kvm_vcpu *vcpu)
3150{
3151        struct kvm_lapic *apic = vcpu->arch.apic;
3152        struct page *page;
3153
3154        if (!apic || !apic->vapic_addr)
3155                return;
3156
3157        page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3158
3159        vcpu->arch.apic->vapic_page = page;
3160}
3161
3162static void vapic_exit(struct kvm_vcpu *vcpu)
3163{
3164        struct kvm_lapic *apic = vcpu->arch.apic;
3165
3166        if (!apic || !apic->vapic_addr)
3167                return;
3168
3169        down_read(&vcpu->kvm->slots_lock);
3170        kvm_release_page_dirty(apic->vapic_page);
3171        mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3172        up_read(&vcpu->kvm->slots_lock);
3173}
3174
3175static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3176{
3177        int max_irr, tpr;
3178
3179        if (!kvm_x86_ops->update_cr8_intercept)
3180                return;
3181
3182        if (!vcpu->arch.apic->vapic_addr)
3183                max_irr = kvm_lapic_find_highest_irr(vcpu);
3184        else
3185                max_irr = -1;
3186
3187        if (max_irr != -1)
3188                max_irr >>= 4;
3189
3190        tpr = kvm_lapic_get_cr8(vcpu);
3191
3192        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3193}
3194
3195static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3196{
3197        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3198                kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3199
3200        /* try to reinject previous events if any */
3201        if (vcpu->arch.nmi_injected) {
3202                kvm_x86_ops->set_nmi(vcpu);
3203                return;
3204        }
3205
3206        if (vcpu->arch.interrupt.pending) {
3207                kvm_x86_ops->set_irq(vcpu);
3208                return;
3209        }
3210
3211        /* try to inject new event if pending */
3212        if (vcpu->arch.nmi_pending) {
3213                if (kvm_x86_ops->nmi_allowed(vcpu)) {
3214                        vcpu->arch.nmi_pending = false;
3215                        vcpu->arch.nmi_injected = true;
3216                        kvm_x86_ops->set_nmi(vcpu);
3217                }
3218        } else if (kvm_cpu_has_interrupt(vcpu)) {
3219                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3220                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3221                                            false);
3222                        kvm_x86_ops->set_irq(vcpu);
3223                }
3224        }
3225}
3226
3227static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3228{
3229        int r;
3230        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3231                kvm_run->request_interrupt_window;
3232
3233        if (vcpu->requests)
3234                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3235                        kvm_mmu_unload(vcpu);
3236
3237        r = kvm_mmu_reload(vcpu);
3238        if (unlikely(r))
3239                goto out;
3240
3241        if (vcpu->requests) {
3242                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3243                        __kvm_migrate_timers(vcpu);
3244                if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3245                        kvm_write_guest_time(vcpu);
3246                if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3247                        kvm_mmu_sync_roots(vcpu);
3248                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3249                        kvm_x86_ops->tlb_flush(vcpu);
3250                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3251                                       &vcpu->requests)) {
3252                        kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
3253                        r = 0;
3254                        goto out;
3255                }
3256                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3257                        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
3258                        r = 0;
3259                        goto out;
3260                }
3261        }
3262
3263        preempt_disable();
3264
3265        kvm_x86_ops->prepare_guest_switch(vcpu);
3266        kvm_load_guest_fpu(vcpu);
3267
3268        local_irq_disable();
3269
3270        clear_bit(KVM_REQ_KICK, &vcpu->requests);
3271        smp_mb__after_clear_bit();
3272
3273        if (vcpu->requests || need_resched() || signal_pending(current)) {
3274                local_irq_enable();
3275                preempt_enable();
3276                r = 1;
3277                goto out;
3278        }
3279
3280        if (vcpu->arch.exception.pending)
3281                __queue_exception(vcpu);
3282        else
3283                inject_pending_irq(vcpu, kvm_run);
3284
3285        /* enable NMI/IRQ window open exits if needed */
3286        if (vcpu->arch.nmi_pending)
3287                kvm_x86_ops->enable_nmi_window(vcpu);
3288        else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3289                kvm_x86_ops->enable_irq_window(vcpu);
3290
3291        if (kvm_lapic_enabled(vcpu)) {
3292                update_cr8_intercept(vcpu);
3293                kvm_lapic_sync_to_vapic(vcpu);
3294        }
3295
3296        up_read(&vcpu->kvm->slots_lock);
3297
3298        kvm_guest_enter();
3299
3300        get_debugreg(vcpu->arch.host_dr6, 6);
3301        get_debugreg(vcpu->arch.host_dr7, 7);
3302        if (unlikely(vcpu->arch.switch_db_regs)) {
3303                get_debugreg(vcpu->arch.host_db[0], 0);
3304                get_debugreg(vcpu->arch.host_db[1], 1);
3305                get_debugreg(vcpu->arch.host_db[2], 2);
3306                get_debugreg(vcpu->arch.host_db[3], 3);
3307
3308                set_debugreg(0, 7);
3309                set_debugreg(vcpu->arch.eff_db[0], 0);
3310                set_debugreg(vcpu->arch.eff_db[1], 1);
3311                set_debugreg(vcpu->arch.eff_db[2], 2);
3312                set_debugreg(vcpu->arch.eff_db[3], 3);
3313        }
3314
3315        KVMTRACE_0D(VMENTRY, vcpu, entryexit);
3316        kvm_x86_ops->run(vcpu, kvm_run);
3317
3318        if (unlikely(vcpu->arch.switch_db_regs)) {
3319                set_debugreg(0, 7);
3320                set_debugreg(vcpu->arch.host_db[0], 0);
3321                set_debugreg(vcpu->arch.host_db[1], 1);
3322                set_debugreg(vcpu->arch.host_db[2], 2);
3323                set_debugreg(vcpu->arch.host_db[3], 3);
3324        }
3325        set_debugreg(vcpu->arch.host_dr6, 6);
3326        set_debugreg(vcpu->arch.host_dr7, 7);
3327
3328        set_bit(KVM_REQ_KICK, &vcpu->requests);
3329        local_irq_enable();
3330
3331        ++vcpu->stat.exits;
3332
3333        /*
3334         * We must have an instruction between local_irq_enable() and
3335         * kvm_guest_exit(), so the timer interrupt isn't delayed by
3336         * the interrupt shadow.  The stat.exits increment will do nicely.
3337         * But we need to prevent reordering, hence this barrier():
3338         */
3339        barrier();
3340
3341        kvm_guest_exit();
3342
3343        preempt_enable();
3344
3345        down_read(&vcpu->kvm->slots_lock);
3346
3347        /*
3348         * Profile KVM exit RIPs:
3349         */
3350        if (unlikely(prof_on == KVM_PROFILING)) {
3351                unsigned long rip = kvm_rip_read(vcpu);
3352                profile_hit(KVM_PROFILING, (void *)rip);
3353        }
3354
3355
3356        kvm_lapic_sync_from_vapic(vcpu);
3357
3358        r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
3359out:
3360        return r;
3361}
3362
3363
3364static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3365{
3366        int r;
3367
3368        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3369                pr_debug("vcpu %d received sipi with vector # %x\n",
3370                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
3371                kvm_lapic_reset(vcpu);
3372                r = kvm_arch_vcpu_reset(vcpu);
3373                if (r)
3374                        return r;
3375                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3376        }
3377
3378        down_read(&vcpu->kvm->slots_lock);
3379        vapic_enter(vcpu);
3380
3381        r = 1;
3382        while (r > 0) {
3383                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3384                        r = vcpu_enter_guest(vcpu, kvm_run);
3385                else {
3386                        up_read(&vcpu->kvm->slots_lock);
3387                        kvm_vcpu_block(vcpu);
3388                        down_read(&vcpu->kvm->slots_lock);
3389                        if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3390                        {
3391                                switch(vcpu->arch.mp_state) {
3392                                case KVM_MP_STATE_HALTED:
3393                                        vcpu->arch.mp_state =
3394                                                KVM_MP_STATE_RUNNABLE;
3395                                case KVM_MP_STATE_RUNNABLE:
3396                                        break;
3397                                case KVM_MP_STATE_SIPI_RECEIVED:
3398                                default:
3399                                        r = -EINTR;
3400                                        break;
3401                                }
3402                        }
3403                }
3404
3405                if (r <= 0)
3406                        break;
3407
3408                clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3409                if (kvm_cpu_has_pending_timer(vcpu))
3410                        kvm_inject_pending_timer_irqs(vcpu);
3411
3412                if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3413                        r = -EINTR;
3414                        kvm_run->exit_reason = KVM_EXIT_INTR;
3415                        ++vcpu->stat.request_irq_exits;
3416                }
3417                if (signal_pending(current)) {
3418                        r = -EINTR;
3419                        kvm_run->exit_reason = KVM_EXIT_INTR;
3420                        ++vcpu->stat.signal_exits;
3421                }
3422                if (need_resched()) {
3423                        up_read(&vcpu->kvm->slots_lock);
3424                        kvm_resched(vcpu);
3425                        down_read(&vcpu->kvm->slots_lock);
3426                }
3427        }
3428
3429        up_read(&vcpu->kvm->slots_lock);
3430        post_kvm_run_save(vcpu, kvm_run);
3431
3432        vapic_exit(vcpu);
3433
3434        return r;
3435}
3436
3437int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3438{
3439        int r;
3440        sigset_t sigsaved;
3441
3442        vcpu_load(vcpu);
3443
3444        if (vcpu->sigset_active)
3445                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3446
3447        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3448                kvm_vcpu_block(vcpu);
3449                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3450                r = -EAGAIN;
3451                goto out;
3452        }
3453
3454        /* re-sync apic's tpr */
3455        if (!irqchip_in_kernel(vcpu->kvm))
3456                kvm_set_cr8(vcpu, kvm_run->cr8);
3457
3458        if (vcpu->arch.pio.cur_count) {
3459                r = complete_pio(vcpu);
3460                if (r)
3461                        goto out;
3462        }
3463#if CONFIG_HAS_IOMEM
3464        if (vcpu->mmio_needed) {
3465                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3466                vcpu->mmio_read_completed = 1;
3467                vcpu->mmio_needed = 0;
3468
3469                down_read(&vcpu->kvm->slots_lock);
3470                r = emulate_instruction(vcpu, kvm_run,
3471                                        vcpu->arch.mmio_fault_cr2, 0,
3472                                        EMULTYPE_NO_DECODE);
3473                up_read(&vcpu->kvm->slots_lock);
3474                if (r == EMULATE_DO_MMIO) {
3475                        /*
3476                         * Read-modify-write.  Back to userspace.
3477                         */
3478                        r = 0;
3479                        goto out;
3480                }
3481        }
3482#endif
3483        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3484                kvm_register_write(vcpu, VCPU_REGS_RAX,
3485                                     kvm_run->hypercall.ret);
3486
3487        r = __vcpu_run(vcpu, kvm_run);
3488
3489out:
3490        if (vcpu->sigset_active)
3491                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3492
3493        vcpu_put(vcpu);
3494        return r;
3495}
3496
3497int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3498{
3499        vcpu_load(vcpu);
3500
3501        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3502        regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3503        regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3504        regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3505        regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3506        regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3507        regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3508        regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3509#ifdef CONFIG_X86_64
3510        regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3511        regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3512        regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3513        regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3514        regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3515        regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3516        regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3517        regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3518#endif
3519
3520        regs->rip = kvm_rip_read(vcpu);
3521        regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3522
3523        /*
3524         * Don't leak debug flags in case they were set for guest debugging
3525         */
3526        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3527                regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3528
3529        vcpu_put(vcpu);
3530
3531        return 0;
3532}
3533
3534int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3535{
3536        vcpu_load(vcpu);
3537
3538        kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3539        kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3540        kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3541        kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3542        kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3543        kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3544        kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3545        kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3546#ifdef CONFIG_X86_64
3547        kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3548        kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3549        kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3550        kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3551        kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3552        kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3553        kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3554        kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3555
3556#endif
3557
3558        kvm_rip_write(vcpu, regs->rip);
3559        kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3560
3561
3562        vcpu->arch.exception.pending = false;
3563
3564        vcpu_put(vcpu);
3565
3566        return 0;
3567}
3568
3569void kvm_get_segment(struct kvm_vcpu *vcpu,
3570                     struct kvm_segment *var, int seg)
3571{
3572        kvm_x86_ops->get_segment(vcpu, var, seg);
3573}
3574
3575void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3576{
3577        struct kvm_segment cs;
3578
3579        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3580        *db = cs.db;
3581        *l = cs.l;
3582}
3583EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3584
3585int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3586                                  struct kvm_sregs *sregs)
3587{
3588        struct descriptor_table dt;
3589
3590        vcpu_load(vcpu);
3591
3592        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3593        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3594        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3595        kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3596        kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3597        kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3598
3599        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3600        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3601
3602        kvm_x86_ops->get_idt(vcpu, &dt);
3603        sregs->idt.limit = dt.limit;
3604        sregs->idt.base = dt.base;
3605        kvm_x86_ops->get_gdt(vcpu, &dt);
3606        sregs->gdt.limit = dt.limit;
3607        sregs->gdt.base = dt.base;
3608
3609        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3610        sregs->cr0 = vcpu->arch.cr0;
3611        sregs->cr2 = vcpu->arch.cr2;
3612        sregs->cr3 = vcpu->arch.cr3;
3613        sregs->cr4 = vcpu->arch.cr4;
3614        sregs->cr8 = kvm_get_cr8(vcpu);
3615        sregs->efer = vcpu->arch.shadow_efer;
3616        sregs->apic_base = kvm_get_apic_base(vcpu);
3617
3618        memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3619
3620        if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3621                set_bit(vcpu->arch.interrupt.nr,
3622                        (unsigned long *)sregs->interrupt_bitmap);
3623
3624        vcpu_put(vcpu);
3625
3626        return 0;
3627}
3628
3629int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3630                                    struct kvm_mp_state *mp_state)
3631{
3632        vcpu_load(vcpu);
3633        mp_state->mp_state = vcpu->arch.mp_state;
3634        vcpu_put(vcpu);
3635        return 0;
3636}
3637
3638int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3639                                    struct kvm_mp_state *mp_state)
3640{
3641        vcpu_load(vcpu);
3642        vcpu->arch.mp_state = mp_state->mp_state;
3643        vcpu_put(vcpu);
3644        return 0;
3645}
3646
3647static void kvm_set_segment(struct kvm_vcpu *vcpu,
3648                        struct kvm_segment *var, int seg)
3649{
3650        kvm_x86_ops->set_segment(vcpu, var, seg);
3651}
3652
3653static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3654                                   struct kvm_segment *kvm_desct)
3655{
3656        kvm_desct->base = seg_desc->base0;
3657        kvm_desct->base |= seg_desc->base1 << 16;
3658        kvm_desct->base |= seg_desc->base2 << 24;
3659        kvm_desct->limit = seg_desc->limit0;
3660        kvm_desct->limit |= seg_desc->limit << 16;
3661        if (seg_desc->g) {
3662                kvm_desct->limit <<= 12;
3663                kvm_desct->limit |= 0xfff;
3664        }
3665        kvm_desct->selector = selector;
3666        kvm_desct->type = seg_desc->type;
3667        kvm_desct->present = seg_desc->p;
3668        kvm_desct->dpl = seg_desc->dpl;
3669        kvm_desct->db = seg_desc->d;
3670        kvm_desct->s = seg_desc->s;
3671        kvm_desct->l = seg_desc->l;
3672        kvm_desct->g = seg_desc->g;
3673        kvm_desct->avl = seg_desc->avl;
3674        if (!selector)
3675                kvm_desct->unusable = 1;
3676        else
3677                kvm_desct->unusable = 0;
3678        kvm_desct->padding = 0;
3679}
3680
3681static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3682                                          u16 selector,
3683                                          struct descriptor_table *dtable)
3684{
3685        if (selector & 1 << 2) {
3686                struct kvm_segment kvm_seg;
3687
3688                kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3689
3690                if (kvm_seg.unusable)
3691                        dtable->limit = 0;
3692                else
3693                        dtable->limit = kvm_seg.limit;
3694                dtable->base = kvm_seg.base;
3695        }
3696        else
3697                kvm_x86_ops->get_gdt(vcpu, dtable);
3698}
3699
3700/* allowed just for 8 bytes segments */
3701static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3702                                         struct desc_struct *seg_desc)
3703{
3704        gpa_t gpa;
3705        struct descriptor_table dtable;
3706        u16 index = selector >> 3;
3707
3708        get_segment_descriptor_dtable(vcpu, selector, &dtable);
3709
3710        if (dtable.limit < index * 8 + 7) {
3711                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3712                return 1;
3713        }
3714        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3715        gpa += index * 8;
3716        return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3717}
3718
3719/* allowed just for 8 bytes segments */
3720static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3721                                         struct desc_struct *seg_desc)
3722{
3723        gpa_t gpa;
3724        struct descriptor_table dtable;
3725        u16 index = selector >> 3;
3726
3727        get_segment_descriptor_dtable(vcpu, selector, &dtable);
3728
3729        if (dtable.limit < index * 8 + 7)
3730                return 1;
3731        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3732        gpa += index * 8;
3733        return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3734}
3735
3736static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3737                             struct desc_struct *seg_desc)
3738{
3739        u32 base_addr;
3740
3741        base_addr = seg_desc->base0;
3742        base_addr |= (seg_desc->base1 << 16);
3743        base_addr |= (seg_desc->base2 << 24);
3744
3745        return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3746}
3747
3748static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3749{
3750        struct kvm_segment kvm_seg;
3751
3752        kvm_get_segment(vcpu, &kvm_seg, seg);
3753        return kvm_seg.selector;
3754}
3755
3756static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3757                                                u16 selector,
3758                                                struct kvm_segment *kvm_seg)
3759{
3760        struct desc_struct seg_desc;
3761
3762        if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3763                return 1;
3764        seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3765        return 0;
3766}
3767
3768static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3769{
3770        struct kvm_segment segvar = {
3771                .base = selector << 4,
3772                .limit = 0xffff,
3773                .selector = selector,
3774                .type = 3,
3775                .present = 1,
3776                .dpl = 3,
3777                .db = 0,
3778                .s = 1,
3779                .l = 0,
3780                .g = 0,
3781                .avl = 0,
3782                .unusable = 0,
3783        };
3784        kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3785        return 0;
3786}
3787
3788int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3789                                int type_bits, int seg)
3790{
3791        struct kvm_segment kvm_seg;
3792
3793        if (!(vcpu->arch.cr0 & X86_CR0_PE))
3794                return kvm_load_realmode_segment(vcpu, selector, seg);
3795        if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3796                return 1;
3797        kvm_seg.type |= type_bits;
3798
3799        if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3800            seg != VCPU_SREG_LDTR)
3801                if (!kvm_seg.s)
3802                        kvm_seg.unusable = 1;
3803
3804        kvm_set_segment(vcpu, &kvm_seg, seg);
3805        return 0;
3806}
3807
3808static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3809                                struct tss_segment_32 *tss)
3810{
3811        tss->cr3 = vcpu->arch.cr3;
3812        tss->eip = kvm_rip_read(vcpu);
3813        tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3814        tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3815        tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3816        tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3817        tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3818        tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3819        tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3820        tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3821        tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3822        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3823        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3824        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3825        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3826        tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3827        tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3828        tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3829}
3830
3831static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3832                                  struct tss_segment_32 *tss)
3833{
3834        kvm_set_cr3(vcpu, tss->cr3);
3835
3836        kvm_rip_write(vcpu, tss->eip);
3837        kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3838
3839        kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3840        kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3841        kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3842        kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3843        kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3844        kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3845        kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3846        kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3847
3848        if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3849                return 1;
3850
3851        if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3852                return 1;
3853
3854        if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3855                return 1;
3856
3857        if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3858                return 1;
3859
3860        if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3861                return 1;
3862
3863        if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3864                return 1;
3865
3866        if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3867                return 1;
3868        return 0;
3869}
3870
3871static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3872                                struct tss_segment_16 *tss)
3873{
3874        tss->ip = kvm_rip_read(vcpu);
3875        tss->flag = kvm_x86_ops->get_rflags(vcpu);
3876        tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3877        tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3878        tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3879        tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3880        tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3881        tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3882        tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3883        tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3884
3885        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3886        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3887        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3888        tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3889        tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3890        tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3891}
3892
3893static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3894                                 struct tss_segment_16 *tss)
3895{
3896        kvm_rip_write(vcpu, tss->ip);
3897        kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3898        kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3899        kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3900        kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3901        kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3902        kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3903        kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3904        kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3905        kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3906
3907        if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3908                return 1;
3909
3910        if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3911                return 1;
3912
3913        if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3914                return 1;
3915
3916        if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3917                return 1;
3918
3919        if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3920                return 1;
3921        return 0;
3922}
3923
3924static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3925                              u16 old_tss_sel, u32 old_tss_base,
3926                              struct desc_struct *nseg_desc)
3927{
3928        struct tss_segment_16 tss_segment_16;
3929        int ret = 0;
3930
3931        if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3932                           sizeof tss_segment_16))
3933                goto out;
3934
3935        save_state_to_tss16(vcpu, &tss_segment_16);
3936
3937        if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3938                            sizeof tss_segment_16))
3939                goto out;
3940
3941        if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3942                           &tss_segment_16, sizeof tss_segment_16))
3943                goto out;
3944
3945        if (old_tss_sel != 0xffff) {
3946                tss_segment_16.prev_task_link = old_tss_sel;
3947
3948                if (kvm_write_guest(vcpu->kvm,
3949                                    get_tss_base_addr(vcpu, nseg_desc),
3950                                    &tss_segment_16.prev_task_link,
3951                                    sizeof tss_segment_16.prev_task_link))
3952                        goto out;
3953        }
3954
3955        if (load_state_from_tss16(vcpu, &tss_segment_16))
3956                goto out;
3957
3958        ret = 1;
3959out:
3960        return ret;
3961}
3962
3963static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3964                       u16 old_tss_sel, u32 old_tss_base,
3965                       struct desc_struct *nseg_desc)
3966{
3967        struct tss_segment_32 tss_segment_32;
3968        int ret = 0;
3969
3970        if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3971                           sizeof tss_segment_32))
3972                goto out;
3973
3974        save_state_to_tss32(vcpu, &tss_segment_32);
3975
3976        if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3977                            sizeof tss_segment_32))
3978                goto out;
3979
3980        if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3981                           &tss_segment_32, sizeof tss_segment_32))
3982                goto out;
3983
3984        if (old_tss_sel != 0xffff) {
3985                tss_segment_32.prev_task_link = old_tss_sel;
3986
3987                if (kvm_write_guest(vcpu->kvm,
3988                                    get_tss_base_addr(vcpu, nseg_desc),
3989                                    &tss_segment_32.prev_task_link,
3990                                    sizeof tss_segment_32.prev_task_link))
3991                        goto out;
3992        }
3993
3994        if (load_state_from_tss32(vcpu, &tss_segment_32))
3995                goto out;
3996
3997        ret = 1;
3998out:
3999        return ret;
4000}
4001
4002int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4003{
4004        struct kvm_segment tr_seg;
4005        struct desc_struct cseg_desc;
4006        struct desc_struct nseg_desc;
4007        int ret = 0;
4008        u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4009        u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4010
4011        old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
4012
4013        /* FIXME: Handle errors. Failure to read either TSS or their
4014         * descriptors should generate a pagefault.
4015         */
4016        if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
4017                goto out;
4018
4019        if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
4020                goto out;
4021
4022        if (reason != TASK_SWITCH_IRET) {
4023                int cpl;
4024
4025                cpl = kvm_x86_ops->get_cpl(vcpu);
4026                if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
4027                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4028                        return 1;
4029                }
4030        }
4031
4032        if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
4033                kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4034                return 1;
4035        }
4036
4037        if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4038                cseg_desc.type &= ~(1 << 1); //clear the B flag
4039                save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4040        }
4041
4042        if (reason == TASK_SWITCH_IRET) {
4043                u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4044                kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4045        }
4046
4047        /* set back link to prev task only if NT bit is set in eflags
4048           note that old_tss_sel is not used afetr this point */
4049        if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4050                old_tss_sel = 0xffff;
4051
4052        /* set back link to prev task only if NT bit is set in eflags
4053           note that old_tss_sel is not used afetr this point */
4054        if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4055                old_tss_sel = 0xffff;
4056
4057        if (nseg_desc.type & 8)
4058                ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4059                                         old_tss_base, &nseg_desc);
4060        else
4061                ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4062                                         old_tss_base, &nseg_desc);
4063
4064        if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4065                u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4066                kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4067        }
4068
4069        if (reason != TASK_SWITCH_IRET) {
4070                nseg_desc.type |= (1 << 1);
4071                save_guest_segment_descriptor(vcpu, tss_selector,
4072                                              &nseg_desc);
4073        }
4074
4075        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4076        seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4077        tr_seg.type = 11;
4078        kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4079out:
4080        return ret;
4081}
4082EXPORT_SYMBOL_GPL(kvm_task_switch);
4083
4084int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4085                                  struct kvm_sregs *sregs)
4086{
4087        int mmu_reset_needed = 0;
4088        int pending_vec, max_bits;
4089        struct descriptor_table dt;
4090
4091        vcpu_load(vcpu);
4092
4093        dt.limit = sregs->idt.limit;
4094        dt.base = sregs->idt.base;
4095        kvm_x86_ops->set_idt(vcpu, &dt);
4096        dt.limit = sregs->gdt.limit;
4097        dt.base = sregs->gdt.base;
4098        kvm_x86_ops->set_gdt(vcpu, &dt);
4099
4100        vcpu->arch.cr2 = sregs->cr2;
4101        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4102
4103        down_read(&vcpu->kvm->slots_lock);
4104        if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4105                vcpu->arch.cr3 = sregs->cr3;
4106        else
4107                set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4108        up_read(&vcpu->kvm->slots_lock);
4109
4110        kvm_set_cr8(vcpu, sregs->cr8);
4111
4112        mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4113        kvm_x86_ops->set_efer(vcpu, sregs->efer);
4114        kvm_set_apic_base(vcpu, sregs->apic_base);
4115
4116        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4117
4118        mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4119        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4120        vcpu->arch.cr0 = sregs->cr0;
4121
4122        mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4123        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4124        if (!is_long_mode(vcpu) && is_pae(vcpu))
4125                load_pdptrs(vcpu, vcpu->arch.cr3);
4126
4127        if (mmu_reset_needed)
4128                kvm_mmu_reset_context(vcpu);
4129
4130        max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4131        pending_vec = find_first_bit(
4132                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4133        if (pending_vec < max_bits) {
4134                kvm_queue_interrupt(vcpu, pending_vec, false);
4135                pr_debug("Set back pending irq %d\n", pending_vec);
4136                if (irqchip_in_kernel(vcpu->kvm))
4137                        kvm_pic_clear_isr_ack(vcpu->kvm);
4138        }
4139
4140        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4141        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4142        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4143        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4144        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4145        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4146
4147        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4148        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4149
4150        /* Older userspace won't unhalt the vcpu on reset. */
4151        if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
4152            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4153            !(vcpu->arch.cr0 & X86_CR0_PE))
4154                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4155
4156        vcpu_put(vcpu);
4157
4158        return 0;
4159}
4160
4161int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4162                                        struct kvm_guest_debug *dbg)
4163{
4164        int i, r;
4165
4166        vcpu_load(vcpu);
4167
4168        if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4169            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4170                for (i = 0; i < KVM_NR_DB_REGS; ++i)
4171                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4172                vcpu->arch.switch_db_regs =
4173                        (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4174        } else {
4175                for (i = 0; i < KVM_NR_DB_REGS; i++)
4176                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4177                vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4178        }
4179
4180        r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
4181
4182        if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4183                kvm_queue_exception(vcpu, DB_VECTOR);
4184        else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4185                kvm_queue_exception(vcpu, BP_VECTOR);
4186
4187        vcpu_put(vcpu);
4188
4189        return r;
4190}
4191
4192/*
4193 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4194 * we have asm/x86/processor.h
4195 */
4196struct fxsave {
4197        u16     cwd;
4198        u16     swd;
4199        u16     twd;
4200        u16     fop;
4201        u64     rip;
4202        u64     rdp;
4203        u32     mxcsr;
4204        u32     mxcsr_mask;
4205        u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4206#ifdef CONFIG_X86_64
4207        u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4208#else
4209        u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4210#endif
4211};
4212
4213/*
4214 * Translate a guest virtual address to a guest physical address.
4215 */
4216int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4217                                    struct kvm_translation *tr)
4218{
4219        unsigned long vaddr = tr->linear_address;
4220        gpa_t gpa;
4221
4222        vcpu_load(vcpu);
4223        down_read(&vcpu->kvm->slots_lock);
4224        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4225        up_read(&vcpu->kvm->slots_lock);
4226        tr->physical_address = gpa;
4227        tr->valid = gpa != UNMAPPED_GVA;
4228        tr->writeable = 1;
4229        tr->usermode = 0;
4230        vcpu_put(vcpu);
4231
4232        return 0;
4233}
4234
4235int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4236{
4237        struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4238
4239        vcpu_load(vcpu);
4240
4241        memcpy(fpu->fpr, fxsave->st_space, 128);
4242        fpu->fcw = fxsave->cwd;
4243        fpu->fsw = fxsave->swd;
4244        fpu->ftwx = fxsave->twd;
4245        fpu->last_opcode = fxsave->fop;
4246        fpu->last_ip = fxsave->rip;
4247        fpu->last_dp = fxsave->rdp;
4248        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4249
4250        vcpu_put(vcpu);
4251
4252        return 0;
4253}
4254
4255int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4256{
4257        struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4258
4259        vcpu_load(vcpu);
4260
4261        memcpy(fxsave->st_space, fpu->fpr, 128);
4262        fxsave->cwd = fpu->fcw;
4263        fxsave->swd = fpu->fsw;
4264        fxsave->twd = fpu->ftwx;
4265        fxsave->fop = fpu->last_opcode;
4266        fxsave->rip = fpu->last_ip;
4267        fxsave->rdp = fpu->last_dp;
4268        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4269
4270        vcpu_put(vcpu);
4271
4272        return 0;
4273}
4274
4275void fx_init(struct kvm_vcpu *vcpu)
4276{
4277        unsigned after_mxcsr_mask;
4278
4279        /*
4280         * Touch the fpu the first time in non atomic context as if
4281         * this is the first fpu instruction the exception handler
4282         * will fire before the instruction returns and it'll have to
4283         * allocate ram with GFP_KERNEL.
4284         */
4285        if (!used_math())
4286                kvm_fx_save(&vcpu->arch.host_fx_image);
4287
4288        /* Initialize guest FPU by resetting ours and saving into guest's */
4289        preempt_disable();
4290        kvm_fx_save(&vcpu->arch.host_fx_image);
4291        kvm_fx_finit();
4292        kvm_fx_save(&vcpu->arch.guest_fx_image);
4293        kvm_fx_restore(&vcpu->arch.host_fx_image);
4294        preempt_enable();
4295
4296        vcpu->arch.cr0 |= X86_CR0_ET;
4297        after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4298        vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4299        memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4300               0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4301}
4302EXPORT_SYMBOL_GPL(fx_init);
4303
4304void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4305{
4306        if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4307                return;
4308
4309        vcpu->guest_fpu_loaded = 1;
4310        kvm_fx_save(&vcpu->arch.host_fx_image);
4311        kvm_fx_restore(&vcpu->arch.guest_fx_image);
4312}
4313EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4314
4315void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4316{
4317        if (!vcpu->guest_fpu_loaded)
4318                return;
4319
4320        vcpu->guest_fpu_loaded = 0;
4321        kvm_fx_save(&vcpu->arch.guest_fx_image);
4322        kvm_fx_restore(&vcpu->arch.host_fx_image);
4323        ++vcpu->stat.fpu_reload;
4324}
4325EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4326
4327void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4328{
4329        if (vcpu->arch.time_page) {
4330                kvm_release_page_dirty(vcpu->arch.time_page);
4331                vcpu->arch.time_page = NULL;
4332        }
4333
4334        kvm_x86_ops->vcpu_free(vcpu);
4335}
4336
4337struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4338                                                unsigned int id)
4339{
4340        return kvm_x86_ops->vcpu_create(kvm, id);
4341}
4342
4343int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4344{
4345        int r;
4346
4347        /* We do fxsave: this must be aligned. */
4348        BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4349
4350        vcpu->arch.mtrr_state.have_fixed = 1;
4351        vcpu_load(vcpu);
4352        r = kvm_arch_vcpu_reset(vcpu);
4353        if (r == 0)
4354                r = kvm_mmu_setup(vcpu);
4355        vcpu_put(vcpu);
4356        if (r < 0)
4357                goto free_vcpu;
4358
4359        return 0;
4360free_vcpu:
4361        kvm_x86_ops->vcpu_free(vcpu);
4362        return r;
4363}
4364
4365void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4366{
4367        vcpu_load(vcpu);
4368        kvm_mmu_unload(vcpu);
4369        vcpu_put(vcpu);
4370
4371        kvm_x86_ops->vcpu_free(vcpu);
4372}
4373
4374int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4375{
4376        vcpu->arch.nmi_pending = false;
4377        vcpu->arch.nmi_injected = false;
4378
4379        vcpu->arch.switch_db_regs = 0;
4380        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4381        vcpu->arch.dr6 = DR6_FIXED_1;
4382        vcpu->arch.dr7 = DR7_FIXED_1;
4383
4384        return kvm_x86_ops->vcpu_reset(vcpu);
4385}
4386
4387void kvm_arch_hardware_enable(void *garbage)
4388{
4389        kvm_x86_ops->hardware_enable(garbage);
4390}
4391
4392void kvm_arch_hardware_disable(void *garbage)
4393{
4394        kvm_x86_ops->hardware_disable(garbage);
4395}
4396
4397int kvm_arch_hardware_setup(void)
4398{
4399        return kvm_x86_ops->hardware_setup();
4400}
4401
4402void kvm_arch_hardware_unsetup(void)
4403{
4404        kvm_x86_ops->hardware_unsetup();
4405}
4406
4407void kvm_arch_check_processor_compat(void *rtn)
4408{
4409        kvm_x86_ops->check_processor_compatibility(rtn);
4410}
4411
4412int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4413{
4414        struct page *page;
4415        struct kvm *kvm;
4416        int r;
4417
4418        BUG_ON(vcpu->kvm == NULL);
4419        kvm = vcpu->kvm;
4420
4421        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4422        if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
4423                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4424        else
4425                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4426
4427        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4428        if (!page) {
4429                r = -ENOMEM;
4430                goto fail;
4431        }
4432        vcpu->arch.pio_data = page_address(page);
4433
4434        r = kvm_mmu_create(vcpu);
4435        if (r < 0)
4436                goto fail_free_pio_data;
4437
4438        if (irqchip_in_kernel(kvm)) {
4439                r = kvm_create_lapic(vcpu);
4440                if (r < 0)
4441                        goto fail_mmu_destroy;
4442        }
4443
4444        return 0;
4445
4446fail_mmu_destroy:
4447        kvm_mmu_destroy(vcpu);
4448fail_free_pio_data:
4449        free_page((unsigned long)vcpu->arch.pio_data);
4450fail:
4451        return r;
4452}
4453
4454void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4455{
4456        kvm_free_lapic(vcpu);
4457        down_read(&vcpu->kvm->slots_lock);
4458        kvm_mmu_destroy(vcpu);
4459        up_read(&vcpu->kvm->slots_lock);
4460        free_page((unsigned long)vcpu->arch.pio_data);
4461}
4462
4463struct  kvm *kvm_arch_create_vm(void)
4464{
4465        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4466
4467        if (!kvm)
4468                return ERR_PTR(-ENOMEM);
4469
4470        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4471        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4472
4473        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4474        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4475
4476        rdtscll(kvm->arch.vm_init_tsc);
4477
4478        return kvm;
4479}
4480
/* Unload the MMU of one vcpu, with the vcpu loaded around the call. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
}
4487
4488static void kvm_free_vcpus(struct kvm *kvm)
4489{
4490        unsigned int i;
4491
4492        /*
4493         * Unpin any mmu pages first.
4494         */
4495        for (i = 0; i < KVM_MAX_VCPUS; ++i)
4496                if (kvm->vcpus[i])
4497                        kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4498        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4499                if (kvm->vcpus[i]) {
4500                        kvm_arch_vcpu_free(kvm->vcpus[i]);
4501                        kvm->vcpus[i] = NULL;
4502                }
4503        }
4504
4505}
4506
/* Arch hook: releases all devices assigned to @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
        kvm_free_all_assigned_devices(kvm);
}
4511
4512void kvm_arch_destroy_vm(struct kvm *kvm)
4513{
4514        kvm_iommu_unmap_guest(kvm);
4515        kvm_free_pit(kvm);
4516        kfree(kvm->arch.vpic);
4517        kfree(kvm->arch.vioapic);
4518        kvm_free_vcpus(kvm);
4519        kvm_free_physmem(kvm);
4520        if (kvm->arch.apic_access_page)
4521                put_page(kvm->arch.apic_access_page);
4522        if (kvm->arch.ept_identity_pagetable)
4523                put_page(kvm->arch.ept_identity_pagetable);
4524        kfree(kvm);
4525}
4526
4527int kvm_arch_set_memory_region(struct kvm *kvm,
4528                                struct kvm_userspace_memory_region *mem,
4529                                struct kvm_memory_slot old,
4530                                int user_alloc)
4531{
4532        int npages = mem->memory_size >> PAGE_SHIFT;
4533        struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4534
4535        /*To keep backward compatibility with older userspace,
4536         *x86 needs to hanlde !user_alloc case.
4537         */
4538        if (!user_alloc) {
4539                if (npages && !old.rmap) {
4540                        unsigned long userspace_addr;
4541
4542                        down_write(&current->mm->mmap_sem);
4543                        userspace_addr = do_mmap(NULL, 0,
4544                                                 npages * PAGE_SIZE,
4545                                                 PROT_READ | PROT_WRITE,
4546                                                 MAP_PRIVATE | MAP_ANONYMOUS,
4547                                                 0);
4548                        up_write(&current->mm->mmap_sem);
4549
4550                        if (IS_ERR((void *)userspace_addr))
4551                                return PTR_ERR((void *)userspace_addr);
4552
4553                        /* set userspace_addr atomically for kvm_hva_to_rmapp */
4554                        spin_lock(&kvm->mmu_lock);
4555                        memslot->userspace_addr = userspace_addr;
4556                        spin_unlock(&kvm->mmu_lock);
4557                } else {
4558                        if (!old.user_alloc && old.rmap) {
4559                                int ret;
4560
4561                                down_write(&current->mm->mmap_sem);
4562                                ret = do_munmap(current->mm, old.userspace_addr,
4563                                                old.npages * PAGE_SIZE);
4564                                up_write(&current->mm->mmap_sem);
4565                                if (ret < 0)
4566                                        printk(KERN_WARNING
4567                                       "kvm_vm_ioctl_set_memory_region: "
4568                                       "failed to munmap memory\n");
4569                        }
4570                }
4571        }
4572
4573        spin_lock(&kvm->mmu_lock);
4574        if (!kvm->arch.n_requested_mmu_pages) {
4575                unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4576                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4577        }
4578
4579        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4580        spin_unlock(&kvm->mmu_lock);
4581        kvm_flush_remote_tlbs(kvm);
4582
4583        return 0;
4584}
4585
/*
 * Flush all shadow MMU state of @kvm: zap every MMU page, then ask all
 * vcpus to reload their MMUs.
 */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
        kvm_mmu_zap_all(kvm);
        kvm_reload_remote_mmus(kvm);
}
4591
4592int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4593{
4594        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4595               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4596               || vcpu->arch.nmi_pending;
4597}
4598
4599void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4600{
4601        int me;
4602        int cpu = vcpu->cpu;
4603
4604        if (waitqueue_active(&vcpu->wq)) {
4605                wake_up_interruptible(&vcpu->wq);
4606                ++vcpu->stat.halt_wakeup;
4607        }
4608
4609        me = get_cpu();
4610        if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4611                if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4612                        smp_send_reschedule(cpu);
4613        put_cpu();
4614}
4615
4616int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4617{
4618        return kvm_x86_ops->interrupt_allowed(vcpu);
4619}
4620
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.