linux/arch/x86/kvm/paging_tmpl.h
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

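/*
 * For reference, mmu.c instantiates this template by including the file
 * once per pte size, roughly as follows (a sketch of the include pattern,
 * not copied verbatim from mmu.c):
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 */
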
#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define CMPXCHG cmpxchg
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn FNAME(gpte_to_gfn)
#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	u32 error_code;
};
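
/*
 * The arrays above are indexed by guest level minus one, as filled in by
 * FNAME(walk_addr) below: for a 4-level 64-bit guest, table_gfn[3] is the
 * frame holding the PML4 and ptes[3] the PML4 entry read from it, down to
 * table_gfn[0]/ptes[0] for the last-level page table.  gfn is the frame the
 * walk resolved to, and error_code is only meaningful when the walk fails.
 */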

static gfn_t gpte_to_gfn(pt_element_t gpte)
{
	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
{
	return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
			 gfn_t table_gfn, unsigned index,
			 pt_element_t orig_pte, pt_element_t new_pte)
{
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(kvm, table_gfn);
	up_read(&current->mm->mmap_sem);

	table = kmap_atomic(page, KM_USER0);

	ret = CMPXCHG(&table[index], orig_pte, new_pte);

	kunmap_atomic(table, KM_USER0);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
{
	unsigned access;

	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
	if (is_nx(vcpu))
		access &= ~(gpte >> PT64_NX_SHIFT);
#endif
	return access;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
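/*
 * The walk below mirrors what the processor would do: starting from cr3
 * (or the relevant pdptr in PAE mode), read one guest pte per level,
 * record each table gfn, pte and pte gpa in the walker, check the
 * present/write/user/nx permissions against the fault type, and set the
 * accessed (and, for a write fault, dirty) bits with FNAME(cmpxchg_gpte),
 * restarting the walk if a pte changed under us.
 */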
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr,
			    int write_fault, int user_fault, int fetch_fault)
{
	pt_element_t pte;
	gfn_t table_gfn;
	unsigned index, pt_access, pte_access;
	gpa_t pte_gpa;

	pgprintk("%s: addr %lx\n", __func__, addr);
walk:
	walker->level = vcpu->arch.mmu.root_level;
	pte = vcpu->arch.cr3;
#if PTTYPE == 64
	if (!is_long_mode(vcpu)) {
		pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
		if (!is_present_pte(pte))
			goto not_present;
		--walker->level;
	}
#endif
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);

	pt_access = ACC_ALL;

	for (;;) {
		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		pte_gpa = gfn_to_gpa(table_gfn);
		pte_gpa += index * sizeof(pt_element_t);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;
		pgprintk("%s: table_gfn[%d] %lx\n", __func__,
			 walker->level - 1, table_gfn);

		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));

		if (!is_present_pte(pte))
			goto not_present;

		if (write_fault && !is_writeble_pte(pte))
			if (user_fault || is_write_protection(vcpu))
				goto access_error;

		if (user_fault && !(pte & PT_USER_MASK))
			goto access_error;

#if PTTYPE == 64
		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
			goto access_error;
#endif

		if (!(pte & PT_ACCESSED_MASK)) {
			mark_page_dirty(vcpu->kvm, table_gfn);
			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
			    index, pte, pte|PT_ACCESSED_MASK))
				goto walk;
			pte |= PT_ACCESSED_MASK;
		}

		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;

		if (walker->level == PT_PAGE_TABLE_LEVEL) {
			walker->gfn = gpte_to_gfn(pte);
			break;
		}

		if (walker->level == PT_DIRECTORY_LEVEL
		    && (pte & PT_PAGE_SIZE_MASK)
		    && (PTTYPE == 64 || is_pse(vcpu))) {
			walker->gfn = gpte_to_gfn_pde(pte);
			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
			if (PTTYPE == 32 && is_cpuid_PSE36())
				walker->gfn += pse36_gfn_delta(pte);
			break;
		}

		pt_access = pte_access;
		--walker->level;
	}

	if (write_fault && !is_dirty_pte(pte)) {
		bool ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
			    pte|PT_DIRTY_MASK);
		if (ret)
			goto walk;
		pte |= PT_DIRTY_MASK;
		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
		walker->ptes[walker->level - 1] = pte;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pt_access, pte_access);
	return 1;

not_present:
	walker->error_code = 0;
	goto err;

access_error:
	walker->error_code = PFERR_PRESENT_MASK;

err:
	if (write_fault)
		walker->error_code |= PFERR_WRITE_MASK;
	if (user_fault)
		walker->error_code |= PFERR_USER_MASK;
	if (fetch_fault)
		walker->error_code |= PFERR_FETCH_MASK;
	return 0;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte;
	unsigned pte_access;
	pfn_t pfn;
	int largepage = vcpu->arch.update_pte.largepage;

	gpte = *(const pt_element_t *)pte;
	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
		if (!is_present_pte(gpte))
			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
		return;
	}
	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
		return;
	pfn = vcpu->arch.update_pte.pfn;
	if (is_error_pfn(pfn))
		return;
	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
		return;
	kvm_get_pfn(pfn);
	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
		     pfn, true);
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
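/*
 * The loop below descends the shadow page table from the root towards the
 * level that should map the fault, allocating missing intermediate shadow
 * pages with kvm_mmu_get_page() and re-checking the cached guest ptes
 * against guest memory; mmu_set_spte() then installs the leaf shadow pte.
 */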
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *walker,
			 int user_fault, int write_fault, int largepage,
			 int *ptwrite, pfn_t pfn)
{
	hpa_t shadow_addr;
	int level;
	u64 *shadow_ent;
	unsigned access = walker->pt_access;

	if (!is_present_pte(walker->ptes[walker->level - 1]))
		return NULL;

	shadow_addr = vcpu->arch.mmu.root_hpa;
	level = vcpu->arch.mmu.shadow_root_level;
	if (level == PT32E_ROOT_LEVEL) {
		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		shadow_addr &= PT64_BASE_ADDR_MASK;
		--level;
	}

	for (; ; level--) {
		u32 index = SHADOW_PT_INDEX(addr, level);
		struct kvm_mmu_page *shadow_page;
		u64 shadow_pte;
		int metaphysical;
		gfn_t table_gfn;

		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
		if (level == PT_PAGE_TABLE_LEVEL)
			break;

		if (largepage && level == PT_DIRECTORY_LEVEL)
			break;

		if (is_shadow_present_pte(*shadow_ent)
		    && !is_large_pte(*shadow_ent)) {
			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
			continue;
		}

		if (is_large_pte(*shadow_ent))
			rmap_remove(vcpu->kvm, shadow_ent);

		if (level - 1 == PT_PAGE_TABLE_LEVEL
		    && walker->level == PT_DIRECTORY_LEVEL) {
			metaphysical = 1;
			if (!is_dirty_pte(walker->ptes[level - 1]))
				access &= ~ACC_WRITE_MASK;
			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
		} else {
			metaphysical = 0;
			table_gfn = walker->table_gfn[level - 2];
		}
		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
					       metaphysical, access,
					       shadow_ent);
		if (!metaphysical) {
			int r;
			pt_element_t curr_pte;
			r = kvm_read_guest_atomic(vcpu->kvm,
						  walker->pte_gpa[level - 2],
						  &curr_pte, sizeof(curr_pte));
			if (r || curr_pte != walker->ptes[level - 2]) {
				kvm_release_pfn_clean(pfn);
				return NULL;
			}
		}
		shadow_addr = __pa(shadow_page->spt);
		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
			| PT_WRITABLE_MASK | PT_USER_MASK;
		set_shadow_pte(shadow_ent, shadow_pte);
	}

	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
		     user_fault, write_fault,
		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
		     ptwrite, largepage, walker->gfn, pfn, false);

	return shadow_ent;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
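/*
 * That return convention is what the generic fault path keys off of; the
 * caller, kvm_mmu_page_fault() in mmu.c, dispatches on it roughly like this
 * (a sketch, not copied verbatim):
 *
 *	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
 *	if (r < 0)
 *		return r;		// propagate the error
 *	if (!r)
 *		return 1;		// fault fixed or injected; re-enter the guest
 *	// r == 1: the faulting access must be emulated
 *	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
 */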
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
			       u32 error_code)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	int fetch_fault = error_code & PFERR_FETCH_MASK;
	struct guest_walker walker;
	u64 *shadow_pte;
	int write_pt = 0;
	int r;
	pfn_t pfn;
	int largepage = 0;
	unsigned long mmu_seq;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
	kvm_mmu_audit(vcpu, "pre page fault");

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the shadow pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
			     fetch_fault);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		inject_page_fault(vcpu, addr, walker.error_code);
		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
		return 0;
	}

	down_read(&current->mm->mmap_sem);
	if (walker.level == PT_DIRECTORY_LEVEL) {
		gfn_t large_gfn;
		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
		if (is_largepage_backed(vcpu, large_gfn)) {
			walker.gfn = large_gfn;
			largepage = 1;
		}
	}
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/* implicit mb(), we'll read before PT lock is unlocked */
	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
	up_read(&current->mm->mmap_sem);

	/* mmio */
	if (is_error_pfn(pfn)) {
		pgprintk("gfn %lx is mmio\n", walker.gfn);
		kvm_release_pfn_clean(pfn);
		return 1;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;
	kvm_mmu_free_some_pages(vcpu);
	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
				  largepage, &write_pt, pfn);

	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
		 shadow_pte, *shadow_pte, write_pt);

	if (!write_pt)
		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, "post page fault (fixed)");
	spin_unlock(&vcpu->kvm->mmu_lock);

	return write_pt;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	}

	return gpa;
}

static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp)
{
	int i, j, offset, r;
	pt_element_t pt[256 / sizeof(pt_element_t)];
	gpa_t pte_gpa;

	if (sp->role.metaphysical
	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
		nonpaging_prefetch_page(vcpu, sp);
		return;
	}

	pte_gpa = gfn_to_gpa(sp->gfn);
	if (PTTYPE == 32) {
		offset = sp->role.quadrant << PT64_LEVEL_BITS;
		pte_gpa += offset * sizeof(pt_element_t);
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
		pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
		for (j = 0; j < ARRAY_SIZE(pt); ++j)
			if (r || is_present_pte(pt[j]))
				sp->spt[i+j] = shadow_trap_nonpresent_pte;
			else
				sp->spt[i+j] = shadow_notrap_nonpresent_pte;
	}
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_pde
#undef CMPXCHG