linux/arch/x86/xen/mmu.c
   1/*
   2 * Xen mmu operations
   3 *
   4 * This file contains the various mmu fetch and update operations.
   5 * The most important job they must perform is the mapping between the
   6 * domain's pfn and the overall machine mfns.
   7 *
   8 * Xen allows guests to directly update the pagetable, in a controlled
   9 * fashion.  In other words, the guest modifies the same pagetable
  10 * that the CPU actually uses, which eliminates the overhead of having
  11 * a separate shadow pagetable.
  12 *
  13 * In order to allow this, it falls on the guest domain to map its
   14 * notion of a "physical" pfn - which is just a domain-local page
   15 * frame number - into a real "machine address" which the CPU's MMU can
  16 * use.
  17 *
  18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19 * inserted directly into the pagetable.  When creating a new
  20 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22 * the mfn back into a pfn.
  23 *
  24 * The other constraint is that all pages which make up a pagetable
  25 * must be mapped read-only in the guest.  This prevents uncontrolled
  26 * guest updates to the pagetable.  Xen strictly enforces this, and
  27 * will disallow any pagetable update which will end up mapping a
  28 * pagetable page RW, and will disallow using any writable page as a
  29 * pagetable.
  30 *
  31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32 * would need to validate the whole pagetable before going on.
  33 * Naturally, this is quite slow.  The solution is to "pin" a
  34 * pagetable, which enforces all the constraints on the pagetable even
  35 * when it is not actively in use.  This means that Xen can be assured
  36 * that it is still valid when you do load it into %cr3, and doesn't
  37 * need to revalidate it.
  38 *
  39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40 */
  41#include <linux/sched.h>
  42#include <linux/highmem.h>
  43#include <linux/bug.h>
  44
  45#include <asm/pgtable.h>
  46#include <asm/tlbflush.h>
  47#include <asm/fixmap.h>
  48#include <asm/mmu_context.h>
  49#include <asm/paravirt.h>
  50#include <asm/linkage.h>
  51
  52#include <asm/xen/hypercall.h>
  53#include <asm/xen/hypervisor.h>
  54
  55#include <xen/page.h>
  56#include <xen/interface/xen.h>
  57
  58#include "multicalls.h"
  59#include "mmu.h"
  60
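/*
 * Illustrative sketch, not part of the original file: the pfn -> mfn
 * translation described in the header comment above, applied to a
 * lowmem kernel virtual address.  example_virt_to_machine() is a
 * hypothetical helper; the virt_to_machine() macro in asm/xen/page.h
 * provides essentially this, and arbitrary_virt_to_machine() further
 * down also copes with ioremapped addresses by walking the pagetable
 * instead of relying on __pa().
 */
static inline xmaddr_t example_virt_to_machine(void *vaddr)
{
        unsigned long offset = (unsigned long)vaddr & ~PAGE_MASK;

        /* virt_to_mfn() is __pa() -> pfn, then a p2m lookup (see below) */
        return XMADDR(((phys_addr_t)virt_to_mfn(vaddr) << PAGE_SHIFT) + offset);
}
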
  61/*
  62 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
  63 * redzone above it, so round it up to a PGD boundary.
  64 */
  65#define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
  66
  67
  68#define P2M_ENTRIES_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long))
  69#define TOP_ENTRIES             (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
  70
  71/* Placeholder for holes in the address space */
  72static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
  73                { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
  74
  75/* Array of pointers to pages containing p2m entries */
  76static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
  77                { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
  78
  79/* Arrays of p2m arrays expressed in mfns used for save/restore */
  80static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
  81
  82static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
  83        __page_aligned_bss;
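/*
 * Layout of the save/restore structures built below: p2m_top_mfn_list
 * holds the mfns of the pages making up p2m_top_mfn, and each
 * p2m_top_mfn entry is the mfn of a page of p2m entries.
 * xen_setup_mfn_list_list() publishes the root of this tree via
 * arch.pfn_to_mfn_frame_list_list in the shared info page.
 */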
  84
  85static inline unsigned p2m_top_index(unsigned long pfn)
  86{
  87        BUG_ON(pfn >= MAX_DOMAIN_PAGES);
  88        return pfn / P2M_ENTRIES_PER_PAGE;
  89}
  90
  91static inline unsigned p2m_index(unsigned long pfn)
  92{
  93        return pfn % P2M_ENTRIES_PER_PAGE;
  94}
  95
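/*
 * Illustrative sketch, not part of the original file: a p2m lookup is
 * just the two index helpers above applied to the nested arrays;
 * p2m_top[] selects a page of entries and p2m_index() selects the
 * entry within it.  get_phys_to_machine() below is the real version,
 * with a bounds check against MAX_DOMAIN_PAGES.  example_p2m_lookup()
 * is a hypothetical name.
 */
static inline unsigned long example_p2m_lookup(unsigned long pfn)
{
        return p2m_top[p2m_top_index(pfn)][p2m_index(pfn)];
}
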
  96/* Build the parallel p2m_top_mfn structures */
  97void xen_setup_mfn_list_list(void)
  98{
  99        unsigned pfn, idx;
 100
 101        for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
 102                unsigned topidx = p2m_top_index(pfn);
 103
 104                p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
 105        }
 106
 107        for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
 108                unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 109                p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 110        }
 111
 112        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 113
 114        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
 115                virt_to_mfn(p2m_top_mfn_list);
 116        HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
 117}
 118
 119/* Set up p2m_top to point to the domain-builder provided p2m pages */
 120void __init xen_build_dynamic_phys_to_machine(void)
 121{
 122        unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 123        unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 124        unsigned pfn;
 125
 126        for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
 127                unsigned topidx = p2m_top_index(pfn);
 128
 129                p2m_top[topidx] = &mfn_list[pfn];
 130        }
 131}
 132
 133unsigned long get_phys_to_machine(unsigned long pfn)
 134{
 135        unsigned topidx, idx;
 136
 137        if (unlikely(pfn >= MAX_DOMAIN_PAGES))
 138                return INVALID_P2M_ENTRY;
 139
 140        topidx = p2m_top_index(pfn);
 141        idx = p2m_index(pfn);
 142        return p2m_top[topidx][idx];
 143}
 144EXPORT_SYMBOL_GPL(get_phys_to_machine);
 145
 146static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
 147{
 148        unsigned long *p;
 149        unsigned i;
 150
 151        p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 152        BUG_ON(p == NULL);
 153
 154        for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 155                p[i] = INVALID_P2M_ENTRY;
 156
 157        if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
 158                free_page((unsigned long)p);
 159        else
 160                *mfnp = virt_to_mfn(p);
 161}
 162
 163void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 164{
 165        unsigned topidx, idx;
 166
 167        if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
 168                BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 169                return;
 170        }
 171
 172        if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 173                BUG_ON(mfn != INVALID_P2M_ENTRY);
 174                return;
 175        }
 176
 177        topidx = p2m_top_index(pfn);
 178        if (p2m_top[topidx] == p2m_missing) {
 179                /* no need to allocate a page to store an invalid entry */
 180                if (mfn == INVALID_P2M_ENTRY)
 181                        return;
 182                alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
 183        }
 184
 185        idx = p2m_index(pfn);
 186        p2m_top[topidx][idx] = mfn;
 187}
 188
 189xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 190{
 191        unsigned long address = (unsigned long)vaddr;
 192        unsigned int level;
 193        pte_t *pte = lookup_address(address, &level);
 194        unsigned offset = address & ~PAGE_MASK;
 195
 196        BUG_ON(pte == NULL);
 197
 198        return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 199}
 200
 201void make_lowmem_page_readonly(void *vaddr)
 202{
 203        pte_t *pte, ptev;
 204        unsigned long address = (unsigned long)vaddr;
 205        unsigned int level;
 206
 207        pte = lookup_address(address, &level);
 208        BUG_ON(pte == NULL);
 209
 210        ptev = pte_wrprotect(*pte);
 211
 212        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 213                BUG();
 214}
 215
 216void make_lowmem_page_readwrite(void *vaddr)
 217{
 218        pte_t *pte, ptev;
 219        unsigned long address = (unsigned long)vaddr;
 220        unsigned int level;
 221
 222        pte = lookup_address(address, &level);
 223        BUG_ON(pte == NULL);
 224
 225        ptev = pte_mkwrite(*pte);
 226
 227        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 228                BUG();
 229}
 230
 231
 232static bool page_pinned(void *ptr)
 233{
 234        struct page *page = virt_to_page(ptr);
 235
 236        return PagePinned(page);
 237}
 238
 239static void extend_mmu_update(const struct mmu_update *update)
 240{
 241        struct multicall_space mcs;
 242        struct mmu_update *u;
 243
 244        mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 245
 246        if (mcs.mc != NULL)
 247                mcs.mc->args[1]++;
 248        else {
 249                mcs = __xen_mc_entry(sizeof(*u));
 250                MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 251        }
 252
 253        u = mcs.args;
 254        *u = *update;
 255}
 256
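/*
 * Illustrative sketch, not part of the original file: how several
 * pagetable writes coalesce under one batch.  Each extend_mmu_update()
 * call either appends another struct mmu_update to the pending
 * __HYPERVISOR_mmu_update multicall or starts a new one; nothing is
 * issued to Xen until xen_mc_issue().  example_set_two_pmds() is a
 * hypothetical helper following the same pattern as xen_set_pmd_hyper()
 * below.
 */
static inline void example_set_two_pmds(pmd_t *a, pmd_t va,
                                        pmd_t *b, pmd_t vb)
{
        struct mmu_update u;

        preempt_disable();
        xen_mc_batch();

        u.ptr = arbitrary_virt_to_machine(a).maddr;
        u.val = pmd_val_ma(va);
        extend_mmu_update(&u);

        u.ptr = arbitrary_virt_to_machine(b).maddr;
        u.val = pmd_val_ma(vb);
        extend_mmu_update(&u);          /* usually extends the same entry */

        xen_mc_issue(PARAVIRT_LAZY_MMU);
        preempt_enable();
}
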
 257void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 258{
 259        struct mmu_update u;
 260
 261        preempt_disable();
 262
 263        xen_mc_batch();
 264
 265        /* ptr may be ioremapped for 64-bit pagetable setup */
 266        u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 267        u.val = pmd_val_ma(val);
 268        extend_mmu_update(&u);
 269
 270        xen_mc_issue(PARAVIRT_LAZY_MMU);
 271
 272        preempt_enable();
 273}
 274
 275void xen_set_pmd(pmd_t *ptr, pmd_t val)
 276{
 277        /* If page is not pinned, we can just update the entry
 278           directly */
 279        if (!page_pinned(ptr)) {
 280                *ptr = val;
 281                return;
 282        }
 283
 284        xen_set_pmd_hyper(ptr, val);
 285}
 286
 287/*
 288 * Associate a virtual page frame with a given physical page frame
 289 * and protection flags for that frame.
 290 */
 291void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 292{
 293        set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 294}
 295
 296void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 297                    pte_t *ptep, pte_t pteval)
 298{
 299        /* updates to init_mm may be done without lock */
 300        if (mm == &init_mm)
 301                preempt_disable();
 302
 303        if (mm == current->mm || mm == &init_mm) {
 304                if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 305                        struct multicall_space mcs;
 306                        mcs = xen_mc_entry(0);
 307
 308                        MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
 309                        xen_mc_issue(PARAVIRT_LAZY_MMU);
 310                        goto out;
 311                } else
 312                        if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
 313                                goto out;
 314        }
 315        xen_set_pte(ptep, pteval);
 316
 317out:
 318        if (mm == &init_mm)
 319                preempt_enable();
 320}
 321
 322pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 323{
 324        /* Just return the pte as-is.  We preserve the bits on commit */
 325        return *ptep;
 326}
 327
 328void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 329                                 pte_t *ptep, pte_t pte)
 330{
 331        struct mmu_update u;
 332
 333        xen_mc_batch();
 334
 335        u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 336        u.val = pte_val_ma(pte);
 337        extend_mmu_update(&u);
 338
 339        xen_mc_issue(PARAVIRT_LAZY_MMU);
 340}
 341
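/*
 * Illustrative sketch, not part of the original file: the intended use
 * of the start/commit pair above (the generic ptep_modify_prot_*
 * helpers drive it this way).  Because the commit is applied with
 * MMU_PT_UPDATE_PRESERVE_AD, Accessed/Dirty bits the hardware sets
 * between start and commit are not lost.  example_change_prot() is a
 * hypothetical name.
 */
static inline void example_change_prot(struct mm_struct *mm,
                                       unsigned long addr, pte_t *ptep,
                                       pgprot_t newprot)
{
        pte_t pte = xen_ptep_modify_prot_start(mm, addr, ptep);

        pte = pte_modify(pte, newprot);
        xen_ptep_modify_prot_commit(mm, addr, ptep, pte);
}
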
 342/* Assume pteval_t is equivalent to all the other *val_t types. */
 343static pteval_t pte_mfn_to_pfn(pteval_t val)
 344{
 345        if (val & _PAGE_PRESENT) {
 346                unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 347                pteval_t flags = val & PTE_FLAGS_MASK;
 348                val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 349        }
 350
 351        return val;
 352}
 353
 354static pteval_t pte_pfn_to_mfn(pteval_t val)
 355{
 356        if (val & _PAGE_PRESENT) {
 357                unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 358                pteval_t flags = val & PTE_FLAGS_MASK;
 359                val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 360        }
 361
 362        return val;
 363}
 364
 365pteval_t xen_pte_val(pte_t pte)
 366{
 367        return pte_mfn_to_pfn(pte.pte);
 368}
 369
 370pgdval_t xen_pgd_val(pgd_t pgd)
 371{
 372        return pte_mfn_to_pfn(pgd.pgd);
 373}
 374
 375pte_t xen_make_pte(pteval_t pte)
 376{
 377        pte = pte_pfn_to_mfn(pte);
 378        return native_make_pte(pte);
 379}
 380
 381pgd_t xen_make_pgd(pgdval_t pgd)
 382{
 383        pgd = pte_pfn_to_mfn(pgd);
 384        return native_make_pgd(pgd);
 385}
 386
 387pmdval_t xen_pmd_val(pmd_t pmd)
 388{
 389        return pte_mfn_to_pfn(pmd.pmd);
 390}
 391
 392void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 393{
 394        struct mmu_update u;
 395
 396        preempt_disable();
 397
 398        xen_mc_batch();
 399
 400        /* ptr may be ioremapped for 64-bit pagetable setup */
 401        u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 402        u.val = pud_val_ma(val);
 403        extend_mmu_update(&u);
 404
 405        xen_mc_issue(PARAVIRT_LAZY_MMU);
 406
 407        preempt_enable();
 408}
 409
 410void xen_set_pud(pud_t *ptr, pud_t val)
 411{
 412        /* If page is not pinned, we can just update the entry
 413           directly */
 414        if (!page_pinned(ptr)) {
 415                *ptr = val;
 416                return;
 417        }
 418
 419        xen_set_pud_hyper(ptr, val);
 420}
 421
 422void xen_set_pte(pte_t *ptep, pte_t pte)
 423{
 424#ifdef CONFIG_X86_PAE
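        /*
         * Write the high word before the low word: the present bit is
         * in the low word, so when bringing an entry from not-present
         * to present the high half is complete before the pte becomes
         * visible.  xen_pte_clear() below does the reverse, clearing
         * the low (present) word first.
         */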
 425        ptep->pte_high = pte.pte_high;
 426        smp_wmb();
 427        ptep->pte_low = pte.pte_low;
 428#else
 429        *ptep = pte;
 430#endif
 431}
 432
 433#ifdef CONFIG_X86_PAE
 434void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 435{
 436        set_64bit((u64 *)ptep, native_pte_val(pte));
 437}
 438
 439void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 440{
 441        ptep->pte_low = 0;
 442        smp_wmb();              /* make sure low gets written first */
 443        ptep->pte_high = 0;
 444}
 445
 446void xen_pmd_clear(pmd_t *pmdp)
 447{
 448        set_pmd(pmdp, __pmd(0));
 449}
 450#endif  /* CONFIG_X86_PAE */
 451
 452pmd_t xen_make_pmd(pmdval_t pmd)
 453{
 454        pmd = pte_pfn_to_mfn(pmd);
 455        return native_make_pmd(pmd);
 456}
 457
 458#if PAGETABLE_LEVELS == 4
 459pudval_t xen_pud_val(pud_t pud)
 460{
 461        return pte_mfn_to_pfn(pud.pud);
 462}
 463
 464pud_t xen_make_pud(pudval_t pud)
 465{
 466        pud = pte_pfn_to_mfn(pud);
 467
 468        return native_make_pud(pud);
 469}
 470
 471pgd_t *xen_get_user_pgd(pgd_t *pgd)
 472{
 473        pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 474        unsigned offset = pgd - pgd_page;
 475        pgd_t *user_ptr = NULL;
 476
 477        if (offset < pgd_index(USER_LIMIT)) {
 478                struct page *page = virt_to_page(pgd_page);
 479                user_ptr = (pgd_t *)page->private;
 480                if (user_ptr)
 481                        user_ptr += offset;
 482        }
 483
 484        return user_ptr;
 485}
 486
 487static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 488{
 489        struct mmu_update u;
 490
 491        u.ptr = virt_to_machine(ptr).maddr;
 492        u.val = pgd_val_ma(val);
 493        extend_mmu_update(&u);
 494}
 495
 496/*
 497 * Raw hypercall-based set_pgd, intended for use in early boot before
 498 * there's a page structure.  This implies:
 499 *  1. The only existing pagetable is the kernel's
 500 *  2. It is always pinned
 501 *  3. It has no user pagetable attached to it
 502 */
 503void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 504{
 505        preempt_disable();
 506
 507        xen_mc_batch();
 508
 509        __xen_set_pgd_hyper(ptr, val);
 510
 511        xen_mc_issue(PARAVIRT_LAZY_MMU);
 512
 513        preempt_enable();
 514}
 515
 516void xen_set_pgd(pgd_t *ptr, pgd_t val)
 517{
 518        pgd_t *user_ptr = xen_get_user_pgd(ptr);
 519
 520        /* If page is not pinned, we can just update the entry
 521           directly */
 522        if (!page_pinned(ptr)) {
 523                *ptr = val;
 524                if (user_ptr) {
 525                        WARN_ON(page_pinned(user_ptr));
 526                        *user_ptr = val;
 527                }
 528                return;
 529        }
 530
 531        /* If it's pinned, then we can at least batch the kernel and
 532           user updates together. */
 533        xen_mc_batch();
 534
 535        __xen_set_pgd_hyper(ptr, val);
 536        if (user_ptr)
 537                __xen_set_pgd_hyper(user_ptr, val);
 538
 539        xen_mc_issue(PARAVIRT_LAZY_MMU);
 540}
 541#endif  /* PAGETABLE_LEVELS == 4 */
 542
 543/*
 544 * (Yet another) pagetable walker.  This one is intended for pinning a
 545 * pagetable.  This means that it walks a pagetable and calls the
 546 * callback function on each page it finds making up the page table,
 547 * at every level.  It walks the entire pagetable, but it only bothers
 548 * pinning pte pages which are below limit.  In the normal case this
 549 * will be STACK_TOP_MAX, but at boot we need to pin up to
 550 * FIXADDR_TOP.
 551 *
 552 * For 32-bit the important thing is that we don't pin beyond the limit,
 553 * because then we start getting into Xen's ptes.
 554 *
 555 * For 64-bit, we must skip the Xen hole in the middle of the address
 556 * space, just after the big x86-64 virtual hole.
 557 */
 558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 559                    unsigned long limit)
 560{
 561        int flush = 0;
 562        unsigned hole_low, hole_high;
 563        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 564        unsigned pgdidx, pudidx, pmdidx;
 565
 566        /* The limit is the last byte to be touched */
 567        limit--;
 568        BUG_ON(limit >= FIXADDR_TOP);
 569
 570        if (xen_feature(XENFEAT_auto_translated_physmap))
 571                return 0;
 572
 573        /*
 574         * 64-bit has a great big hole in the middle of the address
 575         * space, which contains the Xen mappings.  On 32-bit these
 576 * will end up making a zero-sized hole, so this is a no-op.
 577         */
 578        hole_low = pgd_index(USER_LIMIT);
 579        hole_high = pgd_index(PAGE_OFFSET);
 580
 581        pgdidx_limit = pgd_index(limit);
 582#if PTRS_PER_PUD > 1
 583        pudidx_limit = pud_index(limit);
 584#else
 585        pudidx_limit = 0;
 586#endif
 587#if PTRS_PER_PMD > 1
 588        pmdidx_limit = pmd_index(limit);
 589#else
 590        pmdidx_limit = 0;
 591#endif
 592
 593        flush |= (*func)(virt_to_page(pgd), PT_PGD);
 594
 595        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 596                pud_t *pud;
 597
 598                if (pgdidx >= hole_low && pgdidx < hole_high)
 599                        continue;
 600
 601                if (!pgd_val(pgd[pgdidx]))
 602                        continue;
 603
 604                pud = pud_offset(&pgd[pgdidx], 0);
 605
 606                if (PTRS_PER_PUD > 1) /* not folded */
 607                        flush |= (*func)(virt_to_page(pud), PT_PUD);
 608
 609                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 610                        pmd_t *pmd;
 611
 612                        if (pgdidx == pgdidx_limit &&
 613                            pudidx > pudidx_limit)
 614                                goto out;
 615
 616                        if (pud_none(pud[pudidx]))
 617                                continue;
 618
 619                        pmd = pmd_offset(&pud[pudidx], 0);
 620
 621                        if (PTRS_PER_PMD > 1) /* not folded */
 622                                flush |= (*func)(virt_to_page(pmd), PT_PMD);
 623
 624                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 625                                struct page *pte;
 626
 627                                if (pgdidx == pgdidx_limit &&
 628                                    pudidx == pudidx_limit &&
 629                                    pmdidx > pmdidx_limit)
 630                                        goto out;
 631
 632                                if (pmd_none(pmd[pmdidx]))
 633                                        continue;
 634
 635                                pte = pmd_page(pmd[pmdidx]);
 636                                flush |= (*func)(pte, PT_PTE);
 637                        }
 638                }
 639        }
 640out:
 641
 642        return flush;
 643}
 644
 645static spinlock_t *lock_pte(struct page *page)
 646{
 647        spinlock_t *ptl = NULL;
 648
 649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
 650        ptl = __pte_lockptr(page);
 651        spin_lock(ptl);
 652#endif
 653
 654        return ptl;
 655}
 656
 657static void do_unlock(void *v)
 658{
 659        spinlock_t *ptl = v;
 660        spin_unlock(ptl);
 661}
 662
 663static void xen_do_pin(unsigned level, unsigned long pfn)
 664{
 665        struct mmuext_op *op;
 666        struct multicall_space mcs;
 667
 668        mcs = __xen_mc_entry(sizeof(*op));
 669        op = mcs.args;
 670        op->cmd = level;
 671        op->arg1.mfn = pfn_to_mfn(pfn);
 672        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 673}
 674
 675static int pin_page(struct page *page, enum pt_level level)
 676{
 677        unsigned pgfl = TestSetPagePinned(page);
 678        int flush;
 679
 680        if (pgfl)
 681                flush = 0;              /* already pinned */
 682        else if (PageHighMem(page))
 683                /* kmaps need flushing if we found an unpinned
 684                   highpage */
 685                flush = 1;
 686        else {
 687                void *pt = lowmem_page_address(page);
 688                unsigned long pfn = page_to_pfn(page);
 689                struct multicall_space mcs = __xen_mc_entry(0);
 690                spinlock_t *ptl;
 691
 692                flush = 0;
 693
 694                ptl = NULL;
 695                if (level == PT_PTE)
 696                        ptl = lock_pte(page);
 697
 698                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 699                                        pfn_pte(pfn, PAGE_KERNEL_RO),
 700                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 701
 702                if (level == PT_PTE)
 703                        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 704
 705                if (ptl) {
 706                        /* Queue a deferred unlock for when this batch
 707                           is completed. */
 708                        xen_mc_callback(do_unlock, ptl);
 709                }
 710        }
 711
 712        return flush;
 713}
 714
 715/* This is called just after a mm has been created, but it has not
 716   been used yet.  We need to make sure that its pagetable is all
 717   read-only, and can be pinned. */
 718void xen_pgd_pin(pgd_t *pgd)
 719{
 720        xen_mc_batch();
 721
 722        if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
 723                /* re-enable interrupts for kmap_flush_unused */
 724                xen_mc_issue(0);
 725                kmap_flush_unused();
 726                xen_mc_batch();
 727        }
 728
 729#ifdef CONFIG_X86_64
 730        {
 731                pgd_t *user_pgd = xen_get_user_pgd(pgd);
 732
 733                xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 734
 735                if (user_pgd) {
 736                        pin_page(virt_to_page(user_pgd), PT_PGD);
 737                        xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
 738                }
 739        }
 740#else /* CONFIG_X86_32 */
 741#ifdef CONFIG_X86_PAE
 742        /* Need to make sure unshared kernel PMD is pinnable */
 743        pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
 744#endif
 745        xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 746#endif /* CONFIG_X86_64 */
 747        xen_mc_issue(0);
 748}
 749
 750/*
 751 * On save, we need to pin all pagetables to make sure they get their
 752 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 753 * them (unpinned pgds are not currently in use, probably because the
 754 * process is under construction or destruction).
 755 */
 756void xen_mm_pin_all(void)
 757{
 758        unsigned long flags;
 759        struct page *page;
 760
 761        spin_lock_irqsave(&pgd_lock, flags);
 762
 763        list_for_each_entry(page, &pgd_list, lru) {
 764                if (!PagePinned(page)) {
 765                        xen_pgd_pin((pgd_t *)page_address(page));
 766                        SetPageSavePinned(page);
 767                }
 768        }
 769
 770        spin_unlock_irqrestore(&pgd_lock, flags);
 771}
 772
 773/*
 774 * The init_mm pagetable is really pinned as soon as it's created, but
 775 * that's before we have page structures to store the bits.  So do all
 776 * the book-keeping now.
 777 */
 778static __init int mark_pinned(struct page *page, enum pt_level level)
 779{
 780        SetPagePinned(page);
 781        return 0;
 782}
 783
 784void __init xen_mark_init_mm_pinned(void)
 785{
 786        pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 787}
 788
 789static int unpin_page(struct page *page, enum pt_level level)
 790{
 791        unsigned pgfl = TestClearPagePinned(page);
 792
 793        if (pgfl && !PageHighMem(page)) {
 794                void *pt = lowmem_page_address(page);
 795                unsigned long pfn = page_to_pfn(page);
 796                spinlock_t *ptl = NULL;
 797                struct multicall_space mcs;
 798
 799                if (level == PT_PTE) {
 800                        ptl = lock_pte(page);
 801
 802                        xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 803                }
 804
 805                mcs = __xen_mc_entry(0);
 806
 807                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 808                                        pfn_pte(pfn, PAGE_KERNEL),
 809                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 810
 811                if (ptl) {
 812                        /* unlock when batch completed */
 813                        xen_mc_callback(do_unlock, ptl);
 814                }
 815        }
 816
 817        return 0;               /* never need to flush on unpin */
 818}
 819
 820/* Release a pagetable's pages back as normal RW */
 821static void xen_pgd_unpin(pgd_t *pgd)
 822{
 823        xen_mc_batch();
 824
 825        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 826
 827#ifdef CONFIG_X86_64
 828        {
 829                pgd_t *user_pgd = xen_get_user_pgd(pgd);
 830
 831                if (user_pgd) {
 832                        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
 833                        unpin_page(virt_to_page(user_pgd), PT_PGD);
 834                }
 835        }
 836#endif
 837
 838#ifdef CONFIG_X86_PAE
 839        /* Need to make sure unshared kernel PMD is unpinned */
 840        unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
 841#endif
 842
 843        pgd_walk(pgd, unpin_page, USER_LIMIT);
 844
 845        xen_mc_issue(0);
 846}
 847
 848/*
 849 * On resume, undo any pinning done at save, so that the rest of the
 850 * kernel doesn't see any unexpected pinned pagetables.
 851 */
 852void xen_mm_unpin_all(void)
 853{
 854        unsigned long flags;
 855        struct page *page;
 856
 857        spin_lock_irqsave(&pgd_lock, flags);
 858
 859        list_for_each_entry(page, &pgd_list, lru) {
 860                if (PageSavePinned(page)) {
 861                        BUG_ON(!PagePinned(page));
 862                        xen_pgd_unpin((pgd_t *)page_address(page));
 863                        ClearPageSavePinned(page);
 864                }
 865        }
 866
 867        spin_unlock_irqrestore(&pgd_lock, flags);
 868}
 869
 870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 871{
 872        spin_lock(&next->page_table_lock);
 873        xen_pgd_pin(next->pgd);
 874        spin_unlock(&next->page_table_lock);
 875}
 876
 877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 878{
 879        spin_lock(&mm->page_table_lock);
 880        xen_pgd_pin(mm->pgd);
 881        spin_unlock(&mm->page_table_lock);
 882}
 883
 884
 885#ifdef CONFIG_SMP
 886/* Another cpu may still have its %cr3 pointing at the pagetable, so
 887   we need to repoint it somewhere else before we can unpin it. */
 888static void drop_other_mm_ref(void *info)
 889{
 890        struct mm_struct *mm = info;
 891        struct mm_struct *active_mm;
 892
 893#ifdef CONFIG_X86_64
 894        active_mm = read_pda(active_mm);
 895#else
 896        active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
 897#endif
 898
 899        if (active_mm == mm)
 900                leave_mm(smp_processor_id());
 901
 902        /* If this cpu still has a stale cr3 reference, then make sure
 903           it has been flushed. */
 904        if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
 905                load_cr3(swapper_pg_dir);
 906                arch_flush_lazy_cpu_mode();
 907        }
 908}
 909
 910static void drop_mm_ref(struct mm_struct *mm)
 911{
 912        cpumask_t mask;
 913        unsigned cpu;
 914
 915        if (current->active_mm == mm) {
 916                if (current->mm == mm)
 917                        load_cr3(swapper_pg_dir);
 918                else
 919                        leave_mm(smp_processor_id());
 920                arch_flush_lazy_cpu_mode();
 921        }
 922
 923        /* Get the "official" set of cpus referring to our pagetable. */
 924        mask = mm->cpu_vm_mask;
 925
 926        /* It's possible that a vcpu may have a stale reference to our
 927           cr3, because it's in lazy mode and hasn't yet flushed
 928           its set of pending hypercalls.  In this case, we can
 929           look at its actual current cr3 value, and force it to flush
 930           if needed. */
 931        for_each_online_cpu(cpu) {
 932                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 933                        cpu_set(cpu, mask);
 934        }
 935
 936        if (!cpus_empty(mask))
 937                smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 938}
 939#else
 940static void drop_mm_ref(struct mm_struct *mm)
 941{
 942        if (current->active_mm == mm)
 943                load_cr3(swapper_pg_dir);
 944}
 945#endif
 946
 947/*
 948 * While a process runs, Xen pins its pagetables, which means that the
 949 * hypervisor forces them to be read-only and controls all updates
 950 * to them.  This means that all pagetable updates have to go via the
 951 * hypervisor, which is moderately expensive.
 952 *
 953 * Since we're pulling the pagetable down, we switch to init_mm,
 954 * unpin the old process's pagetable and mark it all read-write, which
 955 * allows further operations on it to be simple memory accesses.
 956 *
 957 * The only subtle point is that another CPU may still be using the
 958 * pagetable because of lazy tlb flushing.  This means we need to
 959 * switch all CPUs off this pagetable before we can unpin it.
 960 */
 961void xen_exit_mmap(struct mm_struct *mm)
 962{
 963        get_cpu();              /* make sure we don't move around */
 964        drop_mm_ref(mm);
 965        put_cpu();
 966
 967        spin_lock(&mm->page_table_lock);
 968
 969        /* pgd may not be pinned in the error exit path of execve */
 970        if (page_pinned(mm->pgd))
 971                xen_pgd_unpin(mm->pgd);
 972
 973        spin_unlock(&mm->page_table_lock);
 974}
 975