linux/mm/rmap.c
<<
>>
Prefs
   1/*
   2 * mm/rmap.c - physical to virtual reverse mappings
   3 *
   4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5 * Released under the General Public License (GPL).
   6 *
   7 * Simple, low overhead reverse mapping scheme.
   8 * Please try to keep this thing as modular as possible.
   9 *
  10 * Provides methods for unmapping each kind of mapped page:
  11 * the anon methods track anonymous pages, and
  12 * the file methods track pages belonging to an inode.
  13 *
  14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18 */
  19
  20/*
  21 * Lock ordering in mm:
  22 *
  23 * inode->i_mutex       (while writing or truncating, not reading or faulting)
  24 *   inode->i_alloc_sem (vmtruncate_range)
  25 *   mm->mmap_sem
  26 *     page->flags PG_locked (lock_page)
  27 *       mapping->i_mmap_lock
  28 *         anon_vma->lock
  29 *           mm->page_table_lock or pte_lock
  30 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  31 *             swap_lock (in swap_duplicate, swap_info_get)
  32 *               mmlist_lock (in mmput, drain_mmlist and others)
  33 *               mapping->private_lock (in __set_page_dirty_buffers)
  34 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
  35 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
  36 *                 mapping->tree_lock (widely used, in set_page_dirty,
  37 *                           in arch-dependent flush_dcache_mmap_lock,
  38 *                           within inode_lock in __sync_single_inode)
  39 */
  40
  41#include <linux/mm.h>
  42#include <linux/pagemap.h>
  43#include <linux/swap.h>
  44#include <linux/swapops.h>
  45#include <linux/slab.h>
  46#include <linux/init.h>
  47#include <linux/rmap.h>
  48#include <linux/rcupdate.h>
  49#include <linux/module.h>
  50#include <linux/kallsyms.h>
  51
  52#include <asm/tlbflush.h>
  53
  54struct kmem_cache *anon_vma_cachep;
  55
  56/* This must be called under the mmap_sem. */
  57int anon_vma_prepare(struct vm_area_struct *vma)
  58{
  59        struct anon_vma *anon_vma = vma->anon_vma;
  60
  61        might_sleep();
  62        if (unlikely(!anon_vma)) {
  63                struct mm_struct *mm = vma->vm_mm;
  64                struct anon_vma *allocated, *locked;
  65
  66                anon_vma = find_mergeable_anon_vma(vma);
  67                if (anon_vma) {
  68                        allocated = NULL;
  69                        locked = anon_vma;
  70                        spin_lock(&locked->lock);
  71                } else {
  72                        anon_vma = anon_vma_alloc();
  73                        if (unlikely(!anon_vma))
  74                                return -ENOMEM;
  75                        allocated = anon_vma;
  76                        locked = NULL;
  77                }
  78
  79                /* page_table_lock to protect against threads */
  80                spin_lock(&mm->page_table_lock);
  81                if (likely(!vma->anon_vma)) {
  82                        vma->anon_vma = anon_vma;
  83                        list_add_tail(&vma->anon_vma_node, &anon_vma->head);
  84                        allocated = NULL;
  85                }
  86                spin_unlock(&mm->page_table_lock);
  87
  88                if (locked)
  89                        spin_unlock(&locked->lock);
  90                if (unlikely(allocated))
  91                        anon_vma_free(allocated);
  92        }
  93        return 0;
  94}
  95
  96void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  97{
  98        BUG_ON(vma->anon_vma != next->anon_vma);
  99        list_del(&next->anon_vma_node);
 100}
 101
 102void __anon_vma_link(struct vm_area_struct *vma)
 103{
 104        struct anon_vma *anon_vma = vma->anon_vma;
 105
 106        if (anon_vma)
 107                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 108}
 109
 110void anon_vma_link(struct vm_area_struct *vma)
 111{
 112        struct anon_vma *anon_vma = vma->anon_vma;
 113
 114        if (anon_vma) {
 115                spin_lock(&anon_vma->lock);
 116                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 117                spin_unlock(&anon_vma->lock);
 118        }
 119}
 120
 121void anon_vma_unlink(struct vm_area_struct *vma)
 122{
 123        struct anon_vma *anon_vma = vma->anon_vma;
 124        int empty;
 125
 126        if (!anon_vma)
 127                return;
 128
 129        spin_lock(&anon_vma->lock);
 130        list_del(&vma->anon_vma_node);
 131
 132        /* We must garbage collect the anon_vma if it's empty */
 133        empty = list_empty(&anon_vma->head);
 134        spin_unlock(&anon_vma->lock);
 135
 136        if (empty)
 137                anon_vma_free(anon_vma);
 138}
 139
 140static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
 141                          unsigned long flags)
 142{
 143        struct anon_vma *anon_vma = data;
 144
 145        spin_lock_init(&anon_vma->lock);
 146        INIT_LIST_HEAD(&anon_vma->head);
 147}
 148
 149void __init anon_vma_init(void)
 150{
 151        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
 152                        0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
 153}
 154
 155/*
 156 * Getting a lock on a stable anon_vma from a page off the LRU is
 157 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
 158 */
 159static struct anon_vma *page_lock_anon_vma(struct page *page)
 160{
 161        struct anon_vma *anon_vma;
 162        unsigned long anon_mapping;
 163
 164        rcu_read_lock();
 165        anon_mapping = (unsigned long) page->mapping;
 166        if (!(anon_mapping & PAGE_MAPPING_ANON))
 167                goto out;
 168        if (!page_mapped(page))
 169                goto out;
 170
 171        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 172        spin_lock(&anon_vma->lock);
 173        return anon_vma;
 174out:
 175        rcu_read_unlock();
 176        return NULL;
 177}
 178
 179static void page_unlock_anon_vma(struct anon_vma *anon_vma)
 180{
 181        spin_unlock(&anon_vma->lock);
 182        rcu_read_unlock();
 183}
 184
 185/*
 186 * At what user virtual address is page expected in vma?
 187 */
 188static inline unsigned long
 189vma_address(struct page *page, struct vm_area_struct *vma)
 190{
 191        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 192        unsigned long address;
 193
 194        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 195        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 196                /* page should be within any vma from prio_tree_next */
 197                BUG_ON(!PageAnon(page));
 198                return -EFAULT;
 199        }
 200        return address;
 201}
 202
 203/*
 204 * At what user virtual address is page expected in vma? checking that the
 205 * page matches the vma: currently only used on anon pages, by unuse_vma;
 206 */
 207unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 208{
 209        if (PageAnon(page)) {
 210                if ((void *)vma->anon_vma !=
 211                    (void *)page->mapping - PAGE_MAPPING_ANON)
 212                        return -EFAULT;
 213        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
 214                if (!vma->vm_file ||
 215                    vma->vm_file->f_mapping != page->mapping)
 216                        return -EFAULT;
 217        } else
 218                return -EFAULT;
 219        return vma_address(page, vma);
 220}
 221
 222/*
 223 * Check that @page is mapped at @address into @mm.
 224 *
 225 * On success returns with pte mapped and locked.
 226 */
 227pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 228                          unsigned long address, spinlock_t **ptlp)
 229{
 230        pgd_t *pgd;
 231        pud_t *pud;
 232        pmd_t *pmd;
 233        pte_t *pte;
 234        spinlock_t *ptl;
 235
 236        pgd = pgd_offset(mm, address);
 237        if (!pgd_present(*pgd))
 238                return NULL;
 239
 240        pud = pud_offset(pgd, address);
 241        if (!pud_present(*pud))
 242                return NULL;
 243
 244        pmd = pmd_offset(pud, address);
 245        if (!pmd_present(*pmd))
 246                return NULL;
 247
 248        pte = pte_offset_map(pmd, address);
 249        /* Make a quick check before getting the lock */
 250        if (!pte_present(*pte)) {
 251                pte_unmap(pte);
 252                return NULL;
 253        }
 254
 255        ptl = pte_lockptr(mm, pmd);
 256        spin_lock(ptl);
 257        if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 258                *ptlp = ptl;
 259                return pte;
 260        }
 261        pte_unmap_unlock(pte, ptl);
 262        return NULL;
 263}
 264
 265/*
 266 * Subfunctions of page_referenced: page_referenced_one called
 267 * repeatedly from either page_referenced_anon or page_referenced_file.
 268 */
 269static int page_referenced_one(struct page *page,
 270        struct vm_area_struct *vma, unsigned int *mapcount)
 271{
 272        struct mm_struct *mm = vma->vm_mm;
 273        unsigned long address;
 274        pte_t *pte;
 275        spinlock_t *ptl;
 276        int referenced = 0;
 277
 278        address = vma_address(page, vma);
 279        if (address == -EFAULT)
 280                goto out;
 281
 282        pte = page_check_address(page, mm, address, &ptl);
 283        if (!pte)
 284                goto out;
 285
 286        if (ptep_clear_flush_young(vma, address, pte))
 287                referenced++;
 288
 289        /* Pretend the page is referenced if the task has the
 290           swap token and is in the middle of a page fault. */
 291        if (mm != current->mm && has_swap_token(mm) &&
 292                        rwsem_is_locked(&mm->mmap_sem))
 293                referenced++;
 294
 295        (*mapcount)--;
 296        pte_unmap_unlock(pte, ptl);
 297out:
 298        return referenced;
 299}
 300
 301static int page_referenced_anon(struct page *page)
 302{
 303        unsigned int mapcount;
 304        struct anon_vma *anon_vma;
 305        struct vm_area_struct *vma;
 306        int referenced = 0;
 307
 308        anon_vma = page_lock_anon_vma(page);
 309        if (!anon_vma)
 310                return referenced;
 311
 312        mapcount = page_mapcount(page);
 313        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 314                referenced += page_referenced_one(page, vma, &mapcount);
 315                if (!mapcount)
 316                        break;
 317        }
 318
 319        page_unlock_anon_vma(anon_vma);
 320        return referenced;
 321}
 322
 323/**
 324 * page_referenced_file - referenced check for object-based rmap
 325 * @page: the page we're checking references on.
 326 *
 327 * For an object-based mapped page, find all the places it is mapped and
 328 * check/clear the referenced flag.  This is done by following the page->mapping
 329 * pointer, then walking the chain of vmas it holds.  It returns the number
 330 * of references it found.
 331 *
 332 * This function is only called from page_referenced for object-based pages.
 333 */
 334static int page_referenced_file(struct page *page)
 335{
 336        unsigned int mapcount;
 337        struct address_space *mapping = page->mapping;
 338        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 339        struct vm_area_struct *vma;
 340        struct prio_tree_iter iter;
 341        int referenced = 0;
 342
 343        /*
 344         * The caller's checks on page->mapping and !PageAnon have made
 345         * sure that this is a file page: the check for page->mapping
 346         * excludes the case just before it gets set on an anon page.
 347         */
 348        BUG_ON(PageAnon(page));
 349
 350        /*
 351         * The page lock not only makes sure that page->mapping cannot
 352         * suddenly be NULLified by truncation, it makes sure that the
 353         * structure at mapping cannot be freed and reused yet,
 354         * so we can safely take mapping->i_mmap_lock.
 355         */
 356        BUG_ON(!PageLocked(page));
 357
 358        spin_lock(&mapping->i_mmap_lock);
 359
 360        /*
 361         * i_mmap_lock does not stabilize mapcount at all, but mapcount
 362         * is more likely to be accurate if we note it after spinning.
 363         */
 364        mapcount = page_mapcount(page);
 365
 366        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 367                if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 368                                  == (VM_LOCKED|VM_MAYSHARE)) {
 369                        referenced++;
 370                        break;
 371                }
 372                referenced += page_referenced_one(page, vma, &mapcount);
 373                if (!mapcount)
 374                        break;
 375        }
 376
 377        spin_unlock(&mapping->i_mmap_lock);
 378        return referenced;
 379}
 380
 381/**
 382 * page_referenced - test if the page was referenced
 383 * @page: the page to test
 384 * @is_locked: caller holds lock on the page
 385 *
 386 * Quick test_and_clear_referenced for all mappings to a page,
 387 * returns the number of ptes which referenced the page.
 388 */
 389int page_referenced(struct page *page, int is_locked)
 390{
 391        int referenced = 0;
 392
 393        if (page_test_and_clear_young(page))
 394                referenced++;
 395
 396        if (TestClearPageReferenced(page))
 397                referenced++;
 398
 399        if (page_mapped(page) && page->mapping) {
 400                if (PageAnon(page))
 401                        referenced += page_referenced_anon(page);
 402                else if (is_locked)
 403                        referenced += page_referenced_file(page);
 404                else if (TestSetPageLocked(page))
 405                        referenced++;
 406                else {
 407                        if (page->mapping)
 408                                referenced += page_referenced_file(page);
 409                        unlock_page(page);
 410                }
 411        }
 412        return referenced;
 413}
 414
 415static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 416{
 417        struct mm_struct *mm = vma->vm_mm;
 418        unsigned long address;
 419        pte_t *pte;
 420        spinlock_t *ptl;
 421        int ret = 0;
 422
 423        address = vma_address(page, vma);
 424        if (address == -EFAULT)
 425                goto out;
 426
 427        pte = page_check_address(page, mm, address, &ptl);
 428        if (!pte)
 429                goto out;
 430
 431        if (pte_dirty(*pte) || pte_write(*pte)) {
 432                pte_t entry;
 433
 434                flush_cache_page(vma, address, pte_pfn(*pte));
 435                entry = ptep_clear_flush(vma, address, pte);
 436                entry = pte_wrprotect(entry);
 437                entry = pte_mkclean(entry);
 438                set_pte_at(mm, address, pte, entry);
 439                lazy_mmu_prot_update(entry);
 440                ret = 1;
 441        }
 442
 443        pte_unmap_unlock(pte, ptl);
 444out:
 445        return ret;
 446}
 447
 448static int page_mkclean_file(struct address_space *mapping, struct page *page)
 449{
 450        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 451        struct vm_area_struct *vma;
 452        struct prio_tree_iter iter;
 453        int ret = 0;
 454
 455        BUG_ON(PageAnon(page));
 456
 457        spin_lock(&mapping->i_mmap_lock);
 458        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 459                if (vma->vm_flags & VM_SHARED)
 460                        ret += page_mkclean_one(page, vma);
 461        }
 462        spin_unlock(&mapping->i_mmap_lock);
 463        return ret;
 464}
 465
 466int page_mkclean(struct page *page)
 467{
 468        int ret = 0;
 469
 470        BUG_ON(!PageLocked(page));
 471
 472        if (page_mapped(page)) {
 473                struct address_space *mapping = page_mapping(page);
 474                if (mapping)
 475                        ret = page_mkclean_file(mapping, page);
 476                if (page_test_dirty(page)) {
 477                        page_clear_dirty(page);
 478                        ret = 1;
 479                }
 480        }
 481
 482        return ret;
 483}
 484EXPORT_SYMBOL_GPL(page_mkclean);
 485
 486/**
 487 * page_set_anon_rmap - setup new anonymous rmap
 488 * @page:       the page to add the mapping to
 489 * @vma:        the vm area in which the mapping is added
 490 * @address:    the user virtual address mapped
 491 */
 492static void __page_set_anon_rmap(struct page *page,
 493        struct vm_area_struct *vma, unsigned long address)
 494{
 495        struct anon_vma *anon_vma = vma->anon_vma;
 496
 497        BUG_ON(!anon_vma);
 498        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 499        page->mapping = (struct address_space *) anon_vma;
 500
 501        page->index = linear_page_index(vma, address);
 502
 503        /*
 504         * nr_mapped state can be updated without turning off
 505         * interrupts because it is not modified via interrupt.
 506         */
 507        __inc_zone_page_state(page, NR_ANON_PAGES);
 508}
 509
 510/**
 511 * page_set_anon_rmap - sanity check anonymous rmap addition
 512 * @page:       the page to add the mapping to
 513 * @vma:        the vm area in which the mapping is added
 514 * @address:    the user virtual address mapped
 515 */
 516static void __page_check_anon_rmap(struct page *page,
 517        struct vm_area_struct *vma, unsigned long address)
 518{
 519#ifdef CONFIG_DEBUG_VM
 520        /*
 521         * The page's anon-rmap details (mapping and index) are guaranteed to
 522         * be set up correctly at this point.
 523         *
 524         * We have exclusion against page_add_anon_rmap because the caller
 525         * always holds the page locked, except if called from page_dup_rmap,
 526         * in which case the page is already known to be setup.
 527         *
 528         * We have exclusion against page_add_new_anon_rmap because those pages
 529         * are initially only visible via the pagetables, and the pte is locked
 530         * over the call to page_add_new_anon_rmap.
 531         */
 532        struct anon_vma *anon_vma = vma->anon_vma;
 533        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 534        BUG_ON(page->mapping != (struct address_space *)anon_vma);
 535        BUG_ON(page->index != linear_page_index(vma, address));
 536#endif
 537}
 538
 539/**
 540 * page_add_anon_rmap - add pte mapping to an anonymous page
 541 * @page:       the page to add the mapping to
 542 * @vma:        the vm area in which the mapping is added
 543 * @address:    the user virtual address mapped
 544 *
 545 * The caller needs to hold the pte lock and the page must be locked.
 546 */
 547void page_add_anon_rmap(struct page *page,
 548        struct vm_area_struct *vma, unsigned long address)
 549{
 550        VM_BUG_ON(!PageLocked(page));
 551        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 552        if (atomic_inc_and_test(&page->_mapcount))
 553                __page_set_anon_rmap(page, vma, address);
 554        else
 555                __page_check_anon_rmap(page, vma, address);
 556}
 557
 558/*
 559 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 560 * @page:       the page to add the mapping to
 561 * @vma:        the vm area in which the mapping is added
 562 * @address:    the user virtual address mapped
 563 *
 564 * Same as page_add_anon_rmap but must only be called on *new* pages.
 565 * This means the inc-and-test can be bypassed.
 566 * Page does not have to be locked.
 567 */
 568void page_add_new_anon_rmap(struct page *page,
 569        struct vm_area_struct *vma, unsigned long address)
 570{
 571        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 572        atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
 573        __page_set_anon_rmap(page, vma, address);
 574}
 575
 576/**
 577 * page_add_file_rmap - add pte mapping to a file page
 578 * @page: the page to add the mapping to
 579 *
 580 * The caller needs to hold the pte lock.
 581 */
 582void page_add_file_rmap(struct page *page)
 583{
 584        if (atomic_inc_and_test(&page->_mapcount))
 585                __inc_zone_page_state(page, NR_FILE_MAPPED);
 586}
 587
 588#ifdef CONFIG_DEBUG_VM
 589/**
 590 * page_dup_rmap - duplicate pte mapping to a page
 591 * @page:       the page to add the mapping to
 592 *
 593 * For copy_page_range only: minimal extract from page_add_file_rmap /
 594 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 595 * quicker.
 596 *
 597 * The caller needs to hold the pte lock.
 598 */
 599void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
 600{
 601        BUG_ON(page_mapcount(page) == 0);
 602        if (PageAnon(page))
 603                __page_check_anon_rmap(page, vma, address);
 604        atomic_inc(&page->_mapcount);
 605}
 606#endif
 607
 608/**
 609 * page_remove_rmap - take down pte mapping from a page
 610 * @page: page to remove mapping from
 611 *
 612 * The caller needs to hold the pte lock.
 613 */
 614void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 615{
 616        if (atomic_add_negative(-1, &page->_mapcount)) {
 617                if (unlikely(page_mapcount(page) < 0)) {
 618                        printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
 619                        printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
 620                        printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 621                        printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 622                        printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
 623                        print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
 624                        if (vma->vm_ops)
 625                                print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
 626                        if (vma->vm_file && vma->vm_file->f_op)
 627                                print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 628                        BUG();
 629                }
 630
 631                /*
 632                 * It would be tidy to reset the PageAnon mapping here,
 633                 * but that might overwrite a racing page_add_anon_rmap
 634                 * which increments mapcount after us but sets mapping
 635                 * before us: so leave the reset to free_hot_cold_page,
 636                 * and remember that it's only reliable while mapped.
 637                 * Leaving it set also helps swapoff to reinstate ptes
 638                 * faster for those pages still in swapcache.
 639                 */
 640                if (page_test_dirty(page)) {
 641                        page_clear_dirty(page);
 642                        set_page_dirty(page);
 643                }
 644                __dec_zone_page_state(page,
 645                                PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 646        }
 647}
 648
 649/*
 650 * Subfunctions of try_to_unmap: try_to_unmap_one called
 651 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 652 */
 653static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 654                                int migration)
 655{
 656        struct mm_struct *mm = vma->vm_mm;
 657        unsigned long address;
 658        pte_t *pte;
 659        pte_t pteval;
 660        spinlock_t *ptl;
 661        int ret = SWAP_AGAIN;
 662
 663        address = vma_address(page, vma);
 664        if (address == -EFAULT)
 665                goto out;
 666
 667        pte = page_check_address(page, mm, address, &ptl);
 668        if (!pte)
 669                goto out;
 670
 671        /*
 672         * If the page is mlock()d, we cannot swap it out.
 673         * If it's recently referenced (perhaps page_referenced
 674         * skipped over this mm) then we should reactivate it.
 675         */
 676        if (!migration && ((vma->vm_flags & VM_LOCKED) ||
 677                        (ptep_clear_flush_young(vma, address, pte)))) {
 678                ret = SWAP_FAIL;
 679                goto out_unmap;
 680        }
 681
 682        /* Nuke the page table entry. */
 683        flush_cache_page(vma, address, page_to_pfn(page));
 684        pteval = ptep_clear_flush(vma, address, pte);
 685
 686        /* Move the dirty bit to the physical page now the pte is gone. */
 687        if (pte_dirty(pteval))
 688                set_page_dirty(page);
 689
 690        /* Update high watermark before we lower rss */
 691        update_hiwater_rss(mm);
 692
 693        if (PageAnon(page)) {
 694                swp_entry_t entry = { .val = page_private(page) };
 695
 696                if (PageSwapCache(page)) {
 697                        /*
 698                         * Store the swap location in the pte.
 699                         * See handle_pte_fault() ...
 700                         */
 701                        swap_duplicate(entry);
 702                        if (list_empty(&mm->mmlist)) {
 703                                spin_lock(&mmlist_lock);
 704                                if (list_empty(&mm->mmlist))
 705                                        list_add(&mm->mmlist, &init_mm.mmlist);
 706                                spin_unlock(&mmlist_lock);
 707                        }
 708                        dec_mm_counter(mm, anon_rss);
 709#ifdef CONFIG_MIGRATION
 710                } else {
 711                        /*
 712                         * Store the pfn of the page in a special migration
 713                         * pte. do_swap_page() will wait until the migration
 714                         * pte is removed and then restart fault handling.
 715                         */
 716                        BUG_ON(!migration);
 717                        entry = make_migration_entry(page, pte_write(pteval));
 718#endif
 719                }
 720                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 721                BUG_ON(pte_file(*pte));
 722        } else
 723#ifdef CONFIG_MIGRATION
 724        if (migration) {
 725                /* Establish migration entry for a file page */
 726                swp_entry_t entry;
 727                entry = make_migration_entry(page, pte_write(pteval));
 728                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 729        } else
 730#endif
 731                dec_mm_counter(mm, file_rss);
 732
 733
 734        page_remove_rmap(page, vma);
 735        page_cache_release(page);
 736
 737out_unmap:
 738        pte_unmap_unlock(pte, ptl);
 739out:
 740        return ret;
 741}
 742
 743/*
 744 * objrmap doesn't work for nonlinear VMAs because the assumption that
 745 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 746 * Consequently, given a particular page and its ->index, we cannot locate the
 747 * ptes which are mapping that page without an exhaustive linear search.
 748 *
 749 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 750 * maps the file to which the target page belongs.  The ->vm_private_data field
 751 * holds the current cursor into that scan.  Successive searches will circulate
 752 * around the vma's virtual address space.
 753 *
 754 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 755 * more scanning pressure is placed against them as well.   Eventually pages
 756 * will become fully unmapped and are eligible for eviction.
 757 *
 758 * For very sparsely populated VMAs this is a little inefficient - chances are
 759 * there there won't be many ptes located within the scan cluster.  In this case
 760 * maybe we could scan further - to the end of the pte page, perhaps.
 761 */
 762#define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
 763#define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 764
 765static void try_to_unmap_cluster(unsigned long cursor,
 766        unsigned int *mapcount, struct vm_area_struct *vma)
 767{
 768        struct mm_struct *mm = vma->vm_mm;
 769        pgd_t *pgd;
 770        pud_t *pud;
 771        pmd_t *pmd;
 772        pte_t *pte;
 773        pte_t pteval;
 774        spinlock_t *ptl;
 775        struct page *page;
 776        unsigned long address;
 777        unsigned long end;
 778
 779        address = (vma->vm_start + cursor) & CLUSTER_MASK;
 780        end = address + CLUSTER_SIZE;
 781        if (address < vma->vm_start)
 782                address = vma->vm_start;
 783        if (end > vma->vm_end)
 784                end = vma->vm_end;
 785
 786        pgd = pgd_offset(mm, address);
 787        if (!pgd_present(*pgd))
 788                return;
 789
 790        pud = pud_offset(pgd, address);
 791        if (!pud_present(*pud))
 792                return;
 793
 794        pmd = pmd_offset(pud, address);
 795        if (!pmd_present(*pmd))
 796                return;
 797
 798        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 799
 800        /* Update high watermark before we lower rss */
 801        update_hiwater_rss(mm);
 802
 803        for (; address < end; pte++, address += PAGE_SIZE) {
 804                if (!pte_present(*pte))
 805                        continue;
 806                page = vm_normal_page(vma, address, *pte);
 807                BUG_ON(!page || PageAnon(page));
 808
 809                if (ptep_clear_flush_young(vma, address, pte))
 810                        continue;
 811
 812                /* Nuke the page table entry. */
 813                flush_cache_page(vma, address, pte_pfn(*pte));
 814                pteval = ptep_clear_flush(vma, address, pte);
 815
 816                /* If nonlinear, store the file page offset in the pte. */
 817                if (page->index != linear_page_index(vma, address))
 818                        set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
 819
 820                /* Move the dirty bit to the physical page now the pte is gone. */
 821                if (pte_dirty(pteval))
 822                        set_page_dirty(page);
 823
 824                page_remove_rmap(page, vma);
 825                page_cache_release(page);
 826                dec_mm_counter(mm, file_rss);
 827                (*mapcount)--;
 828        }
 829        pte_unmap_unlock(pte - 1, ptl);
 830}
 831
 832static int try_to_unmap_anon(struct page *page, int migration)
 833{
 834        struct anon_vma *anon_vma;
 835        struct vm_area_struct *vma;
 836        int ret = SWAP_AGAIN;
 837
 838        anon_vma = page_lock_anon_vma(page);
 839        if (!anon_vma)
 840                return ret;
 841
 842        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 843                ret = try_to_unmap_one(page, vma, migration);
 844                if (ret == SWAP_FAIL || !page_mapped(page))
 845                        break;
 846        }
 847
 848        page_unlock_anon_vma(anon_vma);
 849        return ret;
 850}
 851
 852/**
 853 * try_to_unmap_file - unmap file page using the object-based rmap method
 854 * @page: the page to unmap
 855 *
 856 * Find all the mappings of a page using the mapping pointer and the vma chains
 857 * contained in the address_space struct it points to.
 858 *
 859 * This function is only called from try_to_unmap for object-based pages.
 860 */
 861static int try_to_unmap_file(struct page *page, int migration)
 862{
 863        struct address_space *mapping = page->mapping;
 864        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 865        struct vm_area_struct *vma;
 866        struct prio_tree_iter iter;
 867        int ret = SWAP_AGAIN;
 868        unsigned long cursor;
 869        unsigned long max_nl_cursor = 0;
 870        unsigned long max_nl_size = 0;
 871        unsigned int mapcount;
 872
 873        spin_lock(&mapping->i_mmap_lock);
 874        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 875                ret = try_to_unmap_one(page, vma, migration);
 876                if (ret == SWAP_FAIL || !page_mapped(page))
 877                        goto out;
 878        }
 879
 880        if (list_empty(&mapping->i_mmap_nonlinear))
 881                goto out;
 882
 883        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 884                                                shared.vm_set.list) {
 885                if ((vma->vm_flags & VM_LOCKED) && !migration)
 886                        continue;
 887                cursor = (unsigned long) vma->vm_private_data;
 888                if (cursor > max_nl_cursor)
 889                        max_nl_cursor = cursor;
 890                cursor = vma->vm_end - vma->vm_start;
 891                if (cursor > max_nl_size)
 892                        max_nl_size = cursor;
 893        }
 894
 895        if (max_nl_size == 0) { /* any nonlinears locked or reserved */
 896                ret = SWAP_FAIL;
 897                goto out;
 898        }
 899
 900        /*
 901         * We don't try to search for this page in the nonlinear vmas,
 902         * and page_referenced wouldn't have found it anyway.  Instead
 903         * just walk the nonlinear vmas trying to age and unmap some.
 904         * The mapcount of the page we came in with is irrelevant,
 905         * but even so use it as a guide to how hard we should try?
 906         */
 907        mapcount = page_mapcount(page);
 908        if (!mapcount)
 909                goto out;
 910        cond_resched_lock(&mapping->i_mmap_lock);
 911
 912        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 913        if (max_nl_cursor == 0)
 914                max_nl_cursor = CLUSTER_SIZE;
 915
 916        do {
 917                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 918                                                shared.vm_set.list) {
 919                        if ((vma->vm_flags & VM_LOCKED) && !migration)
 920                                continue;
 921                        cursor = (unsigned long) vma->vm_private_data;
 922                        while ( cursor < max_nl_cursor &&
 923                                cursor < vma->vm_end - vma->vm_start) {
 924                                try_to_unmap_cluster(cursor, &mapcount, vma);
 925                                cursor += CLUSTER_SIZE;
 926                                vma->vm_private_data = (void *) cursor;
 927                                if ((int)mapcount <= 0)
 928                                        goto out;
 929                        }
 930                        vma->vm_private_data = (void *) max_nl_cursor;
 931                }
 932                cond_resched_lock(&mapping->i_mmap_lock);
 933                max_nl_cursor += CLUSTER_SIZE;
 934        } while (max_nl_cursor <= max_nl_size);
 935
 936        /*
 937         * Don't loop forever (perhaps all the remaining pages are
 938         * in locked vmas).  Reset cursor on all unreserved nonlinear
 939         * vmas, now forgetting on which ones it had fallen behind.
 940         */
 941        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 942                vma->vm_private_data = NULL;
 943out:
 944        spin_unlock(&mapping->i_mmap_lock);
 945        return ret;
 946}
 947
 948/**
 949 * try_to_unmap - try to remove all page table mappings to a page
 950 * @page: the page to get unmapped
 951 *
 952 * Tries to remove all the page table entries which are mapping this
 953 * page, used in the pageout path.  Caller must hold the page lock.
 954 * Return values are:
 955 *
 956 * SWAP_SUCCESS - we succeeded in removing all mappings
 957 * SWAP_AGAIN   - we missed a mapping, try again later
 958 * SWAP_FAIL    - the page is unswappable
 959 */
 960int try_to_unmap(struct page *page, int migration)
 961{
 962        int ret;
 963
 964        BUG_ON(!PageLocked(page));
 965
 966        if (PageAnon(page))
 967                ret = try_to_unmap_anon(page, migration);
 968        else
 969                ret = try_to_unmap_file(page, migration);
 970
 971        if (!page_mapped(page))
 972                ret = SWAP_SUCCESS;
 973        return ret;
 974}
 975
 976
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.