linux/mm/migrate.c
<<
>>
Prefs
   1/*
   2 * Memory Migration functionality - linux/mm/migrate.c
   3 *
   4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5 *
   6 * Page migration was first developed in the context of the memory hotplug
   7 * project. The main authors of the migration code are:
   8 *
   9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10 * Hirokazu Takahashi <taka@valinux.co.jp>
  11 * Dave Hansen <haveblue@us.ibm.com>
  12 * Christoph Lameter
  13 */
  14
  15#include <linux/migrate.h>
  16#include <linux/module.h>
  17#include <linux/swap.h>
  18#include <linux/swapops.h>
  19#include <linux/pagemap.h>
  20#include <linux/buffer_head.h>
  21#include <linux/mm_inline.h>
  22#include <linux/nsproxy.h>
  23#include <linux/pagevec.h>
  24#include <linux/ksm.h>
  25#include <linux/rmap.h>
  26#include <linux/topology.h>
  27#include <linux/cpu.h>
  28#include <linux/cpuset.h>
  29#include <linux/writeback.h>
  30#include <linux/mempolicy.h>
  31#include <linux/vmalloc.h>
  32#include <linux/security.h>
  33#include <linux/memcontrol.h>
  34#include <linux/syscalls.h>
  35#include <linux/hugetlb.h>
  36#include <linux/gfp.h>
  37
  38#include <asm/tlbflush.h>
  39
  40#include "internal.h"
  41
  42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  43
  44/*
  45 * migrate_prep() needs to be called before we start compiling a list of pages
  46 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
  47 * undesirable, use migrate_prep_local()
  48 */
  49int migrate_prep(void)
  50{
  51        /*
  52         * Clear the LRU lists so pages can be isolated.
  53         * Note that pages may be moved off the LRU after we have
  54         * drained them. Those pages will fail to migrate like other
  55         * pages that may be busy.
  56         */
  57        lru_add_drain_all();
  58
  59        return 0;
  60}
  61
  62/* Do the necessary work of migrate_prep but not if it involves other CPUs */
  63int migrate_prep_local(void)
  64{
  65        lru_add_drain();
  66
  67        return 0;
  68}
  69
  70/*
  71 * Add isolated pages on the list back to the LRU under page lock
  72 * to avoid leaking evictable pages back onto unevictable list.
  73 */
  74void putback_lru_pages(struct list_head *l)
  75{
  76        struct page *page;
  77        struct page *page2;
  78
  79        list_for_each_entry_safe(page, page2, l, lru) {
  80                list_del(&page->lru);
  81                dec_zone_page_state(page, NR_ISOLATED_ANON +
  82                                page_is_file_cache(page));
  83                putback_lru_page(page);
  84        }
  85}
  86
  87/*
  88 * Restore a potential migration pte to a working pte entry
  89 */
  90static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
  91                                 unsigned long addr, void *old)
  92{
  93        struct mm_struct *mm = vma->vm_mm;
  94        swp_entry_t entry;
  95        pgd_t *pgd;
  96        pud_t *pud;
  97        pmd_t *pmd;
  98        pte_t *ptep, pte;
  99        spinlock_t *ptl;
 100
 101        if (unlikely(PageHuge(new))) {
 102                ptep = huge_pte_offset(mm, addr);
 103                if (!ptep)
 104                        goto out;
 105                ptl = &mm->page_table_lock;
 106        } else {
 107                pgd = pgd_offset(mm, addr);
 108                if (!pgd_present(*pgd))
 109                        goto out;
 110
 111                pud = pud_offset(pgd, addr);
 112                if (!pud_present(*pud))
 113                        goto out;
 114
 115                pmd = pmd_offset(pud, addr);
 116                if (!pmd_present(*pmd))
 117                        goto out;
 118
 119                ptep = pte_offset_map(pmd, addr);
 120
 121                if (!is_swap_pte(*ptep)) {
 122                        pte_unmap(ptep);
 123                        goto out;
 124                }
 125
 126                ptl = pte_lockptr(mm, pmd);
 127        }
 128
 129        spin_lock(ptl);
 130        pte = *ptep;
 131        if (!is_swap_pte(pte))
 132                goto unlock;
 133
 134        entry = pte_to_swp_entry(pte);
 135
 136        if (!is_migration_entry(entry) ||
 137            migration_entry_to_page(entry) != old)
 138                goto unlock;
 139
 140        get_page(new);
 141        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 142        if (is_write_migration_entry(entry))
 143                pte = pte_mkwrite(pte);
 144#ifdef CONFIG_HUGETLB_PAGE
 145        if (PageHuge(new))
 146                pte = pte_mkhuge(pte);
 147#endif
 148        flush_cache_page(vma, addr, pte_pfn(pte));
 149        set_pte_at(mm, addr, ptep, pte);
 150
 151        if (PageHuge(new)) {
 152                if (PageAnon(new))
 153                        hugepage_add_anon_rmap(new, vma, addr);
 154                else
 155                        page_dup_rmap(new);
 156        } else if (PageAnon(new))
 157                page_add_anon_rmap(new, vma, addr);
 158        else
 159                page_add_file_rmap(new);
 160
 161        /* No need to invalidate - it was non-present before */
 162        update_mmu_cache(vma, addr, ptep);
 163unlock:
 164        pte_unmap_unlock(ptep, ptl);
 165out:
 166        return SWAP_AGAIN;
 167}
 168
 169/*
 170 * Get rid of all migration entries and replace them by
 171 * references to the indicated page.
 172 */
 173static void remove_migration_ptes(struct page *old, struct page *new)
 174{
 175        rmap_walk(new, remove_migration_pte, old);
 176}
 177
 178/*
 179 * Something used the pte of a page under migration. We need to
 180 * get to the page and wait until migration is finished.
 181 * When we return from this function the fault will be retried.
 182 *
 183 * This function is called from do_swap_page().
 184 */
 185void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 186                                unsigned long address)
 187{
 188        pte_t *ptep, pte;
 189        spinlock_t *ptl;
 190        swp_entry_t entry;
 191        struct page *page;
 192
 193        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 194        pte = *ptep;
 195        if (!is_swap_pte(pte))
 196                goto out;
 197
 198        entry = pte_to_swp_entry(pte);
 199        if (!is_migration_entry(entry))
 200                goto out;
 201
 202        page = migration_entry_to_page(entry);
 203
 204        /*
 205         * Once radix-tree replacement of page migration started, page_count
 206         * *must* be zero. And, we don't want to call wait_on_page_locked()
 207         * against a page without get_page().
 208         * So, we use get_page_unless_zero(), here. Even failed, page fault
 209         * will occur again.
 210         */
 211        if (!get_page_unless_zero(page))
 212                goto out;
 213        pte_unmap_unlock(ptep, ptl);
 214        wait_on_page_locked(page);
 215        put_page(page);
 216        return;
 217out:
 218        pte_unmap_unlock(ptep, ptl);
 219}
 220
 221/*
 222 * Replace the page in the mapping.
 223 *
 224 * The number of remaining references must be:
 225 * 1 for anonymous pages without a mapping
 226 * 2 for pages with a mapping
 227 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 228 */
 229static int migrate_page_move_mapping(struct address_space *mapping,
 230                struct page *newpage, struct page *page)
 231{
 232        int expected_count;
 233        void **pslot;
 234
 235        if (!mapping) {
 236                /* Anonymous page without mapping */
 237                if (page_count(page) != 1)
 238                        return -EAGAIN;
 239                return 0;
 240        }
 241
 242        spin_lock_irq(&mapping->tree_lock);
 243
 244        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 245                                        page_index(page));
 246
 247        expected_count = 2 + page_has_private(page);
 248        if (page_count(page) != expected_count ||
 249                radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 250                spin_unlock_irq(&mapping->tree_lock);
 251                return -EAGAIN;
 252        }
 253
 254        if (!page_freeze_refs(page, expected_count)) {
 255                spin_unlock_irq(&mapping->tree_lock);
 256                return -EAGAIN;
 257        }
 258
 259        /*
 260         * Now we know that no one else is looking at the page.
 261         */
 262        get_page(newpage);      /* add cache reference */
 263        if (PageSwapCache(page)) {
 264                SetPageSwapCache(newpage);
 265                set_page_private(newpage, page_private(page));
 266        }
 267
 268        radix_tree_replace_slot(pslot, newpage);
 269
 270        page_unfreeze_refs(page, expected_count);
 271        /*
 272         * Drop cache reference from old page.
 273         * We know this isn't the last reference.
 274         */
 275        __put_page(page);
 276
 277        /*
 278         * If moved to a different zone then also account
 279         * the page for that zone. Other VM counters will be
 280         * taken care of when we establish references to the
 281         * new page and drop references to the old page.
 282         *
 283         * Note that anonymous pages are accounted for
 284         * via NR_FILE_PAGES and NR_ANON_PAGES if they
 285         * are mapped to swap space.
 286         */
 287        __dec_zone_page_state(page, NR_FILE_PAGES);
 288        __inc_zone_page_state(newpage, NR_FILE_PAGES);
 289        if (PageSwapBacked(page)) {
 290                __dec_zone_page_state(page, NR_SHMEM);
 291                __inc_zone_page_state(newpage, NR_SHMEM);
 292        }
 293        spin_unlock_irq(&mapping->tree_lock);
 294
 295        return 0;
 296}
 297
 298/*
 299 * The expected number of remaining references is the same as that
 300 * of migrate_page_move_mapping().
 301 */
 302int migrate_huge_page_move_mapping(struct address_space *mapping,
 303                                   struct page *newpage, struct page *page)
 304{
 305        int expected_count;
 306        void **pslot;
 307
 308        if (!mapping) {
 309                if (page_count(page) != 1)
 310                        return -EAGAIN;
 311                return 0;
 312        }
 313
 314        spin_lock_irq(&mapping->tree_lock);
 315
 316        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 317                                        page_index(page));
 318
 319        expected_count = 2 + page_has_private(page);
 320        if (page_count(page) != expected_count ||
 321                radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 322                spin_unlock_irq(&mapping->tree_lock);
 323                return -EAGAIN;
 324        }
 325
 326        if (!page_freeze_refs(page, expected_count)) {
 327                spin_unlock_irq(&mapping->tree_lock);
 328                return -EAGAIN;
 329        }
 330
 331        get_page(newpage);
 332
 333        radix_tree_replace_slot(pslot, newpage);
 334
 335        page_unfreeze_refs(page, expected_count);
 336
 337        __put_page(page);
 338
 339        spin_unlock_irq(&mapping->tree_lock);
 340        return 0;
 341}
 342
 343/*
 344 * Copy the page to its new location
 345 */
 346void migrate_page_copy(struct page *newpage, struct page *page)
 347{
 348        if (PageHuge(page))
 349                copy_huge_page(newpage, page);
 350        else
 351                copy_highpage(newpage, page);
 352
 353        if (PageError(page))
 354                SetPageError(newpage);
 355        if (PageReferenced(page))
 356                SetPageReferenced(newpage);
 357        if (PageUptodate(page))
 358                SetPageUptodate(newpage);
 359        if (TestClearPageActive(page)) {
 360                VM_BUG_ON(PageUnevictable(page));
 361                SetPageActive(newpage);
 362        } else if (TestClearPageUnevictable(page))
 363                SetPageUnevictable(newpage);
 364        if (PageChecked(page))
 365                SetPageChecked(newpage);
 366        if (PageMappedToDisk(page))
 367                SetPageMappedToDisk(newpage);
 368
 369        if (PageDirty(page)) {
 370                clear_page_dirty_for_io(page);
 371                /*
 372                 * Want to mark the page and the radix tree as dirty, and
 373                 * redo the accounting that clear_page_dirty_for_io undid,
 374                 * but we can't use set_page_dirty because that function
 375                 * is actually a signal that all of the page has become dirty.
 376                 * Wheras only part of our page may be dirty.
 377                 */
 378                __set_page_dirty_nobuffers(newpage);
 379        }
 380
 381        mlock_migrate_page(newpage, page);
 382        ksm_migrate_page(newpage, page);
 383
 384        ClearPageSwapCache(page);
 385        ClearPagePrivate(page);
 386        set_page_private(page, 0);
 387        page->mapping = NULL;
 388
 389        /*
 390         * If any waiters have accumulated on the new page then
 391         * wake them up.
 392         */
 393        if (PageWriteback(newpage))
 394                end_page_writeback(newpage);
 395}
 396
 397/************************************************************
 398 *                    Migration functions
 399 ***********************************************************/
 400
 401/* Always fail migration. Used for mappings that are not movable */
 402int fail_migrate_page(struct address_space *mapping,
 403                        struct page *newpage, struct page *page)
 404{
 405        return -EIO;
 406}
 407EXPORT_SYMBOL(fail_migrate_page);
 408
 409/*
 410 * Common logic to directly migrate a single page suitable for
 411 * pages that do not use PagePrivate/PagePrivate2.
 412 *
 413 * Pages are locked upon entry and exit.
 414 */
 415int migrate_page(struct address_space *mapping,
 416                struct page *newpage, struct page *page)
 417{
 418        int rc;
 419
 420        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 421
 422        rc = migrate_page_move_mapping(mapping, newpage, page);
 423
 424        if (rc)
 425                return rc;
 426
 427        migrate_page_copy(newpage, page);
 428        return 0;
 429}
 430EXPORT_SYMBOL(migrate_page);
 431
 432#ifdef CONFIG_BLOCK
 433/*
 434 * Migration function for pages with buffers. This function can only be used
 435 * if the underlying filesystem guarantees that no other references to "page"
 436 * exist.
 437 */
 438int buffer_migrate_page(struct address_space *mapping,
 439                struct page *newpage, struct page *page)
 440{
 441        struct buffer_head *bh, *head;
 442        int rc;
 443
 444        if (!page_has_buffers(page))
 445                return migrate_page(mapping, newpage, page);
 446
 447        head = page_buffers(page);
 448
 449        rc = migrate_page_move_mapping(mapping, newpage, page);
 450
 451        if (rc)
 452                return rc;
 453
 454        bh = head;
 455        do {
 456                get_bh(bh);
 457                lock_buffer(bh);
 458                bh = bh->b_this_page;
 459
 460        } while (bh != head);
 461
 462        ClearPagePrivate(page);
 463        set_page_private(newpage, page_private(page));
 464        set_page_private(page, 0);
 465        put_page(page);
 466        get_page(newpage);
 467
 468        bh = head;
 469        do {
 470                set_bh_page(bh, newpage, bh_offset(bh));
 471                bh = bh->b_this_page;
 472
 473        } while (bh != head);
 474
 475        SetPagePrivate(newpage);
 476
 477        migrate_page_copy(newpage, page);
 478
 479        bh = head;
 480        do {
 481                unlock_buffer(bh);
 482                put_bh(bh);
 483                bh = bh->b_this_page;
 484
 485        } while (bh != head);
 486
 487        return 0;
 488}
 489EXPORT_SYMBOL(buffer_migrate_page);
 490#endif
 491
 492/*
 493 * Writeback a page to clean the dirty state
 494 */
 495static int writeout(struct address_space *mapping, struct page *page)
 496{
 497        struct writeback_control wbc = {
 498                .sync_mode = WB_SYNC_NONE,
 499                .nr_to_write = 1,
 500                .range_start = 0,
 501                .range_end = LLONG_MAX,
 502                .for_reclaim = 1
 503        };
 504        int rc;
 505
 506        if (!mapping->a_ops->writepage)
 507                /* No write method for the address space */
 508                return -EINVAL;
 509
 510        if (!clear_page_dirty_for_io(page))
 511                /* Someone else already triggered a write */
 512                return -EAGAIN;
 513
 514        /*
 515         * A dirty page may imply that the underlying filesystem has
 516         * the page on some queue. So the page must be clean for
 517         * migration. Writeout may mean we loose the lock and the
 518         * page state is no longer what we checked for earlier.
 519         * At this point we know that the migration attempt cannot
 520         * be successful.
 521         */
 522        remove_migration_ptes(page, page);
 523
 524        rc = mapping->a_ops->writepage(page, &wbc);
 525
 526        if (rc != AOP_WRITEPAGE_ACTIVATE)
 527                /* unlocked. Relock */
 528                lock_page(page);
 529
 530        return (rc < 0) ? -EIO : -EAGAIN;
 531}
 532
 533/*
 534 * Default handling if a filesystem does not provide a migration function.
 535 */
 536static int fallback_migrate_page(struct address_space *mapping,
 537        struct page *newpage, struct page *page)
 538{
 539        if (PageDirty(page))
 540                return writeout(mapping, page);
 541
 542        /*
 543         * Buffers may be managed in a filesystem specific way.
 544         * We must have no buffers or drop them.
 545         */
 546        if (page_has_private(page) &&
 547            !try_to_release_page(page, GFP_KERNEL))
 548                return -EAGAIN;
 549
 550        return migrate_page(mapping, newpage, page);
 551}
 552
 553/*
 554 * Move a page to a newly allocated page
 555 * The page is locked and all ptes have been successfully removed.
 556 *
 557 * The new page will have replaced the old page if this function
 558 * is successful.
 559 *
 560 * Return value:
 561 *   < 0 - error code
 562 *  == 0 - success
 563 */
 564static int move_to_new_page(struct page *newpage, struct page *page,
 565                                                int remap_swapcache)
 566{
 567        struct address_space *mapping;
 568        int rc;
 569
 570        /*
 571         * Block others from accessing the page when we get around to
 572         * establishing additional references. We are the only one
 573         * holding a reference to the new page at this point.
 574         */
 575        if (!trylock_page(newpage))
 576                BUG();
 577
 578        /* Prepare mapping for the new page.*/
 579        newpage->index = page->index;
 580        newpage->mapping = page->mapping;
 581        if (PageSwapBacked(page))
 582                SetPageSwapBacked(newpage);
 583
 584        mapping = page_mapping(page);
 585        if (!mapping)
 586                rc = migrate_page(mapping, newpage, page);
 587        else if (mapping->a_ops->migratepage)
 588                /*
 589                 * Most pages have a mapping and most filesystems
 590                 * should provide a migration function. Anonymous
 591                 * pages are part of swap space which also has its
 592                 * own migration function. This is the most common
 593                 * path for page migration.
 594                 */
 595                rc = mapping->a_ops->migratepage(mapping,
 596                                                newpage, page);
 597        else
 598                rc = fallback_migrate_page(mapping, newpage, page);
 599
 600        if (rc) {
 601                newpage->mapping = NULL;
 602        } else {
 603                if (remap_swapcache)
 604                        remove_migration_ptes(page, newpage);
 605        }
 606
 607        unlock_page(newpage);
 608
 609        return rc;
 610}
 611
 612/*
 613 * Obtain the lock on page, remove all ptes and migrate the page
 614 * to the newly allocated page in newpage.
 615 */
 616static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 617                        struct page *page, int force, int offlining)
 618{
 619        int rc = 0;
 620        int *result = NULL;
 621        struct page *newpage = get_new_page(page, private, &result);
 622        int remap_swapcache = 1;
 623        int charge = 0;
 624        struct mem_cgroup *mem = NULL;
 625        struct anon_vma *anon_vma = NULL;
 626
 627        if (!newpage)
 628                return -ENOMEM;
 629
 630        if (page_count(page) == 1) {
 631                /* page was freed from under us. So we are done. */
 632                goto move_newpage;
 633        }
 634
 635        /* prepare cgroup just returns 0 or -ENOMEM */
 636        rc = -EAGAIN;
 637
 638        if (!trylock_page(page)) {
 639                if (!force)
 640                        goto move_newpage;
 641                lock_page(page);
 642        }
 643
 644        /*
 645         * Only memory hotplug's offline_pages() caller has locked out KSM,
 646         * and can safely migrate a KSM page.  The other cases have skipped
 647         * PageKsm along with PageReserved - but it is only now when we have
 648         * the page lock that we can be certain it will not go KSM beneath us
 649         * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
 650         * its pagecount raised, but only here do we take the page lock which
 651         * serializes that).
 652         */
 653        if (PageKsm(page) && !offlining) {
 654                rc = -EBUSY;
 655                goto unlock;
 656        }
 657
 658        /* charge against new page */
 659        charge = mem_cgroup_prepare_migration(page, newpage, &mem);
 660        if (charge == -ENOMEM) {
 661                rc = -ENOMEM;
 662                goto unlock;
 663        }
 664        BUG_ON(charge);
 665
 666        if (PageWriteback(page)) {
 667                if (!force)
 668                        goto uncharge;
 669                wait_on_page_writeback(page);
 670        }
 671        /*
 672         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 673         * we cannot notice that anon_vma is freed while we migrates a page.
 674         * This get_anon_vma() delays freeing anon_vma pointer until the end
 675         * of migration. File cache pages are no problem because of page_lock()
 676         * File Caches may use write_page() or lock_page() in migration, then,
 677         * just care Anon page here.
 678         */
 679        if (PageAnon(page)) {
 680                /*
 681                 * Only page_lock_anon_vma() understands the subtleties of
 682                 * getting a hold on an anon_vma from outside one of its mms.
 683                 */
 684                anon_vma = page_lock_anon_vma(page);
 685                if (anon_vma) {
 686                        /*
 687                         * Take a reference count on the anon_vma if the
 688                         * page is mapped so that it is guaranteed to
 689                         * exist when the page is remapped later
 690                         */
 691                        get_anon_vma(anon_vma);
 692                        page_unlock_anon_vma(anon_vma);
 693                } else if (PageSwapCache(page)) {
 694                        /*
 695                         * We cannot be sure that the anon_vma of an unmapped
 696                         * swapcache page is safe to use because we don't
 697                         * know in advance if the VMA that this page belonged
 698                         * to still exists. If the VMA and others sharing the
 699                         * data have been freed, then the anon_vma could
 700                         * already be invalid.
 701                         *
 702                         * To avoid this possibility, swapcache pages get
 703                         * migrated but are not remapped when migration
 704                         * completes
 705                         */
 706                        remap_swapcache = 0;
 707                } else {
 708                        goto uncharge;
 709                }
 710        }
 711
 712        /*
 713         * Corner case handling:
 714         * 1. When a new swap-cache page is read into, it is added to the LRU
 715         * and treated as swapcache but it has no rmap yet.
 716         * Calling try_to_unmap() against a page->mapping==NULL page will
 717         * trigger a BUG.  So handle it here.
 718         * 2. An orphaned page (see truncate_complete_page) might have
 719         * fs-private metadata. The page can be picked up due to memory
 720         * offlining.  Everywhere else except page reclaim, the page is
 721         * invisible to the vm, so the page can not be migrated.  So try to
 722         * free the metadata, so the page can be freed.
 723         */
 724        if (!page->mapping) {
 725                VM_BUG_ON(PageAnon(page));
 726                if (page_has_private(page)) {
 727                        try_to_free_buffers(page);
 728                        goto uncharge;
 729                }
 730                goto skip_unmap;
 731        }
 732
 733        /* Establish migration ptes or remove ptes */
 734        try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 735
 736skip_unmap:
 737        if (!page_mapped(page))
 738                rc = move_to_new_page(newpage, page, remap_swapcache);
 739
 740        if (rc && remap_swapcache)
 741                remove_migration_ptes(page, page);
 742
 743        /* Drop an anon_vma reference if we took one */
 744        if (anon_vma)
 745                drop_anon_vma(anon_vma);
 746
 747uncharge:
 748        if (!charge)
 749                mem_cgroup_end_migration(mem, page, newpage);
 750unlock:
 751        unlock_page(page);
 752
 753        if (rc != -EAGAIN) {
 754                /*
 755                 * A page that has been migrated has all references
 756                 * removed and will be freed. A page that has not been
 757                 * migrated will have kepts its references and be
 758                 * restored.
 759                 */
 760                list_del(&page->lru);
 761                dec_zone_page_state(page, NR_ISOLATED_ANON +
 762                                page_is_file_cache(page));
 763                putback_lru_page(page);
 764        }
 765
 766move_newpage:
 767
 768        /*
 769         * Move the new page to the LRU. If migration was not successful
 770         * then this will free the page.
 771         */
 772        putback_lru_page(newpage);
 773
 774        if (result) {
 775                if (rc)
 776                        *result = rc;
 777                else
 778                        *result = page_to_nid(newpage);
 779        }
 780        return rc;
 781}
 782
 783/*
 784 * Counterpart of unmap_and_move_page() for hugepage migration.
 785 *
 786 * This function doesn't wait the completion of hugepage I/O
 787 * because there is no race between I/O and migration for hugepage.
 788 * Note that currently hugepage I/O occurs only in direct I/O
 789 * where no lock is held and PG_writeback is irrelevant,
 790 * and writeback status of all subpages are counted in the reference
 791 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 792 * under direct I/O, the reference of the head page is 512 and a bit more.)
 793 * This means that when we try to migrate hugepage whose subpages are
 794 * doing direct I/O, some references remain after try_to_unmap() and
 795 * hugepage migration fails without data corruption.
 796 *
 797 * There is also no race when direct I/O is issued on the page under migration,
 798 * because then pte is replaced with migration swap entry and direct I/O code
 799 * will wait in the page fault for migration to complete.
 800 */
 801static int unmap_and_move_huge_page(new_page_t get_new_page,
 802                                unsigned long private, struct page *hpage,
 803                                int force, int offlining)
 804{
 805        int rc = 0;
 806        int *result = NULL;
 807        struct page *new_hpage = get_new_page(hpage, private, &result);
 808        struct anon_vma *anon_vma = NULL;
 809
 810        if (!new_hpage)
 811                return -ENOMEM;
 812
 813        rc = -EAGAIN;
 814
 815        if (!trylock_page(hpage)) {
 816                if (!force)
 817                        goto out;
 818                lock_page(hpage);
 819        }
 820
 821        if (PageAnon(hpage)) {
 822                anon_vma = page_lock_anon_vma(hpage);
 823                if (anon_vma) {
 824                        get_anon_vma(anon_vma);
 825                        page_unlock_anon_vma(anon_vma);
 826                }
 827        }
 828
 829        try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 830
 831        if (!page_mapped(hpage))
 832                rc = move_to_new_page(new_hpage, hpage, 1);
 833
 834        if (rc)
 835                remove_migration_ptes(hpage, hpage);
 836
 837        if (anon_vma)
 838                drop_anon_vma(anon_vma);
 839out:
 840        unlock_page(hpage);
 841
 842        if (rc != -EAGAIN) {
 843                list_del(&hpage->lru);
 844                put_page(hpage);
 845        }
 846
 847        put_page(new_hpage);
 848
 849        if (result) {
 850                if (rc)
 851                        *result = rc;
 852                else
 853                        *result = page_to_nid(new_hpage);
 854        }
 855        return rc;
 856}
 857
 858/*
 859 * migrate_pages
 860 *
 861 * The function takes one list of pages to migrate and a function
 862 * that determines from the page to be migrated and the private data
 863 * the target of the move and allocates the page.
 864 *
 865 * The function returns after 10 attempts or if no pages
 866 * are movable anymore because to has become empty
 867 * or no retryable pages exist anymore.
 868 * Caller should call putback_lru_pages to return pages to the LRU
 869 * or free list.
 870 *
 871 * Return: Number of pages not migrated or error code.
 872 */
 873int migrate_pages(struct list_head *from,
 874                new_page_t get_new_page, unsigned long private, int offlining)
 875{
 876        int retry = 1;
 877        int nr_failed = 0;
 878        int pass = 0;
 879        struct page *page;
 880        struct page *page2;
 881        int swapwrite = current->flags & PF_SWAPWRITE;
 882        int rc;
 883
 884        if (!swapwrite)
 885                current->flags |= PF_SWAPWRITE;
 886
 887        for(pass = 0; pass < 10 && retry; pass++) {
 888                retry = 0;
 889
 890                list_for_each_entry_safe(page, page2, from, lru) {
 891                        cond_resched();
 892
 893                        rc = unmap_and_move(get_new_page, private,
 894                                                page, pass > 2, offlining);
 895
 896                        switch(rc) {
 897                        case -ENOMEM:
 898                                goto out;
 899                        case -EAGAIN:
 900                                retry++;
 901                                break;
 902                        case 0:
 903                                break;
 904                        default:
 905                                /* Permanent failure */
 906                                nr_failed++;
 907                                break;
 908                        }
 909                }
 910        }
 911        rc = 0;
 912out:
 913        if (!swapwrite)
 914                current->flags &= ~PF_SWAPWRITE;
 915
 916        if (rc)
 917                return rc;
 918
 919        return nr_failed + retry;
 920}
 921
 922int migrate_huge_pages(struct list_head *from,
 923                new_page_t get_new_page, unsigned long private, int offlining)
 924{
 925        int retry = 1;
 926        int nr_failed = 0;
 927        int pass = 0;
 928        struct page *page;
 929        struct page *page2;
 930        int rc;
 931
 932        for (pass = 0; pass < 10 && retry; pass++) {
 933                retry = 0;
 934
 935                list_for_each_entry_safe(page, page2, from, lru) {
 936                        cond_resched();
 937
 938                        rc = unmap_and_move_huge_page(get_new_page,
 939                                        private, page, pass > 2, offlining);
 940
 941                        switch(rc) {
 942                        case -ENOMEM:
 943                                goto out;
 944                        case -EAGAIN:
 945                                retry++;
 946                                break;
 947                        case 0:
 948                                break;
 949                        default:
 950                                /* Permanent failure */
 951                                nr_failed++;
 952                                break;
 953                        }
 954                }
 955        }
 956        rc = 0;
 957out:
 958
 959        list_for_each_entry_safe(page, page2, from, lru)
 960                put_page(page);
 961
 962        if (rc)
 963                return rc;
 964
 965        return nr_failed + retry;
 966}
 967
 968#ifdef CONFIG_NUMA
 969/*
 970 * Move a list of individual pages
 971 */
 972struct page_to_node {
 973        unsigned long addr;
 974        struct page *page;
 975        int node;
 976        int status;
 977};
 978
 979static struct page *new_page_node(struct page *p, unsigned long private,
 980                int **result)
 981{
 982        struct page_to_node *pm = (struct page_to_node *)private;
 983
 984        while (pm->node != MAX_NUMNODES && pm->page != p)
 985                pm++;
 986
 987        if (pm->node == MAX_NUMNODES)
 988                return NULL;
 989
 990        *result = &pm->status;
 991
 992        return alloc_pages_exact_node(pm->node,
 993                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 994}
 995
 996/*
 997 * Move a set of pages as indicated in the pm array. The addr
 998 * field must be set to the virtual address of the page to be moved
 999 * and the node number must contain a valid target node.
1000 * The pm array ends with node = MAX_NUMNODES.
1001 */
1002static int do_move_page_to_node_array(struct mm_struct *mm,
1003                                      struct page_to_node *pm,
1004                                      int migrate_all)
1005{
1006        int err;
1007        struct page_to_node *pp;
1008        LIST_HEAD(pagelist);
1009
1010        down_read(&mm->mmap_sem);
1011
1012        /*
1013         * Build a list of pages to migrate
1014         */
1015        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1016                struct vm_area_struct *vma;
1017                struct page *page;
1018
1019                err = -EFAULT;
1020                vma = find_vma(mm, pp->addr);
1021                if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1022                        goto set_status;
1023
1024                page = follow_page(vma, pp->addr, FOLL_GET);
1025
1026                err = PTR_ERR(page);
1027                if (IS_ERR(page))
1028                        goto set_status;
1029
1030                err = -ENOENT;
1031                if (!page)
1032                        goto set_status;
1033
1034                /* Use PageReserved to check for zero page */
1035                if (PageReserved(page) || PageKsm(page))
1036                        goto put_and_set;
1037
1038                pp->page = page;
1039                err = page_to_nid(page);
1040
1041                if (err == pp->node)
1042                        /*
1043                         * Node already in the right place
1044                         */
1045                        goto put_and_set;
1046
1047                err = -EACCES;
1048                if (page_mapcount(page) > 1 &&
1049                                !migrate_all)
1050                        goto put_and_set;
1051
1052                err = isolate_lru_page(page);
1053                if (!err) {
1054                        list_add_tail(&page->lru, &pagelist);
1055                        inc_zone_page_state(page, NR_ISOLATED_ANON +
1056                                            page_is_file_cache(page));
1057                }
1058put_and_set:
1059                /*
1060                 * Either remove the duplicate refcount from
1061                 * isolate_lru_page() or drop the page ref if it was
1062                 * not isolated.
1063                 */
1064                put_page(page);
1065set_status:
1066                pp->status = err;
1067        }
1068
1069        err = 0;
1070        if (!list_empty(&pagelist)) {
1071                err = migrate_pages(&pagelist, new_page_node,
1072                                (unsigned long)pm, 0);
1073                if (err)
1074                        putback_lru_pages(&pagelist);
1075        }
1076
1077        up_read(&mm->mmap_sem);
1078        return err;
1079}
1080
1081/*
1082 * Migrate an array of page address onto an array of nodes and fill
1083 * the corresponding array of status.
1084 */
1085static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
1086                         unsigned long nr_pages,
1087                         const void __user * __user *pages,
1088                         const int __user *nodes,
1089                         int __user *status, int flags)
1090{
1091        struct page_to_node *pm;
1092        nodemask_t task_nodes;
1093        unsigned long chunk_nr_pages;
1094        unsigned long chunk_start;
1095        int err;
1096
1097        task_nodes = cpuset_mems_allowed(task);
1098
1099        err = -ENOMEM;
1100        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1101        if (!pm)
1102                goto out;
1103
1104        migrate_prep();
1105
1106        /*
1107         * Store a chunk of page_to_node array in a page,
1108         * but keep the last one as a marker
1109         */
1110        chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1111
1112        for (chunk_start = 0;
1113             chunk_start < nr_pages;
1114             chunk_start += chunk_nr_pages) {
1115                int j;
1116
1117                if (chunk_start + chunk_nr_pages > nr_pages)
1118                        chunk_nr_pages = nr_pages - chunk_start;
1119
1120                /* fill the chunk pm with addrs and nodes from user-space */
1121                for (j = 0; j < chunk_nr_pages; j++) {
1122                        const void __user *p;
1123                        int node;
1124
1125                        err = -EFAULT;
1126                        if (get_user(p, pages + j + chunk_start))
1127                                goto out_pm;
1128                        pm[j].addr = (unsigned long) p;
1129
1130                        if (get_user(node, nodes + j + chunk_start))
1131                                goto out_pm;
1132
1133                        err = -ENODEV;
1134                        if (node < 0 || node >= MAX_NUMNODES)
1135                                goto out_pm;
1136
1137                        if (!node_state(node, N_HIGH_MEMORY))
1138                                goto out_pm;
1139
1140                        err = -EACCES;
1141                        if (!node_isset(node, task_nodes))
1142                                goto out_pm;
1143
1144                        pm[j].node = node;
1145                }
1146
1147                /* End marker for this chunk */
1148                pm[chunk_nr_pages].node = MAX_NUMNODES;
1149
1150                /* Migrate this chunk */
1151                err = do_move_page_to_node_array(mm, pm,
1152                                                 flags & MPOL_MF_MOVE_ALL);
1153                if (err < 0)
1154                        goto out_pm;
1155
1156                /* Return status information */
1157                for (j = 0; j < chunk_nr_pages; j++)
1158                        if (put_user(pm[j].status, status + j + chunk_start)) {
1159                                err = -EFAULT;
1160                                goto out_pm;
1161                        }
1162        }
1163        err = 0;
1164
1165out_pm:
1166        free_page((unsigned long)pm);
1167out:
1168        return err;
1169}
1170
1171/*
1172 * Determine the nodes of an array of pages and store it in an array of status.
1173 */
1174static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1175                                const void __user **pages, int *status)
1176{
1177        unsigned long i;
1178
1179        down_read(&mm->mmap_sem);
1180
1181        for (i = 0; i < nr_pages; i++) {
1182                unsigned long addr = (unsigned long)(*pages);
1183                struct vm_area_struct *vma;
1184                struct page *page;
1185                int err = -EFAULT;
1186
1187                vma = find_vma(mm, addr);
1188                if (!vma || addr < vma->vm_start)
1189                        goto set_status;
1190
1191                page = follow_page(vma, addr, 0);
1192
1193                err = PTR_ERR(page);
1194                if (IS_ERR(page))
1195                        goto set_status;
1196
1197                err = -ENOENT;
1198                /* Use PageReserved to check for zero page */
1199                if (!page || PageReserved(page) || PageKsm(page))
1200                        goto set_status;
1201
1202                err = page_to_nid(page);
1203set_status:
1204                *status = err;
1205
1206                pages++;
1207                status++;
1208        }
1209
1210        up_read(&mm->mmap_sem);
1211}
1212
1213/*
1214 * Determine the nodes of a user array of pages and store it in
1215 * a user array of status.
1216 */
1217static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1218                         const void __user * __user *pages,
1219                         int __user *status)
1220{
1221#define DO_PAGES_STAT_CHUNK_NR 16
1222        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1223        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1224
1225        while (nr_pages) {
1226                unsigned long chunk_nr;
1227
1228                chunk_nr = nr_pages;
1229                if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1230                        chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1231
1232                if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1233                        break;
1234
1235                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1236
1237                if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1238                        break;
1239
1240                pages += chunk_nr;
1241                status += chunk_nr;
1242                nr_pages -= chunk_nr;
1243        }
1244        return nr_pages ? -EFAULT : 0;
1245}
1246
1247/*
1248 * Move a list of pages in the address space of the currently executing
1249 * process.
1250 */
1251SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1252                const void __user * __user *, pages,
1253                const int __user *, nodes,
1254                int __user *, status, int, flags)
1255{
1256        const struct cred *cred = current_cred(), *tcred;
1257        struct task_struct *task;
1258        struct mm_struct *mm;
1259        int err;
1260
1261        /* Check flags */
1262        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1263                return -EINVAL;
1264
1265        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1266                return -EPERM;
1267
1268        /* Find the mm_struct */
1269        read_lock(&tasklist_lock);
1270        task = pid ? find_task_by_vpid(pid) : current;
1271        if (!task) {
1272                read_unlock(&tasklist_lock);
1273                return -ESRCH;
1274        }
1275        mm = get_task_mm(task);
1276        read_unlock(&tasklist_lock);
1277
1278        if (!mm)
1279                return -EINVAL;
1280
1281        /*
1282         * Check if this process has the right to modify the specified
1283         * process. The right exists if the process has administrative
1284         * capabilities, superuser privileges or the same
1285         * userid as the target process.
1286         */
1287        rcu_read_lock();
1288        tcred = __task_cred(task);
1289        if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1290            cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1291            !capable(CAP_SYS_NICE)) {
1292                rcu_read_unlock();
1293                err = -EPERM;
1294                goto out;
1295        }
1296        rcu_read_unlock();
1297
1298        err = security_task_movememory(task);
1299        if (err)
1300                goto out;
1301
1302        if (nodes) {
1303                err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1304                                    flags);
1305        } else {
1306                err = do_pages_stat(mm, nr_pages, pages, status);
1307        }
1308
1309out:
1310        mmput(mm);
1311        return err;
1312}
1313
1314/*
1315 * Call migration functions in the vma_ops that may prepare
1316 * memory in a vm for migration. migration functions may perform
1317 * the migration for vmas that do not have an underlying page struct.
1318 */
1319int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1320        const nodemask_t *from, unsigned long flags)
1321{
1322        struct vm_area_struct *vma;
1323        int err = 0;
1324
1325        for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1326                if (vma->vm_ops && vma->vm_ops->migrate) {
1327                        err = vma->vm_ops->migrate(vma, to, from, flags);
1328                        if (err)
1329                                break;
1330                }
1331        }
1332        return err;
1333}
1334#endif
1335
/* lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995. */