linux/mm/mempolicy.c
<<
>>
Prefs
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case node -1 here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66   could replace all the switch()es with a mempolicy_ops structure.
  67*/
  68
  69#include <linux/mempolicy.h>
  70#include <linux/mm.h>
  71#include <linux/highmem.h>
  72#include <linux/hugetlb.h>
  73#include <linux/kernel.h>
  74#include <linux/sched.h>
  75#include <linux/mm.h>
  76#include <linux/nodemask.h>
  77#include <linux/cpuset.h>
  78#include <linux/gfp.h>
  79#include <linux/slab.h>
  80#include <linux/string.h>
  81#include <linux/module.h>
  82#include <linux/interrupt.h>
  83#include <linux/init.h>
  84#include <linux/compat.h>
  85#include <linux/mempolicy.h>
  86#include <linux/swap.h>
  87#include <linux/seq_file.h>
  88#include <linux/proc_fs.h>
  89#include <linux/migrate.h>
  90#include <linux/rmap.h>
  91#include <linux/security.h>
  92
  93#include <asm/tlbflush.h>
  94#include <asm/uaccess.h>
  95
 /*
  * Internal flags: private bits built on MPOL_MF_INTERNAL. They are OR-ed
  * into the caller-supplied flags before the range walkers in this file
  * (check_range() and the page-table helpers below it) are invoked.
  */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

 /* Slab cache that mpol_new() allocates struct mempolicy objects from. */
 static struct kmem_cache *policy_cache;
 /* NOTE(review): not used in this part of the file; presumably backs the
    shared-policy tree nodes -- confirm against the rest of the file. */
 static struct kmem_cache *sn_cache;

 /* Highest zone. A specific allocation for a zone below that is not
    policied. */
 enum zone_type policy_zone = 0;

 /* Fallback used when neither the task nor the VMA carries a policy
    (see do_get_mempolicy()). */
 struct mempolicy default_policy = {
         .refcnt = ATOMIC_INIT(1), /* never free it */
         .policy = MPOL_DEFAULT,
 };
 112
 113/* Do sanity checking on a policy */
 114static int mpol_check_policy(int mode, nodemask_t *nodes)
 115{
 116        int empty = nodes_empty(*nodes);
 117
 118        switch (mode) {
 119        case MPOL_DEFAULT:
 120                if (!empty)
 121                        return -EINVAL;
 122                break;
 123        case MPOL_BIND:
 124        case MPOL_INTERLEAVE:
 125                /* Preferred will only use the first bit, but allow
 126                   more for now. */
 127                if (empty)
 128                        return -EINVAL;
 129                break;
 130        }
 131        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 132}
 133
 134/* Generate a custom zonelist for the BIND policy. */
 135static struct zonelist *bind_zonelist(nodemask_t *nodes)
 136{
 137        struct zonelist *zl;
 138        int num, max, nd;
 139        enum zone_type k;
 140
 141        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 142        max++;                  /* space for zlcache_ptr (see mmzone.h) */
 143        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 144        if (!zl)
 145                return ERR_PTR(-ENOMEM);
 146        zl->zlcache_ptr = NULL;
 147        num = 0;
 148        /* First put in the highest zones from all nodes, then all the next 
 149           lower zones etc. Avoid empty zones because the memory allocator
 150           doesn't like them. If you implement node hot removal you
 151           have to fix that. */
 152        k = MAX_NR_ZONES - 1;
 153        while (1) {
 154                for_each_node_mask(nd, *nodes) { 
 155                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
 156                        if (z->present_pages > 0) 
 157                                zl->zones[num++] = z;
 158                }
 159                if (k == 0)
 160                        break;
 161                k--;
 162        }
 163        if (num == 0) {
 164                kfree(zl);
 165                return ERR_PTR(-EINVAL);
 166        }
 167        zl->zones[num] = NULL;
 168        return zl;
 169}
 170
 171/* Create a new policy */
 172static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 173{
 174        struct mempolicy *policy;
 175
 176        pr_debug("setting mode %d nodes[0] %lx\n",
 177                 mode, nodes ? nodes_addr(*nodes)[0] : -1);
 178
 179        if (mode == MPOL_DEFAULT)
 180                return NULL;
 181        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 182        if (!policy)
 183                return ERR_PTR(-ENOMEM);
 184        atomic_set(&policy->refcnt, 1);
 185        switch (mode) {
 186        case MPOL_INTERLEAVE:
 187                policy->v.nodes = *nodes;
 188                if (nodes_weight(*nodes) == 0) {
 189                        kmem_cache_free(policy_cache, policy);
 190                        return ERR_PTR(-EINVAL);
 191                }
 192                break;
 193        case MPOL_PREFERRED:
 194                policy->v.preferred_node = first_node(*nodes);
 195                if (policy->v.preferred_node >= MAX_NUMNODES)
 196                        policy->v.preferred_node = -1;
 197                break;
 198        case MPOL_BIND:
 199                policy->v.zonelist = bind_zonelist(nodes);
 200                if (IS_ERR(policy->v.zonelist)) {
 201                        void *error_code = policy->v.zonelist;
 202                        kmem_cache_free(policy_cache, policy);
 203                        return error_code;
 204                }
 205                break;
 206        }
 207        policy->policy = mode;
 208        policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 209        return policy;
 210}
 211
 212static void gather_stats(struct page *, void *, int pte_dirty);
 213static void migrate_page_add(struct page *page, struct list_head *pagelist,
 214                                unsigned long flags);
 215
 216/* Scan through pages checking if pages follow certain conditions. */
 217static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 218                unsigned long addr, unsigned long end,
 219                const nodemask_t *nodes, unsigned long flags,
 220                void *private)
 221{
 222        pte_t *orig_pte;
 223        pte_t *pte;
 224        spinlock_t *ptl;
 225
 226        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 227        do {
 228                struct page *page;
 229                int nid;
 230
 231                if (!pte_present(*pte))
 232                        continue;
 233                page = vm_normal_page(vma, addr, *pte);
 234                if (!page)
 235                        continue;
 236                /*
 237                 * The check for PageReserved here is important to avoid
 238                 * handling zero pages and other pages that may have been
 239                 * marked special by the system.
 240                 *
 241                 * If the PageReserved would not be checked here then f.e.
 242                 * the location of the zero page could have an influence
 243                 * on MPOL_MF_STRICT, zero pages would be counted for
 244                 * the per node stats, and there would be useless attempts
 245                 * to put zero pages on the migration list.
 246                 */
 247                if (PageReserved(page))
 248                        continue;
 249                nid = page_to_nid(page);
 250                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 251                        continue;
 252
 253                if (flags & MPOL_MF_STATS)
 254                        gather_stats(page, private, pte_dirty(*pte));
 255                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 256                        migrate_page_add(page, private, flags);
 257                else
 258                        break;
 259        } while (pte++, addr += PAGE_SIZE, addr != end);
 260        pte_unmap_unlock(orig_pte, ptl);
 261        return addr != end;
 262}
 263
 264static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 265                unsigned long addr, unsigned long end,
 266                const nodemask_t *nodes, unsigned long flags,
 267                void *private)
 268{
 269        pmd_t *pmd;
 270        unsigned long next;
 271
 272        pmd = pmd_offset(pud, addr);
 273        do {
 274                next = pmd_addr_end(addr, end);
 275                if (pmd_none_or_clear_bad(pmd))
 276                        continue;
 277                if (check_pte_range(vma, pmd, addr, next, nodes,
 278                                    flags, private))
 279                        return -EIO;
 280        } while (pmd++, addr = next, addr != end);
 281        return 0;
 282}
 283
 284static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 285                unsigned long addr, unsigned long end,
 286                const nodemask_t *nodes, unsigned long flags,
 287                void *private)
 288{
 289        pud_t *pud;
 290        unsigned long next;
 291
 292        pud = pud_offset(pgd, addr);
 293        do {
 294                next = pud_addr_end(addr, end);
 295                if (pud_none_or_clear_bad(pud))
 296                        continue;
 297                if (check_pmd_range(vma, pud, addr, next, nodes,
 298                                    flags, private))
 299                        return -EIO;
 300        } while (pud++, addr = next, addr != end);
 301        return 0;
 302}
 303
 304static inline int check_pgd_range(struct vm_area_struct *vma,
 305                unsigned long addr, unsigned long end,
 306                const nodemask_t *nodes, unsigned long flags,
 307                void *private)
 308{
 309        pgd_t *pgd;
 310        unsigned long next;
 311
 312        pgd = pgd_offset(vma->vm_mm, addr);
 313        do {
 314                next = pgd_addr_end(addr, end);
 315                if (pgd_none_or_clear_bad(pgd))
 316                        continue;
 317                if (check_pud_range(vma, pgd, addr, next, nodes,
 318                                    flags, private))
 319                        return -EIO;
 320        } while (pgd++, addr = next, addr != end);
 321        return 0;
 322}
 323
 324/*
 325 * Check if all pages in a range are on a set of nodes.
 326 * If pagelist != NULL then isolate pages from the LRU and
 327 * put them on the pagelist.
 328 */
 329static struct vm_area_struct *
 330check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 331                const nodemask_t *nodes, unsigned long flags, void *private)
 332{
 333        int err;
 334        struct vm_area_struct *first, *vma, *prev;
 335
 336        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 337
 338                err = migrate_prep();
 339                if (err)
 340                        return ERR_PTR(err);
 341        }
 342
 343        first = find_vma(mm, start);
 344        if (!first)
 345                return ERR_PTR(-EFAULT);
 346        prev = NULL;
 347        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 348                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 349                        if (!vma->vm_next && vma->vm_end < end)
 350                                return ERR_PTR(-EFAULT);
 351                        if (prev && prev->vm_end < vma->vm_start)
 352                                return ERR_PTR(-EFAULT);
 353                }
 354                if (!is_vm_hugetlb_page(vma) &&
 355                    ((flags & MPOL_MF_STRICT) ||
 356                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 357                                vma_migratable(vma)))) {
 358                        unsigned long endvma = vma->vm_end;
 359
 360                        if (endvma > end)
 361                                endvma = end;
 362                        if (vma->vm_start > start)
 363                                start = vma->vm_start;
 364                        err = check_pgd_range(vma, start, endvma, nodes,
 365                                                flags, private);
 366                        if (err) {
 367                                first = ERR_PTR(err);
 368                                break;
 369                        }
 370                }
 371                prev = vma;
 372        }
 373        return first;
 374}
 375
 376/* Apply policy to a single VMA */
 377static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 378{
 379        int err = 0;
 380        struct mempolicy *old = vma->vm_policy;
 381
 382        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 383                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 384                 vma->vm_ops, vma->vm_file,
 385                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 386
 387        if (vma->vm_ops && vma->vm_ops->set_policy)
 388                err = vma->vm_ops->set_policy(vma, new);
 389        if (!err) {
 390                mpol_get(new);
 391                vma->vm_policy = new;
 392                mpol_free(old);
 393        }
 394        return err;
 395}
 396
 397/* Step 2: apply policy to a range and do splits. */
 398static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 399                       unsigned long end, struct mempolicy *new)
 400{
 401        struct vm_area_struct *next;
 402        int err;
 403
 404        err = 0;
 405        for (; vma && vma->vm_start < end; vma = next) {
 406                next = vma->vm_next;
 407                if (vma->vm_start < start)
 408                        err = split_vma(vma->vm_mm, vma, start, 1);
 409                if (!err && vma->vm_end > end)
 410                        err = split_vma(vma->vm_mm, vma, end, 0);
 411                if (!err)
 412                        err = policy_vma(vma, new);
 413                if (err)
 414                        break;
 415        }
 416        return err;
 417}
 418
 419static int contextualize_policy(int mode, nodemask_t *nodes)
 420{
 421        if (!nodes)
 422                return 0;
 423
 424        cpuset_update_task_memory_state();
 425        if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 426                return -EINVAL;
 427        return mpol_check_policy(mode, nodes);
 428}
 429
 430
 431/*
 432 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 433 * mempolicy.  Allows more rapid checking of this (combined perhaps
 434 * with other PF_* flag bits) on memory allocation hot code paths.
 435 *
 436 * If called from outside this file, the task 'p' should -only- be
 437 * a newly forked child not yet visible on the task list, because
 438 * manipulating the task flags of a visible task is not safe.
 439 *
 440 * The above limitation is why this routine has the funny name
 441 * mpol_fix_fork_child_flag().
 442 *
 443 * It is also safe to call this with a task pointer of current,
 444 * which the static wrapper mpol_set_task_struct_flag() does,
 445 * for use within this file.
 446 */
 447
 448void mpol_fix_fork_child_flag(struct task_struct *p)
 449{
 450        if (p->mempolicy)
 451                p->flags |= PF_MEMPOLICY;
 452        else
 453                p->flags &= ~PF_MEMPOLICY;
 454}
 455
 456static void mpol_set_task_struct_flag(void)
 457{
 458        mpol_fix_fork_child_flag(current);
 459}
 460
 461/* Set the process memory policy */
 462long do_set_mempolicy(int mode, nodemask_t *nodes)
 463{
 464        struct mempolicy *new;
 465
 466        if (contextualize_policy(mode, nodes))
 467                return -EINVAL;
 468        new = mpol_new(mode, nodes);
 469        if (IS_ERR(new))
 470                return PTR_ERR(new);
 471        mpol_free(current->mempolicy);
 472        current->mempolicy = new;
 473        mpol_set_task_struct_flag();
 474        if (new && new->policy == MPOL_INTERLEAVE)
 475                current->il_next = first_node(new->v.nodes);
 476        return 0;
 477}
 478
 479/* Fill a zone bitmap for a policy */
 480static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 481{
 482        int i;
 483
 484        nodes_clear(*nodes);
 485        switch (p->policy) {
 486        case MPOL_BIND:
 487                for (i = 0; p->v.zonelist->zones[i]; i++)
 488                        node_set(zone_to_nid(p->v.zonelist->zones[i]),
 489                                *nodes);
 490                break;
 491        case MPOL_DEFAULT:
 492                break;
 493        case MPOL_INTERLEAVE:
 494                *nodes = p->v.nodes;
 495                break;
 496        case MPOL_PREFERRED:
 497                /* or use current node instead of online map? */
 498                if (p->v.preferred_node < 0)
 499                        *nodes = node_online_map;
 500                else
 501                        node_set(p->v.preferred_node, *nodes);
 502                break;
 503        default:
 504                BUG();
 505        }
 506}
 507
 508static int lookup_node(struct mm_struct *mm, unsigned long addr)
 509{
 510        struct page *p;
 511        int err;
 512
 513        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 514        if (err >= 0) {
 515                err = page_to_nid(p);
 516                put_page(p);
 517        }
 518        return err;
 519}
 520
 521/* Retrieve NUMA policy */
 522long do_get_mempolicy(int *policy, nodemask_t *nmask,
 523                        unsigned long addr, unsigned long flags)
 524{
 525        int err;
 526        struct mm_struct *mm = current->mm;
 527        struct vm_area_struct *vma = NULL;
 528        struct mempolicy *pol = current->mempolicy;
 529
 530        cpuset_update_task_memory_state();
 531        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 532                return -EINVAL;
 533        if (flags & MPOL_F_ADDR) {
 534                down_read(&mm->mmap_sem);
 535                vma = find_vma_intersection(mm, addr, addr+1);
 536                if (!vma) {
 537                        up_read(&mm->mmap_sem);
 538                        return -EFAULT;
 539                }
 540                if (vma->vm_ops && vma->vm_ops->get_policy)
 541                        pol = vma->vm_ops->get_policy(vma, addr);
 542                else
 543                        pol = vma->vm_policy;
 544        } else if (addr)
 545                return -EINVAL;
 546
 547        if (!pol)
 548                pol = &default_policy;
 549
 550        if (flags & MPOL_F_NODE) {
 551                if (flags & MPOL_F_ADDR) {
 552                        err = lookup_node(mm, addr);
 553                        if (err < 0)
 554                                goto out;
 555                        *policy = err;
 556                } else if (pol == current->mempolicy &&
 557                                pol->policy == MPOL_INTERLEAVE) {
 558                        *policy = current->il_next;
 559                } else {
 560                        err = -EINVAL;
 561                        goto out;
 562                }
 563        } else
 564                *policy = pol->policy;
 565
 566        if (vma) {
 567                up_read(&current->mm->mmap_sem);
 568                vma = NULL;
 569        }
 570
 571        err = 0;
 572        if (nmask)
 573                get_zonemask(pol, nmask);
 574
 575 out:
 576        if (vma)
 577                up_read(&current->mm->mmap_sem);
 578        return err;
 579}
 580
 581#ifdef CONFIG_MIGRATION
 582/*
 583 * page migration
 584 */
 585static void migrate_page_add(struct page *page, struct list_head *pagelist,
 586                                unsigned long flags)
 587{
 588        /*
 589         * Avoid migrating a page that is shared with others.
 590         */
 591        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 592                isolate_lru_page(page, pagelist);
 593}
 594
 595static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 596{
 597        return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 598}
 599
 600/*
 601 * Migrate pages from one node to a target node.
 602 * Returns error or the number of pages not migrated.
 603 */
 604int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 605{
 606        nodemask_t nmask;
 607        LIST_HEAD(pagelist);
 608        int err = 0;
 609
 610        nodes_clear(nmask);
 611        node_set(source, nmask);
 612
 613        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 614                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 615
 616        if (!list_empty(&pagelist))
 617                err = migrate_pages(&pagelist, new_node_page, dest);
 618
 619        return err;
 620}
 621
 622/*
 623 * Move pages between the two nodesets so as to preserve the physical
 624 * layout as much as possible.
 625 *
 626 * Returns the number of page that could not be moved.
 627 */
 628int do_migrate_pages(struct mm_struct *mm,
 629        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 630{
 631        LIST_HEAD(pagelist);
 632        int busy = 0;
 633        int err = 0;
 634        nodemask_t tmp;
 635
 636        down_read(&mm->mmap_sem);
 637
 638        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 639        if (err)
 640                goto out;
 641
 642/*
 643 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 644 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 645 * bit in 'tmp', and return that <source, dest> pair for migration.
 646 * The pair of nodemasks 'to' and 'from' define the map.
 647 *
 648 * If no pair of bits is found that way, fallback to picking some
 649 * pair of 'source' and 'dest' bits that are not the same.  If the
 650 * 'source' and 'dest' bits are the same, this represents a node
 651 * that will be migrating to itself, so no pages need move.
 652 *
 653 * If no bits are left in 'tmp', or if all remaining bits left
 654 * in 'tmp' correspond to the same bit in 'to', return false
 655 * (nothing left to migrate).
 656 *
 657 * This lets us pick a pair of nodes to migrate between, such that
 658 * if possible the dest node is not already occupied by some other
 659 * source node, minimizing the risk of overloading the memory on a
 660 * node that would happen if we migrated incoming memory to a node
 661 * before migrating outgoing memory source that same node.
 662 *
 663 * A single scan of tmp is sufficient.  As we go, we remember the
 664 * most recent <s, d> pair that moved (s != d).  If we find a pair
 665 * that not only moved, but what's better, moved to an empty slot
 666 * (d is not set in tmp), then we break out then, with that pair.
 667 * Otherwise when we finish scannng from_tmp, we at least have the
 668 * most recent <s, d> pair that moved.  If we get all the way through
 669 * the scan of tmp without finding any node that moved, much less
 670 * moved to an empty node, then there is nothing left worth migrating.
 671 */
 672
 673        tmp = *from_nodes;
 674        while (!nodes_empty(tmp)) {
 675                int s,d;
 676                int source = -1;
 677                int dest = 0;
 678
 679                for_each_node_mask(s, tmp) {
 680                        d = node_remap(s, *from_nodes, *to_nodes);
 681                        if (s == d)
 682                                continue;
 683
 684                        source = s;     /* Node moved. Memorize */
 685                        dest = d;
 686
 687                        /* dest not in remaining from nodes? */
 688                        if (!node_isset(dest, tmp))
 689                                break;
 690                }
 691                if (source == -1)
 692                        break;
 693
 694                node_clear(source, tmp);
 695                err = migrate_to_node(mm, source, dest, flags);
 696                if (err > 0)
 697                        busy += err;
 698                if (err < 0)
 699                        break;
 700        }
 701out:
 702        up_read(&mm->mmap_sem);
 703        if (err < 0)
 704                return err;
 705        return busy;
 706
 707}
 708
 709static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 710{
 711        struct vm_area_struct *vma = (struct vm_area_struct *)private;
 712
 713        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
 714                                        page_address_in_vma(page, vma));
 715}
 716#else
 717
 /*
  * Stubs used when CONFIG_MIGRATION is not set: no page is ever queued
  * for migration, so the callers in this file need no #ifdefs.
  */

 /* Without migration support nothing is queued. */
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                 unsigned long flags)
 {
 }

 /* Page migration is not available: always fails with -ENOSYS. */
 int do_migrate_pages(struct mm_struct *mm,
         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 {
         return -ENOSYS;
 }

 /* Placeholder allocation callback; never provides a page. */
 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 {
         return NULL;
 }
 733#endif
 734
 735long do_mbind(unsigned long start, unsigned long len,
 736                unsigned long mode, nodemask_t *nmask, unsigned long flags)
 737{
 738        struct vm_area_struct *vma;
 739        struct mm_struct *mm = current->mm;
 740        struct mempolicy *new;
 741        unsigned long end;
 742        int err;
 743        LIST_HEAD(pagelist);
 744
 745        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 746                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 747            || mode > MPOL_MAX)
 748                return -EINVAL;
 749        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 750                return -EPERM;
 751
 752        if (start & ~PAGE_MASK)
 753                return -EINVAL;
 754
 755        if (mode == MPOL_DEFAULT)
 756                flags &= ~MPOL_MF_STRICT;
 757
 758        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 759        end = start + len;
 760
 761        if (end < start)
 762                return -EINVAL;
 763        if (end == start)
 764                return 0;
 765
 766        if (mpol_check_policy(mode, nmask))
 767                return -EINVAL;
 768
 769        new = mpol_new(mode, nmask);
 770        if (IS_ERR(new))
 771                return PTR_ERR(new);
 772
 773        /*
 774         * If we are using the default policy then operation
 775         * on discontinuous address spaces is okay after all
 776         */
 777        if (!new)
 778                flags |= MPOL_MF_DISCONTIG_OK;
 779
 780        pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 781                 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 782
 783        down_write(&mm->mmap_sem);
 784        vma = check_range(mm, start, end, nmask,
 785                          flags | MPOL_MF_INVERT, &pagelist);
 786
 787        err = PTR_ERR(vma);
 788        if (!IS_ERR(vma)) {
 789                int nr_failed = 0;
 790
 791                err = mbind_range(vma, start, end, new);
 792
 793                if (!list_empty(&pagelist))
 794                        nr_failed = migrate_pages(&pagelist, new_vma_page,
 795                                                (unsigned long)vma);
 796
 797                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 798                        err = -EIO;
 799        }
 800
 801        up_write(&mm->mmap_sem);
 802        mpol_free(new);
 803        return err;
 804}
 805
 806/*
 807 * User space interface with variable sized bitmaps for nodelists.
 808 */
 809
 810/* Copy a node mask from user space. */
 811static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 812                     unsigned long maxnode)
 813{
 814        unsigned long k;
 815        unsigned long nlongs;
 816        unsigned long endmask;
 817
 818        --maxnode;
 819        nodes_clear(*nodes);
 820        if (maxnode == 0 || !nmask)
 821                return 0;
 822        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 823                return -EINVAL;
 824
 825        nlongs = BITS_TO_LONGS(maxnode);
 826        if ((maxnode % BITS_PER_LONG) == 0)
 827                endmask = ~0UL;
 828        else
 829                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 830
 831        /* When the user specified more nodes than supported just check
 832           if the non supported part is all zero. */
 833        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 834                if (nlongs > PAGE_SIZE/sizeof(long))
 835                        return -EINVAL;
 836                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 837                        unsigned long t;
 838                        if (get_user(t, nmask + k))
 839                                return -EFAULT;
 840                        if (k == nlongs - 1) {
 841                                if (t & endmask)
 842                                        return -EINVAL;
 843                        } else if (t)
 844                                return -EINVAL;
 845                }
 846                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 847                endmask = ~0UL;
 848        }
 849
 850        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 851                return -EFAULT;
 852        nodes_addr(*nodes)[nlongs-1] &= endmask;
 853        return 0;
 854}
 855
 856/* Copy a kernel node mask to user space */
 857static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 858                              nodemask_t *nodes)
 859{
 860        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 861        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 862
 863        if (copy > nbytes) {
 864                if (copy > PAGE_SIZE)
 865                        return -EINVAL;
 866                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 867                        return -EFAULT;
 868                copy = nbytes;
 869        }
 870        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 871}
 872
 873asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 874                        unsigned long mode,
 875                        unsigned long __user *nmask, unsigned long maxnode,
 876                        unsigned flags)
 877{
 878        nodemask_t nodes;
 879        int err;
 880
 881        err = get_nodes(&nodes, nmask, maxnode);
 882        if (err)
 883                return err;
 884#ifdef CONFIG_CPUSETS
 885        /* Restrict the nodes to the allowed nodes in the cpuset */
 886        nodes_and(nodes, nodes, current->mems_allowed);
 887#endif
 888        return do_mbind(start, len, mode, &nodes, flags);
 889}
 890
 891/* Set the process memory policy */
 892asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 893                unsigned long maxnode)
 894{
 895        int err;
 896        nodemask_t nodes;
 897
 898        if (mode < 0 || mode > MPOL_MAX)
 899                return -EINVAL;
 900        err = get_nodes(&nodes, nmask, maxnode);
 901        if (err)
 902                return err;
 903        return do_set_mempolicy(mode, &nodes);
 904}
 905
 906asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 907                const unsigned long __user *old_nodes,
 908                const unsigned long __user *new_nodes)
 909{
 910        struct mm_struct *mm;
 911        struct task_struct *task;
 912        nodemask_t old;
 913        nodemask_t new;
 914        nodemask_t task_nodes;
 915        int err;
 916
 917        err = get_nodes(&old, old_nodes, maxnode);
 918        if (err)
 919                return err;
 920
 921        err = get_nodes(&new, new_nodes, maxnode);
 922        if (err)
 923                return err;
 924
 925        /* Find the mm_struct */
 926        read_lock(&tasklist_lock);
 927        task = pid ? find_task_by_pid(pid) : current;
 928        if (!task) {
 929                read_unlock(&tasklist_lock);
 930                return -ESRCH;
 931        }
 932        mm = get_task_mm(task);
 933        read_unlock(&tasklist_lock);
 934
 935        if (!mm)
 936                return -EINVAL;
 937
 938        /*
 939         * Check if this process has the right to modify the specified
 940         * process. The right exists if the process has administrative
 941         * capabilities, superuser privileges or the same
 942         * userid as the target process.
 943         */
 944        if ((current->euid != task->suid) && (current->euid != task->uid) &&
 945            (current->uid != task->suid) && (current->uid != task->uid) &&
 946            !capable(CAP_SYS_NICE)) {
 947                err = -EPERM;
 948                goto out;
 949        }
 950
 951        task_nodes = cpuset_mems_allowed(task);
 952        /* Is the user allowed to access the target nodes? */
 953        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 954                err = -EPERM;
 955                goto out;
 956        }
 957
 958        if (!nodes_subset(new, node_online_map)) {
 959                err = -EINVAL;
 960                goto out;
 961        }
 962
 963        err = security_task_movememory(task);
 964        if (err)
 965                goto out;
 966
 967        err = do_migrate_pages(mm, &old, &new,
 968                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 969out:
 970        mmput(mm);
 971        return err;
 972}
 973
 974
 975/* Retrieve NUMA policy */
 976asmlinkage long sys_get_mempolicy(int __user *policy,
 977                                unsigned long __user *nmask,
 978                                unsigned long maxnode,
 979                                unsigned long addr, unsigned long flags)
 980{
 981        int err, pval;
 982        nodemask_t nodes;
 983
 984        if (nmask != NULL && maxnode < MAX_NUMNODES)
 985                return -EINVAL;
 986
 987        err = do_get_mempolicy(&pval, &nodes, addr, flags);
 988
 989        if (err)
 990                return err;
 991
 992        if (policy && put_user(pval, policy))
 993                return -EFAULT;
 994
 995        if (nmask)
 996                err = copy_nodes_to_user(nmask, maxnode, &nodes);
 997
 998        return err;
 999}
1000
1001#ifdef CONFIG_COMPAT
1002
1003asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1004                                     compat_ulong_t __user *nmask,
1005                                     compat_ulong_t maxnode,
1006                                     compat_ulong_t addr, compat_ulong_t flags)
1007{
1008        long err;
1009        unsigned long __user *nm = NULL;
1010        unsigned long nr_bits, alloc_size;
1011        DECLARE_BITMAP(bm, MAX_NUMNODES);
1012
1013        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1014        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1015
1016        if (nmask)
1017                nm = compat_alloc_user_space(alloc_size);
1018
1019        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1020
1021        if (!err && nmask) {
1022                err = copy_from_user(bm, nm, alloc_size);
1023                /* ensure entire bitmap is zeroed */
1024                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1025                err |= compat_put_bitmap(nmask, bm, nr_bits);
1026        }
1027
1028        return err;
1029}
1030
1031asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1032                                     compat_ulong_t maxnode)
1033{
1034        long err = 0;
1035        unsigned long __user *nm = NULL;
1036        unsigned long nr_bits, alloc_size;
1037        DECLARE_BITMAP(bm, MAX_NUMNODES);
1038
1039        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1040        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1041
1042        if (nmask) {
1043                err = compat_get_bitmap(bm, nmask, nr_bits);
1044                nm = compat_alloc_user_space(alloc_size);
1045                err |= copy_to_user(nm, bm, alloc_size);
1046        }
1047
1048        if (err)
1049                return -EFAULT;
1050
1051        return sys_set_mempolicy(mode, nm, nr_bits+1);
1052}
1053
1054asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1055                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1056                             compat_ulong_t maxnode, compat_ulong_t flags)
1057{
1058        long err = 0;
1059        unsigned long __user *nm = NULL;
1060        unsigned long nr_bits, alloc_size;
1061        nodemask_t bm;
1062
1063        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1064        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1065
1066        if (nmask) {
1067                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1068                nm = compat_alloc_user_space(alloc_size);
1069                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1070        }
1071
1072        if (err)
1073                return -EFAULT;
1074
1075        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1076}
1077
1078#endif
1079
1080/*
1081 * get_vma_policy(@task, @vma, @addr)
1082 * @task - task for fallback if vma policy == default
1083 * @vma   - virtual memory area whose policy is sought
1084 * @addr  - address in @vma for shared policy lookup
1085 *
1086 * Returns effective policy for a VMA at specified address.
1087 * Falls back to @task or system default policy, as necessary.
1088 * Returned policy has extra reference count if shared, vma,
1089 * or some other task's policy [show_numa_maps() can pass
1090 * @task != current].  It is the caller's responsibility to
1091 * free the reference in these cases.
1092 */
1093static struct mempolicy * get_vma_policy(struct task_struct *task,
1094                struct vm_area_struct *vma, unsigned long addr)
1095{
1096        struct mempolicy *pol = task->mempolicy;
1097        int shared_pol = 0;
1098
1099        if (vma) {
1100                if (vma->vm_ops && vma->vm_ops->get_policy) {
1101                        pol = vma->vm_ops->get_policy(vma, addr);
1102                        shared_pol = 1; /* if pol non-NULL, add ref below */
1103                } else if (vma->vm_policy &&
1104                                vma->vm_policy->policy != MPOL_DEFAULT)
1105                        pol = vma->vm_policy;
1106        }
1107        if (!pol)
1108                pol = &default_policy;
1109        else if (!shared_pol && pol != current->mempolicy)
1110                mpol_get(pol);  /* vma or other task's policy */
1111        return pol;
1112}
1113
1114/* Return a zonelist representing a mempolicy */
1115static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1116{
1117        int nd;
1118
1119        switch (policy->policy) {
1120        case MPOL_PREFERRED:
1121                nd = policy->v.preferred_node;
1122                if (nd < 0)
1123                        nd = numa_node_id();
1124                break;
1125        case MPOL_BIND:
1126                /* Lower zones don't get a policy applied */
1127                /* Careful: current->mems_allowed might have moved */
1128                if (gfp_zone(gfp) >= policy_zone)
1129                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1130                                return policy->v.zonelist;
1131                /*FALL THROUGH*/
1132        case MPOL_INTERLEAVE: /* should not happen */
1133        case MPOL_DEFAULT:
1134                nd = numa_node_id();
1135                break;
1136        default:
1137                nd = 0;
1138                BUG();
1139        }
1140        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1141}
1142
1143/* Do dynamic interleaving for a process */
1144static unsigned interleave_nodes(struct mempolicy *policy)
1145{
1146        unsigned nid, next;
1147        struct task_struct *me = current;
1148
1149        nid = me->il_next;
1150        next = next_node(nid, policy->v.nodes);
1151        if (next >= MAX_NUMNODES)
1152                next = first_node(policy->v.nodes);
1153        me->il_next = next;
1154        return nid;
1155}
1156
1157/*
1158 * Depending on the memory policy provide a node from which to allocate the
1159 * next slab entry.
1160 */
1161unsigned slab_node(struct mempolicy *policy)
1162{
1163        int pol = policy ? policy->policy : MPOL_DEFAULT;
1164
1165        switch (pol) {
1166        case MPOL_INTERLEAVE:
1167                return interleave_nodes(policy);
1168
1169        case MPOL_BIND:
1170                /*
1171                 * Follow bind policy behavior and start allocation at the
1172                 * first node.
1173                 */
1174                return zone_to_nid(policy->v.zonelist->zones[0]);
1175
1176        case MPOL_PREFERRED:
1177                if (policy->v.preferred_node >= 0)
1178                        return policy->v.preferred_node;
1179                /* Fall through */
1180
1181        default:
1182                return numa_node_id();
1183        }
1184}
1185
1186/* Do static interleaving for a VMA with known offset. */
1187static unsigned offset_il_node(struct mempolicy *pol,
1188                struct vm_area_struct *vma, unsigned long off)
1189{
1190        unsigned nnodes = nodes_weight(pol->v.nodes);
1191        unsigned target = (unsigned)off % nnodes;
1192        int c;
1193        int nid = -1;
1194
1195        c = 0;
1196        do {
1197                nid = next_node(nid, pol->v.nodes);
1198                c++;
1199        } while (c <= target);
1200        return nid;
1201}
1202
1203/* Determine a node number for interleave */
1204static inline unsigned interleave_nid(struct mempolicy *pol,
1205                 struct vm_area_struct *vma, unsigned long addr, int shift)
1206{
1207        if (vma) {
1208                unsigned long off;
1209
1210                /*
1211                 * for small pages, there is no difference between
1212                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1213                 * for huge pages, since vm_pgoff is in units of small
1214                 * pages, we need to shift off the always 0 bits to get
1215                 * a useful offset.
1216                 */
1217                BUG_ON(shift < PAGE_SHIFT);
1218                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1219                off += (addr - vma->vm_start) >> shift;
1220                return offset_il_node(pol, vma, off);
1221        } else
1222                return interleave_nodes(pol);
1223}
1224
1225#ifdef CONFIG_HUGETLBFS
1226/*
1227 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1228 * @vma = virtual memory area whose policy is sought
1229 * @addr = address in @vma for shared policy lookup and interleave policy
1230 * @gfp_flags = for requested zone
1231 * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1232 *
1233 * Returns a zonelist suitable for a huge page allocation.
1234 * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1235 * If it is also a policy for which get_vma_policy() returns an extra
1236 * reference, we must hold that reference until after allocation.
1237 * In that case, return policy via @mpol so hugetlb allocation can drop
1238 * the reference.  For non-'BIND referenced policies, we can/do drop the
1239 * reference here, so the caller doesn't need to know about the special case
1240 * for default and current task policy.
1241 */
1242struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1243                                gfp_t gfp_flags, struct mempolicy **mpol)
1244{
1245        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1246        struct zonelist *zl;
1247
1248        *mpol = NULL;           /* probably no unref needed */
1249        if (pol->policy == MPOL_INTERLEAVE) {
1250                unsigned nid;
1251
1252                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1253                __mpol_free(pol);               /* finished with pol */
1254                return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1255        }
1256
1257        zl = zonelist_policy(GFP_HIGHUSER, pol);
1258        if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1259                if (pol->policy != MPOL_BIND)
1260                        __mpol_free(pol);       /* finished with pol */
1261                else
1262                        *mpol = pol;    /* unref needed after allocation */
1263        }
1264        return zl;
1265}
1266#endif
1267
1268/* Allocate a page in interleaved policy.
1269   Own path because it needs to do special accounting. */
1270static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1271                                        unsigned nid)
1272{
1273        struct zonelist *zl;
1274        struct page *page;
1275
1276        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1277        page = __alloc_pages(gfp, order, zl);
1278        if (page && page_zone(page) == zl->zones[0])
1279                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1280        return page;
1281}
1282
1283/**
1284 *      alloc_page_vma  - Allocate a page for a VMA.
1285 *
1286 *      @gfp:
1287 *      %GFP_USER    user allocation.
1288 *      %GFP_KERNEL  kernel allocations,
1289 *      %GFP_HIGHMEM highmem/user allocations,
1290 *      %GFP_FS      allocation should not call back into a file system.
1291 *      %GFP_ATOMIC  don't sleep.
1292 *
1293 *      @vma:  Pointer to VMA or NULL if not available.
1294 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1295 *
1296 *      This function allocates a page from the kernel page pool and applies
1297 *      a NUMA policy associated with the VMA or the current process.
1298 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1299 *      mm_struct of the VMA to prevent it from going away. Should be used for
1300 *      all allocations for pages that will be mapped into
1301 *      user space. Returns NULL when no page can be allocated.
1302 *
1303 *      Should be called with the mm_sem of the vma hold.
1304 */
1305struct page *
1306alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1307{
1308        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1309        struct zonelist *zl;
1310
1311        cpuset_update_task_memory_state();
1312
1313        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1314                unsigned nid;
1315
1316                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1317                return alloc_page_interleave(gfp, 0, nid);
1318        }
1319        zl = zonelist_policy(gfp, pol);
1320        if (pol != &default_policy && pol != current->mempolicy) {
1321                /*
1322                 * slow path: ref counted policy -- shared or vma
1323                 */
1324                struct page *page =  __alloc_pages(gfp, 0, zl);
1325                __mpol_free(pol);
1326                return page;
1327        }
1328        /*
1329         * fast path:  default or task policy
1330         */
1331        return __alloc_pages(gfp, 0, zl);
1332}
1333
1334/**
1335 *      alloc_pages_current - Allocate pages.
1336 *
1337 *      @gfp:
1338 *              %GFP_USER   user allocation,
1339 *              %GFP_KERNEL kernel allocation,
1340 *              %GFP_HIGHMEM highmem allocation,
1341 *              %GFP_FS     don't call back into a file system.
1342 *              %GFP_ATOMIC don't sleep.
1343 *      @order: Power of two of allocation size in pages. 0 is a single page.
1344 *
1345 *      Allocate a page from the kernel page pool.  When not in
1346 *      interrupt context and apply the current process NUMA policy.
1347 *      Returns NULL when no page can be allocated.
1348 *
1349 *      Don't call cpuset_update_task_memory_state() unless
1350 *      1) it's ok to take cpuset_sem (can WAIT), and
1351 *      2) allocating for current task (not interrupt).
1352 */
1353struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1354{
1355        struct mempolicy *pol = current->mempolicy;
1356
1357        if ((gfp & __GFP_WAIT) && !in_interrupt())
1358                cpuset_update_task_memory_state();
1359        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1360                pol = &default_policy;
1361        if (pol->policy == MPOL_INTERLEAVE)
1362                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1363        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1364}
1365EXPORT_SYMBOL(alloc_pages_current);
1366
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
void *cpuset_being_rebound;
1375
1376/* Slow path of a mempolicy copy */
1377struct mempolicy *__mpol_copy(struct mempolicy *old)
1378{
1379        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1380
1381        if (!new)
1382                return ERR_PTR(-ENOMEM);
1383        if (current_cpuset_is_being_rebound()) {
1384                nodemask_t mems = cpuset_mems_allowed(current);
1385                mpol_rebind_policy(old, &mems);
1386        }
1387        *new = *old;
1388        atomic_set(&new->refcnt, 1);
1389        if (new->policy == MPOL_BIND) {
1390                int sz = ksize(old->v.zonelist);
1391                new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1392                if (!new->v.zonelist) {
1393                        kmem_cache_free(policy_cache, new);
1394                        return ERR_PTR(-ENOMEM);
1395                }
1396        }
1397        return new;
1398}
1399
1400/* Slow path of a mempolicy comparison */
1401int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1402{
1403        if (!a || !b)
1404                return 0;
1405        if (a->policy != b->policy)
1406                return 0;
1407        switch (a->policy) {
1408        case MPOL_DEFAULT:
1409                return 1;
1410        case MPOL_INTERLEAVE:
1411                return nodes_equal(a->v.nodes, b->v.nodes);
1412        case MPOL_PREFERRED:
1413                return a->v.preferred_node == b->v.preferred_node;
1414        case MPOL_BIND: {
1415                int i;
1416                for (i = 0; a->v.zonelist->zones[i]; i++)
1417                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1418                                return 0;
1419                return b->v.zonelist->zones[i] == NULL;
1420        }
1421        default:
1422                BUG();
1423                return 0;
1424        }
1425}
1426
1427/* Slow path of a mpol destructor. */
1428void __mpol_free(struct mempolicy *p)
1429{
1430        if (!atomic_dec_and_test(&p->refcnt))
1431                return;
1432        if (p->policy == MPOL_BIND)
1433                kfree(p->v.zonelist);
1434        p->policy = MPOL_DEFAULT;
1435        kmem_cache_free(policy_cache, p);
1436}
1437
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
1446
1447/* lookup first element intersecting start-end */
1448/* Caller holds sp->lock */
1449static struct sp_node *
1450sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1451{
1452        struct rb_node *n = sp->root.rb_node;
1453
1454        while (n) {
1455                struct sp_node *p = rb_entry(n, struct sp_node, nd);
1456
1457                if (start >= p->end)
1458                        n = n->rb_right;
1459                else if (end <= p->start)
1460                        n = n->rb_left;
1461                else
1462                        break;
1463        }
1464        if (!n)
1465                return NULL;
1466        for (;;) {
1467                struct sp_node *w = NULL;
1468                struct rb_node *prev = rb_prev(n);
1469                if (!prev)
1470                        break;
1471                w = rb_entry(prev, struct sp_node, nd);
1472                if (w->end <= start)
1473                        break;
1474                n = prev;
1475        }
1476        return rb_entry(n, struct sp_node, nd);
1477}
1478
1479/* Insert a new shared policy into the list. */
1480/* Caller holds sp->lock */
1481static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1482{
1483        struct rb_node **p = &sp->root.rb_node;
1484        struct rb_node *parent = NULL;
1485        struct sp_node *nd;
1486
1487        while (*p) {
1488                parent = *p;
1489                nd = rb_entry(parent, struct sp_node, nd);
1490                if (new->start < nd->start)
1491                        p = &(*p)->rb_left;
1492                else if (new->end > nd->end)
1493                        p = &(*p)->rb_right;
1494                else
1495                        BUG();
1496        }
1497        rb_link_node(&new->nd, parent, p);
1498        rb_insert_color(&new->nd, &sp->root);
1499        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1500                 new->policy ? new->policy->policy : 0);
1501}
1502
1503/* Find shared policy intersecting idx */
1504struct mempolicy *
1505mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1506{
1507        struct mempolicy *pol = NULL;
1508        struct sp_node *sn;
1509
1510        if (!sp->root.rb_node)
1511                return NULL;
1512        spin_lock(&sp->lock);
1513        sn = sp_lookup(sp, idx, idx+1);
1514        if (sn) {
1515                mpol_get(sn->policy);
1516                pol = sn->policy;
1517        }
1518        spin_unlock(&sp->lock);
1519        return pol;
1520}
1521
1522static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1523{
1524        pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1525        rb_erase(&n->nd, &sp->root);
1526        mpol_free(n->policy);
1527        kmem_cache_free(sn_cache, n);
1528}
1529
1530struct sp_node *
1531sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1532{
1533        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1534
1535        if (!n)
1536                return NULL;
1537        n->start = start;
1538        n->end = end;
1539        mpol_get(pol);
1540        n->policy = pol;
1541        return n;
1542}
1543
1544/* Replace a policy range. */
1545static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1546                                 unsigned long end, struct sp_node *new)
1547{
1548        struct sp_node *n, *new2 = NULL;
1549
1550restart:
1551        spin_lock(&sp->lock);
1552        n = sp_lookup(sp, start, end);
1553        /* Take care of old policies in the same range. */
1554        while (n && n->start < end) {
1555                struct rb_node *next = rb_next(&n->nd);
1556                if (n->start >= start) {
1557                        if (n->end <= end)
1558                                sp_delete(sp, n);
1559                        else
1560                                n->start = end;
1561                } else {
1562                        /* Old policy spanning whole new range. */
1563                        if (n->end > end) {
1564                                if (!new2) {
1565                                        spin_unlock(&sp->lock);
1566                                        new2 = sp_alloc(end, n->end, n->policy);
1567                                        if (!new2)
1568                                                return -ENOMEM;
1569                                        goto restart;
1570                                }
1571                                n->end = start;
1572                                sp_insert(sp, new2);
1573                                new2 = NULL;
1574                                break;
1575                        } else
1576                                n->end = start;
1577                }
1578                if (!next)
1579                        break;
1580                n = rb_entry(next, struct sp_node, nd);
1581        }
1582        if (new)
1583                sp_insert(sp, new);
1584        spin_unlock(&sp->lock);
1585        if (new2) {
1586                mpol_free(new2->policy);
1587                kmem_cache_free(sn_cache, new2);
1588        }
1589        return 0;
1590}
1591
1592void mpol_shared_policy_init(struct shared_policy *info, int policy,
1593                                nodemask_t *policy_nodes)
1594{
1595        info->root = RB_ROOT;
1596        spin_lock_init(&info->lock);
1597
1598        if (policy != MPOL_DEFAULT) {
1599                struct mempolicy *newpol;
1600
1601                /* Falls back to MPOL_DEFAULT on any error */
1602                newpol = mpol_new(policy, policy_nodes);
1603                if (!IS_ERR(newpol)) {
1604                        /* Create pseudo-vma that contains just the policy */
1605                        struct vm_area_struct pvma;
1606
1607                        memset(&pvma, 0, sizeof(struct vm_area_struct));
1608                        /* Policy covers entire file */
1609                        pvma.vm_end = TASK_SIZE;
1610                        mpol_set_shared_policy(info, &pvma, newpol);
1611                        mpol_free(newpol);
1612                }
1613        }
1614}
1615
1616int mpol_set_shared_policy(struct shared_policy *info,
1617                        struct vm_area_struct *vma, struct mempolicy *npol)
1618{
1619        int err;
1620        struct sp_node *new = NULL;
1621        unsigned long sz = vma_pages(vma);
1622
1623        pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1624                 vma->vm_pgoff,
1625                 sz, npol? npol->policy : -1,
1626                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1627
1628        if (npol) {
1629                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1630                if (!new)
1631                        return -ENOMEM;
1632        }
1633        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1634        if (err && new)
1635                kmem_cache_free(sn_cache, new);
1636        return err;
1637}
1638
1639/* Free a backing policy store on inode delete. */
1640void mpol_free_shared_policy(struct shared_policy *p)
1641{
1642        struct sp_node *n;
1643        struct rb_node *next;
1644
1645        if (!p->root.rb_node)
1646                return;
1647        spin_lock(&p->lock);
1648        next = rb_first(&p->root);
1649        while (next) {
1650                n = rb_entry(next, struct sp_node, nd);
1651                next = rb_next(&n->nd);
1652                rb_erase(&n->nd, &p->root);
1653                mpol_free(n->policy);
1654                kmem_cache_free(sn_cache, n);
1655        }
1656        spin_unlock(&p->lock);
1657}
1658
1659/* assumes fs == KERNEL_DS */
1660void __init numa_policy_init(void)
1661{
1662        nodemask_t interleave_nodes;
1663        unsigned long largest = 0;
1664        int nid, prefer = 0;
1665
1666        policy_cache = kmem_cache_create("numa_policy",
1667                                         sizeof(struct mempolicy),
1668                                         0, SLAB_PANIC, NULL);
1669
1670        sn_cache = kmem_cache_create("shared_policy_node",
1671                                     sizeof(struct sp_node),
1672                                     0, SLAB_PANIC, NULL);
1673
1674        /*
1675         * Set interleaving policy for system init. Interleaving is only
1676         * enabled across suitably sized nodes (default is >= 16MB), or
1677         * fall back to the largest node if they're all smaller.
1678         */
1679        nodes_clear(interleave_nodes);
1680        for_each_online_node(nid) {
1681                unsigned long total_pages = node_present_pages(nid);
1682
1683                /* Preserve the largest node */
1684                if (largest < total_pages) {
1685                        largest = total_pages;
1686                        prefer = nid;
1687                }
1688
1689                /* Interleave this node? */
1690                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1691                        node_set(nid, interleave_nodes);
1692        }
1693
1694        /* All too small, use the largest */
1695        if (unlikely(nodes_empty(interleave_nodes)))
1696                node_set(prefer, interleave_nodes);
1697
1698        if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1699                printk("numa_policy_init: interleaving failed\n");
1700}
1701
1702/* Reset policy of current process to default */
1703void numa_default_policy(void)
1704{
1705        do_set_mempolicy(MPOL_DEFAULT, NULL);
1706}
1707
1708/* Migrate a policy to a different set of nodes */
1709void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1710{
1711        nodemask_t *mpolmask;
1712        nodemask_t tmp;
1713
1714        if (!pol)
1715                return;
1716        mpolmask = &pol->cpuset_mems_allowed;
1717        if (nodes_equal(*mpolmask, *newmask))
1718                return;
1719
1720        switch (pol->policy) {
1721        case MPOL_DEFAULT:
1722                break;
1723        case MPOL_INTERLEAVE:
1724                nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1725                pol->v.nodes = tmp;
1726                *mpolmask = *newmask;
1727                current->il_next = node_remap(current->il_next,
1728                                                *mpolmask, *newmask);
1729                break;
1730        case MPOL_PREFERRED:
1731                pol->v.preferred_node = node_remap(pol->v.preferred_node,
1732                                                *mpolmask, *newmask);
1733                *mpolmask = *newmask;
1734                break;
1735        case MPOL_BIND: {
1736                nodemask_t nodes;
1737                struct zone **z;
1738                struct zonelist *zonelist;
1739
1740                nodes_clear(nodes);
1741                for (z = pol->v.zonelist->zones; *z; z++)
1742                        node_set(zone_to_nid(*z), nodes);
1743                nodes_remap(tmp, nodes, *mpolmask, *newmask);
1744                nodes = tmp;
1745
1746                zonelist = bind_zonelist(&nodes);
1747
1748                /* If no mem, then zonelist is NULL and we keep old zonelist.
1749                 * If that old zonelist has no remaining mems_allowed nodes,
1750                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1751                 */
1752
1753                if (!IS_ERR(zonelist)) {
1754                        /* Good - got mem - substitute new zonelist */
1755                        kfree(pol->v.zonelist);
1756                        pol->v.zonelist = zonelist;
1757                }
1758                *mpolmask = *newmask;
1759                break;
1760        }
1761        default:
1762                BUG();
1763                break;
1764        }
1765}
1766
1767/*
1768 * Wrapper for mpol_rebind_policy() that just requires task
1769 * pointer, and updates task mempolicy.
1770 */
1771
1772void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1773{
1774        mpol_rebind_policy(tsk->mempolicy, new);
1775}
1776
1777/*
1778 * Rebind each vma in mm to new nodemask.
1779 *
1780 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1781 */
1782
1783void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1784{
1785        struct vm_area_struct *vma;
1786
1787        down_write(&mm->mmap_sem);
1788        for (vma = mm->mmap; vma; vma = vma->vm_next)
1789                mpol_rebind_policy(vma->vm_policy, new);
1790        up_write(&mm->mmap_sem);
1791}
1792
1793/*
1794 * Display pages allocated per node and memory policy via /proc.
1795 */
1796
/*
 * Printable policy names.  mpol_to_str() indexes this table with the
 * policy mode, so the order must follow the numeric values of the
 * MPOL_* mode constants.
 */
static const char * const policy_types[] = {
	"default",
	"prefer",
	"bind",
	"interleave",
};
1799
1800/*
1801 * Convert a mempolicy into a string.
1802 * Returns the number of characters in buffer (if positive)
1803 * or an error (negative)
1804 */
1805static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1806{
1807        char *p = buffer;
1808        int l;
1809        nodemask_t nodes;
1810        int mode = pol ? pol->policy : MPOL_DEFAULT;
1811
1812        switch (mode) {
1813        case MPOL_DEFAULT:
1814                nodes_clear(nodes);
1815                break;
1816
1817        case MPOL_PREFERRED:
1818                nodes_clear(nodes);
1819                node_set(pol->v.preferred_node, nodes);
1820                break;
1821
1822        case MPOL_BIND:
1823                get_zonemask(pol, &nodes);
1824                break;
1825
1826        case MPOL_INTERLEAVE:
1827                nodes = pol->v.nodes;
1828                break;
1829
1830        default:
1831                BUG();
1832                return -EFAULT;
1833        }
1834
1835        l = strlen(policy_types[mode]);
1836        if (buffer + maxlen < p + l + 1)
1837                return -ENOSPC;
1838
1839        strcpy(p, policy_types[mode]);
1840        p += l;
1841
1842        if (!nodes_empty(nodes)) {
1843                if (buffer + maxlen < p + 2)
1844                        return -ENOSPC;
1845                *p++ = '=';
1846                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1847        }
1848        return p - buffer;
1849}
1850
1851struct numa_maps {
1852        unsigned long pages;
1853        unsigned long anon;
1854        unsigned long active;
1855        unsigned long writeback;
1856        unsigned long mapcount_max;
1857        unsigned long dirty;
1858        unsigned long swapcache;
1859        unsigned long node[MAX_NUMNODES];
1860};
1861
1862static void gather_stats(struct page *page, void *private, int pte_dirty)
1863{
1864        struct numa_maps *md = private;
1865        int count = page_mapcount(page);
1866
1867        md->pages++;
1868        if (pte_dirty || PageDirty(page))
1869                md->dirty++;
1870
1871        if (PageSwapCache(page))
1872                md->swapcache++;
1873
1874        if (PageActive(page))
1875                md->active++;
1876
1877        if (PageWriteback(page))
1878                md->writeback++;
1879
1880        if (PageAnon(page))
1881                md->anon++;
1882
1883        if (count > md->mapcount_max)
1884                md->mapcount_max = count;
1885
1886        md->node[page_to_nid(page)]++;
1887}
1888
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for a hugetlb vma.
 *
 * Steps through [start, end) in HPAGE_SIZE increments, looks up the huge
 * pte for each step and passes every page found to gather_stats().
 * Steps with no pte, an empty pte or no page are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		pte_t pte;

		if (!ptep)
			continue;

		/* Work from one snapshot of the entry from here on. */
		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		/*
		 * Take the dirty bit from the same snapshot the page came
		 * from; re-reading *ptep could pair this page with the
		 * dirty state of a pte that changed in the meantime.
		 */
		gather_stats(page, md, pte_dirty(pte));
	}
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
1922
1923int show_numa_map(struct seq_file *m, void *v)
1924{
1925        struct proc_maps_private *priv = m->private;
1926        struct vm_area_struct *vma = v;
1927        struct numa_maps *md;
1928        struct file *file = vma->vm_file;
1929        struct mm_struct *mm = vma->vm_mm;
1930        struct mempolicy *pol;
1931        int n;
1932        char buffer[50];
1933
1934        if (!mm)
1935                return 0;
1936
1937        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1938        if (!md)
1939                return 0;
1940
1941        pol = get_vma_policy(priv->task, vma, vma->vm_start);
1942        mpol_to_str(buffer, sizeof(buffer), pol);
1943        /*
1944         * unref shared or other task's mempolicy
1945         */
1946        if (pol != &default_policy && pol != current->mempolicy)
1947                __mpol_free(pol);
1948
1949        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1950
1951        if (file) {
1952                seq_printf(m, " file=");
1953                seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1954        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1955                seq_printf(m, " heap");
1956        } else if (vma->vm_start <= mm->start_stack &&
1957                        vma->vm_end >= mm->start_stack) {
1958                seq_printf(m, " stack");
1959        }
1960
1961        if (is_vm_hugetlb_page(vma)) {
1962                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1963                seq_printf(m, " huge");
1964        } else {
1965                check_pgd_range(vma, vma->vm_start, vma->vm_end,
1966                                &node_online_map, MPOL_MF_STATS, md);
1967        }
1968
1969        if (!md->pages)
1970                goto out;
1971
1972        if (md->anon)
1973                seq_printf(m," anon=%lu",md->anon);
1974
1975        if (md->dirty)
1976                seq_printf(m," dirty=%lu",md->dirty);
1977
1978        if (md->pages != md->anon && md->pages != md->dirty)
1979                seq_printf(m, " mapped=%lu", md->pages);
1980
1981        if (md->mapcount_max > 1)
1982                seq_printf(m, " mapmax=%lu", md->mapcount_max);
1983
1984        if (md->swapcache)
1985                seq_printf(m," swapcache=%lu", md->swapcache);
1986
1987        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1988                seq_printf(m," active=%lu", md->active);
1989
1990        if (md->writeback)
1991                seq_printf(m," writeback=%lu", md->writeback);
1992
1993        for_each_online_node(n)
1994                if (md->node[n])
1995                        seq_printf(m, " N%d=%lu", n, md->node[n]);
1996out:
1997        seq_putc(m, '\n');
1998        kfree(md);
1999
2000        if (m->count < m->size)
2001                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2002        return 0;
2003}
2004
2005
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.