linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
    8 * NUMA policy allows the user to give hints about which node(s) memory
    9 * should be allocated on.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process
   20 *                counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case node -1 here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
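
/*
 * A rough userspace sketch (an illustration, assuming the set_mempolicy(2)
 * and mbind(2) wrappers from libnuma's <numaif.h>) of how the modes above
 * are typically selected on a machine with nodes 0 and 1:
 *
 *	#include <numaif.h>		// set_mempolicy(), mbind(), MPOL_*
 *	#include <stdlib.h>
 *
 *	unsigned long nodes = 0x3;	// nodemask with nodes 0 and 1 set
 *
 *	// process policy: interleave future allocations over nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *
 *	// VMA policy: bind one mapping to node 0 only, no fallback
 *	unsigned long node0 = 0x1;
 *	void *buf = aligned_alloc(4096, 1 << 20);
 *	mbind(buf, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 *
 *	// back to the default (local) policy for the process
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */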
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66*/
  67
  68#include <linux/mempolicy.h>
  69#include <linux/mm.h>
  70#include <linux/highmem.h>
  71#include <linux/hugetlb.h>
  72#include <linux/kernel.h>
  73#include <linux/sched.h>
  74#include <linux/nodemask.h>
  75#include <linux/cpuset.h>
  76#include <linux/slab.h>
  77#include <linux/string.h>
  78#include <linux/export.h>
  79#include <linux/nsproxy.h>
  80#include <linux/interrupt.h>
  81#include <linux/init.h>
  82#include <linux/compat.h>
  83#include <linux/swap.h>
  84#include <linux/seq_file.h>
  85#include <linux/proc_fs.h>
  86#include <linux/migrate.h>
  87#include <linux/ksm.h>
  88#include <linux/rmap.h>
  89#include <linux/security.h>
  90#include <linux/syscalls.h>
  91#include <linux/ctype.h>
  92#include <linux/mm_inline.h>
  93#include <linux/mmu_notifier.h>
  94
  95#include <asm/tlbflush.h>
  96#include <asm/uaccess.h>
  97#include <linux/random.h>
  98
  99#include "internal.h"
 100
 101/* Internal flags */
 102#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 103#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 104
 105static struct kmem_cache *policy_cache;
 106static struct kmem_cache *sn_cache;
 107
  108/* Highest zone. A specific allocation for a zone below that is not
  109   policied. */
 110enum zone_type policy_zone = 0;
 111
 112/*
 113 * run-time system-wide default policy => local allocation
 114 */
 115static struct mempolicy default_policy = {
 116        .refcnt = ATOMIC_INIT(1), /* never free it */
 117        .mode = MPOL_PREFERRED,
 118        .flags = MPOL_F_LOCAL,
 119};
 120
 121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 122
 123static struct mempolicy *get_task_policy(struct task_struct *p)
 124{
 125        struct mempolicy *pol = p->mempolicy;
 126        int node;
 127
 128        if (!pol) {
 129                node = numa_node_id();
 130                if (node != -1)
 131                        pol = &preferred_node_policy[node];
 132
 133                /* preferred_node_policy is not initialised early in boot */
 134                if (!pol->mode)
 135                        pol = NULL;
 136        }
 137
 138        return pol;
 139}
 140
 141static const struct mempolicy_operations {
 142        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 143        /*
  144         * If the read-side task has no lock protecting task->mempolicy, the
  145         * write-side task rebinds task->mempolicy in two steps. The first
  146         * step sets all of the new nodes, and the second step clears all of
  147         * the disallowed nodes. This way the policy never passes through an
  148         * empty state with no node to allocate pages from.
  149         * If the read side does hold a lock protecting task->mempolicy, we
  150         * rebind directly in a single pass.
  151         *
  152         * step:
  153         *      MPOL_REBIND_ONCE  - do the rebind work in one pass
  154         *      MPOL_REBIND_STEP1 - set all of the new nodes
  155         *      MPOL_REBIND_STEP2 - clear all of the disallowed nodes
 156         */
 157        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 158                        enum mpol_rebind_step step);
 159} mpol_ops[MPOL_MAX];
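
/*
 * Rough sketch of the two-step usage by a writer that cannot rely on a
 * read-side lock (e.g. a cpuset mems update); exact caller details may
 * differ, but the ordering is the point:
 *
 *	nodemask_t newmems = ...;	// the new set of allowed nodes
 *
 *	task_lock(tsk);
 *	// step 1: the policy nodemask becomes old | new, so concurrent
 *	// readers always find at least one allowed node
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);
 *	tsk->mems_allowed = newmems;
 *	// step 2: drop the nodes that are no longer allowed
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);
 *	task_unlock(tsk);
 *
 * A writer that does hold the read-side lock simply passes
 * MPOL_REBIND_ONCE and rebinds in a single pass.
 */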
 160
 161/* Check that the nodemask contains at least one populated zone */
 162static int is_valid_nodemask(const nodemask_t *nodemask)
 163{
 164        int nd, k;
 165
 166        for_each_node_mask(nd, *nodemask) {
 167                struct zone *z;
 168
 169                for (k = 0; k <= policy_zone; k++) {
 170                        z = &NODE_DATA(nd)->node_zones[k];
 171                        if (z->present_pages > 0)
 172                                return 1;
 173                }
 174        }
 175
 176        return 0;
 177}
 178
 179static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 180{
 181        return pol->flags & MPOL_MODE_FLAGS;
 182}
 183
 184static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 185                                   const nodemask_t *rel)
 186{
 187        nodemask_t tmp;
 188        nodes_fold(tmp, *orig, nodes_weight(*rel));
 189        nodes_onto(*ret, tmp, *rel);
 190}
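
/*
 * Worked example (illustrative): with a user-supplied relative mask
 * orig = {1,5} and currently allowed nodes rel = {4,6,7} (weight 3),
 * nodes_fold() wraps orig modulo 3 into tmp = {1,2}, and nodes_onto()
 * then maps bit n of tmp to the n-th set bit of rel, so *ret = {6,7}.
 */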
 191
 192static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 193{
 194        if (nodes_empty(*nodes))
 195                return -EINVAL;
 196        pol->v.nodes = *nodes;
 197        return 0;
 198}
 199
 200static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 201{
 202        if (!nodes)
 203                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 204        else if (nodes_empty(*nodes))
 205                return -EINVAL;                 /*  no allowed nodes */
 206        else
 207                pol->v.preferred_node = first_node(*nodes);
 208        return 0;
 209}
 210
 211static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 212{
 213        if (!is_valid_nodemask(nodes))
 214                return -EINVAL;
 215        pol->v.nodes = *nodes;
 216        return 0;
 217}
 218
 219/*
 220 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 221 * any, for the new policy.  mpol_new() has already validated the nodes
 222 * parameter with respect to the policy mode and flags.  But, we need to
 223 * handle an empty nodemask with MPOL_PREFERRED here.
 224 *
 225 * Must be called holding task's alloc_lock to protect task's mems_allowed
 226 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 227 */
 228static int mpol_set_nodemask(struct mempolicy *pol,
 229                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 230{
 231        int ret;
 232
 233        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 234        if (pol == NULL)
 235                return 0;
 236        /* Check N_MEMORY */
 237        nodes_and(nsc->mask1,
 238                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 239
 240        VM_BUG_ON(!nodes);
 241        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 242                nodes = NULL;   /* explicit local allocation */
 243        else {
 244                if (pol->flags & MPOL_F_RELATIVE_NODES)
 245                        mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 246                else
 247                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 248
 249                if (mpol_store_user_nodemask(pol))
 250                        pol->w.user_nodemask = *nodes;
 251                else
 252                        pol->w.cpuset_mems_allowed =
 253                                                cpuset_current_mems_allowed;
 254        }
 255
 256        if (nodes)
 257                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 258        else
 259                ret = mpol_ops[pol->mode].create(pol, NULL);
 260        return ret;
 261}
 262
 263/*
  264 * This function just creates a new policy, does some checks and simple
  265 * initialization. You must invoke mpol_set_nodemask() to set the nodes.
 266 */
 267static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 268                                  nodemask_t *nodes)
 269{
 270        struct mempolicy *policy;
 271
 272        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 273                 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 274
 275        if (mode == MPOL_DEFAULT) {
 276                if (nodes && !nodes_empty(*nodes))
 277                        return ERR_PTR(-EINVAL);
 278                return NULL;
 279        }
 280        VM_BUG_ON(!nodes);
 281
 282        /*
 283         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 284         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 285         * All other modes require a valid pointer to a non-empty nodemask.
 286         */
 287        if (mode == MPOL_PREFERRED) {
 288                if (nodes_empty(*nodes)) {
 289                        if (((flags & MPOL_F_STATIC_NODES) ||
 290                             (flags & MPOL_F_RELATIVE_NODES)))
 291                                return ERR_PTR(-EINVAL);
 292                }
 293        } else if (mode == MPOL_LOCAL) {
 294                if (!nodes_empty(*nodes))
 295                        return ERR_PTR(-EINVAL);
 296                mode = MPOL_PREFERRED;
 297        } else if (nodes_empty(*nodes))
 298                return ERR_PTR(-EINVAL);
 299        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 300        if (!policy)
 301                return ERR_PTR(-ENOMEM);
 302        atomic_set(&policy->refcnt, 1);
 303        policy->mode = mode;
 304        policy->flags = flags;
 305
 306        return policy;
 307}
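
/*
 * Typical construction sequence, a condensed sketch of what
 * do_set_mempolicy() below does (error handling and mmap_sem omitted):
 *
 *	NODEMASK_SCRATCH(scratch);
 *	struct mempolicy *new;
 *
 *	new = mpol_new(mode, flags, nodes);	// validate mode/flags/nodes
 *	task_lock(current);			// protects mems_allowed
 *	mpol_set_nodemask(new, nodes, scratch);	// intersect with the cpuset
 *						// and store effective nodes
 *	task_unlock(current);
 *	NODEMASK_SCRATCH_FREE(scratch);
 */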
 308
 309/* Slow path of a mpol destructor. */
 310void __mpol_put(struct mempolicy *p)
 311{
 312        if (!atomic_dec_and_test(&p->refcnt))
 313                return;
 314        kmem_cache_free(policy_cache, p);
 315}
 316
 317static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 318                                enum mpol_rebind_step step)
 319{
 320}
 321
 322/*
 323 * step:
 324 *      MPOL_REBIND_ONCE  - do rebind work at once
 325 *      MPOL_REBIND_STEP1 - set all the newly nodes
 326 *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 327 */
 328static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 329                                 enum mpol_rebind_step step)
 330{
 331        nodemask_t tmp;
 332
 333        if (pol->flags & MPOL_F_STATIC_NODES)
 334                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 335        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 336                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 337        else {
 338                /*
 339                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 340                 * result
 341                 */
 342                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 343                        nodes_remap(tmp, pol->v.nodes,
 344                                        pol->w.cpuset_mems_allowed, *nodes);
 345                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 346                } else if (step == MPOL_REBIND_STEP2) {
 347                        tmp = pol->w.cpuset_mems_allowed;
 348                        pol->w.cpuset_mems_allowed = *nodes;
 349                } else
 350                        BUG();
 351        }
 352
 353        if (nodes_empty(tmp))
 354                tmp = *nodes;
 355
 356        if (step == MPOL_REBIND_STEP1)
 357                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 358        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 359                pol->v.nodes = tmp;
 360        else
 361                BUG();
 362
 363        if (!node_isset(current->il_next, tmp)) {
 364                current->il_next = next_node(current->il_next, tmp);
 365                if (current->il_next >= MAX_NUMNODES)
 366                        current->il_next = first_node(tmp);
 367                if (current->il_next >= MAX_NUMNODES)
 368                        current->il_next = numa_node_id();
 369        }
 370}
 371
 372static void mpol_rebind_preferred(struct mempolicy *pol,
 373                                  const nodemask_t *nodes,
 374                                  enum mpol_rebind_step step)
 375{
 376        nodemask_t tmp;
 377
 378        if (pol->flags & MPOL_F_STATIC_NODES) {
 379                int node = first_node(pol->w.user_nodemask);
 380
 381                if (node_isset(node, *nodes)) {
 382                        pol->v.preferred_node = node;
 383                        pol->flags &= ~MPOL_F_LOCAL;
 384                } else
 385                        pol->flags |= MPOL_F_LOCAL;
 386        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 387                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 388                pol->v.preferred_node = first_node(tmp);
 389        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 390                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 391                                                   pol->w.cpuset_mems_allowed,
 392                                                   *nodes);
 393                pol->w.cpuset_mems_allowed = *nodes;
 394        }
 395}
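
/*
 * Worked example (illustrative): the user asked for preferred node 2 and
 * the cpuset's allowed nodes change from {0-3} to {4-7}:
 *
 *   MPOL_F_STATIC_NODES:   node 2 is no longer allowed, so the policy
 *                          falls back to local allocation (MPOL_F_LOCAL).
 *   MPOL_F_RELATIVE_NODES: {2} is folded onto {4-7}; node 2 was the third
 *                          relative node, so the preferred node becomes 6.
 *   neither flag:          node_remap(2, {0-3}, {4-7}) keeps the relative
 *                          position within the mask and also yields node 6.
 */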
 396
 397/*
 398 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 399 *
  400 * If the read-side task has no lock protecting task->mempolicy, the
  401 * write-side task rebinds task->mempolicy in two steps. The first
  402 * step sets all of the new nodes, and the second step clears all of
  403 * the disallowed nodes. This way the policy never passes through an
  404 * empty state with no node to allocate pages from.
  405 * If the read side does hold a lock protecting task->mempolicy, we
  406 * rebind directly in a single pass.
  407 *
  408 * step:
  409 *      MPOL_REBIND_ONCE  - do the rebind work in one pass
  410 *      MPOL_REBIND_STEP1 - set all of the new nodes
  411 *      MPOL_REBIND_STEP2 - clear all of the disallowed nodes
 412 */
 413static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 414                                enum mpol_rebind_step step)
 415{
 416        if (!pol)
 417                return;
 418        if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 419            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 420                return;
 421
 422        if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 423                return;
 424
 425        if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 426                BUG();
 427
 428        if (step == MPOL_REBIND_STEP1)
 429                pol->flags |= MPOL_F_REBINDING;
 430        else if (step == MPOL_REBIND_STEP2)
 431                pol->flags &= ~MPOL_F_REBINDING;
 432        else if (step >= MPOL_REBIND_NSTEP)
 433                BUG();
 434
 435        mpol_ops[pol->mode].rebind(pol, newmask, step);
 436}
 437
 438/*
  439 * Wrapper for mpol_rebind_policy() that just requires a task
  440 * pointer, and updates the task's mempolicy.
 441 *
 442 * Called with task's alloc_lock held.
 443 */
 444
 445void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 446                        enum mpol_rebind_step step)
 447{
 448        mpol_rebind_policy(tsk->mempolicy, new, step);
 449}
 450
 451/*
 452 * Rebind each vma in mm to new nodemask.
 453 *
 454 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 455 */
 456
 457void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 458{
 459        struct vm_area_struct *vma;
 460
 461        down_write(&mm->mmap_sem);
 462        for (vma = mm->mmap; vma; vma = vma->vm_next)
 463                mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 464        up_write(&mm->mmap_sem);
 465}
 466
 467static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 468        [MPOL_DEFAULT] = {
 469                .rebind = mpol_rebind_default,
 470        },
 471        [MPOL_INTERLEAVE] = {
 472                .create = mpol_new_interleave,
 473                .rebind = mpol_rebind_nodemask,
 474        },
 475        [MPOL_PREFERRED] = {
 476                .create = mpol_new_preferred,
 477                .rebind = mpol_rebind_preferred,
 478        },
 479        [MPOL_BIND] = {
 480                .create = mpol_new_bind,
 481                .rebind = mpol_rebind_nodemask,
 482        },
 483};
 484
 485static void migrate_page_add(struct page *page, struct list_head *pagelist,
 486                                unsigned long flags);
 487
  488/* Walk the pte range and check whether pages match the given nodemask conditions. */
 489static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 490                unsigned long addr, unsigned long end,
 491                const nodemask_t *nodes, unsigned long flags,
 492                void *private)
 493{
 494        pte_t *orig_pte;
 495        pte_t *pte;
 496        spinlock_t *ptl;
 497
 498        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 499        do {
 500                struct page *page;
 501                int nid;
 502
 503                if (!pte_present(*pte))
 504                        continue;
 505                page = vm_normal_page(vma, addr, *pte);
 506                if (!page)
 507                        continue;
 508                /*
 509                 * vm_normal_page() filters out zero pages, but there might
 510                 * still be PageReserved pages to skip, perhaps in a VDSO.
 511                 * And we cannot move PageKsm pages sensibly or safely yet.
 512                 */
 513                if (PageReserved(page) || PageKsm(page))
 514                        continue;
 515                nid = page_to_nid(page);
 516                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 517                        continue;
 518
 519                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 520                        migrate_page_add(page, private, flags);
 521                else
 522                        break;
 523        } while (pte++, addr += PAGE_SIZE, addr != end);
 524        pte_unmap_unlock(orig_pte, ptl);
 525        return addr != end;
 526}
 527
 528static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 529                unsigned long addr, unsigned long end,
 530                const nodemask_t *nodes, unsigned long flags,
 531                void *private)
 532{
 533        pmd_t *pmd;
 534        unsigned long next;
 535
 536        pmd = pmd_offset(pud, addr);
 537        do {
 538                next = pmd_addr_end(addr, end);
 539                split_huge_page_pmd(vma, addr, pmd);
 540                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 541                        continue;
 542                if (check_pte_range(vma, pmd, addr, next, nodes,
 543                                    flags, private))
 544                        return -EIO;
 545        } while (pmd++, addr = next, addr != end);
 546        return 0;
 547}
 548
 549static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 550                unsigned long addr, unsigned long end,
 551                const nodemask_t *nodes, unsigned long flags,
 552                void *private)
 553{
 554        pud_t *pud;
 555        unsigned long next;
 556
 557        pud = pud_offset(pgd, addr);
 558        do {
 559                next = pud_addr_end(addr, end);
 560                if (pud_none_or_clear_bad(pud))
 561                        continue;
 562                if (check_pmd_range(vma, pud, addr, next, nodes,
 563                                    flags, private))
 564                        return -EIO;
 565        } while (pud++, addr = next, addr != end);
 566        return 0;
 567}
 568
 569static inline int check_pgd_range(struct vm_area_struct *vma,
 570                unsigned long addr, unsigned long end,
 571                const nodemask_t *nodes, unsigned long flags,
 572                void *private)
 573{
 574        pgd_t *pgd;
 575        unsigned long next;
 576
 577        pgd = pgd_offset(vma->vm_mm, addr);
 578        do {
 579                next = pgd_addr_end(addr, end);
 580                if (pgd_none_or_clear_bad(pgd))
 581                        continue;
 582                if (check_pud_range(vma, pgd, addr, next, nodes,
 583                                    flags, private))
 584                        return -EIO;
 585        } while (pgd++, addr = next, addr != end);
 586        return 0;
 587}
 588
 589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 590/*
  591 * This is used to mark a range of virtual addresses as inaccessible.
 592 * These are later cleared by a NUMA hinting fault. Depending on these
 593 * faults, pages may be migrated for better NUMA placement.
 594 *
  595 * This assumes that NUMA faults are handled using PROT_NONE. If
 596 * an architecture makes a different choice, it will need further
 597 * changes to the core.
 598 */
 599unsigned long change_prot_numa(struct vm_area_struct *vma,
 600                        unsigned long addr, unsigned long end)
 601{
 602        int nr_updated;
 603        BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 604
 605        nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 606        if (nr_updated)
 607                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 608
 609        return nr_updated;
 610}
 611#else
 612static unsigned long change_prot_numa(struct vm_area_struct *vma,
 613                        unsigned long addr, unsigned long end)
 614{
 615        return 0;
 616}
 617#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 618
 619/*
 620 * Check if all pages in a range are on a set of nodes.
 621 * If pagelist != NULL then isolate pages from the LRU and
 622 * put them on the pagelist.
 623 */
 624static struct vm_area_struct *
 625check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 626                const nodemask_t *nodes, unsigned long flags, void *private)
 627{
 628        int err;
 629        struct vm_area_struct *first, *vma, *prev;
 630
 631
 632        first = find_vma(mm, start);
 633        if (!first)
 634                return ERR_PTR(-EFAULT);
 635        prev = NULL;
 636        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 637                unsigned long endvma = vma->vm_end;
 638
 639                if (endvma > end)
 640                        endvma = end;
 641                if (vma->vm_start > start)
 642                        start = vma->vm_start;
 643
 644                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 645                        if (!vma->vm_next && vma->vm_end < end)
 646                                return ERR_PTR(-EFAULT);
 647                        if (prev && prev->vm_end < vma->vm_start)
 648                                return ERR_PTR(-EFAULT);
 649                }
 650
 651                if (is_vm_hugetlb_page(vma))
 652                        goto next;
 653
 654                if (flags & MPOL_MF_LAZY) {
 655                        change_prot_numa(vma, start, endvma);
 656                        goto next;
 657                }
 658
 659                if ((flags & MPOL_MF_STRICT) ||
 660                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 661                      vma_migratable(vma))) {
 662
 663                        err = check_pgd_range(vma, start, endvma, nodes,
 664                                                flags, private);
 665                        if (err) {
 666                                first = ERR_PTR(err);
 667                                break;
 668                        }
 669                }
 670next:
 671                prev = vma;
 672        }
 673        return first;
 674}
 675
 676/*
 677 * Apply policy to a single VMA
 678 * This must be called with the mmap_sem held for writing.
 679 */
 680static int vma_replace_policy(struct vm_area_struct *vma,
 681                                                struct mempolicy *pol)
 682{
 683        int err;
 684        struct mempolicy *old;
 685        struct mempolicy *new;
 686
 687        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 688                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 689                 vma->vm_ops, vma->vm_file,
 690                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 691
 692        new = mpol_dup(pol);
 693        if (IS_ERR(new))
 694                return PTR_ERR(new);
 695
 696        if (vma->vm_ops && vma->vm_ops->set_policy) {
 697                err = vma->vm_ops->set_policy(vma, new);
 698                if (err)
 699                        goto err_out;
 700        }
 701
 702        old = vma->vm_policy;
 703        vma->vm_policy = new; /* protected by mmap_sem */
 704        mpol_put(old);
 705
 706        return 0;
 707 err_out:
 708        mpol_put(new);
 709        return err;
 710}
 711
 712/* Step 2: apply policy to a range and do splits. */
 713static int mbind_range(struct mm_struct *mm, unsigned long start,
 714                       unsigned long end, struct mempolicy *new_pol)
 715{
 716        struct vm_area_struct *next;
 717        struct vm_area_struct *prev;
 718        struct vm_area_struct *vma;
 719        int err = 0;
 720        pgoff_t pgoff;
 721        unsigned long vmstart;
 722        unsigned long vmend;
 723
 724        vma = find_vma(mm, start);
 725        if (!vma || vma->vm_start > start)
 726                return -EFAULT;
 727
 728        prev = vma->vm_prev;
 729        if (start > vma->vm_start)
 730                prev = vma;
 731
 732        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 733                next = vma->vm_next;
 734                vmstart = max(start, vma->vm_start);
 735                vmend   = min(end, vma->vm_end);
 736
 737                if (mpol_equal(vma_policy(vma), new_pol))
 738                        continue;
 739
 740                pgoff = vma->vm_pgoff +
 741                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 742                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 743                                  vma->anon_vma, vma->vm_file, pgoff,
 744                                  new_pol);
 745                if (prev) {
 746                        vma = prev;
 747                        next = vma->vm_next;
 748                        continue;
 749                }
 750                if (vma->vm_start != vmstart) {
 751                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 752                        if (err)
 753                                goto out;
 754                }
 755                if (vma->vm_end != vmend) {
 756                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 757                        if (err)
 758                                goto out;
 759                }
 760                err = vma_replace_policy(vma, new_pol);
 761                if (err)
 762                        goto out;
 763        }
 764
 765 out:
 766        return err;
 767}
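
/*
 * Rough picture of what mbind_range() does when the requested range only
 * covers the middle of an existing VMA and vma_merge() cannot absorb it:
 *
 *	before:  |----------------- vma -----------------|
 *	                   start              end
 *	after:   |--old--|------- new_pol -------|--old--|
 *
 * split_vma() is called at vmstart and vmend, and vma_replace_policy() is
 * then applied only to the middle piece; neighbours that already carry an
 * equal policy are coalesced by the vma_merge() attempt instead.
 */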
 768
 769/*
 770 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 771 * mempolicy.  Allows more rapid checking of this (combined perhaps
 772 * with other PF_* flag bits) on memory allocation hot code paths.
 773 *
 774 * If called from outside this file, the task 'p' should -only- be
 775 * a newly forked child not yet visible on the task list, because
 776 * manipulating the task flags of a visible task is not safe.
 777 *
 778 * The above limitation is why this routine has the funny name
 779 * mpol_fix_fork_child_flag().
 780 *
 781 * It is also safe to call this with a task pointer of current,
 782 * which the static wrapper mpol_set_task_struct_flag() does,
 783 * for use within this file.
 784 */
 785
 786void mpol_fix_fork_child_flag(struct task_struct *p)
 787{
 788        if (p->mempolicy)
 789                p->flags |= PF_MEMPOLICY;
 790        else
 791                p->flags &= ~PF_MEMPOLICY;
 792}
 793
 794static void mpol_set_task_struct_flag(void)
 795{
 796        mpol_fix_fork_child_flag(current);
 797}
 798
 799/* Set the process memory policy */
 800static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 801                             nodemask_t *nodes)
 802{
 803        struct mempolicy *new, *old;
 804        struct mm_struct *mm = current->mm;
 805        NODEMASK_SCRATCH(scratch);
 806        int ret;
 807
 808        if (!scratch)
 809                return -ENOMEM;
 810
 811        new = mpol_new(mode, flags, nodes);
 812        if (IS_ERR(new)) {
 813                ret = PTR_ERR(new);
 814                goto out;
 815        }
 816        /*
 817         * prevent changing our mempolicy while show_numa_maps()
 818         * is using it.
 819         * Note:  do_set_mempolicy() can be called at init time
 820         * with no 'mm'.
 821         */
 822        if (mm)
 823                down_write(&mm->mmap_sem);
 824        task_lock(current);
 825        ret = mpol_set_nodemask(new, nodes, scratch);
 826        if (ret) {
 827                task_unlock(current);
 828                if (mm)
 829                        up_write(&mm->mmap_sem);
 830                mpol_put(new);
 831                goto out;
 832        }
 833        old = current->mempolicy;
 834        current->mempolicy = new;
 835        mpol_set_task_struct_flag();
 836        if (new && new->mode == MPOL_INTERLEAVE &&
 837            nodes_weight(new->v.nodes))
 838                current->il_next = first_node(new->v.nodes);
 839        task_unlock(current);
 840        if (mm)
 841                up_write(&mm->mmap_sem);
 842
 843        mpol_put(old);
 844        ret = 0;
 845out:
 846        NODEMASK_SCRATCH_FREE(scratch);
 847        return ret;
 848}
 849
 850/*
 851 * Return nodemask for policy for get_mempolicy() query
 852 *
 853 * Called with task's alloc_lock held
 854 */
 855static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 856{
 857        nodes_clear(*nodes);
 858        if (p == &default_policy)
 859                return;
 860
 861        switch (p->mode) {
 862        case MPOL_BIND:
 863                /* Fall through */
 864        case MPOL_INTERLEAVE:
 865                *nodes = p->v.nodes;
 866                break;
 867        case MPOL_PREFERRED:
 868                if (!(p->flags & MPOL_F_LOCAL))
 869                        node_set(p->v.preferred_node, *nodes);
 870                /* else return empty node mask for local allocation */
 871                break;
 872        default:
 873                BUG();
 874        }
 875}
 876
 877static int lookup_node(struct mm_struct *mm, unsigned long addr)
 878{
 879        struct page *p;
 880        int err;
 881
 882        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 883        if (err >= 0) {
 884                err = page_to_nid(p);
 885                put_page(p);
 886        }
 887        return err;
 888}
 889
 890/* Retrieve NUMA policy */
 891static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 892                             unsigned long addr, unsigned long flags)
 893{
 894        int err;
 895        struct mm_struct *mm = current->mm;
 896        struct vm_area_struct *vma = NULL;
 897        struct mempolicy *pol = current->mempolicy;
 898
 899        if (flags &
 900                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 901                return -EINVAL;
 902
 903        if (flags & MPOL_F_MEMS_ALLOWED) {
 904                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 905                        return -EINVAL;
 906                *policy = 0;    /* just so it's initialized */
 907                task_lock(current);
 908                *nmask  = cpuset_current_mems_allowed;
 909                task_unlock(current);
 910                return 0;
 911        }
 912
 913        if (flags & MPOL_F_ADDR) {
 914                /*
 915                 * Do NOT fall back to task policy if the
 916                 * vma/shared policy at addr is NULL.  We
 917                 * want to return MPOL_DEFAULT in this case.
 918                 */
 919                down_read(&mm->mmap_sem);
 920                vma = find_vma_intersection(mm, addr, addr+1);
 921                if (!vma) {
 922                        up_read(&mm->mmap_sem);
 923                        return -EFAULT;
 924                }
 925                if (vma->vm_ops && vma->vm_ops->get_policy)
 926                        pol = vma->vm_ops->get_policy(vma, addr);
 927                else
 928                        pol = vma->vm_policy;
 929        } else if (addr)
 930                return -EINVAL;
 931
 932        if (!pol)
 933                pol = &default_policy;  /* indicates default behavior */
 934
 935        if (flags & MPOL_F_NODE) {
 936                if (flags & MPOL_F_ADDR) {
 937                        err = lookup_node(mm, addr);
 938                        if (err < 0)
 939                                goto out;
 940                        *policy = err;
 941                } else if (pol == current->mempolicy &&
 942                                pol->mode == MPOL_INTERLEAVE) {
 943                        *policy = current->il_next;
 944                } else {
 945                        err = -EINVAL;
 946                        goto out;
 947                }
 948        } else {
 949                *policy = pol == &default_policy ? MPOL_DEFAULT :
 950                                                pol->mode;
 951                /*
 952                 * Internal mempolicy flags must be masked off before exposing
 953                 * the policy to userspace.
 954                 */
 955                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 956        }
 957
 958        if (vma) {
 959                up_read(&current->mm->mmap_sem);
 960                vma = NULL;
 961        }
 962
 963        err = 0;
 964        if (nmask) {
 965                if (mpol_store_user_nodemask(pol)) {
 966                        *nmask = pol->w.user_nodemask;
 967                } else {
 968                        task_lock(current);
 969                        get_policy_nodemask(pol, nmask);
 970                        task_unlock(current);
 971                }
 972        }
 973
 974 out:
 975        mpol_cond_put(pol);
 976        if (vma)
 977                up_read(&current->mm->mmap_sem);
 978        return err;
 979}
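
/*
 * Rough userspace sketch (an illustration, assuming the get_mempolicy(2)
 * wrapper from libnuma's <numaif.h>) of the three query modes handled
 * above:
 *
 *	int mode;
 *	unsigned long mask;
 *	void *addr = ...;	// some mapped address
 *
 *	// the process policy and its nodemask
 *	get_mempolicy(&mode, &mask, sizeof(mask) * 8, NULL, 0);
 *
 *	// the node that actually backs the page at addr
 *	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 *	// the set of nodes the task is allowed to allocate from
 *	get_mempolicy(NULL, &mask, sizeof(mask) * 8, NULL, MPOL_F_MEMS_ALLOWED);
 */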
 980
 981#ifdef CONFIG_MIGRATION
 982/*
 983 * page migration
 984 */
 985static void migrate_page_add(struct page *page, struct list_head *pagelist,
 986                                unsigned long flags)
 987{
 988        /*
 989         * Avoid migrating a page that is shared with others.
 990         */
 991        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 992                if (!isolate_lru_page(page)) {
 993                        list_add_tail(&page->lru, pagelist);
 994                        inc_zone_page_state(page, NR_ISOLATED_ANON +
 995                                            page_is_file_cache(page));
 996                }
 997        }
 998}
 999
1000static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1001{
1002        return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1003}
1004
1005/*
1006 * Migrate pages from one node to a target node.
1007 * Returns error or the number of pages not migrated.
1008 */
1009static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1010                           int flags)
1011{
1012        nodemask_t nmask;
1013        LIST_HEAD(pagelist);
1014        int err = 0;
1015
1016        nodes_clear(nmask);
1017        node_set(source, nmask);
1018
1019        /*
1020         * This does not "check" the range but isolates all pages that
1021         * need migration.  Between passing in the full user address
1022         * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1023         */
1024        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1025        check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1026                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1027
1028        if (!list_empty(&pagelist)) {
1029                err = migrate_pages(&pagelist, new_node_page, dest,
1030                                                        false, MIGRATE_SYNC,
1031                                                        MR_SYSCALL);
1032                if (err)
1033                        putback_lru_pages(&pagelist);
1034        }
1035
1036        return err;
1037}
1038
1039/*
1040 * Move pages between the two nodesets so as to preserve the physical
1041 * layout as much as possible.
1042 *
 1043 * Returns the number of pages that could not be moved.
1044 */
1045int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1046                     const nodemask_t *to, int flags)
1047{
1048        int busy = 0;
1049        int err;
1050        nodemask_t tmp;
1051
1052        err = migrate_prep();
1053        if (err)
1054                return err;
1055
1056        down_read(&mm->mmap_sem);
1057
1058        err = migrate_vmas(mm, from, to, flags);
1059        if (err)
1060                goto out;
1061
1062        /*
1063         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1064         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1065         * bit in 'tmp', and return that <source, dest> pair for migration.
1066         * The pair of nodemasks 'to' and 'from' define the map.
1067         *
 1068         * If no pair of bits is found that way, fall back to picking some
1069         * pair of 'source' and 'dest' bits that are not the same.  If the
1070         * 'source' and 'dest' bits are the same, this represents a node
1071         * that will be migrating to itself, so no pages need move.
1072         *
1073         * If no bits are left in 'tmp', or if all remaining bits left
1074         * in 'tmp' correspond to the same bit in 'to', return false
1075         * (nothing left to migrate).
1076         *
1077         * This lets us pick a pair of nodes to migrate between, such that
1078         * if possible the dest node is not already occupied by some other
 1079         * source node, minimizing the risk of overloading the memory on a
 1080         * node, which would happen if we migrated incoming memory to a node
 1081         * before migrating outgoing memory sourced from that same node.
1082         *
1083         * A single scan of tmp is sufficient.  As we go, we remember the
1084         * most recent <s, d> pair that moved (s != d).  If we find a pair
1085         * that not only moved, but what's better, moved to an empty slot
1086         * (d is not set in tmp), then we break out then, with that pair.
 1087         * Otherwise when we finish scanning tmp, we at least have the
1088         * most recent <s, d> pair that moved.  If we get all the way through
1089         * the scan of tmp without finding any node that moved, much less
1090         * moved to an empty node, then there is nothing left worth migrating.
1091         */
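        /*
         * Worked trace (illustrative): from = {0,1}, to = {1,2}.  On the
         * first scan s=0 maps to d=1, but node 1 is still a pending source,
         * so we keep looking; s=1 maps to d=2, an empty slot, so 1 -> 2 is
         * migrated first.  On the next pass only node 0 remains and 0 -> 1
         * is migrated into the slot that has just been vacated.  Filling
         * empty slots first is what avoids piling two nodes' worth of
         * memory onto one node part-way through.
         */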
1092
1093        tmp = *from;
1094        while (!nodes_empty(tmp)) {
1095                int s,d;
1096                int source = -1;
1097                int dest = 0;
1098
1099                for_each_node_mask(s, tmp) {
1100
1101                        /*
1102                         * do_migrate_pages() tries to maintain the relative
1103                         * node relationship of the pages established between
1104                         * threads and memory areas.
1105                         *
1106                         * However if the number of source nodes is not equal to
1107                         * the number of destination nodes we can not preserve
1108                         * this node relative relationship.  In that case, skip
1109                         * copying memory from a node that is in the destination
1110                         * mask.
1111                         *
1112                         * Example: [2,3,4] -> [3,4,5] moves everything.
 1113                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1114                         */
1115
1116                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1117                                                (node_isset(s, *to)))
1118                                continue;
1119
1120                        d = node_remap(s, *from, *to);
1121                        if (s == d)
1122                                continue;
1123
1124                        source = s;     /* Node moved. Memorize */
1125                        dest = d;
1126
1127                        /* dest not in remaining from nodes? */
1128                        if (!node_isset(dest, tmp))
1129                                break;
1130                }
1131                if (source == -1)
1132                        break;
1133
1134                node_clear(source, tmp);
1135                err = migrate_to_node(mm, source, dest, flags);
1136                if (err > 0)
1137                        busy += err;
1138                if (err < 0)
1139                        break;
1140        }
1141out:
1142        up_read(&mm->mmap_sem);
1143        if (err < 0)
1144                return err;
1145        return busy;
1146
1147}
1148
1149/*
1150 * Allocate a new page for page migration based on vma policy.
 1151 * Start by assuming that the page is mapped by the vma pointed to by
 1152 * @private, and search forward from there if it is not.  N.B., this
 1153 * assumes that the list of pages handed to migrate_pages()--which is
 1154 * how we get here--is in virtual address order.
1155 */
1156static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1157{
1158        struct vm_area_struct *vma = (struct vm_area_struct *)private;
1159        unsigned long uninitialized_var(address);
1160
1161        while (vma) {
1162                address = page_address_in_vma(page, vma);
1163                if (address != -EFAULT)
1164                        break;
1165                vma = vma->vm_next;
1166        }
1167
1168        /*
1169         * if !vma, alloc_page_vma() will use task or system default policy
1170         */
1171        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1172}
1173#else
1174
1175static void migrate_page_add(struct page *page, struct list_head *pagelist,
1176                                unsigned long flags)
1177{
1178}
1179
1180int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1181                     const nodemask_t *to, int flags)
1182{
1183        return -ENOSYS;
1184}
1185
1186static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1187{
1188        return NULL;
1189}
1190#endif
1191
1192static long do_mbind(unsigned long start, unsigned long len,
1193                     unsigned short mode, unsigned short mode_flags,
1194                     nodemask_t *nmask, unsigned long flags)
1195{
1196        struct vm_area_struct *vma;
1197        struct mm_struct *mm = current->mm;
1198        struct mempolicy *new;
1199        unsigned long end;
1200        int err;
1201        LIST_HEAD(pagelist);
1202
1203        if (flags & ~(unsigned long)MPOL_MF_VALID)
1204                return -EINVAL;
1205        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1206                return -EPERM;
1207
1208        if (start & ~PAGE_MASK)
1209                return -EINVAL;
1210
1211        if (mode == MPOL_DEFAULT)
1212                flags &= ~MPOL_MF_STRICT;
1213
1214        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1215        end = start + len;
1216
1217        if (end < start)
1218                return -EINVAL;
1219        if (end == start)
1220                return 0;
1221
1222        new = mpol_new(mode, mode_flags, nmask);
1223        if (IS_ERR(new))
1224                return PTR_ERR(new);
1225
1226        if (flags & MPOL_MF_LAZY)
1227                new->flags |= MPOL_F_MOF;
1228
1229        /*
 1230         * If we are using the default policy then operating
 1231         * on discontinuous address spaces is okay after all
1232         */
1233        if (!new)
1234                flags |= MPOL_MF_DISCONTIG_OK;
1235
1236        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237                 start, start + len, mode, mode_flags,
1238                 nmask ? nodes_addr(*nmask)[0] : -1);
1239
1240        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241
1242                err = migrate_prep();
1243                if (err)
1244                        goto mpol_out;
1245        }
1246        {
1247                NODEMASK_SCRATCH(scratch);
1248                if (scratch) {
1249                        down_write(&mm->mmap_sem);
1250                        task_lock(current);
1251                        err = mpol_set_nodemask(new, nmask, scratch);
1252                        task_unlock(current);
1253                        if (err)
1254                                up_write(&mm->mmap_sem);
1255                } else
1256                        err = -ENOMEM;
1257                NODEMASK_SCRATCH_FREE(scratch);
1258        }
1259        if (err)
1260                goto mpol_out;
1261
1262        vma = check_range(mm, start, end, nmask,
1263                          flags | MPOL_MF_INVERT, &pagelist);
1264
1265        err = PTR_ERR(vma);     /* maybe ... */
1266        if (!IS_ERR(vma))
1267                err = mbind_range(mm, start, end, new);
1268
1269        if (!err) {
1270                int nr_failed = 0;
1271
1272                if (!list_empty(&pagelist)) {
1273                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274                        nr_failed = migrate_pages(&pagelist, new_vma_page,
1275                                                (unsigned long)vma,
1276                                                false, MIGRATE_SYNC,
1277                                                MR_MEMPOLICY_MBIND);
1278                        if (nr_failed)
1279                                putback_lru_pages(&pagelist);
1280                }
1281
1282                if (nr_failed && (flags & MPOL_MF_STRICT))
1283                        err = -EIO;
1284        } else
1285                putback_lru_pages(&pagelist);
1286
1287        up_write(&mm->mmap_sem);
1288 mpol_out:
1289        mpol_put(new);
1290        return err;
1291}
1292
1293/*
1294 * User space interface with variable sized bitmaps for nodelists.
1295 */
1296
1297/* Copy a node mask from user space. */
1298static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1299                     unsigned long maxnode)
1300{
1301        unsigned long k;
1302        unsigned long nlongs;
1303        unsigned long endmask;
1304
1305        --maxnode;
1306        nodes_clear(*nodes);
1307        if (maxnode == 0 || !nmask)
1308                return 0;
1309        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1310                return -EINVAL;
1311
1312        nlongs = BITS_TO_LONGS(maxnode);
1313        if ((maxnode % BITS_PER_LONG) == 0)
1314                endmask = ~0UL;
1315        else
1316                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1317
1318        /* When the user specified more nodes than supported just check
 1319           if the unsupported part is all zero. */
1320        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1321                if (nlongs > PAGE_SIZE/sizeof(long))
1322                        return -EINVAL;
1323                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1324                        unsigned long t;
1325                        if (get_user(t, nmask + k))
1326                                return -EFAULT;
1327                        if (k == nlongs - 1) {
1328                                if (t & endmask)
1329                                        return -EINVAL;
1330                        } else if (t)
1331                                return -EINVAL;
1332                }
1333                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1334                endmask = ~0UL;
1335        }
1336
1337        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1338                return -EFAULT;
1339        nodes_addr(*nodes)[nlongs-1] &= endmask;
1340        return 0;
1341}
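
/*
 * Worked example (illustrative, 64-bit longs): a caller passing maxnode=17
 * ends up with maxnode=16 after the decrement, so nlongs=1 and
 * endmask=(1UL<<16)-1: one long is copied and only bits 0-15 survive the
 * final "&= endmask".  With maxnode=65 the decrement yields 64, a whole
 * number of longs, so endmask=~0UL and all 64 bits are kept.
 */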
1342
1343/* Copy a kernel node mask to user space */
1344static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1345                              nodemask_t *nodes)
1346{
1347        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1348        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1349
1350        if (copy > nbytes) {
1351                if (copy > PAGE_SIZE)
1352                        return -EINVAL;
1353                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1354                        return -EFAULT;
1355                copy = nbytes;
1356        }
1357        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1358}
1359
1360SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1361                unsigned long, mode, unsigned long __user *, nmask,
1362                unsigned long, maxnode, unsigned, flags)
1363{
1364        nodemask_t nodes;
1365        int err;
1366        unsigned short mode_flags;
1367
1368        mode_flags = mode & MPOL_MODE_FLAGS;
1369        mode &= ~MPOL_MODE_FLAGS;
1370        if (mode >= MPOL_MAX)
1371                return -EINVAL;
1372        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1373            (mode_flags & MPOL_F_RELATIVE_NODES))
1374                return -EINVAL;
1375        err = get_nodes(&nodes, nmask, maxnode);
1376        if (err)
1377                return err;
1378        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1379}
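
/*
 * Rough userspace sketch (an illustration, assuming <numaif.h> or the uapi
 * <linux/mempolicy.h> for the MPOL_* definitions, and an existing mapping
 * addr/len): the optional mode flags are OR'ed into the mode argument by
 * the caller and stripped out again above:
 *
 *	unsigned long rel = 0x3;	// "the first two of my allowed nodes"
 *
 *	mbind(addr, len, MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES,
 *	      &rel, sizeof(rel) * 8, MPOL_MF_MOVE);
 *
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are mutually exclusive,
 * which is exactly the -EINVAL check above.
 */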
1380
1381/* Set the process memory policy */
1382SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1383                unsigned long, maxnode)
1384{
1385        int err;
1386        nodemask_t nodes;
1387        unsigned short flags;
1388
1389        flags = mode & MPOL_MODE_FLAGS;
1390        mode &= ~MPOL_MODE_FLAGS;
1391        if ((unsigned int)mode >= MPOL_MAX)
1392                return -EINVAL;
1393        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1394                return -EINVAL;
1395        err = get_nodes(&nodes, nmask, maxnode);
1396        if (err)
1397                return err;
1398        return do_set_mempolicy(mode, flags, &nodes);
1399}
1400
1401SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1402                const unsigned long __user *, old_nodes,
1403                const unsigned long __user *, new_nodes)
1404{
1405        const struct cred *cred = current_cred(), *tcred;
1406        struct mm_struct *mm = NULL;
1407        struct task_struct *task;
1408        nodemask_t task_nodes;
1409        int err;
1410        nodemask_t *old;
1411        nodemask_t *new;
1412        NODEMASK_SCRATCH(scratch);
1413
1414        if (!scratch)
1415                return -ENOMEM;
1416
1417        old = &scratch->mask1;
1418        new = &scratch->mask2;
1419
1420        err = get_nodes(old, old_nodes, maxnode);
1421        if (err)
1422                goto out;
1423
1424        err = get_nodes(new, new_nodes, maxnode);
1425        if (err)
1426                goto out;
1427
1428        /* Find the mm_struct */
1429        rcu_read_lock();
1430        task = pid ? find_task_by_vpid(pid) : current;
1431        if (!task) {
1432                rcu_read_unlock();
1433                err = -ESRCH;
1434                goto out;
1435        }
1436        get_task_struct(task);
1437
1438        err = -EINVAL;
1439
1440        /*
1441         * Check if this process has the right to modify the specified
1442         * process. The right exists if the process has administrative
1443         * capabilities, superuser privileges or the same
1444         * userid as the target process.
1445         */
1446        tcred = __task_cred(task);
1447        if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1448            !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1449            !capable(CAP_SYS_NICE)) {
1450                rcu_read_unlock();
1451                err = -EPERM;
1452                goto out_put;
1453        }
1454        rcu_read_unlock();
1455
1456        task_nodes = cpuset_mems_allowed(task);
1457        /* Is the user allowed to access the target nodes? */
1458        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1459                err = -EPERM;
1460                goto out_put;
1461        }
1462
1463        if (!nodes_subset(*new, node_states[N_MEMORY])) {
1464                err = -EINVAL;
1465                goto out_put;
1466        }
1467
1468        err = security_task_movememory(task);
1469        if (err)
1470                goto out_put;
1471
1472        mm = get_task_mm(task);
1473        put_task_struct(task);
1474
1475        if (!mm) {
1476                err = -EINVAL;
1477                goto out;
1478        }
1479
1480        err = do_migrate_pages(mm, old, new,
1481                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1482
1483        mmput(mm);
1484out:
1485        NODEMASK_SCRATCH_FREE(scratch);
1486
1487        return err;
1488
1489out_put:
1490        put_task_struct(task);
1491        goto out;
1492
1493}
1494
1495
1496/* Retrieve NUMA policy */
1497SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1498                unsigned long __user *, nmask, unsigned long, maxnode,
1499                unsigned long, addr, unsigned long, flags)
1500{
1501        int err;
1502        int uninitialized_var(pval);
1503        nodemask_t nodes;
1504
1505        if (nmask != NULL && maxnode < MAX_NUMNODES)
1506                return -EINVAL;
1507
1508        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1509
1510        if (err)
1511                return err;
1512
1513        if (policy && put_user(pval, policy))
1514                return -EFAULT;
1515
1516        if (nmask)
1517                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1518
1519        return err;
1520}
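/*
 * Illustrative userspace sketch (not part of this file): querying the policy
 * in force at an address with MPOL_F_ADDR, via libnuma's <numaif.h> wrapper.
 * Because of the maxnode check above, the nodemask buffer must be able to
 * hold at least MAX_NUMNODES bits; 1024 bits is assumed to be enough here,
 * which holds for current CONFIG_NODES_SHIFT limits.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	void show_policy_at(void *addr)
 *	{
 *		int mode;
 *		unsigned long mask[1024 / (8 * sizeof(unsigned long))] = { 0 };
 *
 *		if (get_mempolicy(&mode, mask, 1024, addr, MPOL_F_ADDR) == 0)
 *			printf("mode %d, first mask word %#lx\n", mode, mask[0]);
 *		else
 *			perror("get_mempolicy");
 *	}
 */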
1521
1522#ifdef CONFIG_COMPAT
1523
1524asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1525                                     compat_ulong_t __user *nmask,
1526                                     compat_ulong_t maxnode,
1527                                     compat_ulong_t addr, compat_ulong_t flags)
1528{
1529        long err;
1530        unsigned long __user *nm = NULL;
1531        unsigned long nr_bits, alloc_size;
1532        DECLARE_BITMAP(bm, MAX_NUMNODES);
1533
1534        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1535        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1536
1537        if (nmask)
1538                nm = compat_alloc_user_space(alloc_size);
1539
1540        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1541
1542        if (!err && nmask) {
1543                unsigned long copy_size;
1544                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1545                err = copy_from_user(bm, nm, copy_size);
1546                /* ensure entire bitmap is zeroed */
1547                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1548                err |= compat_put_bitmap(nmask, bm, nr_bits);
1549        }
1550
1551        return err;
1552}
1553
1554asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1555                                     compat_ulong_t maxnode)
1556{
1557        long err = 0;
1558        unsigned long __user *nm = NULL;
1559        unsigned long nr_bits, alloc_size;
1560        DECLARE_BITMAP(bm, MAX_NUMNODES);
1561
1562        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1563        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1564
1565        if (nmask) {
1566                err = compat_get_bitmap(bm, nmask, nr_bits);
1567                nm = compat_alloc_user_space(alloc_size);
1568                err |= copy_to_user(nm, bm, alloc_size);
1569        }
1570
1571        if (err)
1572                return -EFAULT;
1573
1574        return sys_set_mempolicy(mode, nm, nr_bits+1);
1575}
1576
1577asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1578                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1579                             compat_ulong_t maxnode, compat_ulong_t flags)
1580{
1581        long err = 0;
1582        unsigned long __user *nm = NULL;
1583        unsigned long nr_bits, alloc_size;
1584        nodemask_t bm;
1585
1586        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1587        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1588
1589        if (nmask) {
1590                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1591                nm = compat_alloc_user_space(alloc_size);
1592                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1593        }
1594
1595        if (err)
1596                return -EFAULT;
1597
1598        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1599}
1600
1601#endif
1602
1603/*
1604 * get_vma_policy(@task, @vma, @addr)
1605 * @task - task for fallback if vma policy == default
1606 * @vma   - virtual memory area whose policy is sought
1607 * @addr  - address in @vma for shared policy lookup
1608 *
1609 * Returns effective policy for a VMA at specified address.
1610 * Falls back to @task or system default policy, as necessary.
1611 * Current or other task's task mempolicy and non-shared vma policies must be
1612 * protected by task_lock(task) by the caller.
1613 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1614 * count--added by the get_policy() vm_op, as appropriate--to protect against
1615 * freeing by another task.  It is the caller's responsibility to free the
1616 * extra reference for shared policies.
1617 */
1618struct mempolicy *get_vma_policy(struct task_struct *task,
1619                struct vm_area_struct *vma, unsigned long addr)
1620{
1621        struct mempolicy *pol = get_task_policy(task);
1622
1623        if (vma) {
1624                if (vma->vm_ops && vma->vm_ops->get_policy) {
1625                        struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1626                                                                        addr);
1627                        if (vpol)
1628                                pol = vpol;
1629                } else if (vma->vm_policy) {
1630                        pol = vma->vm_policy;
1631
1632                        /*
1633                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1634                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1635                         * count on these policies which will be dropped by
1636                         * mpol_cond_put() later
1637                         */
1638                        if (mpol_needs_cond_ref(pol))
1639                                mpol_get(pol);
1640                }
1641        }
1642        if (!pol)
1643                pol = &default_policy;
1644        return pol;
1645}
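/*
 * Typical caller pattern, sketched for illustration; "vma" and "addr" are
 * whatever the caller is working on, and the locking rules above must
 * already be satisfied:
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 *
 *	... consult pol->mode, policy_nodemask(gfp, pol), etc. ...
 *
 *	mpol_cond_put(pol);
 *
 * mpol_cond_put() drops the extra reference taken for MPOL_F_SHARED
 * policies and is a no-op for the others.
 */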
1646
1647/*
1648 * Return a nodemask representing a mempolicy for filtering nodes for
1649 * page allocation
1650 */
1651static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652{
1653        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1654        if (unlikely(policy->mode == MPOL_BIND) &&
1655                        gfp_zone(gfp) >= policy_zone &&
1656                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657                return &policy->v.nodes;
1658
1659        return NULL;
1660}
1661
1662/* Return a zonelist indicated by gfp for node representing a mempolicy */
1663static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1664        int nd)
1665{
1666        switch (policy->mode) {
1667        case MPOL_PREFERRED:
1668                if (!(policy->flags & MPOL_F_LOCAL))
1669                        nd = policy->v.preferred_node;
1670                break;
1671        case MPOL_BIND:
1672                /*
1673                 * Normally, MPOL_BIND allocations are node-local within the
1674                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1675                 * current node isn't part of the mask, we use the zonelist for
1676                 * the first node in the mask instead.
1677                 */
1678                if (unlikely(gfp & __GFP_THISNODE) &&
1679                                unlikely(!node_isset(nd, policy->v.nodes)))
1680                        nd = first_node(policy->v.nodes);
1681                break;
1682        default:
1683                BUG();
1684        }
1685        return node_zonelist(nd, gfp);
1686}
1687
1688/* Do dynamic interleaving for a process */
1689static unsigned interleave_nodes(struct mempolicy *policy)
1690{
1691        unsigned nid, next;
1692        struct task_struct *me = current;
1693
1694        nid = me->il_next;
1695        next = next_node(nid, policy->v.nodes);
1696        if (next >= MAX_NUMNODES)
1697                next = first_node(policy->v.nodes);
1698        if (next < MAX_NUMNODES)
1699                me->il_next = next;
1700        return nid;
1701}
1702
1703/*
1704 * Depending on the memory policy provide a node from which to allocate the
1705 * next slab entry.
1706 * @policy must be protected from freeing by the caller.  If @policy is
1707 * the current task's mempolicy, this protection is implicit, as only the
1708 * task can change its own policy.  The system default policy requires no
1709 * such protection.
1710 */
1711unsigned slab_node(void)
1712{
1713        struct mempolicy *policy;
1714
1715        if (in_interrupt())
1716                return numa_node_id();
1717
1718        policy = current->mempolicy;
1719        if (!policy || policy->flags & MPOL_F_LOCAL)
1720                return numa_node_id();
1721
1722        switch (policy->mode) {
1723        case MPOL_PREFERRED:
1724                /*
1725                 * handled MPOL_F_LOCAL above
1726                 */
1727                return policy->v.preferred_node;
1728
1729        case MPOL_INTERLEAVE:
1730                return interleave_nodes(policy);
1731
1732        case MPOL_BIND: {
1733                /*
1734                 * Follow bind policy behavior and start allocation at the
1735                 * first node.
1736                 */
1737                struct zonelist *zonelist;
1738                struct zone *zone;
1739                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1740                zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1741                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1742                                                        &policy->v.nodes,
1743                                                        &zone);
1744                return zone ? zone->node : numa_node_id();
1745        }
1746
1747        default:
1748                BUG();
1749        }
1750}
1751
1752/* Do static interleaving for a VMA with known offset. */
1753static unsigned offset_il_node(struct mempolicy *pol,
1754                struct vm_area_struct *vma, unsigned long off)
1755{
1756        unsigned nnodes = nodes_weight(pol->v.nodes);
1757        unsigned target;
1758        int c;
1759        int nid = -1;
1760
1761        if (!nnodes)
1762                return numa_node_id();
1763        target = (unsigned int)off % nnodes;
1764        c = 0;
1765        do {
1766                nid = next_node(nid, pol->v.nodes);
1767                c++;
1768        } while (c <= target);
1769        return nid;
1770}
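/*
 * Worked example (illustrative numbers): with pol->v.nodes = {0,2,3} and
 * off = 7, nnodes = 3 and target = 7 % 3 = 1, so the walk above stops on the
 * second node of the mask and returns node 2.  Consecutive offsets thus
 * rotate deterministically through the allowed nodes.
 */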
1771
1772/* Determine a node number for interleave */
1773static inline unsigned interleave_nid(struct mempolicy *pol,
1774                 struct vm_area_struct *vma, unsigned long addr, int shift)
1775{
1776        if (vma) {
1777                unsigned long off;
1778
1779                /*
1780                 * for small pages, there is no difference between
1781                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1782                 * for huge pages, since vm_pgoff is in units of small
1783                 * pages, we need to shift off the always 0 bits to get
1784                 * a useful offset.
1785                 */
1786                BUG_ON(shift < PAGE_SHIFT);
1787                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1788                off += (addr - vma->vm_start) >> shift;
1789                return offset_il_node(pol, vma, off);
1790        } else
1791                return interleave_nodes(pol);
1792}
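/*
 * Worked example for the huge page case (numbers illustrative): with 2MB
 * huge pages, shift = 21 and PAGE_SHIFT = 12, so vm_pgoff (kept in 4kB
 * units) is shifted right by 9 to count huge pages, and
 * (addr - vma->vm_start) >> 21 adds the huge page index within the mapping
 * before the combined offset is handed to offset_il_node().
 */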
1793
1794/*
1795 * Return the bit number of a random bit set in the nodemask.
1796 * (returns -1 if nodemask is empty)
1797 */
1798int node_random(const nodemask_t *maskp)
1799{
1800        int w, bit = -1;
1801
1802        w = nodes_weight(*maskp);
1803        if (w)
1804                bit = bitmap_ord_to_pos(maskp->bits,
1805                        get_random_int() % w, MAX_NUMNODES);
1806        return bit;
1807}
1808
1809#ifdef CONFIG_HUGETLBFS
1810/*
1811 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1812 * @vma = virtual memory area whose policy is sought
1813 * @addr = address in @vma for shared policy lookup and interleave policy
1814 * @gfp_flags = for requested zone
1815 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1816 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1817 *
1818 * Returns a zonelist suitable for a huge page allocation and a pointer
1819 * to the struct mempolicy for conditional unref after allocation.
1820 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1821 * @nodemask for filtering the zonelist.
1822 *
1823 * Must be protected by get_mems_allowed()
1824 */
1825struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1826                                gfp_t gfp_flags, struct mempolicy **mpol,
1827                                nodemask_t **nodemask)
1828{
1829        struct zonelist *zl;
1830
1831        *mpol = get_vma_policy(current, vma, addr);
1832        *nodemask = NULL;       /* assume !MPOL_BIND */
1833
1834        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1835                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1836                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1837        } else {
1838                zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1839                if ((*mpol)->mode == MPOL_BIND)
1840                        *nodemask = &(*mpol)->v.nodes;
1841        }
1842        return zl;
1843}
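/*
 * Sketch of the expected caller pattern (hugetlb fault path; names and gfp
 * flags abridged for illustration):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, htlb_alloc_mask, &mpol, &nodemask);
 *	... walk zl, filtering on nodemask when it is non-NULL ...
 *	mpol_cond_put(mpol);
 */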
1844
1845/*
1846 * init_nodemask_of_mempolicy
1847 *
1848 * If the current task's mempolicy is "default" [NULL], return 'false'
1849 * to indicate default policy.  Otherwise, extract the policy nodemask
1850 * for 'bind' or 'interleave' policy into the argument nodemask, or
1851 * initialize the argument nodemask to contain the single node for
1852 * 'preferred' or 'local' policy and return 'true' to indicate presence
1853 * of non-default mempolicy.
1854 *
1855 * We don't bother with reference counting the mempolicy [mpol_get/put]
1856 * because the current task is examining its own mempolicy and a task's
1857 * mempolicy is only ever changed by the task itself.
1858 *
1859 * N.B., it is the caller's responsibility to free a returned nodemask.
1860 */
1861bool init_nodemask_of_mempolicy(nodemask_t *mask)
1862{
1863        struct mempolicy *mempolicy;
1864        int nid;
1865
1866        if (!(mask && current->mempolicy))
1867                return false;
1868
1869        task_lock(current);
1870        mempolicy = current->mempolicy;
1871        switch (mempolicy->mode) {
1872        case MPOL_PREFERRED:
1873                if (mempolicy->flags & MPOL_F_LOCAL)
1874                        nid = numa_node_id();
1875                else
1876                        nid = mempolicy->v.preferred_node;
1877                init_nodemask_of_node(mask, nid);
1878                break;
1879
1880        case MPOL_BIND:
1881                /* Fall through */
1882        case MPOL_INTERLEAVE:
1883                *mask =  mempolicy->v.nodes;
1884                break;
1885
1886        default:
1887                BUG();
1888        }
1889        task_unlock(current);
1890
1891        return true;
1892}
1893#endif
1894
1895/*
1896 * mempolicy_nodemask_intersects
1897 *
1898 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1899 * policy.  Otherwise, check for intersection between mask and the policy
1900 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1901 * policy, always return true since it may allocate elsewhere on fallback.
1902 *
1903 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1904 */
1905bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1906                                        const nodemask_t *mask)
1907{
1908        struct mempolicy *mempolicy;
1909        bool ret = true;
1910
1911        if (!mask)
1912                return ret;
1913        task_lock(tsk);
1914        mempolicy = tsk->mempolicy;
1915        if (!mempolicy)
1916                goto out;
1917
1918        switch (mempolicy->mode) {
1919        case MPOL_PREFERRED:
1920                /*
1921                 * MPOL_PREFERRED and MPOL_F_LOCAL only express a preference for the
1922                 * nodes to allocate from; the task may fall back to other nodes when oom.
1923                 * Thus, it's possible for tsk to have allocated memory from
1924                 * nodes in mask.
1925                 */
1926                break;
1927        case MPOL_BIND:
1928        case MPOL_INTERLEAVE:
1929                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1930                break;
1931        default:
1932                BUG();
1933        }
1934out:
1935        task_unlock(tsk);
1936        return ret;
1937}
1938
1939/* Allocate a page in interleaved policy.
1940   Own path because it needs to do special accounting. */
1941static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1942                                        unsigned nid)
1943{
1944        struct zonelist *zl;
1945        struct page *page;
1946
1947        zl = node_zonelist(nid, gfp);
1948        page = __alloc_pages(gfp, order, zl);
1949        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1950                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1951        return page;
1952}
1953
1954/**
1955 *      alloc_pages_vma - Allocate a page for a VMA.
1956 *
1957 *      @gfp:
1958 *      %GFP_USER    user allocation.
1959 *      %GFP_KERNEL  kernel allocations,
1960 *      %GFP_HIGHMEM highmem/user allocations,
1961 *      %GFP_FS      allocation should not call back into a file system.
1962 *      %GFP_ATOMIC  don't sleep.
1963 *
1964 *      @order: Order of the GFP allocation.
1965 *      @vma:  Pointer to VMA or NULL if not available.
1966 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1967 *
1968 *      This function allocates a page from the kernel page pool and applies
1969 *      a NUMA policy associated with the VMA or the current process.
1970 *      When VMA is not NULL the caller must hold down_read on the mmap_sem of the
1971 *      mm_struct of the VMA to prevent it from going away. Should be used for
1972 *      all allocations for pages that will be mapped into
1973 *      user space. Returns NULL when no page can be allocated.
1974 *
1975 *      Should be called with the mmap_sem of the vma held.
1976 */
1977struct page *
1978alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1979                unsigned long addr, int node)
1980{
1981        struct mempolicy *pol;
1982        struct page *page;
1983        unsigned int cpuset_mems_cookie;
1984
1985retry_cpuset:
1986        pol = get_vma_policy(current, vma, addr);
1987        cpuset_mems_cookie = get_mems_allowed();
1988
1989        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1990                unsigned nid;
1991
1992                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1993                mpol_cond_put(pol);
1994                page = alloc_page_interleave(gfp, order, nid);
1995                if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1996                        goto retry_cpuset;
1997
1998                return page;
1999        }
2000        page = __alloc_pages_nodemask(gfp, order,
2001                                      policy_zonelist(gfp, pol, node),
2002                                      policy_nodemask(gfp, pol));
2003        if (unlikely(mpol_needs_cond_ref(pol)))
2004                __mpol_put(pol);
2005        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2006                goto retry_cpuset;
2007        return page;
2008}
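/*
 * Typical call site, sketched: anonymous-memory fault paths go through the
 * alloc_page_vma() wrapper (order 0, local node) with mmap_sem already held
 * for read, e.g.:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		goto oom;
 */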
2009
2010/**
2011 *      alloc_pages_current - Allocate pages.
2012 *
2013 *      @gfp:
2014 *              %GFP_USER   user allocation,
2015 *              %GFP_KERNEL kernel allocation,
2016 *              %GFP_HIGHMEM highmem allocation,
2017 *              %GFP_FS     don't call back into a file system.
2018 *              %GFP_ATOMIC don't sleep.
2019 *      @order: Power of two of allocation size in pages. 0 is a single page.
2020 *
2021 *      Allocate a page from the kernel page pool.  When not in
2022 *      interrupt context, apply the current process' NUMA policy.
2023 *      Returns NULL when no page can be allocated.
2024 *
2025 *      Don't call cpuset_update_task_memory_state() unless
2026 *      1) it's ok to take cpuset_sem (can WAIT), and
2027 *      2) allocating for current task (not interrupt).
2028 */
2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2030{
2031        struct mempolicy *pol = get_task_policy(current);
2032        struct page *page;
2033        unsigned int cpuset_mems_cookie;
2034
2035        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2036                pol = &default_policy;
2037
2038retry_cpuset:
2039        cpuset_mems_cookie = get_mems_allowed();
2040
2041        /*
2042         * No reference counting needed for current->mempolicy
2043         * nor system default_policy
2044         */
2045        if (pol->mode == MPOL_INTERLEAVE)
2046                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2047        else
2048                page = __alloc_pages_nodemask(gfp, order,
2049                                policy_zonelist(gfp, pol, numa_node_id()),
2050                                policy_nodemask(gfp, pol));
2051
2052        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2053                goto retry_cpuset;
2054
2055        return page;
2056}
2057EXPORT_SYMBOL(alloc_pages_current);
2058
2059/*
2060 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2061 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2062 * with the mems_allowed returned by cpuset_mems_allowed().  This
2063 * keeps mempolicies cpuset relative after the task's cpuset moves.  See
2064 * further kernel/cpuset.c update_nodemask().
2065 *
2066 * current's mempolicy may be rebound by another task (the task that changes
2067 * the cpuset's mems), so we needn't do the rebind work for the current task.
2068 */
2069
2070/* Slow path of a mempolicy duplicate */
2071struct mempolicy *__mpol_dup(struct mempolicy *old)
2072{
2073        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2074
2075        if (!new)
2076                return ERR_PTR(-ENOMEM);
2077
2078        /* task's mempolicy is protected by alloc_lock */
2079        if (old == current->mempolicy) {
2080                task_lock(current);
2081                *new = *old;
2082                task_unlock(current);
2083        } else
2084                *new = *old;
2085
2086        rcu_read_lock();
2087        if (current_cpuset_is_being_rebound()) {
2088                nodemask_t mems = cpuset_mems_allowed(current);
2089                if (new->flags & MPOL_F_REBINDING)
2090                        mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2091                else
2092                        mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2093        }
2094        rcu_read_unlock();
2095        atomic_set(&new->refcnt, 1);
2096        return new;
2097}
2098
2099/* Slow path of a mempolicy comparison */
2100bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2101{
2102        if (!a || !b)
2103                return false;
2104        if (a->mode != b->mode)
2105                return false;
2106        if (a->flags != b->flags)
2107                return false;
2108        if (mpol_store_user_nodemask(a))
2109                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2110                        return false;
2111
2112        switch (a->mode) {
2113        case MPOL_BIND:
2114                /* Fall through */
2115        case MPOL_INTERLEAVE:
2116                return !!nodes_equal(a->v.nodes, b->v.nodes);
2117        case MPOL_PREFERRED:
2118                return a->v.preferred_node == b->v.preferred_node;
2119        default:
2120                BUG();
2121                return false;
2122        }
2123}
2124
2125/*
2126 * Shared memory backing store policy support.
2127 *
2128 * Remember policies even when nobody has shared memory mapped.
2129 * The policies are kept in Red-Black tree linked from the inode.
2130 * They are protected by the sp->lock spinlock, which should be held
2131 * for any accesses to the tree.
2132 */
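/*
 * For reference, the structures involved (declared in
 * include/linux/mempolicy.h) look roughly like this:
 *
 *	struct sp_node {
 *		struct rb_node		nd;
 *		unsigned long		start, end;
 *		struct mempolicy	*policy;
 *	};
 *
 *	struct shared_policy {
 *		struct rb_root		root;
 *		spinlock_t		lock;
 *	};
 *
 * Each sp_node covers the page-offset range [start, end) of the backing
 * object and carries its own reference-counted mempolicy.
 */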
2133
2134/* lookup first element intersecting start-end */
2135/* Caller holds sp->lock */
2136static struct sp_node *
2137sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2138{
2139        struct rb_node *n = sp->root.rb_node;
2140
2141        while (n) {
2142                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2143
2144                if (start >= p->end)
2145                        n = n->rb_right;
2146                else if (end <= p->start)
2147                        n = n->rb_left;
2148                else
2149                        break;
2150        }
2151        if (!n)
2152                return NULL;
2153        for (;;) {
2154                struct sp_node *w = NULL;
2155                struct rb_node *prev = rb_prev(n);
2156                if (!prev)
2157                        break;
2158                w = rb_entry(prev, struct sp_node, nd);
2159                if (w->end <= start)
2160                        break;
2161                n = prev;
2162        }
2163        return rb_entry(n, struct sp_node, nd);
2164}
2165
2166/* Insert a new shared policy into the list. */
2167/* Caller holds sp->lock */
2168static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2169{
2170        struct rb_node **p = &sp->root.rb_node;
2171        struct rb_node *parent = NULL;
2172        struct sp_node *nd;
2173
2174        while (*p) {
2175                parent = *p;
2176                nd = rb_entry(parent, struct sp_node, nd);
2177                if (new->start < nd->start)
2178                        p = &(*p)->rb_left;
2179                else if (new->end > nd->end)
2180                        p = &(*p)->rb_right;
2181                else
2182                        BUG();
2183        }
2184        rb_link_node(&new->nd, parent, p);
2185        rb_insert_color(&new->nd, &sp->root);
2186        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2187                 new->policy ? new->policy->mode : 0);
2188}
2189
2190/* Find shared policy intersecting idx */
2191struct mempolicy *
2192mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2193{
2194        struct mempolicy *pol = NULL;
2195        struct sp_node *sn;
2196
2197        if (!sp->root.rb_node)
2198                return NULL;
2199        spin_lock(&sp->lock);
2200        sn = sp_lookup(sp, idx, idx+1);
2201        if (sn) {
2202                mpol_get(sn->policy);
2203                pol = sn->policy;
2204        }
2205        spin_unlock(&sp->lock);
2206        return pol;
2207}
2208
2209static void sp_free(struct sp_node *n)
2210{
2211        mpol_put(n->policy);
2212        kmem_cache_free(sn_cache, n);
2213}
2214
2215/**
2216 * mpol_misplaced - check whether current page node is valid in policy
2217 *
2218 * @page   - page to be checked
2219 * @vma    - vm area where page mapped
2220 * @addr   - virtual address where page mapped
2221 *
2222 * Look up the current policy node id for vma,addr and compare it to the
2223 * page's node id.
2224 *
2225 * Returns:
2226 *      -1      - not misplaced, page is in the right node
2227 *      node    - node id where the page should be
2228 *
2229 * Policy determination "mimics" alloc_page_vma().
2230 * Called from fault path where we know the vma and faulting address.
2231 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234        struct mempolicy *pol;
2235        struct zone *zone;
2236        int curnid = page_to_nid(page);
2237        unsigned long pgoff;
2238        int polnid = -1;
2239        int ret = -1;
2240
2241        BUG_ON(!vma);
2242
2243        pol = get_vma_policy(current, vma, addr);
2244        if (!(pol->flags & MPOL_F_MOF))
2245                goto out;
2246
2247        switch (pol->mode) {
2248        case MPOL_INTERLEAVE:
2249                BUG_ON(addr >= vma->vm_end);
2250                BUG_ON(addr < vma->vm_start);
2251
2252                pgoff = vma->vm_pgoff;
2253                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254                polnid = offset_il_node(pol, vma, pgoff);
2255                break;
2256
2257        case MPOL_PREFERRED:
2258                if (pol->flags & MPOL_F_LOCAL)
2259                        polnid = numa_node_id();
2260                else
2261                        polnid = pol->v.preferred_node;
2262                break;
2263
2264        case MPOL_BIND:
2265                /*
2266                 * MPOL_BIND allows binding to multiple nodes.
2267                 * Use the current page's node if it is in the policy nodemask,
2268                 * else select the nearest allowed node, if any.
2269                 * If there are no allowed nodes, use the current node [!misplaced].
2270                 */
2271                if (node_isset(curnid, pol->v.nodes))
2272                        goto out;
2273                (void)first_zones_zonelist(
2274                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275                                gfp_zone(GFP_HIGHUSER),
2276                                &pol->v.nodes, &zone);
2277                polnid = zone->node;
2278                break;
2279
2280        default:
2281                BUG();
2282        }
2283
2284        /* Migrate the page towards the node whose CPU is referencing it */
2285        if (pol->flags & MPOL_F_MORON) {
2286                int last_nid;
2287
2288                polnid = numa_node_id();
2289
2290                /*
2291                 * Multi-stage node selection is used in conjunction
2292                 * with a periodic migration fault to build a temporal
2293                 * task<->page relation. By using a two-stage filter we
2294                 * remove short/unlikely relations.
2295                 *
2296                 * Using P(p) ~ n_p / n_t as per frequentist
2297                 * probability, we can equate a task's usage of a
2298                 * particular page (n_p) per total usage of this
2299                 * page (n_t) (in a given time-span) to a probability.
2300                 *
2301                 * Our periodic faults will sample this probability and
2302                 * getting the same result twice in a row, given these
2303                 * samples are fully independent, is then given by
2304                 * P(n)^2, provided our sample period is sufficiently
2305                 * short compared to the usage pattern.
2306                 *
2307                 * This quadratic squishes small probabilities, making
2308                 * it less likely we act on an unlikely task<->page
2309                 * relation.
2310                 */
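                /*
                 * Worked example (illustrative numbers): if this node
                 * accounts for only 10% of the faults on a given page,
                 * the chance of sampling it twice in a row is roughly
                 * 0.1 * 0.1 = 0.01, so such weak task<->page relations
                 * rarely trigger a migration.
                 */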
2311                last_nid = page_xchg_last_nid(page, polnid);
2312                if (last_nid != polnid)
2313                        goto out;
2314        }
2315
2316        if (curnid != polnid)
2317                ret = polnid;
2318out:
2319        mpol_cond_put(pol);
2320
2321        return ret;
2322}
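/*
 * Sketch of the intended caller (the NUMA hinting fault path; names
 * abridged for illustration):
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid != -1)
 *		... try to migrate the page to target_nid ...
 *	else
 *		... the page is already on an acceptable node ...
 */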
2323
2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2325{
2326        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2327        rb_erase(&n->nd, &sp->root);
2328        sp_free(n);
2329}
2330
2331static void sp_node_init(struct sp_node *node, unsigned long start,
2332                        unsigned long end, struct mempolicy *pol)
2333{
2334        node->start = start;
2335        node->end = end;
2336        node->policy = pol;
2337}
2338
2339static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2340                                struct mempolicy *pol)
2341{
2342        struct sp_node *n;
2343        struct mempolicy *newpol;
2344
2345        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2346        if (!n)
2347                return NULL;
2348
2349        newpol = mpol_dup(pol);
2350        if (IS_ERR(newpol)) {
2351                kmem_cache_free(sn_cache, n);
2352                return NULL;
2353        }
2354        newpol->flags |= MPOL_F_SHARED;
2355        sp_node_init(n, start, end, newpol);
2356
2357        return n;
2358}
2359
2360/* Replace a policy range. */
2361static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2362                                 unsigned long end, struct sp_node *new)
2363{
2364        struct sp_node *n;
2365        struct sp_node *n_new = NULL;
2366        struct mempolicy *mpol_new = NULL;
2367        int ret = 0;
2368
2369restart:
2370        spin_lock(&sp->lock);
2371        n = sp_lookup(sp, start, end);
2372        /* Take care of old policies in the same range. */
2373        while (n && n->start < end) {
2374                struct rb_node *next = rb_next(&n->nd);
2375                if (n->start >= start) {
2376                        if (n->end <= end)
2377                                sp_delete(sp, n);
2378                        else
2379                                n->start = end;
2380                } else {
2381                        /* Old policy spanning whole new range. */
2382                        if (n->end > end) {
2383                                if (!n_new)
2384                                        goto alloc_new;
2385
2386                                *mpol_new = *n->policy;
2387                                atomic_set(&mpol_new->refcnt, 1);
2388                                sp_node_init(n_new, n->end, end, mpol_new);
2389                                sp_insert(sp, n_new);
2390                                n->end = start;
2391                                n_new = NULL;
2392                                mpol_new = NULL;
2393                                break;
2394                        } else
2395                                n->end = start;
2396                }
2397                if (!next)
2398                        break;
2399                n = rb_entry(next, struct sp_node, nd);
2400        }
2401        if (new)
2402                sp_insert(sp, new);
2403        spin_unlock(&sp->lock);
2404        ret = 0;
2405
2406err_out:
2407        if (mpol_new)
2408                mpol_put(mpol_new);
2409        if (n_new)
2410                kmem_cache_free(sn_cache, n_new);
2411
2412        return ret;
2413
2414alloc_new:
2415        spin_unlock(&sp->lock);
2416        ret = -ENOMEM;
2417        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2418        if (!n_new)
2419                goto err_out;
2420        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2421        if (!mpol_new)
2422                goto err_out;
2423        goto restart;
2424}
2425
2426/**
2427 * mpol_shared_policy_init - initialize shared policy for inode
2428 * @sp: pointer to inode shared policy
2429 * @mpol:  struct mempolicy to install
2430 *
2431 * Install non-NULL @mpol in inode's shared policy rb-tree.
2432 * On entry, the current task has a reference on a non-NULL @mpol.
2433 * This must be released on exit.
2434 * This is called at get_inode() time, so we can use GFP_KERNEL.
2435 */
2436void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2437{
2438        int ret;
2439
2440        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2441        spin_lock_init(&sp->lock);
2442
2443        if (mpol) {
2444                struct vm_area_struct pvma;
2445                struct mempolicy *new;
2446                NODEMASK_SCRATCH(scratch);
2447
2448                if (!scratch)
2449                        goto put_mpol;
2450                /* contextualize the tmpfs mount point mempolicy */
2451                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2452                if (IS_ERR(new))
2453                        goto free_scratch; /* no valid nodemask intersection */
2454
2455                task_lock(current);
2456                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2457                task_unlock(current);
2458                if (ret)
2459                        goto put_new;
2460
2461                /* Create pseudo-vma that contains just the policy */
2462                memset(&pvma, 0, sizeof(struct vm_area_struct));
2463                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2464                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2465
2466put_new:
2467                mpol_put(new);                  /* drop initial ref */
2468free_scratch:
2469                NODEMASK_SCRATCH_FREE(scratch);
2470put_mpol:
2471                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2472        }
2473}
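/*
 * Typical caller, sketched: tmpfs installs the superblock's mount-time
 * mempolicy on each new inode, roughly as in shmem_get_inode():
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where shmem_get_sbmpol() hands back a referenced copy of the sb mpol (or
 * NULL), satisfying the "reference on a non-NULL @mpol" rule above.
 */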
2474
2475int mpol_set_shared_policy(struct shared_policy *info,
2476                        struct vm_area_struct *vma, struct mempolicy *npol)
2477{
2478        int err;
2479        struct sp_node *new = NULL;
2480        unsigned long sz = vma_pages(vma);
2481
2482        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2483                 vma->vm_pgoff,
2484                 sz, npol ? npol->mode : -1,
2485                 npol ? npol->flags : -1,
2486                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2487
2488        if (npol) {
2489                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2490                if (!new)
2491                        return -ENOMEM;
2492        }
2493        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2494        if (err && new)
2495                sp_free(new);
2496        return err;
2497}
2498
2499/* Free a backing policy store on inode delete. */
2500void mpol_free_shared_policy(struct shared_policy *p)
2501{
2502        struct sp_node *n;
2503        struct rb_node *next;
2504
2505        if (!p->root.rb_node)
2506                return;
2507        spin_lock(&p->lock);
2508        next = rb_first(&p->root);
2509        while (next) {
2510                n = rb_entry(next, struct sp_node, nd);
2511                next = rb_next(&n->nd);
2512                sp_delete(p, n);
2513        }
2514        spin_unlock(&p->lock);
2515}
2516
2517#ifdef CONFIG_NUMA_BALANCING
2518static bool __initdata numabalancing_override;
2519
2520static void __init check_numabalancing_enable(void)
2521{
2522        bool numabalancing_default = false;
2523
2524        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2525                numabalancing_default = true;
2526
2527        if (nr_node_ids > 1 && !numabalancing_override) {
2528                printk(KERN_INFO "Enabling automatic NUMA balancing. "
2529                        "Configure with numa_balancing= or sysctl\n");
2530                set_numabalancing_state(numabalancing_default);
2531        }
2532}
2533
2534static int __init setup_numabalancing(char *str)
2535{
2536        int ret = 0;
2537        if (!str)
2538                goto out;
2539        numabalancing_override = true;
2540
2541        if (!strcmp(str, "enable")) {
2542                set_numabalancing_state(true);
2543                ret = 1;
2544        } else if (!strcmp(str, "disable")) {
2545                set_numabalancing_state(false);
2546                ret = 1;
2547        }
2548out:
2549        if (!ret)
2550                printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2551
2552        return ret;
2553}
2554__setup("numa_balancing=", setup_numabalancing);
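/*
 * Example (illustrative): automatic NUMA balancing can be forced on or off
 * at boot by putting
 *
 *	numa_balancing=enable
 * or
 *	numa_balancing=disable
 *
 * on the kernel command line; any other value only triggers the warning in
 * setup_numabalancing().
 */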
2555#else
2556static inline void __init check_numabalancing_enable(void)
2557{
2558}
2559#endif /* CONFIG_NUMA_BALANCING */
2560
2561/* assumes fs == KERNEL_DS */
2562void __init numa_policy_init(void)
2563{
2564        nodemask_t interleave_nodes;
2565        unsigned long largest = 0;
2566        int nid, prefer = 0;
2567
2568        policy_cache = kmem_cache_create("numa_policy",
2569                                         sizeof(struct mempolicy),
2570                                         0, SLAB_PANIC, NULL);
2571
2572        sn_cache = kmem_cache_create("shared_policy_node",
2573                                     sizeof(struct sp_node),
2574                                     0, SLAB_PANIC, NULL);
2575
2576        for_each_node(nid) {
2577                preferred_node_policy[nid] = (struct mempolicy) {
2578                        .refcnt = ATOMIC_INIT(1),
2579                        .mode = MPOL_PREFERRED,
2580                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2581                        .v = { .preferred_node = nid, },
2582                };
2583        }
2584
2585        /*
2586         * Set interleaving policy for system init. Interleaving is only
2587         * enabled across suitably sized nodes (default is >= 16MB); if they
2588         * are all smaller, fall back to the largest node.
2589         */
2590        nodes_clear(interleave_nodes);
2591        for_each_node_state(nid, N_MEMORY) {
2592                unsigned long total_pages = node_present_pages(nid);
2593
2594                /* Preserve the largest node */
2595                if (largest < total_pages) {
2596                        largest = total_pages;
2597                        prefer = nid;
2598                }
2599
2600                /* Interleave this node? */
2601                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2602                        node_set(nid, interleave_nodes);
2603        }
2604
2605        /* All too small, use the largest */
2606        if (unlikely(nodes_empty(interleave_nodes)))
2607                node_set(prefer, interleave_nodes);
2608
2609        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2610                printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2611
2612        check_numabalancing_enable();
2613}
2614
2615/* Reset policy of current process to default */
2616void numa_default_policy(void)
2617{
2618        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2619}
2620
2621/*
2622 * Parse and format mempolicy from/to strings
2623 */
2624
2625/*
2626 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2627 */
2628static const char * const policy_modes[] =
2629{
2630        [MPOL_DEFAULT]    = "default",
2631        [MPOL_PREFERRED]  = "prefer",
2632        [MPOL_BIND]       = "bind",
2633        [MPOL_INTERLEAVE] = "interleave",
2634        [MPOL_LOCAL]      = "local",
2635};
2636
2637
2638#ifdef CONFIG_TMPFS
2639/**
2640 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2641 * @str:  string containing mempolicy to parse
2642 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2643 *
2644 * Format of input:
2645 *      <mode>[=<flags>][:<nodelist>]
2646 *
2647 * On success, returns 0, else 1
2648 */
2649int mpol_parse_str(char *str, struct mempolicy **mpol)
2650{
2651        struct mempolicy *new = NULL;
2652        unsigned short mode;
2653        unsigned short mode_flags;
2654        nodemask_t nodes;
2655        char *nodelist = strchr(str, ':');
2656        char *flags = strchr(str, '=');
2657        int err = 1;
2658
2659        if (nodelist) {
2660                /* NUL-terminate mode or flags string */
2661                *nodelist++ = '\0';
2662                if (nodelist_parse(nodelist, nodes))
2663                        goto out;
2664                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2665                        goto out;
2666        } else
2667                nodes_clear(nodes);
2668
2669        if (flags)
2670                *flags++ = '\0';        /* terminate mode string */
2671
2672        for (mode = 0; mode < MPOL_MAX; mode++) {
2673                if (!strcmp(str, policy_modes[mode])) {
2674                        break;
2675                }
2676        }
2677        if (mode >= MPOL_MAX)
2678                goto out;
2679
2680        switch (mode) {
2681        case MPOL_PREFERRED:
2682                /*
2683                 * Insist on a nodelist of one node only
2684                 */
2685                if (nodelist) {
2686                        char *rest = nodelist;
2687                        while (isdigit(*rest))
2688                                rest++;
2689                        if (*rest)
2690                                goto out;
2691                }
2692                break;
2693        case MPOL_INTERLEAVE:
2694                /*
2695                 * Default to online nodes with memory if no nodelist
2696                 */
2697                if (!nodelist)
2698                        nodes = node_states[N_MEMORY];
2699                break;
2700        case MPOL_LOCAL:
2701                /*
2702                 * Don't allow a nodelist;  mpol_new() checks flags
2703                 */
2704                if (nodelist)
2705                        goto out;
2706                mode = MPOL_PREFERRED;
2707                break;
2708        case MPOL_DEFAULT:
2709                /*
2710                 * Insist on an empty nodelist
2711                 */
2712                if (!nodelist)
2713                        err = 0;
2714                goto out;
2715        case MPOL_BIND:
2716                /*
2717                 * Insist on a nodelist
2718                 */
2719                if (!nodelist)
2720                        goto out;
2721        }
2722
2723        mode_flags = 0;
2724        if (flags) {
2725                /*
2726                 * Currently, we only support two mutually exclusive
2727                 * mode flags.
2728                 */
2729                if (!strcmp(flags, "static"))
2730                        mode_flags |= MPOL_F_STATIC_NODES;
2731                else if (!strcmp(flags, "relative"))
2732                        mode_flags |= MPOL_F_RELATIVE_NODES;
2733                else
2734                        goto out;
2735        }
2736
2737        new = mpol_new(mode, mode_flags, &nodes);
2738        if (IS_ERR(new))
2739                goto out;
2740
2741        /*
2742         * Save nodes for mpol_to_str() to show the tmpfs mount options
2743         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2744         */
2745        if (mode != MPOL_PREFERRED)
2746                new->v.nodes = nodes;
2747        else if (nodelist)
2748                new->v.preferred_node = first_node(nodes);
2749        else
2750                new->flags |= MPOL_F_LOCAL;
2751
2752        /*
2753         * Save nodes for contextualization: this will be used to "clone"
2754         * the mempolicy in a specific context [cpuset] at a later time.
2755         */
2756        new->w.user_nodemask = nodes;
2757
2758        err = 0;
2759
2760out:
2761        /* Restore string for error message */
2762        if (nodelist)
2763                *--nodelist = ':';
2764        if (flags)
2765                *--flags = '=';
2766        if (!err)
2767                *mpol = new;
2768        return err;
2769}
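/*
 * Examples of mount option strings this parser accepts (node numbers are
 * illustrative); only the part after "mpol=" reaches mpol_parse_str():
 *
 *	mpol=interleave:0-3
 *	mpol=prefer:1
 *	mpol=bind=static:0,2
 *	mpol=local
 */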
2770#endif /* CONFIG_TMPFS */
2771
2772/**
2773 * mpol_to_str - format a mempolicy structure for printing
2774 * @buffer:  to contain formatted mempolicy string
2775 * @maxlen:  length of @buffer
2776 * @pol:  pointer to mempolicy to be formatted
2777 *
2778 * Convert a mempolicy into a string.
2779 * Returns the number of characters in buffer (if positive)
2780 * or an error (negative)
2781 */
2782int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2783{
2784        char *p = buffer;
2785        int l;
2786        nodemask_t nodes;
2787        unsigned short mode;
2788        unsigned short flags = pol ? pol->flags : 0;
2789
2790        /*
2791         * Sanity check:  room for longest mode, flag and some nodes
2792         */
2793        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2794
2795        if (!pol || pol == &default_policy)
2796                mode = MPOL_DEFAULT;
2797        else
2798                mode = pol->mode;
2799
2800        switch (mode) {
2801        case MPOL_DEFAULT:
2802                nodes_clear(nodes);
2803                break;
2804
2805        case MPOL_PREFERRED:
2806                nodes_clear(nodes);
2807                if (flags & MPOL_F_LOCAL)
2808                        mode = MPOL_LOCAL;
2809                else
2810                        node_set(pol->v.preferred_node, nodes);
2811                break;
2812
2813        case MPOL_BIND:
2814                /* Fall through */
2815        case MPOL_INTERLEAVE:
2816                nodes = pol->v.nodes;
2817                break;
2818
2819        default:
2820                return -EINVAL;
2821        }
2822
2823        l = strlen(policy_modes[mode]);
2824        if (buffer + maxlen < p + l + 1)
2825                return -ENOSPC;
2826
2827        strcpy(p, policy_modes[mode]);
2828        p += l;
2829
2830        if (flags & MPOL_MODE_FLAGS) {
2831                if (buffer + maxlen < p + 2)
2832                        return -ENOSPC;
2833                *p++ = '=';
2834
2835                /*
2836                 * Currently, the only defined flags are mutually exclusive
2837                 */
2838                if (flags & MPOL_F_STATIC_NODES)
2839                        p += snprintf(p, buffer + maxlen - p, "static");
2840                else if (flags & MPOL_F_RELATIVE_NODES)
2841                        p += snprintf(p, buffer + maxlen - p, "relative");
2842        }
2843
2844        if (!nodes_empty(nodes)) {
2845                if (buffer + maxlen < p + 2)
2846                        return -ENOSPC;
2847                *p++ = ':';
2848                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2849        }
2850        return p - buffer;
2851}
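/*
 * Example outputs (node numbers illustrative), mirroring the format parsed
 * by mpol_parse_str() above: "default", "prefer:1", "bind=static:0,2",
 * "interleave:0-3", "local".  These strings appear in places such as
 * /proc/<pid>/numa_maps and the tmpfs entries in /proc/mounts.
 */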
2852