linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
  19 *                for anonymous memory. For process policy a per-process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used, it is not remembered across swap out/swap in.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
  49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
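/*
 * Illustrative userspace sketch (editor's example, not part of this file;
 * assumes the set_mempolicy(2) wrapper and MPOL_* constants from libnuma's
 * <numaif.h>): request interleaving over nodes 0 and 1 for all future
 * allocations of the calling process.
 */
#if 0
#include <numaif.h>
#include <stdio.h>

static void example_set_interleave(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */

	/* maxnode: how many bits of the mask are meaningful */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy");
}
#endif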
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel does not always handle that gracefully.
  66*/
  67
  68#include <linux/mempolicy.h>
  69#include <linux/mm.h>
  70#include <linux/highmem.h>
  71#include <linux/hugetlb.h>
  72#include <linux/kernel.h>
  73#include <linux/sched.h>
  74#include <linux/nodemask.h>
  75#include <linux/cpuset.h>
  76#include <linux/slab.h>
  77#include <linux/string.h>
  78#include <linux/export.h>
  79#include <linux/nsproxy.h>
  80#include <linux/interrupt.h>
  81#include <linux/init.h>
  82#include <linux/compat.h>
  83#include <linux/swap.h>
  84#include <linux/seq_file.h>
  85#include <linux/proc_fs.h>
  86#include <linux/migrate.h>
  87#include <linux/ksm.h>
  88#include <linux/rmap.h>
  89#include <linux/security.h>
  90#include <linux/syscalls.h>
  91#include <linux/ctype.h>
  92#include <linux/mm_inline.h>
  93#include <linux/mmu_notifier.h>
  94
  95#include <asm/tlbflush.h>
  96#include <asm/uaccess.h>
  97#include <linux/random.h>
  98
  99#include "internal.h"
 100
 101/* Internal flags */
 102#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 103#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 104
 105static struct kmem_cache *policy_cache;
 106static struct kmem_cache *sn_cache;
 107
 108/* Highest zone. A specific allocation for a zone below that is not
 109   policied. */
 110enum zone_type policy_zone = 0;
 111
 112/*
 113 * run-time system-wide default policy => local allocation
 114 */
 115static struct mempolicy default_policy = {
 116        .refcnt = ATOMIC_INIT(1), /* never free it */
 117        .mode = MPOL_PREFERRED,
 118        .flags = MPOL_F_LOCAL,
 119};
 120
 121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 122
 123static struct mempolicy *get_task_policy(struct task_struct *p)
 124{
 125        struct mempolicy *pol = p->mempolicy;
 126        int node;
 127
 128        if (!pol) {
 129                node = numa_node_id();
 130                if (node != NUMA_NO_NODE)
 131                        pol = &preferred_node_policy[node];
 132
 133                /* preferred_node_policy is not initialised early in boot */
 134                if (!pol->mode)
 135                        pol = NULL;
 136        }
 137
 138        return pol;
 139}
 140
 141static const struct mempolicy_operations {
 142        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 143        /*
 144         * If the read-side task has no lock protecting task->mempolicy, the
 145         * write-side task rebinds task->mempolicy in two steps. The first step
 146         * sets all the newly allowed nodes, and the second step removes the
 147         * disallowed nodes. This way a concurrent reader can never end up
 148         * with no node to allocate from.
 149         * If a lock protects task->mempolicy on the read side, we rebind
 150         * directly.
 151         *
 152         * step:
 153         *      MPOL_REBIND_ONCE  - do the rebind work at once
 154         *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 155         *      MPOL_REBIND_STEP2 - remove all the disallowed nodes
 156         */
 157        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 158                        enum mpol_rebind_step step);
 159} mpol_ops[MPOL_MAX];
 160
 161/* Check that the nodemask contains at least one populated zone */
 162static int is_valid_nodemask(const nodemask_t *nodemask)
 163{
 164        return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 165}
 166
 167static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 168{
 169        return pol->flags & MPOL_MODE_FLAGS;
 170}
 171
 172static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 173                                   const nodemask_t *rel)
 174{
 175        nodemask_t tmp;
 176        nodes_fold(tmp, *orig, nodes_weight(*rel));
 177        nodes_onto(*ret, tmp, *rel);
 178}
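/*
 * Worked example for the helper above (illustrative): with allowed nodes
 * rel = {2,3,4} (weight 3) and a user-supplied relative mask orig = {0,2},
 * nodes_fold() leaves {0,2} (both bits already below 3) and nodes_onto()
 * maps relative bit 0 to node 2 and relative bit 2 to node 4, so
 * *ret = {2,4}.
 */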
 179
 180static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 181{
 182        if (nodes_empty(*nodes))
 183                return -EINVAL;
 184        pol->v.nodes = *nodes;
 185        return 0;
 186}
 187
 188static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 189{
 190        if (!nodes)
 191                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 192        else if (nodes_empty(*nodes))
 193                return -EINVAL;                 /*  no allowed nodes */
 194        else
 195                pol->v.preferred_node = first_node(*nodes);
 196        return 0;
 197}
 198
 199static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 200{
 201        if (!is_valid_nodemask(nodes))
 202                return -EINVAL;
 203        pol->v.nodes = *nodes;
 204        return 0;
 205}
 206
 207/*
 208 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 209 * any, for the new policy.  mpol_new() has already validated the nodes
 210 * parameter with respect to the policy mode and flags.  But, we need to
 211 * handle an empty nodemask with MPOL_PREFERRED here.
 212 *
 213 * Must be called holding task's alloc_lock to protect task's mems_allowed
 214 * and mempolicy.  May also be called holding mmap_sem for write.
 215 */
 216static int mpol_set_nodemask(struct mempolicy *pol,
 217                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 218{
 219        int ret;
 220
 221        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 222        if (pol == NULL)
 223                return 0;
 224        /* Check N_MEMORY */
 225        nodes_and(nsc->mask1,
 226                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 227
 228        VM_BUG_ON(!nodes);
 229        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 230                nodes = NULL;   /* explicit local allocation */
 231        else {
 232                if (pol->flags & MPOL_F_RELATIVE_NODES)
 233                        mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 234                else
 235                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
 236
 237                if (mpol_store_user_nodemask(pol))
 238                        pol->w.user_nodemask = *nodes;
 239                else
 240                        pol->w.cpuset_mems_allowed =
 241                                                cpuset_current_mems_allowed;
 242        }
 243
 244        if (nodes)
 245                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 246        else
 247                ret = mpol_ops[pol->mode].create(pol, NULL);
 248        return ret;
 249}
 250
 251/*
 252 * This function just creates a new policy, does some checks and simple
 253 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 254 */
 255static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 256                                  nodemask_t *nodes)
 257{
 258        struct mempolicy *policy;
 259
 260        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 261                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 262
 263        if (mode == MPOL_DEFAULT) {
 264                if (nodes && !nodes_empty(*nodes))
 265                        return ERR_PTR(-EINVAL);
 266                return NULL;
 267        }
 268        VM_BUG_ON(!nodes);
 269
 270        /*
 271         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 272         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 273         * All other modes require a valid pointer to a non-empty nodemask.
 274         */
 275        if (mode == MPOL_PREFERRED) {
 276                if (nodes_empty(*nodes)) {
 277                        if (((flags & MPOL_F_STATIC_NODES) ||
 278                             (flags & MPOL_F_RELATIVE_NODES)))
 279                                return ERR_PTR(-EINVAL);
 280                }
 281        } else if (mode == MPOL_LOCAL) {
 282                if (!nodes_empty(*nodes))
 283                        return ERR_PTR(-EINVAL);
 284                mode = MPOL_PREFERRED;
 285        } else if (nodes_empty(*nodes))
 286                return ERR_PTR(-EINVAL);
 287        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 288        if (!policy)
 289                return ERR_PTR(-ENOMEM);
 290        atomic_set(&policy->refcnt, 1);
 291        policy->mode = mode;
 292        policy->flags = flags;
 293
 294        return policy;
 295}
 296
 297/* Slow path of a mpol destructor. */
 298void __mpol_put(struct mempolicy *p)
 299{
 300        if (!atomic_dec_and_test(&p->refcnt))
 301                return;
 302        kmem_cache_free(policy_cache, p);
 303}
 304
 305static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 306                                enum mpol_rebind_step step)
 307{
 308}
 309
 310/*
 311 * step:
 312 *      MPOL_REBIND_ONCE  - do the rebind work at once
 313 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 314 *      MPOL_REBIND_STEP2 - remove all the disallowed nodes
 315 */
 316static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 317                                 enum mpol_rebind_step step)
 318{
 319        nodemask_t tmp;
 320
 321        if (pol->flags & MPOL_F_STATIC_NODES)
 322                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 323        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 324                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 325        else {
 326                /*
 327                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 328                 * result
 329                 */
 330                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 331                        nodes_remap(tmp, pol->v.nodes,
 332                                        pol->w.cpuset_mems_allowed, *nodes);
 333                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 334                } else if (step == MPOL_REBIND_STEP2) {
 335                        tmp = pol->w.cpuset_mems_allowed;
 336                        pol->w.cpuset_mems_allowed = *nodes;
 337                } else
 338                        BUG();
 339        }
 340
 341        if (nodes_empty(tmp))
 342                tmp = *nodes;
 343
 344        if (step == MPOL_REBIND_STEP1)
 345                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 346        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 347                pol->v.nodes = tmp;
 348        else
 349                BUG();
 350
 351        if (!node_isset(current->il_next, tmp)) {
 352                current->il_next = next_node(current->il_next, tmp);
 353                if (current->il_next >= MAX_NUMNODES)
 354                        current->il_next = first_node(tmp);
 355                if (current->il_next >= MAX_NUMNODES)
 356                        current->il_next = numa_node_id();
 357        }
 358}
 359
 360static void mpol_rebind_preferred(struct mempolicy *pol,
 361                                  const nodemask_t *nodes,
 362                                  enum mpol_rebind_step step)
 363{
 364        nodemask_t tmp;
 365
 366        if (pol->flags & MPOL_F_STATIC_NODES) {
 367                int node = first_node(pol->w.user_nodemask);
 368
 369                if (node_isset(node, *nodes)) {
 370                        pol->v.preferred_node = node;
 371                        pol->flags &= ~MPOL_F_LOCAL;
 372                } else
 373                        pol->flags |= MPOL_F_LOCAL;
 374        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 375                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 376                pol->v.preferred_node = first_node(tmp);
 377        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 378                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 379                                                   pol->w.cpuset_mems_allowed,
 380                                                   *nodes);
 381                pol->w.cpuset_mems_allowed = *nodes;
 382        }
 383}
 384
 385/*
 386 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 387 *
 388 * If the read-side task has no lock protecting task->mempolicy, the
 389 * write-side task rebinds task->mempolicy in two steps. The first step
 390 * sets all the newly allowed nodes, and the second step removes the
 391 * disallowed nodes. This way a concurrent reader can never end up
 392 * with no node to allocate from.
 393 * If a lock protects task->mempolicy on the read side, we rebind
 394 * directly.
 395 *
 396 * step:
 397 *      MPOL_REBIND_ONCE  - do the rebind work at once
 398 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 399 *      MPOL_REBIND_STEP2 - remove all the disallowed nodes
 400 */
 401static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 402                                enum mpol_rebind_step step)
 403{
 404        if (!pol)
 405                return;
 406        if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 407            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 408                return;
 409
 410        if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 411                return;
 412
 413        if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 414                BUG();
 415
 416        if (step == MPOL_REBIND_STEP1)
 417                pol->flags |= MPOL_F_REBINDING;
 418        else if (step == MPOL_REBIND_STEP2)
 419                pol->flags &= ~MPOL_F_REBINDING;
 420        else if (step >= MPOL_REBIND_NSTEP)
 421                BUG();
 422
 423        mpol_ops[pol->mode].rebind(pol, newmask, step);
 424}
 425
 426/*
 427 * Wrapper for mpol_rebind_policy() that just requires task
 428 * pointer, and updates task mempolicy.
 429 *
 430 * Called with task's alloc_lock held.
 431 */
 432
 433void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 434                        enum mpol_rebind_step step)
 435{
 436        mpol_rebind_policy(tsk->mempolicy, new, step);
 437}
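/*
 * Illustrative write-side usage (a sketch modelled on the cpuset code, not
 * a verbatim copy): the new nodemask is applied in two steps so that a
 * lockless reader never observes an empty mask.
 *
 *	task_lock(tsk);
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);
 *	tsk->mems_allowed = newmems;
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);
 *	task_unlock(tsk);
 *
 * STEP1 grows pol->v.nodes to old | new, STEP2 then shrinks it to the new
 * mask only.
 */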
 438
 439/*
 440 * Rebind each vma in mm to new nodemask.
 441 *
 442 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 443 */
 444
 445void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 446{
 447        struct vm_area_struct *vma;
 448
 449        down_write(&mm->mmap_sem);
 450        for (vma = mm->mmap; vma; vma = vma->vm_next)
 451                mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 452        up_write(&mm->mmap_sem);
 453}
 454
 455static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 456        [MPOL_DEFAULT] = {
 457                .rebind = mpol_rebind_default,
 458        },
 459        [MPOL_INTERLEAVE] = {
 460                .create = mpol_new_interleave,
 461                .rebind = mpol_rebind_nodemask,
 462        },
 463        [MPOL_PREFERRED] = {
 464                .create = mpol_new_preferred,
 465                .rebind = mpol_rebind_preferred,
 466        },
 467        [MPOL_BIND] = {
 468                .create = mpol_new_bind,
 469                .rebind = mpol_rebind_nodemask,
 470        },
 471};
 472
 473static void migrate_page_add(struct page *page, struct list_head *pagelist,
 474                                unsigned long flags);
 475
 476/* Scan through a range of pages, checking whether each satisfies the given conditions. */
 477static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 478                unsigned long addr, unsigned long end,
 479                const nodemask_t *nodes, unsigned long flags,
 480                void *private)
 481{
 482        pte_t *orig_pte;
 483        pte_t *pte;
 484        spinlock_t *ptl;
 485
 486        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 487        do {
 488                struct page *page;
 489                int nid;
 490
 491                if (!pte_present(*pte))
 492                        continue;
 493                page = vm_normal_page(vma, addr, *pte);
 494                if (!page)
 495                        continue;
 496                /*
 497                 * vm_normal_page() filters out zero pages, but there might
 498                 * still be PageReserved pages to skip, perhaps in a VDSO.
 499                 */
 500                if (PageReserved(page))
 501                        continue;
 502                nid = page_to_nid(page);
 503                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 504                        continue;
 505
 506                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 507                        migrate_page_add(page, private, flags);
 508                else
 509                        break;
 510        } while (pte++, addr += PAGE_SIZE, addr != end);
 511        pte_unmap_unlock(orig_pte, ptl);
 512        return addr != end;
 513}
 514
 515static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 516                unsigned long addr, unsigned long end,
 517                const nodemask_t *nodes, unsigned long flags,
 518                void *private)
 519{
 520        pmd_t *pmd;
 521        unsigned long next;
 522
 523        pmd = pmd_offset(pud, addr);
 524        do {
 525                next = pmd_addr_end(addr, end);
 526                split_huge_page_pmd(vma, addr, pmd);
 527                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 528                        continue;
 529                if (check_pte_range(vma, pmd, addr, next, nodes,
 530                                    flags, private))
 531                        return -EIO;
 532        } while (pmd++, addr = next, addr != end);
 533        return 0;
 534}
 535
 536static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 537                unsigned long addr, unsigned long end,
 538                const nodemask_t *nodes, unsigned long flags,
 539                void *private)
 540{
 541        pud_t *pud;
 542        unsigned long next;
 543
 544        pud = pud_offset(pgd, addr);
 545        do {
 546                next = pud_addr_end(addr, end);
 547                if (pud_none_or_clear_bad(pud))
 548                        continue;
 549                if (check_pmd_range(vma, pud, addr, next, nodes,
 550                                    flags, private))
 551                        return -EIO;
 552        } while (pud++, addr = next, addr != end);
 553        return 0;
 554}
 555
 556static inline int check_pgd_range(struct vm_area_struct *vma,
 557                unsigned long addr, unsigned long end,
 558                const nodemask_t *nodes, unsigned long flags,
 559                void *private)
 560{
 561        pgd_t *pgd;
 562        unsigned long next;
 563
 564        pgd = pgd_offset(vma->vm_mm, addr);
 565        do {
 566                next = pgd_addr_end(addr, end);
 567                if (pgd_none_or_clear_bad(pgd))
 568                        continue;
 569                if (check_pud_range(vma, pgd, addr, next, nodes,
 570                                    flags, private))
 571                        return -EIO;
 572        } while (pgd++, addr = next, addr != end);
 573        return 0;
 574}
 575
 576#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 577/*
 578 * This is used to mark a range of virtual addresses to be inaccessible.
 579 * These are later cleared by a NUMA hinting fault. Depending on these
 580 * faults, pages may be migrated for better NUMA placement.
 581 *
 582 * This is assuming that NUMA faults are handled using PROT_NONE. If
 583 * an architecture makes a different choice, it will need further
 584 * changes to the core.
 585 */
 586unsigned long change_prot_numa(struct vm_area_struct *vma,
 587                        unsigned long addr, unsigned long end)
 588{
 589        int nr_updated;
 590        BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 591
 592        nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 593        if (nr_updated)
 594                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 595
 596        return nr_updated;
 597}
 598#else
 599static unsigned long change_prot_numa(struct vm_area_struct *vma,
 600                        unsigned long addr, unsigned long end)
 601{
 602        return 0;
 603}
 604#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 605
 606/*
 607 * Check if all pages in a range are on a set of nodes.
 608 * If pagelist != NULL then isolate pages from the LRU and
 609 * put them on the pagelist.
 610 */
 611static struct vm_area_struct *
 612check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 613                const nodemask_t *nodes, unsigned long flags, void *private)
 614{
 615        int err;
 616        struct vm_area_struct *first, *vma, *prev;
 617
 618
 619        first = find_vma(mm, start);
 620        if (!first)
 621                return ERR_PTR(-EFAULT);
 622        prev = NULL;
 623        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 624                unsigned long endvma = vma->vm_end;
 625
 626                if (endvma > end)
 627                        endvma = end;
 628                if (vma->vm_start > start)
 629                        start = vma->vm_start;
 630
 631                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 632                        if (!vma->vm_next && vma->vm_end < end)
 633                                return ERR_PTR(-EFAULT);
 634                        if (prev && prev->vm_end < vma->vm_start)
 635                                return ERR_PTR(-EFAULT);
 636                }
 637
 638                if (is_vm_hugetlb_page(vma))
 639                        goto next;
 640
 641                if (flags & MPOL_MF_LAZY) {
 642                        change_prot_numa(vma, start, endvma);
 643                        goto next;
 644                }
 645
 646                if ((flags & MPOL_MF_STRICT) ||
 647                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 648                      vma_migratable(vma))) {
 649
 650                        err = check_pgd_range(vma, start, endvma, nodes,
 651                                                flags, private);
 652                        if (err) {
 653                                first = ERR_PTR(err);
 654                                break;
 655                        }
 656                }
 657next:
 658                prev = vma;
 659        }
 660        return first;
 661}
 662
 663/*
 664 * Apply policy to a single VMA
 665 * This must be called with the mmap_sem held for writing.
 666 */
 667static int vma_replace_policy(struct vm_area_struct *vma,
 668                                                struct mempolicy *pol)
 669{
 670        int err;
 671        struct mempolicy *old;
 672        struct mempolicy *new;
 673
 674        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 675                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 676                 vma->vm_ops, vma->vm_file,
 677                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 678
 679        new = mpol_dup(pol);
 680        if (IS_ERR(new))
 681                return PTR_ERR(new);
 682
 683        if (vma->vm_ops && vma->vm_ops->set_policy) {
 684                err = vma->vm_ops->set_policy(vma, new);
 685                if (err)
 686                        goto err_out;
 687        }
 688
 689        old = vma->vm_policy;
 690        vma->vm_policy = new; /* protected by mmap_sem */
 691        mpol_put(old);
 692
 693        return 0;
 694 err_out:
 695        mpol_put(new);
 696        return err;
 697}
 698
 699/* Step 2: apply policy to a range and do splits. */
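/*
 * For example, applying a new policy to only the middle of an existing VMA
 * first splits at vmstart, then at vmend, and installs the policy on the
 * resulting middle VMA; VMAs whose policy already matches, or that
 * vma_merge() can fold into a neighbour carrying the new policy, are not
 * split at all.
 */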
 700static int mbind_range(struct mm_struct *mm, unsigned long start,
 701                       unsigned long end, struct mempolicy *new_pol)
 702{
 703        struct vm_area_struct *next;
 704        struct vm_area_struct *prev;
 705        struct vm_area_struct *vma;
 706        int err = 0;
 707        pgoff_t pgoff;
 708        unsigned long vmstart;
 709        unsigned long vmend;
 710
 711        vma = find_vma(mm, start);
 712        if (!vma || vma->vm_start > start)
 713                return -EFAULT;
 714
 715        prev = vma->vm_prev;
 716        if (start > vma->vm_start)
 717                prev = vma;
 718
 719        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 720                next = vma->vm_next;
 721                vmstart = max(start, vma->vm_start);
 722                vmend   = min(end, vma->vm_end);
 723
 724                if (mpol_equal(vma_policy(vma), new_pol))
 725                        continue;
 726
 727                pgoff = vma->vm_pgoff +
 728                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 729                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 730                                  vma->anon_vma, vma->vm_file, pgoff,
 731                                  new_pol);
 732                if (prev) {
 733                        vma = prev;
 734                        next = vma->vm_next;
 735                        continue;
 736                }
 737                if (vma->vm_start != vmstart) {
 738                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
 739                        if (err)
 740                                goto out;
 741                }
 742                if (vma->vm_end != vmend) {
 743                        err = split_vma(vma->vm_mm, vma, vmend, 0);
 744                        if (err)
 745                                goto out;
 746                }
 747                err = vma_replace_policy(vma, new_pol);
 748                if (err)
 749                        goto out;
 750        }
 751
 752 out:
 753        return err;
 754}
 755
 756/*
 757 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 758 * mempolicy.  Allows more rapid checking of this (combined perhaps
 759 * with other PF_* flag bits) on memory allocation hot code paths.
 760 *
 761 * If called from outside this file, the task 'p' should -only- be
 762 * a newly forked child not yet visible on the task list, because
 763 * manipulating the task flags of a visible task is not safe.
 764 *
 765 * The above limitation is why this routine has the funny name
 766 * mpol_fix_fork_child_flag().
 767 *
 768 * It is also safe to call this with a task pointer of current,
 769 * which the static wrapper mpol_set_task_struct_flag() does,
 770 * for use within this file.
 771 */
 772
 773void mpol_fix_fork_child_flag(struct task_struct *p)
 774{
 775        if (p->mempolicy)
 776                p->flags |= PF_MEMPOLICY;
 777        else
 778                p->flags &= ~PF_MEMPOLICY;
 779}
 780
 781static void mpol_set_task_struct_flag(void)
 782{
 783        mpol_fix_fork_child_flag(current);
 784}
 785
 786/* Set the process memory policy */
 787static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 788                             nodemask_t *nodes)
 789{
 790        struct mempolicy *new, *old;
 791        struct mm_struct *mm = current->mm;
 792        NODEMASK_SCRATCH(scratch);
 793        int ret;
 794
 795        if (!scratch)
 796                return -ENOMEM;
 797
 798        new = mpol_new(mode, flags, nodes);
 799        if (IS_ERR(new)) {
 800                ret = PTR_ERR(new);
 801                goto out;
 802        }
 803        /*
 804         * prevent changing our mempolicy while show_numa_maps()
 805         * is using it.
 806         * Note:  do_set_mempolicy() can be called at init time
 807         * with no 'mm'.
 808         */
 809        if (mm)
 810                down_write(&mm->mmap_sem);
 811        task_lock(current);
 812        ret = mpol_set_nodemask(new, nodes, scratch);
 813        if (ret) {
 814                task_unlock(current);
 815                if (mm)
 816                        up_write(&mm->mmap_sem);
 817                mpol_put(new);
 818                goto out;
 819        }
 820        old = current->mempolicy;
 821        current->mempolicy = new;
 822        mpol_set_task_struct_flag();
 823        if (new && new->mode == MPOL_INTERLEAVE &&
 824            nodes_weight(new->v.nodes))
 825                current->il_next = first_node(new->v.nodes);
 826        task_unlock(current);
 827        if (mm)
 828                up_write(&mm->mmap_sem);
 829
 830        mpol_put(old);
 831        ret = 0;
 832out:
 833        NODEMASK_SCRATCH_FREE(scratch);
 834        return ret;
 835}
 836
 837/*
 838 * Return nodemask for policy for get_mempolicy() query
 839 *
 840 * Called with task's alloc_lock held
 841 */
 842static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 843{
 844        nodes_clear(*nodes);
 845        if (p == &default_policy)
 846                return;
 847
 848        switch (p->mode) {
 849        case MPOL_BIND:
 850                /* Fall through */
 851        case MPOL_INTERLEAVE:
 852                *nodes = p->v.nodes;
 853                break;
 854        case MPOL_PREFERRED:
 855                if (!(p->flags & MPOL_F_LOCAL))
 856                        node_set(p->v.preferred_node, *nodes);
 857                /* else return empty node mask for local allocation */
 858                break;
 859        default:
 860                BUG();
 861        }
 862}
 863
 864static int lookup_node(struct mm_struct *mm, unsigned long addr)
 865{
 866        struct page *p;
 867        int err;
 868
 869        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 870        if (err >= 0) {
 871                err = page_to_nid(p);
 872                put_page(p);
 873        }
 874        return err;
 875}
 876
 877/* Retrieve NUMA policy */
 878static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 879                             unsigned long addr, unsigned long flags)
 880{
 881        int err;
 882        struct mm_struct *mm = current->mm;
 883        struct vm_area_struct *vma = NULL;
 884        struct mempolicy *pol = current->mempolicy;
 885
 886        if (flags &
 887                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 888                return -EINVAL;
 889
 890        if (flags & MPOL_F_MEMS_ALLOWED) {
 891                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 892                        return -EINVAL;
 893                *policy = 0;    /* just so it's initialized */
 894                task_lock(current);
 895                *nmask  = cpuset_current_mems_allowed;
 896                task_unlock(current);
 897                return 0;
 898        }
 899
 900        if (flags & MPOL_F_ADDR) {
 901                /*
 902                 * Do NOT fall back to task policy if the
 903                 * vma/shared policy at addr is NULL.  We
 904                 * want to return MPOL_DEFAULT in this case.
 905                 */
 906                down_read(&mm->mmap_sem);
 907                vma = find_vma_intersection(mm, addr, addr+1);
 908                if (!vma) {
 909                        up_read(&mm->mmap_sem);
 910                        return -EFAULT;
 911                }
 912                if (vma->vm_ops && vma->vm_ops->get_policy)
 913                        pol = vma->vm_ops->get_policy(vma, addr);
 914                else
 915                        pol = vma->vm_policy;
 916        } else if (addr)
 917                return -EINVAL;
 918
 919        if (!pol)
 920                pol = &default_policy;  /* indicates default behavior */
 921
 922        if (flags & MPOL_F_NODE) {
 923                if (flags & MPOL_F_ADDR) {
 924                        err = lookup_node(mm, addr);
 925                        if (err < 0)
 926                                goto out;
 927                        *policy = err;
 928                } else if (pol == current->mempolicy &&
 929                                pol->mode == MPOL_INTERLEAVE) {
 930                        *policy = current->il_next;
 931                } else {
 932                        err = -EINVAL;
 933                        goto out;
 934                }
 935        } else {
 936                *policy = pol == &default_policy ? MPOL_DEFAULT :
 937                                                pol->mode;
 938                /*
 939                 * Internal mempolicy flags must be masked off before exposing
 940                 * the policy to userspace.
 941                 */
 942                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 943        }
 944
 945        if (vma) {
 946                up_read(&current->mm->mmap_sem);
 947                vma = NULL;
 948        }
 949
 950        err = 0;
 951        if (nmask) {
 952                if (mpol_store_user_nodemask(pol)) {
 953                        *nmask = pol->w.user_nodemask;
 954                } else {
 955                        task_lock(current);
 956                        get_policy_nodemask(pol, nmask);
 957                        task_unlock(current);
 958                }
 959        }
 960
 961 out:
 962        mpol_cond_put(pol);
 963        if (vma)
 964                up_read(&current->mm->mmap_sem);
 965        return err;
 966}
 967
 968#ifdef CONFIG_MIGRATION
 969/*
 970 * page migration
 971 */
 972static void migrate_page_add(struct page *page, struct list_head *pagelist,
 973                                unsigned long flags)
 974{
 975        /*
 976         * Avoid migrating a page that is shared with others.
 977         */
 978        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 979                if (!isolate_lru_page(page)) {
 980                        list_add_tail(&page->lru, pagelist);
 981                        inc_zone_page_state(page, NR_ISOLATED_ANON +
 982                                            page_is_file_cache(page));
 983                }
 984        }
 985}
 986
 987static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 988{
 989        return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 990}
 991
 992/*
 993 * Migrate pages from one node to a target node.
 994 * Returns error or the number of pages not migrated.
 995 */
 996static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 997                           int flags)
 998{
 999        nodemask_t nmask;
1000        LIST_HEAD(pagelist);
1001        int err = 0;
1002
1003        nodes_clear(nmask);
1004        node_set(source, nmask);
1005
1006        /*
1007         * This does not "check" the range but isolates all pages that
1008         * need migration.  Between passing in the full user address
1009         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1010         */
1011        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1012        check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1013                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1014
1015        if (!list_empty(&pagelist)) {
1016                err = migrate_pages(&pagelist, new_node_page, dest,
1017                                        MIGRATE_SYNC, MR_SYSCALL);
1018                if (err)
1019                        putback_lru_pages(&pagelist);
1020        }
1021
1022        return err;
1023}
1024
1025/*
1026 * Move pages between the two nodesets so as to preserve the physical
1027 * layout as much as possible.
1028 *
1029 * Returns the number of pages that could not be moved.
1030 */
1031int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1032                     const nodemask_t *to, int flags)
1033{
1034        int busy = 0;
1035        int err;
1036        nodemask_t tmp;
1037
1038        err = migrate_prep();
1039        if (err)
1040                return err;
1041
1042        down_read(&mm->mmap_sem);
1043
1044        err = migrate_vmas(mm, from, to, flags);
1045        if (err)
1046                goto out;
1047
1048        /*
1049         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1050         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1051         * bit in 'tmp', and return that <source, dest> pair for migration.
1052         * The pair of nodemasks 'to' and 'from' define the map.
1053         *
1054         * If no pair of bits is found that way, fallback to picking some
1055         * pair of 'source' and 'dest' bits that are not the same.  If the
1056         * 'source' and 'dest' bits are the same, this represents a node
1057         * that will be migrating to itself, so no pages need move.
1058         *
1059         * If no bits are left in 'tmp', or if all remaining bits left
1060         * in 'tmp' correspond to the same bit in 'to', return false
1061         * (nothing left to migrate).
1062         *
1063         * This lets us pick a pair of nodes to migrate between, such that
1064         * if possible the dest node is not already occupied by some other
1065         * source node, minimizing the risk of overloading the memory on a
1066         * node that would happen if we migrated incoming memory to a node
1067         * before migrating outgoing memory sourced from that same node.
1068         *
1069         * A single scan of tmp is sufficient.  As we go, we remember the
1070         * most recent <s, d> pair that moved (s != d).  If we find a pair
1071         * that not only moved, but what's better, moved to an empty slot
1072         * (d is not set in tmp), then we break out then, with that pair.
1073         * Otherwise when we finish scanning tmp, we at least have the
1074         * most recent <s, d> pair that moved.  If we get all the way through
1075         * the scan of tmp without finding any node that moved, much less
1076         * moved to an empty node, then there is nothing left worth migrating.
1077         */
1078
1079        tmp = *from;
1080        while (!nodes_empty(tmp)) {
1081                int s,d;
1082                int source = -1;
1083                int dest = 0;
1084
1085                for_each_node_mask(s, tmp) {
1086
1087                        /*
1088                         * do_migrate_pages() tries to maintain the relative
1089                         * node relationship of the pages established between
1090                         * threads and memory areas.
1091                         *
1092                         * However if the number of source nodes is not equal to
1093                         * the number of destination nodes we can not preserve
1094                         * this node relative relationship.  In that case, skip
1095                         * copying memory from a node that is in the destination
1096                         * mask.
1097                         *
1098                         * Example: [2,3,4] -> [3,4,5] moves everything.
1099                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1100                         */
1101
1102                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
1103                                                (node_isset(s, *to)))
1104                                continue;
1105
1106                        d = node_remap(s, *from, *to);
1107                        if (s == d)
1108                                continue;
1109
1110                        source = s;     /* Node moved. Memorize */
1111                        dest = d;
1112
1113                        /* dest not in remaining from nodes? */
1114                        if (!node_isset(dest, tmp))
1115                                break;
1116                }
1117                if (source == -1)
1118                        break;
1119
1120                node_clear(source, tmp);
1121                err = migrate_to_node(mm, source, dest, flags);
1122                if (err > 0)
1123                        busy += err;
1124                if (err < 0)
1125                        break;
1126        }
1127out:
1128        up_read(&mm->mmap_sem);
1129        if (err < 0)
1130                return err;
1131        return busy;
1132
1133}
1134
1135/*
1136 * Allocate a new page for page migration based on vma policy.
1137 * Start assuming that page is mapped by vma pointed to by @private.
1138 * Search forward from there, if not.  N.B., this assumes that the
1139 * list of pages handed to migrate_pages()--which is how we get here--
1140 * is in virtual address order.
1141 */
1142static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1143{
1144        struct vm_area_struct *vma = (struct vm_area_struct *)private;
1145        unsigned long uninitialized_var(address);
1146
1147        while (vma) {
1148                address = page_address_in_vma(page, vma);
1149                if (address != -EFAULT)
1150                        break;
1151                vma = vma->vm_next;
1152        }
1153
1154        /*
1155         * if !vma, alloc_page_vma() will use task or system default policy
1156         */
1157        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1158}
1159#else
1160
1161static void migrate_page_add(struct page *page, struct list_head *pagelist,
1162                                unsigned long flags)
1163{
1164}
1165
1166int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1167                     const nodemask_t *to, int flags)
1168{
1169        return -ENOSYS;
1170}
1171
1172static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1173{
1174        return NULL;
1175}
1176#endif
1177
1178static long do_mbind(unsigned long start, unsigned long len,
1179                     unsigned short mode, unsigned short mode_flags,
1180                     nodemask_t *nmask, unsigned long flags)
1181{
1182        struct vm_area_struct *vma;
1183        struct mm_struct *mm = current->mm;
1184        struct mempolicy *new;
1185        unsigned long end;
1186        int err;
1187        LIST_HEAD(pagelist);
1188
1189        if (flags & ~(unsigned long)MPOL_MF_VALID)
1190                return -EINVAL;
1191        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1192                return -EPERM;
1193
1194        if (start & ~PAGE_MASK)
1195                return -EINVAL;
1196
1197        if (mode == MPOL_DEFAULT)
1198                flags &= ~MPOL_MF_STRICT;
1199
1200        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1201        end = start + len;
1202
1203        if (end < start)
1204                return -EINVAL;
1205        if (end == start)
1206                return 0;
1207
1208        new = mpol_new(mode, mode_flags, nmask);
1209        if (IS_ERR(new))
1210                return PTR_ERR(new);
1211
1212        if (flags & MPOL_MF_LAZY)
1213                new->flags |= MPOL_F_MOF;
1214
1215        /*
1216         * If we are using the default policy then operation
1217         * on discontinuous address spaces is okay after all
1218         */
1219        if (!new)
1220                flags |= MPOL_MF_DISCONTIG_OK;
1221
1222        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1223                 start, start + len, mode, mode_flags,
1224                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1225
1226        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1227
1228                err = migrate_prep();
1229                if (err)
1230                        goto mpol_out;
1231        }
1232        {
1233                NODEMASK_SCRATCH(scratch);
1234                if (scratch) {
1235                        down_write(&mm->mmap_sem);
1236                        task_lock(current);
1237                        err = mpol_set_nodemask(new, nmask, scratch);
1238                        task_unlock(current);
1239                        if (err)
1240                                up_write(&mm->mmap_sem);
1241                } else
1242                        err = -ENOMEM;
1243                NODEMASK_SCRATCH_FREE(scratch);
1244        }
1245        if (err)
1246                goto mpol_out;
1247
1248        vma = check_range(mm, start, end, nmask,
1249                          flags | MPOL_MF_INVERT, &pagelist);
1250
1251        err = PTR_ERR(vma);     /* maybe ... */
1252        if (!IS_ERR(vma))
1253                err = mbind_range(mm, start, end, new);
1254
1255        if (!err) {
1256                int nr_failed = 0;
1257
1258                if (!list_empty(&pagelist)) {
1259                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1260                        nr_failed = migrate_pages(&pagelist, new_vma_page,
1261                                        (unsigned long)vma,
1262                                        MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1263                        if (nr_failed)
1264                                putback_lru_pages(&pagelist);
1265                }
1266
1267                if (nr_failed && (flags & MPOL_MF_STRICT))
1268                        err = -EIO;
1269        } else
1270                putback_lru_pages(&pagelist);
1271
1272        up_write(&mm->mmap_sem);
1273 mpol_out:
1274        mpol_put(new);
1275        return err;
1276}
1277
1278/*
1279 * User space interface with variable sized bitmaps for nodelists.
1280 */
1281
1282/* Copy a node mask from user space. */
1283static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1284                     unsigned long maxnode)
1285{
1286        unsigned long k;
1287        unsigned long nlongs;
1288        unsigned long endmask;
1289
1290        --maxnode;
1291        nodes_clear(*nodes);
1292        if (maxnode == 0 || !nmask)
1293                return 0;
1294        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1295                return -EINVAL;
1296
1297        nlongs = BITS_TO_LONGS(maxnode);
1298        if ((maxnode % BITS_PER_LONG) == 0)
1299                endmask = ~0UL;
1300        else
1301                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1302
1303        /* When the user specified more nodes than supported just check
1304           if the unsupported part is all zero. */
1305        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1306                if (nlongs > PAGE_SIZE/sizeof(long))
1307                        return -EINVAL;
1308                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1309                        unsigned long t;
1310                        if (get_user(t, nmask + k))
1311                                return -EFAULT;
1312                        if (k == nlongs - 1) {
1313                                if (t & endmask)
1314                                        return -EINVAL;
1315                        } else if (t)
1316                                return -EINVAL;
1317                }
1318                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1319                endmask = ~0UL;
1320        }
1321
1322        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1323                return -EFAULT;
1324        nodes_addr(*nodes)[nlongs-1] &= endmask;
1325        return 0;
1326}
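/*
 * Worked example for get_nodes() above: a caller passing maxnode == 17
 * describes bits 0..15, so after the --maxnode nlongs == 1 and
 * endmask == 0xffff; one long is copied and any bits above node 15 are
 * masked off.  Only when the user claims more bits than MAX_NUMNODES
 * supports must the surplus longs be zero, otherwise -EINVAL.
 */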
1327
1328/* Copy a kernel node mask to user space */
1329static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1330                              nodemask_t *nodes)
1331{
1332        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1333        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1334
1335        if (copy > nbytes) {
1336                if (copy > PAGE_SIZE)
1337                        return -EINVAL;
1338                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1339                        return -EFAULT;
1340                copy = nbytes;
1341        }
1342        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1343}
1344
1345SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1346                unsigned long, mode, unsigned long __user *, nmask,
1347                unsigned long, maxnode, unsigned, flags)
1348{
1349        nodemask_t nodes;
1350        int err;
1351        unsigned short mode_flags;
1352
1353        mode_flags = mode & MPOL_MODE_FLAGS;
1354        mode &= ~MPOL_MODE_FLAGS;
1355        if (mode >= MPOL_MAX)
1356                return -EINVAL;
1357        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1358            (mode_flags & MPOL_F_RELATIVE_NODES))
1359                return -EINVAL;
1360        err = get_nodes(&nodes, nmask, maxnode);
1361        if (err)
1362                return err;
1363        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1364}
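/*
 * Illustrative userspace sketch of the syscall above (editor's example, not
 * part of this file; assumes <numaif.h> from libnuma): bind one anonymous
 * mapping to node 0 while the rest of the process keeps its default policy.
 */
#if 0
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>

static void *example_bind_to_node0(size_t len)
{
	unsigned long nodemask = 1UL << 0;		/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p != MAP_FAILED &&
	    mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_STRICT | MPOL_MF_MOVE))
		perror("mbind");
	return p;
}
#endif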
1365
1366/* Set the process memory policy */
1367SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1368                unsigned long, maxnode)
1369{
1370        int err;
1371        nodemask_t nodes;
1372        unsigned short flags;
1373
1374        flags = mode & MPOL_MODE_FLAGS;
1375        mode &= ~MPOL_MODE_FLAGS;
1376        if ((unsigned int)mode >= MPOL_MAX)
1377                return -EINVAL;
1378        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1379                return -EINVAL;
1380        err = get_nodes(&nodes, nmask, maxnode);
1381        if (err)
1382                return err;
1383        return do_set_mempolicy(mode, flags, &nodes);
1384}
1385
1386SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1387                const unsigned long __user *, old_nodes,
1388                const unsigned long __user *, new_nodes)
1389{
1390        const struct cred *cred = current_cred(), *tcred;
1391        struct mm_struct *mm = NULL;
1392        struct task_struct *task;
1393        nodemask_t task_nodes;
1394        int err;
1395        nodemask_t *old;
1396        nodemask_t *new;
1397        NODEMASK_SCRATCH(scratch);
1398
1399        if (!scratch)
1400                return -ENOMEM;
1401
1402        old = &scratch->mask1;
1403        new = &scratch->mask2;
1404
1405        err = get_nodes(old, old_nodes, maxnode);
1406        if (err)
1407                goto out;
1408
1409        err = get_nodes(new, new_nodes, maxnode);
1410        if (err)
1411                goto out;
1412
1413        /* Find the mm_struct */
1414        rcu_read_lock();
1415        task = pid ? find_task_by_vpid(pid) : current;
1416        if (!task) {
1417                rcu_read_unlock();
1418                err = -ESRCH;
1419                goto out;
1420        }
1421        get_task_struct(task);
1422
1423        err = -EINVAL;
1424
1425        /*
1426         * Check if this process has the right to modify the specified
1427         * process. The right exists if the process has administrative
1428         * capabilities, superuser privileges or the same
1429         * userid as the target process.
1430         */
1431        tcred = __task_cred(task);
1432        if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1433            !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1434            !capable(CAP_SYS_NICE)) {
1435                rcu_read_unlock();
1436                err = -EPERM;
1437                goto out_put;
1438        }
1439        rcu_read_unlock();
1440
1441        task_nodes = cpuset_mems_allowed(task);
1442        /* Is the user allowed to access the target nodes? */
1443        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1444                err = -EPERM;
1445                goto out_put;
1446        }
1447
1448        if (!nodes_subset(*new, node_states[N_MEMORY])) {
1449                err = -EINVAL;
1450                goto out_put;
1451        }
1452
1453        err = security_task_movememory(task);
1454        if (err)
1455                goto out_put;
1456
1457        mm = get_task_mm(task);
1458        put_task_struct(task);
1459
1460        if (!mm) {
1461                err = -EINVAL;
1462                goto out;
1463        }
1464
1465        err = do_migrate_pages(mm, old, new,
1466                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1467
1468        mmput(mm);
1469out:
1470        NODEMASK_SCRATCH_FREE(scratch);
1471
1472        return err;
1473
1474out_put:
1475        put_task_struct(task);
1476        goto out;
1477
1478}
1479
1480
1481/* Retrieve NUMA policy */
1482SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1483                unsigned long __user *, nmask, unsigned long, maxnode,
1484                unsigned long, addr, unsigned long, flags)
1485{
1486        int err;
1487        int uninitialized_var(pval);
1488        nodemask_t nodes;
1489
1490        if (nmask != NULL && maxnode < MAX_NUMNODES)
1491                return -EINVAL;
1492
1493        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1494
1495        if (err)
1496                return err;
1497
1498        if (policy && put_user(pval, policy))
1499                return -EFAULT;
1500
1501        if (nmask)
1502                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1503
1504        return err;
1505}
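/*
 * Illustrative userspace sketch of the syscall above (editor's example, not
 * part of this file; assumes <numaif.h>): with MPOL_F_NODE | MPOL_F_ADDR the
 * "policy" result is the node currently backing the page at addr.
 */
#if 0
#include <numaif.h>
#include <stdio.h>

static int example_node_of(void *addr)
{
	int node = -1;

	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
		perror("get_mempolicy");
	return node;
}
#endif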
1506
1507#ifdef CONFIG_COMPAT
1508
1509asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1510                                     compat_ulong_t __user *nmask,
1511                                     compat_ulong_t maxnode,
1512                                     compat_ulong_t addr, compat_ulong_t flags)
1513{
1514        long err;
1515        unsigned long __user *nm = NULL;
1516        unsigned long nr_bits, alloc_size;
1517        DECLARE_BITMAP(bm, MAX_NUMNODES);
1518
1519        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1520        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1521
1522        if (nmask)
1523                nm = compat_alloc_user_space(alloc_size);
1524
1525        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1526
1527        if (!err && nmask) {
1528                unsigned long copy_size;
1529                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1530                err = copy_from_user(bm, nm, copy_size);
1531                /* ensure entire bitmap is zeroed */
1532                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1533                err |= compat_put_bitmap(nmask, bm, nr_bits);
1534        }
1535
1536        return err;
1537}
1538
1539asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1540                                     compat_ulong_t maxnode)
1541{
1542        long err = 0;
1543        unsigned long __user *nm = NULL;
1544        unsigned long nr_bits, alloc_size;
1545        DECLARE_BITMAP(bm, MAX_NUMNODES);
1546
1547        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1548        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1549
1550        if (nmask) {
1551                err = compat_get_bitmap(bm, nmask, nr_bits);
1552                nm = compat_alloc_user_space(alloc_size);
1553                err |= copy_to_user(nm, bm, alloc_size);
1554        }
1555
1556        if (err)
1557                return -EFAULT;
1558
1559        return sys_set_mempolicy(mode, nm, nr_bits+1);
1560}
1561
1562asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1563                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1564                             compat_ulong_t maxnode, compat_ulong_t flags)
1565{
1566        long err = 0;
1567        unsigned long __user *nm = NULL;
1568        unsigned long nr_bits, alloc_size;
1569        nodemask_t bm;
1570
1571        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1572        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1573
1574        if (nmask) {
1575                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1576                nm = compat_alloc_user_space(alloc_size);
1577                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1578        }
1579
1580        if (err)
1581                return -EFAULT;
1582
1583        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1584}
1585
1586#endif
1587
1588/*
1589 * get_vma_policy(@task, @vma, @addr)
1590 * @task - task for fallback if vma policy == default
1591 * @vma   - virtual memory area whose policy is sought
1592 * @addr  - address in @vma for shared policy lookup
1593 *
1594 * Returns effective policy for a VMA at specified address.
1595 * Falls back to @task or system default policy, as necessary.
1596 * Current or other task's task mempolicy and non-shared vma policies must be
1597 * protected by task_lock(task) by the caller.
1598 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1599 * count--added by the get_policy() vm_op, as appropriate--to protect against
1600 * freeing by another task.  It is the caller's responsibility to free the
1601 * extra reference for shared policies.
1602 */
1603struct mempolicy *get_vma_policy(struct task_struct *task,
1604                struct vm_area_struct *vma, unsigned long addr)
1605{
1606        struct mempolicy *pol = get_task_policy(task);
1607
1608        if (vma) {
1609                if (vma->vm_ops && vma->vm_ops->get_policy) {
1610                        struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1611                                                                        addr);
1612                        if (vpol)
1613                                pol = vpol;
1614                } else if (vma->vm_policy) {
1615                        pol = vma->vm_policy;
1616
1617                        /*
1618                         * shmem_alloc_page() passes MPOL_F_SHARED policy with
1619                         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1620                         * count on these policies which will be dropped by
1621                         * mpol_cond_put() later
1622                         */
1623                        if (mpol_needs_cond_ref(pol))
1624                                mpol_get(pol);
1625                }
1626        }
1627        if (!pol)
1628                pol = &default_policy;
1629        return pol;
1630}
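/*
 * For illustration, the caller pattern implied by the locking and
 * reference rules above (alloc_pages_vma() and mpol_misplaced() below
 * are real examples) is roughly:
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 *	... allocate or decide placement according to pol ...
 *	mpol_cond_put(pol);	(drops the ref taken for MPOL_F_SHARED only)
 */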
1631
1632static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1633{
1634        enum zone_type dynamic_policy_zone = policy_zone;
1635
1636        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1637
1638        /*
1639         * If policy->v.nodes has movable memory only,
1640         * we apply the policy only when gfp_zone(gfp) is ZONE_MOVABLE.
1641         *
1642         * policy->v.nodes has already been intersected with
1643         * node_states[N_MEMORY], so if the following test fails, it
1644         * implies policy->v.nodes has movable memory only.
1645         */
1646        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1647                dynamic_policy_zone = ZONE_MOVABLE;
1648
1649        return zone >= dynamic_policy_zone;
1650}
1651
1652/*
1653 * Return a nodemask representing a mempolicy for filtering nodes for
1654 * page allocation
1655 */
1656static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1657{
1658        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1659        if (unlikely(policy->mode == MPOL_BIND) &&
1660                        apply_policy_zone(policy, gfp_zone(gfp)) &&
1661                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1662                return &policy->v.nodes;
1663
1664        return NULL;
1665}
1666
1667/* Return a zonelist indicated by gfp for node representing a mempolicy */
1668static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1669        int nd)
1670{
1671        switch (policy->mode) {
1672        case MPOL_PREFERRED:
1673                if (!(policy->flags & MPOL_F_LOCAL))
1674                        nd = policy->v.preferred_node;
1675                break;
1676        case MPOL_BIND:
1677                /*
1678                 * Normally, MPOL_BIND allocations are node-local within the
1679                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1680                 * current node isn't part of the mask, we use the zonelist for
1681                 * the first node in the mask instead.
1682                 */
1683                if (unlikely(gfp & __GFP_THISNODE) &&
1684                                unlikely(!node_isset(nd, policy->v.nodes)))
1685                        nd = first_node(policy->v.nodes);
1686                break;
1687        default:
1688                BUG();
1689        }
1690        return node_zonelist(nd, gfp);
1691}
1692
1693/* Do dynamic interleaving for a process */
1694static unsigned interleave_nodes(struct mempolicy *policy)
1695{
1696        unsigned nid, next;
1697        struct task_struct *me = current;
1698
1699        nid = me->il_next;
1700        next = next_node(nid, policy->v.nodes);
1701        if (next >= MAX_NUMNODES)
1702                next = first_node(policy->v.nodes);
1703        if (next < MAX_NUMNODES)
1704                me->il_next = next;
1705        return nid;
1706}
1707
1708/*
1709 * Depending on the memory policy provide a node from which to allocate the
1710 * next slab entry.
1711 * @policy must be protected from freeing by the caller.  If @policy is
1712 * the current task's mempolicy, this protection is implicit, as only the
1713 * task can change its policy.  The system default policy requires no
1714 * such protection.
1715 */
1716unsigned slab_node(void)
1717{
1718        struct mempolicy *policy;
1719
1720        if (in_interrupt())
1721                return numa_node_id();
1722
1723        policy = current->mempolicy;
1724        if (!policy || policy->flags & MPOL_F_LOCAL)
1725                return numa_node_id();
1726
1727        switch (policy->mode) {
1728        case MPOL_PREFERRED:
1729                /*
1730                 * handled MPOL_F_LOCAL above
1731                 */
1732                return policy->v.preferred_node;
1733
1734        case MPOL_INTERLEAVE:
1735                return interleave_nodes(policy);
1736
1737        case MPOL_BIND: {
1738                /*
1739                 * Follow bind policy behavior and start allocation at the
1740                 * first node.
1741                 */
1742                struct zonelist *zonelist;
1743                struct zone *zone;
1744                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1745                zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1746                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1747                                                        &policy->v.nodes,
1748                                                        &zone);
1749                return zone ? zone->node : numa_node_id();
1750        }
1751
1752        default:
1753                BUG();
1754        }
1755}
1756
1757/* Do static interleaving for a VMA with known offset. */
1758static unsigned offset_il_node(struct mempolicy *pol,
1759                struct vm_area_struct *vma, unsigned long off)
1760{
1761        unsigned nnodes = nodes_weight(pol->v.nodes);
1762        unsigned target;
1763        int c;
1764        int nid = -1;
1765
1766        if (!nnodes)
1767                return numa_node_id();
1768        target = (unsigned int)off % nnodes;
1769        c = 0;
1770        do {
1771                nid = next_node(nid, pol->v.nodes);
1772                c++;
1773        } while (c <= target);
1774        return nid;
1775}
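/*
 * Worked example: with pol->v.nodes = {0,2,5} and off = 7, nnodes = 3
 * and target = 7 % 3 = 1, so next_node() is walked twice (node 0, then
 * node 2) and node 2 is returned; the offset selects the target-th set
 * node, giving a stable, position-based spread across the nodemask.
 */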
1776
1777/* Determine a node number for interleave */
1778static inline unsigned interleave_nid(struct mempolicy *pol,
1779                 struct vm_area_struct *vma, unsigned long addr, int shift)
1780{
1781        if (vma) {
1782                unsigned long off;
1783
1784                /*
1785                 * for small pages, there is no difference between
1786                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1787                 * for huge pages, since vm_pgoff is in units of small
1788                 * pages, we need to shift off the always 0 bits to get
1789                 * a useful offset.
1790                 */
1791                BUG_ON(shift < PAGE_SHIFT);
1792                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1793                off += (addr - vma->vm_start) >> shift;
1794                return offset_il_node(pol, vma, off);
1795        } else
1796                return interleave_nodes(pol);
1797}
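/*
 * Worked example: for a 2MB huge page mapping (shift = 21, PAGE_SHIFT
 * = 12), a vma with vm_pgoff = 1024 faulting at vm_start + 4MB gives
 * off = (1024 >> 9) + (0x400000 >> 21) = 2 + 2 = 4, i.e. the interleave
 * offset counts whole huge pages from the start of the backing object.
 */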
1798
1799/*
1800 * Return the bit number of a random bit set in the nodemask.
1801 * (returns -1 if nodemask is empty)
1802 */
1803int node_random(const nodemask_t *maskp)
1804{
1805        int w, bit = -1;
1806
1807        w = nodes_weight(*maskp);
1808        if (w)
1809                bit = bitmap_ord_to_pos(maskp->bits,
1810                        get_random_int() % w, MAX_NUMNODES);
1811        return bit;
1812}
1813
1814#ifdef CONFIG_HUGETLBFS
1815/*
1816 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1817 * @vma = virtual memory area whose policy is sought
1818 * @addr = address in @vma for shared policy lookup and interleave policy
1819 * @gfp_flags = for requested zone
1820 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1821 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1822 *
1823 * Returns a zonelist suitable for a huge page allocation and a pointer
1824 * to the struct mempolicy for conditional unref after allocation.
1825 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1826 * @nodemask for filtering the zonelist.
1827 *
1828 * Must be protected by get_mems_allowed()
1829 */
1830struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1831                                gfp_t gfp_flags, struct mempolicy **mpol,
1832                                nodemask_t **nodemask)
1833{
1834        struct zonelist *zl;
1835
1836        *mpol = get_vma_policy(current, vma, addr);
1837        *nodemask = NULL;       /* assume !MPOL_BIND */
1838
1839        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1840                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1841                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1842        } else {
1843                zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1844                if ((*mpol)->mode == MPOL_BIND)
1845                        *nodemask = &(*mpol)->v.nodes;
1846        }
1847        return zl;
1848}
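/*
 * For illustration, the hugetlb fault path uses this roughly as follows
 * (see dequeue_huge_page_vma() in mm/hugetlb.c):
 *
 *	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
 *	... walk zl, honouring nodemask when it is non-NULL ...
 *	mpol_cond_put(mpol);	(drop the ref taken for shared policies)
 */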
1849
1850/*
1851 * init_nodemask_of_mempolicy
1852 *
1853 * If the current task's mempolicy is "default" [NULL], return 'false'
1854 * to indicate default policy.  Otherwise, extract the policy nodemask
1855 * for 'bind' or 'interleave' policy into the argument nodemask, or
1856 * initialize the argument nodemask to contain the single node for
1857 * 'preferred' or 'local' policy and return 'true' to indicate presence
1858 * of non-default mempolicy.
1859 *
1860 * We don't bother with reference counting the mempolicy [mpol_get/put]
1861 * because the current task is examining its own mempolicy and a task's
1862 * mempolicy is only ever changed by the task itself.
1863 *
1864 * N.B., it is the caller's responsibility to free a returned nodemask.
1865 */
1866bool init_nodemask_of_mempolicy(nodemask_t *mask)
1867{
1868        struct mempolicy *mempolicy;
1869        int nid;
1870
1871        if (!(mask && current->mempolicy))
1872                return false;
1873
1874        task_lock(current);
1875        mempolicy = current->mempolicy;
1876        switch (mempolicy->mode) {
1877        case MPOL_PREFERRED:
1878                if (mempolicy->flags & MPOL_F_LOCAL)
1879                        nid = numa_node_id();
1880                else
1881                        nid = mempolicy->v.preferred_node;
1882                init_nodemask_of_node(mask, nid);
1883                break;
1884
1885        case MPOL_BIND:
1886                /* Fall through */
1887        case MPOL_INTERLEAVE:
1888                *mask =  mempolicy->v.nodes;
1889                break;
1890
1891        default:
1892                BUG();
1893        }
1894        task_unlock(current);
1895
1896        return true;
1897}
1898#endif
1899
1900/*
1901 * mempolicy_nodemask_intersects
1902 *
1903 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1904 * policy.  Otherwise, check for intersection between mask and the policy
1905 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1906 * policy, always return true since it may allocate elsewhere on fallback.
1907 *
1908 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1909 */
1910bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1911                                        const nodemask_t *mask)
1912{
1913        struct mempolicy *mempolicy;
1914        bool ret = true;
1915
1916        if (!mask)
1917                return ret;
1918        task_lock(tsk);
1919        mempolicy = tsk->mempolicy;
1920        if (!mempolicy)
1921                goto out;
1922
1923        switch (mempolicy->mode) {
1924        case MPOL_PREFERRED:
1925                /*
1926                 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes
1927                 * to allocate from; they may fall back to other nodes when OOM.
1928                 * Thus, it's possible for tsk to have allocated memory from
1929                 * nodes in mask.
1930                 */
1931                break;
1932        case MPOL_BIND:
1933        case MPOL_INTERLEAVE:
1934                ret = nodes_intersects(mempolicy->v.nodes, *mask);
1935                break;
1936        default:
1937                BUG();
1938        }
1939out:
1940        task_unlock(tsk);
1941        return ret;
1942}
1943
1944/* Allocate a page in interleaved policy.
1945   Own path because it needs to do special accounting. */
1946static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1947                                        unsigned nid)
1948{
1949        struct zonelist *zl;
1950        struct page *page;
1951
1952        zl = node_zonelist(nid, gfp);
1953        page = __alloc_pages(gfp, order, zl);
1954        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1955                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1956        return page;
1957}
1958
1959/**
1960 *      alloc_pages_vma - Allocate a page for a VMA.
1961 *
1962 *      @gfp:
1963 *      %GFP_USER    user allocation.
1964 *      %GFP_KERNEL  kernel allocations,
1965 *      %GFP_HIGHMEM highmem/user allocations,
1966 *      %GFP_FS      allocation should not call back into a file system.
1967 *      %GFP_ATOMIC  don't sleep.
1968 *
1969 *      @order: Order of the GFP allocation.
1970 *      @vma:  Pointer to VMA or NULL if not available.
1971 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1972 *
1973 *      This function allocates a page from the kernel page pool and applies
1974 *      a NUMA policy associated with the VMA or the current process.
1975 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1976 *      mm_struct of the VMA to prevent it from going away. Should be used for
1977 *      all allocations for pages that will be mapped into
1978 *      user space. Returns NULL when no page can be allocated.
1979 *
1980 *      Should be called with the mm_sem of the vma held.
1981 */
1982struct page *
1983alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1984                unsigned long addr, int node)
1985{
1986        struct mempolicy *pol;
1987        struct page *page;
1988        unsigned int cpuset_mems_cookie;
1989
1990retry_cpuset:
1991        pol = get_vma_policy(current, vma, addr);
1992        cpuset_mems_cookie = get_mems_allowed();
1993
1994        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1995                unsigned nid;
1996
1997                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1998                mpol_cond_put(pol);
1999                page = alloc_page_interleave(gfp, order, nid);
2000                if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2001                        goto retry_cpuset;
2002
2003                return page;
2004        }
2005        page = __alloc_pages_nodemask(gfp, order,
2006                                      policy_zonelist(gfp, pol, node),
2007                                      policy_nodemask(gfp, pol));
2008        if (unlikely(mpol_needs_cond_ref(pol)))
2009                __mpol_put(pol);
2010        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2011                goto retry_cpuset;
2012        return page;
2013}
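/*
 * Most callers reach this through the alloc_page_vma() wrapper in
 * <linux/gfp.h>, roughly:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
 *
 * which expands to alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()).
 */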
2014
2015/**
2016 *      alloc_pages_current - Allocate pages.
2017 *
2018 *      @gfp:
2019 *              %GFP_USER   user allocation,
2020 *              %GFP_KERNEL kernel allocation,
2021 *              %GFP_HIGHMEM highmem allocation,
2022 *              %GFP_FS     don't call back into a file system.
2023 *              %GFP_ATOMIC don't sleep.
2024 *      @order: Power of two of allocation size in pages. 0 is a single page.
2025 *
2026 *      Allocate a page from the kernel page pool.  When not in
2027 *      interrupt context, apply the current process' NUMA policy.
2028 *      Returns NULL when no page can be allocated.
2029 *
2030 *      Don't call cpuset_update_task_memory_state() unless
2031 *      1) it's ok to take cpuset_sem (can WAIT), and
2032 *      2) allocating for current task (not interrupt).
2033 */
2034struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2035{
2036        struct mempolicy *pol = get_task_policy(current);
2037        struct page *page;
2038        unsigned int cpuset_mems_cookie;
2039
2040        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2041                pol = &default_policy;
2042
2043retry_cpuset:
2044        cpuset_mems_cookie = get_mems_allowed();
2045
2046        /*
2047         * No reference counting needed for current->mempolicy
2048         * nor system default_policy
2049         */
2050        if (pol->mode == MPOL_INTERLEAVE)
2051                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2052        else
2053                page = __alloc_pages_nodemask(gfp, order,
2054                                policy_zonelist(gfp, pol, numa_node_id()),
2055                                policy_nodemask(gfp, pol));
2056
2057        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2058                goto retry_cpuset;
2059
2060        return page;
2061}
2062EXPORT_SYMBOL(alloc_pages_current);
2063
2064/*
2065 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2066 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2067 * with the mems_allowed returned by cpuset_mems_allowed().  This
2068 * keeps mempolicies cpuset relative after its cpuset moves.  See
2069 * further kernel/cpuset.c update_nodemask().
2070 *
2071 * current's mempolicy may be rebound by another task (the task that changes
2072 * the cpuset's mems), so we needn't do rebind work for the current task.
2073 */
2074
2075/* Slow path of a mempolicy duplicate */
2076struct mempolicy *__mpol_dup(struct mempolicy *old)
2077{
2078        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2079
2080        if (!new)
2081                return ERR_PTR(-ENOMEM);
2082
2083        /* task's mempolicy is protected by alloc_lock */
2084        if (old == current->mempolicy) {
2085                task_lock(current);
2086                *new = *old;
2087                task_unlock(current);
2088        } else
2089                *new = *old;
2090
2091        rcu_read_lock();
2092        if (current_cpuset_is_being_rebound()) {
2093                nodemask_t mems = cpuset_mems_allowed(current);
2094                if (new->flags & MPOL_F_REBINDING)
2095                        mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2096                else
2097                        mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2098        }
2099        rcu_read_unlock();
2100        atomic_set(&new->refcnt, 1);
2101        return new;
2102}
2103
2104/* Slow path of a mempolicy comparison */
2105bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2106{
2107        if (!a || !b)
2108                return false;
2109        if (a->mode != b->mode)
2110                return false;
2111        if (a->flags != b->flags)
2112                return false;
2113        if (mpol_store_user_nodemask(a))
2114                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2115                        return false;
2116
2117        switch (a->mode) {
2118        case MPOL_BIND:
2119                /* Fall through */
2120        case MPOL_INTERLEAVE:
2121                return !!nodes_equal(a->v.nodes, b->v.nodes);
2122        case MPOL_PREFERRED:
2123                return a->v.preferred_node == b->v.preferred_node;
2124        default:
2125                BUG();
2126                return false;
2127        }
2128}
2129
2130/*
2131 * Shared memory backing store policy support.
2132 *
2133 * Remember policies even when nobody has shared memory mapped.
2134 * The policies are kept in Red-Black tree linked from the inode.
2135 * They are protected by the sp->lock spinlock, which should be held
2136 * for any accesses to the tree.
2137 */
2138
2139/* lookup first element intersecting start-end */
2140/* Caller holds sp->lock */
2141static struct sp_node *
2142sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2143{
2144        struct rb_node *n = sp->root.rb_node;
2145
2146        while (n) {
2147                struct sp_node *p = rb_entry(n, struct sp_node, nd);
2148
2149                if (start >= p->end)
2150                        n = n->rb_right;
2151                else if (end <= p->start)
2152                        n = n->rb_left;
2153                else
2154                        break;
2155        }
2156        if (!n)
2157                return NULL;
2158        for (;;) {
2159                struct sp_node *w = NULL;
2160                struct rb_node *prev = rb_prev(n);
2161                if (!prev)
2162                        break;
2163                w = rb_entry(prev, struct sp_node, nd);
2164                if (w->end <= start)
2165                        break;
2166                n = prev;
2167        }
2168        return rb_entry(n, struct sp_node, nd);
2169}
2170
2171/* Insert a new shared policy into the list. */
2172/* Caller holds sp->lock */
2173static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2174{
2175        struct rb_node **p = &sp->root.rb_node;
2176        struct rb_node *parent = NULL;
2177        struct sp_node *nd;
2178
2179        while (*p) {
2180                parent = *p;
2181                nd = rb_entry(parent, struct sp_node, nd);
2182                if (new->start < nd->start)
2183                        p = &(*p)->rb_left;
2184                else if (new->end > nd->end)
2185                        p = &(*p)->rb_right;
2186                else
2187                        BUG();
2188        }
2189        rb_link_node(&new->nd, parent, p);
2190        rb_insert_color(&new->nd, &sp->root);
2191        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2192                 new->policy ? new->policy->mode : 0);
2193}
2194
2195/* Find shared policy intersecting idx */
2196struct mempolicy *
2197mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2198{
2199        struct mempolicy *pol = NULL;
2200        struct sp_node *sn;
2201
2202        if (!sp->root.rb_node)
2203                return NULL;
2204        spin_lock(&sp->lock);
2205        sn = sp_lookup(sp, idx, idx+1);
2206        if (sn) {
2207                mpol_get(sn->policy);
2208                pol = sn->policy;
2209        }
2210        spin_unlock(&sp->lock);
2211        return pol;
2212}
2213
2214static void sp_free(struct sp_node *n)
2215{
2216        mpol_put(n->policy);
2217        kmem_cache_free(sn_cache, n);
2218}
2219
2220/**
2221 * mpol_misplaced - check whether current page node is valid in policy
2222 *
2223 * @page   - page to be checked
2224 * @vma    - vm area where page mapped
2225 * @addr   - virtual address where page mapped
2226 *
2227 * Lookup current policy node id for vma,addr and "compare to" page's
2228 * node id.
2229 *
2230 * Returns:
2231 *      -1      - not misplaced, page is in the right node
2232 *      node    - node id where the page should be
2233 *
2234 * Policy determination "mimics" alloc_page_vma().
2235 * Called from fault path where we know the vma and faulting address.
2236 */
2237int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2238{
2239        struct mempolicy *pol;
2240        struct zone *zone;
2241        int curnid = page_to_nid(page);
2242        unsigned long pgoff;
2243        int polnid = -1;
2244        int ret = -1;
2245
2246        BUG_ON(!vma);
2247
2248        pol = get_vma_policy(current, vma, addr);
2249        if (!(pol->flags & MPOL_F_MOF))
2250                goto out;
2251
2252        switch (pol->mode) {
2253        case MPOL_INTERLEAVE:
2254                BUG_ON(addr >= vma->vm_end);
2255                BUG_ON(addr < vma->vm_start);
2256
2257                pgoff = vma->vm_pgoff;
2258                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2259                polnid = offset_il_node(pol, vma, pgoff);
2260                break;
2261
2262        case MPOL_PREFERRED:
2263                if (pol->flags & MPOL_F_LOCAL)
2264                        polnid = numa_node_id();
2265                else
2266                        polnid = pol->v.preferred_node;
2267                break;
2268
2269        case MPOL_BIND:
2270                /*
2271                 * allows binding to multiple nodes.
2272                 * use current page if in policy nodemask,
2273                 * else select nearest allowed node, if any.
2274                 * If no allowed nodes, use current [!misplaced].
2275                 */
2276                if (node_isset(curnid, pol->v.nodes))
2277                        goto out;
2278                (void)first_zones_zonelist(
2279                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2280                                gfp_zone(GFP_HIGHUSER),
2281                                &pol->v.nodes, &zone);
2282                polnid = zone->node;
2283                break;
2284
2285        default:
2286                BUG();
2287        }
2288
2289        /* Migrate the page towards the node whose CPU is referencing it */
2290        if (pol->flags & MPOL_F_MORON) {
2291                int last_nid;
2292
2293                polnid = numa_node_id();
2294
2295                /*
2296                 * Multi-stage node selection is used in conjunction
2297                 * with a periodic migration fault to build a temporal
2298                 * task<->page relation. By using a two-stage filter we
2299                 * remove short/unlikely relations.
2300                 *
2301                 * Using P(p) ~ n_p / n_t as per frequentist
2302                 * probability, we can equate a task's usage of a
2303                 * particular page (n_p) per total usage of this
2304                 * page (n_t) (in a given time-span) to a probability.
2305                 *
2306                 * Our periodic faults will sample this probability and
2307                 * getting the same result twice in a row, given these
2308                 * samples are fully independent, is then given by
2309                 * P(n)^2, provided our sample period is sufficiently
2310                 * short compared to the usage pattern.
2311                 *
2312                 * This quadratic squishes small probabilities, making
2313                 * it less likely we act on an unlikely task<->page
2314                 * relation.
2315                 */
2316                last_nid = page_nid_xchg_last(page, polnid);
2317                if (last_nid != polnid)
2318                        goto out;
2319        }
2320
2321        if (curnid != polnid)
2322                ret = polnid;
2323out:
2324        mpol_cond_put(pol);
2325
2326        return ret;
2327}
2328
2329static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2330{
2331        pr_debug("deleting %lx-%lx\n", n->start, n->end);
2332        rb_erase(&n->nd, &sp->root);
2333        sp_free(n);
2334}
2335
2336static void sp_node_init(struct sp_node *node, unsigned long start,
2337                        unsigned long end, struct mempolicy *pol)
2338{
2339        node->start = start;
2340        node->end = end;
2341        node->policy = pol;
2342}
2343
2344static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2345                                struct mempolicy *pol)
2346{
2347        struct sp_node *n;
2348        struct mempolicy *newpol;
2349
2350        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2351        if (!n)
2352                return NULL;
2353
2354        newpol = mpol_dup(pol);
2355        if (IS_ERR(newpol)) {
2356                kmem_cache_free(sn_cache, n);
2357                return NULL;
2358        }
2359        newpol->flags |= MPOL_F_SHARED;
2360        sp_node_init(n, start, end, newpol);
2361
2362        return n;
2363}
2364
2365/* Replace a policy range. */
2366static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2367                                 unsigned long end, struct sp_node *new)
2368{
2369        struct sp_node *n;
2370        struct sp_node *n_new = NULL;
2371        struct mempolicy *mpol_new = NULL;
2372        int ret = 0;
2373
2374restart:
2375        spin_lock(&sp->lock);
2376        n = sp_lookup(sp, start, end);
2377        /* Take care of old policies in the same range. */
2378        while (n && n->start < end) {
2379                struct rb_node *next = rb_next(&n->nd);
2380                if (n->start >= start) {
2381                        if (n->end <= end)
2382                                sp_delete(sp, n);
2383                        else
2384                                n->start = end;
2385                } else {
2386                        /* Old policy spanning whole new range. */
2387                        if (n->end > end) {
2388                                if (!n_new)
2389                                        goto alloc_new;
2390
2391                                *mpol_new = *n->policy;
2392                                atomic_set(&mpol_new->refcnt, 1);
2393                                sp_node_init(n_new, end, n->end, mpol_new);
2394                                n->end = start;
2395                                sp_insert(sp, n_new);
2396                                n_new = NULL;
2397                                mpol_new = NULL;
2398                                break;
2399                        } else
2400                                n->end = start;
2401                }
2402                if (!next)
2403                        break;
2404                n = rb_entry(next, struct sp_node, nd);
2405        }
2406        if (new)
2407                sp_insert(sp, new);
2408        spin_unlock(&sp->lock);
2409        ret = 0;
2410
2411err_out:
2412        if (mpol_new)
2413                mpol_put(mpol_new);
2414        if (n_new)
2415                kmem_cache_free(sn_cache, n_new);
2416
2417        return ret;
2418
2419alloc_new:
2420        spin_unlock(&sp->lock);
2421        ret = -ENOMEM;
2422        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2423        if (!n_new)
2424                goto err_out;
2425        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2426        if (!mpol_new)
2427                goto err_out;
2428        goto restart;
2429}
2430
2431/**
2432 * mpol_shared_policy_init - initialize shared policy for inode
2433 * @sp: pointer to inode shared policy
2434 * @mpol:  struct mempolicy to install
2435 *
2436 * Install non-NULL @mpol in inode's shared policy rb-tree.
2437 * On entry, the current task has a reference on a non-NULL @mpol.
2438 * This must be released on exit.
2439 * This is called during get_inode() calls, so we can use GFP_KERNEL.
2440 */
2441void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2442{
2443        int ret;
2444
2445        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2446        spin_lock_init(&sp->lock);
2447
2448        if (mpol) {
2449                struct vm_area_struct pvma;
2450                struct mempolicy *new;
2451                NODEMASK_SCRATCH(scratch);
2452
2453                if (!scratch)
2454                        goto put_mpol;
2455                /* contextualize the tmpfs mount point mempolicy */
2456                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2457                if (IS_ERR(new))
2458                        goto free_scratch; /* no valid nodemask intersection */
2459
2460                task_lock(current);
2461                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2462                task_unlock(current);
2463                if (ret)
2464                        goto put_new;
2465
2466                /* Create pseudo-vma that contains just the policy */
2467                memset(&pvma, 0, sizeof(struct vm_area_struct));
2468                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2469                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2470
2471put_new:
2472                mpol_put(new);                  /* drop initial ref */
2473free_scratch:
2474                NODEMASK_SCRATCH_FREE(scratch);
2475put_mpol:
2476                mpol_put(mpol); /* drop our incoming ref on sb mpol */
2477        }
2478}
2479
2480int mpol_set_shared_policy(struct shared_policy *info,
2481                        struct vm_area_struct *vma, struct mempolicy *npol)
2482{
2483        int err;
2484        struct sp_node *new = NULL;
2485        unsigned long sz = vma_pages(vma);
2486
2487        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2488                 vma->vm_pgoff,
2489                 sz, npol ? npol->mode : -1,
2490                 npol ? npol->flags : -1,
2491                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2492
2493        if (npol) {
2494                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2495                if (!new)
2496                        return -ENOMEM;
2497        }
2498        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2499        if (err && new)
2500                sp_free(new);
2501        return err;
2502}
2503
2504/* Free a backing policy store on inode delete. */
2505void mpol_free_shared_policy(struct shared_policy *p)
2506{
2507        struct sp_node *n;
2508        struct rb_node *next;
2509
2510        if (!p->root.rb_node)
2511                return;
2512        spin_lock(&p->lock);
2513        next = rb_first(&p->root);
2514        while (next) {
2515                n = rb_entry(next, struct sp_node, nd);
2516                next = rb_next(&n->nd);
2517                sp_delete(p, n);
2518        }
2519        spin_unlock(&p->lock);
2520}
2521
2522#ifdef CONFIG_NUMA_BALANCING
2523static bool __initdata numabalancing_override;
2524
2525static void __init check_numabalancing_enable(void)
2526{
2527        bool numabalancing_default = false;
2528
2529        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2530                numabalancing_default = true;
2531
2532        if (nr_node_ids > 1 && !numabalancing_override) {
2533                printk(KERN_INFO "Enabling automatic NUMA balancing. "
2534                        "Configure with numa_balancing= or sysctl\n");
2535                set_numabalancing_state(numabalancing_default);
2536        }
2537}
2538
2539static int __init setup_numabalancing(char *str)
2540{
2541        int ret = 0;
2542        if (!str)
2543                goto out;
2544        numabalancing_override = true;
2545
2546        if (!strcmp(str, "enable")) {
2547                set_numabalancing_state(true);
2548                ret = 1;
2549        } else if (!strcmp(str, "disable")) {
2550                set_numabalancing_state(false);
2551                ret = 1;
2552        }
2553out:
2554        if (!ret)
2555                printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2556
2557        return ret;
2558}
2559__setup("numa_balancing=", setup_numabalancing);
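/*
 * Example: booting with "numa_balancing=disable" on the kernel command
 * line turns balancing off even when
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is set; "numa_balancing=enable"
 * forces it on.
 */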
2560#else
2561static inline void __init check_numabalancing_enable(void)
2562{
2563}
2564#endif /* CONFIG_NUMA_BALANCING */
2565
2566/* assumes fs == KERNEL_DS */
2567void __init numa_policy_init(void)
2568{
2569        nodemask_t interleave_nodes;
2570        unsigned long largest = 0;
2571        int nid, prefer = 0;
2572
2573        policy_cache = kmem_cache_create("numa_policy",
2574                                         sizeof(struct mempolicy),
2575                                         0, SLAB_PANIC, NULL);
2576
2577        sn_cache = kmem_cache_create("shared_policy_node",
2578                                     sizeof(struct sp_node),
2579                                     0, SLAB_PANIC, NULL);
2580
2581        for_each_node(nid) {
2582                preferred_node_policy[nid] = (struct mempolicy) {
2583                        .refcnt = ATOMIC_INIT(1),
2584                        .mode = MPOL_PREFERRED,
2585                        .flags = MPOL_F_MOF | MPOL_F_MORON,
2586                        .v = { .preferred_node = nid, },
2587                };
2588        }
2589
2590        /*
2591         * Set interleaving policy for system init. Interleaving is only
2592         * enabled across suitably sized nodes (default is >= 16MB), or
2593         * fall back to the largest node if they're all smaller.
2594         */
2595        nodes_clear(interleave_nodes);
2596        for_each_node_state(nid, N_MEMORY) {
2597                unsigned long total_pages = node_present_pages(nid);
2598
2599                /* Preserve the largest node */
2600                if (largest < total_pages) {
2601                        largest = total_pages;
2602                        prefer = nid;
2603                }
2604
2605                /* Interleave this node? */
2606                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2607                        node_set(nid, interleave_nodes);
2608        }
2609
2610        /* All too small, use the largest */
2611        if (unlikely(nodes_empty(interleave_nodes)))
2612                node_set(prefer, interleave_nodes);
2613
2614        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2615                printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2616
2617        check_numabalancing_enable();
2618}
2619
2620/* Reset policy of current process to default */
2621void numa_default_policy(void)
2622{
2623        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2624}
2625
2626/*
2627 * Parse and format mempolicy from/to strings
2628 */
2629
2630/*
2631 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2632 */
2633static const char * const policy_modes[] =
2634{
2635        [MPOL_DEFAULT]    = "default",
2636        [MPOL_PREFERRED]  = "prefer",
2637        [MPOL_BIND]       = "bind",
2638        [MPOL_INTERLEAVE] = "interleave",
2639        [MPOL_LOCAL]      = "local",
2640};
2641
2642
2643#ifdef CONFIG_TMPFS
2644/**
2645 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2646 * @str:  string containing mempolicy to parse
2647 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2648 *
2649 * Format of input:
2650 *      <mode>[=<flags>][:<nodelist>]
2651 *
2652 * On success, returns 0, else 1
2653 */
2654int mpol_parse_str(char *str, struct mempolicy **mpol)
2655{
2656        struct mempolicy *new = NULL;
2657        unsigned short mode;
2658        unsigned short mode_flags;
2659        nodemask_t nodes;
2660        char *nodelist = strchr(str, ':');
2661        char *flags = strchr(str, '=');
2662        int err = 1;
2663
2664        if (nodelist) {
2665                /* NUL-terminate mode or flags string */
2666                *nodelist++ = '\0';
2667                if (nodelist_parse(nodelist, nodes))
2668                        goto out;
2669                if (!nodes_subset(nodes, node_states[N_MEMORY]))
2670                        goto out;
2671        } else
2672                nodes_clear(nodes);
2673
2674        if (flags)
2675                *flags++ = '\0';        /* terminate mode string */
2676
2677        for (mode = 0; mode < MPOL_MAX; mode++) {
2678                if (!strcmp(str, policy_modes[mode])) {
2679                        break;
2680                }
2681        }
2682        if (mode >= MPOL_MAX)
2683                goto out;
2684
2685        switch (mode) {
2686        case MPOL_PREFERRED:
2687                /*
2688                 * Insist on a nodelist of one node only
2689                 */
2690                if (nodelist) {
2691                        char *rest = nodelist;
2692                        while (isdigit(*rest))
2693                                rest++;
2694                        if (*rest)
2695                                goto out;
2696                }
2697                break;
2698        case MPOL_INTERLEAVE:
2699                /*
2700                 * Default to online nodes with memory if no nodelist
2701                 */
2702                if (!nodelist)
2703                        nodes = node_states[N_MEMORY];
2704                break;
2705        case MPOL_LOCAL:
2706                /*
2707                 * Don't allow a nodelist;  mpol_new() checks flags
2708                 */
2709                if (nodelist)
2710                        goto out;
2711                mode = MPOL_PREFERRED;
2712                break;
2713        case MPOL_DEFAULT:
2714                /*
2715                 * Insist on an empty nodelist
2716                 */
2717                if (!nodelist)
2718                        err = 0;
2719                goto out;
2720        case MPOL_BIND:
2721                /*
2722                 * Insist on a nodelist
2723                 */
2724                if (!nodelist)
2725                        goto out;
2726        }
2727
2728        mode_flags = 0;
2729        if (flags) {
2730                /*
2731                 * Currently, we only support two mutually exclusive
2732                 * mode flags.
2733                 */
2734                if (!strcmp(flags, "static"))
2735                        mode_flags |= MPOL_F_STATIC_NODES;
2736                else if (!strcmp(flags, "relative"))
2737                        mode_flags |= MPOL_F_RELATIVE_NODES;
2738                else
2739                        goto out;
2740        }
2741
2742        new = mpol_new(mode, mode_flags, &nodes);
2743        if (IS_ERR(new))
2744                goto out;
2745
2746        /*
2747         * Save nodes for mpol_to_str() to show the tmpfs mount options
2748         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2749         */
2750        if (mode != MPOL_PREFERRED)
2751                new->v.nodes = nodes;
2752        else if (nodelist)
2753                new->v.preferred_node = first_node(nodes);
2754        else
2755                new->flags |= MPOL_F_LOCAL;
2756
2757        /*
2758         * Save nodes for contextualization: this will be used to "clone"
2759         * the mempolicy in a specific context [cpuset] at a later time.
2760         */
2761        new->w.user_nodemask = nodes;
2762
2763        err = 0;
2764
2765out:
2766        /* Restore string for error message */
2767        if (nodelist)
2768                *--nodelist = ':';
2769        if (flags)
2770                *--flags = '=';
2771        if (!err)
2772                *mpol = new;
2773        return err;
2774}
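/*
 * Example strings in the format above, as passed via the tmpfs "mpol="
 * mount option:
 *
 *	mount -t tmpfs -o mpol=interleave:0-3  tmpfs /mnt
 *	mount -t tmpfs -o mpol=prefer=static:1 tmpfs /mnt
 *	mount -t tmpfs -o mpol=bind:0,2        tmpfs /mnt
 *	mount -t tmpfs -o mpol=local           tmpfs /mnt
 */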
2775#endif /* CONFIG_TMPFS */
2776
2777/**
2778 * mpol_to_str - format a mempolicy structure for printing
2779 * @buffer:  to contain formatted mempolicy string
2780 * @maxlen:  length of @buffer
2781 * @pol:  pointer to mempolicy to be formatted
2782 *
2783 * Convert a mempolicy into a string.
2784 * Returns the number of characters in buffer (if positive)
2785 * or an error (negative)
2786 */
2787int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2788{
2789        char *p = buffer;
2790        int l;
2791        nodemask_t nodes;
2792        unsigned short mode;
2793        unsigned short flags = pol ? pol->flags : 0;
2794
2795        /*
2796         * Sanity check:  room for longest mode, flag and some nodes
2797         */
2798        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2799
2800        if (!pol || pol == &default_policy)
2801                mode = MPOL_DEFAULT;
2802        else
2803                mode = pol->mode;
2804
2805        switch (mode) {
2806        case MPOL_DEFAULT:
2807                nodes_clear(nodes);
2808                break;
2809
2810        case MPOL_PREFERRED:
2811                nodes_clear(nodes);
2812                if (flags & MPOL_F_LOCAL)
2813                        mode = MPOL_LOCAL;
2814                else
2815                        node_set(pol->v.preferred_node, nodes);
2816                break;
2817
2818        case MPOL_BIND:
2819                /* Fall through */
2820        case MPOL_INTERLEAVE:
2821                nodes = pol->v.nodes;
2822                break;
2823
2824        default:
2825                return -EINVAL;
2826        }
2827
2828        l = strlen(policy_modes[mode]);
2829        if (buffer + maxlen < p + l + 1)
2830                return -ENOSPC;
2831
2832        strcpy(p, policy_modes[mode]);
2833        p += l;
2834
2835        if (flags & MPOL_MODE_FLAGS) {
2836                if (buffer + maxlen < p + 2)
2837                        return -ENOSPC;
2838                *p++ = '=';
2839
2840                /*
2841                 * Currently, the only defined flags are mutually exclusive
2842                 */
2843                if (flags & MPOL_F_STATIC_NODES)
2844                        p += snprintf(p, buffer + maxlen - p, "static");
2845                else if (flags & MPOL_F_RELATIVE_NODES)
2846                        p += snprintf(p, buffer + maxlen - p, "relative");
2847        }
2848
2849        if (!nodes_empty(nodes)) {
2850                if (buffer + maxlen < p + 2)
2851                        return -ENOSPC;
2852                *p++ = ':';
2853                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2854        }
2855        return p - buffer;
2856}
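/*
 * Example outputs (as seen, e.g., in the tmpfs mpol= field of
 * /proc/mounts): "default", "prefer=relative:3", "interleave:0-3",
 * "bind=static:0,2", "local".
 */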
2857