/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Supports four policies per VMA and per process:
 *   default    - allocate on the local node, as the system would without
 *                any policy
 *   preferred  - try the given node first and fall back to other nodes
 *   bind       - allocate only from the given set of nodes
 *   interleave - spread allocations page by page across the given nodes
 */
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/slab.h>
77#include <linux/string.h>
78#include <linux/export.h>
79#include <linux/nsproxy.h>
80#include <linux/interrupt.h>
81#include <linux/init.h>
82#include <linux/compat.h>
83#include <linux/swap.h>
84#include <linux/seq_file.h>
85#include <linux/proc_fs.h>
86#include <linux/migrate.h>
87#include <linux/ksm.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
94
95#include <asm/tlbflush.h>
96#include <asm/uaccess.h>
97#include <linux/random.h>
98
99#include "internal.h"
100
101
102#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)
103#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)
104
105static struct kmem_cache *policy_cache;
106static struct kmem_cache *sn_cache;
107
108
109
110enum zone_type policy_zone = 0;
111
112
113
114
115static struct mempolicy default_policy = {
116 .refcnt = ATOMIC_INIT(1),
117 .mode = MPOL_PREFERRED,
118 .flags = MPOL_F_LOCAL,
119};
120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139}
140
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the policy remembers a node mask, rebind() remaps it after a
	 * cpuset change.  MPOL_REBIND_ONCE does the remap in a single pass;
	 * MPOL_REBIND_STEP1 first widens the mask and MPOL_REBIND_STEP2 then
	 * narrows it, so the mask is never left empty in between.
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
160
161
162static int is_valid_nodemask(const nodemask_t *nodemask)
163{
164 int nd, k;
165
166 for_each_node_mask(nd, *nodemask) {
167 struct zone *z;
168
169 for (k = 0; k <= policy_zone; k++) {
170 z = &NODE_DATA(nd)->node_zones[k];
171 if (z->present_pages > 0)
172 return 1;
173 }
174 }
175
176 return 0;
177}
178
179static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
180{
181 return pol->flags & MPOL_MODE_FLAGS;
182}
183
184static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
185 const nodemask_t *rel)
186{
187 nodemask_t tmp;
188 nodes_fold(tmp, *orig, nodes_weight(*rel));
189 nodes_onto(*ret, tmp, *rel);
190}
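
/*
 * Worked example (illustrative, not from the original source): with
 * MPOL_F_RELATIVE_NODES the user's mask selects nodes by position within
 * the cpuset's mems_allowed rather than by absolute node number.  If
 * mems_allowed is {4,6,7} (weight 3), a relative mask of {0,2} maps to
 * nodes {4,7}; a bit beyond the weight is folded modulo 3 first, so a
 * relative mask of {5} maps to node 7.
 */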
191
192static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
193{
194 if (nodes_empty(*nodes))
195 return -EINVAL;
196 pol->v.nodes = *nodes;
197 return 0;
198}
199
200static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
201{
202 if (!nodes)
203 pol->flags |= MPOL_F_LOCAL;
204 else if (nodes_empty(*nodes))
205 return -EINVAL;
206 else
207 pol->v.preferred_node = first_node(*nodes);
208 return 0;
209}
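
/*
 * Illustrative note: an empty nodemask with MPOL_PREFERRED is the
 * documented way to request "prefer the local node".  For example, a
 * user-space call such as
 *
 *	set_mempolicy(MPOL_PREFERRED, NULL, 0);
 *
 * reaches mpol_new_preferred() with nodes == NULL and simply sets
 * MPOL_F_LOCAL.
 */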
210
211static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
212{
213 if (!is_valid_nodemask(nodes))
214 return -EINVAL;
215 pol->v.nodes = *nodes;
216 return 0;
217}
218
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags, but an empty
 * nodemask with MPOL_PREFERRED still has to be handled here.
 *
 * Must be called holding the task's alloc_lock to protect the task's
 * mems_allowed and mempolicy.
 */
228static int mpol_set_nodemask(struct mempolicy *pol,
229 const nodemask_t *nodes, struct nodemask_scratch *nsc)
230{
231 int ret;
232
233
234 if (pol == NULL)
235 return 0;
236
237 nodes_and(nsc->mask1,
238 cpuset_current_mems_allowed, node_states[N_MEMORY]);
239
240 VM_BUG_ON(!nodes);
241 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
242 nodes = NULL;
243 else {
244 if (pol->flags & MPOL_F_RELATIVE_NODES)
245 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
246 else
247 nodes_and(nsc->mask2, *nodes, nsc->mask1);
248
249 if (mpol_store_user_nodemask(pol))
250 pol->w.user_nodemask = *nodes;
251 else
252 pol->w.cpuset_mems_allowed =
253 cpuset_current_mems_allowed;
254 }
255
256 if (nodes)
257 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
258 else
259 ret = mpol_ops[pol->mode].create(pol, NULL);
260 return ret;
261}

/*
 * This function just creates a new policy, does some sanity checking and
 * simple initialization.  The caller must invoke mpol_set_nodemask() to
 * set up the nodemask.
 */
267static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
268 nodemask_t *nodes)
269{
270 struct mempolicy *policy;
271
272 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
273 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
274
275 if (mode == MPOL_DEFAULT) {
276 if (nodes && !nodes_empty(*nodes))
277 return ERR_PTR(-EINVAL);
278 return NULL;
279 }
280 VM_BUG_ON(!nodes);
281
282
283
284
285
286
287 if (mode == MPOL_PREFERRED) {
288 if (nodes_empty(*nodes)) {
289 if (((flags & MPOL_F_STATIC_NODES) ||
290 (flags & MPOL_F_RELATIVE_NODES)))
291 return ERR_PTR(-EINVAL);
292 }
293 } else if (mode == MPOL_LOCAL) {
294 if (!nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 mode = MPOL_PREFERRED;
297 } else if (nodes_empty(*nodes))
298 return ERR_PTR(-EINVAL);
299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
300 if (!policy)
301 return ERR_PTR(-ENOMEM);
302 atomic_set(&policy->refcnt, 1);
303 policy->mode = mode;
304 policy->flags = flags;
305
306 return policy;
307}
308
309
310void __mpol_put(struct mempolicy *p)
311{
312 if (!atomic_dec_and_test(&p->refcnt))
313 return;
314 kmem_cache_free(policy_cache, p);
315}
316
317static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
318 enum mpol_rebind_step step)
319{
320}
321
322
323
324
325
326
327
328static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
329 enum mpol_rebind_step step)
330{
331 nodemask_t tmp;
332
333 if (pol->flags & MPOL_F_STATIC_NODES)
334 nodes_and(tmp, pol->w.user_nodemask, *nodes);
335 else if (pol->flags & MPOL_F_RELATIVE_NODES)
336 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
337 else {
338
339
340
341
342 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
343 nodes_remap(tmp, pol->v.nodes,
344 pol->w.cpuset_mems_allowed, *nodes);
345 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
346 } else if (step == MPOL_REBIND_STEP2) {
347 tmp = pol->w.cpuset_mems_allowed;
348 pol->w.cpuset_mems_allowed = *nodes;
349 } else
350 BUG();
351 }
352
353 if (nodes_empty(tmp))
354 tmp = *nodes;
355
356 if (step == MPOL_REBIND_STEP1)
357 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
358 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
359 pol->v.nodes = tmp;
360 else
361 BUG();
362
363 if (!node_isset(current->il_next, tmp)) {
364 current->il_next = next_node(current->il_next, tmp);
365 if (current->il_next >= MAX_NUMNODES)
366 current->il_next = first_node(tmp);
367 if (current->il_next >= MAX_NUMNODES)
368 current->il_next = numa_node_id();
369 }
370}
371
372static void mpol_rebind_preferred(struct mempolicy *pol,
373 const nodemask_t *nodes,
374 enum mpol_rebind_step step)
375{
376 nodemask_t tmp;
377
378 if (pol->flags & MPOL_F_STATIC_NODES) {
379 int node = first_node(pol->w.user_nodemask);
380
381 if (node_isset(node, *nodes)) {
382 pol->v.preferred_node = node;
383 pol->flags &= ~MPOL_F_LOCAL;
384 } else
385 pol->flags |= MPOL_F_LOCAL;
386 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
387 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
388 pol->v.preferred_node = first_node(tmp);
389 } else if (!(pol->flags & MPOL_F_LOCAL)) {
390 pol->v.preferred_node = node_remap(pol->v.preferred_node,
391 pol->w.cpuset_mems_allowed,
392 *nodes);
393 pol->w.cpuset_mems_allowed = *nodes;
394 }
395}
396
/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_sem; task policies by the task's
 * alloc_lock.  @step selects how the rebind is carried out:
 *
 *	MPOL_REBIND_ONCE  - do the whole rebind in a single pass
 *	MPOL_REBIND_STEP1 - first pass: add the newly allowed nodes
 *	MPOL_REBIND_STEP2 - second pass: drop the no longer allowed nodes
 */
413static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
414 enum mpol_rebind_step step)
415{
416 if (!pol)
417 return;
418 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
419 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
420 return;
421
422 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
423 return;
424
425 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
426 BUG();
427
428 if (step == MPOL_REBIND_STEP1)
429 pol->flags |= MPOL_F_REBINDING;
430 else if (step == MPOL_REBIND_STEP2)
431 pol->flags &= ~MPOL_F_REBINDING;
432 else if (step >= MPOL_REBIND_NSTEP)
433 BUG();
434
435 mpol_ops[pol->mode].rebind(pol, newmask, step);
436}
437
438
439
440
441
442
443
444
445void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
446 enum mpol_rebind_step step)
447{
448 mpol_rebind_policy(tsk->mempolicy, new, step);
449}
450
451
452
453
454
455
456
457void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
458{
459 struct vm_area_struct *vma;
460
461 down_write(&mm->mmap_sem);
462 for (vma = mm->mmap; vma; vma = vma->vm_next)
463 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
464 up_write(&mm->mmap_sem);
465}
466
467static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
468 [MPOL_DEFAULT] = {
469 .rebind = mpol_rebind_default,
470 },
471 [MPOL_INTERLEAVE] = {
472 .create = mpol_new_interleave,
473 .rebind = mpol_rebind_nodemask,
474 },
475 [MPOL_PREFERRED] = {
476 .create = mpol_new_preferred,
477 .rebind = mpol_rebind_preferred,
478 },
479 [MPOL_BIND] = {
480 .create = mpol_new_bind,
481 .rebind = mpol_rebind_nodemask,
482 },
483};
484
485static void migrate_page_add(struct page *page, struct list_head *pagelist,
486 unsigned long flags);
487
488
489static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
490 unsigned long addr, unsigned long end,
491 const nodemask_t *nodes, unsigned long flags,
492 void *private)
493{
494 pte_t *orig_pte;
495 pte_t *pte;
496 spinlock_t *ptl;
497
498 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
499 do {
500 struct page *page;
501 int nid;
502
503 if (!pte_present(*pte))
504 continue;
505 page = vm_normal_page(vma, addr, *pte);
506 if (!page)
507 continue;
508
509
510
511
512
513 if (PageReserved(page) || PageKsm(page))
514 continue;
515 nid = page_to_nid(page);
516 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
517 continue;
518
519 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
520 migrate_page_add(page, private, flags);
521 else
522 break;
523 } while (pte++, addr += PAGE_SIZE, addr != end);
524 pte_unmap_unlock(orig_pte, ptl);
525 return addr != end;
526}
527
528static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
529 unsigned long addr, unsigned long end,
530 const nodemask_t *nodes, unsigned long flags,
531 void *private)
532{
533 pmd_t *pmd;
534 unsigned long next;
535
536 pmd = pmd_offset(pud, addr);
537 do {
538 next = pmd_addr_end(addr, end);
539 split_huge_page_pmd(vma, addr, pmd);
540 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
541 continue;
542 if (check_pte_range(vma, pmd, addr, next, nodes,
543 flags, private))
544 return -EIO;
545 } while (pmd++, addr = next, addr != end);
546 return 0;
547}
548
549static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
550 unsigned long addr, unsigned long end,
551 const nodemask_t *nodes, unsigned long flags,
552 void *private)
553{
554 pud_t *pud;
555 unsigned long next;
556
557 pud = pud_offset(pgd, addr);
558 do {
559 next = pud_addr_end(addr, end);
560 if (pud_none_or_clear_bad(pud))
561 continue;
562 if (check_pmd_range(vma, pud, addr, next, nodes,
563 flags, private))
564 return -EIO;
565 } while (pud++, addr = next, addr != end);
566 return 0;
567}
568
569static inline int check_pgd_range(struct vm_area_struct *vma,
570 unsigned long addr, unsigned long end,
571 const nodemask_t *nodes, unsigned long flags,
572 void *private)
573{
574 pgd_t *pgd;
575 unsigned long next;
576
577 pgd = pgd_offset(vma->vm_mm, addr);
578 do {
579 next = pgd_addr_end(addr, end);
580 if (pgd_none_or_clear_bad(pgd))
581 continue;
582 if (check_pud_range(vma, pgd, addr, next, nodes,
583 flags, private))
584 return -EIO;
585 } while (pgd++, addr = next, addr != end);
586 return 0;
587}
588
589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590
591
592
593
594
595
596
597
598
599unsigned long change_prot_numa(struct vm_area_struct *vma,
600 unsigned long addr, unsigned long end)
601{
602 int nr_updated;
603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604
605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 if (nr_updated)
607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608
609 return nr_updated;
610}
611#else
612static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 unsigned long addr, unsigned long end)
614{
615 return 0;
616}
617#endif
618
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
624static struct vm_area_struct *
625check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
626 const nodemask_t *nodes, unsigned long flags, void *private)
627{
628 int err;
629 struct vm_area_struct *first, *vma, *prev;
630
631
632 first = find_vma(mm, start);
633 if (!first)
634 return ERR_PTR(-EFAULT);
635 prev = NULL;
636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 unsigned long endvma = vma->vm_end;
638
639 if (endvma > end)
640 endvma = end;
641 if (vma->vm_start > start)
642 start = vma->vm_start;
643
644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
645 if (!vma->vm_next && vma->vm_end < end)
646 return ERR_PTR(-EFAULT);
647 if (prev && prev->vm_end < vma->vm_start)
648 return ERR_PTR(-EFAULT);
649 }
650
651 if (is_vm_hugetlb_page(vma))
652 goto next;
653
654 if (flags & MPOL_MF_LAZY) {
655 change_prot_numa(vma, start, endvma);
656 goto next;
657 }
658
659 if ((flags & MPOL_MF_STRICT) ||
660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
661 vma_migratable(vma))) {
662
663 err = check_pgd_range(vma, start, endvma, nodes,
664 flags, private);
665 if (err) {
666 first = ERR_PTR(err);
667 break;
668 }
669 }
670next:
671 prev = vma;
672 }
673 return first;
674}
675
/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_sem held for writing.
 */
680static int vma_replace_policy(struct vm_area_struct *vma,
681 struct mempolicy *pol)
682{
683 int err;
684 struct mempolicy *old;
685 struct mempolicy *new;
686
687 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
688 vma->vm_start, vma->vm_end, vma->vm_pgoff,
689 vma->vm_ops, vma->vm_file,
690 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
691
692 new = mpol_dup(pol);
693 if (IS_ERR(new))
694 return PTR_ERR(new);
695
696 if (vma->vm_ops && vma->vm_ops->set_policy) {
697 err = vma->vm_ops->set_policy(vma, new);
698 if (err)
699 goto err_out;
700 }
701
702 old = vma->vm_policy;
703 vma->vm_policy = new;
704 mpol_put(old);
705
706 return 0;
707 err_out:
708 mpol_put(new);
709 return err;
710}
711
/* Apply the new policy to a range, splitting vmas as needed. */
713static int mbind_range(struct mm_struct *mm, unsigned long start,
714 unsigned long end, struct mempolicy *new_pol)
715{
716 struct vm_area_struct *next;
717 struct vm_area_struct *prev;
718 struct vm_area_struct *vma;
719 int err = 0;
720 pgoff_t pgoff;
721 unsigned long vmstart;
722 unsigned long vmend;
723
724 vma = find_vma(mm, start);
725 if (!vma || vma->vm_start > start)
726 return -EFAULT;
727
728 prev = vma->vm_prev;
729 if (start > vma->vm_start)
730 prev = vma;
731
732 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
733 next = vma->vm_next;
734 vmstart = max(start, vma->vm_start);
735 vmend = min(end, vma->vm_end);
736
737 if (mpol_equal(vma_policy(vma), new_pol))
738 continue;
739
740 pgoff = vma->vm_pgoff +
741 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
742 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
743 vma->anon_vma, vma->vm_file, pgoff,
744 new_pol);
745 if (prev) {
746 vma = prev;
747 next = vma->vm_next;
748 continue;
749 }
750 if (vma->vm_start != vmstart) {
751 err = split_vma(vma->vm_mm, vma, vmstart, 1);
752 if (err)
753 goto out;
754 }
755 if (vma->vm_end != vmend) {
756 err = split_vma(vma->vm_mm, vma, vmend, 0);
757 if (err)
758 goto out;
759 }
760 err = vma_replace_policy(vma, new_pol);
761 if (err)
762 goto out;
763 }
764
765 out:
766 return err;
767}
768
/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default mempolicy.
 * Allows more rapid checking of this (combined perhaps with other
 * PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be a
 * newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * It is also safe to call this with a task pointer of current, which the
 * static wrapper mpol_set_task_struct_flag() below does, for use within
 * this file.
 */
786void mpol_fix_fork_child_flag(struct task_struct *p)
787{
788 if (p->mempolicy)
789 p->flags |= PF_MEMPOLICY;
790 else
791 p->flags &= ~PF_MEMPOLICY;
792}
793
794static void mpol_set_task_struct_flag(void)
795{
796 mpol_fix_fork_child_flag(current);
797}
798
799
800static long do_set_mempolicy(unsigned short mode, unsigned short flags,
801 nodemask_t *nodes)
802{
803 struct mempolicy *new, *old;
804 struct mm_struct *mm = current->mm;
805 NODEMASK_SCRATCH(scratch);
806 int ret;
807
808 if (!scratch)
809 return -ENOMEM;
810
811 new = mpol_new(mode, flags, nodes);
812 if (IS_ERR(new)) {
813 ret = PTR_ERR(new);
814 goto out;
815 }
816
817
818
819
820
821
822 if (mm)
823 down_write(&mm->mmap_sem);
824 task_lock(current);
825 ret = mpol_set_nodemask(new, nodes, scratch);
826 if (ret) {
827 task_unlock(current);
828 if (mm)
829 up_write(&mm->mmap_sem);
830 mpol_put(new);
831 goto out;
832 }
833 old = current->mempolicy;
834 current->mempolicy = new;
835 mpol_set_task_struct_flag();
836 if (new && new->mode == MPOL_INTERLEAVE &&
837 nodes_weight(new->v.nodes))
838 current->il_next = first_node(new->v.nodes);
839 task_unlock(current);
840 if (mm)
841 up_write(&mm->mmap_sem);
842
843 mpol_put(old);
844 ret = 0;
845out:
846 NODEMASK_SCRATCH_FREE(scratch);
847 return ret;
848}
849
/*
 * Return nodemask for policy for get_mempolicy() query.
 *
 * Called with the task's alloc_lock held.
 */
855static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
856{
857 nodes_clear(*nodes);
858 if (p == &default_policy)
859 return;
860
861 switch (p->mode) {
862 case MPOL_BIND:
863
864 case MPOL_INTERLEAVE:
865 *nodes = p->v.nodes;
866 break;
867 case MPOL_PREFERRED:
868 if (!(p->flags & MPOL_F_LOCAL))
869 node_set(p->v.preferred_node, *nodes);
870
871 break;
872 default:
873 BUG();
874 }
875}
876
877static int lookup_node(struct mm_struct *mm, unsigned long addr)
878{
879 struct page *p;
880 int err;
881
882 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
883 if (err >= 0) {
884 err = page_to_nid(p);
885 put_page(p);
886 }
887 return err;
888}
889
/* Retrieve NUMA policy */
891static long do_get_mempolicy(int *policy, nodemask_t *nmask,
892 unsigned long addr, unsigned long flags)
893{
894 int err;
895 struct mm_struct *mm = current->mm;
896 struct vm_area_struct *vma = NULL;
897 struct mempolicy *pol = current->mempolicy;
898
899 if (flags &
900 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
901 return -EINVAL;
902
903 if (flags & MPOL_F_MEMS_ALLOWED) {
904 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
905 return -EINVAL;
906 *policy = 0;
907 task_lock(current);
908 *nmask = cpuset_current_mems_allowed;
909 task_unlock(current);
910 return 0;
911 }
912
913 if (flags & MPOL_F_ADDR) {
914
915
916
917
918
919 down_read(&mm->mmap_sem);
920 vma = find_vma_intersection(mm, addr, addr+1);
921 if (!vma) {
922 up_read(&mm->mmap_sem);
923 return -EFAULT;
924 }
925 if (vma->vm_ops && vma->vm_ops->get_policy)
926 pol = vma->vm_ops->get_policy(vma, addr);
927 else
928 pol = vma->vm_policy;
929 } else if (addr)
930 return -EINVAL;
931
932 if (!pol)
933 pol = &default_policy;
934
935 if (flags & MPOL_F_NODE) {
936 if (flags & MPOL_F_ADDR) {
937 err = lookup_node(mm, addr);
938 if (err < 0)
939 goto out;
940 *policy = err;
941 } else if (pol == current->mempolicy &&
942 pol->mode == MPOL_INTERLEAVE) {
943 *policy = current->il_next;
944 } else {
945 err = -EINVAL;
946 goto out;
947 }
948 } else {
949 *policy = pol == &default_policy ? MPOL_DEFAULT :
950 pol->mode;
951
952
953
954
955 *policy |= (pol->flags & MPOL_MODE_FLAGS);
956 }
957
958 if (vma) {
		up_read(&current->mm->mmap_sem);
960 vma = NULL;
961 }
962
963 err = 0;
964 if (nmask) {
965 if (mpol_store_user_nodemask(pol)) {
966 *nmask = pol->w.user_nodemask;
967 } else {
968 task_lock(current);
969 get_policy_nodemask(pol, nmask);
970 task_unlock(current);
971 }
972 }
973
974 out:
975 mpol_cond_put(pol);
976 if (vma)
		up_read(&current->mm->mmap_sem);
978 return err;
979}
980
981#ifdef CONFIG_MIGRATION
982
983
984
985static void migrate_page_add(struct page *page, struct list_head *pagelist,
986 unsigned long flags)
987{
988
989
990
991 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
992 if (!isolate_lru_page(page)) {
993 list_add_tail(&page->lru, pagelist);
994 inc_zone_page_state(page, NR_ISOLATED_ANON +
995 page_is_file_cache(page));
996 }
997 }
998}
999
1000static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1001{
1002 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1003}
1004
1005
1006
1007
1008
1009static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1010 int flags)
1011{
1012 nodemask_t nmask;
1013 LIST_HEAD(pagelist);
1014 int err = 0;
1015
1016 nodes_clear(nmask);
1017 node_set(source, nmask);
1018
1019
1020
1021
1022
1023
1024 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1025 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1026 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1027
1028 if (!list_empty(&pagelist)) {
1029 err = migrate_pages(&pagelist, new_node_page, dest,
1030 false, MIGRATE_SYNC,
1031 MR_SYSCALL);
1032 if (err)
1033 putback_lru_pages(&pagelist);
1034 }
1035
1036 return err;
1037}
1038
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
1045int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1046 const nodemask_t *to, int flags)
1047{
1048 int busy = 0;
1049 int err;
1050 nodemask_t tmp;
1051
1052 err = migrate_prep();
1053 if (err)
1054 return err;
1055
1056 down_read(&mm->mmap_sem);
1057
1058 err = migrate_vmas(mm, from, to, flags);
1059 if (err)
1060 goto out;
1061
	/*
	 * Pick <source, dest> node pairs so that, where possible, pages are
	 * moved to a destination that is not itself about to be emptied.
	 * 'from' and 'to' define the mapping; 'tmp' tracks the source nodes
	 * that still need to be processed.
	 *
	 * On each pass we prefer a source whose destination is not also a
	 * pending source (d not set in tmp), falling back to the last
	 * <s, d> pair found that actually moves (s != d).  If no node moves
	 * at all, nothing is left to migrate and the loop terminates.
	 */
1093 tmp = *from;
1094 while (!nodes_empty(tmp)) {
1095 int s,d;
1096 int source = -1;
1097 int dest = 0;
1098
1099 for_each_node_mask(s, tmp) {
			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However, if the number of source nodes is not equal
			 * to the number of destination nodes, we cannot
			 * preserve this relationship.  In that case, skip
			 * copying memory from a node that is also in the
			 * destination mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
			 */
1116 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1117 (node_isset(s, *to)))
1118 continue;
1119
1120 d = node_remap(s, *from, *to);
1121 if (s == d)
1122 continue;
1123
1124 source = s;
1125 dest = d;
1126
1127
1128 if (!node_isset(dest, tmp))
1129 break;
1130 }
1131 if (source == -1)
1132 break;
1133
1134 node_clear(source, tmp);
1135 err = migrate_to_node(mm, source, dest, flags);
1136 if (err > 0)
1137 busy += err;
1138 if (err < 0)
1139 break;
1140 }
1141out:
1142 up_read(&mm->mmap_sem);
1143 if (err < 0)
1144 return err;
1145 return busy;
1146
1147}
1148
/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the vma pointed to by @private;
 * search forward from there if not.  This relies on the list of pages
 * handed to migrate_pages() being in virtual address order.
 */
1156static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1157{
1158 struct vm_area_struct *vma = (struct vm_area_struct *)private;
1159 unsigned long uninitialized_var(address);
1160
1161 while (vma) {
1162 address = page_address_in_vma(page, vma);
1163 if (address != -EFAULT)
1164 break;
1165 vma = vma->vm_next;
1166 }
1167
1168
1169
1170
1171 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1172}
1173#else
1174
1175static void migrate_page_add(struct page *page, struct list_head *pagelist,
1176 unsigned long flags)
1177{
1178}
1179
1180int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1181 const nodemask_t *to, int flags)
1182{
1183 return -ENOSYS;
1184}
1185
1186static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1187{
1188 return NULL;
1189}
1190#endif
1191
1192static long do_mbind(unsigned long start, unsigned long len,
1193 unsigned short mode, unsigned short mode_flags,
1194 nodemask_t *nmask, unsigned long flags)
1195{
1196 struct vm_area_struct *vma;
1197 struct mm_struct *mm = current->mm;
1198 struct mempolicy *new;
1199 unsigned long end;
1200 int err;
1201 LIST_HEAD(pagelist);
1202
1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1204 return -EINVAL;
1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1206 return -EPERM;
1207
1208 if (start & ~PAGE_MASK)
1209 return -EINVAL;
1210
1211 if (mode == MPOL_DEFAULT)
1212 flags &= ~MPOL_MF_STRICT;
1213
1214 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1215 end = start + len;
1216
1217 if (end < start)
1218 return -EINVAL;
1219 if (end == start)
1220 return 0;
1221
1222 new = mpol_new(mode, mode_flags, nmask);
1223 if (IS_ERR(new))
1224 return PTR_ERR(new);
1225
1226 if (flags & MPOL_MF_LAZY)
1227 new->flags |= MPOL_F_MOF;
1228
1229
1230
1231
1232
1233 if (!new)
1234 flags |= MPOL_MF_DISCONTIG_OK;
1235
1236 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237 start, start + len, mode, mode_flags,
1238 nmask ? nodes_addr(*nmask)[0] : -1);
1239
1240 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241
1242 err = migrate_prep();
1243 if (err)
1244 goto mpol_out;
1245 }
1246 {
1247 NODEMASK_SCRATCH(scratch);
1248 if (scratch) {
1249 down_write(&mm->mmap_sem);
1250 task_lock(current);
1251 err = mpol_set_nodemask(new, nmask, scratch);
1252 task_unlock(current);
1253 if (err)
1254 up_write(&mm->mmap_sem);
1255 } else
1256 err = -ENOMEM;
1257 NODEMASK_SCRATCH_FREE(scratch);
1258 }
1259 if (err)
1260 goto mpol_out;
1261
1262 vma = check_range(mm, start, end, nmask,
1263 flags | MPOL_MF_INVERT, &pagelist);
1264
1265 err = PTR_ERR(vma);
1266 if (!IS_ERR(vma))
1267 err = mbind_range(mm, start, end, new);
1268
1269 if (!err) {
1270 int nr_failed = 0;
1271
1272 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1275 (unsigned long)vma,
1276 false, MIGRATE_SYNC,
1277 MR_MEMPOLICY_MBIND);
1278 if (nr_failed)
1279 putback_lru_pages(&pagelist);
1280 }
1281
1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1283 err = -EIO;
1284 } else
1285 putback_lru_pages(&pagelist);
1286
1287 up_write(&mm->mmap_sem);
1288 mpol_out:
1289 mpol_put(new);
1290 return err;
1291}
1292
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
1298static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1299 unsigned long maxnode)
1300{
1301 unsigned long k;
1302 unsigned long nlongs;
1303 unsigned long endmask;
1304
1305 --maxnode;
1306 nodes_clear(*nodes);
1307 if (maxnode == 0 || !nmask)
1308 return 0;
1309 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1310 return -EINVAL;
1311
1312 nlongs = BITS_TO_LONGS(maxnode);
1313 if ((maxnode % BITS_PER_LONG) == 0)
1314 endmask = ~0UL;
1315 else
1316 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1317
1318
1319
1320 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1321 if (nlongs > PAGE_SIZE/sizeof(long))
1322 return -EINVAL;
1323 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1324 unsigned long t;
1325 if (get_user(t, nmask + k))
1326 return -EFAULT;
1327 if (k == nlongs - 1) {
1328 if (t & endmask)
1329 return -EINVAL;
1330 } else if (t)
1331 return -EINVAL;
1332 }
1333 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1334 endmask = ~0UL;
1335 }
1336
1337 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1338 return -EFAULT;
1339 nodes_addr(*nodes)[nlongs-1] &= endmask;
1340 return 0;
1341}
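
/*
 * Illustrative sketch (user-space side, not part of this file): the
 * nodemask argument of mbind(2)/set_mempolicy(2) is an array of unsigned
 * longs holding one bit per node, plus a bit count, e.g.
 *
 *	unsigned long mask[1] = { (1UL << 0) | (1UL << 2) };	(nodes 0 and 2)
 *	set_mempolicy(MPOL_BIND, mask, sizeof(mask) * 8);
 *
 * get_nodes() above copies that layout in, rejecting masks whose extra
 * bits would name nodes the kernel was not built to represent.
 */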
1342
1343
1344static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1345 nodemask_t *nodes)
1346{
1347 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1348 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1349
1350 if (copy > nbytes) {
1351 if (copy > PAGE_SIZE)
1352 return -EINVAL;
1353 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1354 return -EFAULT;
1355 copy = nbytes;
1356 }
1357 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1358}
1359
1360SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1361 unsigned long, mode, unsigned long __user *, nmask,
1362 unsigned long, maxnode, unsigned, flags)
1363{
1364 nodemask_t nodes;
1365 int err;
1366 unsigned short mode_flags;
1367
1368 mode_flags = mode & MPOL_MODE_FLAGS;
1369 mode &= ~MPOL_MODE_FLAGS;
1370 if (mode >= MPOL_MAX)
1371 return -EINVAL;
1372 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1373 (mode_flags & MPOL_F_RELATIVE_NODES))
1374 return -EINVAL;
1375 err = get_nodes(&nodes, nmask, maxnode);
1376 if (err)
1377 return err;
1378 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1379}
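
/*
 * Illustrative user-space sketch (assumes <numaif.h> from libnuma; not
 * part of this file): bind an anonymous mapping to nodes 0-1 with strict
 * checking, which is what the mbind syscall above services.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long nodes = 0x3;	(nodes 0 and 1)
 *	if (mbind(p, len, MPOL_BIND, &nodes, sizeof(nodes) * 8,
 *		  MPOL_MF_STRICT | MPOL_MF_MOVE) != 0)
 *		perror("mbind");
 */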
1380
1381
1382SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1383 unsigned long, maxnode)
1384{
1385 int err;
1386 nodemask_t nodes;
1387 unsigned short flags;
1388
1389 flags = mode & MPOL_MODE_FLAGS;
1390 mode &= ~MPOL_MODE_FLAGS;
1391 if ((unsigned int)mode >= MPOL_MAX)
1392 return -EINVAL;
1393 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1394 return -EINVAL;
1395 err = get_nodes(&nodes, nmask, maxnode);
1396 if (err)
1397 return err;
1398 return do_set_mempolicy(mode, flags, &nodes);
1399}
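
/*
 * Illustrative user-space sketch (assumes <numaif.h>; not part of this
 * file): make all future allocations of the calling task interleave
 * across nodes 0 and 1.
 *
 *	unsigned long nodes = 0x3;
 *	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8) != 0)
 *		perror("set_mempolicy");
 *
 * Passing MPOL_DEFAULT with a NULL nodemask reverts to the system policy.
 */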
1400
1401SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1402 const unsigned long __user *, old_nodes,
1403 const unsigned long __user *, new_nodes)
1404{
1405 const struct cred *cred = current_cred(), *tcred;
1406 struct mm_struct *mm = NULL;
1407 struct task_struct *task;
1408 nodemask_t task_nodes;
1409 int err;
1410 nodemask_t *old;
1411 nodemask_t *new;
1412 NODEMASK_SCRATCH(scratch);
1413
1414 if (!scratch)
1415 return -ENOMEM;
1416
1417 old = &scratch->mask1;
1418 new = &scratch->mask2;
1419
1420 err = get_nodes(old, old_nodes, maxnode);
1421 if (err)
1422 goto out;
1423
1424 err = get_nodes(new, new_nodes, maxnode);
1425 if (err)
1426 goto out;
1427
1428
1429 rcu_read_lock();
1430 task = pid ? find_task_by_vpid(pid) : current;
1431 if (!task) {
1432 rcu_read_unlock();
1433 err = -ESRCH;
1434 goto out;
1435 }
1436 get_task_struct(task);
1437
1438 err = -EINVAL;
1439
1440
1441
1442
1443
1444
1445
1446 tcred = __task_cred(task);
1447 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1448 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1449 !capable(CAP_SYS_NICE)) {
1450 rcu_read_unlock();
1451 err = -EPERM;
1452 goto out_put;
1453 }
1454 rcu_read_unlock();
1455
1456 task_nodes = cpuset_mems_allowed(task);
1457
1458 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1459 err = -EPERM;
1460 goto out_put;
1461 }
1462
1463 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1464 err = -EINVAL;
1465 goto out_put;
1466 }
1467
1468 err = security_task_movememory(task);
1469 if (err)
1470 goto out_put;
1471
1472 mm = get_task_mm(task);
1473 put_task_struct(task);
1474
1475 if (!mm) {
1476 err = -EINVAL;
1477 goto out;
1478 }
1479
1480 err = do_migrate_pages(mm, old, new,
1481 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1482
1483 mmput(mm);
1484out:
1485 NODEMASK_SCRATCH_FREE(scratch);
1486
1487 return err;
1488
1489out_put:
1490 put_task_struct(task);
1491 goto out;
1492
1493}
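
/*
 * Illustrative user-space sketch (assumes <numaif.h>; not part of this
 * file): move the pages of process <pid> from node 0 to node 1, subject
 * to the permission checks performed by the syscall above.
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long ret = migrate_pages(pid, 8 * sizeof(unsigned long), &from, &to);
 *	(ret < 0 is an error; a positive return is the number of pages
 *	 that could not be moved)
 */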
1494
1495
1496
1497SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1498 unsigned long __user *, nmask, unsigned long, maxnode,
1499 unsigned long, addr, unsigned long, flags)
1500{
1501 int err;
1502 int uninitialized_var(pval);
1503 nodemask_t nodes;
1504
1505 if (nmask != NULL && maxnode < MAX_NUMNODES)
1506 return -EINVAL;
1507
1508 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1509
1510 if (err)
1511 return err;
1512
1513 if (policy && put_user(pval, policy))
1514 return -EFAULT;
1515
1516 if (nmask)
1517 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1518
1519 return err;
1520}
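
/*
 * Illustrative user-space sketch (assumes <numaif.h>; not part of this
 * file): ask which node currently backs the page at address p.
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", p, node);
 *
 * Called with flags == 0 it reports the calling task's policy mode
 * instead.
 */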
1521
1522#ifdef CONFIG_COMPAT
1523
1524asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1525 compat_ulong_t __user *nmask,
1526 compat_ulong_t maxnode,
1527 compat_ulong_t addr, compat_ulong_t flags)
1528{
1529 long err;
1530 unsigned long __user *nm = NULL;
1531 unsigned long nr_bits, alloc_size;
1532 DECLARE_BITMAP(bm, MAX_NUMNODES);
1533
1534 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1535 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1536
1537 if (nmask)
1538 nm = compat_alloc_user_space(alloc_size);
1539
1540 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1541
1542 if (!err && nmask) {
1543 unsigned long copy_size;
1544 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1545 err = copy_from_user(bm, nm, copy_size);
1546
1547 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1548 err |= compat_put_bitmap(nmask, bm, nr_bits);
1549 }
1550
1551 return err;
1552}
1553
1554asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1555 compat_ulong_t maxnode)
1556{
1557 long err = 0;
1558 unsigned long __user *nm = NULL;
1559 unsigned long nr_bits, alloc_size;
1560 DECLARE_BITMAP(bm, MAX_NUMNODES);
1561
1562 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1563 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1564
1565 if (nmask) {
1566 err = compat_get_bitmap(bm, nmask, nr_bits);
1567 nm = compat_alloc_user_space(alloc_size);
1568 err |= copy_to_user(nm, bm, alloc_size);
1569 }
1570
1571 if (err)
1572 return -EFAULT;
1573
1574 return sys_set_mempolicy(mode, nm, nr_bits+1);
1575}
1576
1577asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1578 compat_ulong_t mode, compat_ulong_t __user *nmask,
1579 compat_ulong_t maxnode, compat_ulong_t flags)
1580{
1581 long err = 0;
1582 unsigned long __user *nm = NULL;
1583 unsigned long nr_bits, alloc_size;
1584 nodemask_t bm;
1585
1586 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1587 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1588
1589 if (nmask) {
1590 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1591 nm = compat_alloc_user_space(alloc_size);
1592 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1593 }
1594
1595 if (err)
1596 return -EFAULT;
1597
1598 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1599}
1600
1601#endif
1602
/*
 * get_vma_policy(@task, @vma, @addr)
 * @task: task for fallback if vma policy == default
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns the effective policy for a VMA at the specified address,
 * falling back to @task's policy or the system default as necessary.
 * Shared policies [those marked MPOL_F_SHARED] carry an extra reference,
 * taken via the get_policy() vm_op or mpol_get() below, which the caller
 * must drop with mpol_cond_put().
 */
1618struct mempolicy *get_vma_policy(struct task_struct *task,
1619 struct vm_area_struct *vma, unsigned long addr)
1620{
1621 struct mempolicy *pol = get_task_policy(task);
1622
1623 if (vma) {
1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
1625 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1626 addr);
1627 if (vpol)
1628 pol = vpol;
1629 } else if (vma->vm_policy) {
1630 pol = vma->vm_policy;
1631
1632
1633
1634
1635
1636
1637
1638 if (mpol_needs_cond_ref(pol))
1639 mpol_get(pol);
1640 }
1641 }
1642 if (!pol)
1643 pol = &default_policy;
1644 return pol;
1645}
1646
1647
1648
1649
1650
1651static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652{
1653
1654 if (unlikely(policy->mode == MPOL_BIND) &&
1655 gfp_zone(gfp) >= policy_zone &&
1656 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657 return &policy->v.nodes;
1658
1659 return NULL;
1660}
1661
1662
1663static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1664 int nd)
1665{
1666 switch (policy->mode) {
1667 case MPOL_PREFERRED:
1668 if (!(policy->flags & MPOL_F_LOCAL))
1669 nd = policy->v.preferred_node;
1670 break;
1671 case MPOL_BIND:
1672
1673
1674
1675
1676
1677
1678 if (unlikely(gfp & __GFP_THISNODE) &&
1679 unlikely(!node_isset(nd, policy->v.nodes)))
1680 nd = first_node(policy->v.nodes);
1681 break;
1682 default:
1683 BUG();
1684 }
1685 return node_zonelist(nd, gfp);
1686}
1687
1688
1689static unsigned interleave_nodes(struct mempolicy *policy)
1690{
1691 unsigned nid, next;
1692 struct task_struct *me = current;
1693
1694 nid = me->il_next;
1695 next = next_node(nid, policy->v.nodes);
1696 if (next >= MAX_NUMNODES)
1697 next = first_node(policy->v.nodes);
1698 if (next < MAX_NUMNODES)
1699 me->il_next = next;
1700 return nid;
1701}
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711unsigned slab_node(void)
1712{
1713 struct mempolicy *policy;
1714
1715 if (in_interrupt())
1716 return numa_node_id();
1717
1718 policy = current->mempolicy;
1719 if (!policy || policy->flags & MPOL_F_LOCAL)
1720 return numa_node_id();
1721
1722 switch (policy->mode) {
1723 case MPOL_PREFERRED:
1724
1725
1726
1727 return policy->v.preferred_node;
1728
1729 case MPOL_INTERLEAVE:
1730 return interleave_nodes(policy);
1731
1732 case MPOL_BIND: {
1733
1734
1735
1736
1737 struct zonelist *zonelist;
1738 struct zone *zone;
1739 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1740 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1741 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1742 &policy->v.nodes,
1743 &zone);
1744 return zone ? zone->node : numa_node_id();
1745 }
1746
1747 default:
1748 BUG();
1749 }
1750}
1751
1752
1753static unsigned offset_il_node(struct mempolicy *pol,
1754 struct vm_area_struct *vma, unsigned long off)
1755{
1756 unsigned nnodes = nodes_weight(pol->v.nodes);
1757 unsigned target;
1758 int c;
1759 int nid = -1;
1760
1761 if (!nnodes)
1762 return numa_node_id();
1763 target = (unsigned int)off % nnodes;
1764 c = 0;
1765 do {
1766 nid = next_node(nid, pol->v.nodes);
1767 c++;
1768 } while (c <= target);
1769 return nid;
1770}
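
/*
 * Worked example (illustrative): for an interleave policy over nodes
 * {0,2,3} and off == 7, nnodes == 3 and target == 7 % 3 == 1, so the
 * loop walks to the second node in the mask and the page is placed on
 * node 2.
 */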
1771
1772
1773static inline unsigned interleave_nid(struct mempolicy *pol,
1774 struct vm_area_struct *vma, unsigned long addr, int shift)
1775{
1776 if (vma) {
1777 unsigned long off;
1778
1779
1780
1781
1782
1783
1784
1785
1786 BUG_ON(shift < PAGE_SHIFT);
1787 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1788 off += (addr - vma->vm_start) >> shift;
1789 return offset_il_node(pol, vma, off);
1790 } else
1791 return interleave_nodes(pol);
1792}
1793
1794
1795
1796
1797
1798int node_random(const nodemask_t *maskp)
1799{
1800 int w, bit = -1;
1801
1802 w = nodes_weight(*maskp);
1803 if (w)
1804 bit = bitmap_ord_to_pos(maskp->bits,
1805 get_random_int() % w, MAX_NUMNODES);
1806 return bit;
1807}
1808
1809#ifdef CONFIG_HUGETLBFS

/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, also returns a pointer to the
 * mempolicy's @nodemask for filtering the zonelist.
 */
1825struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1826 gfp_t gfp_flags, struct mempolicy **mpol,
1827 nodemask_t **nodemask)
1828{
1829 struct zonelist *zl;
1830
1831 *mpol = get_vma_policy(current, vma, addr);
1832 *nodemask = NULL;
1833
1834 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1835 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1836 huge_page_shift(hstate_vma(vma))), gfp_flags);
1837 } else {
1838 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1839 if ((*mpol)->mode == MPOL_BIND)
1840 *nodemask = &(*mpol)->v.nodes;
1841 }
1842 return zl;
1843}
1844
/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of a non-default mempolicy.
 */
1861bool init_nodemask_of_mempolicy(nodemask_t *mask)
1862{
1863 struct mempolicy *mempolicy;
1864 int nid;
1865
1866 if (!(mask && current->mempolicy))
1867 return false;
1868
1869 task_lock(current);
1870 mempolicy = current->mempolicy;
1871 switch (mempolicy->mode) {
1872 case MPOL_PREFERRED:
1873 if (mempolicy->flags & MPOL_F_LOCAL)
1874 nid = numa_node_id();
1875 else
1876 nid = mempolicy->v.preferred_node;
1877 init_nodemask_of_node(mask, nid);
1878 break;
1879
1880 case MPOL_BIND:
1881
1882 case MPOL_INTERLEAVE:
1883 *mask = mempolicy->v.nodes;
1884 break;
1885
1886 default:
1887 BUG();
1888 }
1889 task_unlock(current);
1890
1891 return true;
1892}
1893#endif
1894
/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate
 * default policy.  Otherwise, check for intersection between @mask and
 * the policy nodemask for 'bind' or 'interleave' policy.  For 'preferred'
 * or 'local' policy, always return true since the task may still allocate
 * elsewhere on fallback.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
1905bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1906 const nodemask_t *mask)
1907{
1908 struct mempolicy *mempolicy;
1909 bool ret = true;
1910
1911 if (!mask)
1912 return ret;
1913 task_lock(tsk);
1914 mempolicy = tsk->mempolicy;
1915 if (!mempolicy)
1916 goto out;
1917
1918 switch (mempolicy->mode) {
1919 case MPOL_PREFERRED:
1920
1921
1922
1923
1924
1925
1926 break;
1927 case MPOL_BIND:
1928 case MPOL_INTERLEAVE:
1929 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1930 break;
1931 default:
1932 BUG();
1933 }
1934out:
1935 task_unlock(tsk);
1936 return ret;
1937}
1938
1939
1940
1941static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1942 unsigned nid)
1943{
1944 struct zonelist *zl;
1945 struct page *page;
1946
1947 zl = node_zonelist(nid, gfp);
1948 page = __alloc_pages(gfp, order, zl);
1949 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1950 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1951 return page;
1952}

/**
 * alloc_pages_vma - Allocate a page for a VMA.
 * @gfp: GFP flags, e.g. %GFP_USER or %GFP_HIGHUSER_MOVABLE.
 * @order: Order of the allocation.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 * @node: Which node to prefer for fallback allocation.
 *
 * Allocates a page from the kernel page pool and applies the NUMA policy
 * associated with the VMA or the current process.  When @vma is not NULL
 * the caller must hold the mmap_sem of the VMA's mm_struct for reading to
 * keep it from going away.  Returns NULL when no page can be allocated.
 */
1977struct page *
1978alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1979 unsigned long addr, int node)
1980{
1981 struct mempolicy *pol;
1982 struct page *page;
1983 unsigned int cpuset_mems_cookie;
1984
1985retry_cpuset:
1986 pol = get_vma_policy(current, vma, addr);
1987 cpuset_mems_cookie = get_mems_allowed();
1988
1989 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1990 unsigned nid;
1991
1992 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1993 mpol_cond_put(pol);
1994 page = alloc_page_interleave(gfp, order, nid);
1995 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1996 goto retry_cpuset;
1997
1998 return page;
1999 }
2000 page = __alloc_pages_nodemask(gfp, order,
2001 policy_zonelist(gfp, pol, node),
2002 policy_nodemask(gfp, pol));
2003 if (unlikely(mpol_needs_cond_ref(pol)))
2004 __mpol_put(pol);
2005 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2006 goto retry_cpuset;
2007 return page;
2008}

/**
 * alloc_pages_current - Allocate pages.
 * @gfp: GFP flags, e.g. %GFP_KERNEL or %GFP_HIGHUSER.
 * @order: Power of two of allocation size in pages.  0 is a single page.
 *
 * Allocate a page from the kernel page pool and, when not in interrupt
 * context, apply the current process' NUMA policy.  Returns NULL when no
 * page can be allocated.
 */
2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2030{
2031 struct mempolicy *pol = get_task_policy(current);
2032 struct page *page;
2033 unsigned int cpuset_mems_cookie;
2034
2035 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2036 pol = &default_policy;
2037
2038retry_cpuset:
2039 cpuset_mems_cookie = get_mems_allowed();
2040
2041
2042
2043
2044
2045 if (pol->mode == MPOL_INTERLEAVE)
2046 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2047 else
2048 page = __alloc_pages_nodemask(gfp, order,
2049 policy_zonelist(gfp, pol, numa_node_id()),
2050 policy_nodemask(gfp, pol));
2051
2052 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2053 goto retry_cpuset;
2054
2055 return page;
2056}
2057EXPORT_SYMBOL(alloc_pages_current);
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071struct mempolicy *__mpol_dup(struct mempolicy *old)
2072{
2073 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2074
2075 if (!new)
2076 return ERR_PTR(-ENOMEM);
2077
2078
2079 if (old == current->mempolicy) {
2080 task_lock(current);
2081 *new = *old;
2082 task_unlock(current);
2083 } else
2084 *new = *old;
2085
2086 rcu_read_lock();
2087 if (current_cpuset_is_being_rebound()) {
2088 nodemask_t mems = cpuset_mems_allowed(current);
2089 if (new->flags & MPOL_F_REBINDING)
2090 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2091 else
2092 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2093 }
2094 rcu_read_unlock();
2095 atomic_set(&new->refcnt, 1);
2096 return new;
2097}
2098
2099
2100bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2101{
2102 if (!a || !b)
2103 return false;
2104 if (a->mode != b->mode)
2105 return false;
2106 if (a->flags != b->flags)
2107 return false;
2108 if (mpol_store_user_nodemask(a))
2109 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2110 return false;
2111
2112 switch (a->mode) {
2113 case MPOL_BIND:
2114
2115 case MPOL_INTERLEAVE:
2116 return !!nodes_equal(a->v.nodes, b->v.nodes);
2117 case MPOL_PREFERRED:
2118 return a->v.preferred_node == b->v.preferred_node;
2119 default:
2120 BUG();
2121 return false;
2122 }
2123}
2124
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * while calling any of the shared policy helpers below.
 */

/* lookup first element intersecting start-end */
2136static struct sp_node *
2137sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2138{
2139 struct rb_node *n = sp->root.rb_node;
2140
2141 while (n) {
2142 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2143
2144 if (start >= p->end)
2145 n = n->rb_right;
2146 else if (end <= p->start)
2147 n = n->rb_left;
2148 else
2149 break;
2150 }
2151 if (!n)
2152 return NULL;
2153 for (;;) {
2154 struct sp_node *w = NULL;
2155 struct rb_node *prev = rb_prev(n);
2156 if (!prev)
2157 break;
2158 w = rb_entry(prev, struct sp_node, nd);
2159 if (w->end <= start)
2160 break;
2161 n = prev;
2162 }
2163 return rb_entry(n, struct sp_node, nd);
2164}
2165
2166
2167
2168static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2169{
2170 struct rb_node **p = &sp->root.rb_node;
2171 struct rb_node *parent = NULL;
2172 struct sp_node *nd;
2173
2174 while (*p) {
2175 parent = *p;
2176 nd = rb_entry(parent, struct sp_node, nd);
2177 if (new->start < nd->start)
2178 p = &(*p)->rb_left;
2179 else if (new->end > nd->end)
2180 p = &(*p)->rb_right;
2181 else
2182 BUG();
2183 }
2184 rb_link_node(&new->nd, parent, p);
2185 rb_insert_color(&new->nd, &sp->root);
2186 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2187 new->policy ? new->policy->mode : 0);
2188}
2189
2190
2191struct mempolicy *
2192mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2193{
2194 struct mempolicy *pol = NULL;
2195 struct sp_node *sn;
2196
2197 if (!sp->root.rb_node)
2198 return NULL;
2199 spin_lock(&sp->lock);
2200 sn = sp_lookup(sp, idx, idx+1);
2201 if (sn) {
2202 mpol_get(sn->policy);
2203 pol = sn->policy;
2204 }
2205 spin_unlock(&sp->lock);
2206 return pol;
2207}
2208
2209static void sp_free(struct sp_node *n)
2210{
2211 mpol_put(n->policy);
2212 kmem_cache_free(sn_cache, n);
2213}
2214
/**
 * mpol_misplaced - check whether current page node is valid in policy
 * @page: page to be checked
 * @vma: vm area where page is mapped
 * @addr: virtual address where page is mapped
 *
 * Looks up the current policy node id for @vma, @addr and compares it to
 * the page's node id.  Policy determination mimics alloc_page_vma().
 * Called from the fault path where we know the vma and faulting address.
 *
 * Returns -1 if the page is in a node that is valid for this policy, or a
 * suitable node id to allocate a replacement page from.
 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234 struct mempolicy *pol;
2235 struct zone *zone;
2236 int curnid = page_to_nid(page);
2237 unsigned long pgoff;
2238 int polnid = -1;
2239 int ret = -1;
2240
2241 BUG_ON(!vma);
2242
2243 pol = get_vma_policy(current, vma, addr);
2244 if (!(pol->flags & MPOL_F_MOF))
2245 goto out;
2246
2247 switch (pol->mode) {
2248 case MPOL_INTERLEAVE:
2249 BUG_ON(addr >= vma->vm_end);
2250 BUG_ON(addr < vma->vm_start);
2251
2252 pgoff = vma->vm_pgoff;
2253 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 polnid = offset_il_node(pol, vma, pgoff);
2255 break;
2256
2257 case MPOL_PREFERRED:
2258 if (pol->flags & MPOL_F_LOCAL)
2259 polnid = numa_node_id();
2260 else
2261 polnid = pol->v.preferred_node;
2262 break;
2263
2264 case MPOL_BIND:
2265
2266
2267
2268
2269
2270
2271 if (node_isset(curnid, pol->v.nodes))
2272 goto out;
2273 (void)first_zones_zonelist(
2274 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 gfp_zone(GFP_HIGHUSER),
2276 &pol->v.nodes, &zone);
2277 polnid = zone->node;
2278 break;
2279
2280 default:
2281 BUG();
2282 }
2283
2284
2285 if (pol->flags & MPOL_F_MORON) {
2286 int last_nid;
2287
2288 polnid = numa_node_id();

		/*
		 * Multi-stage node selection is used in conjunction with a
		 * periodic migration fault to build a temporal task<->page
		 * relation.  A two-stage filter removes short-lived and
		 * unlikely relations: only if page_xchg_last_nid() returns
		 * the same node that was just sampled is the page reported
		 * as misplaced, which makes it much less likely that we act
		 * on a transient task<->page relation.
		 */
2311 last_nid = page_xchg_last_nid(page, polnid);
2312 if (last_nid != polnid)
2313 goto out;
2314 }
2315
2316 if (curnid != polnid)
2317 ret = polnid;
2318out:
2319 mpol_cond_put(pol);
2320
2321 return ret;
2322}
2323
2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2325{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2327 rb_erase(&n->nd, &sp->root);
2328 sp_free(n);
2329}
2330
2331static void sp_node_init(struct sp_node *node, unsigned long start,
2332 unsigned long end, struct mempolicy *pol)
2333{
2334 node->start = start;
2335 node->end = end;
2336 node->policy = pol;
2337}
2338
2339static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2340 struct mempolicy *pol)
2341{
2342 struct sp_node *n;
2343 struct mempolicy *newpol;
2344
2345 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2346 if (!n)
2347 return NULL;
2348
2349 newpol = mpol_dup(pol);
2350 if (IS_ERR(newpol)) {
2351 kmem_cache_free(sn_cache, n);
2352 return NULL;
2353 }
2354 newpol->flags |= MPOL_F_SHARED;
2355 sp_node_init(n, start, end, newpol);
2356
2357 return n;
2358}
2359
2360
2361static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2362 unsigned long end, struct sp_node *new)
2363{
2364 struct sp_node *n;
2365 struct sp_node *n_new = NULL;
2366 struct mempolicy *mpol_new = NULL;
2367 int ret = 0;
2368
2369restart:
2370 spin_lock(&sp->lock);
2371 n = sp_lookup(sp, start, end);
2372
2373 while (n && n->start < end) {
2374 struct rb_node *next = rb_next(&n->nd);
2375 if (n->start >= start) {
2376 if (n->end <= end)
2377 sp_delete(sp, n);
2378 else
2379 n->start = end;
2380 } else {
2381
2382 if (n->end > end) {
2383 if (!n_new)
2384 goto alloc_new;
2385
2386 *mpol_new = *n->policy;
2387 atomic_set(&mpol_new->refcnt, 1);
2388 sp_node_init(n_new, n->end, end, mpol_new);
2389 sp_insert(sp, n_new);
2390 n->end = start;
2391 n_new = NULL;
2392 mpol_new = NULL;
2393 break;
2394 } else
2395 n->end = start;
2396 }
2397 if (!next)
2398 break;
2399 n = rb_entry(next, struct sp_node, nd);
2400 }
2401 if (new)
2402 sp_insert(sp, new);
2403 spin_unlock(&sp->lock);
2404 ret = 0;
2405
2406err_out:
2407 if (mpol_new)
2408 mpol_put(mpol_new);
2409 if (n_new)
2410 kmem_cache_free(sn_cache, n_new);
2411
2412 return ret;
2413
2414alloc_new:
2415 spin_unlock(&sp->lock);
2416 ret = -ENOMEM;
2417 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2418 if (!n_new)
2419 goto err_out;
2420 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2421 if (!mpol_new)
2422 goto err_out;
2423 goto restart;
2424}
2425
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in the inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
2436void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2437{
2438 int ret;
2439
2440 sp->root = RB_ROOT;
2441 spin_lock_init(&sp->lock);
2442
2443 if (mpol) {
2444 struct vm_area_struct pvma;
2445 struct mempolicy *new;
2446 NODEMASK_SCRATCH(scratch);
2447
2448 if (!scratch)
2449 goto put_mpol;
2450
2451 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2452 if (IS_ERR(new))
2453 goto free_scratch;
2454
2455 task_lock(current);
2456 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2457 task_unlock(current);
2458 if (ret)
2459 goto put_new;
2460
2461
2462 memset(&pvma, 0, sizeof(struct vm_area_struct));
2463 pvma.vm_end = TASK_SIZE;
2464 mpol_set_shared_policy(sp, &pvma, new);
2465
2466put_new:
2467 mpol_put(new);
2468free_scratch:
2469 NODEMASK_SCRATCH_FREE(scratch);
2470put_mpol:
2471 mpol_put(mpol);
2472 }
2473}
2474
2475int mpol_set_shared_policy(struct shared_policy *info,
2476 struct vm_area_struct *vma, struct mempolicy *npol)
2477{
2478 int err;
2479 struct sp_node *new = NULL;
2480 unsigned long sz = vma_pages(vma);
2481
2482 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2483 vma->vm_pgoff,
2484 sz, npol ? npol->mode : -1,
2485 npol ? npol->flags : -1,
2486 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2487
2488 if (npol) {
2489 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2490 if (!new)
2491 return -ENOMEM;
2492 }
2493 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2494 if (err && new)
2495 sp_free(new);
2496 return err;
2497}
2498
2499
2500void mpol_free_shared_policy(struct shared_policy *p)
2501{
2502 struct sp_node *n;
2503 struct rb_node *next;
2504
2505 if (!p->root.rb_node)
2506 return;
2507 spin_lock(&p->lock);
2508 next = rb_first(&p->root);
2509 while (next) {
2510 n = rb_entry(next, struct sp_node, nd);
2511 next = rb_next(&n->nd);
2512 sp_delete(p, n);
2513 }
2514 spin_unlock(&p->lock);
2515}
2516
2517#ifdef CONFIG_NUMA_BALANCING
2518static bool __initdata numabalancing_override;
2519
2520static void __init check_numabalancing_enable(void)
2521{
2522 bool numabalancing_default = false;
2523
2524 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2525 numabalancing_default = true;
2526
2527 if (nr_node_ids > 1 && !numabalancing_override) {
		printk(KERN_INFO "Enabling automatic NUMA balancing. "
			"Configure with numa_balancing= or the kernel.numa_balancing sysctl\n");
2530 set_numabalancing_state(numabalancing_default);
2531 }
2532}
2533
2534static int __init setup_numabalancing(char *str)
2535{
2536 int ret = 0;
2537 if (!str)
2538 goto out;
2539 numabalancing_override = true;
2540
2541 if (!strcmp(str, "enable")) {
2542 set_numabalancing_state(true);
2543 ret = 1;
2544 } else if (!strcmp(str, "disable")) {
2545 set_numabalancing_state(false);
2546 ret = 1;
2547 }
2548out:
2549 if (!ret)
2550 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2551
2552 return ret;
2553}
2554__setup("numa_balancing=", setup_numabalancing);
2555#else
2556static inline void __init check_numabalancing_enable(void)
2557{
2558}
2559#endif
2560
2561
2562void __init numa_policy_init(void)
2563{
2564 nodemask_t interleave_nodes;
2565 unsigned long largest = 0;
2566 int nid, prefer = 0;
2567
2568 policy_cache = kmem_cache_create("numa_policy",
2569 sizeof(struct mempolicy),
2570 0, SLAB_PANIC, NULL);
2571
2572 sn_cache = kmem_cache_create("shared_policy_node",
2573 sizeof(struct sp_node),
2574 0, SLAB_PANIC, NULL);
2575
2576 for_each_node(nid) {
2577 preferred_node_policy[nid] = (struct mempolicy) {
2578 .refcnt = ATOMIC_INIT(1),
2579 .mode = MPOL_PREFERRED,
2580 .flags = MPOL_F_MOF | MPOL_F_MORON,
2581 .v = { .preferred_node = nid, },
2582 };
2583 }
2584
2585
2586
2587
2588
2589
2590 nodes_clear(interleave_nodes);
2591 for_each_node_state(nid, N_MEMORY) {
2592 unsigned long total_pages = node_present_pages(nid);
2593
2594
2595 if (largest < total_pages) {
2596 largest = total_pages;
2597 prefer = nid;
2598 }
2599
2600
2601 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2602 node_set(nid, interleave_nodes);
2603 }
2604
2605
2606 if (unlikely(nodes_empty(interleave_nodes)))
2607 node_set(prefer, interleave_nodes);
2608
2609 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2610 printk("numa_policy_init: interleaving failed\n");
2611
2612 check_numabalancing_enable();
2613}
2614
2615
2616void numa_default_policy(void)
2617{
2618 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2619}
2620
2621
2622
2623
2624
2625
2626
2627
2628static const char * const policy_modes[] =
2629{
2630 [MPOL_DEFAULT] = "default",
2631 [MPOL_PREFERRED] = "prefer",
2632 [MPOL_BIND] = "bind",
2633 [MPOL_INTERLEAVE] = "interleave",
2634 [MPOL_LOCAL] = "local",
2635};
2636
2637
2638#ifdef CONFIG_TMPFS

/**
 * mpol_parse_str - parse string to mempolicy
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1.
 */
2649int mpol_parse_str(char *str, struct mempolicy **mpol)
2650{
2651 struct mempolicy *new = NULL;
2652 unsigned short mode;
2653 unsigned short mode_flags;
2654 nodemask_t nodes;
2655 char *nodelist = strchr(str, ':');
2656 char *flags = strchr(str, '=');
2657 int err = 1;
2658
2659 if (nodelist) {
2660
2661 *nodelist++ = '\0';
2662 if (nodelist_parse(nodelist, nodes))
2663 goto out;
2664 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2665 goto out;
2666 } else
2667 nodes_clear(nodes);
2668
2669 if (flags)
2670 *flags++ = '\0';
2671
2672 for (mode = 0; mode < MPOL_MAX; mode++) {
2673 if (!strcmp(str, policy_modes[mode])) {
2674 break;
2675 }
2676 }
2677 if (mode >= MPOL_MAX)
2678 goto out;
2679
2680 switch (mode) {
2681 case MPOL_PREFERRED:
2682
2683
2684
2685 if (nodelist) {
2686 char *rest = nodelist;
2687 while (isdigit(*rest))
2688 rest++;
2689 if (*rest)
2690 goto out;
2691 }
2692 break;
2693 case MPOL_INTERLEAVE:
2694
2695
2696
2697 if (!nodelist)
2698 nodes = node_states[N_MEMORY];
2699 break;
2700 case MPOL_LOCAL:
2701
2702
2703
2704 if (nodelist)
2705 goto out;
2706 mode = MPOL_PREFERRED;
2707 break;
2708 case MPOL_DEFAULT:
2709
2710
2711
2712 if (!nodelist)
2713 err = 0;
2714 goto out;
2715 case MPOL_BIND:
2716
2717
2718
2719 if (!nodelist)
2720 goto out;
2721 }
2722
2723 mode_flags = 0;
2724 if (flags) {
2725
2726
2727
2728
2729 if (!strcmp(flags, "static"))
2730 mode_flags |= MPOL_F_STATIC_NODES;
2731 else if (!strcmp(flags, "relative"))
2732 mode_flags |= MPOL_F_RELATIVE_NODES;
2733 else
2734 goto out;
2735 }
2736
2737 new = mpol_new(mode, mode_flags, &nodes);
2738 if (IS_ERR(new))
2739 goto out;
2740
2741
2742
2743
2744
2745 if (mode != MPOL_PREFERRED)
2746 new->v.nodes = nodes;
2747 else if (nodelist)
2748 new->v.preferred_node = first_node(nodes);
2749 else
2750 new->flags |= MPOL_F_LOCAL;
2751
2752
2753
2754
2755
2756 new->w.user_nodemask = nodes;
2757
2758 err = 0;
2759
2760out:
2761
2762 if (nodelist)
2763 *--nodelist = ':';
2764 if (flags)
2765 *--flags = '=';
2766 if (!err)
2767 *mpol = new;
2768 return err;
2769}
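
/*
 * Illustrative examples of strings this parser accepts (as used for the
 * tmpfs "mpol=" mount option), assuming nodes 0-3 exist and have memory:
 *
 *	"local"
 *	"prefer:1"
 *	"bind:0-3"
 *	"interleave=static:0,2"
 *
 * e.g.  mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 */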
2770#endif
2771
/*
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: to contain formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  Returns the number of characters in
 * @buffer (if positive) or an error (negative).
 */
2782int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2783{
2784 char *p = buffer;
2785 int l;
2786 nodemask_t nodes;
2787 unsigned short mode;
2788 unsigned short flags = pol ? pol->flags : 0;
2789
2790
2791
2792
2793 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2794
2795 if (!pol || pol == &default_policy)
2796 mode = MPOL_DEFAULT;
2797 else
2798 mode = pol->mode;
2799
2800 switch (mode) {
2801 case MPOL_DEFAULT:
2802 nodes_clear(nodes);
2803 break;
2804
2805 case MPOL_PREFERRED:
2806 nodes_clear(nodes);
2807 if (flags & MPOL_F_LOCAL)
2808 mode = MPOL_LOCAL;
2809 else
2810 node_set(pol->v.preferred_node, nodes);
2811 break;
2812
2813 case MPOL_BIND:
2814
2815 case MPOL_INTERLEAVE:
2816 nodes = pol->v.nodes;
2817 break;
2818
2819 default:
2820 return -EINVAL;
2821 }
2822
2823 l = strlen(policy_modes[mode]);
2824 if (buffer + maxlen < p + l + 1)
2825 return -ENOSPC;
2826
2827 strcpy(p, policy_modes[mode]);
2828 p += l;
2829
2830 if (flags & MPOL_MODE_FLAGS) {
2831 if (buffer + maxlen < p + 2)
2832 return -ENOSPC;
2833 *p++ = '=';
2834
2835
2836
2837
2838 if (flags & MPOL_F_STATIC_NODES)
2839 p += snprintf(p, buffer + maxlen - p, "static");
2840 else if (flags & MPOL_F_RELATIVE_NODES)
2841 p += snprintf(p, buffer + maxlen - p, "relative");
2842 }
2843
2844 if (!nodes_empty(nodes)) {
2845 if (buffer + maxlen < p + 2)
2846 return -ENOSPC;
2847 *p++ = ':';
2848 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2849 }
2850 return p - buffer;
2851}
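
/*
 * Illustrative outputs of mpol_to_str(), e.g. as shown per mapping in
 * /proc/<pid>/numa_maps and for tmpfs mounts in /proc/mounts:
 *
 *	"default"
 *	"local"
 *	"prefer:1"
 *	"interleave:0-3"
 *	"bind=static:0,2"
 */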
2852