// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Supported policies, per VMA and per process:
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 * local	Allocate on the node of the current CPU.
 * default	Fall back to the surrounding policy (process or system default).
 *
 * The VMA policy has priority over the process policy for a page fault.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

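/*
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the closest online node by distance if @node is not online.
 */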
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

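/*
 * Return the mempolicy that applies to @p: the task policy if one is set,
 * otherwise the per-node preferred policy for the current node, and finally
 * the system-wide default policy.  No reference is taken on the result.
 */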
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

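/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  It restricts the user-supplied nodemask to the
 * nodes that the current cpuset allows and that have memory, and hands the
 * result to the mode-specific ->create() callback.  It also records either
 * the original user nodemask or the current cpuset mems_allowed so that the
 * policy can be rebound later when cpuset constraints change.
 */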
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping.  They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

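/*
 * This function just creates a new policy, does some sanity checking
 * and simple initialization.  The caller must use mpol_set_nodemask()
 * to install the nodemask before the policy is put to use.
 */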
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED with an empty nodemask is shorthand for local
	 * allocation; it cannot be combined with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES.  All other modes require a non-empty
	 * nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

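/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock.  Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */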
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

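/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */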
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

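/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */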
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	mmap_write_lock(mm);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

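/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the page's nid is
 * in the invert of qp->nmask.
 */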
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

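/*
 * queue_pages_pmd() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. huge zero page.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */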
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		walk->action = ACTION_CONTINUE;
		goto out;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}

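/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */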
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range.  Still
			 * need migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}
564
565static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
566 unsigned long addr, unsigned long end,
567 struct mm_walk *walk)
568{
569 int ret = 0;
570#ifdef CONFIG_HUGETLB_PAGE
571 struct queue_pages *qp = walk->private;
572 unsigned long flags = (qp->flags & MPOL_MF_VALID);
573 struct page *page;
574 spinlock_t *ptl;
575 pte_t entry;
576
577 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
578 entry = huge_ptep_get(pte);
579 if (!pte_present(entry))
580 goto unlock;
581 page = pte_page(entry);
582 if (!queue_pages_required(page, qp))
583 goto unlock;
584
585 if (flags == MPOL_MF_STRICT) {
586
587
588
589
590 ret = -EIO;
591 goto unlock;
592 }
593
594 if (!vma_migratable(walk->vma)) {
595
596
597
598
599
600
601 ret = 1;
602 goto unlock;
603 }
604
605
606 if (flags & (MPOL_MF_MOVE_ALL) ||
607 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
608 if (!isolate_huge_page(page, qp->pagelist) &&
609 (flags & MPOL_MF_STRICT))
610
611
612
613
614 ret = 1;
615 }
616unlock:
617 spin_unlock(ptl);
618#else
619 BUG();
620#endif
621 return ret;
622}
623
624#ifdef CONFIG_NUMA_BALANCING
625
626
627
628
629
630
631
632
633
634unsigned long change_prot_numa(struct vm_area_struct *vma,
635 unsigned long addr, unsigned long end)
636{
637 int nr_updated;
638
639 nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
640 if (nr_updated)
641 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
642
643 return nr_updated;
644}
645#else
646static unsigned long change_prot_numa(struct vm_area_struct *vma,
647 unsigned long addr, unsigned long end)
648{
649 return 0;
650}
651#endif
652
653static int queue_pages_test_walk(unsigned long start, unsigned long end,
654 struct mm_walk *walk)
655{
656 struct vm_area_struct *vma = walk->vma;
657 struct queue_pages *qp = walk->private;
658 unsigned long endvma = vma->vm_end;
659 unsigned long flags = qp->flags;
660
661
662 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
663
664 if (!qp->first) {
665 qp->first = vma;
666 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
667 (qp->start < vma->vm_start))
668
669 return -EFAULT;
670 }
671 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
672 ((vma->vm_end < qp->end) &&
673 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
674
675 return -EFAULT;
676
677
678
679
680
681 if (!vma_migratable(vma) &&
682 !(flags & MPOL_MF_STRICT))
683 return 1;
684
685 if (endvma > end)
686 endvma = end;
687
688 if (flags & MPOL_MF_LAZY) {
689
690 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
691 !(vma->vm_flags & VM_MIXEDMAP))
692 change_prot_numa(vma, start, endvma);
693 return 1;
694 }
695
696
697 if (flags & MPOL_MF_VALID)
698 return 0;
699 return 1;
700}
701
702static const struct mm_walk_ops queue_pages_walk_ops = {
703 .hugetlb_entry = queue_pages_hugetlb,
704 .pmd_entry = queue_pages_pte_range,
705 .test_walk = queue_pages_test_walk,
706};
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723static int
724queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
725 nodemask_t *nodes, unsigned long flags,
726 struct list_head *pagelist)
727{
728 int err;
729 struct queue_pages qp = {
730 .pagelist = pagelist,
731 .flags = flags,
732 .nmask = nodes,
733 .start = start,
734 .end = end,
735 .first = NULL,
736 };
737
738 err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
739
740 if (!qp.first)
741
742 err = -EFAULT;
743
744 return err;
745}
746
747
748
749
750
751static int vma_replace_policy(struct vm_area_struct *vma,
752 struct mempolicy *pol)
753{
754 int err;
755 struct mempolicy *old;
756 struct mempolicy *new;
757
758 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
759 vma->vm_start, vma->vm_end, vma->vm_pgoff,
760 vma->vm_ops, vma->vm_file,
761 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
762
763 new = mpol_dup(pol);
764 if (IS_ERR(new))
765 return PTR_ERR(new);
766
767 if (vma->vm_ops && vma->vm_ops->set_policy) {
768 err = vma->vm_ops->set_policy(vma, new);
769 if (err)
770 goto err_out;
771 }
772
773 old = vma->vm_policy;
774 vma->vm_policy = new;
775 mpol_put(old);
776
777 return 0;
778 err_out:
779 mpol_put(new);
780 return err;
781}
782
783
784static int mbind_range(struct mm_struct *mm, unsigned long start,
785 unsigned long end, struct mempolicy *new_pol)
786{
787 struct vm_area_struct *next;
788 struct vm_area_struct *prev;
789 struct vm_area_struct *vma;
790 int err = 0;
791 pgoff_t pgoff;
792 unsigned long vmstart;
793 unsigned long vmend;
794
795 vma = find_vma(mm, start);
796 VM_BUG_ON(!vma);
797
798 prev = vma->vm_prev;
799 if (start > vma->vm_start)
800 prev = vma;
801
802 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
803 next = vma->vm_next;
804 vmstart = max(start, vma->vm_start);
805 vmend = min(end, vma->vm_end);
806
807 if (mpol_equal(vma_policy(vma), new_pol))
808 continue;
809
810 pgoff = vma->vm_pgoff +
811 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
812 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
813 vma->anon_vma, vma->vm_file, pgoff,
814 new_pol, vma->vm_userfaultfd_ctx);
815 if (prev) {
816 vma = prev;
817 next = vma->vm_next;
818 if (mpol_equal(vma_policy(vma), new_pol))
819 continue;
820
821 goto replace;
822 }
823 if (vma->vm_start != vmstart) {
824 err = split_vma(vma->vm_mm, vma, vmstart, 1);
825 if (err)
826 goto out;
827 }
828 if (vma->vm_end != vmend) {
829 err = split_vma(vma->vm_mm, vma, vmend, 0);
830 if (err)
831 goto out;
832 }
833 replace:
834 err = vma_replace_policy(vma, new_pol);
835 if (err)
836 goto out;
837 }
838
839 out:
840 return err;
841}
842
843
844static long do_set_mempolicy(unsigned short mode, unsigned short flags,
845 nodemask_t *nodes)
846{
847 struct mempolicy *new, *old;
848 NODEMASK_SCRATCH(scratch);
849 int ret;
850
851 if (!scratch)
852 return -ENOMEM;
853
854 new = mpol_new(mode, flags, nodes);
855 if (IS_ERR(new)) {
856 ret = PTR_ERR(new);
857 goto out;
858 }
859
860 if (flags & MPOL_F_NUMA_BALANCING) {
861 if (new && new->mode == MPOL_BIND) {
862 new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
863 } else {
864 ret = -EINVAL;
865 mpol_put(new);
866 goto out;
867 }
868 }
869
870 ret = mpol_set_nodemask(new, nodes, scratch);
871 if (ret) {
872 mpol_put(new);
873 goto out;
874 }
875 task_lock(current);
876 old = current->mempolicy;
877 current->mempolicy = new;
878 if (new && new->mode == MPOL_INTERLEAVE)
879 current->il_prev = MAX_NUMNODES-1;
880 task_unlock(current);
881 mpol_put(old);
882 ret = 0;
883out:
884 NODEMASK_SCRATCH_FREE(scratch);
885 return ret;
886}
887
888
889
890
891
892
893static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
894{
895 nodes_clear(*nodes);
896 if (p == &default_policy)
897 return;
898
899 switch (p->mode) {
900 case MPOL_BIND:
901 case MPOL_INTERLEAVE:
902 case MPOL_PREFERRED:
903 *nodes = p->nodes;
904 break;
905 case MPOL_LOCAL:
906
907 break;
908 default:
909 BUG();
910 }
911}
912
913static int lookup_node(struct mm_struct *mm, unsigned long addr)
914{
915 struct page *p = NULL;
916 int err;
917
918 int locked = 1;
919 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
920 if (err > 0) {
921 err = page_to_nid(p);
922 put_page(p);
923 }
924 if (locked)
925 mmap_read_unlock(mm);
926 return err;
927}
928
929
930static long do_get_mempolicy(int *policy, nodemask_t *nmask,
931 unsigned long addr, unsigned long flags)
932{
933 int err;
934 struct mm_struct *mm = current->mm;
935 struct vm_area_struct *vma = NULL;
936 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
937
938 if (flags &
939 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
940 return -EINVAL;
941
942 if (flags & MPOL_F_MEMS_ALLOWED) {
943 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
944 return -EINVAL;
945 *policy = 0;
946 task_lock(current);
947 *nmask = cpuset_current_mems_allowed;
948 task_unlock(current);
949 return 0;
950 }
951
952 if (flags & MPOL_F_ADDR) {
953
954
955
956
957
958 mmap_read_lock(mm);
959 vma = vma_lookup(mm, addr);
960 if (!vma) {
961 mmap_read_unlock(mm);
962 return -EFAULT;
963 }
964 if (vma->vm_ops && vma->vm_ops->get_policy)
965 pol = vma->vm_ops->get_policy(vma, addr);
966 else
967 pol = vma->vm_policy;
968 } else if (addr)
969 return -EINVAL;
970
971 if (!pol)
972 pol = &default_policy;
973
974 if (flags & MPOL_F_NODE) {
975 if (flags & MPOL_F_ADDR) {
976
977
978
979
980
981
982 pol_refcount = pol;
983 vma = NULL;
984 mpol_get(pol);
985 err = lookup_node(mm, addr);
986 if (err < 0)
987 goto out;
988 *policy = err;
989 } else if (pol == current->mempolicy &&
990 pol->mode == MPOL_INTERLEAVE) {
991 *policy = next_node_in(current->il_prev, pol->nodes);
992 } else {
993 err = -EINVAL;
994 goto out;
995 }
996 } else {
997 *policy = pol == &default_policy ? MPOL_DEFAULT :
998 pol->mode;
999
1000
1001
1002
1003 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1004 }
1005
1006 err = 0;
1007 if (nmask) {
1008 if (mpol_store_user_nodemask(pol)) {
1009 *nmask = pol->w.user_nodemask;
1010 } else {
1011 task_lock(current);
1012 get_policy_nodemask(pol, nmask);
1013 task_unlock(current);
1014 }
1015 }
1016
1017 out:
1018 mpol_cond_put(pol);
1019 if (vma)
1020 mmap_read_unlock(mm);
1021 if (pol_refcount)
1022 mpol_put(pol_refcount);
1023 return err;
1024}
1025
1026#ifdef CONFIG_MIGRATION
1027
1028
1029
1030static int migrate_page_add(struct page *page, struct list_head *pagelist,
1031 unsigned long flags)
1032{
1033 struct page *head = compound_head(page);
1034
1035
1036
1037 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1038 if (!isolate_lru_page(head)) {
1039 list_add_tail(&head->lru, pagelist);
1040 mod_node_page_state(page_pgdat(head),
1041 NR_ISOLATED_ANON + page_is_file_lru(head),
1042 thp_nr_pages(head));
1043 } else if (flags & MPOL_MF_STRICT) {
1044
1045
1046
1047
1048
1049
1050
1051 return -EIO;
1052 }
1053 }
1054
1055 return 0;
1056}
1057
1058
1059
1060
1061
1062static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1063 int flags)
1064{
1065 nodemask_t nmask;
1066 LIST_HEAD(pagelist);
1067 int err = 0;
1068 struct migration_target_control mtc = {
1069 .nid = dest,
1070 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1071 };
1072
1073 nodes_clear(nmask);
1074 node_set(source, nmask);
1075
1076
1077
1078
1079
1080
1081 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1082 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1083 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1084
1085 if (!list_empty(&pagelist)) {
1086 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1087 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1088 if (err)
1089 putback_movable_pages(&pagelist);
1090 }
1091
1092 return err;
1093}
1094
1095
1096
1097
1098
1099
1100
1101int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1102 const nodemask_t *to, int flags)
1103{
1104 int busy = 0;
1105 int err = 0;
1106 nodemask_t tmp;
1107
1108 lru_cache_disable();
1109
1110 mmap_read_lock(mm);
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143 tmp = *from;
1144 while (!nodes_empty(tmp)) {
1145 int s, d;
1146 int source = NUMA_NO_NODE;
1147 int dest = 0;
1148
1149 for_each_node_mask(s, tmp) {
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1167 (node_isset(s, *to)))
1168 continue;
1169
1170 d = node_remap(s, *from, *to);
1171 if (s == d)
1172 continue;
1173
1174 source = s;
1175 dest = d;
1176
1177
1178 if (!node_isset(dest, tmp))
1179 break;
1180 }
1181 if (source == NUMA_NO_NODE)
1182 break;
1183
1184 node_clear(source, tmp);
1185 err = migrate_to_node(mm, source, dest, flags);
1186 if (err > 0)
1187 busy += err;
1188 if (err < 0)
1189 break;
1190 }
1191 mmap_read_unlock(mm);
1192
1193 lru_cache_enable();
1194 if (err < 0)
1195 return err;
1196 return busy;
1197
1198}
1199
1200
1201
1202
1203
1204
1205
1206
1207static struct page *new_page(struct page *page, unsigned long start)
1208{
1209 struct vm_area_struct *vma;
1210 unsigned long address;
1211
1212 vma = find_vma(current->mm, start);
1213 while (vma) {
1214 address = page_address_in_vma(page, vma);
1215 if (address != -EFAULT)
1216 break;
1217 vma = vma->vm_next;
1218 }
1219
1220 if (PageHuge(page)) {
1221 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1222 vma, address);
1223 } else if (PageTransHuge(page)) {
1224 struct page *thp;
1225
1226 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1227 HPAGE_PMD_ORDER);
1228 if (!thp)
1229 return NULL;
1230 prep_transhuge_page(thp);
1231 return thp;
1232 }
1233
1234
1235
1236 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1237 vma, address);
1238}
1239#else
1240
1241static int migrate_page_add(struct page *page, struct list_head *pagelist,
1242 unsigned long flags)
1243{
1244 return -EIO;
1245}
1246
1247int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1248 const nodemask_t *to, int flags)
1249{
1250 return -ENOSYS;
1251}
1252
1253static struct page *new_page(struct page *page, unsigned long start)
1254{
1255 return NULL;
1256}
1257#endif
1258
1259static long do_mbind(unsigned long start, unsigned long len,
1260 unsigned short mode, unsigned short mode_flags,
1261 nodemask_t *nmask, unsigned long flags)
1262{
1263 struct mm_struct *mm = current->mm;
1264 struct mempolicy *new;
1265 unsigned long end;
1266 int err;
1267 int ret;
1268 LIST_HEAD(pagelist);
1269
1270 if (flags & ~(unsigned long)MPOL_MF_VALID)
1271 return -EINVAL;
1272 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1273 return -EPERM;
1274
1275 if (start & ~PAGE_MASK)
1276 return -EINVAL;
1277
1278 if (mode == MPOL_DEFAULT)
1279 flags &= ~MPOL_MF_STRICT;
1280
1281 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1282 end = start + len;
1283
1284 if (end < start)
1285 return -EINVAL;
1286 if (end == start)
1287 return 0;
1288
1289 new = mpol_new(mode, mode_flags, nmask);
1290 if (IS_ERR(new))
1291 return PTR_ERR(new);
1292
1293 if (flags & MPOL_MF_LAZY)
1294 new->flags |= MPOL_F_MOF;
1295
1296
1297
1298
1299
1300 if (!new)
1301 flags |= MPOL_MF_DISCONTIG_OK;
1302
1303 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1304 start, start + len, mode, mode_flags,
1305 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1306
1307 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1308
1309 lru_cache_disable();
1310 }
1311 {
1312 NODEMASK_SCRATCH(scratch);
1313 if (scratch) {
1314 mmap_write_lock(mm);
1315 err = mpol_set_nodemask(new, nmask, scratch);
1316 if (err)
1317 mmap_write_unlock(mm);
1318 } else
1319 err = -ENOMEM;
1320 NODEMASK_SCRATCH_FREE(scratch);
1321 }
1322 if (err)
1323 goto mpol_out;
1324
1325 ret = queue_pages_range(mm, start, end, nmask,
1326 flags | MPOL_MF_INVERT, &pagelist);
1327
1328 if (ret < 0) {
1329 err = ret;
1330 goto up_out;
1331 }
1332
1333 err = mbind_range(mm, start, end, new);
1334
1335 if (!err) {
1336 int nr_failed = 0;
1337
1338 if (!list_empty(&pagelist)) {
1339 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1340 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1341 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1342 if (nr_failed)
1343 putback_movable_pages(&pagelist);
1344 }
1345
1346 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1347 err = -EIO;
1348 } else {
1349up_out:
1350 if (!list_empty(&pagelist))
1351 putback_movable_pages(&pagelist);
1352 }
1353
1354 mmap_write_unlock(mm);
1355mpol_out:
1356 mpol_put(new);
1357 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1358 lru_cache_enable();
1359 return err;
1360}
1361
1362
1363
1364
1365
1366
1367static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1368 unsigned long maxnode)
1369{
1370 unsigned long k;
1371 unsigned long t;
1372 unsigned long nlongs;
1373 unsigned long endmask;
1374
1375 --maxnode;
1376 nodes_clear(*nodes);
1377 if (maxnode == 0 || !nmask)
1378 return 0;
1379 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1380 return -EINVAL;
1381
1382 nlongs = BITS_TO_LONGS(maxnode);
1383 if ((maxnode % BITS_PER_LONG) == 0)
1384 endmask = ~0UL;
1385 else
1386 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1398 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1399 if (get_user(t, nmask + k))
1400 return -EFAULT;
1401 if (k == nlongs - 1) {
1402 if (t & endmask)
1403 return -EINVAL;
1404 } else if (t)
1405 return -EINVAL;
1406 }
1407 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1408 endmask = ~0UL;
1409 }
1410
1411 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1412 unsigned long valid_mask = endmask;
1413
1414 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1415 if (get_user(t, nmask + nlongs - 1))
1416 return -EFAULT;
1417 if (t & valid_mask)
1418 return -EINVAL;
1419 }
1420
1421 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1422 return -EFAULT;
1423 nodes_addr(*nodes)[nlongs-1] &= endmask;
1424 return 0;
1425}
1426
1427
1428static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1429 nodemask_t *nodes)
1430{
1431 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1432 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1433
1434 if (copy > nbytes) {
1435 if (copy > PAGE_SIZE)
1436 return -EINVAL;
1437 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1438 return -EFAULT;
1439 copy = nbytes;
1440 }
1441 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1442}
1443
1444
1445static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1446{
1447 *flags = *mode & MPOL_MODE_FLAGS;
1448 *mode &= ~MPOL_MODE_FLAGS;
1449 if ((unsigned int)(*mode) >= MPOL_MAX)
1450 return -EINVAL;
1451 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1452 return -EINVAL;
1453
1454 return 0;
1455}
1456
1457static long kernel_mbind(unsigned long start, unsigned long len,
1458 unsigned long mode, const unsigned long __user *nmask,
1459 unsigned long maxnode, unsigned int flags)
1460{
1461 unsigned short mode_flags;
1462 nodemask_t nodes;
1463 int lmode = mode;
1464 int err;
1465
1466 start = untagged_addr(start);
1467 err = sanitize_mpol_flags(&lmode, &mode_flags);
1468 if (err)
1469 return err;
1470
1471 err = get_nodes(&nodes, nmask, maxnode);
1472 if (err)
1473 return err;
1474
1475 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1476}
1477
1478SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1479 unsigned long, mode, const unsigned long __user *, nmask,
1480 unsigned long, maxnode, unsigned int, flags)
1481{
1482 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1483}
1484
1485
1486static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1487 unsigned long maxnode)
1488{
1489 unsigned short mode_flags;
1490 nodemask_t nodes;
1491 int lmode = mode;
1492 int err;
1493
1494 err = sanitize_mpol_flags(&lmode, &mode_flags);
1495 if (err)
1496 return err;
1497
1498 err = get_nodes(&nodes, nmask, maxnode);
1499 if (err)
1500 return err;
1501
1502 return do_set_mempolicy(lmode, mode_flags, &nodes);
1503}
1504
1505SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1506 unsigned long, maxnode)
1507{
1508 return kernel_set_mempolicy(mode, nmask, maxnode);
1509}
1510
1511static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1512 const unsigned long __user *old_nodes,
1513 const unsigned long __user *new_nodes)
1514{
1515 struct mm_struct *mm = NULL;
1516 struct task_struct *task;
1517 nodemask_t task_nodes;
1518 int err;
1519 nodemask_t *old;
1520 nodemask_t *new;
1521 NODEMASK_SCRATCH(scratch);
1522
1523 if (!scratch)
1524 return -ENOMEM;
1525
1526 old = &scratch->mask1;
1527 new = &scratch->mask2;
1528
1529 err = get_nodes(old, old_nodes, maxnode);
1530 if (err)
1531 goto out;
1532
1533 err = get_nodes(new, new_nodes, maxnode);
1534 if (err)
1535 goto out;
1536
1537
1538 rcu_read_lock();
1539 task = pid ? find_task_by_vpid(pid) : current;
1540 if (!task) {
1541 rcu_read_unlock();
1542 err = -ESRCH;
1543 goto out;
1544 }
1545 get_task_struct(task);
1546
1547 err = -EINVAL;
1548
1549
1550
1551
1552
1553 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1554 rcu_read_unlock();
1555 err = -EPERM;
1556 goto out_put;
1557 }
1558 rcu_read_unlock();
1559
1560 task_nodes = cpuset_mems_allowed(task);
1561
1562 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1563 err = -EPERM;
1564 goto out_put;
1565 }
1566
1567 task_nodes = cpuset_mems_allowed(current);
1568 nodes_and(*new, *new, task_nodes);
1569 if (nodes_empty(*new))
1570 goto out_put;
1571
1572 err = security_task_movememory(task);
1573 if (err)
1574 goto out_put;
1575
1576 mm = get_task_mm(task);
1577 put_task_struct(task);
1578
1579 if (!mm) {
1580 err = -EINVAL;
1581 goto out;
1582 }
1583
1584 err = do_migrate_pages(mm, old, new,
1585 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1586
1587 mmput(mm);
1588out:
1589 NODEMASK_SCRATCH_FREE(scratch);
1590
1591 return err;
1592
1593out_put:
1594 put_task_struct(task);
1595 goto out;
1596
1597}
1598
1599SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1600 const unsigned long __user *, old_nodes,
1601 const unsigned long __user *, new_nodes)
1602{
1603 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1604}
1605
1606
1607
1608static int kernel_get_mempolicy(int __user *policy,
1609 unsigned long __user *nmask,
1610 unsigned long maxnode,
1611 unsigned long addr,
1612 unsigned long flags)
1613{
1614 int err;
1615 int pval;
1616 nodemask_t nodes;
1617
1618 if (nmask != NULL && maxnode < nr_node_ids)
1619 return -EINVAL;
1620
1621 addr = untagged_addr(addr);
1622
1623 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1624
1625 if (err)
1626 return err;
1627
1628 if (policy && put_user(pval, policy))
1629 return -EFAULT;
1630
1631 if (nmask)
1632 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1633
1634 return err;
1635}
1636
1637SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1638 unsigned long __user *, nmask, unsigned long, maxnode,
1639 unsigned long, addr, unsigned long, flags)
1640{
1641 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1642}
1643
1644#ifdef CONFIG_COMPAT
1645
1646COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1647 compat_ulong_t __user *, nmask,
1648 compat_ulong_t, maxnode,
1649 compat_ulong_t, addr, compat_ulong_t, flags)
1650{
1651 long err;
1652 unsigned long __user *nm = NULL;
1653 unsigned long nr_bits, alloc_size;
1654 DECLARE_BITMAP(bm, MAX_NUMNODES);
1655
1656 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1657 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1658
1659 if (nmask)
1660 nm = compat_alloc_user_space(alloc_size);
1661
1662 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1663
1664 if (!err && nmask) {
1665 unsigned long copy_size;
1666 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1667 err = copy_from_user(bm, nm, copy_size);
1668
1669 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1670 err |= compat_put_bitmap(nmask, bm, nr_bits);
1671 }
1672
1673 return err;
1674}
1675
1676COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1677 compat_ulong_t, maxnode)
1678{
1679 unsigned long __user *nm = NULL;
1680 unsigned long nr_bits, alloc_size;
1681 DECLARE_BITMAP(bm, MAX_NUMNODES);
1682
1683 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1684 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1685
1686 if (nmask) {
1687 if (compat_get_bitmap(bm, nmask, nr_bits))
1688 return -EFAULT;
1689 nm = compat_alloc_user_space(alloc_size);
1690 if (copy_to_user(nm, bm, alloc_size))
1691 return -EFAULT;
1692 }
1693
1694 return kernel_set_mempolicy(mode, nm, nr_bits+1);
1695}
1696
1697COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1698 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1699 compat_ulong_t, maxnode, compat_ulong_t, flags)
1700{
1701 unsigned long __user *nm = NULL;
1702 unsigned long nr_bits, alloc_size;
1703 nodemask_t bm;
1704
1705 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1706 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1707
1708 if (nmask) {
1709 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1710 return -EFAULT;
1711 nm = compat_alloc_user_space(alloc_size);
1712 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1713 return -EFAULT;
1714 }
1715
1716 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1717}
1718
1719COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1720 compat_ulong_t, maxnode,
1721 const compat_ulong_t __user *, old_nodes,
1722 const compat_ulong_t __user *, new_nodes)
1723{
1724 unsigned long __user *old = NULL;
1725 unsigned long __user *new = NULL;
1726 nodemask_t tmp_mask;
1727 unsigned long nr_bits;
1728 unsigned long size;
1729
1730 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1731 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1732 if (old_nodes) {
1733 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1734 return -EFAULT;
1735 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1736 if (new_nodes)
1737 new = old + size / sizeof(unsigned long);
1738 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1739 return -EFAULT;
1740 }
1741 if (new_nodes) {
1742 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1743 return -EFAULT;
1744 if (new == NULL)
1745 new = compat_alloc_user_space(size);
1746 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1747 return -EFAULT;
1748 }
1749 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1750}
1751
1752#endif
1753
1754bool vma_migratable(struct vm_area_struct *vma)
1755{
1756 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1757 return false;
1758
1759
1760
1761
1762
1763 if (vma_is_dax(vma))
1764 return false;
1765
1766 if (is_vm_hugetlb_page(vma) &&
1767 !hugepage_migration_supported(hstate_vma(vma)))
1768 return false;
1769
1770
1771
1772
1773
1774
1775 if (vma->vm_file &&
1776 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1777 < policy_zone)
1778 return false;
1779 return true;
1780}
1781
1782struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1783 unsigned long addr)
1784{
1785 struct mempolicy *pol = NULL;
1786
1787 if (vma) {
1788 if (vma->vm_ops && vma->vm_ops->get_policy) {
1789 pol = vma->vm_ops->get_policy(vma, addr);
1790 } else if (vma->vm_policy) {
1791 pol = vma->vm_policy;
1792
1793
1794
1795
1796
1797
1798
1799 if (mpol_needs_cond_ref(pol))
1800 mpol_get(pol);
1801 }
1802 }
1803
1804 return pol;
1805}
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1820 unsigned long addr)
1821{
1822 struct mempolicy *pol = __get_vma_policy(vma, addr);
1823
1824 if (!pol)
1825 pol = get_task_policy(current);
1826
1827 return pol;
1828}
1829
1830bool vma_policy_mof(struct vm_area_struct *vma)
1831{
1832 struct mempolicy *pol;
1833
1834 if (vma->vm_ops && vma->vm_ops->get_policy) {
1835 bool ret = false;
1836
1837 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1838 if (pol && (pol->flags & MPOL_F_MOF))
1839 ret = true;
1840 mpol_cond_put(pol);
1841
1842 return ret;
1843 }
1844
1845 pol = vma->vm_policy;
1846 if (!pol)
1847 pol = get_task_policy(current);
1848
1849 return pol->flags & MPOL_F_MOF;
1850}
1851
1852static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1853{
1854 enum zone_type dynamic_policy_zone = policy_zone;
1855
1856 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1867 dynamic_policy_zone = ZONE_MOVABLE;
1868
1869 return zone >= dynamic_policy_zone;
1870}
1871
1872
1873
1874
1875
1876nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1877{
1878
1879 if (unlikely(policy->mode == MPOL_BIND) &&
1880 apply_policy_zone(policy, gfp_zone(gfp)) &&
1881 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1882 return &policy->nodes;
1883
1884 return NULL;
1885}
1886
1887
1888static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1889{
1890 if (policy->mode == MPOL_PREFERRED) {
1891 nd = first_node(policy->nodes);
1892 } else {
1893
1894
1895
1896
1897
1898 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1899 }
1900
1901 return nd;
1902}
1903
1904
1905static unsigned interleave_nodes(struct mempolicy *policy)
1906{
1907 unsigned next;
1908 struct task_struct *me = current;
1909
1910 next = next_node_in(me->il_prev, policy->nodes);
1911 if (next < MAX_NUMNODES)
1912 me->il_prev = next;
1913 return next;
1914}
1915
1916
1917
1918
1919
1920unsigned int mempolicy_slab_node(void)
1921{
1922 struct mempolicy *policy;
1923 int node = numa_mem_id();
1924
1925 if (in_interrupt())
1926 return node;
1927
1928 policy = current->mempolicy;
1929 if (!policy)
1930 return node;
1931
1932 switch (policy->mode) {
1933 case MPOL_PREFERRED:
1934 return first_node(policy->nodes);
1935
1936 case MPOL_INTERLEAVE:
1937 return interleave_nodes(policy);
1938
1939 case MPOL_BIND: {
1940 struct zoneref *z;
1941
1942
1943
1944
1945
1946 struct zonelist *zonelist;
1947 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1948 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1949 z = first_zones_zonelist(zonelist, highest_zoneidx,
1950 &policy->nodes);
1951 return z->zone ? zone_to_nid(z->zone) : node;
1952 }
1953 case MPOL_LOCAL:
1954 return node;
1955
1956 default:
1957 BUG();
1958 }
1959}
1960
1961
1962
1963
1964
1965
1966static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1967{
1968 unsigned nnodes = nodes_weight(pol->nodes);
1969 unsigned target;
1970 int i;
1971 int nid;
1972
1973 if (!nnodes)
1974 return numa_node_id();
1975 target = (unsigned int)n % nnodes;
1976 nid = first_node(pol->nodes);
1977 for (i = 0; i < target; i++)
1978 nid = next_node(nid, pol->nodes);
1979 return nid;
1980}
1981
1982
1983static inline unsigned interleave_nid(struct mempolicy *pol,
1984 struct vm_area_struct *vma, unsigned long addr, int shift)
1985{
1986 if (vma) {
1987 unsigned long off;
1988
1989
1990
1991
1992
1993
1994
1995
1996 BUG_ON(shift < PAGE_SHIFT);
1997 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1998 off += (addr - vma->vm_start) >> shift;
1999 return offset_il_node(pol, off);
2000 } else
2001 return interleave_nodes(pol);
2002}
2003
2004#ifdef CONFIG_HUGETLBFS
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2021 struct mempolicy **mpol, nodemask_t **nodemask)
2022{
2023 int nid;
2024
2025 *mpol = get_vma_policy(vma, addr);
2026 *nodemask = NULL;
2027
2028 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2029 nid = interleave_nid(*mpol, vma, addr,
2030 huge_page_shift(hstate_vma(vma)));
2031 } else {
2032 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2033 if ((*mpol)->mode == MPOL_BIND)
2034 *nodemask = &(*mpol)->nodes;
2035 }
2036 return nid;
2037}
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055bool init_nodemask_of_mempolicy(nodemask_t *mask)
2056{
2057 struct mempolicy *mempolicy;
2058
2059 if (!(mask && current->mempolicy))
2060 return false;
2061
2062 task_lock(current);
2063 mempolicy = current->mempolicy;
2064 switch (mempolicy->mode) {
2065 case MPOL_PREFERRED:
2066 case MPOL_BIND:
2067 case MPOL_INTERLEAVE:
2068 *mask = mempolicy->nodes;
2069 break;
2070
2071 case MPOL_LOCAL:
2072 init_nodemask_of_node(mask, numa_node_id());
2073 break;
2074
2075 default:
2076 BUG();
2077 }
2078 task_unlock(current);
2079
2080 return true;
2081}
2082#endif
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094bool mempolicy_in_oom_domain(struct task_struct *tsk,
2095 const nodemask_t *mask)
2096{
2097 struct mempolicy *mempolicy;
2098 bool ret = true;
2099
2100 if (!mask)
2101 return ret;
2102
2103 task_lock(tsk);
2104 mempolicy = tsk->mempolicy;
2105 if (mempolicy && mempolicy->mode == MPOL_BIND)
2106 ret = nodes_intersects(mempolicy->nodes, *mask);
2107 task_unlock(tsk);
2108
2109 return ret;
2110}
2111
2112
2113
2114static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2115 unsigned nid)
2116{
2117 struct page *page;
2118
2119 page = __alloc_pages(gfp, order, nid, NULL);
2120
2121 if (!static_branch_likely(&vm_numa_stat_key))
2122 return page;
2123 if (page && page_to_nid(page) == nid) {
2124 preempt_disable();
2125 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2126 preempt_enable();
2127 }
2128 return page;
2129}
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2148 unsigned long addr, int node, bool hugepage)
2149{
2150 struct mempolicy *pol;
2151 struct page *page;
2152 int preferred_nid;
2153 nodemask_t *nmask;
2154
2155 pol = get_vma_policy(vma, addr);
2156
2157 if (pol->mode == MPOL_INTERLEAVE) {
2158 unsigned nid;
2159
2160 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2161 mpol_cond_put(pol);
2162 page = alloc_page_interleave(gfp, order, nid);
2163 goto out;
2164 }
2165
2166 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2167 int hpage_node = node;
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179 if (pol->mode == MPOL_PREFERRED)
2180 hpage_node = first_node(pol->nodes);
2181
2182 nmask = policy_nodemask(gfp, pol);
2183 if (!nmask || node_isset(hpage_node, *nmask)) {
2184 mpol_cond_put(pol);
2185
2186
2187
2188
2189 page = __alloc_pages_node(hpage_node,
2190 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2191
2192
2193
2194
2195
2196
2197
2198 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2199 page = __alloc_pages_node(hpage_node,
2200 gfp, order);
2201
2202 goto out;
2203 }
2204 }
2205
2206 nmask = policy_nodemask(gfp, pol);
2207 preferred_nid = policy_node(gfp, pol, node);
2208 page = __alloc_pages(gfp, order, preferred_nid, nmask);
2209 mpol_cond_put(pol);
2210out:
2211 return page;
2212}
2213EXPORT_SYMBOL(alloc_pages_vma);
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229struct page *alloc_pages(gfp_t gfp, unsigned order)
2230{
2231 struct mempolicy *pol = &default_policy;
2232 struct page *page;
2233
2234 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2235 pol = get_task_policy(current);
2236
2237
2238
2239
2240
2241 if (pol->mode == MPOL_INTERLEAVE)
2242 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2243 else
2244 page = __alloc_pages(gfp, order,
2245 policy_node(gfp, pol, numa_node_id()),
2246 policy_nodemask(gfp, pol));
2247
2248 return page;
2249}
2250EXPORT_SYMBOL(alloc_pages);
2251
2252int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2253{
2254 struct mempolicy *pol = mpol_dup(vma_policy(src));
2255
2256 if (IS_ERR(pol))
2257 return PTR_ERR(pol);
2258 dst->vm_policy = pol;
2259 return 0;
2260}
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274struct mempolicy *__mpol_dup(struct mempolicy *old)
2275{
2276 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2277
2278 if (!new)
2279 return ERR_PTR(-ENOMEM);
2280
2281
2282 if (old == current->mempolicy) {
2283 task_lock(current);
2284 *new = *old;
2285 task_unlock(current);
2286 } else
2287 *new = *old;
2288
2289 if (current_cpuset_is_being_rebound()) {
2290 nodemask_t mems = cpuset_mems_allowed(current);
2291 mpol_rebind_policy(new, &mems);
2292 }
2293 atomic_set(&new->refcnt, 1);
2294 return new;
2295}
2296
2297
2298bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2299{
2300 if (!a || !b)
2301 return false;
2302 if (a->mode != b->mode)
2303 return false;
2304 if (a->flags != b->flags)
2305 return false;
2306 if (mpol_store_user_nodemask(a))
2307 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2308 return false;
2309
2310 switch (a->mode) {
2311 case MPOL_BIND:
2312 case MPOL_INTERLEAVE:
2313 case MPOL_PREFERRED:
2314 return !!nodes_equal(a->nodes, b->nodes);
2315 case MPOL_LOCAL:
2316 return true;
2317 default:
2318 BUG();
2319 return false;
2320 }
2321}
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336static struct sp_node *
2337sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2338{
2339 struct rb_node *n = sp->root.rb_node;
2340
2341 while (n) {
2342 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2343
2344 if (start >= p->end)
2345 n = n->rb_right;
2346 else if (end <= p->start)
2347 n = n->rb_left;
2348 else
2349 break;
2350 }
2351 if (!n)
2352 return NULL;
2353 for (;;) {
2354 struct sp_node *w = NULL;
2355 struct rb_node *prev = rb_prev(n);
2356 if (!prev)
2357 break;
2358 w = rb_entry(prev, struct sp_node, nd);
2359 if (w->end <= start)
2360 break;
2361 n = prev;
2362 }
2363 return rb_entry(n, struct sp_node, nd);
2364}
2365
2366
2367
2368
2369
2370static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2371{
2372 struct rb_node **p = &sp->root.rb_node;
2373 struct rb_node *parent = NULL;
2374 struct sp_node *nd;
2375
2376 while (*p) {
2377 parent = *p;
2378 nd = rb_entry(parent, struct sp_node, nd);
2379 if (new->start < nd->start)
2380 p = &(*p)->rb_left;
2381 else if (new->end > nd->end)
2382 p = &(*p)->rb_right;
2383 else
2384 BUG();
2385 }
2386 rb_link_node(&new->nd, parent, p);
2387 rb_insert_color(&new->nd, &sp->root);
2388 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2389 new->policy ? new->policy->mode : 0);
2390}
2391
2392
2393struct mempolicy *
2394mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2395{
2396 struct mempolicy *pol = NULL;
2397 struct sp_node *sn;
2398
2399 if (!sp->root.rb_node)
2400 return NULL;
2401 read_lock(&sp->lock);
2402 sn = sp_lookup(sp, idx, idx+1);
2403 if (sn) {
2404 mpol_get(sn->policy);
2405 pol = sn->policy;
2406 }
2407 read_unlock(&sp->lock);
2408 return pol;
2409}
2410
2411static void sp_free(struct sp_node *n)
2412{
2413 mpol_put(n->policy);
2414 kmem_cache_free(sn_cache, n);
2415}
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2432{
2433 struct mempolicy *pol;
2434 struct zoneref *z;
2435 int curnid = page_to_nid(page);
2436 unsigned long pgoff;
2437 int thiscpu = raw_smp_processor_id();
2438 int thisnid = cpu_to_node(thiscpu);
2439 int polnid = NUMA_NO_NODE;
2440 int ret = -1;
2441
2442 pol = get_vma_policy(vma, addr);
2443 if (!(pol->flags & MPOL_F_MOF))
2444 goto out;
2445
2446 switch (pol->mode) {
2447 case MPOL_INTERLEAVE:
2448 pgoff = vma->vm_pgoff;
2449 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2450 polnid = offset_il_node(pol, pgoff);
2451 break;
2452
2453 case MPOL_PREFERRED:
2454 polnid = first_node(pol->nodes);
2455 break;
2456
2457 case MPOL_LOCAL:
2458 polnid = numa_node_id();
2459 break;
2460
2461 case MPOL_BIND:
2462
2463 if (pol->flags & MPOL_F_MORON) {
2464 if (node_isset(thisnid, pol->nodes))
2465 break;
2466 goto out;
2467 }
2468
2469
2470
2471
2472
2473
2474
2475 if (node_isset(curnid, pol->nodes))
2476 goto out;
2477 z = first_zones_zonelist(
2478 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2479 gfp_zone(GFP_HIGHUSER),
2480 &pol->nodes);
2481 polnid = zone_to_nid(z->zone);
2482 break;
2483
2484 default:
2485 BUG();
2486 }
2487
2488
2489 if (pol->flags & MPOL_F_MORON) {
2490 polnid = thisnid;
2491
2492 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2493 goto out;
2494 }
2495
2496 if (curnid != polnid)
2497 ret = polnid;
2498out:
2499 mpol_cond_put(pol);
2500
2501 return ret;
2502}
2503
2504
2505
2506
2507
2508
2509
2510void mpol_put_task_policy(struct task_struct *task)
2511{
2512 struct mempolicy *pol;
2513
2514 task_lock(task);
2515 pol = task->mempolicy;
2516 task->mempolicy = NULL;
2517 task_unlock(task);
2518 mpol_put(pol);
2519}
2520
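/* Caller must hold sp->lock for writing. */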
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
2527
2528static void sp_node_init(struct sp_node *node, unsigned long start,
2529 unsigned long end, struct mempolicy *pol)
2530{
2531 node->start = start;
2532 node->end = end;
2533 node->policy = pol;
2534}
2535
2536static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2537 struct mempolicy *pol)
2538{
2539 struct sp_node *n;
2540 struct mempolicy *newpol;
2541
2542 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2543 if (!n)
2544 return NULL;
2545
2546 newpol = mpol_dup(pol);
2547 if (IS_ERR(newpol)) {
2548 kmem_cache_free(sn_cache, n);
2549 return NULL;
2550 }
2551 newpol->flags |= MPOL_F_SHARED;
2552 sp_node_init(n, start, end, newpol);
2553
2554 return n;
2555}
2556
2557
2558static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2559 unsigned long end, struct sp_node *new)
2560{
2561 struct sp_node *n;
2562 struct sp_node *n_new = NULL;
2563 struct mempolicy *mpol_new = NULL;
2564 int ret = 0;
2565
2566restart:
2567 write_lock(&sp->lock);
2568 n = sp_lookup(sp, start, end);
2569
2570 while (n && n->start < end) {
2571 struct rb_node *next = rb_next(&n->nd);
2572 if (n->start >= start) {
2573 if (n->end <= end)
2574 sp_delete(sp, n);
2575 else
2576 n->start = end;
2577 } else {
2578
2579 if (n->end > end) {
2580 if (!n_new)
2581 goto alloc_new;
2582
2583 *mpol_new = *n->policy;
2584 atomic_set(&mpol_new->refcnt, 1);
2585 sp_node_init(n_new, end, n->end, mpol_new);
2586 n->end = start;
2587 sp_insert(sp, n_new);
2588 n_new = NULL;
2589 mpol_new = NULL;
2590 break;
2591 } else
2592 n->end = start;
2593 }
2594 if (!next)
2595 break;
2596 n = rb_entry(next, struct sp_node, nd);
2597 }
2598 if (new)
2599 sp_insert(sp, new);
2600 write_unlock(&sp->lock);
2601 ret = 0;
2602
2603err_out:
2604 if (mpol_new)
2605 mpol_put(mpol_new);
2606 if (n_new)
2607 kmem_cache_free(sn_cache, n_new);
2608
2609 return ret;
2610
2611alloc_new:
2612 write_unlock(&sp->lock);
2613 ret = -ENOMEM;
2614 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2615 if (!n_new)
2616 goto err_out;
2617 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2618 if (!mpol_new)
2619 goto err_out;
2620 goto restart;
2621}
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2634{
2635 int ret;
2636
2637 sp->root = RB_ROOT;
2638 rwlock_init(&sp->lock);
2639
2640 if (mpol) {
2641 struct vm_area_struct pvma;
2642 struct mempolicy *new;
2643 NODEMASK_SCRATCH(scratch);
2644
2645 if (!scratch)
2646 goto put_mpol;
2647
2648 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2649 if (IS_ERR(new))
2650 goto free_scratch;
2651
2652 task_lock(current);
2653 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2654 task_unlock(current);
2655 if (ret)
2656 goto put_new;
2657
2658
2659 vma_init(&pvma, NULL);
2660 pvma.vm_end = TASK_SIZE;
2661 mpol_set_shared_policy(sp, &pvma, new);
2662
2663put_new:
2664 mpol_put(new);
2665free_scratch:
2666 NODEMASK_SCRATCH_FREE(scratch);
2667put_mpol:
2668 mpol_put(mpol);
2669 }
2670}
2671
2672int mpol_set_shared_policy(struct shared_policy *info,
2673 struct vm_area_struct *vma, struct mempolicy *npol)
2674{
2675 int err;
2676 struct sp_node *new = NULL;
2677 unsigned long sz = vma_pages(vma);
2678
2679 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2680 vma->vm_pgoff,
2681 sz, npol ? npol->mode : -1,
2682 npol ? npol->flags : -1,
2683 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2684
2685 if (npol) {
2686 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2687 if (!new)
2688 return -ENOMEM;
2689 }
2690 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2691 if (err && new)
2692 sp_free(new);
2693 return err;
2694}
2695
2696
2697void mpol_free_shared_policy(struct shared_policy *p)
2698{
2699 struct sp_node *n;
2700 struct rb_node *next;
2701
2702 if (!p->root.rb_node)
2703 return;
2704 write_lock(&p->lock);
2705 next = rb_first(&p->root);
2706 while (next) {
2707 n = rb_entry(next, struct sp_node, nd);
2708 next = rb_next(&n->nd);
2709 sp_delete(p, n);
2710 }
2711 write_unlock(&p->lock);
2712}
2713
2714#ifdef CONFIG_NUMA_BALANCING
2715static int __initdata numabalancing_override;
2716
2717static void __init check_numabalancing_enable(void)
2718{
2719 bool numabalancing_default = false;
2720
2721 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2722 numabalancing_default = true;
2723
2724
2725 if (numabalancing_override)
2726 set_numabalancing_state(numabalancing_override == 1);
2727
2728 if (num_online_nodes() > 1 && !numabalancing_override) {
2729 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2730 numabalancing_default ? "Enabling" : "Disabling");
2731 set_numabalancing_state(numabalancing_default);
2732 }
2733}
2734
2735static int __init setup_numabalancing(char *str)
2736{
2737 int ret = 0;
2738 if (!str)
2739 goto out;
2740
2741 if (!strcmp(str, "enable")) {
2742 numabalancing_override = 1;
2743 ret = 1;
2744 } else if (!strcmp(str, "disable")) {
2745 numabalancing_override = -1;
2746 ret = 1;
2747 }
2748out:
2749 if (!ret)
2750 pr_warn("Unable to parse numa_balancing=\n");
2751
2752 return ret;
2753}
2754__setup("numa_balancing=", setup_numabalancing);
2755#else
2756static inline void __init check_numabalancing_enable(void)
2757{
2758}
2759#endif
2760
2761
2762void __init numa_policy_init(void)
2763{
2764 nodemask_t interleave_nodes;
2765 unsigned long largest = 0;
2766 int nid, prefer = 0;
2767
2768 policy_cache = kmem_cache_create("numa_policy",
2769 sizeof(struct mempolicy),
2770 0, SLAB_PANIC, NULL);
2771
2772 sn_cache = kmem_cache_create("shared_policy_node",
2773 sizeof(struct sp_node),
2774 0, SLAB_PANIC, NULL);
2775
2776 for_each_node(nid) {
2777 preferred_node_policy[nid] = (struct mempolicy) {
2778 .refcnt = ATOMIC_INIT(1),
2779 .mode = MPOL_PREFERRED,
2780 .flags = MPOL_F_MOF | MPOL_F_MORON,
2781 .nodes = nodemask_of_node(nid),
2782 };
2783 }
2784
2785
2786
2787
2788
2789
2790 nodes_clear(interleave_nodes);
2791 for_each_node_state(nid, N_MEMORY) {
2792 unsigned long total_pages = node_present_pages(nid);
2793
2794
2795 if (largest < total_pages) {
2796 largest = total_pages;
2797 prefer = nid;
2798 }
2799
2800
2801 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2802 node_set(nid, interleave_nodes);
2803 }
2804
2805
2806 if (unlikely(nodes_empty(interleave_nodes)))
2807 node_set(prefer, interleave_nodes);
2808
2809 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2810 pr_err("%s: interleaving failed\n", __func__);
2811
2812 check_numabalancing_enable();
2813}
2814
2815
2816void numa_default_policy(void)
2817{
2818 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2819}
2820
2821
2822
2823
2824
2825static const char * const policy_modes[] =
2826{
2827 [MPOL_DEFAULT] = "default",
2828 [MPOL_PREFERRED] = "prefer",
2829 [MPOL_BIND] = "bind",
2830 [MPOL_INTERLEAVE] = "interleave",
2831 [MPOL_LOCAL] = "local",
2832};
2833
2834
2835#ifdef CONFIG_TMPFS
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846int mpol_parse_str(char *str, struct mempolicy **mpol)
2847{
2848 struct mempolicy *new = NULL;
2849 unsigned short mode_flags;
2850 nodemask_t nodes;
2851 char *nodelist = strchr(str, ':');
2852 char *flags = strchr(str, '=');
2853 int err = 1, mode;
2854
2855 if (flags)
2856 *flags++ = '\0';
2857
2858 if (nodelist) {
2859
2860 *nodelist++ = '\0';
2861 if (nodelist_parse(nodelist, nodes))
2862 goto out;
2863 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2864 goto out;
2865 } else
2866 nodes_clear(nodes);
2867
2868 mode = match_string(policy_modes, MPOL_MAX, str);
2869 if (mode < 0)
2870 goto out;
2871
2872 switch (mode) {
2873 case MPOL_PREFERRED:
2874
2875
2876
2877
2878
2879 if (nodelist) {
2880 char *rest = nodelist;
2881 while (isdigit(*rest))
2882 rest++;
2883 if (*rest)
2884 goto out;
2885 if (nodes_empty(nodes))
2886 goto out;
2887 }
2888 break;
2889 case MPOL_INTERLEAVE:
2890
2891
2892
2893 if (!nodelist)
2894 nodes = node_states[N_MEMORY];
2895 break;
2896 case MPOL_LOCAL:
2897
2898
2899
2900 if (nodelist)
2901 goto out;
2902 break;
2903 case MPOL_DEFAULT:
2904
2905
2906
2907 if (!nodelist)
2908 err = 0;
2909 goto out;
2910 case MPOL_BIND:
2911
2912
2913
2914 if (!nodelist)
2915 goto out;
2916 }
2917
2918 mode_flags = 0;
2919 if (flags) {
2920
2921
2922
2923
2924 if (!strcmp(flags, "static"))
2925 mode_flags |= MPOL_F_STATIC_NODES;
2926 else if (!strcmp(flags, "relative"))
2927 mode_flags |= MPOL_F_RELATIVE_NODES;
2928 else
2929 goto out;
2930 }
2931
2932 new = mpol_new(mode, mode_flags, &nodes);
2933 if (IS_ERR(new))
2934 goto out;
2935
2936
2937
2938
2939
2940 if (mode != MPOL_PREFERRED) {
2941 new->nodes = nodes;
2942 } else if (nodelist) {
2943 nodes_clear(new->nodes);
2944 node_set(first_node(nodes), new->nodes);
2945 } else {
2946 new->mode = MPOL_LOCAL;
2947 }
2948
2949
2950
2951
2952
2953 new->w.user_nodemask = nodes;
2954
2955 err = 0;
2956
2957out:
2958
2959 if (nodelist)
2960 *--nodelist = ':';
2961 if (flags)
2962 *--flags = '=';
2963 if (!err)
2964 *mpol = new;
2965 return err;
2966}
2967#endif
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2980{
2981 char *p = buffer;
2982 nodemask_t nodes = NODE_MASK_NONE;
2983 unsigned short mode = MPOL_DEFAULT;
2984 unsigned short flags = 0;
2985
2986 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2987 mode = pol->mode;
2988 flags = pol->flags;
2989 }
2990
2991 switch (mode) {
2992 case MPOL_DEFAULT:
2993 case MPOL_LOCAL:
2994 break;
2995 case MPOL_PREFERRED:
2996 case MPOL_BIND:
2997 case MPOL_INTERLEAVE:
2998 nodes = pol->nodes;
2999 break;
3000 default:
3001 WARN_ON_ONCE(1);
3002 snprintf(p, maxlen, "unknown");
3003 return;
3004 }
3005
3006 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3007
3008 if (flags & MPOL_MODE_FLAGS) {
3009 p += snprintf(p, buffer + maxlen - p, "=");
3010
3011
3012
3013
3014 if (flags & MPOL_F_STATIC_NODES)
3015 p += snprintf(p, buffer + maxlen - p, "static");
3016 else if (flags & MPOL_F_RELATIVE_NODES)
3017 p += snprintf(p, buffer + maxlen - p, "relative");
3018 }
3019
3020 if (!nodes_empty(nodes))
3021 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3022 nodemask_pr_args(&nodes));
3023}
3024