/*
 * Simple NUMA memory policy for the Linux kernel.
 */
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not policied.
 */
enum zone_type policy_zone = 0;

/*
 * The default policy: prefer allocation on the local node.
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1),	/* never freed */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	k = policy_zone;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

/* Does the policy remember the user-supplied nodemask across rebinds? */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

/* Map a relative nodemask onto the currently allowed nodes */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

/*
 * Per-mode ->create() helpers: validate the nodemask and store it in the
 * policy.
 */
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

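/*
 * mpol_new() - create a new memory policy.
 *
 * Returns NULL for MPOL_DEFAULT, an ERR_PTR() on invalid arguments or
 * allocation failure, otherwise a freshly allocated policy with a reference
 * count of 1.  The caller's nodemask is contextualized against the current
 * cpuset before being handed to the per-mode ->create() callback.
 */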
184static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
185 nodemask_t *nodes)
186{
187 struct mempolicy *policy;
188 nodemask_t cpuset_context_nmask;
189 int ret;
190
191 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
192 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
193
194 if (mode == MPOL_DEFAULT) {
195 if (nodes && !nodes_empty(*nodes))
196 return ERR_PTR(-EINVAL);
197 return NULL;
198 }
199 VM_BUG_ON(!nodes);
200
201
202
203
204
205
206 if (mode == MPOL_PREFERRED) {
207 if (nodes_empty(*nodes)) {
208 if (((flags & MPOL_F_STATIC_NODES) ||
209 (flags & MPOL_F_RELATIVE_NODES)))
210 return ERR_PTR(-EINVAL);
211 nodes = NULL;
212 }
213 } else if (nodes_empty(*nodes))
214 return ERR_PTR(-EINVAL);
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 if (!policy)
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
219 policy->mode = mode;
220 policy->flags = flags;
221
222 if (nodes) {
223
224
225
226 cpuset_update_task_memory_state();
227 if (flags & MPOL_F_RELATIVE_NODES)
228 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
229 &cpuset_current_mems_allowed);
230 else
231 nodes_and(cpuset_context_nmask, *nodes,
232 cpuset_current_mems_allowed);
233 if (mpol_store_user_nodemask(policy))
234 policy->w.user_nodemask = *nodes;
235 else
236 policy->w.cpuset_mems_allowed =
237 cpuset_mems_allowed(current);
238 }
239
240 ret = mpol_ops[mode].create(policy,
241 nodes ? &cpuset_context_nmask : NULL);
242 if (ret < 0) {
243 kmem_cache_free(policy_cache, policy);
244 return ERR_PTR(ret);
245 }
246 return policy;
247}
248
249
250void __mpol_put(struct mempolicy *p)
251{
252 if (!atomic_dec_and_test(&p->refcnt))
253 return;
254 kmem_cache_free(policy_cache, p);
255}
256
257static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
258{
259}
260
261static void mpol_rebind_nodemask(struct mempolicy *pol,
262 const nodemask_t *nodes)
263{
264 nodemask_t tmp;
265
266 if (pol->flags & MPOL_F_STATIC_NODES)
267 nodes_and(tmp, pol->w.user_nodemask, *nodes);
268 else if (pol->flags & MPOL_F_RELATIVE_NODES)
269 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
270 else {
271 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
272 *nodes);
273 pol->w.cpuset_mems_allowed = *nodes;
274 }
275
276 pol->v.nodes = tmp;
277 if (!node_isset(current->il_next, tmp)) {
278 current->il_next = next_node(current->il_next, tmp);
279 if (current->il_next >= MAX_NUMNODES)
280 current->il_next = first_node(tmp);
281 if (current->il_next >= MAX_NUMNODES)
282 current->il_next = numa_node_id();
283 }
284}
285
286static void mpol_rebind_preferred(struct mempolicy *pol,
287 const nodemask_t *nodes)
288{
289 nodemask_t tmp;
290
291 if (pol->flags & MPOL_F_STATIC_NODES) {
292 int node = first_node(pol->w.user_nodemask);
293
294 if (node_isset(node, *nodes)) {
295 pol->v.preferred_node = node;
296 pol->flags &= ~MPOL_F_LOCAL;
297 } else
298 pol->flags |= MPOL_F_LOCAL;
299 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
300 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
301 pol->v.preferred_node = first_node(tmp);
302 } else if (!(pol->flags & MPOL_F_LOCAL)) {
303 pol->v.preferred_node = node_remap(pol->v.preferred_node,
304 pol->w.cpuset_mems_allowed,
305 *nodes);
306 pol->w.cpuset_mems_allowed = *nodes;
307 }
308}
309
310
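/*
 * Remap a policy to a new set of allowed nodes (e.g. after a cpuset change).
 * Returns early if the policy does not remember a user nodemask and its
 * cached cpuset mask is already up to date; otherwise the per-mode
 * ->rebind() handler does the remapping.
 */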
311static void mpol_rebind_policy(struct mempolicy *pol,
312 const nodemask_t *newmask)
313{
314 if (!pol)
315 return;
316 if (!mpol_store_user_nodemask(pol) &&
317 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
318 return;
319 mpol_ops[pol->mode].rebind(pol, newmask);
320}
321
322
323
324
325
326
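/* Wrapper for mpol_rebind_policy() on the task's own policy. */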
327void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
328{
329 mpol_rebind_policy(tsk->mempolicy, new);
330}
331
332
333
334
335
336
337
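/*
 * Rebind each VMA policy in this mm after a change of the allowed nodes.
 * Holds mmap_sem for writing while walking the VMA list.
 */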
338void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
339{
340 struct vm_area_struct *vma;
341
342 down_write(&mm->mmap_sem);
343 for (vma = mm->mmap; vma; vma = vma->vm_next)
344 mpol_rebind_policy(vma->vm_policy, new);
345 up_write(&mm->mmap_sem);
346}
347
348static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
349 [MPOL_DEFAULT] = {
350 .rebind = mpol_rebind_default,
351 },
352 [MPOL_INTERLEAVE] = {
353 .create = mpol_new_interleave,
354 .rebind = mpol_rebind_nodemask,
355 },
356 [MPOL_PREFERRED] = {
357 .create = mpol_new_preferred,
358 .rebind = mpol_rebind_preferred,
359 },
360 [MPOL_BIND] = {
361 .create = mpol_new_bind,
362 .rebind = mpol_rebind_nodemask,
363 },
364};
365
366static void gather_stats(struct page *, void *, int pte_dirty);
367static void migrate_page_add(struct page *page, struct list_head *pagelist,
368 unsigned long flags);
369
370
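/*
 * Scan the PTEs of one pmd in [addr, end).  Mapped, non-reserved pages
 * selected by the @nodes mask (sense inverted by MPOL_MF_INVERT) are either
 * counted (MPOL_MF_STATS) or queued for migration (MPOL_MF_MOVE*).  Returns
 * non-zero when the walk stopped early because a selected page could not be
 * handled.
 */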
371static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
372 unsigned long addr, unsigned long end,
373 const nodemask_t *nodes, unsigned long flags,
374 void *private)
375{
376 pte_t *orig_pte;
377 pte_t *pte;
378 spinlock_t *ptl;
379
380 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
381 do {
382 struct page *page;
383 int nid;
384
385 if (!pte_present(*pte))
386 continue;
387 page = vm_normal_page(vma, addr, *pte);
388 if (!page)
389 continue;
390
391
392
393
394
395
396
397
398
399
400
401 if (PageReserved(page))
402 continue;
403 nid = page_to_nid(page);
404 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
405 continue;
406
407 if (flags & MPOL_MF_STATS)
408 gather_stats(page, private, pte_dirty(*pte));
409 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
410 migrate_page_add(page, private, flags);
411 else
412 break;
413 } while (pte++, addr += PAGE_SIZE, addr != end);
414 pte_unmap_unlock(orig_pte, ptl);
415 return addr != end;
416}
417
418static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
419 unsigned long addr, unsigned long end,
420 const nodemask_t *nodes, unsigned long flags,
421 void *private)
422{
423 pmd_t *pmd;
424 unsigned long next;
425
426 pmd = pmd_offset(pud, addr);
427 do {
428 next = pmd_addr_end(addr, end);
429 if (pmd_none_or_clear_bad(pmd))
430 continue;
431 if (check_pte_range(vma, pmd, addr, next, nodes,
432 flags, private))
433 return -EIO;
434 } while (pmd++, addr = next, addr != end);
435 return 0;
436}
437
438static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
439 unsigned long addr, unsigned long end,
440 const nodemask_t *nodes, unsigned long flags,
441 void *private)
442{
443 pud_t *pud;
444 unsigned long next;
445
446 pud = pud_offset(pgd, addr);
447 do {
448 next = pud_addr_end(addr, end);
449 if (pud_none_or_clear_bad(pud))
450 continue;
451 if (check_pmd_range(vma, pud, addr, next, nodes,
452 flags, private))
453 return -EIO;
454 } while (pud++, addr = next, addr != end);
455 return 0;
456}
457
458static inline int check_pgd_range(struct vm_area_struct *vma,
459 unsigned long addr, unsigned long end,
460 const nodemask_t *nodes, unsigned long flags,
461 void *private)
462{
463 pgd_t *pgd;
464 unsigned long next;
465
466 pgd = pgd_offset(vma->vm_mm, addr);
467 do {
468 next = pgd_addr_end(addr, end);
469 if (pgd_none_or_clear_bad(pgd))
470 continue;
471 if (check_pud_range(vma, pgd, addr, next, nodes,
472 flags, private))
473 return -EIO;
474 } while (pgd++, addr = next, addr != end);
475 return 0;
476}
477
478
479
480
481
482
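/*
 * Walk the VMAs covering [start, end) and verify that their pages are on
 * the allowed nodes, optionally collecting misplaced pages on the private
 * pagelist for migration.  Returns the first VMA of the range, or an
 * ERR_PTR() (for instance -EFAULT for holes when MPOL_MF_DISCONTIG_OK is
 * not set).
 */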
483static struct vm_area_struct *
484check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
485 const nodemask_t *nodes, unsigned long flags, void *private)
486{
487 int err;
488 struct vm_area_struct *first, *vma, *prev;
489
490 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
491
492 err = migrate_prep();
493 if (err)
494 return ERR_PTR(err);
495 }
496
497 first = find_vma(mm, start);
498 if (!first)
499 return ERR_PTR(-EFAULT);
500 prev = NULL;
501 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
502 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
503 if (!vma->vm_next && vma->vm_end < end)
504 return ERR_PTR(-EFAULT);
505 if (prev && prev->vm_end < vma->vm_start)
506 return ERR_PTR(-EFAULT);
507 }
508 if (!is_vm_hugetlb_page(vma) &&
509 ((flags & MPOL_MF_STRICT) ||
510 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
511 vma_migratable(vma)))) {
512 unsigned long endvma = vma->vm_end;
513
514 if (endvma > end)
515 endvma = end;
516 if (vma->vm_start > start)
517 start = vma->vm_start;
518 err = check_pgd_range(vma, start, endvma, nodes,
519 flags, private);
520 if (err) {
521 first = ERR_PTR(err);
522 break;
523 }
524 }
525 prev = vma;
526 }
527 return first;
528}
529
530
531static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
532{
533 int err = 0;
534 struct mempolicy *old = vma->vm_policy;
535
536 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
537 vma->vm_start, vma->vm_end, vma->vm_pgoff,
538 vma->vm_ops, vma->vm_file,
539 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
540
541 if (vma->vm_ops && vma->vm_ops->set_policy)
542 err = vma->vm_ops->set_policy(vma, new);
543 if (!err) {
544 mpol_get(new);
545 vma->vm_policy = new;
546 mpol_put(old);
547 }
548 return err;
549}
550
551
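/*
 * Apply @new to every VMA intersecting [start, end), splitting VMAs at the
 * range boundaries where needed.
 */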
552static int mbind_range(struct vm_area_struct *vma, unsigned long start,
553 unsigned long end, struct mempolicy *new)
554{
555 struct vm_area_struct *next;
556 int err;
557
558 err = 0;
559 for (; vma && vma->vm_start < end; vma = next) {
560 next = vma->vm_next;
561 if (vma->vm_start < start)
562 err = split_vma(vma->vm_mm, vma, start, 1);
563 if (!err && vma->vm_end > end)
564 err = split_vma(vma->vm_mm, vma, end, 0);
565 if (!err)
566 err = policy_vma(vma, new);
567 if (err)
568 break;
569 }
570 return err;
571}
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
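/*
 * Keep the PF_MEMPOLICY task flag in sync with task->mempolicy, so other
 * code (notably the slab allocator) can test cheaply whether a per-task
 * policy is installed.
 */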
590void mpol_fix_fork_child_flag(struct task_struct *p)
591{
592 if (p->mempolicy)
593 p->flags |= PF_MEMPOLICY;
594 else
595 p->flags &= ~PF_MEMPOLICY;
596}
597
598static void mpol_set_task_struct_flag(void)
599{
600 mpol_fix_fork_child_flag(current);
601}
602
603
604static long do_set_mempolicy(unsigned short mode, unsigned short flags,
605 nodemask_t *nodes)
606{
607 struct mempolicy *new;
608 struct mm_struct *mm = current->mm;
609
610 new = mpol_new(mode, flags, nodes);
611 if (IS_ERR(new))
612 return PTR_ERR(new);
613
614
615
616
617
618
619
620 if (mm)
621 down_write(&mm->mmap_sem);
622 mpol_put(current->mempolicy);
623 current->mempolicy = new;
624 mpol_set_task_struct_flag();
625 if (new && new->mode == MPOL_INTERLEAVE &&
626 nodes_weight(new->v.nodes))
627 current->il_next = first_node(new->v.nodes);
628 if (mm)
629 up_write(&mm->mmap_sem);
630
631 return 0;
632}
633
634
635
636
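/* Fill @nodes with the nodemask described by policy @p, for get_mempolicy(). */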
637static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
638{
639 nodes_clear(*nodes);
640 if (p == &default_policy)
641 return;
642
643 switch (p->mode) {
644 case MPOL_BIND:
645
646 case MPOL_INTERLEAVE:
647 *nodes = p->v.nodes;
648 break;
649 case MPOL_PREFERRED:
650 if (!(p->flags & MPOL_F_LOCAL))
651 node_set(p->v.preferred_node, *nodes);
652
653 break;
654 default:
655 BUG();
656 }
657}
658
659static int lookup_node(struct mm_struct *mm, unsigned long addr)
660{
661 struct page *p;
662 int err;
663
664 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
665 if (err >= 0) {
666 err = page_to_nid(p);
667 put_page(p);
668 }
669 return err;
670}
671
672
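/*
 * Retrieve the NUMA policy of the current task, or of the VMA at @addr when
 * MPOL_F_ADDR is set; MPOL_F_NODE and MPOL_F_MEMS_ALLOWED modify what is
 * reported in *policy and *nmask.
 */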
673static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 unsigned long addr, unsigned long flags)
675{
676 int err;
677 struct mm_struct *mm = current->mm;
678 struct vm_area_struct *vma = NULL;
679 struct mempolicy *pol = current->mempolicy;
680
681 cpuset_update_task_memory_state();
682 if (flags &
683 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
684 return -EINVAL;
685
686 if (flags & MPOL_F_MEMS_ALLOWED) {
687 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
688 return -EINVAL;
689 *policy = 0;
690 *nmask = cpuset_current_mems_allowed;
691 return 0;
692 }
693
694 if (flags & MPOL_F_ADDR) {
695
696
697
698
699
700 down_read(&mm->mmap_sem);
701 vma = find_vma_intersection(mm, addr, addr+1);
702 if (!vma) {
703 up_read(&mm->mmap_sem);
704 return -EFAULT;
705 }
706 if (vma->vm_ops && vma->vm_ops->get_policy)
707 pol = vma->vm_ops->get_policy(vma, addr);
708 else
709 pol = vma->vm_policy;
710 } else if (addr)
711 return -EINVAL;
712
713 if (!pol)
714 pol = &default_policy;
715
716 if (flags & MPOL_F_NODE) {
717 if (flags & MPOL_F_ADDR) {
718 err = lookup_node(mm, addr);
719 if (err < 0)
720 goto out;
721 *policy = err;
722 } else if (pol == current->mempolicy &&
723 pol->mode == MPOL_INTERLEAVE) {
724 *policy = current->il_next;
725 } else {
726 err = -EINVAL;
727 goto out;
728 }
729 } else {
730 *policy = pol == &default_policy ? MPOL_DEFAULT :
731 pol->mode;
732
733
734
735
736 *policy |= (pol->flags & MPOL_MODE_FLAGS);
737 }
738
	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}
743
744 err = 0;
745 if (nmask)
746 get_policy_nodemask(pol, nmask);
747
748 out:
749 mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
752 return err;
753}
754
755#ifdef CONFIG_MIGRATION
756
757
758
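/*
 * Queue a page for migration: only pages mapped exactly once are isolated,
 * unless MPOL_MF_MOVE_ALL was requested.
 */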
759static void migrate_page_add(struct page *page, struct list_head *pagelist,
760 unsigned long flags)
761{
762
763
764
765 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
766 isolate_lru_page(page, pagelist);
767}
768
769static struct page *new_node_page(struct page *page, unsigned long node, int **x)
770{
771 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
772}
773
774
775
776
777
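/*
 * Migrate every page of this mm that currently sits on node @source to node
 * @dest by scanning the whole address space for matching pages.
 */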
778static int migrate_to_node(struct mm_struct *mm, int source, int dest,
779 int flags)
780{
781 nodemask_t nmask;
782 LIST_HEAD(pagelist);
783 int err = 0;
784
785 nodes_clear(nmask);
786 node_set(source, nmask);
787
788 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
789 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
790
791 if (!list_empty(&pagelist))
792 err = migrate_pages(&pagelist, new_node_page, dest);
793
794 return err;
795}
796
797
798
799
800
801
802
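/*
 * Move pages of @mm from @from_nodes to @to_nodes, trying to preserve the
 * relative placement of pages within the node sets.  Returns the number of
 * pages that could not be moved, or a negative error.
 */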
803int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{
806 int busy = 0;
807 int err = 0;
808 nodemask_t tmp;
809
810 down_read(&mm->mmap_sem);
811
812 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
813 if (err)
814 goto out;
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847 tmp = *from_nodes;
848 while (!nodes_empty(tmp)) {
849 int s,d;
850 int source = -1;
851 int dest = 0;
852
853 for_each_node_mask(s, tmp) {
854 d = node_remap(s, *from_nodes, *to_nodes);
855 if (s == d)
856 continue;
857
858 source = s;
859 dest = d;
860
861
862 if (!node_isset(dest, tmp))
863 break;
864 }
865 if (source == -1)
866 break;
867
868 node_clear(source, tmp);
869 err = migrate_to_node(mm, source, dest, flags);
870 if (err > 0)
871 busy += err;
872 if (err < 0)
873 break;
874 }
875out:
876 up_read(&mm->mmap_sem);
877 if (err < 0)
878 return err;
879 return busy;
880
881}
882
883
884
885
886
887
888
889
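/*
 * Allocate a destination page for migration, placed according to the policy
 * of the VMA that maps the original page.
 */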
890static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
891{
892 struct vm_area_struct *vma = (struct vm_area_struct *)private;
893 unsigned long uninitialized_var(address);
894
895 while (vma) {
896 address = page_address_in_vma(page, vma);
897 if (address != -EFAULT)
898 break;
899 vma = vma->vm_next;
900 }
901
902
903
904
905 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
906}
907#else
908
909static void migrate_page_add(struct page *page, struct list_head *pagelist,
910 unsigned long flags)
911{
912}
913
914int do_migrate_pages(struct mm_struct *mm,
915 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
916{
917 return -ENOSYS;
918}
919
920static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
921{
922 return NULL;
923}
924#endif
925
926static long do_mbind(unsigned long start, unsigned long len,
927 unsigned short mode, unsigned short mode_flags,
928 nodemask_t *nmask, unsigned long flags)
929{
930 struct vm_area_struct *vma;
931 struct mm_struct *mm = current->mm;
932 struct mempolicy *new;
933 unsigned long end;
934 int err;
935 LIST_HEAD(pagelist);
936
937 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
938 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
939 return -EINVAL;
940 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
941 return -EPERM;
942
943 if (start & ~PAGE_MASK)
944 return -EINVAL;
945
946 if (mode == MPOL_DEFAULT)
947 flags &= ~MPOL_MF_STRICT;
948
949 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
950 end = start + len;
951
952 if (end < start)
953 return -EINVAL;
954 if (end == start)
955 return 0;
956
957 new = mpol_new(mode, mode_flags, nmask);
958 if (IS_ERR(new))
959 return PTR_ERR(new);
960
961
962
963
964
965 if (!new)
966 flags |= MPOL_MF_DISCONTIG_OK;
967
968 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
969 start, start + len, mode, mode_flags,
970 nmask ? nodes_addr(*nmask)[0] : -1);
971
972 down_write(&mm->mmap_sem);
973 vma = check_range(mm, start, end, nmask,
974 flags | MPOL_MF_INVERT, &pagelist);
975
976 err = PTR_ERR(vma);
977 if (!IS_ERR(vma)) {
978 int nr_failed = 0;
979
980 err = mbind_range(vma, start, end, new);
981
982 if (!list_empty(&pagelist))
983 nr_failed = migrate_pages(&pagelist, new_vma_page,
984 (unsigned long)vma);
985
986 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
987 err = -EIO;
988 } else
989 putback_lru_pages(&pagelist);
990
991 up_write(&mm->mmap_sem);
992 mpol_put(new);
993 return err;
994}
995
996
997
998
999
1000
1001static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1002 unsigned long maxnode)
1003{
1004 unsigned long k;
1005 unsigned long nlongs;
1006 unsigned long endmask;
1007
1008 --maxnode;
1009 nodes_clear(*nodes);
1010 if (maxnode == 0 || !nmask)
1011 return 0;
1012 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1013 return -EINVAL;
1014
1015 nlongs = BITS_TO_LONGS(maxnode);
1016 if ((maxnode % BITS_PER_LONG) == 0)
1017 endmask = ~0UL;
1018 else
1019 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1020
1021
1022
1023 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1024 if (nlongs > PAGE_SIZE/sizeof(long))
1025 return -EINVAL;
1026 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1027 unsigned long t;
1028 if (get_user(t, nmask + k))
1029 return -EFAULT;
1030 if (k == nlongs - 1) {
1031 if (t & endmask)
1032 return -EINVAL;
1033 } else if (t)
1034 return -EINVAL;
1035 }
1036 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1037 endmask = ~0UL;
1038 }
1039
1040 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1041 return -EFAULT;
1042 nodes_addr(*nodes)[nlongs-1] &= endmask;
1043 return 0;
1044}
1045
1046
1047static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1048 nodemask_t *nodes)
1049{
1050 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1051 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1052
1053 if (copy > nbytes) {
1054 if (copy > PAGE_SIZE)
1055 return -EINVAL;
1056 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1057 return -EFAULT;
1058 copy = nbytes;
1059 }
1060 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1061}
1062
1063SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1064 unsigned long, mode, unsigned long __user *, nmask,
1065 unsigned long, maxnode, unsigned, flags)
1066{
1067 nodemask_t nodes;
1068 int err;
1069 unsigned short mode_flags;
1070
1071 mode_flags = mode & MPOL_MODE_FLAGS;
1072 mode &= ~MPOL_MODE_FLAGS;
1073 if (mode >= MPOL_MAX)
1074 return -EINVAL;
1075 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1076 (mode_flags & MPOL_F_RELATIVE_NODES))
1077 return -EINVAL;
1078 err = get_nodes(&nodes, nmask, maxnode);
1079 if (err)
1080 return err;
1081 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1082}
1083
1084
1085SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1086 unsigned long, maxnode)
1087{
1088 int err;
1089 nodemask_t nodes;
1090 unsigned short flags;
1091
1092 flags = mode & MPOL_MODE_FLAGS;
1093 mode &= ~MPOL_MODE_FLAGS;
1094 if ((unsigned int)mode >= MPOL_MAX)
1095 return -EINVAL;
1096 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1097 return -EINVAL;
1098 err = get_nodes(&nodes, nmask, maxnode);
1099 if (err)
1100 return err;
1101 return do_set_mempolicy(mode, flags, &nodes);
1102}
1103
1104SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1105 const unsigned long __user *, old_nodes,
1106 const unsigned long __user *, new_nodes)
1107{
1108 struct mm_struct *mm;
1109 struct task_struct *task;
1110 nodemask_t old;
1111 nodemask_t new;
1112 nodemask_t task_nodes;
1113 int err;
1114
1115 err = get_nodes(&old, old_nodes, maxnode);
1116 if (err)
1117 return err;
1118
1119 err = get_nodes(&new, new_nodes, maxnode);
1120 if (err)
1121 return err;
1122
1123
1124 read_lock(&tasklist_lock);
1125 task = pid ? find_task_by_vpid(pid) : current;
1126 if (!task) {
1127 read_unlock(&tasklist_lock);
1128 return -ESRCH;
1129 }
1130 mm = get_task_mm(task);
1131 read_unlock(&tasklist_lock);
1132
1133 if (!mm)
1134 return -EINVAL;
1135
1136
1137
1138
1139
1140
1141
1142 if ((current->euid != task->suid) && (current->euid != task->uid) &&
1143 (current->uid != task->suid) && (current->uid != task->uid) &&
1144 !capable(CAP_SYS_NICE)) {
1145 err = -EPERM;
1146 goto out;
1147 }
1148
1149 task_nodes = cpuset_mems_allowed(task);
1150
1151 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1152 err = -EPERM;
1153 goto out;
1154 }
1155
1156 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1157 err = -EINVAL;
1158 goto out;
1159 }
1160
1161 err = security_task_movememory(task);
1162 if (err)
1163 goto out;
1164
1165 err = do_migrate_pages(mm, &old, &new,
1166 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1167out:
1168 mmput(mm);
1169 return err;
1170}
1171
1172
1173
1174SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1175 unsigned long __user *, nmask, unsigned long, maxnode,
1176 unsigned long, addr, unsigned long, flags)
1177{
1178 int err;
1179 int uninitialized_var(pval);
1180 nodemask_t nodes;
1181
1182 if (nmask != NULL && maxnode < MAX_NUMNODES)
1183 return -EINVAL;
1184
1185 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1186
1187 if (err)
1188 return err;
1189
1190 if (policy && put_user(pval, policy))
1191 return -EFAULT;
1192
1193 if (nmask)
1194 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1195
1196 return err;
1197}
1198
1199#ifdef CONFIG_COMPAT
1200
1201asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1202 compat_ulong_t __user *nmask,
1203 compat_ulong_t maxnode,
1204 compat_ulong_t addr, compat_ulong_t flags)
1205{
1206 long err;
1207 unsigned long __user *nm = NULL;
1208 unsigned long nr_bits, alloc_size;
1209 DECLARE_BITMAP(bm, MAX_NUMNODES);
1210
1211 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1212 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1213
1214 if (nmask)
1215 nm = compat_alloc_user_space(alloc_size);
1216
1217 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1218
1219 if (!err && nmask) {
1220 err = copy_from_user(bm, nm, alloc_size);
1221
1222 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1223 err |= compat_put_bitmap(nmask, bm, nr_bits);
1224 }
1225
1226 return err;
1227}
1228
1229asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1230 compat_ulong_t maxnode)
1231{
1232 long err = 0;
1233 unsigned long __user *nm = NULL;
1234 unsigned long nr_bits, alloc_size;
1235 DECLARE_BITMAP(bm, MAX_NUMNODES);
1236
1237 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1238 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1239
1240 if (nmask) {
1241 err = compat_get_bitmap(bm, nmask, nr_bits);
1242 nm = compat_alloc_user_space(alloc_size);
1243 err |= copy_to_user(nm, bm, alloc_size);
1244 }
1245
1246 if (err)
1247 return -EFAULT;
1248
1249 return sys_set_mempolicy(mode, nm, nr_bits+1);
1250}
1251
1252asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1253 compat_ulong_t mode, compat_ulong_t __user *nmask,
1254 compat_ulong_t maxnode, compat_ulong_t flags)
1255{
1256 long err = 0;
1257 unsigned long __user *nm = NULL;
1258 unsigned long nr_bits, alloc_size;
1259 nodemask_t bm;
1260
1261 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1262 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1263
1264 if (nmask) {
1265 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1266 nm = compat_alloc_user_space(alloc_size);
1267 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1268 }
1269
1270 if (err)
1271 return -EFAULT;
1272
1273 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1274}
1275
1276#endif
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
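/*
 * Return the effective policy for @addr in @vma, falling back to the task
 * policy and finally to default_policy.  A policy obtained through
 * vm_ops->get_policy() (shared mappings) carries an extra reference that
 * the caller drops with mpol_cond_put().
 */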
1294static struct mempolicy *get_vma_policy(struct task_struct *task,
1295 struct vm_area_struct *vma, unsigned long addr)
1296{
1297 struct mempolicy *pol = task->mempolicy;
1298
1299 if (vma) {
1300 if (vma->vm_ops && vma->vm_ops->get_policy) {
1301 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1302 addr);
1303 if (vpol)
1304 pol = vpol;
1305 } else if (vma->vm_policy)
1306 pol = vma->vm_policy;
1307 }
1308 if (!pol)
1309 pol = &default_policy;
1310 return pol;
1311}
1312
1313
1314
1315
1316
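/*
 * Return the nodemask a page allocation must be filtered with, or NULL when
 * no filtering is needed.  Only MPOL_BIND needs one, and only when
 * allocating from a policied zone with a mask that intersects the cpuset's
 * allowed memory.
 */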
1317static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1318{
1319
1320 if (unlikely(policy->mode == MPOL_BIND) &&
1321 gfp_zone(gfp) >= policy_zone &&
1322 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1323 return &policy->v.nodes;
1324
1325 return NULL;
1326}
1327
1328
1329static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1330{
1331 int nd = numa_node_id();
1332
1333 switch (policy->mode) {
1334 case MPOL_PREFERRED:
1335 if (!(policy->flags & MPOL_F_LOCAL))
1336 nd = policy->v.preferred_node;
1337 break;
1338 case MPOL_BIND:
1339
1340
1341
1342
1343
1344
1345 if (unlikely(gfp & __GFP_THISNODE) &&
1346 unlikely(!node_isset(nd, policy->v.nodes)))
1347 nd = first_node(policy->v.nodes);
1348 break;
1349 case MPOL_INTERLEAVE:
1350 break;
1351 default:
1352 BUG();
1353 }
1354 return node_zonelist(nd, gfp);
1355}
1356
1357
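/* Dynamic interleaving for a process: return the current node and advance il_next. */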
1358static unsigned interleave_nodes(struct mempolicy *policy)
1359{
1360 unsigned nid, next;
1361 struct task_struct *me = current;
1362
1363 nid = me->il_next;
1364 next = next_node(nid, policy->v.nodes);
1365 if (next >= MAX_NUMNODES)
1366 next = first_node(policy->v.nodes);
1367 if (next < MAX_NUMNODES)
1368 me->il_next = next;
1369 return nid;
1370}
1371
1372
1373
1374
1375
1376
1377
1378
1379
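/*
 * Pick the node the slab allocator should allocate from under the given
 * policy: local node, preferred node, next interleave node, or the nearest
 * node allowed by a bind mask.
 */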
1380unsigned slab_node(struct mempolicy *policy)
1381{
1382 if (!policy || policy->flags & MPOL_F_LOCAL)
1383 return numa_node_id();
1384
1385 switch (policy->mode) {
1386 case MPOL_PREFERRED:
1387
1388
1389
1390 return policy->v.preferred_node;
1391
1392 case MPOL_INTERLEAVE:
1393 return interleave_nodes(policy);
1394
1395 case MPOL_BIND: {
1396
1397
1398
1399
1400 struct zonelist *zonelist;
1401 struct zone *zone;
1402 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1403 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1404 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1405 &policy->v.nodes,
1406 &zone);
1407 return zone ? zone->node : numa_node_id();
1408 }
1409
1410 default:
1411 BUG();
1412 }
1413}
1414
1415
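/*
 * Static interleaving for a VMA: map the page offset (modulo the number of
 * interleave nodes) onto the node set, so a given offset always lands on
 * the same node.
 */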
1416static unsigned offset_il_node(struct mempolicy *pol,
1417 struct vm_area_struct *vma, unsigned long off)
1418{
1419 unsigned nnodes = nodes_weight(pol->v.nodes);
1420 unsigned target;
1421 int c;
1422 int nid = -1;
1423
1424 if (!nnodes)
1425 return numa_node_id();
1426 target = (unsigned int)off % nnodes;
1427 c = 0;
1428 do {
1429 nid = next_node(nid, pol->v.nodes);
1430 c++;
1431 } while (c <= target);
1432 return nid;
1433}
1434
1435
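/*
 * Determine the interleave node for @addr in @vma; without a VMA, fall back
 * to process-level dynamic interleaving.  @shift is the page shift of the
 * mapping (PAGE_SHIFT, or the huge page shift for hugetlb).
 */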
1436static inline unsigned interleave_nid(struct mempolicy *pol,
1437 struct vm_area_struct *vma, unsigned long addr, int shift)
1438{
1439 if (vma) {
1440 unsigned long off;
1441
1442
1443
1444
1445
1446
1447
1448
1449 BUG_ON(shift < PAGE_SHIFT);
1450 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1451 off += (addr - vma->vm_start) >> shift;
1452 return offset_il_node(pol, vma, off);
1453 } else
1454 return interleave_nodes(pol);
1455}
1456
1457#ifdef CONFIG_HUGETLBFS
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
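/*
 * huge_zonelist() - pick the zonelist for a hugetlb allocation in @vma at
 * @addr according to the effective NUMA policy.  *mpol is set to that
 * policy and, for MPOL_BIND, *nodemask is set to the mask the allocation
 * must be filtered with.
 */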
1471struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1472 gfp_t gfp_flags, struct mempolicy **mpol,
1473 nodemask_t **nodemask)
1474{
1475 struct zonelist *zl;
1476
1477 *mpol = get_vma_policy(current, vma, addr);
1478 *nodemask = NULL;
1479
1480 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1481 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1482 huge_page_shift(hstate_vma(vma))), gfp_flags);
1483 } else {
1484 zl = policy_zonelist(gfp_flags, *mpol);
1485 if ((*mpol)->mode == MPOL_BIND)
1486 *nodemask = &(*mpol)->v.nodes;
1487 }
1488 return zl;
1489}
1490#endif
1491
1492
1493
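/*
 * Allocate a page on a specific interleave node and bump the
 * NUMA_INTERLEAVE_HIT counter when the allocation was satisfied on that
 * node.
 */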
1494static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1495 unsigned nid)
1496{
1497 struct zonelist *zl;
1498 struct page *page;
1499
1500 zl = node_zonelist(nid, gfp);
1501 page = __alloc_pages(gfp, order, zl);
1502 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1503 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1504 return page;
1505}
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
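/*
 * alloc_page_vma() - allocate a page for a user mapping at @addr in @vma,
 * placed according to the effective policy: interleave derives the node
 * from the page offset, other modes use the policy's zonelist and nodemask.
 */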
1529struct page *
1530alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1531{
1532 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1533 struct zonelist *zl;
1534
1535 cpuset_update_task_memory_state();
1536
1537 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1538 unsigned nid;
1539
1540 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1541 mpol_cond_put(pol);
1542 return alloc_page_interleave(gfp, 0, nid);
1543 }
1544 zl = policy_zonelist(gfp, pol);
1545 if (unlikely(mpol_needs_cond_ref(pol))) {
1546
1547
1548
1549 struct page *page = __alloc_pages_nodemask(gfp, 0,
1550 zl, policy_nodemask(gfp, pol));
1551 __mpol_put(pol);
1552 return page;
1553 }
1554
1555
1556
1557 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1558}
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
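/*
 * alloc_pages_current() - allocate pages using the current task's policy.
 * Falls back to the default policy in interrupt context and for
 * __GFP_THISNODE allocations.
 */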
1579struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1580{
1581 struct mempolicy *pol = current->mempolicy;
1582
1583 if ((gfp & __GFP_WAIT) && !in_interrupt())
1584 cpuset_update_task_memory_state();
1585 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1586 pol = &default_policy;
1587
1588
1589
1590
1591
1592 if (pol->mode == MPOL_INTERLEAVE)
1593 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1594 return __alloc_pages_nodemask(gfp, order,
1595 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1596}
1597EXPORT_SYMBOL(alloc_pages_current);
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
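/*
 * Duplicate a mempolicy.  If the current cpuset is being rebound, the source
 * policy is rebound first so the copy reflects the new node set.  The copy
 * starts with its own reference count of 1.
 */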
1608struct mempolicy *__mpol_dup(struct mempolicy *old)
1609{
1610 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1611
1612 if (!new)
1613 return ERR_PTR(-ENOMEM);
1614 if (current_cpuset_is_being_rebound()) {
1615 nodemask_t mems = cpuset_mems_allowed(current);
1616 mpol_rebind_policy(old, &mems);
1617 }
1618 *new = *old;
1619 atomic_set(&new->refcnt, 1);
1620 return new;
1621}
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1634 struct mempolicy *frompol)
1635{
1636 if (!mpol_needs_cond_ref(frompol))
1637 return frompol;
1638
1639 *tompol = *frompol;
1640 tompol->flags &= ~MPOL_F_SHARED;
1641 __mpol_put(frompol);
1642 return tompol;
1643}
1644
1645static int mpol_match_intent(const struct mempolicy *a,
1646 const struct mempolicy *b)
1647{
1648 if (a->flags != b->flags)
1649 return 0;
1650 if (!mpol_store_user_nodemask(a))
1651 return 1;
1652 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1653}
1654
1655
1656int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1657{
1658 if (!a || !b)
1659 return 0;
1660 if (a->mode != b->mode)
1661 return 0;
1662 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1663 return 0;
1664 switch (a->mode) {
1665 case MPOL_BIND:
1666
1667 case MPOL_INTERLEAVE:
1668 return nodes_equal(a->v.nodes, b->v.nodes);
1669 case MPOL_PREFERRED:
1670 return a->v.preferred_node == b->v.preferred_node &&
1671 a->flags == b->flags;
1672 default:
1673 BUG();
1674 return 0;
1675 }
1676}
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
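/*
 * Shared policy support: policies attached to a shared object (shmem) are
 * kept in a red-black tree keyed by page offset and protected by sp->lock.
 * sp_lookup() returns the first node intersecting [start, end), or NULL.
 */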
1689static struct sp_node *
1690sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1691{
1692 struct rb_node *n = sp->root.rb_node;
1693
1694 while (n) {
1695 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1696
1697 if (start >= p->end)
1698 n = n->rb_right;
1699 else if (end <= p->start)
1700 n = n->rb_left;
1701 else
1702 break;
1703 }
1704 if (!n)
1705 return NULL;
1706 for (;;) {
1707 struct sp_node *w = NULL;
1708 struct rb_node *prev = rb_prev(n);
1709 if (!prev)
1710 break;
1711 w = rb_entry(prev, struct sp_node, nd);
1712 if (w->end <= start)
1713 break;
1714 n = prev;
1715 }
1716 return rb_entry(n, struct sp_node, nd);
1717}
1718
1719
1720
1721static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1722{
1723 struct rb_node **p = &sp->root.rb_node;
1724 struct rb_node *parent = NULL;
1725 struct sp_node *nd;
1726
1727 while (*p) {
1728 parent = *p;
1729 nd = rb_entry(parent, struct sp_node, nd);
1730 if (new->start < nd->start)
1731 p = &(*p)->rb_left;
1732 else if (new->end > nd->end)
1733 p = &(*p)->rb_right;
1734 else
1735 BUG();
1736 }
1737 rb_link_node(&new->nd, parent, p);
1738 rb_insert_color(&new->nd, &sp->root);
1739 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1740 new->policy ? new->policy->mode : 0);
1741}
1742
1743
1744struct mempolicy *
1745mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1746{
1747 struct mempolicy *pol = NULL;
1748 struct sp_node *sn;
1749
1750 if (!sp->root.rb_node)
1751 return NULL;
1752 spin_lock(&sp->lock);
1753 sn = sp_lookup(sp, idx, idx+1);
1754 if (sn) {
1755 mpol_get(sn->policy);
1756 pol = sn->policy;
1757 }
1758 spin_unlock(&sp->lock);
1759 return pol;
1760}
1761
1762static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1763{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1765 rb_erase(&n->nd, &sp->root);
1766 mpol_put(n->policy);
1767 kmem_cache_free(sn_cache, n);
1768}
1769
1770static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1771 struct mempolicy *pol)
1772{
1773 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1774
1775 if (!n)
1776 return NULL;
1777 n->start = start;
1778 n->end = end;
1779 mpol_get(pol);
1780 pol->flags |= MPOL_F_SHARED;
1781 n->policy = pol;
1782 return n;
1783}
1784
1785
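/*
 * Replace the policies covering [start, end) with @new, splitting an
 * existing node when it straddles the range boundary.
 */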
1786static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1787 unsigned long end, struct sp_node *new)
1788{
1789 struct sp_node *n, *new2 = NULL;
1790
1791restart:
1792 spin_lock(&sp->lock);
1793 n = sp_lookup(sp, start, end);
1794
1795 while (n && n->start < end) {
1796 struct rb_node *next = rb_next(&n->nd);
1797 if (n->start >= start) {
1798 if (n->end <= end)
1799 sp_delete(sp, n);
1800 else
1801 n->start = end;
1802 } else {
1803
1804 if (n->end > end) {
1805 if (!new2) {
1806 spin_unlock(&sp->lock);
1807 new2 = sp_alloc(end, n->end, n->policy);
1808 if (!new2)
1809 return -ENOMEM;
1810 goto restart;
1811 }
1812 n->end = start;
1813 sp_insert(sp, new2);
1814 new2 = NULL;
1815 break;
1816 } else
1817 n->end = start;
1818 }
1819 if (!next)
1820 break;
1821 n = rb_entry(next, struct sp_node, nd);
1822 }
1823 if (new)
1824 sp_insert(sp, new);
1825 spin_unlock(&sp->lock);
1826 if (new2) {
1827 mpol_put(new2->policy);
1828 kmem_cache_free(sn_cache, new2);
1829 }
1830 return 0;
1831}
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
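/*
 * Initialize the shared policy tree of an object and, when a policy is
 * passed in, install it for the whole range; the reference on @mpol is
 * consumed here.
 */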
1842void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1843{
1844 sp->root = RB_ROOT;
1845 spin_lock_init(&sp->lock);
1846
1847 if (mpol) {
1848 struct vm_area_struct pvma;
1849 struct mempolicy *new;
1850
1851
1852 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1853 mpol_put(mpol);
1854 if (IS_ERR(new))
1855 return;
1856
1857
1858 memset(&pvma, 0, sizeof(struct vm_area_struct));
1859 pvma.vm_end = TASK_SIZE;
1860 mpol_set_shared_policy(sp, &pvma, new);
1861 mpol_put(new);
1862 }
1863}
1864
1865int mpol_set_shared_policy(struct shared_policy *info,
1866 struct vm_area_struct *vma, struct mempolicy *npol)
1867{
1868 int err;
1869 struct sp_node *new = NULL;
1870 unsigned long sz = vma_pages(vma);
1871
1872 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1873 vma->vm_pgoff,
1874 sz, npol ? npol->mode : -1,
1875 npol ? npol->flags : -1,
1876 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1877
1878 if (npol) {
1879 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1880 if (!new)
1881 return -ENOMEM;
1882 }
1883 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1884 if (err && new)
1885 kmem_cache_free(sn_cache, new);
1886 return err;
1887}
1888
1889
1890void mpol_free_shared_policy(struct shared_policy *p)
1891{
1892 struct sp_node *n;
1893 struct rb_node *next;
1894
1895 if (!p->root.rb_node)
1896 return;
1897 spin_lock(&p->lock);
1898 next = rb_first(&p->root);
1899 while (next) {
1900 n = rb_entry(next, struct sp_node, nd);
1901 next = rb_next(&n->nd);
1902 rb_erase(&n->nd, &p->root);
1903 mpol_put(n->policy);
1904 kmem_cache_free(sn_cache, n);
1905 }
1906 spin_unlock(&p->lock);
1907}
1908
1909
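/*
 * Boot-time setup: create the policy caches and set the boot process policy
 * to interleave across all nodes with at least 16MB of memory, preferring
 * the largest node if none qualifies.
 */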
1910void __init numa_policy_init(void)
1911{
1912 nodemask_t interleave_nodes;
1913 unsigned long largest = 0;
1914 int nid, prefer = 0;
1915
1916 policy_cache = kmem_cache_create("numa_policy",
1917 sizeof(struct mempolicy),
1918 0, SLAB_PANIC, NULL);
1919
1920 sn_cache = kmem_cache_create("shared_policy_node",
1921 sizeof(struct sp_node),
1922 0, SLAB_PANIC, NULL);
1923
1924
1925
1926
1927
1928
1929 nodes_clear(interleave_nodes);
1930 for_each_node_state(nid, N_HIGH_MEMORY) {
1931 unsigned long total_pages = node_present_pages(nid);
1932
1933
1934 if (largest < total_pages) {
1935 largest = total_pages;
1936 prefer = nid;
1937 }
1938
1939
1940 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1941 node_set(nid, interleave_nodes);
1942 }
1943
1944
1945 if (unlikely(nodes_empty(interleave_nodes)))
1946 node_set(prefer, interleave_nodes);
1947
1948 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1949 printk("numa_policy_init: interleaving failed\n");
1950}
1951
1952
1953void numa_default_policy(void)
1954{
1955 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1956}
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1967static const char * const policy_types[] =
1968 { "default", "prefer", "bind", "interleave", "local" };
1969
1970
1971#ifdef CONFIG_TMPFS
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
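/*
 * Parse a policy string of the form "<mode>[=<flags>][:<nodelist>]" (as used
 * by tmpfs mount options) into a mempolicy.  Returns 0 and stores the new
 * policy in *mpol on success, non-zero on a parse error.  With @no_context
 * the raw user nodemask is stored instead of a cpuset-contextualized one.
 */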
1990int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1991{
1992 struct mempolicy *new = NULL;
1993 unsigned short uninitialized_var(mode);
1994 unsigned short uninitialized_var(mode_flags);
1995 nodemask_t nodes;
1996 char *nodelist = strchr(str, ':');
1997 char *flags = strchr(str, '=');
1998 int i;
1999 int err = 1;
2000
2001 if (nodelist) {
2002
2003 *nodelist++ = '\0';
2004 if (nodelist_parse(nodelist, nodes))
2005 goto out;
2006 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2007 goto out;
2008 } else
2009 nodes_clear(nodes);
2010
2011 if (flags)
2012 *flags++ = '\0';
2013
2014 for (i = 0; i <= MPOL_LOCAL; i++) {
2015 if (!strcmp(str, policy_types[i])) {
2016 mode = i;
2017 break;
2018 }
2019 }
2020 if (i > MPOL_LOCAL)
2021 goto out;
2022
2023 switch (mode) {
2024 case MPOL_PREFERRED:
2025
2026
2027
2028 if (nodelist) {
2029 char *rest = nodelist;
2030 while (isdigit(*rest))
2031 rest++;
2032 if (*rest)
2033 goto out;
2034 }
2035 break;
2036 case MPOL_INTERLEAVE:
2037
2038
2039
2040 if (!nodelist)
2041 nodes = node_states[N_HIGH_MEMORY];
2042 break;
2043 case MPOL_LOCAL:
2044
2045
2046
2047 if (nodelist)
2048 goto out;
2049 mode = MPOL_PREFERRED;
2050 break;
2051 case MPOL_DEFAULT:
2052
2053
2054
2055 if (!nodelist)
2056 err = 0;
2057 goto out;
2058 case MPOL_BIND:
2059
2060
2061
2062 if (!nodelist)
2063 goto out;
2064 }
2065
2066 mode_flags = 0;
2067 if (flags) {
2068
2069
2070
2071
2072 if (!strcmp(flags, "static"))
2073 mode_flags |= MPOL_F_STATIC_NODES;
2074 else if (!strcmp(flags, "relative"))
2075 mode_flags |= MPOL_F_RELATIVE_NODES;
2076 else
2077 goto out;
2078 }
2079
2080 new = mpol_new(mode, mode_flags, &nodes);
2081 if (IS_ERR(new))
2082 goto out;
2083 err = 0;
2084 if (no_context) {
2085
2086 new->w.user_nodemask = nodes;
2087 }
2088
2089out:
2090
2091 if (nodelist)
2092 *--nodelist = ':';
2093 if (flags)
2094 *--flags = '=';
2095 if (!err)
2096 *mpol = new;
2097 return err;
2098}
2099#endif
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
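/*
 * Format @pol as "<mode>[=<flags>][:<nodelist>]" into @buffer.  Returns the
 * number of characters written, or -ENOSPC when the buffer is too small.
 */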
2112int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2113{
2114 char *p = buffer;
2115 int l;
2116 nodemask_t nodes;
2117 unsigned short mode;
2118 unsigned short flags = pol ? pol->flags : 0;
2119
2120
2121
2122
2123 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2124
2125 if (!pol || pol == &default_policy)
2126 mode = MPOL_DEFAULT;
2127 else
2128 mode = pol->mode;
2129
2130 switch (mode) {
2131 case MPOL_DEFAULT:
2132 nodes_clear(nodes);
2133 break;
2134
2135 case MPOL_PREFERRED:
2136 nodes_clear(nodes);
2137 if (flags & MPOL_F_LOCAL)
2138 mode = MPOL_LOCAL;
2139 else
2140 node_set(pol->v.preferred_node, nodes);
2141 break;
2142
2143 case MPOL_BIND:
2144
2145 case MPOL_INTERLEAVE:
2146 if (no_context)
2147 nodes = pol->w.user_nodemask;
2148 else
2149 nodes = pol->v.nodes;
2150 break;
2151
2152 default:
2153 BUG();
2154 }
2155
2156 l = strlen(policy_types[mode]);
2157 if (buffer + maxlen < p + l + 1)
2158 return -ENOSPC;
2159
2160 strcpy(p, policy_types[mode]);
2161 p += l;
2162
2163 if (flags & MPOL_MODE_FLAGS) {
2164 if (buffer + maxlen < p + 2)
2165 return -ENOSPC;
2166 *p++ = '=';
2167
2168
2169
2170
2171 if (flags & MPOL_F_STATIC_NODES)
2172 p += snprintf(p, buffer + maxlen - p, "static");
2173 else if (flags & MPOL_F_RELATIVE_NODES)
2174 p += snprintf(p, buffer + maxlen - p, "relative");
2175 }
2176
2177 if (!nodes_empty(nodes)) {
2178 if (buffer + maxlen < p + 2)
2179 return -ENOSPC;
2180 *p++ = ':';
2181 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2182 }
2183 return p - buffer;
2184}
2185
2186struct numa_maps {
2187 unsigned long pages;
2188 unsigned long anon;
2189 unsigned long active;
2190 unsigned long writeback;
2191 unsigned long mapcount_max;
2192 unsigned long dirty;
2193 unsigned long swapcache;
2194 unsigned long node[MAX_NUMNODES];
2195};
2196
2197static void gather_stats(struct page *page, void *private, int pte_dirty)
2198{
2199 struct numa_maps *md = private;
2200 int count = page_mapcount(page);
2201
2202 md->pages++;
2203 if (pte_dirty || PageDirty(page))
2204 md->dirty++;
2205
2206 if (PageSwapCache(page))
2207 md->swapcache++;
2208
2209 if (PageActive(page))
2210 md->active++;
2211
2212 if (PageWriteback(page))
2213 md->writeback++;
2214
2215 if (PageAnon(page))
2216 md->anon++;
2217
2218 if (count > md->mapcount_max)
2219 md->mapcount_max = count;
2220
2221 md->node[page_to_nid(page)]++;
2222}
2223
2224#ifdef CONFIG_HUGETLB_PAGE
2225static void check_huge_range(struct vm_area_struct *vma,
2226 unsigned long start, unsigned long end,
2227 struct numa_maps *md)
2228{
2229 unsigned long addr;
2230 struct page *page;
2231 struct hstate *h = hstate_vma(vma);
2232 unsigned long sz = huge_page_size(h);
2233
2234 for (addr = start; addr < end; addr += sz) {
2235 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2236 addr & huge_page_mask(h));
2237 pte_t pte;
2238
2239 if (!ptep)
2240 continue;
2241
2242 pte = *ptep;
2243 if (pte_none(pte))
2244 continue;
2245
2246 page = pte_page(pte);
2247 if (!page)
2248 continue;
2249
2250 gather_stats(page, md, pte_dirty(*ptep));
2251 }
2252}
2253#else
2254static inline void check_huge_range(struct vm_area_struct *vma,
2255 unsigned long start, unsigned long end,
2256 struct numa_maps *md)
2257{
2258}
2259#endif
2260
2261
2262
2263
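/*
 * Emit one line of /proc/<pid>/numa_maps: the VMA's policy followed by
 * per-node page counts and other page-state statistics.
 */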
2264int show_numa_map(struct seq_file *m, void *v)
2265{
2266 struct proc_maps_private *priv = m->private;
2267 struct vm_area_struct *vma = v;
2268 struct numa_maps *md;
2269 struct file *file = vma->vm_file;
2270 struct mm_struct *mm = vma->vm_mm;
2271 struct mempolicy *pol;
2272 int n;
2273 char buffer[50];
2274
2275 if (!mm)
2276 return 0;
2277
2278 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2279 if (!md)
2280 return 0;
2281
2282 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2283 mpol_to_str(buffer, sizeof(buffer), pol, 0);
2284 mpol_cond_put(pol);
2285
2286 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2287
2288 if (file) {
2289 seq_printf(m, " file=");
2290 seq_path(m, &file->f_path, "\n\t= ");
2291 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2292 seq_printf(m, " heap");
2293 } else if (vma->vm_start <= mm->start_stack &&
2294 vma->vm_end >= mm->start_stack) {
2295 seq_printf(m, " stack");
2296 }
2297
2298 if (is_vm_hugetlb_page(vma)) {
2299 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2300 seq_printf(m, " huge");
2301 } else {
2302 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2303 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2304 }
2305
2306 if (!md->pages)
2307 goto out;
2308
2309 if (md->anon)
2310 seq_printf(m," anon=%lu",md->anon);
2311
2312 if (md->dirty)
2313 seq_printf(m," dirty=%lu",md->dirty);
2314
2315 if (md->pages != md->anon && md->pages != md->dirty)
2316 seq_printf(m, " mapped=%lu", md->pages);
2317
2318 if (md->mapcount_max > 1)
2319 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2320
2321 if (md->swapcache)
2322 seq_printf(m," swapcache=%lu", md->swapcache);
2323
2324 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2325 seq_printf(m," active=%lu", md->active);
2326
2327 if (md->writeback)
2328 seq_printf(m," writeback=%lu", md->writeback);
2329
2330 for_each_node_state(n, N_HIGH_MEMORY)
2331 if (md->node[n])
2332 seq_printf(m, " N%d=%lu", n, md->node[n]);
2333out:
2334 seq_putc(m, '\n');
2335 kfree(md);
2336
2337 if (m->count < m->size)
2338 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2339 return 0;
2340}
2341