/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.  Four policies are supported, per VMA and per process; the
 * VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.  For VMA based allocations
 *		the node is chosen from the offset into the backing object;
 *		for the process policy a per-task counter is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *
 * preferred	Try a specific node first before normal fallback.
 *
 * default	Fall back to the surrounding policy: the process policy for
 *		a VMA, and local allocation for a process.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context.  VMA policies are stored in the VMA (or, for
 * shared mappings, in a red-black tree attached to the backing object) and
 * only apply to memory mapped into that range.
 */
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
80#include <linux/nsproxy.h>
81#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
84#include <linux/swap.h>
85#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
87#include <linux/migrate.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92
93#include <asm/tlbflush.h>
94#include <asm/uaccess.h>
95
96#include "internal.h"
97
98
99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)
100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)
101#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)
102
103static struct kmem_cache *policy_cache;
104static struct kmem_cache *sn_cache;
/*
 * Highest zone.  A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1),	/* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};
118
119static const struct mempolicy_operations {
120 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
122} mpol_ops[MPOL_MAX];
123
/* Check that the nodemask contains at least one populated zone */
125static int is_valid_nodemask(const nodemask_t *nodemask)
126{
127 int nd, k;
128
129
130 k = policy_zone;
131
132 for_each_node_mask(nd, *nodemask) {
133 struct zone *z;
134
135 for (k = 0; k <= policy_zone; k++) {
136 z = &NODE_DATA(nd)->node_zones[k];
137 if (z->present_pages > 0)
138 return 1;
139 }
140 }
141
142 return 0;
143}
144
145static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
146{
147 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
148}
149
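/*
 * Build the effective nodemask for an MPOL_F_RELATIVE_NODES policy: the
 * user's node numbers are taken as positions within @rel.  Worked example
 * (illustrative only): with @rel = {4,5,6} and @orig = {0,2}, relative
 * nodes 0 and 2 select the first and third allowed nodes, so *@ret
 * becomes {4,6}.
 */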
150static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
151 const nodemask_t *rel)
152{
153 nodemask_t tmp;
154 nodes_fold(tmp, *orig, nodes_weight(*rel));
155 nodes_onto(*ret, tmp, *rel);
156}
157
158static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
159{
160 if (nodes_empty(*nodes))
161 return -EINVAL;
162 pol->v.nodes = *nodes;
163 return 0;
164}
165
166static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
167{
168 if (!nodes)
169 pol->flags |= MPOL_F_LOCAL;
170 else if (nodes_empty(*nodes))
171 return -EINVAL;
172 else
173 pol->v.preferred_node = first_node(*nodes);
174 return 0;
175}
176
177static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
178{
179 if (!is_valid_nodemask(nodes))
180 return -EINVAL;
181 pol->v.nodes = *nodes;
182 return 0;
183}
184
/*
 * This function just creates a new policy, does some check and simple
 * initialization.  You must invoke mpol_put() to release the reference
 * held by the new policy.
 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes)
188{
189 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
195
196 if (mode == MPOL_DEFAULT) {
197 if (nodes && !nodes_empty(*nodes))
198 return ERR_PTR(-EINVAL);
199 return NULL;
200 }
201 VM_BUG_ON(!nodes);
202
	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
208 if (mode == MPOL_PREFERRED) {
209 if (nodes_empty(*nodes)) {
210 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL);
213 nodes = NULL;
214 }
215 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL);
217 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
218 if (!policy)
219 return ERR_PTR(-ENOMEM);
220 atomic_set(&policy->refcnt, 1);
221 policy->mode = mode;
222 policy->flags = flags;
223
224 if (nodes) {
		/*
		 * cpuset-relative setup only applies when a nodemask was
		 * supplied, i.e. not for local allocation.
		 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy;
249}
250
/* Slow path of a mpol destructor. */
252void __mpol_put(struct mempolicy *p)
253{
254 if (!atomic_dec_and_test(&p->refcnt))
255 return;
256 kmem_cache_free(policy_cache, p);
257}
258
259static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
260{
261}
262
263static void mpol_rebind_nodemask(struct mempolicy *pol,
264 const nodemask_t *nodes)
265{
266 nodemask_t tmp;
267
268 if (pol->flags & MPOL_F_STATIC_NODES)
269 nodes_and(tmp, pol->w.user_nodemask, *nodes);
270 else if (pol->flags & MPOL_F_RELATIVE_NODES)
271 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
272 else {
273 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
274 *nodes);
275 pol->w.cpuset_mems_allowed = *nodes;
276 }
277
278 pol->v.nodes = tmp;
279 if (!node_isset(current->il_next, tmp)) {
280 current->il_next = next_node(current->il_next, tmp);
281 if (current->il_next >= MAX_NUMNODES)
282 current->il_next = first_node(tmp);
283 if (current->il_next >= MAX_NUMNODES)
284 current->il_next = numa_node_id();
285 }
286}
287
288static void mpol_rebind_preferred(struct mempolicy *pol,
289 const nodemask_t *nodes)
290{
291 nodemask_t tmp;
292
293 if (pol->flags & MPOL_F_STATIC_NODES) {
294 int node = first_node(pol->w.user_nodemask);
295
296 if (node_isset(node, *nodes)) {
297 pol->v.preferred_node = node;
298 pol->flags &= ~MPOL_F_LOCAL;
299 } else
300 pol->flags |= MPOL_F_LOCAL;
301 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
302 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
303 pol->v.preferred_node = first_node(tmp);
304 } else if (!(pol->flags & MPOL_F_LOCAL)) {
305 pol->v.preferred_node = node_remap(pol->v.preferred_node,
306 pol->w.cpuset_mems_allowed,
307 *nodes);
308 pol->w.cpuset_mems_allowed = *nodes;
309 }
310}
311
/* Migrate a policy to a different set of nodes */
313static void mpol_rebind_policy(struct mempolicy *pol,
314 const nodemask_t *newmask)
315{
316 if (!pol)
317 return;
318 if (!mpol_store_user_nodemask(pol) &&
319 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
320 return;
321 mpol_ops[pol->mode].rebind(pol, newmask);
322}
323
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
330{
331 mpol_rebind_policy(tsk->mempolicy, new);
332}
333
/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */
340void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
341{
342 struct vm_area_struct *vma;
343
344 down_write(&mm->mmap_sem);
345 for (vma = mm->mmap; vma; vma = vma->vm_next)
346 mpol_rebind_policy(vma->vm_policy, new);
347 up_write(&mm->mmap_sem);
348}
349
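/*
 * Per-mode create/rebind methods, indexed by policy mode.  MPOL_DEFAULT
 * has no create method because mpol_new() returns NULL (no policy object)
 * for it.
 */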
350static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
351 [MPOL_DEFAULT] = {
352 .rebind = mpol_rebind_default,
353 },
354 [MPOL_INTERLEAVE] = {
355 .create = mpol_new_interleave,
356 .rebind = mpol_rebind_nodemask,
357 },
358 [MPOL_PREFERRED] = {
359 .create = mpol_new_preferred,
360 .rebind = mpol_rebind_preferred,
361 },
362 [MPOL_BIND] = {
363 .create = mpol_new_bind,
364 .rebind = mpol_rebind_nodemask,
365 },
366};
367
368static void gather_stats(struct page *, void *, int pte_dirty);
369static void migrate_page_add(struct page *page, struct list_head *pagelist,
370 unsigned long flags);
/* Scan through pages checking if pages follow certain conditions. */
373static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
374 unsigned long addr, unsigned long end,
375 const nodemask_t *nodes, unsigned long flags,
376 void *private)
377{
378 pte_t *orig_pte;
379 pte_t *pte;
380 spinlock_t *ptl;
381
382 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
383 do {
384 struct page *page;
385 int nid;
386
387 if (!pte_present(*pte))
388 continue;
389 page = vm_normal_page(vma, addr, *pte);
390 if (!page)
391 continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.  If they were not skipped,
		 * the location of the zero page could influence MPOL_MF_STRICT,
		 * zero pages would be counted in the per-node stats, and
		 * there would be useless pages queued for migration.
		 */
403 if (PageReserved(page))
404 continue;
405 nid = page_to_nid(page);
406 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
407 continue;
408
409 if (flags & MPOL_MF_STATS)
410 gather_stats(page, private, pte_dirty(*pte));
411 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
412 migrate_page_add(page, private, flags);
413 else
414 break;
415 } while (pte++, addr += PAGE_SIZE, addr != end);
416 pte_unmap_unlock(orig_pte, ptl);
417 return addr != end;
418}
419
420static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
421 unsigned long addr, unsigned long end,
422 const nodemask_t *nodes, unsigned long flags,
423 void *private)
424{
425 pmd_t *pmd;
426 unsigned long next;
427
428 pmd = pmd_offset(pud, addr);
429 do {
430 next = pmd_addr_end(addr, end);
431 if (pmd_none_or_clear_bad(pmd))
432 continue;
433 if (check_pte_range(vma, pmd, addr, next, nodes,
434 flags, private))
435 return -EIO;
436 } while (pmd++, addr = next, addr != end);
437 return 0;
438}
439
440static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
441 unsigned long addr, unsigned long end,
442 const nodemask_t *nodes, unsigned long flags,
443 void *private)
444{
445 pud_t *pud;
446 unsigned long next;
447
448 pud = pud_offset(pgd, addr);
449 do {
450 next = pud_addr_end(addr, end);
451 if (pud_none_or_clear_bad(pud))
452 continue;
453 if (check_pmd_range(vma, pud, addr, next, nodes,
454 flags, private))
455 return -EIO;
456 } while (pud++, addr = next, addr != end);
457 return 0;
458}
459
460static inline int check_pgd_range(struct vm_area_struct *vma,
461 unsigned long addr, unsigned long end,
462 const nodemask_t *nodes, unsigned long flags,
463 void *private)
464{
465 pgd_t *pgd;
466 unsigned long next;
467
468 pgd = pgd_offset(vma->vm_mm, addr);
469 do {
470 next = pgd_addr_end(addr, end);
471 if (pgd_none_or_clear_bad(pgd))
472 continue;
473 if (check_pud_range(vma, pgd, addr, next, nodes,
474 flags, private))
475 return -EIO;
476 } while (pgd++, addr = next, addr != end);
477 return 0;
478}
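/*
 * The check_{pte,pmd,pud,pgd}_range() helpers above walk a VMA's page
 * tables top-down, skipping holes, and report -EIO as soon as a mapped
 * page sits on a node outside the allowed set, unless the caller only
 * asked to gather statistics or to queue pages for migration.
 */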
479
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
485static struct vm_area_struct *
486check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
487 const nodemask_t *nodes, unsigned long flags, void *private)
488{
489 int err;
490 struct vm_area_struct *first, *vma, *prev;
491
492
493 first = find_vma(mm, start);
494 if (!first)
495 return ERR_PTR(-EFAULT);
496 prev = NULL;
497 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
498 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
499 if (!vma->vm_next && vma->vm_end < end)
500 return ERR_PTR(-EFAULT);
501 if (prev && prev->vm_end < vma->vm_start)
502 return ERR_PTR(-EFAULT);
503 }
504 if (!is_vm_hugetlb_page(vma) &&
505 ((flags & MPOL_MF_STRICT) ||
506 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
507 vma_migratable(vma)))) {
508 unsigned long endvma = vma->vm_end;
509
510 if (endvma > end)
511 endvma = end;
512 if (vma->vm_start > start)
513 start = vma->vm_start;
514 err = check_pgd_range(vma, start, endvma, nodes,
515 flags, private);
516 if (err) {
517 first = ERR_PTR(err);
518 break;
519 }
520 }
521 prev = vma;
522 }
523 return first;
524}
525
/* Apply policy to a single VMA */
527static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
528{
529 int err = 0;
530 struct mempolicy *old = vma->vm_policy;
531
532 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
533 vma->vm_start, vma->vm_end, vma->vm_pgoff,
534 vma->vm_ops, vma->vm_file,
535 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
536
537 if (vma->vm_ops && vma->vm_ops->set_policy)
538 err = vma->vm_ops->set_policy(vma, new);
539 if (!err) {
540 mpol_get(new);
541 vma->vm_policy = new;
542 mpol_put(old);
543 }
544 return err;
545}
546
/* Step 2: apply policy to a range and do splits. */
548static int mbind_range(struct vm_area_struct *vma, unsigned long start,
549 unsigned long end, struct mempolicy *new)
550{
551 struct vm_area_struct *next;
552 int err;
553
554 err = 0;
555 for (; vma && vma->vm_start < end; vma = next) {
556 next = vma->vm_next;
557 if (vma->vm_start < start)
558 err = split_vma(vma->vm_mm, vma, start, 1);
559 if (!err && vma->vm_end > end)
560 err = split_vma(vma->vm_mm, vma, end, 0);
561 if (!err)
562 err = policy_vma(vma, new);
563 if (err)
564 break;
565 }
566 return err;
567}
568
/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */
586void mpol_fix_fork_child_flag(struct task_struct *p)
587{
588 if (p->mempolicy)
589 p->flags |= PF_MEMPOLICY;
590 else
591 p->flags &= ~PF_MEMPOLICY;
592}
593
594static void mpol_set_task_struct_flag(void)
595{
596 mpol_fix_fork_child_flag(current);
597}
598
/* Set the process memory policy */
600static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes)
602{
603 struct mempolicy *new;
604 struct mm_struct *mm = current->mm;
605
606 new = mpol_new(mode, flags, nodes);
607 if (IS_ERR(new))
608 return PTR_ERR(new);
	/*
	 * Prevent changing our mempolicy while show_numa_map()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
616 if (mm)
617 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy);
619 current->mempolicy = new;
620 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes);
624 if (mm)
625 up_write(&mm->mmap_sem);
626
627 return 0;
628}
629
/*
 * Return nodemask for policy for get_mempolicy() query
 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{
635 nodes_clear(*nodes);
636 if (p == &default_policy)
637 return;
638
639 switch (p->mode) {
640 case MPOL_BIND:
		/* Fall through */
642 case MPOL_INTERLEAVE:
643 *nodes = p->v.nodes;
644 break;
645 case MPOL_PREFERRED:
646 if (!(p->flags & MPOL_F_LOCAL))
647 node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
649 break;
650 default:
651 BUG();
652 }
653}
654
655static int lookup_node(struct mm_struct *mm, unsigned long addr)
656{
657 struct page *p;
658 int err;
659
660 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
661 if (err >= 0) {
662 err = page_to_nid(p);
663 put_page(p);
664 }
665 return err;
666}
667
/* Retrieve NUMA policy */
669static long do_get_mempolicy(int *policy, nodemask_t *nmask,
670 unsigned long addr, unsigned long flags)
671{
672 int err;
673 struct mm_struct *mm = current->mm;
674 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy;
676
677 cpuset_update_task_memory_state();
678 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL;
681
682 if (flags & MPOL_F_MEMS_ALLOWED) {
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL;
685 *policy = 0;
686 *nmask = cpuset_current_mems_allowed;
687 return 0;
688 }
689
690 if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
696 down_read(&mm->mmap_sem);
697 vma = find_vma_intersection(mm, addr, addr+1);
698 if (!vma) {
699 up_read(&mm->mmap_sem);
700 return -EFAULT;
701 }
702 if (vma->vm_ops && vma->vm_ops->get_policy)
703 pol = vma->vm_ops->get_policy(vma, addr);
704 else
705 pol = vma->vm_policy;
706 } else if (addr)
707 return -EINVAL;
708
709 if (!pol)
710 pol = &default_policy;
711
712 if (flags & MPOL_F_NODE) {
713 if (flags & MPOL_F_ADDR) {
714 err = lookup_node(mm, addr);
715 if (err < 0)
716 goto out;
717 *policy = err;
718 } else if (pol == current->mempolicy &&
719 pol->mode == MPOL_INTERLEAVE) {
720 *policy = current->il_next;
721 } else {
722 err = -EINVAL;
723 goto out;
724 }
725 } else {
726 *policy = pol == &default_policy ? MPOL_DEFAULT :
727 pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
732 *policy |= (pol->flags & MPOL_MODE_FLAGS);
733 }
734
735 if (vma) {
		up_read(&current->mm->mmap_sem);
737 vma = NULL;
738 }
739
740 err = 0;
741 if (nmask)
742 get_policy_nodemask(pol, nmask);
743
744 out:
745 mpol_cond_put(pol);
746 if (vma)
		up_read(&current->mm->mmap_sem);
748 return err;
749}
750
751#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
755static void migrate_page_add(struct page *page, struct list_head *pagelist,
756 unsigned long flags)
757{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
761 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
762 if (!isolate_lru_page(page)) {
763 list_add_tail(&page->lru, pagelist);
764 }
765 }
766}
767
768static struct page *new_node_page(struct page *page, unsigned long node, int **x)
769{
770 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
771}
772
/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
777static int migrate_to_node(struct mm_struct *mm, int source, int dest,
778 int flags)
779{
780 nodemask_t nmask;
781 LIST_HEAD(pagelist);
782 int err = 0;
783
784 nodes_clear(nmask);
785 node_set(source, nmask);
786
787 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
788 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
789
790 if (!list_empty(&pagelist))
791 err = migrate_pages(&pagelist, new_node_page, dest);
792
793 return err;
794}
795
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
802int do_migrate_pages(struct mm_struct *mm,
803 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
804{
805 int busy = 0;
806 int err;
807 nodemask_t tmp;
808
809 err = migrate_prep();
810 if (err)
811 return err;
812
813 down_read(&mm->mmap_sem);
814
815 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
816 if (err)
817 goto out;
818
	/*
	 * Pick <source, dest> node pairs and migrate the pages of one
	 * source node at a time.  node_remap() maps the n-th node of
	 * 'from_nodes' to the n-th node of 'to_nodes'; nodes that map to
	 * themselves need no migration and are skipped.
	 *
	 * Where possible, prefer a destination that is not itself still a
	 * source (i.e. not set in 'tmp').  This minimizes the risk of
	 * overloading a node by moving memory onto it before its own
	 * outgoing pages have been migrated away.
	 *
	 * Repeat until no suitable pair remains.
	 */
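	/*
	 * Illustrative example: from_nodes = {0,1}, to_nodes = {2,3}.
	 * node_remap() maps 0 -> 2 and 1 -> 3; since node 2 is not itself
	 * a source, the first pass moves node 0's pages to node 2 and the
	 * second pass moves node 1's pages to node 3.
	 */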
850 tmp = *from_nodes;
851 while (!nodes_empty(tmp)) {
852 int s,d;
853 int source = -1;
854 int dest = 0;
855
856 for_each_node_mask(s, tmp) {
857 d = node_remap(s, *from_nodes, *to_nodes);
858 if (s == d)
859 continue;
860
861 source = s;
862 dest = d;
863
864
865 if (!node_isset(dest, tmp))
866 break;
867 }
868 if (source == -1)
869 break;
870
871 node_clear(source, tmp);
872 err = migrate_to_node(mm, source, dest, flags);
873 if (err > 0)
874 busy += err;
875 if (err < 0)
876 break;
877 }
878out:
879 up_read(&mm->mmap_sem);
880 if (err < 0)
881 return err;
882 return busy;
883
884}
885
/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
893static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
894{
895 struct vm_area_struct *vma = (struct vm_area_struct *)private;
896 unsigned long uninitialized_var(address);
897
898 while (vma) {
899 address = page_address_in_vma(page, vma);
900 if (address != -EFAULT)
901 break;
902 vma = vma->vm_next;
903 }
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
908 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
909}
910#else
911
912static void migrate_page_add(struct page *page, struct list_head *pagelist,
913 unsigned long flags)
914{
915}
916
917int do_migrate_pages(struct mm_struct *mm,
918 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
919{
920 return -ENOSYS;
921}
922
923static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
924{
925 return NULL;
926}
927#endif
928
929static long do_mbind(unsigned long start, unsigned long len,
930 unsigned short mode, unsigned short mode_flags,
931 nodemask_t *nmask, unsigned long flags)
932{
933 struct vm_area_struct *vma;
934 struct mm_struct *mm = current->mm;
935 struct mempolicy *new;
936 unsigned long end;
937 int err;
938 LIST_HEAD(pagelist);
939
940 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
941 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
942 return -EINVAL;
943 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
944 return -EPERM;
945
946 if (start & ~PAGE_MASK)
947 return -EINVAL;
948
949 if (mode == MPOL_DEFAULT)
950 flags &= ~MPOL_MF_STRICT;
951
952 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
953 end = start + len;
954
955 if (end < start)
956 return -EINVAL;
957 if (end == start)
958 return 0;
959
960 new = mpol_new(mode, mode_flags, nmask);
961 if (IS_ERR(new))
962 return PTR_ERR(new);
963
	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
968 if (!new)
969 flags |= MPOL_MF_DISCONTIG_OK;
970
971 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
972 start, start + len, mode, mode_flags,
973 nmask ? nodes_addr(*nmask)[0] : -1);
974
975 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
976
977 err = migrate_prep();
978 if (err)
979 return err;
980 }
981 down_write(&mm->mmap_sem);
982 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist);
984
985 err = PTR_ERR(vma);
986 if (!IS_ERR(vma)) {
987 int nr_failed = 0;
988
989 err = mbind_range(vma, start, end, new);
990
991 if (!list_empty(&pagelist))
992 nr_failed = migrate_pages(&pagelist, new_vma_page,
993 (unsigned long)vma);
994
995 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
996 err = -EIO;
997 }
998
999 up_write(&mm->mmap_sem);
1000 mpol_put(new);
1001 return err;
1002}
1003
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
1009static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1010 unsigned long maxnode)
1011{
1012 unsigned long k;
1013 unsigned long nlongs;
1014 unsigned long endmask;
1015
1016 --maxnode;
1017 nodes_clear(*nodes);
1018 if (maxnode == 0 || !nmask)
1019 return 0;
1020 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1021 return -EINVAL;
1022
1023 nlongs = BITS_TO_LONGS(maxnode);
1024 if ((maxnode % BITS_PER_LONG) == 0)
1025 endmask = ~0UL;
1026 else
1027 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1028
	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
1031 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1032 if (nlongs > PAGE_SIZE/sizeof(long))
1033 return -EINVAL;
1034 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1035 unsigned long t;
1036 if (get_user(t, nmask + k))
1037 return -EFAULT;
1038 if (k == nlongs - 1) {
1039 if (t & endmask)
1040 return -EINVAL;
1041 } else if (t)
1042 return -EINVAL;
1043 }
1044 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1045 endmask = ~0UL;
1046 }
1047
1048 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1049 return -EFAULT;
1050 nodes_addr(*nodes)[nlongs-1] &= endmask;
1051 return 0;
1052}
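/*
 * Illustrative example for get_nodes(): on a MAX_NUMNODES = 64 kernel a
 * user mask covering 1024 bits is still accepted; only the first long can
 * actually name nodes, and the remaining longs are merely required to be
 * zero so that an over-sized but empty mask is not rejected.
 */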
/* Copy a kernel node mask to user space */
1055static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1056 nodemask_t *nodes)
1057{
1058 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1059 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1060
1061 if (copy > nbytes) {
1062 if (copy > PAGE_SIZE)
1063 return -EINVAL;
1064 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1065 return -EFAULT;
1066 copy = nbytes;
1067 }
1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1069}
1070
1071SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1072 unsigned long, mode, unsigned long __user *, nmask,
1073 unsigned long, maxnode, unsigned, flags)
1074{
1075 nodemask_t nodes;
1076 int err;
1077 unsigned short mode_flags;
1078
1079 mode_flags = mode & MPOL_MODE_FLAGS;
1080 mode &= ~MPOL_MODE_FLAGS;
1081 if (mode >= MPOL_MAX)
1082 return -EINVAL;
1083 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1084 (mode_flags & MPOL_F_RELATIVE_NODES))
1085 return -EINVAL;
1086 err = get_nodes(&nodes, nmask, maxnode);
1087 if (err)
1088 return err;
1089 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1090}
1091
/* Set the process memory policy */
1093SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1094 unsigned long, maxnode)
1095{
1096 int err;
1097 nodemask_t nodes;
1098 unsigned short flags;
1099
1100 flags = mode & MPOL_MODE_FLAGS;
1101 mode &= ~MPOL_MODE_FLAGS;
1102 if ((unsigned int)mode >= MPOL_MAX)
1103 return -EINVAL;
1104 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1105 return -EINVAL;
1106 err = get_nodes(&nodes, nmask, maxnode);
1107 if (err)
1108 return err;
1109 return do_set_mempolicy(mode, flags, &nodes);
1110}
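/*
 * Userspace sketch (illustrative; normally issued via the libnuma wrappers
 * rather than the raw syscall): interleave over nodes 0 and 1 with
 *
 *	unsigned long mask = 0x3;
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 * where the last argument is the number of bits in the mask.
 */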
1111
1112SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1113 const unsigned long __user *, old_nodes,
1114 const unsigned long __user *, new_nodes)
1115{
1116 struct mm_struct *mm;
1117 struct task_struct *task;
1118 nodemask_t old;
1119 nodemask_t new;
1120 nodemask_t task_nodes;
1121 int err;
1122
1123 err = get_nodes(&old, old_nodes, maxnode);
1124 if (err)
1125 return err;
1126
1127 err = get_nodes(&new, new_nodes, maxnode);
1128 if (err)
1129 return err;
1130
	/* Find the mm_struct */
1132 read_lock(&tasklist_lock);
1133 task = pid ? find_task_by_vpid(pid) : current;
1134 if (!task) {
1135 read_unlock(&tasklist_lock);
1136 return -ESRCH;
1137 }
1138 mm = get_task_mm(task);
1139 read_unlock(&tasklist_lock);
1140
1141 if (!mm)
1142 return -EINVAL;
1143
	/*
	 * Check if this process has the right to modify the specified
	 * process.  The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
1150 if ((current->euid != task->suid) && (current->euid != task->uid) &&
1151 (current->uid != task->suid) && (current->uid != task->uid) &&
1152 !capable(CAP_SYS_NICE)) {
1153 err = -EPERM;
1154 goto out;
1155 }
1156
1157 task_nodes = cpuset_mems_allowed(task);
1158
1159 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1160 err = -EPERM;
1161 goto out;
1162 }
1163
1164 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1165 err = -EINVAL;
1166 goto out;
1167 }
1168
1169 err = security_task_movememory(task);
1170 if (err)
1171 goto out;
1172
1173 err = do_migrate_pages(mm, &old, &new,
1174 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1175out:
1176 mmput(mm);
1177 return err;
1178}
1179
/* Retrieve NUMA policy */
1182SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1183 unsigned long __user *, nmask, unsigned long, maxnode,
1184 unsigned long, addr, unsigned long, flags)
1185{
1186 int err;
1187 int uninitialized_var(pval);
1188 nodemask_t nodes;
1189
1190 if (nmask != NULL && maxnode < MAX_NUMNODES)
1191 return -EINVAL;
1192
1193 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1194
1195 if (err)
1196 return err;
1197
1198 if (policy && put_user(pval, policy))
1199 return -EFAULT;
1200
1201 if (nmask)
1202 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1203
1204 return err;
1205}
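/*
 * Userspace sketch (illustrative): query which node backs a given address,
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * which takes the MPOL_F_ADDR path above and returns the page's node id
 * via lookup_node(), faulting the page in if necessary.
 */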
1206
1207#ifdef CONFIG_COMPAT
1208
1209asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1210 compat_ulong_t __user *nmask,
1211 compat_ulong_t maxnode,
1212 compat_ulong_t addr, compat_ulong_t flags)
1213{
1214 long err;
1215 unsigned long __user *nm = NULL;
1216 unsigned long nr_bits, alloc_size;
1217 DECLARE_BITMAP(bm, MAX_NUMNODES);
1218
1219 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1220 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1221
1222 if (nmask)
1223 nm = compat_alloc_user_space(alloc_size);
1224
1225 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1226
1227 if (!err && nmask) {
1228 err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
1230 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1231 err |= compat_put_bitmap(nmask, bm, nr_bits);
1232 }
1233
1234 return err;
1235}
1236
1237asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1238 compat_ulong_t maxnode)
1239{
1240 long err = 0;
1241 unsigned long __user *nm = NULL;
1242 unsigned long nr_bits, alloc_size;
1243 DECLARE_BITMAP(bm, MAX_NUMNODES);
1244
1245 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1246 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1247
1248 if (nmask) {
1249 err = compat_get_bitmap(bm, nmask, nr_bits);
1250 nm = compat_alloc_user_space(alloc_size);
1251 err |= copy_to_user(nm, bm, alloc_size);
1252 }
1253
1254 if (err)
1255 return -EFAULT;
1256
1257 return sys_set_mempolicy(mode, nm, nr_bits+1);
1258}
1259
1260asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1261 compat_ulong_t mode, compat_ulong_t __user *nmask,
1262 compat_ulong_t maxnode, compat_ulong_t flags)
1263{
1264 long err = 0;
1265 unsigned long __user *nm = NULL;
1266 unsigned long nr_bits, alloc_size;
1267 nodemask_t bm;
1268
1269 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1270 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1271
1272 if (nmask) {
1273 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1274 nm = compat_alloc_user_space(alloc_size);
1275 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1276 }
1277
1278 if (err)
1279 return -EFAULT;
1280
1281 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1282}
1283
1284#endif
1285
/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma   - virtual memory area whose policy is sought
 * @addr  - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies
 * are protected by the task's mmap_sem, which must be held for read by
 * the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to drop the
 * extra reference for shared policies.
 */
1302static struct mempolicy *get_vma_policy(struct task_struct *task,
1303 struct vm_area_struct *vma, unsigned long addr)
1304{
1305 struct mempolicy *pol = task->mempolicy;
1306
1307 if (vma) {
1308 if (vma->vm_ops && vma->vm_ops->get_policy) {
1309 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1310 addr);
1311 if (vpol)
1312 pol = vpol;
1313 } else if (vma->vm_policy)
1314 pol = vma->vm_policy;
1315 }
1316 if (!pol)
1317 pol = &default_policy;
1318 return pol;
1319}
1320
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
1325static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1326{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1328 if (unlikely(policy->mode == MPOL_BIND) &&
1329 gfp_zone(gfp) >= policy_zone &&
1330 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1331 return &policy->v.nodes;
1332
1333 return NULL;
1334}
1335
/* Return a zonelist indicated by gfp for node representing a mempolicy */
1337static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1338{
1339 int nd = numa_node_id();
1340
1341 switch (policy->mode) {
1342 case MPOL_PREFERRED:
1343 if (!(policy->flags & MPOL_F_LOCAL))
1344 nd = policy->v.preferred_node;
1345 break;
1346 case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node isn't part of the mask, we use the zonelist for
		 * the first node in the mask instead.
		 */
1353 if (unlikely(gfp & __GFP_THISNODE) &&
1354 unlikely(!node_isset(nd, policy->v.nodes)))
1355 nd = first_node(policy->v.nodes);
1356 break;
1357 case MPOL_INTERLEAVE:
1358 break;
1359 default:
1360 BUG();
1361 }
1362 return node_zonelist(nd, gfp);
1363}
1364
/* Do dynamic interleaving for a process */
1366static unsigned interleave_nodes(struct mempolicy *policy)
1367{
1368 unsigned nid, next;
1369 struct task_struct *me = current;
1370
1371 nid = me->il_next;
1372 next = next_node(nid, policy->v.nodes);
1373 if (next >= MAX_NUMNODES)
1374 next = first_node(policy->v.nodes);
1375 if (next < MAX_NUMNODES)
1376 me->il_next = next;
1377 return nid;
1378}
1379
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
1388unsigned slab_node(struct mempolicy *policy)
1389{
1390 if (!policy || policy->flags & MPOL_F_LOCAL)
1391 return numa_node_id();
1392
1393 switch (policy->mode) {
1394 case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
1398 return policy->v.preferred_node;
1399
1400 case MPOL_INTERLEAVE:
1401 return interleave_nodes(policy);
1402
1403 case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
1408 struct zonelist *zonelist;
1409 struct zone *zone;
1410 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1411 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1412 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1413 &policy->v.nodes,
1414 &zone);
1415 return zone->node;
1416 }
1417
1418 default:
1419 BUG();
1420 }
1421}
1422
/* Do static interleaving for a VMA with known offset. */
1424static unsigned offset_il_node(struct mempolicy *pol,
1425 struct vm_area_struct *vma, unsigned long off)
1426{
1427 unsigned nnodes = nodes_weight(pol->v.nodes);
1428 unsigned target;
1429 int c;
1430 int nid = -1;
1431
1432 if (!nnodes)
1433 return numa_node_id();
1434 target = (unsigned int)off % nnodes;
1435 c = 0;
1436 do {
1437 nid = next_node(nid, pol->v.nodes);
1438 c++;
1439 } while (c <= target);
1440 return nid;
1441}
1442
/* Determine a node number for interleave */
1444static inline unsigned interleave_nid(struct mempolicy *pol,
1445 struct vm_area_struct *vma, unsigned long addr, int shift)
1446{
1447 if (vma) {
1448 unsigned long off;
1449
		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
1457 BUG_ON(shift < PAGE_SHIFT);
1458 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1459 off += (addr - vma->vm_start) >> shift;
1460 return offset_il_node(pol, vma, off);
1461 } else
1462 return interleave_nodes(pol);
1463}
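/*
 * Illustrative example of the interleave offset above: with a four-node
 * interleave policy on a small-page file mapping, page 0 of the file goes
 * to the first node in the mask, page 1 to the second, page 4 back to the
 * first, and so on; huge page mappings interleave per huge page instead.
 */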
1464
1465#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, also returns a pointer to the
 * mempolicy's @nodemask for filtering the zonelist.
 */
1479struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1480 gfp_t gfp_flags, struct mempolicy **mpol,
1481 nodemask_t **nodemask)
1482{
1483 struct zonelist *zl;
1484
1485 *mpol = get_vma_policy(current, vma, addr);
1486 *nodemask = NULL;
1487
1488 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1489 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1490 huge_page_shift(hstate_vma(vma))), gfp_flags);
1491 } else {
1492 zl = policy_zonelist(gfp_flags, *mpol);
1493 if ((*mpol)->mode == MPOL_BIND)
1494 *nodemask = &(*mpol)->v.nodes;
1495 }
1496 return zl;
1497}
1498#endif
1499
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
1502static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1503 unsigned nid)
1504{
1505 struct zonelist *zl;
1506 struct page *page;
1507
1508 zl = node_zonelist(nid, gfp);
1509 page = __alloc_pages(gfp, order, zl);
1510 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1511 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1512 return page;
1513}
1514
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation.  Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When the VMA is not NULL the caller must hold the mmap_sem of the
 * mm_struct of the VMA for reading, to prevent it from going away.
 * Should be used for all allocations for pages that will be mapped into
 * user space.  Returns NULL when no page can be allocated.
 */
1537struct page *
1538alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1539{
1540 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1541 struct zonelist *zl;
1542
1543 cpuset_update_task_memory_state();
1544
1545 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1546 unsigned nid;
1547
1548 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1549 mpol_cond_put(pol);
1550 return alloc_page_interleave(gfp, 0, nid);
1551 }
1552 zl = policy_zonelist(gfp, pol);
1553 if (unlikely(mpol_needs_cond_ref(pol))) {
		/*
		 * slow path: ref counted shared policy
		 */
1557 struct page *page = __alloc_pages_nodemask(gfp, 0,
1558 zl, policy_nodemask(gfp, pol));
1559 __mpol_put(pol);
1560 return page;
1561 }
	/*
	 * fast path:  default or task policy
	 */
1565 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1566}
1567
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER   user allocation,
 *	%GFP_KERNEL kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS     don't call back into a file system.
 *	%GFP_ATOMIC don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in
 * interrupt context apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_task_memory_state() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
1587struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1588{
1589 struct mempolicy *pol = current->mempolicy;
1590
1591 if ((gfp & __GFP_WAIT) && !in_interrupt())
1592 cpuset_update_task_memory_state();
1593 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1594 pol = &default_policy;
1595
	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
1600 if (pol->mode == MPOL_INTERLEAVE)
1601 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1602 return __alloc_pages_nodemask(gfp, order,
1603 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1604}
1605EXPORT_SYMBOL(alloc_pages_current);
1606
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */

/* Slow path of a mempolicy duplicate */
1616struct mempolicy *__mpol_dup(struct mempolicy *old)
1617{
1618 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1619
1620 if (!new)
1621 return ERR_PTR(-ENOMEM);
1622 if (current_cpuset_is_being_rebound()) {
1623 nodemask_t mems = cpuset_mems_allowed(current);
1624 mpol_rebind_policy(old, &mems);
1625 }
1626 *new = *old;
1627 atomic_set(&new->refcnt, 1);
1628 return new;
1629}
1630
/*
 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 * eliminate the MPOL_F_SHARED flag and drop the extra ref.  Not safe
 * to reference *frompol directly after return; use the returned value.
 *
 * Allows use of a mempolicy for, e.g., multiple allocations with a single
 * policy lookup, even if the policy needs/has an extra ref on lookup.
 */
1641struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1642 struct mempolicy *frompol)
1643{
1644 if (!mpol_needs_cond_ref(frompol))
1645 return frompol;
1646
1647 *tompol = *frompol;
1648 tompol->flags &= ~MPOL_F_SHARED;
1649 __mpol_put(frompol);
1650 return tompol;
1651}
1652
1653static int mpol_match_intent(const struct mempolicy *a,
1654 const struct mempolicy *b)
1655{
1656 if (a->flags != b->flags)
1657 return 0;
1658 if (!mpol_store_user_nodemask(a))
1659 return 1;
1660 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1661}
1662
/* Slow path of a mempolicy comparison */
1664int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1665{
1666 if (!a || !b)
1667 return 0;
1668 if (a->mode != b->mode)
1669 return 0;
1670 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1671 return 0;
1672 switch (a->mode) {
1673 case MPOL_BIND:
		/* Fall through */
1675 case MPOL_INTERLEAVE:
1676 return nodes_equal(a->v.nodes, b->v.nodes);
1677 case MPOL_PREFERRED:
1678 return a->v.preferred_node == b->v.preferred_node &&
1679 a->flags == b->flags;
1680 default:
1681 BUG();
1682 return 0;
1683 }
1684}
1685
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
1697static struct sp_node *
1698sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1699{
1700 struct rb_node *n = sp->root.rb_node;
1701
1702 while (n) {
1703 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1704
1705 if (start >= p->end)
1706 n = n->rb_right;
1707 else if (end <= p->start)
1708 n = n->rb_left;
1709 else
1710 break;
1711 }
1712 if (!n)
1713 return NULL;
1714 for (;;) {
1715 struct sp_node *w = NULL;
1716 struct rb_node *prev = rb_prev(n);
1717 if (!prev)
1718 break;
1719 w = rb_entry(prev, struct sp_node, nd);
1720 if (w->end <= start)
1721 break;
1722 n = prev;
1723 }
1724 return rb_entry(n, struct sp_node, nd);
1725}
1726
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
1729static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1730{
1731 struct rb_node **p = &sp->root.rb_node;
1732 struct rb_node *parent = NULL;
1733 struct sp_node *nd;
1734
1735 while (*p) {
1736 parent = *p;
1737 nd = rb_entry(parent, struct sp_node, nd);
1738 if (new->start < nd->start)
1739 p = &(*p)->rb_left;
1740 else if (new->end > nd->end)
1741 p = &(*p)->rb_right;
1742 else
1743 BUG();
1744 }
1745 rb_link_node(&new->nd, parent, p);
1746 rb_insert_color(&new->nd, &sp->root);
1747 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1748 new->policy ? new->policy->mode : 0);
1749}
1750
/* Find shared policy intersecting idx */
1752struct mempolicy *
1753mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1754{
1755 struct mempolicy *pol = NULL;
1756 struct sp_node *sn;
1757
1758 if (!sp->root.rb_node)
1759 return NULL;
1760 spin_lock(&sp->lock);
1761 sn = sp_lookup(sp, idx, idx+1);
1762 if (sn) {
1763 mpol_get(sn->policy);
1764 pol = sn->policy;
1765 }
1766 spin_unlock(&sp->lock);
1767 return pol;
1768}
1769
1770static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1771{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1773 rb_erase(&n->nd, &sp->root);
1774 mpol_put(n->policy);
1775 kmem_cache_free(sn_cache, n);
1776}
1777
1778static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1779 struct mempolicy *pol)
1780{
1781 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1782
1783 if (!n)
1784 return NULL;
1785 n->start = start;
1786 n->end = end;
1787 mpol_get(pol);
1788 pol->flags |= MPOL_F_SHARED;
1789 n->policy = pol;
1790 return n;
1791}
1792
/* Replace a policy range. */
1794static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1795 unsigned long end, struct sp_node *new)
1796{
1797 struct sp_node *n, *new2 = NULL;
1798
1799restart:
1800 spin_lock(&sp->lock);
1801 n = sp_lookup(sp, start, end);
1802
1803 while (n && n->start < end) {
1804 struct rb_node *next = rb_next(&n->nd);
1805 if (n->start >= start) {
1806 if (n->end <= end)
1807 sp_delete(sp, n);
1808 else
1809 n->start = end;
1810 } else {
			/* Old policy spanning whole new range. */
1812 if (n->end > end) {
1813 if (!new2) {
1814 spin_unlock(&sp->lock);
1815 new2 = sp_alloc(end, n->end, n->policy);
1816 if (!new2)
1817 return -ENOMEM;
1818 goto restart;
1819 }
1820 n->end = start;
1821 sp_insert(sp, new2);
1822 new2 = NULL;
1823 break;
1824 } else
1825 n->end = start;
1826 }
1827 if (!next)
1828 break;
1829 n = rb_entry(next, struct sp_node, nd);
1830 }
1831 if (new)
1832 sp_insert(sp, new);
1833 spin_unlock(&sp->lock);
1834 if (new2) {
1835 mpol_put(new2->policy);
1836 kmem_cache_free(sn_cache, new2);
1837 }
1838 return 0;
1839}
1840
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 */
1850void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1851{
1852 sp->root = RB_ROOT;
1853 spin_lock_init(&sp->lock);
1854
1855 if (mpol) {
1856 struct vm_area_struct pvma;
1857 struct mempolicy *new;
1858
		/* contextualize the tmpfs mount point mempolicy */
1860 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1861 mpol_put(mpol);
1862 if (IS_ERR(new))
1863 return;
1864
		/* Create pseudo-vma that contains just the policy */
1866 memset(&pvma, 0, sizeof(struct vm_area_struct));
1867 pvma.vm_end = TASK_SIZE;
1868 mpol_set_shared_policy(sp, &pvma, new);
1869 mpol_put(new);
1870 }
1871}
1872
1873int mpol_set_shared_policy(struct shared_policy *info,
1874 struct vm_area_struct *vma, struct mempolicy *npol)
1875{
1876 int err;
1877 struct sp_node *new = NULL;
1878 unsigned long sz = vma_pages(vma);
1879
1880 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1881 vma->vm_pgoff,
1882 sz, npol ? npol->mode : -1,
1883 npol ? npol->flags : -1,
1884 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1885
1886 if (npol) {
1887 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1888 if (!new)
1889 return -ENOMEM;
1890 }
1891 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1892 if (err && new)
1893 kmem_cache_free(sn_cache, new);
1894 return err;
1895}
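/*
 * mpol_set_shared_policy() is normally reached through a mapping's
 * vm_ops->set_policy hook (tmpfs/shmem being the typical user), so the
 * policy ends up attached to the backing object rather than to any single
 * mapping of it.
 */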
1896
/* Free a backing policy store on inode delete. */
1898void mpol_free_shared_policy(struct shared_policy *p)
1899{
1900 struct sp_node *n;
1901 struct rb_node *next;
1902
1903 if (!p->root.rb_node)
1904 return;
1905 spin_lock(&p->lock);
1906 next = rb_first(&p->root);
1907 while (next) {
1908 n = rb_entry(next, struct sp_node, nd);
1909 next = rb_next(&n->nd);
1910 rb_erase(&n->nd, &p->root);
1911 mpol_put(n->policy);
1912 kmem_cache_free(sn_cache, n);
1913 }
1914 spin_unlock(&p->lock);
1915}
1916
/* assumes fs == KERNEL_DS */
1918void __init numa_policy_init(void)
1919{
1920 nodemask_t interleave_nodes;
1921 unsigned long largest = 0;
1922 int nid, prefer = 0;
1923
1924 policy_cache = kmem_cache_create("numa_policy",
1925 sizeof(struct mempolicy),
1926 0, SLAB_PANIC, NULL);
1927
1928 sn_cache = kmem_cache_create("shared_policy_node",
1929 sizeof(struct sp_node),
1930 0, SLAB_PANIC, NULL);
1931
	/*
	 * Set interleaving policy for system init.  Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
1937 nodes_clear(interleave_nodes);
1938 for_each_node_state(nid, N_HIGH_MEMORY) {
1939 unsigned long total_pages = node_present_pages(nid);
1940
		/* Preserve the largest node */
1942 if (largest < total_pages) {
1943 largest = total_pages;
1944 prefer = nid;
1945 }
1946
		/* Interleave this node? */
1948 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1949 node_set(nid, interleave_nodes);
1950 }
1951
	/* All too small, use the largest */
1953 if (unlikely(nodes_empty(interleave_nodes)))
1954 node_set(prefer, interleave_nodes);
1955
1956 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1957 printk("numa_policy_init: interleaving failed\n");
1958}
1959
/* Reset policy of current process to default */
1961void numa_default_policy(void)
1962{
1963 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1964}
1965
/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
 * Used only for mpol_parse_str() and mpol_to_str()
 */
1974#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1975static const char * const policy_types[] =
1976 { "default", "prefer", "bind", "interleave", "local" };
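/*
 * Illustrative policy strings in the <mode>[=<flags>][:<nodelist>] format
 * handled below (tmpfs "mpol=" mount option, /proc/<pid>/numa_maps):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"prefer=static:1"	MPOL_PREFERRED | MPOL_F_STATIC_NODES, node 1
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL (local allocation)
 */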
1977
1978
1979#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 * @no_context:  flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * if @no_context is true, save the input nodemask in w.user_nodemask in
 * the returned mempolicy.  This will be used to "clone" the mempolicy in
 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
 * mount option.
 *
 * On success, returns 0, else 1
 */
1998int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1999{
2000 struct mempolicy *new = NULL;
2001 unsigned short uninitialized_var(mode);
2002 unsigned short uninitialized_var(mode_flags);
2003 nodemask_t nodes;
2004 char *nodelist = strchr(str, ':');
2005 char *flags = strchr(str, '=');
2006 int i;
2007 int err = 1;
2008
2009 if (nodelist) {
		/* NUL-terminate mode string */
2011 *nodelist++ = '\0';
2012 if (nodelist_parse(nodelist, nodes))
2013 goto out;
2014 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2015 goto out;
2016 } else
2017 nodes_clear(nodes);
2018
2019 if (flags)
2020 *flags++ = '\0';
2021
2022 for (i = 0; i <= MPOL_LOCAL; i++) {
2023 if (!strcmp(str, policy_types[i])) {
2024 mode = i;
2025 break;
2026 }
2027 }
2028 if (i > MPOL_LOCAL)
2029 goto out;
2030
2031 switch (mode) {
2032 case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
2036 if (nodelist) {
2037 char *rest = nodelist;
2038 while (isdigit(*rest))
2039 rest++;
2040 if (!*rest)
2041 err = 0;
2042 }
2043 break;
2044 case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
2048 if (!nodelist)
2049 nodes = node_states[N_HIGH_MEMORY];
2050 err = 0;
2051 break;
2052 case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
2056 if (nodelist)
2057 goto out;
2058 mode = MPOL_PREFERRED;
2059 break;
	/*
	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
	 */
2065 }
2066
2067 mode_flags = 0;
2068 if (flags) {
		/*
		 * Currently, only two mutually exclusive mode flags are
		 * supported.
		 */
2073 if (!strcmp(flags, "static"))
2074 mode_flags |= MPOL_F_STATIC_NODES;
2075 else if (!strcmp(flags, "relative"))
2076 mode_flags |= MPOL_F_RELATIVE_NODES;
2077 else
2078 err = 1;
2079 }
2080
2081 new = mpol_new(mode, mode_flags, &nodes);
2082 if (IS_ERR(new))
2083 err = 1;
2084 else if (no_context)
2085 new->w.user_nodemask = nodes;
2086
2087out:
	/* Restore string for error message */
2089 if (nodelist)
2090 *--nodelist = ':';
2091 if (flags)
2092 *--flags = '=';
2093 if (!err)
2094 *mpol = new;
2095 return err;
2096}
2097#endif
2098
/*
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 * @no_context:  "context free" format of nodemask
 *
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
2110int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2111{
2112 char *p = buffer;
2113 int l;
2114 nodemask_t nodes;
2115 unsigned short mode;
2116 unsigned short flags = pol ? pol->flags : 0;
2117
	/*
	 * Sanity check:  room for longest mode, flag and some nodes
	 */
2121 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2122
2123 if (!pol || pol == &default_policy)
2124 mode = MPOL_DEFAULT;
2125 else
2126 mode = pol->mode;
2127
2128 switch (mode) {
2129 case MPOL_DEFAULT:
2130 nodes_clear(nodes);
2131 break;
2132
2133 case MPOL_PREFERRED:
2134 nodes_clear(nodes);
2135 if (flags & MPOL_F_LOCAL)
2136 mode = MPOL_LOCAL;
2137 else
2138 node_set(pol->v.preferred_node, nodes);
2139 break;
2140
2141 case MPOL_BIND:
		/* Fall through */
2143 case MPOL_INTERLEAVE:
2144 if (no_context)
2145 nodes = pol->w.user_nodemask;
2146 else
2147 nodes = pol->v.nodes;
2148 break;
2149
2150 default:
2151 BUG();
2152 }
2153
2154 l = strlen(policy_types[mode]);
2155 if (buffer + maxlen < p + l + 1)
2156 return -ENOSPC;
2157
2158 strcpy(p, policy_types[mode]);
2159 p += l;
2160
2161 if (flags & MPOL_MODE_FLAGS) {
2162 if (buffer + maxlen < p + 2)
2163 return -ENOSPC;
2164 *p++ = '=';
		/*
		 * Currently, the only defined mode flags are mutually
		 * exclusive, so at most one is printed.
		 */
2169 if (flags & MPOL_F_STATIC_NODES)
2170 p += snprintf(p, buffer + maxlen - p, "static");
2171 else if (flags & MPOL_F_RELATIVE_NODES)
2172 p += snprintf(p, buffer + maxlen - p, "relative");
2173 }
2174
2175 if (!nodes_empty(nodes)) {
2176 if (buffer + maxlen < p + 2)
2177 return -ENOSPC;
2178 *p++ = ':';
2179 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2180 }
2181 return p - buffer;
2182}
2183
2184struct numa_maps {
2185 unsigned long pages;
2186 unsigned long anon;
2187 unsigned long active;
2188 unsigned long writeback;
2189 unsigned long mapcount_max;
2190 unsigned long dirty;
2191 unsigned long swapcache;
2192 unsigned long node[MAX_NUMNODES];
2193};
2194
2195static void gather_stats(struct page *page, void *private, int pte_dirty)
2196{
2197 struct numa_maps *md = private;
2198 int count = page_mapcount(page);
2199
2200 md->pages++;
2201 if (pte_dirty || PageDirty(page))
2202 md->dirty++;
2203
2204 if (PageSwapCache(page))
2205 md->swapcache++;
2206
2207 if (PageActive(page) || PageUnevictable(page))
2208 md->active++;
2209
2210 if (PageWriteback(page))
2211 md->writeback++;
2212
2213 if (PageAnon(page))
2214 md->anon++;
2215
2216 if (count > md->mapcount_max)
2217 md->mapcount_max = count;
2218
2219 md->node[page_to_nid(page)]++;
2220}
2221
2222#ifdef CONFIG_HUGETLB_PAGE
2223static void check_huge_range(struct vm_area_struct *vma,
2224 unsigned long start, unsigned long end,
2225 struct numa_maps *md)
2226{
2227 unsigned long addr;
2228 struct page *page;
2229 struct hstate *h = hstate_vma(vma);
2230 unsigned long sz = huge_page_size(h);
2231
2232 for (addr = start; addr < end; addr += sz) {
2233 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2234 addr & huge_page_mask(h));
2235 pte_t pte;
2236
2237 if (!ptep)
2238 continue;
2239
2240 pte = *ptep;
2241 if (pte_none(pte))
2242 continue;
2243
2244 page = pte_page(pte);
2245 if (!page)
2246 continue;
2247
2248 gather_stats(page, md, pte_dirty(*ptep));
2249 }
2250}
2251#else
2252static inline void check_huge_range(struct vm_area_struct *vma,
2253 unsigned long start, unsigned long end,
2254 struct numa_maps *md)
2255{
2256}
2257#endif
2258
/*
 * Display pages allocated per node and memory policy via /proc.
 */
2262int show_numa_map(struct seq_file *m, void *v)
2263{
2264 struct proc_maps_private *priv = m->private;
2265 struct vm_area_struct *vma = v;
2266 struct numa_maps *md;
2267 struct file *file = vma->vm_file;
2268 struct mm_struct *mm = vma->vm_mm;
2269 struct mempolicy *pol;
2270 int n;
2271 char buffer[50];
2272
2273 if (!mm)
2274 return 0;
2275
2276 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2277 if (!md)
2278 return 0;
2279
2280 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2281 mpol_to_str(buffer, sizeof(buffer), pol, 0);
2282 mpol_cond_put(pol);
2283
2284 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2285
2286 if (file) {
2287 seq_printf(m, " file=");
2288 seq_path(m, &file->f_path, "\n\t= ");
2289 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2290 seq_printf(m, " heap");
2291 } else if (vma->vm_start <= mm->start_stack &&
2292 vma->vm_end >= mm->start_stack) {
2293 seq_printf(m, " stack");
2294 }
2295
2296 if (is_vm_hugetlb_page(vma)) {
2297 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2298 seq_printf(m, " huge");
2299 } else {
2300 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2301 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2302 }
2303
2304 if (!md->pages)
2305 goto out;
2306
2307 if (md->anon)
2308 seq_printf(m," anon=%lu",md->anon);
2309
2310 if (md->dirty)
2311 seq_printf(m," dirty=%lu",md->dirty);
2312
2313 if (md->pages != md->anon && md->pages != md->dirty)
2314 seq_printf(m, " mapped=%lu", md->pages);
2315
2316 if (md->mapcount_max > 1)
2317 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2318
2319 if (md->swapcache)
2320 seq_printf(m," swapcache=%lu", md->swapcache);
2321
2322 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2323 seq_printf(m," active=%lu", md->active);
2324
2325 if (md->writeback)
2326 seq_printf(m," writeback=%lu", md->writeback);
2327
2328 for_each_node_state(n, N_HIGH_MEMORY)
2329 if (md->node[n])
2330 seq_printf(m, " N%d=%lu", n, md->node[n]);
2331out:
2332 seq_putc(m, '\n');
2333 kfree(md);
2334
2335 if (m->count < m->size)
2336 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2337 return 0;
2338}
2339