1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/module.h>
41#include <linux/mount.h>
42#include <linux/namei.h>
43#include <linux/pagemap.h>
44#include <linux/proc_fs.h>
45#include <linux/rcupdate.h>
46#include <linux/sched.h>
47#include <linux/seq_file.h>
48#include <linux/security.h>
49#include <linux/slab.h>
50#include <linux/spinlock.h>
51#include <linux/stat.h>
52#include <linux/string.h>
53#include <linux/time.h>
54#include <linux/backing-dev.h>
55#include <linux/sort.h>
56
57#include <asm/uaccess.h>
58#include <asm/atomic.h>
59#include <linux/mutex.h>
60#include <linux/workqueue.h>
61#include <linux/cgroup.h>
62
63
64
65
66
67
68
/* Workqueue used to run deferred sched-domain rebuilds (see
 * async_rebuild_sched_domains()). */
static struct workqueue_struct *cpuset_wq;

/*
 * Number of cpusets currently in the system; used to size the csa[]
 * array in generate_sched_domains().
 */
int number_of_cpusets __read_mostly;

/* Forward declarations needed below. */
struct cgroup_subsys cpuset_subsys;
struct cpuset;
81
82
83
/*
 * Frequency meter: tracks how often an event occurs, with exponential
 * decay over time (see fmeter_update()).  All fields protected by @lock.
 */
struct fmeter {
	int cnt;		/* unprocessed events since last update */
	int val;		/* most recent decayed rate */
	time_t time;		/* clock (seconds) of last update */
	spinlock_t lock;	/* guards read or modification of above */
};
90
/* Per-cpuset state; one per cgroup attached to the cpuset subsystem. */
struct cpuset {
	struct cgroup_subsys_state css;	/* embedded cgroup state; cpuset is
					 * recovered via container_of() */

	unsigned long flags;		/* "unsigned long" so bitops work;
					 * bits are cpuset_flagbits_t */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* memory nodes allowed to tasks */

	struct cpuset *parent;		/* my parent cpuset */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking the cpuset hierarchy in generate_sched_domains() */
	struct list_head stack_list;
};
111
112
113static inline struct cpuset *cgroup_cs(struct cgroup *cont)
114{
115 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
116 struct cpuset, css);
117}
118
119
120static inline struct cpuset *task_cs(struct task_struct *task)
121{
122 return container_of(task_subsys_state(task, cpuset_subsys_id),
123 struct cpuset, css);
124}
125
126
/* Bit numbers for cpuset->flags (manipulated with set_bit/test_bit). */
typedef enum {
	CS_CPU_EXCLUSIVE,	/* CPUs not shared with sibling cpusets */
	CS_MEM_EXCLUSIVE,	/* mem nodes not shared with siblings */
	CS_MEM_HARDWALL,	/* strict hardwall for kernel allocations */
	CS_MEMORY_MIGRATE,	/* migrate pages when mems change */
	CS_SCHED_LOAD_BALANCE,	/* load balance within this cpuset's CPUs */
	CS_SPREAD_PAGE,		/* spread page cache across mems */
	CS_SPREAD_SLAB,		/* spread slab allocations across mems */
} cpuset_flagbits_t;
136
137
/* Convenience accessor: is @cs marked cpu-exclusive? */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
142
/* Convenience accessor: is @cs marked mem-exclusive? */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}
147
/* Convenience accessor: is @cs a memory hardwall? */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}
152
/* Convenience accessor: does @cs request scheduler load balancing? */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
157
/* Convenience accessor: should pages migrate when mems_allowed changes? */
static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}
162
/* Convenience accessor: spread page cache allocations over mems? */
static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}
167
/* Convenience accessor: spread slab allocations over mems? */
static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
172
/* Root of the cpuset hierarchy; covers all CPUs and memory nodes. */
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/* Serializes updates to cpuset cpus/mems/flags against readers that
 * take it around short copies (see e.g. update_cpumask()). */
static DEFINE_MUTEX(callback_mutex);
217
218
219
220
221
222
/* Static buffers for emergency diagnostics; shared, so serialized by
 * cpuset_buffer_lock. */
#define CPUSET_NAME_LEN		(128)
#define	CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
228
229
230
231
232
233
234static struct dentry *cpuset_mount(struct file_system_type *fs_type,
235 int flags, const char *unused_dev_name, void *data)
236{
237 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
238 struct dentry *ret = ERR_PTR(-ENODEV);
239 if (cgroup_fs) {
240 char mountopts[] =
241 "cpuset,noprefix,"
242 "release_agent=/sbin/cpuset_release_agent";
243 ret = cgroup_fs->mount(cgroup_fs, flags,
244 unused_dev_name, mountopts);
245 put_filesystem(cgroup_fs);
246 }
247 return ret;
248}
249
/* The legacy "cpuset" filesystem type (mount -t cpuset). */
static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
/*
 * Return in pmask the portion of a cpuset's cpus_allowed that is online.
 * Walks up the hierarchy until a cpuset with online CPUs is found; if
 * none (possible during hotplug), falls back to all online CPUs.
 * Result is guaranteed non-empty as long as one CPU is online.
 */
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/*
 * Return in *pmask the portion of a cpuset's mems_allowed that has
 * memory (N_HIGH_MEMORY).  Walks up the hierarchy as needed; falls back
 * to all nodes with memory if no ancestor qualifies.  Result is
 * guaranteed non-empty as long as one node has memory.
 */
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
306
307
308
309
310
311
312static void cpuset_update_task_spread_flag(struct cpuset *cs,
313 struct task_struct *tsk)
314{
315 if (is_spread_page(cs))
316 tsk->flags |= PF_SPREAD_PAGE;
317 else
318 tsk->flags &= ~PF_SPREAD_PAGE;
319 if (is_spread_slab(cs))
320 tsk->flags |= PF_SPREAD_SLAB;
321 else
322 tsk->flags &= ~PF_SPREAD_SLAB;
323}
324
325
326
327
328
329
330
331
332
333static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
334{
335 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
336 nodes_subset(p->mems_allowed, q->mems_allowed) &&
337 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
338 is_mem_exclusive(p) <= is_mem_exclusive(q);
339}
340
341
342
343
344
345static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
346{
347 struct cpuset *trial;
348
349 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
350 if (!trial)
351 return NULL;
352
353 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
354 kfree(trial);
355 return NULL;
356 }
357 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
358
359 return trial;
360}
361
362
363
364
365
/* free_trial_cpuset - release a cpuset obtained from alloc_trial_cpuset(). */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/*
 * validate_change() - Validate replacing cpuset 'cur' with 'trial'.
 *
 * Checks, in order:
 *  - each child of cur remains a subset of the proposed trial;
 *  - trial remains a subset of its parent (root is exempt);
 *  - cpu/mem exclusivity is not violated against siblings;
 *  - a cpuset with attached tasks is not left with empty cpus or mems.
 *
 * Returns 0 if valid, a negative errno otherwise.
 * Caller holds cgroup_mutex (cgroup_lock()) so the hierarchy is stable.
 */
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either is exclusive, then the proposed trial may not overlap
	 * a sibling's cpus (for cpu_exclusive) or mems (for mem_exclusive).
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpumask_empty(trialcs_cpus_helper_unused ? NULL : trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}
439
440#ifdef CONFIG_SMP
441
442
443
444
445static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
446{
447 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
448}
449
450static void
451update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
452{
453 if (dattr->relax_domain_level < c->relax_domain_level)
454 dattr->relax_domain_level = c->relax_domain_level;
455 return;
456}
457
/*
 * Walk the subtree rooted at @c (iterative BFS via each cpuset's
 * stack_list) and fold every load-balancing descendant's
 * relax_domain_level into @dattr.  Subtrees with empty cpus_allowed
 * are pruned — their settings cannot affect any sched domain.
 */
static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;	/* top of queue */
		struct cgroup *cont;
		struct cpuset *child;	/* child of cp */

		cp = list_first_entry(&q, struct cpuset, stack_list);
		/* q.next == &cp->stack_list here; dequeue it */
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		/* enqueue children for later processing */
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
/*
 * generate_sched_domains()
 *
 * Partition the system's CPUs into non-overlapping sched domains,
 * driven by which cpusets have sched_load_balance enabled.
 *
 * Fast path: if the top cpuset load-balances, one domain covering its
 * CPUs suffices.  Otherwise, collect every load-balancing cpuset whose
 * parent doesn't balance (BFS over the hierarchy), then repeatedly
 * merge any two collected cpusets whose CPUs overlap into the same
 * partition (the pn fields act as union labels).  Finally emit one
 * cpumask (and optional sched_domain_attr) per partition.
 *
 * Returns the number of domains, with *domains and *attributes set to
 * kmalloc'd arrays the caller owns (doms may be NULL on allocation
 * failure, in which case ndoms is forced to 1 so the scheduler falls
 * back to a single default domain).  Must be called with cgroup_mutex
 * held (hierarchy must be stable).
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	LIST_HEAD(q);		/* queue of cpusets to be scanned */
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_copy(doms[0], top_cpuset.cpus_allowed);

		goto done;
	}

	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	/* BFS: collect topmost load-balancing cpusets into csa[] */
	list_add(&top_cpuset.stack_list, &q);
	while (!list_empty(&q)) {
		struct cgroup *cont;
		struct cpuset *child;	/* scans child cpusets of cp */

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		/* empty cpusets contribute nothing to any domain */
		if (cpumask_empty(cp->cpus_allowed))
			continue;

		/*
		 * A load-balancing cpuset is recorded and its subtree
		 * skipped: descendants' CPUs are already covered by it.
		 */
		if (is_sched_load_balance(cp)) {
			csa[csn++] = cp;
			continue;
		}

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
  	}

	/* each cpuset starts in its own partition */
	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				/* merge partition bpn into apn */
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less domain */
				goto restart;	/* indices now stale */
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				printk(KERN_WARNING
				 "rebuild_sched_domains confused:"
				  " nslot %d, ndoms %d, csn %d, i %d,"
				  " apn %d\n",
				  nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->cpus_allowed);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
705
706
707
708
709
710
711
712
713
714
715
/*
 * Rebuild scheduler domains, called from the workqueue so that
 * cgroup_lock can be taken without deadlocking against cpu hotplug
 * (get_online_cpus() is taken first, matching the hotplug lock order).
 */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}
734#else
/* !CONFIG_SMP: there are no sched domains to rebuild. */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}
738
739static int generate_sched_domains(cpumask_var_t **domains,
740 struct sched_domain_attr **attributes)
741{
742 *domains = NULL;
743 return 1;
744}
745#endif
746
/* Work item queued by async_rebuild_sched_domains(). */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
/*
 * Defer a sched-domain rebuild to cpuset_wq.  Callers hold
 * cgroup_lock; doing the rebuild inline would invert the
 * get_online_cpus()/cgroup_lock ordering, so it is punted to the
 * workqueue where do_rebuild_sched_domains() takes the locks in the
 * safe order.
 */
static void async_rebuild_sched_domains(void)
{
	queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
772
773
774
775
776
777
778
779
780
781
/*
 * Synchronous entry point for code paths (e.g. cpu hotplug) that may
 * not sleep waiting on the workqueue; runs the rebuild directly.
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}
786
787
788
789
790
791
792
793
794
795
796
797static int cpuset_test_cpumask(struct task_struct *tsk,
798 struct cgroup_scanner *scan)
799{
800 return !cpumask_equal(&tsk->cpus_allowed,
801 (cgroup_cs(scan->cg))->cpus_allowed);
802}
803
804
805
806
807
808
809
810
811
812
813
814
/*
 * cgroup_scan_tasks() callback: push the scanned cpuset's cpus_allowed
 * into the task's allowed-CPUs mask.
 */
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
{
	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
}
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
835{
836 struct cgroup_scanner scan;
837
838 scan.cg = cs->css.cgroup;
839 scan.test_task = cpuset_test_cpumask;
840 scan.process_task = cpuset_change_cpumask;
841 scan.heap = heap;
842 cgroup_scan_tasks(&scan);
843}
844
845
846
847
848
849
/*
 * update_cpumask - parse @buf into trialcs->cpus_allowed, validate it,
 * then commit it to @cs and rebind every member task's cpumask.
 * Called with cgroup_mutex held; takes callback_mutex around the
 * short commit of the new mask.  Returns 0 or a negative errno.
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	struct ptr_heap heap;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty buffer means an empty mask (detach from all CPUs);
	 * otherwise parse the list and require it to be a subset of the
	 * active CPUs.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
			return -EINVAL;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	/* capture before commit: flags may be read later without locks */
	is_load_balanced = is_sched_load_balance(trialcs);

	mutex_lock(&callback_mutex);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
/*
 * cpuset_migrate_mm - migrate mm's pages from nodes @from to nodes @to.
 *
 * Temporarily widens current's mems_allowed to *to so that the
 * migration's own allocations on the destination nodes are permitted,
 * then restores it from the (possibly since-changed) cpuset via
 * guarantee_online_mems().
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
}
939
940
941
942
943
944
945
946
947
948
/*
 * cpuset_change_task_nodemask - rebind @tsk's mems_allowed and
 * mempolicy to @newmems.
 *
 * Done in two steps (MPOL_REBIND_STEP1 grows the allowed set with
 * nodes_or(), STEP2 shrinks it to exactly *newmems) so that a
 * concurrent allocator in @tsk always sees a mask containing at least
 * one valid node.  The mems_allowed_change_disable counter lets the
 * task temporarily block rebinding; we spin (yielding if it isn't
 * running) until it drops to zero.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
repeat:
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	/* step 1: only grow the mask, so concurrent allocs never see it empty */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	/*
	 * ensure checking ->mems_allowed_change_disable after setting all new
	 * allowed nodes.
	 *
	 * the read-side task can allocate memory with the old allowed nodes
	 * or the new allowed nodes.  Pairs with the barriers on the
	 * allocation side around mems_allowed_change_disable.
	 */
	smp_mb();

	/*
	 * Allocation of memory is very fast, we needn't sleep when waiting
	 * for the read-side to finish.
	 */
	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
		task_unlock(tsk);
		if (!task_curr(tsk))
			yield();
		goto repeat;
	}

	/*
	 * ensure checking ->mems_allowed_change_disable before clearing all
	 * old allowed nodes.
	 */
	smp_mb();

	/* step 2: shrink to exactly the new mask */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;
	task_unlock(tsk);
}
1005
1006
1007
1008
1009
1010
/*
 * cgroup_scan_tasks() callback for update_tasks_nodemask(): rebind one
 * task's mems_allowed/mempolicy and its mm's policy, migrating pages if
 * the cpuset has memory_migrate set.  scan->data carries the old
 * nodemask.  The static newmems is safe because callers hold
 * cgroup_mutex, serializing all scans.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	struct mm_struct *mm;
	struct cpuset *cs;
	int migrate;
	const nodemask_t *oldmem = scan->data;
	static nodemask_t newmems;	/* protected by cgroup_mutex */

	cs = cgroup_cs(scan->cg);
	guarantee_online_mems(cs, &newmems);

	cpuset_change_task_nodemask(p, &newmems);

	mm = get_task_mm(p);
	if (!mm)
		return;

	migrate = is_memory_migrate(cs);

	mpol_rebind_mm(mm, &cs->mems_allowed);
	if (migrate)
		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
	mmput(mm);
}
1036
/* The cpuset currently having its tasks' mems rebound, or NULL. */
static void *cpuset_being_rebound;
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
/*
 * update_tasks_nodemask - rebind every task in @cs to cs->mems_allowed.
 * @oldmem: the previous nodemask (needed for page migration).
 * @heap: pre-initialized ptr_heap handed to cgroup_scan_tasks().
 *
 * Sets cpuset_being_rebound around the scan so
 * current_cpuset_is_being_rebound() can detect it.  Called with
 * cgroup_mutex held.
 */
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
				 struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;	/* no filter: every task needs rebinding */
	scan.process_task = cpuset_change_nodemask;
	scan.heap = heap;
	scan.data = (nodemask_t *)oldmem;

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.
	 */
	cgroup_scan_tasks(&scan);

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
/*
 * update_nodemask - parse @buf into trialcs->mems_allowed, validate it,
 * then commit it to @cs and rebind all member tasks' mempolicies.
 * Called with cgroup_mutex held; takes callback_mutex around the
 * commit.  Returns 0 or a negative errno.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
	int retval;
	struct ptr_heap heap;

	if (!oldmem)
		return -ENOMEM;

	/* top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
	 * it's read-only */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty buffer means an empty mask; otherwise parse the node
	 * list and require it to be a subset of nodes with memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				node_states[N_HIGH_MEMORY])) {
			retval =  -EINVAL;
			goto done;
		}
	}
	*oldmem = cs->mems_allowed;
	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs->mems_allowed;
	mutex_unlock(&callback_mutex);

	update_tasks_nodemask(cs, oldmem, &heap);

	heap_free(&heap);
done:
	NODEMASK_FREE(oldmem);
	return retval;
}
1153
1154int current_cpuset_is_being_rebound(void)
1155{
1156 return task_cs(current) == cpuset_being_rebound;
1157}
1158
/*
 * Set @cs's sched_relax_domain_level to @val (-1 means "use default").
 * Triggers an async sched-domain rebuild when the value actually
 * changes and the cpuset participates in load balancing.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
/*
 * cgroup_scan_tasks() callback: refresh one task's PF_SPREAD_* flags
 * from its cpuset.
 */
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
}
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1206{
1207 struct cgroup_scanner scan;
1208
1209 scan.cg = cs->css.cgroup;
1210 scan.test_task = NULL;
1211 scan.process_task = cpuset_change_flag;
1212 scan.heap = heap;
1213 cgroup_scan_tasks(&scan);
1214}
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
/*
 * update_flag - set or clear flag bit @bit in cpuset @cs.
 * @turning_on: nonzero to set the bit, zero to clear it.
 *
 * Validates the change against the hierarchy on a trial copy first,
 * commits under callback_mutex, then kicks a sched-domain rebuild if
 * load balancing toggled and refreshes tasks' PF_SPREAD_* flags if a
 * spread flag toggled.  Called with cgroup_mutex held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	struct ptr_heap heap;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (err < 0)
		goto out;

	/* capture deltas before committing the new flags */
	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs->flags;
	mutex_unlock(&callback_mutex);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		async_rebuild_sched_domains();

	if (spread_flag_changed)
		update_tasks_flags(cs, &heap);
	heap_free(&heap);
out:
	free_trial_cpuset(trialcs);
	return err;
}
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
/* Frequency meter tuning: per-second exponential decay with coefficient
 * FM_COEF/FM_SCALE (933/1000); at most FM_MAXTICKS decay steps per
 * update; event count saturates at FM_MAXCNT. */
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */
1321
1322
1323static void fmeter_init(struct fmeter *fmp)
1324{
1325 fmp->cnt = 0;
1326 fmp->val = 0;
1327 fmp->time = 0;
1328 spin_lock_init(&fmp->lock);
1329}
1330
1331
/*
 * Internal meter update - process cnt events and update value.
 * Decays val once per elapsed second (capped at FM_MAXTICKS steps),
 * then folds the pending event count into val.  Caller holds fmp->lock.
 */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}
1348
1349
/* Process any previous ticks, then bump cnt by one event (saturating). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}
1357
1358
1359static int fmeter_getrate(struct fmeter *fmp)
1360{
1361 int val;
1362
1363 spin_lock(&fmp->lock);
1364 fmeter_update(fmp);
1365 val = fmp->val;
1366 spin_unlock(&fmp->lock);
1367 return val;
1368}
1369
1370
/* Scratch cpumask used during attach; protected by cgroup_mutex. */
static cpumask_var_t cpus_attach;
1372
1373
/* cgroup callback: may task @tsk (and optionally its thread group) be
 * attached to the cpuset of @cont?  Called with cgroup_mutex held. */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			     struct task_struct *tsk, bool threadgroup)
{
	int ret;
	struct cpuset *cs = cgroup_cs(cont);

	/* can't attach tasks to a cpuset with no cpus or mems */
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	/*
	 * Kthreads bound to specific cpus cannot be moved to a new cpuset;
	 * we cannot change their cpu affinity and isolating such threads
	 * by their set of allowed nodes is unnecessary.  Thus, cpusets
	 * are not applicable for such threads.  This prevents checking for
	 * success of set_cpus_allowed_ptr() on all attached tasks
	 * before cpus_allowed may be changed.
	 */
	if (tsk->flags & PF_THREAD_BOUND)
		return -EINVAL;

	ret = security_task_setscheduler(tsk);
	if (ret)
		return ret;
	if (threadgroup) {
		struct task_struct *c;

		/* every thread in the group must pass the security check */
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			ret = security_task_setscheduler(c);
			if (ret) {
				rcu_read_unlock();
				return ret;
			}
		}
		rcu_read_unlock();
	}
	return 0;
}
1412
/* Move one task onto cpus_attach / @to and refresh its spread flags.
 * Helper for cpuset_attach(); cpus_attach was filled by the caller. */
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
			       struct cpuset *cs)
{
	int err;
	/*
	 * can_attach beforehand should guarantee that this doesn't fail.
	 * TODO: have a better way to handle failure here
	 */
	err = set_cpus_allowed_ptr(tsk, cpus_attach);
	WARN_ON_ONCE(err);

	cpuset_change_task_nodemask(tsk, to);
	cpuset_update_task_spread_flag(cs, tsk);

}
1428
/* cgroup callback: attach @tsk (and optionally its whole thread group)
 * to the cpuset of @cont, rebinding CPU and memory placement.  Called
 * with cgroup_mutex held, which also protects the static nodemask. */
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			  struct cgroup *oldcont, struct task_struct *tsk,
			  bool threadgroup)
{
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	static nodemask_t to;		/* protected by cgroup_mutex */

	if (cs == &top_cpuset) {
		cpumask_copy(cpus_attach, cpu_possible_mask);
	} else {
		guarantee_online_cpus(cs, cpus_attach);
	}
	guarantee_online_mems(cs, &to);

	/* do per-task migration stuff possibly for each in the threadgroup */
	cpuset_attach_task(tsk, &to, cs);
	if (threadgroup) {
		struct task_struct *c;
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			cpuset_attach_task(c, &to, cs);
		}
		rcu_read_unlock();
	}

	/* change mm; only needs to be done once even if threadgroup */
	to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
		mmput(mm);
	}
}
1466
1467
1468
/* The various types of files and directories in a cpuset file system;
 * stored in cftype->private to dispatch read/write handlers. */
typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;
1483
/*
 * Write handler for the boolean cpuset control files; dispatches on
 * cft->private.  memory_pressure itself is read-only (-EACCES).
 * Returns 0 or a negative errno; -ENODEV if the cgroup died.
 */
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}
1528
1529static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1530{
1531 int retval = 0;
1532 struct cpuset *cs = cgroup_cs(cgrp);
1533 cpuset_filetype_t type = cft->private;
1534
1535 if (!cgroup_lock_live_group(cgrp))
1536 return -ENODEV;
1537
1538 switch (type) {
1539 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1540 retval = update_relax_domain_level(cs, val);
1541 break;
1542 default:
1543 retval = -EINVAL;
1544 break;
1545 }
1546 cgroup_unlock();
1547 return retval;
1548}
1549
1550
1551
1552
/*
 * Common write handler for the "cpus" and "mems" files: stage the
 * change on a trial cpuset, then hand off to update_cpumask() or
 * update_nodemask().  Returns 0 or a negative errno.
 */
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out;
	}

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out:
	cgroup_unlock();
	return retval;
}
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1600{
1601 size_t count;
1602
1603 mutex_lock(&callback_mutex);
1604 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1605 mutex_unlock(&callback_mutex);
1606
1607 return count;
1608}
1609
1610static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1611{
1612 size_t count;
1613
1614 mutex_lock(&callback_mutex);
1615 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1616 mutex_unlock(&callback_mutex);
1617
1618 return count;
1619}
1620
1621static ssize_t cpuset_common_file_read(struct cgroup *cont,
1622 struct cftype *cft,
1623 struct file *file,
1624 char __user *buf,
1625 size_t nbytes, loff_t *ppos)
1626{
1627 struct cpuset *cs = cgroup_cs(cont);
1628 cpuset_filetype_t type = cft->private;
1629 char *page;
1630 ssize_t retval = 0;
1631 char *s;
1632
1633 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1634 return -ENOMEM;
1635
1636 s = page;
1637
1638 switch (type) {
1639 case FILE_CPULIST:
1640 s += cpuset_sprintf_cpulist(s, cs);
1641 break;
1642 case FILE_MEMLIST:
1643 s += cpuset_sprintf_memlist(s, cs);
1644 break;
1645 default:
1646 retval = -EINVAL;
1647 goto out;
1648 }
1649 *s++ = '\n';
1650
1651 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1652out:
1653 free_page((unsigned long)page);
1654 return retval;
1655}
1656
/* Read handler for the boolean cpuset control files plus the
 * memory_pressure rate; dispatches on cft->private. */
static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}
1687
1688static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1689{
1690 struct cpuset *cs = cgroup_cs(cont);
1691 cpuset_filetype_t type = cft->private;
1692 switch (type) {
1693 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1694 return cs->relax_domain_level;
1695 default:
1696 BUG();
1697 }
1698
1699
1700 return 0;
1701}
1702
1703
1704
1705
1706
1707
/*
 * Control files created in every cpuset directory (the names get a
 * "cpuset." prefix unless mounted with noprefix).
 */
static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		/* read-only rate; writes rejected in cpuset_write_u64() */
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};
1789
/* memory_pressure_enabled: exists only in the root cpuset directory
 * (see cpuset_populate()). */
static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};
1796
1797static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1798{
1799 int err;
1800
1801 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1802 if (err)
1803 return err;
1804
1805 if (!cont->parent)
1806 err = cgroup_add_file(cont, ss,
1807 &cft_memory_pressure_enabled);
1808 return err;
1809}
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
/*
 * cpuset_post_clone - give a freshly cloned cpuset usable masks
 * @ss:     the cpuset cgroup subsystem
 * @cgroup: the newly cloned cgroup
 *
 * Copies the parent cpuset's cpus_allowed and mems_allowed into the new
 * cpuset.  If any sibling is marked cpu_exclusive or mem_exclusive we
 * bail out and leave the new cpuset empty, since inheriting the
 * parent's masks could overlap an exclusive sibling.
 */
static void cpuset_post_clone(struct cgroup_subsys *ss,
			      struct cgroup *cgroup)
{
	struct cgroup *parent, *child;
	struct cpuset *cs, *parent_cs;

	parent = cgroup->parent;
	/* Refuse to inherit if any sibling is exclusive. */
	list_for_each_entry(child, &parent->children, sibling) {
		cs = cgroup_cs(child);
		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
			return;
	}
	cs = cgroup_cs(cgroup);
	parent_cs = cgroup_cs(parent);

	/* callback_mutex serializes updates of the allowed masks. */
	mutex_lock(&callback_mutex);
	cs->mems_allowed = parent_cs->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
	mutex_unlock(&callback_mutex);
	return;
}
1849
1850
1851
1852
1853
1854
1855
1856static struct cgroup_subsys_state *cpuset_create(
1857 struct cgroup_subsys *ss,
1858 struct cgroup *cont)
1859{
1860 struct cpuset *cs;
1861 struct cpuset *parent;
1862
1863 if (!cont->parent) {
1864 return &top_cpuset.css;
1865 }
1866 parent = cgroup_cs(cont->parent);
1867 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1868 if (!cs)
1869 return ERR_PTR(-ENOMEM);
1870 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1871 kfree(cs);
1872 return ERR_PTR(-ENOMEM);
1873 }
1874
1875 cs->flags = 0;
1876 if (is_spread_page(parent))
1877 set_bit(CS_SPREAD_PAGE, &cs->flags);
1878 if (is_spread_slab(parent))
1879 set_bit(CS_SPREAD_SLAB, &cs->flags);
1880 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1881 cpumask_clear(cs->cpus_allowed);
1882 nodes_clear(cs->mems_allowed);
1883 fmeter_init(&cs->fmeter);
1884 cs->relax_domain_level = -1;
1885
1886 cs->parent = parent;
1887 number_of_cpusets++;
1888 return &cs->css ;
1889}
1890
1891
1892
1893
1894
1895
1896
/*
 * cpuset_destroy - tear down a cpuset when its cgroup is removed
 * @ss:   the cpuset cgroup subsystem
 * @cont: the cgroup being destroyed
 *
 * Turn off load balancing first (so the sched-domain partitioning is
 * rebuilt without this cpuset), then release the cpumask and the
 * cpuset itself.
 */
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}
1908
/* cgroup subsystem callbacks registered for cpusets */
struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.create = cpuset_create,
	.destroy = cpuset_destroy,
	.can_attach = cpuset_can_attach,
	.attach = cpuset_attach,
	.populate = cpuset_populate,
	.post_clone = cpuset_post_clone,
	.subsys_id = cpuset_subsys_id,
	.early_init = 1,	/* top_cpuset is needed early in boot */
};
1920
1921
1922
1923
1924
1925
1926
/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Sets up the statically allocated top_cpuset (all cpus, all memory
 * nodes, load balancing on) and registers the cpuset pseudo-filesystem.
 *
 * Return: 0 on success, or the error from register_filesystem().
 */
int __init cpuset_init(void)
{
	int err = 0;

	/* Failing a boot-time allocation is unrecoverable. */
	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();

	/* Only top_cpuset exists at this point. */
	number_of_cpusets = 1;
	return 0;
}
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960static void cpuset_do_move_task(struct task_struct *tsk,
1961 struct cgroup_scanner *scan)
1962{
1963 struct cgroup *new_cgroup = scan->data;
1964
1965 cgroup_attach_task(new_cgroup, tsk);
1966}
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1980{
1981 struct cgroup_scanner scan;
1982
1983 scan.cg = from->css.cgroup;
1984 scan.test_task = NULL;
1985 scan.process_task = cpuset_do_move_task;
1986 scan.heap = NULL;
1987 scan.data = to->css.cgroup;
1988
1989 if (cgroup_scan_tasks(&scan))
1990 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1991 "cgroup_scan_tasks failed\n");
1992}
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
/*
 * remove_tasks_in_empty_cpuset - evacuate tasks from a cpuset that has
 * lost all of its cpus or memory nodes
 * @cs: the emptied cpuset
 *
 * Moves the member tasks up to the nearest ancestor that still has both
 * cpus and memory.  The walk must terminate because top_cpuset always
 * tracks the online cpus/nodes — presumably never empty while tasks
 * exist; see cpuset_update_active_cpus()/cpuset_track_online_nodes().
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * No css_sets means no member tasks: nothing to move.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Climb past any ancestors that are themselves empty.
	 */
	parent = cs->parent;
	while (cpumask_empty(parent->cpus_allowed) ||
	       nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
/*
 * scan_for_empty_cpusets - trim offline cpus/nodes from a cpuset subtree
 * @root: root of the subtree to walk
 *
 * Breadth-first walk of the hierarchy (each cpuset's embedded
 * stack_list serves as the queue linkage).  Every cpuset's
 * cpus_allowed/mems_allowed is intersected with what is currently
 * online; cpusets left with no cpus or no memory have their tasks moved
 * to a non-empty ancestor, otherwise their tasks are updated in place.
 *
 * NOTE(review): callers take cgroup_lock() first (see
 * cpuset_update_active_cpus() and cpuset_track_online_nodes()), which
 * is presumably what makes the static oldmems safe — confirm.
 */
static void scan_for_empty_cpusets(struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* cpuset currently being processed */
	struct cpuset *child;
	struct cgroup *cont;
	/* static: nodemask_t can be too large for the stack */
	static nodemask_t oldmems;

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		/* enqueue children so the whole subtree is visited */
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Skip cpusets whose masks are still fully online. */
		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		oldmems = cp->mems_allowed;

		/* Drop offline cpus and memory-less nodes from the masks. */
		mutex_lock(&callback_mutex);
		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
			    cpu_active_mask);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Evacuate tasks if nothing is left, else retarget them. */
		if (cpumask_empty(cp->cpus_allowed) ||
		    nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems, NULL);
		}
	}
}
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
/*
 * cpuset_update_active_cpus - react to a cpu hotplug event
 *
 * Resynchronizes top_cpuset.cpus_allowed with cpu_active_mask, trims
 * every descendant cpuset of now-offline cpus, regenerates the sched
 * domain partitioning, and hands it to the scheduler.
 */
void cpuset_update_active_cpus(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	cgroup_lock();
	mutex_lock(&callback_mutex);
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	mutex_unlock(&callback_mutex);
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Hand the new domain partitioning to the scheduler. */
	partition_sched_domains(ndoms, doms, attr);
}
2116
2117#ifdef CONFIG_MEMORY_HOTPLUG
2118
2119
2120
2121
2122
/*
 * cpuset_track_online_nodes - memory hotplug notifier callback
 *
 * Keeps top_cpuset.mems_allowed in sync with the set of nodes that have
 * (high) memory.  On node online the top mask grows and member tasks
 * are rebound; on node offline the whole hierarchy is scanned so that
 * emptied cpusets are cleaned up.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	/* static: nodemask_t can be too large for the stack;
	 * serialized by cgroup_lock() below */
	static nodemask_t oldmems;

	cgroup_lock();
	switch (action) {
	case MEM_ONLINE:
		oldmems = top_cpuset.mems_allowed;
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
		break;
	case MEM_OFFLINE:
		/*
		 * Shrinking: walk the whole tree and evacuate any cpuset
		 * that has lost all of its memory nodes.
		 */
		scan_for_empty_cpusets(&top_cpuset);
		break;
	default:
		break;
	}
	cgroup_unlock();

	return NOTIFY_OK;
}
2151#endif
2152
2153
2154
2155
2156
2157
2158
/*
 * cpuset_init_smp - finish cpuset initialization once SMP is up
 *
 * Narrows top_cpuset from "everything" (set in cpuset_init()) down to
 * the cpus/nodes actually online, registers the memory hotplug
 * notifier, and creates the cpuset workqueue.
 */
void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	/* priority 10 — NOTE(review): ordering vs other notifiers, confirm */
	hotplug_memory_notifier(cpuset_track_online_nodes, 10);

	cpuset_wq = create_singlethread_workqueue("cpuset");
	BUG_ON(!cpuset_wq);
}
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
/**
 * cpuset_cpus_allowed - fill @pmask with @tsk's cpuset-permitted cpus
 * @tsk:   task to query
 * @pmask: output mask, filled under callback_mutex + task_lock so the
 *         answer is consistent with the task's cpuset at that instant
 */
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}
2189
/*
 * cpuset_cpus_allowed_fallback - last-resort cpu selection for @tsk
 *
 * Resets tsk->cpus_allowed from the task's cpuset (under RCU) and picks
 * any active cpu from it.  If the intersection with cpu_active_mask is
 * empty — e.g. all of the cpuset's cpus went offline — widen to
 * cpu_possible_mask and pick any active cpu, so the scheduler always
 * gets somewhere to run the task.
 *
 * Returns the chosen cpu number.
 */
int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpuset *cs;
	int cpu;

	rcu_read_lock();
	cs = task_cs(tsk);
	if (cs)
		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
	rcu_read_unlock();

	/*
	 * NOTE(review): the copy above is done under RCU only, not
	 * callback_mutex, so it can race with a concurrent cpuset
	 * update — presumably tolerated because the result is only a
	 * fallback hint; confirm against callers in the scheduler.
	 */
	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
	if (cpu >= nr_cpu_ids) {
		/*
		 * No active cpu left in the cpuset's mask: give the
		 * task the widest possible mask and pick any active
		 * cpu so it can keep running.
		 */
		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
		cpu = cpumask_any(cpu_active_mask);
	}

	return cpu;
}
2231
/* Allow the current (early-boot/init) task to allocate on any node. */
void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
/**
 * cpuset_mems_allowed - return @tsk's cpuset-permitted memory nodes
 * @tsk: task to query
 *
 * Returns, by value, the mask of online memory nodes the task's cpuset
 * allows, computed under callback_mutex + task_lock for a consistent
 * snapshot.
 */
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}
2259
2260
2261
2262
2263
2264
2265
/*
 * cpuset_nodemask_valid_mems_allowed - does @nodemask overlap the
 * current task's mems_allowed?  Non-zero if at least one node is
 * common to both masks.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
2270
2271
2272
2273
2274
2275
2276
2277static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2278{
2279 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2280 cs = cs->parent;
2281 return cs;
2282}
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
/**
 * __cpuset_node_allowed_softwall - may current allocate on @node?
 * @node:     memory node being considered
 * @gfp_mask: allocation flags
 *
 * Fast paths (no locking): interrupts and __GFP_THISNODE allocations
 * are always allowed, as are nodes already in current->mems_allowed.
 * Tasks being OOM-killed (TIF_MEMDIE) may allocate anywhere so they
 * can exit quickly.  __GFP_HARDWALL callers get a strict answer, and
 * exiting tasks are unconstrained.
 *
 * Slow path: take callback_mutex and check the node against the
 * nearest mem_exclusive/mem_hardwall ancestor cpuset — the "softwall"
 * rule that lets GFP_KERNEL allocations borrow from enclosing cpusets.
 * The might_sleep_if() documents that non-hardwall callers must be
 * able to sleep here.
 */
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	const struct cpuset *cs;	/* nearest hardwalled ancestor */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Let the task being OOM-killed allocate anywhere, so it can
	 * release memory and exit.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* hardwall callers get no leniency */
		return 0;

	if (current->flags & PF_EXITING) /* exiting tasks are unconstrained */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2403{
2404 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2405 return 1;
2406 if (node_isset(node, current->mems_allowed))
2407 return 1;
2408
2409
2410
2411
2412 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2413 return 1;
2414 return 0;
2415}
2416
2417
2418
2419
2420
2421
2422
/* Release the global cpuset callback_mutex (pairs with cpuset_lock()). */
void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455static int cpuset_spread_node(int *rotor)
2456{
2457 int node;
2458
2459 node = next_node(*rotor, current->mems_allowed);
2460 if (node == MAX_NUMNODES)
2461 node = first_node(current->mems_allowed);
2462 *rotor = node;
2463 return node;
2464}
2465
2466int cpuset_mem_spread_node(void)
2467{
2468 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
2469}
2470
2471int cpuset_slab_spread_node(void)
2472{
2473 return cpuset_spread_node(¤t->cpuset_slab_spread_rotor);
2474}
2475
2476EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
/**
 * cpuset_mems_allowed_intersects - do two tasks share any memory node?
 * @tsk1: first task
 * @tsk2: second task
 *
 * Non-zero if the tasks' mems_allowed masks have at least one node in
 * common.
 */
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2504{
2505 struct dentry *dentry;
2506
2507 dentry = task_cs(tsk)->css.cgroup->dentry;
2508 spin_lock(&cpuset_buffer_lock);
2509 snprintf(cpuset_name, CPUSET_NAME_LEN,
2510 dentry ? (const char *)dentry->d_name.name : "/");
2511 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2512 tsk->mems_allowed);
2513 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2514 tsk->comm, cpuset_name, cpuset_nodelist);
2515 spin_unlock(&cpuset_buffer_lock);
2516}
2517
2518
2519
2520
2521
2522
2523
/*
 * Global switch for per-cpuset memory pressure metering; off by
 * default, toggled via the root cpuset's memory_pressure_enabled file
 * (see cft_memory_pressure_enabled).
 */
int cpuset_memory_pressure_enabled __read_mostly;
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
/*
 * __cpuset_memory_pressure_bump - record a memory-pressure event for
 * the current task's cpuset.  task_lock pins the task's cpuset while
 * its frequency meter is bumped.
 */
void __cpuset_memory_pressure_bump(void)
{
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}
2550
2551#ifdef CONFIG_PROC_PID_CPUSET
2552
2553
2554
2555
2556
2557
2558
2559
2560
/*
 * proc_cpuset_show - seq_file show handler for /proc/<pid>/cpuset
 * @m:        seq_file; m->private carries the target struct pid
 * @unused_v: unused seq iterator cookie
 *
 * Prints the cgroup path of the task's cpuset.  Resources are released
 * in reverse order of acquisition via the goto-cleanup chain.
 *
 * Returns 0 on success or a negative errno (-ENOMEM, -ESRCH, or the
 * error from cgroup_path()).
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;			/* scratch buffer for the cgroup path */
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = -EINVAL;
	/* cgroup_lock keeps the css/cgroup stable while we read its path */
	cgroup_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	if (retval < 0)
		goto out_unlock;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_unlock:
	cgroup_unlock();
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}
2596
2597static int cpuset_open(struct inode *inode, struct file *file)
2598{
2599 struct pid *pid = PROC_I(inode)->pid;
2600 return single_open(file, proc_cpuset_show, pid);
2601}
2602
/* file_operations for /proc/<pid>/cpuset (standard single_open seq_file) */
const struct file_operations proc_cpuset_operations = {
	.open		= cpuset_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
2609#endif
2610
2611
2612void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2613{
2614 seq_printf(m, "Mems_allowed:\t");
2615 seq_nodemask(m, &task->mems_allowed);
2616 seq_printf(m, "\n");
2617 seq_printf(m, "Mems_allowed_list:\t");
2618 seq_nodemask_list(m, &task->mems_allowed);
2619 seq_printf(m, "\n");
2620}
2621