1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/export.h>
41#include <linux/mount.h>
42#include <linux/namei.h>
43#include <linux/pagemap.h>
44#include <linux/proc_fs.h>
45#include <linux/rcupdate.h>
46#include <linux/sched.h>
47#include <linux/seq_file.h>
48#include <linux/security.h>
49#include <linux/slab.h>
50#include <linux/spinlock.h>
51#include <linux/stat.h>
52#include <linux/string.h>
53#include <linux/time.h>
54#include <linux/backing-dev.h>
55#include <linux/sort.h>
56
57#include <asm/uaccess.h>
58#include <linux/atomic.h>
59#include <linux/mutex.h>
60#include <linux/workqueue.h>
61#include <linux/cgroup.h>
62
63
64
65
66
67
68
/*
 * Workqueue on which async_rebuild_sched_domains() queues
 * rebuild_sched_domains_work.
 */
static struct workqueue_struct *cpuset_wq;
70
71
72
73
74
75
/*
 * Count of cpusets; generate_sched_domains() uses it to size its
 * temporary array of cpuset pointers.  The code that updates it is not
 * visible in this part of the file.
 */
int number_of_cpusets __read_mostly;
77
78
/*
 * Forward declarations.  struct cpuset is defined just below; the
 * initialised definition of cpuset_subsys is presumably further down the
 * file (not visible here).
 */
struct cgroup_subsys cpuset_subsys;
struct cpuset;
81
82
83
/*
 * Event-frequency meter.  All fields are maintained by the fmeter_*()
 * helpers further down.
 */
struct fmeter {
	int cnt;		/* events since the last fmeter_update(), each worth FM_SCALE */
	int val;		/* filtered rate; decayed and refreshed by fmeter_update() */
	time_t time;		/* get_seconds() value recorded by the last fmeter_update() */
	spinlock_t lock;	/* held by fmeter_markevent() and fmeter_getrate() */
};
90
/*
 * Per-cgroup cpuset state.  cgroup_cs() and task_cs() recover a
 * struct cpuset from its embedded css via container_of().
 */
struct cpuset {
	struct cgroup_subsys_state css;	/* embedded cgroup subsystem state */

	unsigned long flags;		/* bit field indexed by cpuset_flagbits_t */
	cpumask_var_t cpus_allowed;	/* CPUs configured for this cpuset */
	nodemask_t mems_allowed;	/* memory nodes configured for this cpuset */

	struct cpuset *parent;		/* walked upwards by guarantee_online_*() */

	struct fmeter fmeter;		/* backs the "memory_pressure" file */

	/* partition number, scratch space for generate_sched_domains() */
	int pn;

	/* value behind the "sched_relax_domain_level" file */
	int relax_domain_level;

	/* queue linkage for the breadth-first walks over the hierarchy */
	struct list_head stack_list;
};
111
112
113static inline struct cpuset *cgroup_cs(struct cgroup *cont)
114{
115 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
116 struct cpuset, css);
117}
118
119
120static inline struct cpuset *task_cs(struct task_struct *task)
121{
122 return container_of(task_subsys_state(task, cpuset_subsys_id),
123 struct cpuset, css);
124}
125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
139
/*
 * Bit positions inside cpuset.flags.  They are used with test_bit(),
 * set_bit() and clear_bit(), so the numeric values are spelled out.
 */
typedef enum {
	CS_CPU_EXCLUSIVE	= 0,
	CS_MEM_EXCLUSIVE	= 1,
	CS_MEM_HARDWALL		= 2,
	CS_MEMORY_MIGRATE	= 3,
	CS_SCHED_LOAD_BALANCE	= 4,
	CS_SPREAD_PAGE		= 5,
	CS_SPREAD_SLAB		= 6,
} cpuset_flagbits_t;
149
150
151static inline int is_cpu_exclusive(const struct cpuset *cs)
152{
153 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
154}
155
156static inline int is_mem_exclusive(const struct cpuset *cs)
157{
158 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
159}
160
161static inline int is_mem_hardwall(const struct cpuset *cs)
162{
163 return test_bit(CS_MEM_HARDWALL, &cs->flags);
164}
165
166static inline int is_sched_load_balance(const struct cpuset *cs)
167{
168 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
169}
170
171static inline int is_memory_migrate(const struct cpuset *cs)
172{
173 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
174}
175
176static inline int is_spread_page(const struct cpuset *cs)
177{
178 return test_bit(CS_SPREAD_PAGE, &cs->flags);
179}
180
181static inline int is_spread_slab(const struct cpuset *cs)
182{
183 return test_bit(CS_SPREAD_SLAB, &cs->flags);
184}
185
/*
 * The root cpuset.  cpuset_create() returns &top_cpuset.css for the
 * cgroup that has no parent.  It starts out with both the CPU-exclusive
 * and memory-exclusive flag bits set.
 */
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
 * Taken in this file around every store to a cpuset's cpus_allowed,
 * mems_allowed or flags, and around the reads done by
 * cpuset_sprintf_cpulist() and cpuset_sprintf_memlist().
 */
static DEFINE_MUTEX(callback_mutex);
230
231
232
233
234
235
/*
 * Static scratch buffers for a cpuset name and a node list, plus a
 * spinlock.  NOTE(review): none of their users are visible in this part
 * of the file; presumably cpuset_buffer_lock serialises access to the
 * two buffers - confirm against the users.
 */
#define CPUSET_NAME_LEN		(128)
#define CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
241
242
243
244
245
246
247static struct dentry *cpuset_mount(struct file_system_type *fs_type,
248 int flags, const char *unused_dev_name, void *data)
249{
250 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
251 struct dentry *ret = ERR_PTR(-ENODEV);
252 if (cgroup_fs) {
253 char mountopts[] =
254 "cpuset,noprefix,"
255 "release_agent=/sbin/cpuset_release_agent";
256 ret = cgroup_fs->mount(cgroup_fs, flags,
257 unused_dev_name, mountopts);
258 put_filesystem(cgroup_fs);
259 }
260 return ret;
261}
262
/* The "cpuset" filesystem type; mounting it goes through cpuset_mount(). */
static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282static void guarantee_online_cpus(const struct cpuset *cs,
283 struct cpumask *pmask)
284{
285 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
286 cs = cs->parent;
287 if (cs)
288 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
289 else
290 cpumask_copy(pmask, cpu_online_mask);
291 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
292}
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
308{
309 while (cs && !nodes_intersects(cs->mems_allowed,
310 node_states[N_HIGH_MEMORY]))
311 cs = cs->parent;
312 if (cs)
313 nodes_and(*pmask, cs->mems_allowed,
314 node_states[N_HIGH_MEMORY]);
315 else
316 *pmask = node_states[N_HIGH_MEMORY];
317 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
318}
319
320
321
322
323
324
325static void cpuset_update_task_spread_flag(struct cpuset *cs,
326 struct task_struct *tsk)
327{
328 if (is_spread_page(cs))
329 tsk->flags |= PF_SPREAD_PAGE;
330 else
331 tsk->flags &= ~PF_SPREAD_PAGE;
332 if (is_spread_slab(cs))
333 tsk->flags |= PF_SPREAD_SLAB;
334 else
335 tsk->flags &= ~PF_SPREAD_SLAB;
336}
337
338
339
340
341
342
343
344
345
346static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
347{
348 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
349 nodes_subset(p->mems_allowed, q->mems_allowed) &&
350 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
351 is_mem_exclusive(p) <= is_mem_exclusive(q);
352}
353
354
355
356
357
358static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
359{
360 struct cpuset *trial;
361
362 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
363 if (!trial)
364 return NULL;
365
366 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
367 kfree(trial);
368 return NULL;
369 }
370 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
371
372 return trial;
373}
374
375
376
377
378
/*
 * free_trial_cpuset - release a cpuset obtained from alloc_trial_cpuset().
 * The cpumask is freed first because it is reached through @trial.
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
406{
407 struct cgroup *cont;
408 struct cpuset *c, *par;
409
410
411 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
412 if (!is_cpuset_subset(cgroup_cs(cont), trial))
413 return -EBUSY;
414 }
415
416
417 if (cur == &top_cpuset)
418 return 0;
419
420 par = cur->parent;
421
422
423 if (!is_cpuset_subset(trial, par))
424 return -EACCES;
425
426
427
428
429
430 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
431 c = cgroup_cs(cont);
432 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
433 c != cur &&
434 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
435 return -EINVAL;
436 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
437 c != cur &&
438 nodes_intersects(trial->mems_allowed, c->mems_allowed))
439 return -EINVAL;
440 }
441
442
443 if (cgroup_task_count(cur->css.cgroup)) {
444 if (cpumask_empty(trial->cpus_allowed) ||
445 nodes_empty(trial->mems_allowed)) {
446 return -ENOSPC;
447 }
448 }
449
450 return 0;
451}
452
453#ifdef CONFIG_SMP
454
455
456
457
458static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
459{
460 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
461}
462
463static void
464update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
465{
466 if (dattr->relax_domain_level < c->relax_domain_level)
467 dattr->relax_domain_level = c->relax_domain_level;
468 return;
469}
470
471static void
472update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
473{
474 LIST_HEAD(q);
475
476 list_add(&c->stack_list, &q);
477 while (!list_empty(&q)) {
478 struct cpuset *cp;
479 struct cgroup *cont;
480 struct cpuset *child;
481
482 cp = list_first_entry(&q, struct cpuset, stack_list);
483 list_del(q.next);
484
485 if (cpumask_empty(cp->cpus_allowed))
486 continue;
487
488 if (is_sched_load_balance(cp))
489 update_domain_attr(dattr, cp);
490
491 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
492 child = cgroup_cs(cont);
493 list_add_tail(&child->stack_list, &q);
494 }
495 }
496}
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552static int generate_sched_domains(cpumask_var_t **domains,
553 struct sched_domain_attr **attributes)
554{
555 LIST_HEAD(q);
556 struct cpuset *cp;
557 struct cpuset **csa;
558 int csn;
559 int i, j, k;
560 cpumask_var_t *doms;
561 struct sched_domain_attr *dattr;
562 int ndoms = 0;
563 int nslot;
564
565 doms = NULL;
566 dattr = NULL;
567 csa = NULL;
568
569
570 if (is_sched_load_balance(&top_cpuset)) {
571 ndoms = 1;
572 doms = alloc_sched_domains(ndoms);
573 if (!doms)
574 goto done;
575
576 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
577 if (dattr) {
578 *dattr = SD_ATTR_INIT;
579 update_domain_attr_tree(dattr, &top_cpuset);
580 }
581 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
582
583 goto done;
584 }
585
586 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
587 if (!csa)
588 goto done;
589 csn = 0;
590
591 list_add(&top_cpuset.stack_list, &q);
592 while (!list_empty(&q)) {
593 struct cgroup *cont;
594 struct cpuset *child;
595
596 cp = list_first_entry(&q, struct cpuset, stack_list);
597 list_del(q.next);
598
599 if (cpumask_empty(cp->cpus_allowed))
600 continue;
601
602
603
604
605
606
607
608 if (is_sched_load_balance(cp)) {
609 csa[csn++] = cp;
610 continue;
611 }
612
613 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
614 child = cgroup_cs(cont);
615 list_add_tail(&child->stack_list, &q);
616 }
617 }
618
619 for (i = 0; i < csn; i++)
620 csa[i]->pn = i;
621 ndoms = csn;
622
623restart:
624
625 for (i = 0; i < csn; i++) {
626 struct cpuset *a = csa[i];
627 int apn = a->pn;
628
629 for (j = 0; j < csn; j++) {
630 struct cpuset *b = csa[j];
631 int bpn = b->pn;
632
633 if (apn != bpn && cpusets_overlap(a, b)) {
634 for (k = 0; k < csn; k++) {
635 struct cpuset *c = csa[k];
636
637 if (c->pn == bpn)
638 c->pn = apn;
639 }
640 ndoms--;
641 goto restart;
642 }
643 }
644 }
645
646
647
648
649
650 doms = alloc_sched_domains(ndoms);
651 if (!doms)
652 goto done;
653
654
655
656
657
658 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
659
660 for (nslot = 0, i = 0; i < csn; i++) {
661 struct cpuset *a = csa[i];
662 struct cpumask *dp;
663 int apn = a->pn;
664
665 if (apn < 0) {
666
667 continue;
668 }
669
670 dp = doms[nslot];
671
672 if (nslot == ndoms) {
673 static int warnings = 10;
674 if (warnings) {
675 printk(KERN_WARNING
676 "rebuild_sched_domains confused:"
677 " nslot %d, ndoms %d, csn %d, i %d,"
678 " apn %d\n",
679 nslot, ndoms, csn, i, apn);
680 warnings--;
681 }
682 continue;
683 }
684
685 cpumask_clear(dp);
686 if (dattr)
687 *(dattr + nslot) = SD_ATTR_INIT;
688 for (j = i; j < csn; j++) {
689 struct cpuset *b = csa[j];
690
691 if (apn == b->pn) {
692 cpumask_or(dp, dp, b->cpus_allowed);
693 if (dattr)
694 update_domain_attr_tree(dattr + nslot, b);
695
696
697 b->pn = -1;
698 }
699 }
700 nslot++;
701 }
702 BUG_ON(nslot != ndoms);
703
704done:
705 kfree(csa);
706
707
708
709
710
711 if (doms == NULL)
712 ndoms = 1;
713
714 *domains = doms;
715 *attributes = dattr;
716 return ndoms;
717}
718
719
720
721
722
723
724
725
726
727
728
729static void do_rebuild_sched_domains(struct work_struct *unused)
730{
731 struct sched_domain_attr *attr;
732 cpumask_var_t *doms;
733 int ndoms;
734
735 get_online_cpus();
736
737
738 cgroup_lock();
739 ndoms = generate_sched_domains(&doms, &attr);
740 cgroup_unlock();
741
742
743 partition_sched_domains(ndoms, doms, attr);
744
745 put_online_cpus();
746}
747#else
/* !CONFIG_SMP: there are no sched domains to rebuild, so this is a no-op. */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}
751
752static int generate_sched_domains(cpumask_var_t **domains,
753 struct sched_domain_attr **attributes)
754{
755 *domains = NULL;
756 return 1;
757}
758#endif
759
/* Work item that runs do_rebuild_sched_domains(); queued by async_rebuild_sched_domains(). */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
/*
 * async_rebuild_sched_domains - defer the sched-domain rebuild by queueing
 * rebuild_sched_domains_work on cpuset_wq instead of doing it inline.
 */
static void async_rebuild_sched_domains(void)
{
	queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
785
786
787
788
789
790
791
792
793
794
/*
 * rebuild_sched_domains - rebuild the sched domains synchronously by
 * calling do_rebuild_sched_domains() directly in the caller's context.
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}
799
800
801
802
803
804
805
806
807
808
809
810static int cpuset_test_cpumask(struct task_struct *tsk,
811 struct cgroup_scanner *scan)
812{
813 return !cpumask_equal(&tsk->cpus_allowed,
814 (cgroup_cs(scan->cg))->cpus_allowed);
815}
816
817
818
819
820
821
822
823
824
825
826
827
828static void cpuset_change_cpumask(struct task_struct *tsk,
829 struct cgroup_scanner *scan)
830{
831 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
832}
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
848{
849 struct cgroup_scanner scan;
850
851 scan.cg = cs->css.cgroup;
852 scan.test_task = cpuset_test_cpumask;
853 scan.process_task = cpuset_change_cpumask;
854 scan.heap = heap;
855 cgroup_scan_tasks(&scan);
856}
857
858
859
860
861
862
863static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
864 const char *buf)
865{
866 struct ptr_heap heap;
867 int retval;
868 int is_load_balanced;
869
870
871 if (cs == &top_cpuset)
872 return -EACCES;
873
874
875
876
877
878
879
880 if (!*buf) {
881 cpumask_clear(trialcs->cpus_allowed);
882 } else {
883 retval = cpulist_parse(buf, trialcs->cpus_allowed);
884 if (retval < 0)
885 return retval;
886
887 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
888 return -EINVAL;
889 }
890 retval = validate_change(cs, trialcs);
891 if (retval < 0)
892 return retval;
893
894
895 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
896 return 0;
897
898 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
899 if (retval)
900 return retval;
901
902 is_load_balanced = is_sched_load_balance(trialcs);
903
904 mutex_lock(&callback_mutex);
905 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
906 mutex_unlock(&callback_mutex);
907
908
909
910
911
912 update_tasks_cpumask(cs, &heap);
913
914 heap_free(&heap);
915
916 if (is_load_balanced)
917 async_rebuild_sched_domains();
918 return 0;
919}
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
942 const nodemask_t *to)
943{
944 struct task_struct *tsk = current;
945
946 tsk->mems_allowed = *to;
947
948 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
949
950 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
951}
952
953
954
955
956
957
958
959
960
961
/*
 * cpuset_change_task_nodemask - switch @tsk->mems_allowed, and its
 * mempolicy via mpol_rebind_task(), over to *@newmems.
 *
 * Done in two steps under task_lock(@tsk): first the new nodes are OR-ed
 * into @tsk->mems_allowed (MPOL_REBIND_STEP1), then the mask is set to
 * exactly *@newmems (MPOL_REBIND_STEP2).  When a wait is required (see
 * need_loop) the second step is only reached once
 * @tsk->mems_allowed_change_disable reads as zero; until then the lock
 * is dropped and the whole sequence restarts.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

repeat:
	/*
	 * Bail out without changing anything if TIF_MEMDIE is set or the
	 * task is exiting.
	 * NOTE(review): current->flags, and presumably test_thread_flag()
	 * too, inspect the calling task rather than @tsk - confirm that is
	 * intended.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING)
		return;

	task_lock(tsk);

	/*
	 * A wait is only needed when the task has a mempolicy or when the
	 * old and new masks share no node.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	/*
	 * Presumably orders the stores above before the load of
	 * mems_allowed_change_disable below - TODO confirm which barrier
	 * on the reader side this pairs with.
	 */
	smp_mb();

	/*
	 * While the task has changes disabled, drop the lock, yield unless
	 * @tsk is currently running on a CPU, and start over.
	 */
	while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
		task_unlock(tsk);
		if (!task_curr(tsk))
			yield();
		goto repeat;
	}

	/*
	 * Presumably orders the load above before the stores below -
	 * TODO confirm.
	 */
	smp_mb();

	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;
	task_unlock(tsk);
}
1027
1028
1029
1030
1031
1032
1033static void cpuset_change_nodemask(struct task_struct *p,
1034 struct cgroup_scanner *scan)
1035{
1036 struct mm_struct *mm;
1037 struct cpuset *cs;
1038 int migrate;
1039 const nodemask_t *oldmem = scan->data;
1040 static nodemask_t newmems;
1041
1042 cs = cgroup_cs(scan->cg);
1043 guarantee_online_mems(cs, &newmems);
1044
1045 cpuset_change_task_nodemask(p, &newmems);
1046
1047 mm = get_task_mm(p);
1048 if (!mm)
1049 return;
1050
1051 migrate = is_memory_migrate(cs);
1052
1053 mpol_rebind_mm(mm, &cs->mems_allowed);
1054 if (migrate)
1055 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1056 mmput(mm);
1057}
1058
/*
 * The cpuset whose tasks update_tasks_nodemask() is currently rebinding,
 * or NULL.  Compared against by current_cpuset_is_being_rebound().
 */
static void *cpuset_being_rebound;
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1072 struct ptr_heap *heap)
1073{
1074 struct cgroup_scanner scan;
1075
1076 cpuset_being_rebound = cs;
1077
1078 scan.cg = cs->css.cgroup;
1079 scan.test_task = NULL;
1080 scan.process_task = cpuset_change_nodemask;
1081 scan.heap = heap;
1082 scan.data = (nodemask_t *)oldmem;
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094 cgroup_scan_tasks(&scan);
1095
1096
1097 cpuset_being_rebound = NULL;
1098}
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1114 const char *buf)
1115{
1116 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1117 int retval;
1118 struct ptr_heap heap;
1119
1120 if (!oldmem)
1121 return -ENOMEM;
1122
1123
1124
1125
1126
1127 if (cs == &top_cpuset) {
1128 retval = -EACCES;
1129 goto done;
1130 }
1131
1132
1133
1134
1135
1136
1137
1138 if (!*buf) {
1139 nodes_clear(trialcs->mems_allowed);
1140 } else {
1141 retval = nodelist_parse(buf, trialcs->mems_allowed);
1142 if (retval < 0)
1143 goto done;
1144
1145 if (!nodes_subset(trialcs->mems_allowed,
1146 node_states[N_HIGH_MEMORY])) {
1147 retval = -EINVAL;
1148 goto done;
1149 }
1150 }
1151 *oldmem = cs->mems_allowed;
1152 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1153 retval = 0;
1154 goto done;
1155 }
1156 retval = validate_change(cs, trialcs);
1157 if (retval < 0)
1158 goto done;
1159
1160 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1161 if (retval < 0)
1162 goto done;
1163
1164 mutex_lock(&callback_mutex);
1165 cs->mems_allowed = trialcs->mems_allowed;
1166 mutex_unlock(&callback_mutex);
1167
1168 update_tasks_nodemask(cs, oldmem, &heap);
1169
1170 heap_free(&heap);
1171done:
1172 NODEMASK_FREE(oldmem);
1173 return retval;
1174}
1175
1176int current_cpuset_is_being_rebound(void)
1177{
1178 return task_cs(current) == cpuset_being_rebound;
1179}
1180
1181static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182{
1183#ifdef CONFIG_SMP
1184 if (val < -1 || val >= sched_domain_level_max)
1185 return -EINVAL;
1186#endif
1187
1188 if (val != cs->relax_domain_level) {
1189 cs->relax_domain_level = val;
1190 if (!cpumask_empty(cs->cpus_allowed) &&
1191 is_sched_load_balance(cs))
1192 async_rebuild_sched_domains();
1193 }
1194
1195 return 0;
1196}
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208static void cpuset_change_flag(struct task_struct *tsk,
1209 struct cgroup_scanner *scan)
1210{
1211 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1212}
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1228{
1229 struct cgroup_scanner scan;
1230
1231 scan.cg = cs->css.cgroup;
1232 scan.test_task = NULL;
1233 scan.process_task = cpuset_change_flag;
1234 scan.heap = heap;
1235 cgroup_scan_tasks(&scan);
1236}
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1248 int turning_on)
1249{
1250 struct cpuset *trialcs;
1251 int balance_flag_changed;
1252 int spread_flag_changed;
1253 struct ptr_heap heap;
1254 int err;
1255
1256 trialcs = alloc_trial_cpuset(cs);
1257 if (!trialcs)
1258 return -ENOMEM;
1259
1260 if (turning_on)
1261 set_bit(bit, &trialcs->flags);
1262 else
1263 clear_bit(bit, &trialcs->flags);
1264
1265 err = validate_change(cs, trialcs);
1266 if (err < 0)
1267 goto out;
1268
1269 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1270 if (err < 0)
1271 goto out;
1272
1273 balance_flag_changed = (is_sched_load_balance(cs) !=
1274 is_sched_load_balance(trialcs));
1275
1276 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1277 || (is_spread_page(cs) != is_spread_page(trialcs)));
1278
1279 mutex_lock(&callback_mutex);
1280 cs->flags = trialcs->flags;
1281 mutex_unlock(&callback_mutex);
1282
1283 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1284 async_rebuild_sched_domains();
1285
1286 if (spread_flag_changed)
1287 update_tasks_flags(cs, &heap);
1288 heap_free(&heap);
1289out:
1290 free_trial_cpuset(trialcs);
1291 return err;
1292}
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
/*
 * Tuning constants for the fmeter_*() helpers:
 *   FM_COEF     numerator of the per-tick decay; val becomes
 *               val * FM_COEF / FM_SCALE
 *   FM_MAXTICKS upper bound on decay steps applied by one fmeter_update()
 *   FM_MAXCNT   upper bound for fmeter.cnt
 *   FM_SCALE    fixed-point scale: amount added to cnt per event and the
 *               divisor used when folding cnt into val
 */
#define FM_COEF 933
#define FM_MAXTICKS ((time_t)99)
#define FM_MAXCNT 1000000
#define FM_SCALE 1000
1343
1344
1345static void fmeter_init(struct fmeter *fmp)
1346{
1347 fmp->cnt = 0;
1348 fmp->val = 0;
1349 fmp->time = 0;
1350 spin_lock_init(&fmp->lock);
1351}
1352
1353
1354static void fmeter_update(struct fmeter *fmp)
1355{
1356 time_t now = get_seconds();
1357 time_t ticks = now - fmp->time;
1358
1359 if (ticks == 0)
1360 return;
1361
1362 ticks = min(FM_MAXTICKS, ticks);
1363 while (ticks-- > 0)
1364 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1365 fmp->time = now;
1366
1367 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1368 fmp->cnt = 0;
1369}
1370
1371
1372static void fmeter_markevent(struct fmeter *fmp)
1373{
1374 spin_lock(&fmp->lock);
1375 fmeter_update(fmp);
1376 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1377 spin_unlock(&fmp->lock);
1378}
1379
1380
1381static int fmeter_getrate(struct fmeter *fmp)
1382{
1383 int val;
1384
1385 spin_lock(&fmp->lock);
1386 fmeter_update(fmp);
1387 val = fmp->val;
1388 spin_unlock(&fmp->lock);
1389 return val;
1390}
1391
1392
1393static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1394 struct task_struct *tsk)
1395{
1396 struct cpuset *cs = cgroup_cs(cont);
1397
1398 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1399 return -ENOSPC;
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409 if (tsk->flags & PF_THREAD_BOUND)
1410 return -EINVAL;
1411
1412 return 0;
1413}
1414
/*
 * cpuset_can_attach_task - per-task attach check; simply returns the
 * verdict of security_task_setscheduler() for @task.  @cgrp is unused.
 */
static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
{
	return security_task_setscheduler(task);
}
1419
1420
1421
1422
1423
1424
/*
 * Scratch state shared by the attach callbacks below: cpuset_pre_attach()
 * fills cpus_attach and cpuset_attach_nodemask_to, cpuset_attach_task()
 * consumes them, and cpuset_attach() uses both nodemasks.
 * NOTE(review): being file-scope statics, these presumably rely on the
 * callers serialising attach operations - confirm.
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_from;
static nodemask_t cpuset_attach_nodemask_to;
1428
1429
1430static void cpuset_pre_attach(struct cgroup *cont)
1431{
1432 struct cpuset *cs = cgroup_cs(cont);
1433
1434 if (cs == &top_cpuset)
1435 cpumask_copy(cpus_attach, cpu_possible_mask);
1436 else
1437 guarantee_online_cpus(cs, cpus_attach);
1438
1439 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1440}
1441
1442
1443static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1444{
1445 int err;
1446 struct cpuset *cs = cgroup_cs(cont);
1447
1448
1449
1450
1451
1452 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1453 WARN_ON_ONCE(err);
1454
1455 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1456 cpuset_update_task_spread_flag(cs, tsk);
1457}
1458
1459static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1460 struct cgroup *oldcont, struct task_struct *tsk)
1461{
1462 struct mm_struct *mm;
1463 struct cpuset *cs = cgroup_cs(cont);
1464 struct cpuset *oldcs = cgroup_cs(oldcont);
1465
1466
1467
1468
1469
1470 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1471 cpuset_attach_nodemask_to = cs->mems_allowed;
1472 mm = get_task_mm(tsk);
1473 if (mm) {
1474 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1475 if (is_memory_migrate(cs))
1476 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1477 &cpuset_attach_nodemask_to);
1478 mmput(mm);
1479 }
1480}
1481
1482
1483
/*
 * Identifies which cpuset control file a cftype stands for; stored in
 * cftype.private and switched on by the read/write handlers.
 */
typedef enum {
	FILE_MEMORY_MIGRATE		= 0,
	FILE_CPULIST			= 1,
	FILE_MEMLIST			= 2,
	FILE_CPU_EXCLUSIVE		= 3,
	FILE_MEM_EXCLUSIVE		= 4,
	FILE_MEM_HARDWALL		= 5,
	FILE_SCHED_LOAD_BALANCE		= 6,
	FILE_SCHED_RELAX_DOMAIN_LEVEL	= 7,
	FILE_MEMORY_PRESSURE_ENABLED	= 8,
	FILE_MEMORY_PRESSURE		= 9,
	FILE_SPREAD_PAGE		= 10,
	FILE_SPREAD_SLAB		= 11,
} cpuset_filetype_t;
1498
1499static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1500{
1501 int retval = 0;
1502 struct cpuset *cs = cgroup_cs(cgrp);
1503 cpuset_filetype_t type = cft->private;
1504
1505 if (!cgroup_lock_live_group(cgrp))
1506 return -ENODEV;
1507
1508 switch (type) {
1509 case FILE_CPU_EXCLUSIVE:
1510 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1511 break;
1512 case FILE_MEM_EXCLUSIVE:
1513 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1514 break;
1515 case FILE_MEM_HARDWALL:
1516 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1517 break;
1518 case FILE_SCHED_LOAD_BALANCE:
1519 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1520 break;
1521 case FILE_MEMORY_MIGRATE:
1522 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1523 break;
1524 case FILE_MEMORY_PRESSURE_ENABLED:
1525 cpuset_memory_pressure_enabled = !!val;
1526 break;
1527 case FILE_MEMORY_PRESSURE:
1528 retval = -EACCES;
1529 break;
1530 case FILE_SPREAD_PAGE:
1531 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1532 break;
1533 case FILE_SPREAD_SLAB:
1534 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1535 break;
1536 default:
1537 retval = -EINVAL;
1538 break;
1539 }
1540 cgroup_unlock();
1541 return retval;
1542}
1543
1544static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1545{
1546 int retval = 0;
1547 struct cpuset *cs = cgroup_cs(cgrp);
1548 cpuset_filetype_t type = cft->private;
1549
1550 if (!cgroup_lock_live_group(cgrp))
1551 return -ENODEV;
1552
1553 switch (type) {
1554 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1555 retval = update_relax_domain_level(cs, val);
1556 break;
1557 default:
1558 retval = -EINVAL;
1559 break;
1560 }
1561 cgroup_unlock();
1562 return retval;
1563}
1564
1565
1566
1567
1568static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1569 const char *buf)
1570{
1571 int retval = 0;
1572 struct cpuset *cs = cgroup_cs(cgrp);
1573 struct cpuset *trialcs;
1574
1575 if (!cgroup_lock_live_group(cgrp))
1576 return -ENODEV;
1577
1578 trialcs = alloc_trial_cpuset(cs);
1579 if (!trialcs) {
1580 retval = -ENOMEM;
1581 goto out;
1582 }
1583
1584 switch (cft->private) {
1585 case FILE_CPULIST:
1586 retval = update_cpumask(cs, trialcs, buf);
1587 break;
1588 case FILE_MEMLIST:
1589 retval = update_nodemask(cs, trialcs, buf);
1590 break;
1591 default:
1592 retval = -EINVAL;
1593 break;
1594 }
1595
1596 free_trial_cpuset(trialcs);
1597out:
1598 cgroup_unlock();
1599 return retval;
1600}
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1615{
1616 size_t count;
1617
1618 mutex_lock(&callback_mutex);
1619 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1620 mutex_unlock(&callback_mutex);
1621
1622 return count;
1623}
1624
1625static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1626{
1627 size_t count;
1628
1629 mutex_lock(&callback_mutex);
1630 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1631 mutex_unlock(&callback_mutex);
1632
1633 return count;
1634}
1635
1636static ssize_t cpuset_common_file_read(struct cgroup *cont,
1637 struct cftype *cft,
1638 struct file *file,
1639 char __user *buf,
1640 size_t nbytes, loff_t *ppos)
1641{
1642 struct cpuset *cs = cgroup_cs(cont);
1643 cpuset_filetype_t type = cft->private;
1644 char *page;
1645 ssize_t retval = 0;
1646 char *s;
1647
1648 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1649 return -ENOMEM;
1650
1651 s = page;
1652
1653 switch (type) {
1654 case FILE_CPULIST:
1655 s += cpuset_sprintf_cpulist(s, cs);
1656 break;
1657 case FILE_MEMLIST:
1658 s += cpuset_sprintf_memlist(s, cs);
1659 break;
1660 default:
1661 retval = -EINVAL;
1662 goto out;
1663 }
1664 *s++ = '\n';
1665
1666 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1667out:
1668 free_page((unsigned long)page);
1669 return retval;
1670}
1671
1672static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1673{
1674 struct cpuset *cs = cgroup_cs(cont);
1675 cpuset_filetype_t type = cft->private;
1676 switch (type) {
1677 case FILE_CPU_EXCLUSIVE:
1678 return is_cpu_exclusive(cs);
1679 case FILE_MEM_EXCLUSIVE:
1680 return is_mem_exclusive(cs);
1681 case FILE_MEM_HARDWALL:
1682 return is_mem_hardwall(cs);
1683 case FILE_SCHED_LOAD_BALANCE:
1684 return is_sched_load_balance(cs);
1685 case FILE_MEMORY_MIGRATE:
1686 return is_memory_migrate(cs);
1687 case FILE_MEMORY_PRESSURE_ENABLED:
1688 return cpuset_memory_pressure_enabled;
1689 case FILE_MEMORY_PRESSURE:
1690 return fmeter_getrate(&cs->fmeter);
1691 case FILE_SPREAD_PAGE:
1692 return is_spread_page(cs);
1693 case FILE_SPREAD_SLAB:
1694 return is_spread_slab(cs);
1695 default:
1696 BUG();
1697 }
1698
1699
1700 return 0;
1701}
1702
1703static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1704{
1705 struct cpuset *cs = cgroup_cs(cont);
1706 cpuset_filetype_t type = cft->private;
1707 switch (type) {
1708 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1709 return cs->relax_domain_level;
1710 default:
1711 BUG();
1712 }
1713
1714
1715 return 0;
1716}
1717
1718
1719
1720
1721
1722
/*
 * Control files added to every cpuset cgroup by cpuset_populate().
 * .private carries the cpuset_filetype_t that the shared handlers
 * switch on.
 */
static struct cftype files[] = {
	/* CPU list: string read/write; write length sized from NR_CPUS */
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	/* memory node list: string read/write; sized from MAX_NUMNODES */
	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	/* the remaining u64 files are flags handled by cpuset_{read,write}_u64() */
	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	/* the only signed file; handled by cpuset_{read,write}_s64() */
	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	/* mode is read-only, and cpuset_write_u64() rejects writes with -EACCES */
	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};
1804
/*
 * Control file kept separate from files[]: cpuset_populate() adds it only
 * to a cgroup that has no parent.
 */
static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};
1811
1812static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1813{
1814 int err;
1815
1816 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1817 if (err)
1818 return err;
1819
1820 if (!cont->parent)
1821 err = cgroup_add_file(cont, ss,
1822 &cft_memory_pressure_enabled);
1823 return err;
1824}
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842static void cpuset_post_clone(struct cgroup_subsys *ss,
1843 struct cgroup *cgroup)
1844{
1845 struct cgroup *parent, *child;
1846 struct cpuset *cs, *parent_cs;
1847
1848 parent = cgroup->parent;
1849 list_for_each_entry(child, &parent->children, sibling) {
1850 cs = cgroup_cs(child);
1851 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1852 return;
1853 }
1854 cs = cgroup_cs(cgroup);
1855 parent_cs = cgroup_cs(parent);
1856
1857 mutex_lock(&callback_mutex);
1858 cs->mems_allowed = parent_cs->mems_allowed;
1859 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1860 mutex_unlock(&callback_mutex);
1861 return;
1862}
1863
1864
1865
1866
1867
1868
1869
1870static struct cgroup_subsys_state *cpuset_create(
1871 struct cgroup_subsys *ss,
1872 struct cgroup *cont)
1873{
1874 struct cpuset *cs;
1875 struct cpuset *parent;
1876
1877 if (!cont->parent) {
1878 return &top_cpuset.css;
1879 }
1880 parent = cgroup_cs(cont->parent);
1881 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1882 if (!cs)
1883 return ERR_PTR(-ENOMEM);
1884 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1885 kfree(cs);
1886 return ERR_PTR(-ENOMEM);
1887 }
1888
1889 cs->flags = 0;
1890 if (is_spread_page(parent))
1891 set_bit(CS_SPREAD_PAGE, &cs->flags);
1892 if (is_spread_slab(parent))
1893 set_bit(CS_SPREAD_SLAB, &cs->flags);
1894 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1895 cpumask_clear(cs->cpus_allowed);
1896 nodes_clear(cs->mems_allowed);
1897 fmeter_init(&cs->fmeter);
1898 cs->relax_domain_level = -1;
1899
1900 cs->parent = parent;
1901 number_of_cpusets++;
1902 return &cs->css ;
1903}
1904
1905
1906
1907
1908
1909
1910
1911static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1912{
1913 struct cpuset *cs = cgroup_cs(cont);
1914
1915 if (is_sched_load_balance(cs))
1916 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1917
1918 number_of_cpusets--;
1919 free_cpumask_var(cs->cpus_allowed);
1920 kfree(cs);
1921}
1922
/*
 * cgroup subsystem descriptor for cpusets: wires the cpuset callbacks
 * defined in this file into the cgroup core under the name "cpuset".
 */
struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.create = cpuset_create,
	.destroy = cpuset_destroy,
	.can_attach = cpuset_can_attach,
	.can_attach_task = cpuset_can_attach_task,
	.pre_attach = cpuset_pre_attach,
	.attach_task = cpuset_attach_task,
	.attach = cpuset_attach,
	.populate = cpuset_populate,
	.post_clone = cpuset_post_clone,
	.subsys_id = cpuset_subsys_id,
	.early_init = 1,
};
1937
1938
1939
1940
1941
1942
1943
1944int __init cpuset_init(void)
1945{
1946 int err = 0;
1947
1948 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1949 BUG();
1950
1951 cpumask_setall(top_cpuset.cpus_allowed);
1952 nodes_setall(top_cpuset.mems_allowed);
1953
1954 fmeter_init(&top_cpuset.fmeter);
1955 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1956 top_cpuset.relax_domain_level = -1;
1957
1958 err = register_filesystem(&cpuset_fs_type);
1959 if (err < 0)
1960 return err;
1961
1962 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1963 BUG();
1964
1965 number_of_cpusets = 1;
1966 return 0;
1967}
1968
1969
1970
1971
1972
1973
1974
1975
1976
/*
 * Per-task callback used by move_member_tasks_to_cpuset(): attach @tsk to
 * the destination cgroup carried in scan->data.  The return value of
 * cgroup_attach_task() is not checked.
 */
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	/* scan->data was set to the target cgroup by the caller. */
	struct cgroup *new_cgroup = scan->data;

	cgroup_attach_task(new_cgroup, tsk);
}
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1997{
1998 struct cgroup_scanner scan;
1999
2000 scan.cg = from->css.cgroup;
2001 scan.test_task = NULL;
2002 scan.process_task = cpuset_do_move_task;
2003 scan.heap = NULL;
2004 scan.data = to->css.cgroup;
2005
2006 if (cgroup_scan_tasks(&scan))
2007 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2008 "cgroup_scan_tasks failed\n");
2009}
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2022{
2023 struct cpuset *parent;
2024
2025
2026
2027
2028
2029
2030 if (list_empty(&cs->css.cgroup->css_sets))
2031 return;
2032
2033
2034
2035
2036
2037 parent = cs->parent;
2038 while (cpumask_empty(parent->cpus_allowed) ||
2039 nodes_empty(parent->mems_allowed))
2040 parent = parent->parent;
2041
2042 move_member_tasks_to_cpuset(cs, parent);
2043}
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060static void scan_for_empty_cpusets(struct cpuset *root)
2061{
2062 LIST_HEAD(queue);
2063 struct cpuset *cp;
2064 struct cpuset *child;
2065 struct cgroup *cont;
2066 static nodemask_t oldmems;
2067
2068 list_add_tail((struct list_head *)&root->stack_list, &queue);
2069
2070 while (!list_empty(&queue)) {
2071 cp = list_first_entry(&queue, struct cpuset, stack_list);
2072 list_del(queue.next);
2073 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2074 child = cgroup_cs(cont);
2075 list_add_tail(&child->stack_list, &queue);
2076 }
2077
2078
2079 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2080 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2081 continue;
2082
2083 oldmems = cp->mems_allowed;
2084
2085
2086 mutex_lock(&callback_mutex);
2087 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2088 cpu_active_mask);
2089 nodes_and(cp->mems_allowed, cp->mems_allowed,
2090 node_states[N_HIGH_MEMORY]);
2091 mutex_unlock(&callback_mutex);
2092
2093
2094 if (cpumask_empty(cp->cpus_allowed) ||
2095 nodes_empty(cp->mems_allowed))
2096 remove_tasks_in_empty_cpuset(cp);
2097 else {
2098 update_tasks_cpumask(cp, NULL);
2099 update_tasks_nodemask(cp, &oldmems, NULL);
2100 }
2101 }
2102}
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116void cpuset_update_active_cpus(void)
2117{
2118 struct sched_domain_attr *attr;
2119 cpumask_var_t *doms;
2120 int ndoms;
2121
2122 cgroup_lock();
2123 mutex_lock(&callback_mutex);
2124 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2125 mutex_unlock(&callback_mutex);
2126 scan_for_empty_cpusets(&top_cpuset);
2127 ndoms = generate_sched_domains(&doms, &attr);
2128 cgroup_unlock();
2129
2130
2131 partition_sched_domains(ndoms, doms, attr);
2132}
2133
2134#ifdef CONFIG_MEMORY_HOTPLUG
2135
2136
2137
2138
2139
2140static int cpuset_track_online_nodes(struct notifier_block *self,
2141 unsigned long action, void *arg)
2142{
2143 static nodemask_t oldmems;
2144
2145 cgroup_lock();
2146 switch (action) {
2147 case MEM_ONLINE:
2148 oldmems = top_cpuset.mems_allowed;
2149 mutex_lock(&callback_mutex);
2150 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2151 mutex_unlock(&callback_mutex);
2152 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2153 break;
2154 case MEM_OFFLINE:
2155
2156
2157
2158
2159 scan_for_empty_cpusets(&top_cpuset);
2160 break;
2161 default:
2162 break;
2163 }
2164 cgroup_unlock();
2165
2166 return NOTIFY_OK;
2167}
2168#endif
2169
2170
2171
2172
2173
2174
2175
2176void __init cpuset_init_smp(void)
2177{
2178 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2179 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2180
2181 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2182
2183 cpuset_wq = create_singlethread_workqueue("cpuset");
2184 BUG_ON(!cpuset_wq);
2185}
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2199{
2200 mutex_lock(&callback_mutex);
2201 task_lock(tsk);
2202 guarantee_online_cpus(task_cs(tsk), pmask);
2203 task_unlock(tsk);
2204 mutex_unlock(&callback_mutex);
2205}
2206
2207int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2208{
2209 const struct cpuset *cs;
2210 int cpu;
2211
2212 rcu_read_lock();
2213 cs = task_cs(tsk);
2214 if (cs)
2215 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2216 rcu_read_unlock();
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2234 if (cpu >= nr_cpu_ids) {
2235
2236
2237
2238
2239
2240
2241
2242 do_set_cpus_allowed(tsk, cpu_possible_mask);
2243 cpu = cpumask_any(cpu_active_mask);
2244 }
2245
2246 return cpu;
2247}
2248
/* Allow the current task to allocate from every memory node. */
void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2265{
2266 nodemask_t mask;
2267
2268 mutex_lock(&callback_mutex);
2269 task_lock(tsk);
2270 guarantee_online_mems(task_cs(tsk), &mask);
2271 task_unlock(tsk);
2272 mutex_unlock(&callback_mutex);
2273
2274 return mask;
2275}
2276
2277
2278
2279
2280
2281
2282
/*
 * Return non-zero if @nodemask has at least one node in common with the
 * current task's mems_allowed, zero otherwise.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
2287
2288
2289
2290
2291
2292
2293
2294static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2295{
2296 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2297 cs = cs->parent;
2298 return cs;
2299}
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2363{
2364 const struct cpuset *cs;
2365 int allowed;
2366
2367 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2368 return 1;
2369 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2370 if (node_isset(node, current->mems_allowed))
2371 return 1;
2372
2373
2374
2375
2376 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2377 return 1;
2378 if (gfp_mask & __GFP_HARDWALL)
2379 return 0;
2380
2381 if (current->flags & PF_EXITING)
2382 return 1;
2383
2384
2385 mutex_lock(&callback_mutex);
2386
2387 task_lock(current);
2388 cs = nearest_hardwall_ancestor(task_cs(current));
2389 task_unlock(current);
2390
2391 allowed = node_isset(node, cs->mems_allowed);
2392 mutex_unlock(&callback_mutex);
2393 return allowed;
2394}
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2420{
2421 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2422 return 1;
2423 if (node_isset(node, current->mems_allowed))
2424 return 1;
2425
2426
2427
2428
2429 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2430 return 1;
2431 return 0;
2432}
2433
2434
2435
2436
2437
2438
2439
/* Release callback_mutex on behalf of code outside this file. */
void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472static int cpuset_spread_node(int *rotor)
2473{
2474 int node;
2475
2476 node = next_node(*rotor, current->mems_allowed);
2477 if (node == MAX_NUMNODES)
2478 node = first_node(current->mems_allowed);
2479 *rotor = node;
2480 return node;
2481}
2482
2483int cpuset_mem_spread_node(void)
2484{
2485 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2486 current->cpuset_mem_spread_rotor =
2487 node_random(¤t->mems_allowed);
2488
2489 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
2490}
2491
2492int cpuset_slab_spread_node(void)
2493{
2494 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2495 current->cpuset_slab_spread_rotor =
2496 node_random(¤t->mems_allowed);
2497
2498 return cpuset_spread_node(¤t->cpuset_slab_spread_rotor);
2499}
2500
2501EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
/*
 * Return non-zero if the mems_allowed masks of @tsk1 and @tsk2 share at
 * least one node, zero otherwise.  No lock is taken here.
 */
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2529{
2530 struct dentry *dentry;
2531
2532 dentry = task_cs(tsk)->css.cgroup->dentry;
2533 spin_lock(&cpuset_buffer_lock);
2534 snprintf(cpuset_name, CPUSET_NAME_LEN,
2535 dentry ? (const char *)dentry->d_name.name : "/");
2536 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2537 tsk->mems_allowed);
2538 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2539 tsk->comm, cpuset_name, cpuset_nodelist);
2540 spin_unlock(&cpuset_buffer_lock);
2541}
2542
2543
2544
2545
2546
2547
2548
/*
 * Global memory-pressure switch, marked __read_mostly.
 * NOTE(review): presumably backs the "memory_pressure_enabled" control
 * file; the code that reads and writes it is outside this chunk - confirm.
 */
int cpuset_memory_pressure_enabled __read_mostly;
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569void __cpuset_memory_pressure_bump(void)
2570{
2571 task_lock(current);
2572 fmeter_markevent(&task_cs(current)->fmeter);
2573 task_unlock(current);
2574}
2575
2576#ifdef CONFIG_PROC_PID_CPUSET
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2587{
2588 struct pid *pid;
2589 struct task_struct *tsk;
2590 char *buf;
2591 struct cgroup_subsys_state *css;
2592 int retval;
2593
2594 retval = -ENOMEM;
2595 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2596 if (!buf)
2597 goto out;
2598
2599 retval = -ESRCH;
2600 pid = m->private;
2601 tsk = get_pid_task(pid, PIDTYPE_PID);
2602 if (!tsk)
2603 goto out_free;
2604
2605 retval = -EINVAL;
2606 cgroup_lock();
2607 css = task_subsys_state(tsk, cpuset_subsys_id);
2608 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2609 if (retval < 0)
2610 goto out_unlock;
2611 seq_puts(m, buf);
2612 seq_putc(m, '\n');
2613out_unlock:
2614 cgroup_unlock();
2615 put_task_struct(tsk);
2616out_free:
2617 kfree(buf);
2618out:
2619 return retval;
2620}
2621
2622static int cpuset_open(struct inode *inode, struct file *file)
2623{
2624 struct pid *pid = PROC_I(inode)->pid;
2625 return single_open(file, proc_cpuset_show, pid);
2626}
2627
/*
 * File operations for the per-task cpuset proc file: a single-record
 * seq_file whose content is produced by proc_cpuset_show().
 */
const struct file_operations proc_cpuset_operations = {
	.open = cpuset_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
2634#endif
2635
2636
2637void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2638{
2639 seq_printf(m, "Mems_allowed:\t");
2640 seq_nodemask(m, &task->mems_allowed);
2641 seq_printf(m, "\n");
2642 seq_printf(m, "Mems_allowed_list:\t");
2643 seq_nodemask_list(m, &task->mems_allowed);
2644 seq_printf(m, "\n");
2645}
2646