/*
 *  kernel/cpuset.c
 *
 *  Processor and memory placement constraints for sets of tasks,
 *  implemented on top of the cgroup subsystem.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/* Workqueue used for asynchronous sched domain rebuilds. */
static struct workqueue_struct *cpuset_wq;

/*
 * Tracks how many cpusets are currently defined in the system.
 * When there is only one cpuset (the top cpuset) we can short-circuit
 * some hooks that are written to optimize for that common case.
 */
int number_of_cpusets __read_mostly;

/* Forward declarations */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments further below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
#else
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
#endif

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};

/*
 * There are two global locks guarding cpuset structures:
 *
 *   - cgroup_mutex, taken and released via cgroup_lock()/cgroup_unlock(),
 *     serializes all modifications to cpusets.
 *
 *   - callback_mutex, defined below, protects the cpus_allowed and
 *     mems_allowed masks so that short, non-sleeping callback paths
 *     (e.g. the page allocator) can read them without having to take
 *     the heavier cgroup_mutex.
 *
 * A task may take callback_mutex while already holding cgroup_mutex,
 * but never the other way around, or a deadlock could result.
 */
static DEFINE_MUTEX(callback_mutex);

/*
 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
 * buffers used by cpuset_print_task_mems_allowed().
 */
#define CPUSET_NAME_LEN		(128)
#define CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);

/*
 * The cpuset filesystem is a pseudo-filesystem kept for backwards
 * compatibility: mounting it simply mounts the cgroup filesystem with
 * the cpuset subsystem enabled and the noprefix option set.
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
				   int flags, const char *unused_dev_name,
				   void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that is online.
 * If none of the cpuset's CPUs are online, walk up the hierarchy until
 * a cpuset with online CPUs is found; failing that, fall back to
 * cpu_online_mask itself.
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that has
 * memory (is in node_states[N_HIGH_MEMORY]), walking up the hierarchy
 * and falling back to node_states[N_HIGH_MEMORY] itself, as above.
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
			  node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
319
320
321
322
323
324
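/*
 * cpuset_update_task_spread_flag - update PF_SPREAD_* flags of a task
 *
 * Copy the cpuset's memory_spread_page and memory_spread_slab settings
 * into the task's PF_SPREAD_PAGE and PF_SPREAD_SLAB flags, so that the
 * page and slab allocators can test a task-local flag instead of
 * dereferencing the task's cpuset.
 */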
325static void cpuset_update_task_spread_flag(struct cpuset *cs,
326 struct task_struct *tsk)
327{
328 if (is_spread_page(cs))
329 tsk->flags |= PF_SPREAD_PAGE;
330 else
331 tsk->flags &= ~PF_SPREAD_PAGE;
332 if (is_spread_slab(cs))
333 tsk->flags |= PF_SPREAD_SLAB;
334 else
335 tsk->flags &= ~PF_SPREAD_SLAB;
336}
337
338
339
340
341
342
343
344
345
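/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and memory
 * nodes are a subset of the other's, and its exclusive flags are only
 * set if the other's corresponding flags are set.
 */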
346static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
347{
348 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
349 nodes_subset(p->mems_allowed, q->mems_allowed) &&
350 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
351 is_mem_exclusive(p) <= is_mem_exclusive(q);
352}
353
354
355
356
357
358static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
359{
360 struct cpuset *trial;
361
362 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
363 if (!trial)
364 return NULL;
365
366 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
367 kfree(trial);
368 return NULL;
369 }
370 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
371
372 return trial;
373}
374
375
376
377
378
379static void free_trial_cpuset(struct cpuset *trial)
380{
381 free_cpumask_var(trial->cpus_allowed);
382 kfree(trial);
383}
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
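/*
 * validate_change() - Check that a proposed cpuset change is valid.
 *
 * 'cur' is the address of an actual, in-use cpuset.  'trial' is the
 * address of a copy of 'cur', with the proposed changes applied.
 *
 * The rules enforced: each of 'cur's children must remain a subset of
 * 'trial', 'trial' must stay a subset of 'cur's parent, exclusive
 * siblings must not overlap in CPUs or memory nodes, and a cpuset with
 * attached tasks must not be left with empty cpus_allowed or
 * mems_allowed.
 *
 * Called with cgroup_mutex held.  Returns 0 on success, -errno on error.
 */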
405static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
406{
407 struct cgroup *cont;
408 struct cpuset *c, *par;
409
410
411 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
412 if (!is_cpuset_subset(cgroup_cs(cont), trial))
413 return -EBUSY;
414 }
415
416
417 if (cur == &top_cpuset)
418 return 0;
419
420 par = cur->parent;
421
422
423 if (!is_cpuset_subset(trial, par))
424 return -EACCES;
425
426
427
428
429
430 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
431 c = cgroup_cs(cont);
432 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
433 c != cur &&
434 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
435 return -EINVAL;
436 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
437 c != cur &&
438 nodes_intersects(trial->mems_allowed, c->mems_allowed))
439 return -EINVAL;
440 }
441
442
443 if (cgroup_task_count(cur->css.cgroup)) {
444 if (cpumask_empty(trial->cpus_allowed) ||
445 nodes_empty(trial->mems_allowed)) {
446 return -ENOSPC;
447 }
448 }
449
450 return 0;
451}
452
453#ifdef CONFIG_SMP
454
455
456
457
458static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
459{
460 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
461}
462
463static void
464update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
465{
466 if (dattr->relax_domain_level < c->relax_domain_level)
467 dattr->relax_domain_level = c->relax_domain_level;
468 return;
469}
470
471static void
472update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
473{
474 LIST_HEAD(q);
475
476 list_add(&c->stack_list, &q);
477 while (!list_empty(&q)) {
478 struct cpuset *cp;
479 struct cgroup *cont;
480 struct cpuset *child;
481
482 cp = list_first_entry(&q, struct cpuset, stack_list);
483 list_del(q.next);
484
485 if (cpumask_empty(cp->cpus_allowed))
486 continue;
487
488 if (is_sched_load_balance(cp))
489 update_domain_attr(dattr, cp);
490
491 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
492 child = cgroup_cs(cont);
493 list_add_tail(&child->stack_list, &q);
494 }
495 }
496}
497
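/*
 * generate_sched_domains()
 *
 * Partition the system's CPUs into scheduler load-balancing domains
 * based on the sched_load_balance flags of all cpusets: each cpuset
 * with load balancing enabled (and no balanced ancestor) starts as its
 * own partition, and partitions whose CPUs overlap are merged until
 * the remaining partitions are pairwise disjoint.
 *
 * Must be called with cgroup_mutex held.  Fills *domains with an array
 * of cpumasks and *attributes with matching sched_domain_attr, and
 * returns the number of domains; the caller hands the result to
 * partition_sched_domains().
 */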
552static int generate_sched_domains(cpumask_var_t **domains,
553 struct sched_domain_attr **attributes)
554{
555 LIST_HEAD(q);
556 struct cpuset *cp;
557 struct cpuset **csa;
558 int csn;
559 int i, j, k;
560 cpumask_var_t *doms;
561 struct sched_domain_attr *dattr;
562 int ndoms = 0;
563 int nslot;
564
565 doms = NULL;
566 dattr = NULL;
567 csa = NULL;
568
569
570 if (is_sched_load_balance(&top_cpuset)) {
571 ndoms = 1;
572 doms = alloc_sched_domains(ndoms);
573 if (!doms)
574 goto done;
575
576 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
577 if (dattr) {
578 *dattr = SD_ATTR_INIT;
579 update_domain_attr_tree(dattr, &top_cpuset);
580 }
581 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
582
583 goto done;
584 }
585
586 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
587 if (!csa)
588 goto done;
589 csn = 0;
590
591 list_add(&top_cpuset.stack_list, &q);
592 while (!list_empty(&q)) {
593 struct cgroup *cont;
594 struct cpuset *child;
595
596 cp = list_first_entry(&q, struct cpuset, stack_list);
597 list_del(q.next);
598
599 if (cpumask_empty(cp->cpus_allowed))
600 continue;
601
602
603
604
605
606
607
608 if (is_sched_load_balance(cp)) {
609 csa[csn++] = cp;
610 continue;
611 }
612
613 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
614 child = cgroup_cs(cont);
615 list_add_tail(&child->stack_list, &q);
616 }
617 }
618
619 for (i = 0; i < csn; i++)
620 csa[i]->pn = i;
621 ndoms = csn;
622
623restart:
624
625 for (i = 0; i < csn; i++) {
626 struct cpuset *a = csa[i];
627 int apn = a->pn;
628
629 for (j = 0; j < csn; j++) {
630 struct cpuset *b = csa[j];
631 int bpn = b->pn;
632
633 if (apn != bpn && cpusets_overlap(a, b)) {
634 for (k = 0; k < csn; k++) {
635 struct cpuset *c = csa[k];
636
637 if (c->pn == bpn)
638 c->pn = apn;
639 }
640 ndoms--;
641 goto restart;
642 }
643 }
644 }
645
646
647
648
649
650 doms = alloc_sched_domains(ndoms);
651 if (!doms)
652 goto done;
653
654
655
656
657
658 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
659
660 for (nslot = 0, i = 0; i < csn; i++) {
661 struct cpuset *a = csa[i];
662 struct cpumask *dp;
663 int apn = a->pn;
664
665 if (apn < 0) {
666
667 continue;
668 }
669
670 dp = doms[nslot];
671
672 if (nslot == ndoms) {
673 static int warnings = 10;
674 if (warnings) {
675 printk(KERN_WARNING
676 "rebuild_sched_domains confused:"
677 " nslot %d, ndoms %d, csn %d, i %d,"
678 " apn %d\n",
679 nslot, ndoms, csn, i, apn);
680 warnings--;
681 }
682 continue;
683 }
684
685 cpumask_clear(dp);
686 if (dattr)
687 *(dattr + nslot) = SD_ATTR_INIT;
688 for (j = i; j < csn; j++) {
689 struct cpuset *b = csa[j];
690
691 if (apn == b->pn) {
692 cpumask_or(dp, dp, b->cpus_allowed);
693 if (dattr)
694 update_domain_attr_tree(dattr + nslot, b);
695
696
697 b->pn = -1;
698 }
699 }
700 nslot++;
701 }
702 BUG_ON(nslot != ndoms);
703
704done:
705 kfree(csa);
706
707
708
709
710
711 if (doms == NULL)
712 ndoms = 1;
713
714 *domains = doms;
715 *attributes = dattr;
716 return ndoms;
717}
718
719
720
721
722
723
724
725
726
727
728
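/*
 * Workqueue handler for rebuild_sched_domains_work: takes
 * get_online_cpus() and cgroup_mutex in that order, regenerates the
 * sched domain partition from the current cpuset hierarchy and passes
 * it to the scheduler via partition_sched_domains().
 */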
729static void do_rebuild_sched_domains(struct work_struct *unused)
730{
731 struct sched_domain_attr *attr;
732 cpumask_var_t *doms;
733 int ndoms;
734
735 get_online_cpus();
736
737
738 cgroup_lock();
739 ndoms = generate_sched_domains(&doms, &attr);
740 cgroup_unlock();
741
742
743 partition_sched_domains(ndoms, doms, attr);
744
745 put_online_cpus();
746}
747#else
748static void do_rebuild_sched_domains(struct work_struct *unused)
749{
750}
751
752static int generate_sched_domains(cpumask_var_t **domains,
753 struct sched_domain_attr **attributes)
754{
755 *domains = NULL;
756 return 1;
757}
758#endif
759
760static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
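/*
 * Rebuild the sched domains asynchronously via the cpuset workqueue.
 * Callers of this routine already hold cgroup_mutex, while rebuilding
 * the domains must take get_online_cpus() first, so doing the rebuild
 * directly here would invert the lock order; deferring the work to
 * cpuset_wq lets it take the locks in the correct order.
 */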
781static void async_rebuild_sched_domains(void)
782{
783 queue_work(cpuset_wq, &rebuild_sched_domains_work);
784}
785
786
787
788
789
790
791
792
793
794
795void rebuild_sched_domains(void)
796{
797 do_rebuild_sched_domains(NULL);
798}
799
800
801
802
803
804
805
806
807
808
809
810static int cpuset_test_cpumask(struct task_struct *tsk,
811 struct cgroup_scanner *scan)
812{
813 return !cpumask_equal(&tsk->cpus_allowed,
814 (cgroup_cs(scan->cg))->cpus_allowed);
815}
816
817
818
819
820
821
822
823
824
825
826
827
828static void cpuset_change_cpumask(struct task_struct *tsk,
829 struct cgroup_scanner *scan)
830{
831 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
832}
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
848{
849 struct cgroup_scanner scan;
850
851 scan.cg = cs->css.cgroup;
852 scan.test_task = cpuset_test_cpumask;
853 scan.process_task = cpuset_change_cpumask;
854 scan.heap = heap;
855 cgroup_scan_tasks(&scan);
856}
857
858
859
860
861
862
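/*
 * update_cpumask - update the cpus_allowed of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset into which the new mask is parsed
 * @buf: buffer of cpu numbers written to this cpuset
 */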
863static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
864 const char *buf)
865{
866 struct ptr_heap heap;
867 int retval;
868 int is_load_balanced;
869
870
871 if (cs == &top_cpuset)
872 return -EACCES;
873
874
875
876
877
878
879
880 if (!*buf) {
881 cpumask_clear(trialcs->cpus_allowed);
882 } else {
883 retval = cpulist_parse(buf, trialcs->cpus_allowed);
884 if (retval < 0)
885 return retval;
886
887 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
888 return -EINVAL;
889 }
890 retval = validate_change(cs, trialcs);
891 if (retval < 0)
892 return retval;
893
894
895 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
896 return 0;
897
898 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
899 if (retval)
900 return retval;
901
902 is_load_balanced = is_sched_load_balance(trialcs);
903
904 mutex_lock(&callback_mutex);
905 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
906 mutex_unlock(&callback_mutex);
907
908
909
910
911
912 update_tasks_cpumask(cs, &heap);
913
914 heap_free(&heap);
915
916 if (is_load_balanced)
917 async_rebuild_sched_domains();
918 return 0;
919}
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
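/*
 * cpuset_migrate_mm - migrate an mm's pages from one set of nodes to
 * another.  Temporarily sets current's mems_allowed to the target
 * nodes so the migration code can allocate pages there, then restores
 * a mask derived from current's cpuset via guarantee_online_mems().
 *
 * Called holding cgroup_mutex, so current's cpuset won't change during
 * the call.
 */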
941static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
942 const nodemask_t *to)
943{
944 struct task_struct *tsk = current;
945
946 tsk->mems_allowed = *to;
947
948 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
949
950 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
951}
952
953
954
955
956
957
958
959
960
961
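/*
 * cpuset_change_task_nodemask - change a task's mems_allowed and mempolicy
 *
 * The rebind is done in two steps (MPOL_REBIND_STEP1 then STEP2): the
 * new nodes are first OR-ed into the old mask so that at no point does
 * the task observe an empty or unrelated nodemask, and only afterwards
 * is mems_allowed set to exactly the new mask.
 */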
962static void cpuset_change_task_nodemask(struct task_struct *tsk,
963 nodemask_t *newmems)
964{
965 bool need_loop;
966
967repeat:
968
969
970
971
972 if (unlikely(test_thread_flag(TIF_MEMDIE)))
973 return;
974 if (current->flags & PF_EXITING)
975 return;
976
977 task_lock(tsk);
978
979
980
981
982
983
984 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed);
986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001 smp_mb();
1002
1003
1004
1005
1006
1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
1008 task_unlock(tsk);
1009 if (!task_curr(tsk))
1010 yield();
1011 goto repeat;
1012 }
1013
1014
1015
1016
1017
1018
1019
1020
1021 smp_mb();
1022
1023 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1024 tsk->mems_allowed = *newmems;
1025 task_unlock(tsk);
1026}
1027
1028
1029
1030
1031
1032
1033static void cpuset_change_nodemask(struct task_struct *p,
1034 struct cgroup_scanner *scan)
1035{
1036 struct mm_struct *mm;
1037 struct cpuset *cs;
1038 int migrate;
1039 const nodemask_t *oldmem = scan->data;
1040 static nodemask_t newmems;
1041
1042 cs = cgroup_cs(scan->cg);
1043 guarantee_online_mems(cs, &newmems);
1044
1045 cpuset_change_task_nodemask(p, &newmems);
1046
1047 mm = get_task_mm(p);
1048 if (!mm)
1049 return;
1050
1051 migrate = is_memory_migrate(cs);
1052
1053 mpol_rebind_mm(mm, &cs->mems_allowed);
1054 if (migrate)
1055 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1056 mmput(mm);
1057}
1058
1059static void *cpuset_being_rebound;
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1072 struct ptr_heap *heap)
1073{
1074 struct cgroup_scanner scan;
1075
1076 cpuset_being_rebound = cs;
1077
1078 scan.cg = cs->css.cgroup;
1079 scan.test_task = NULL;
1080 scan.process_task = cpuset_change_nodemask;
1081 scan.heap = heap;
1082 scan.data = (nodemask_t *)oldmem;
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094 cgroup_scan_tasks(&scan);
1095
1096
1097 cpuset_being_rebound = NULL;
1098}
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
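/*
 * Handle a user request to change the 'mems' memory placement of a cpuset.
 * Parses and validates the new nodemask, updates cs->mems_allowed under
 * callback_mutex, then updates the mems_allowed and mempolicy of every
 * task in the cpuset, migrating their pages as well if memory_migrate
 * is set.
 *
 * Called with cgroup_mutex held.  May take callback_mutex during call.
 */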
1113static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1114 const char *buf)
1115{
1116 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1117 int retval;
1118 struct ptr_heap heap;
1119
1120 if (!oldmem)
1121 return -ENOMEM;
1122
1123
1124
1125
1126
1127 if (cs == &top_cpuset) {
1128 retval = -EACCES;
1129 goto done;
1130 }
1131
1132
1133
1134
1135
1136
1137
1138 if (!*buf) {
1139 nodes_clear(trialcs->mems_allowed);
1140 } else {
1141 retval = nodelist_parse(buf, trialcs->mems_allowed);
1142 if (retval < 0)
1143 goto done;
1144
1145 if (!nodes_subset(trialcs->mems_allowed,
1146 node_states[N_HIGH_MEMORY])) {
1147 retval = -EINVAL;
1148 goto done;
1149 }
1150 }
1151 *oldmem = cs->mems_allowed;
1152 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1153 retval = 0;
1154 goto done;
1155 }
1156 retval = validate_change(cs, trialcs);
1157 if (retval < 0)
1158 goto done;
1159
1160 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1161 if (retval < 0)
1162 goto done;
1163
1164 mutex_lock(&callback_mutex);
1165 cs->mems_allowed = trialcs->mems_allowed;
1166 mutex_unlock(&callback_mutex);
1167
1168 update_tasks_nodemask(cs, oldmem, &heap);
1169
1170 heap_free(&heap);
1171done:
1172 NODEMASK_FREE(oldmem);
1173 return retval;
1174}
1175
1176int current_cpuset_is_being_rebound(void)
1177{
1178 return task_cs(current) == cpuset_being_rebound;
1179}
1180
1181static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182{
1183#ifdef CONFIG_SMP
1184 if (val < -1 || val >= sched_domain_level_max)
1185 return -EINVAL;
1186#endif
1187
1188 if (val != cs->relax_domain_level) {
1189 cs->relax_domain_level = val;
1190 if (!cpumask_empty(cs->cpus_allowed) &&
1191 is_sched_load_balance(cs))
1192 async_rebuild_sched_domains();
1193 }
1194
1195 return 0;
1196}
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208static void cpuset_change_flag(struct task_struct *tsk,
1209 struct cgroup_scanner *scan)
1210{
1211 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1212}
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1228{
1229 struct cgroup_scanner scan;
1230
1231 scan.cg = cs->css.cgroup;
1232 scan.test_task = NULL;
1233 scan.process_task = cpuset_change_flag;
1234 scan.heap = heap;
1235 cgroup_scan_tasks(&scan);
1236}
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
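/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */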
1247static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1248 int turning_on)
1249{
1250 struct cpuset *trialcs;
1251 int balance_flag_changed;
1252 int spread_flag_changed;
1253 struct ptr_heap heap;
1254 int err;
1255
1256 trialcs = alloc_trial_cpuset(cs);
1257 if (!trialcs)
1258 return -ENOMEM;
1259
1260 if (turning_on)
1261 set_bit(bit, &trialcs->flags);
1262 else
1263 clear_bit(bit, &trialcs->flags);
1264
1265 err = validate_change(cs, trialcs);
1266 if (err < 0)
1267 goto out;
1268
1269 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1270 if (err < 0)
1271 goto out;
1272
1273 balance_flag_changed = (is_sched_load_balance(cs) !=
1274 is_sched_load_balance(trialcs));
1275
1276 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1277 || (is_spread_page(cs) != is_spread_page(trialcs)));
1278
1279 mutex_lock(&callback_mutex);
1280 cs->flags = trialcs->flags;
1281 mutex_unlock(&callback_mutex);
1282
1283 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1284 async_rebuild_sched_domains();
1285
1286 if (spread_flag_changed)
1287 update_tasks_flags(cs, &heap);
1288 heap_free(&heap);
1289out:
1290 free_trial_cpuset(trialcs);
1291 return err;
1292}
1293
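/*
 * Frequency meter - How fast is some event occurring?
 *
 * fmeter_markevent() counts events, fmeter_getrate() reports the
 * current rate in events per second.  The running value decays
 * exponentially: each elapsed second it is multiplied by
 * FM_COEF/FM_SCALE (933/1000), giving roughly a ten second half-life
 * since 0.933^10 is about 0.5.  FM_MAXTICKS caps how many seconds of
 * decay are applied at once and FM_MAXCNT caps the pending event count.
 */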
1339#define FM_COEF 933
1340#define FM_MAXTICKS ((time_t)99)
1341#define FM_MAXCNT 1000000
1342#define FM_SCALE 1000
1343
1344
1345static void fmeter_init(struct fmeter *fmp)
1346{
1347 fmp->cnt = 0;
1348 fmp->val = 0;
1349 fmp->time = 0;
1350 spin_lock_init(&fmp->lock);
1351}
1352
1353
1354static void fmeter_update(struct fmeter *fmp)
1355{
1356 time_t now = get_seconds();
1357 time_t ticks = now - fmp->time;
1358
1359 if (ticks == 0)
1360 return;
1361
1362 ticks = min(FM_MAXTICKS, ticks);
1363 while (ticks-- > 0)
1364 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1365 fmp->time = now;
1366
1367 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1368 fmp->cnt = 0;
1369}
1370
1371
1372static void fmeter_markevent(struct fmeter *fmp)
1373{
1374 spin_lock(&fmp->lock);
1375 fmeter_update(fmp);
1376 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1377 spin_unlock(&fmp->lock);
1378}
1379
1380
1381static int fmeter_getrate(struct fmeter *fmp)
1382{
1383 int val;
1384
1385 spin_lock(&fmp->lock);
1386 fmeter_update(fmp);
1387 val = fmp->val;
1388 spin_unlock(&fmp->lock);
1389 return val;
1390}
1391
1392
1393
1394
1395
1396
1397static cpumask_var_t cpus_attach;
1398static nodemask_t cpuset_attach_nodemask_from;
1399static nodemask_t cpuset_attach_nodemask_to;
1400
1401
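/*
 * Called by the cgroup core, with cgroup_mutex held, to decide whether
 * the tasks in @tset may be moved into this cpuset: the cpuset must
 * have CPUs and memory nodes, none of the tasks may be a kernel thread
 * bound to specific CPUs (PF_THREAD_BOUND), and the security hook must
 * allow the move.  Also precomputes the cpumask and nodemask that
 * cpuset_attach() will apply.
 */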
1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1404{
1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1427
1428
1429 if (cs == &top_cpuset)
1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1431 else
1432 guarantee_online_cpus(cs, cpus_attach);
1433
1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1435
1436 return 0;
1437}
1438
1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1440 struct cgroup_taskset *tset)
1441{
1442 struct mm_struct *mm;
1443 struct task_struct *task;
1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450
1451
1452
1453
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1459
1460
1461
1462
1463
1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1466 mm = get_task_mm(leader);
1467 if (mm) {
1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1469 if (is_memory_migrate(cs))
1470 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1471 &cpuset_attach_nodemask_to);
1472 mmput(mm);
1473 }
1474}
1475
1476
1477
1478typedef enum {
1479 FILE_MEMORY_MIGRATE,
1480 FILE_CPULIST,
1481 FILE_MEMLIST,
1482 FILE_CPU_EXCLUSIVE,
1483 FILE_MEM_EXCLUSIVE,
1484 FILE_MEM_HARDWALL,
1485 FILE_SCHED_LOAD_BALANCE,
1486 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1487 FILE_MEMORY_PRESSURE_ENABLED,
1488 FILE_MEMORY_PRESSURE,
1489 FILE_SPREAD_PAGE,
1490 FILE_SPREAD_SLAB,
1491} cpuset_filetype_t;
1492
1493static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1494{
1495 int retval = 0;
1496 struct cpuset *cs = cgroup_cs(cgrp);
1497 cpuset_filetype_t type = cft->private;
1498
1499 if (!cgroup_lock_live_group(cgrp))
1500 return -ENODEV;
1501
1502 switch (type) {
1503 case FILE_CPU_EXCLUSIVE:
1504 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1505 break;
1506 case FILE_MEM_EXCLUSIVE:
1507 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1508 break;
1509 case FILE_MEM_HARDWALL:
1510 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1511 break;
1512 case FILE_SCHED_LOAD_BALANCE:
1513 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1514 break;
1515 case FILE_MEMORY_MIGRATE:
1516 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1517 break;
1518 case FILE_MEMORY_PRESSURE_ENABLED:
1519 cpuset_memory_pressure_enabled = !!val;
1520 break;
1521 case FILE_MEMORY_PRESSURE:
1522 retval = -EACCES;
1523 break;
1524 case FILE_SPREAD_PAGE:
1525 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1526 break;
1527 case FILE_SPREAD_SLAB:
1528 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1529 break;
1530 default:
1531 retval = -EINVAL;
1532 break;
1533 }
1534 cgroup_unlock();
1535 return retval;
1536}
1537
1538static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539{
1540 int retval = 0;
1541 struct cpuset *cs = cgroup_cs(cgrp);
1542 cpuset_filetype_t type = cft->private;
1543
1544 if (!cgroup_lock_live_group(cgrp))
1545 return -ENODEV;
1546
1547 switch (type) {
1548 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1549 retval = update_relax_domain_level(cs, val);
1550 break;
1551 default:
1552 retval = -EINVAL;
1553 break;
1554 }
1555 cgroup_unlock();
1556 return retval;
1557}
1558
1559
1560
1561
1562static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1563 const char *buf)
1564{
1565 int retval = 0;
1566 struct cpuset *cs = cgroup_cs(cgrp);
1567 struct cpuset *trialcs;
1568
1569 if (!cgroup_lock_live_group(cgrp))
1570 return -ENODEV;
1571
1572 trialcs = alloc_trial_cpuset(cs);
1573 if (!trialcs) {
1574 retval = -ENOMEM;
1575 goto out;
1576 }
1577
1578 switch (cft->private) {
1579 case FILE_CPULIST:
1580 retval = update_cpumask(cs, trialcs, buf);
1581 break;
1582 case FILE_MEMLIST:
1583 retval = update_nodemask(cs, trialcs, buf);
1584 break;
1585 default:
1586 retval = -EINVAL;
1587 break;
1588 }
1589
1590 free_trial_cpuset(trialcs);
1591out:
1592 cgroup_unlock();
1593 return retval;
1594}
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1609{
1610 size_t count;
1611
1612 mutex_lock(&callback_mutex);
1613 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1614 mutex_unlock(&callback_mutex);
1615
1616 return count;
1617}
1618
1619static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1620{
1621 size_t count;
1622
1623 mutex_lock(&callback_mutex);
1624 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1625 mutex_unlock(&callback_mutex);
1626
1627 return count;
1628}
1629
1630static ssize_t cpuset_common_file_read(struct cgroup *cont,
1631 struct cftype *cft,
1632 struct file *file,
1633 char __user *buf,
1634 size_t nbytes, loff_t *ppos)
1635{
1636 struct cpuset *cs = cgroup_cs(cont);
1637 cpuset_filetype_t type = cft->private;
1638 char *page;
1639 ssize_t retval = 0;
1640 char *s;
1641
1642 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1643 return -ENOMEM;
1644
1645 s = page;
1646
1647 switch (type) {
1648 case FILE_CPULIST:
1649 s += cpuset_sprintf_cpulist(s, cs);
1650 break;
1651 case FILE_MEMLIST:
1652 s += cpuset_sprintf_memlist(s, cs);
1653 break;
1654 default:
1655 retval = -EINVAL;
1656 goto out;
1657 }
1658 *s++ = '\n';
1659
1660 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1661out:
1662 free_page((unsigned long)page);
1663 return retval;
1664}
1665
1666static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1667{
1668 struct cpuset *cs = cgroup_cs(cont);
1669 cpuset_filetype_t type = cft->private;
1670 switch (type) {
1671 case FILE_CPU_EXCLUSIVE:
1672 return is_cpu_exclusive(cs);
1673 case FILE_MEM_EXCLUSIVE:
1674 return is_mem_exclusive(cs);
1675 case FILE_MEM_HARDWALL:
1676 return is_mem_hardwall(cs);
1677 case FILE_SCHED_LOAD_BALANCE:
1678 return is_sched_load_balance(cs);
1679 case FILE_MEMORY_MIGRATE:
1680 return is_memory_migrate(cs);
1681 case FILE_MEMORY_PRESSURE_ENABLED:
1682 return cpuset_memory_pressure_enabled;
1683 case FILE_MEMORY_PRESSURE:
1684 return fmeter_getrate(&cs->fmeter);
1685 case FILE_SPREAD_PAGE:
1686 return is_spread_page(cs);
1687 case FILE_SPREAD_SLAB:
1688 return is_spread_slab(cs);
1689 default:
1690 BUG();
1691 }
1692
1693
1694 return 0;
1695}
1696
1697static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1698{
1699 struct cpuset *cs = cgroup_cs(cont);
1700 cpuset_filetype_t type = cft->private;
1701 switch (type) {
1702 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1703 return cs->relax_domain_level;
1704 default:
1705 BUG();
1706 }
1707
1708
1709 return 0;
1710}
1711
1712
1713
1714
1715
1716
1717static struct cftype files[] = {
1718 {
1719 .name = "cpus",
1720 .read = cpuset_common_file_read,
1721 .write_string = cpuset_write_resmask,
1722 .max_write_len = (100U + 6 * NR_CPUS),
1723 .private = FILE_CPULIST,
1724 },
1725
1726 {
1727 .name = "mems",
1728 .read = cpuset_common_file_read,
1729 .write_string = cpuset_write_resmask,
1730 .max_write_len = (100U + 6 * MAX_NUMNODES),
1731 .private = FILE_MEMLIST,
1732 },
1733
1734 {
1735 .name = "cpu_exclusive",
1736 .read_u64 = cpuset_read_u64,
1737 .write_u64 = cpuset_write_u64,
1738 .private = FILE_CPU_EXCLUSIVE,
1739 },
1740
1741 {
1742 .name = "mem_exclusive",
1743 .read_u64 = cpuset_read_u64,
1744 .write_u64 = cpuset_write_u64,
1745 .private = FILE_MEM_EXCLUSIVE,
1746 },
1747
1748 {
1749 .name = "mem_hardwall",
1750 .read_u64 = cpuset_read_u64,
1751 .write_u64 = cpuset_write_u64,
1752 .private = FILE_MEM_HARDWALL,
1753 },
1754
1755 {
1756 .name = "sched_load_balance",
1757 .read_u64 = cpuset_read_u64,
1758 .write_u64 = cpuset_write_u64,
1759 .private = FILE_SCHED_LOAD_BALANCE,
1760 },
1761
1762 {
1763 .name = "sched_relax_domain_level",
1764 .read_s64 = cpuset_read_s64,
1765 .write_s64 = cpuset_write_s64,
1766 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1767 },
1768
1769 {
1770 .name = "memory_migrate",
1771 .read_u64 = cpuset_read_u64,
1772 .write_u64 = cpuset_write_u64,
1773 .private = FILE_MEMORY_MIGRATE,
1774 },
1775
1776 {
1777 .name = "memory_pressure",
1778 .read_u64 = cpuset_read_u64,
1779 .write_u64 = cpuset_write_u64,
1780 .private = FILE_MEMORY_PRESSURE,
1781 .mode = S_IRUGO,
1782 },
1783
1784 {
1785 .name = "memory_spread_page",
1786 .read_u64 = cpuset_read_u64,
1787 .write_u64 = cpuset_write_u64,
1788 .private = FILE_SPREAD_PAGE,
1789 },
1790
1791 {
1792 .name = "memory_spread_slab",
1793 .read_u64 = cpuset_read_u64,
1794 .write_u64 = cpuset_write_u64,
1795 .private = FILE_SPREAD_SLAB,
1796 },
1797};
1798
1799static struct cftype cft_memory_pressure_enabled = {
1800 .name = "memory_pressure_enabled",
1801 .read_u64 = cpuset_read_u64,
1802 .write_u64 = cpuset_write_u64,
1803 .private = FILE_MEMORY_PRESSURE_ENABLED,
1804};
1805
1806static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1807{
1808 int err;
1809
1810 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1811 if (err)
1812 return err;
1813
1814 if (!cont->parent)
1815 err = cgroup_add_file(cont, ss,
1816 &cft_memory_pressure_enabled);
1817 return err;
1818}
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836static void cpuset_post_clone(struct cgroup_subsys *ss,
1837 struct cgroup *cgroup)
1838{
1839 struct cgroup *parent, *child;
1840 struct cpuset *cs, *parent_cs;
1841
1842 parent = cgroup->parent;
1843 list_for_each_entry(child, &parent->children, sibling) {
1844 cs = cgroup_cs(child);
1845 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1846 return;
1847 }
1848 cs = cgroup_cs(cgroup);
1849 parent_cs = cgroup_cs(parent);
1850
1851 mutex_lock(&callback_mutex);
1852 cs->mems_allowed = parent_cs->mems_allowed;
1853 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1854 mutex_unlock(&callback_mutex);
1855 return;
1856}
1857
1858
1859
1860
1861
1862
1863
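/*
 *	cpuset_create - create a cpuset
 *	ss:	cpuset cgroup subsystem
 *	cont:	control group that the new cpuset will be part of
 */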
1864static struct cgroup_subsys_state *cpuset_create(
1865 struct cgroup_subsys *ss,
1866 struct cgroup *cont)
1867{
1868 struct cpuset *cs;
1869 struct cpuset *parent;
1870
1871 if (!cont->parent) {
1872 return &top_cpuset.css;
1873 }
1874 parent = cgroup_cs(cont->parent);
1875 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1876 if (!cs)
1877 return ERR_PTR(-ENOMEM);
1878 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1879 kfree(cs);
1880 return ERR_PTR(-ENOMEM);
1881 }
1882
1883 cs->flags = 0;
1884 if (is_spread_page(parent))
1885 set_bit(CS_SPREAD_PAGE, &cs->flags);
1886 if (is_spread_slab(parent))
1887 set_bit(CS_SPREAD_SLAB, &cs->flags);
1888 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1889 cpumask_clear(cs->cpus_allowed);
1890 nodes_clear(cs->mems_allowed);
1891 fmeter_init(&cs->fmeter);
1892 cs->relax_domain_level = -1;
1893
1894 cs->parent = parent;
1895 number_of_cpusets++;
1896 return &cs->css ;
1897}
1898
1899
1900
1901
1902
1903
1904
1905static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1906{
1907 struct cpuset *cs = cgroup_cs(cont);
1908
1909 if (is_sched_load_balance(cs))
1910 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1911
1912 number_of_cpusets--;
1913 free_cpumask_var(cs->cpus_allowed);
1914 kfree(cs);
1915}
1916
1917struct cgroup_subsys cpuset_subsys = {
1918 .name = "cpuset",
1919 .create = cpuset_create,
1920 .destroy = cpuset_destroy,
1921 .can_attach = cpuset_can_attach,
1922 .attach = cpuset_attach,
1923 .populate = cpuset_populate,
1924 .post_clone = cpuset_post_clone,
1925 .subsys_id = cpuset_subsys_id,
1926 .early_init = 1,
1927};
1928
1929
1930
1931
1932
1933
1934
1935int __init cpuset_init(void)
1936{
1937 int err = 0;
1938
1939 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1940 BUG();
1941
1942 cpumask_setall(top_cpuset.cpus_allowed);
1943 nodes_setall(top_cpuset.mems_allowed);
1944
1945 fmeter_init(&top_cpuset.fmeter);
1946 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1947 top_cpuset.relax_domain_level = -1;
1948
1949 err = register_filesystem(&cpuset_fs_type);
1950 if (err < 0)
1951 return err;
1952
1953 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1954 BUG();
1955
1956 number_of_cpusets = 1;
1957 return 0;
1958}
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968static void cpuset_do_move_task(struct task_struct *tsk,
1969 struct cgroup_scanner *scan)
1970{
1971 struct cgroup *new_cgroup = scan->data;
1972
1973 cgroup_attach_task(new_cgroup, tsk);
1974}
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1988{
1989 struct cgroup_scanner scan;
1990
1991 scan.cg = from->css.cgroup;
1992 scan.test_task = NULL;
1993 scan.process_task = cpuset_do_move_task;
1994 scan.heap = NULL;
1995 scan.data = to->css.cgroup;
1996
1997 if (cgroup_scan_tasks(&scan))
1998 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1999 "cgroup_scan_tasks failed\n");
2000}
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2013{
2014 struct cpuset *parent;
2015
2016
2017
2018
2019
2020
2021 if (list_empty(&cs->css.cgroup->css_sets))
2022 return;
2023
2024
2025
2026
2027
2028 parent = cs->parent;
2029 while (cpumask_empty(parent->cpus_allowed) ||
2030 nodes_empty(parent->mems_allowed))
2031 parent = parent->parent;
2032
2033 move_member_tasks_to_cpuset(cs, parent);
2034}
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
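/*
 * Walk the cpuset subtree rooted at @root and remove any CPUs and
 * memory nodes that have gone offline from each cpuset's masks.  A
 * cpuset left with no online CPUs or memory has its tasks moved to the
 * nearest non-empty ancestor; otherwise its tasks are updated to the
 * trimmed masks.
 *
 * Called with cgroup_mutex held.
 */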
2051static void scan_for_empty_cpusets(struct cpuset *root)
2052{
2053 LIST_HEAD(queue);
2054 struct cpuset *cp;
2055 struct cpuset *child;
2056 struct cgroup *cont;
2057 static nodemask_t oldmems;
2058
2059 list_add_tail((struct list_head *)&root->stack_list, &queue);
2060
2061 while (!list_empty(&queue)) {
2062 cp = list_first_entry(&queue, struct cpuset, stack_list);
2063 list_del(queue.next);
2064 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2065 child = cgroup_cs(cont);
2066 list_add_tail(&child->stack_list, &queue);
2067 }
2068
2069
2070 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2071 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2072 continue;
2073
2074 oldmems = cp->mems_allowed;
2075
2076
2077 mutex_lock(&callback_mutex);
2078 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2079 cpu_active_mask);
2080 nodes_and(cp->mems_allowed, cp->mems_allowed,
2081 node_states[N_HIGH_MEMORY]);
2082 mutex_unlock(&callback_mutex);
2083
2084
2085 if (cpumask_empty(cp->cpus_allowed) ||
2086 nodes_empty(cp->mems_allowed))
2087 remove_tasks_in_empty_cpuset(cp);
2088 else {
2089 update_tasks_cpumask(cp, NULL);
2090 update_tasks_nodemask(cp, &oldmems, NULL);
2091 }
2092 }
2093}
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
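/*
 * cpuset_update_active_cpus - invoked from the scheduler's CPU hotplug
 * callbacks whenever the set of active CPUs changes.  Resyncs the top
 * cpuset's cpus_allowed with cpu_active_mask, prunes offlined CPUs from
 * descendant cpusets, and regenerates the sched domain partition.
 */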
2107void cpuset_update_active_cpus(void)
2108{
2109 struct sched_domain_attr *attr;
2110 cpumask_var_t *doms;
2111 int ndoms;
2112
2113 cgroup_lock();
2114 mutex_lock(&callback_mutex);
2115 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2116 mutex_unlock(&callback_mutex);
2117 scan_for_empty_cpusets(&top_cpuset);
2118 ndoms = generate_sched_domains(&doms, &attr);
2119 cgroup_unlock();
2120
2121
2122 partition_sched_domains(ndoms, doms, attr);
2123}
2124
2125#ifdef CONFIG_MEMORY_HOTPLUG
2126
2127
2128
2129
2130
2131static int cpuset_track_online_nodes(struct notifier_block *self,
2132 unsigned long action, void *arg)
2133{
2134 static nodemask_t oldmems;
2135
2136 cgroup_lock();
2137 switch (action) {
2138 case MEM_ONLINE:
2139 oldmems = top_cpuset.mems_allowed;
2140 mutex_lock(&callback_mutex);
2141 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2142 mutex_unlock(&callback_mutex);
2143 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2144 break;
2145 case MEM_OFFLINE:
2146
2147
2148
2149
2150 scan_for_empty_cpusets(&top_cpuset);
2151 break;
2152 default:
2153 break;
2154 }
2155 cgroup_unlock();
2156
2157 return NOTIFY_OK;
2158}
2159#endif
2160
2161
2162
2163
2164
2165
2166
2167void __init cpuset_init_smp(void)
2168{
2169 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2170 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2171
2172 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2173
2174 cpuset_wq = create_singlethread_workqueue("cpuset");
2175 BUG_ON(!cpuset_wq);
2176}
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2190{
2191 mutex_lock(&callback_mutex);
2192 task_lock(tsk);
2193 guarantee_online_cpus(task_cs(tsk), pmask);
2194 task_unlock(tsk);
2195 mutex_unlock(&callback_mutex);
2196}
2197
2198int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2199{
2200 const struct cpuset *cs;
2201 int cpu;
2202
2203 rcu_read_lock();
2204 cs = task_cs(tsk);
2205 if (cs)
2206 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2207 rcu_read_unlock();
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2225 if (cpu >= nr_cpu_ids) {
2226
2227
2228
2229
2230
2231
2232
2233 do_set_cpus_allowed(tsk, cpu_possible_mask);
2234 cpu = cpumask_any(cpu_active_mask);
2235 }
2236
2237 return cpu;
2238}
2239
2240void cpuset_init_current_mems_allowed(void)
2241{
2242 nodes_setall(current->mems_allowed);
2243}
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2256{
2257 nodemask_t mask;
2258
2259 mutex_lock(&callback_mutex);
2260 task_lock(tsk);
2261 guarantee_online_mems(task_cs(tsk), &mask);
2262 task_unlock(tsk);
2263 mutex_unlock(&callback_mutex);
2264
2265 return mask;
2266}
2267
2268
2269
2270
2271
2272
2273
2274int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2275{
2276 return nodes_intersects(*nodemask, current->mems_allowed);
2277}
2278
2279
2280
2281
2282
2283
2284
2285static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2286{
2287 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2288 cs = cs->parent;
2289 return cs;
2290}
2291
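/*
 * cpuset_node_allowed_softwall - Can we allocate on memory node 'node'?
 * @node: the node in question
 * @gfp_mask: memory allocation flags
 *
 * In interrupt context, or with __GFP_THISNODE set, the answer is yes.
 * If the node is in current's mems_allowed, yes.  A task being OOM
 * killed (TIF_MEMDIE) is also allowed, to help it free memory.
 * Otherwise, hardwalled (__GFP_HARDWALL / GFP_USER style) allocations
 * are refused, while GFP_KERNEL style allocations from a non-exiting
 * task are allowed only if the node is permitted by the nearest
 * mem_exclusive or mem_hardwall ancestor cpuset.  The softwall path may
 * sleep and takes callback_mutex.
 */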
2353int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2354{
2355 const struct cpuset *cs;
2356 int allowed;
2357
2358 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2359 return 1;
2360 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2361 if (node_isset(node, current->mems_allowed))
2362 return 1;
2363
2364
2365
2366
2367 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2368 return 1;
2369 if (gfp_mask & __GFP_HARDWALL)
2370 return 0;
2371
2372 if (current->flags & PF_EXITING)
2373 return 1;
2374
2375
2376 mutex_lock(&callback_mutex);
2377
2378 task_lock(current);
2379 cs = nearest_hardwall_ancestor(task_cs(current));
2380 task_unlock(current);
2381
2382 allowed = node_isset(node, cs->mems_allowed);
2383 mutex_unlock(&callback_mutex);
2384 return allowed;
2385}
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2411{
2412 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2413 return 1;
2414 if (node_isset(node, current->mems_allowed))
2415 return 1;
2416
2417
2418
2419
2420 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2421 return 1;
2422 return 0;
2423}
2424
2425
2426
2427
2428
2429
2430
2431void cpuset_unlock(void)
2432{
2433 mutex_unlock(&callback_mutex);
2434}
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
static int cpuset_spread_node(int *rotor)
{
	int node;

	node = next_node(*rotor, current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	*rotor = node;
	return node;
}

int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2506 const struct task_struct *tsk2)
2507{
2508 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2509}
2510
2511
2512
2513
2514
2515
2516
2517
2518
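/*
 * cpuset_print_task_mems_allowed - prints a task's cpuset name and
 * mems_allowed to the kernel log (typically from the OOM killer).
 * The shared cpuset_name and cpuset_nodelist buffers are serialized
 * by cpuset_buffer_lock.
 */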
2519void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2520{
2521 struct dentry *dentry;
2522
2523 dentry = task_cs(tsk)->css.cgroup->dentry;
2524 spin_lock(&cpuset_buffer_lock);
2525 snprintf(cpuset_name, CPUSET_NAME_LEN,
2526 dentry ? (const char *)dentry->d_name.name : "/");
2527 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2528 tsk->mems_allowed);
2529 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2530 tsk->comm, cpuset_name, cpuset_nodelist);
2531 spin_unlock(&cpuset_buffer_lock);
2532}
2533
2534
2535
2536
2537
2538
2539
2540int cpuset_memory_pressure_enabled __read_mostly;
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560void __cpuset_memory_pressure_bump(void)
2561{
2562 task_lock(current);
2563 fmeter_markevent(&task_cs(current)->fmeter);
2564 task_unlock(current);
2565}
2566
2567#ifdef CONFIG_PROC_PID_CPUSET
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2578{
2579 struct pid *pid;
2580 struct task_struct *tsk;
2581 char *buf;
2582 struct cgroup_subsys_state *css;
2583 int retval;
2584
2585 retval = -ENOMEM;
2586 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2587 if (!buf)
2588 goto out;
2589
2590 retval = -ESRCH;
2591 pid = m->private;
2592 tsk = get_pid_task(pid, PIDTYPE_PID);
2593 if (!tsk)
2594 goto out_free;
2595
2596 retval = -EINVAL;
2597 cgroup_lock();
2598 css = task_subsys_state(tsk, cpuset_subsys_id);
2599 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2600 if (retval < 0)
2601 goto out_unlock;
2602 seq_puts(m, buf);
2603 seq_putc(m, '\n');
2604out_unlock:
2605 cgroup_unlock();
2606 put_task_struct(tsk);
2607out_free:
2608 kfree(buf);
2609out:
2610 return retval;
2611}
2612
2613static int cpuset_open(struct inode *inode, struct file *file)
2614{
2615 struct pid *pid = PROC_I(inode)->pid;
2616 return single_open(file, proc_cpuset_show, pid);
2617}
2618
2619const struct file_operations proc_cpuset_operations = {
2620 .open = cpuset_open,
2621 .read = seq_read,
2622 .llseek = seq_lseek,
2623 .release = single_release,
2624};
2625#endif
2626
2627
2628void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2629{
2630 seq_printf(m, "Mems_allowed:\t");
2631 seq_nodemask(m, &task->mems_allowed);
2632 seq_printf(m, "\n");
2633 seq_printf(m, "Mems_allowed_list:\t");
2634 seq_nodemask_list(m, &task->mems_allowed);
2635 seq_printf(m, "\n");
2636}
2637