/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Cpusets assign a set of CPUs and Memory Nodes to a set of tasks,
 *  implemented here as a cgroup subsystem ("cpuset").
 *
 *  This file is subject to the terms and conditions of the GNU General
 *  Public License.  See the file COPYING in the main directory of the
 *  Linux distribution for more details.
 */
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/module.h>
41#include <linux/mount.h>
42#include <linux/namei.h>
43#include <linux/pagemap.h>
44#include <linux/proc_fs.h>
45#include <linux/rcupdate.h>
46#include <linux/sched.h>
47#include <linux/seq_file.h>
48#include <linux/security.h>
49#include <linux/slab.h>
50#include <linux/spinlock.h>
51#include <linux/stat.h>
52#include <linux/string.h>
53#include <linux/time.h>
54#include <linux/backing-dev.h>
55#include <linux/sort.h>
56
57#include <asm/uaccess.h>
58#include <asm/atomic.h>
59#include <linux/mutex.h>
60#include <linux/workqueue.h>
61#include <linux/cgroup.h>
62

/*
 * Workqueue used to rebuild sched domains asynchronously; see
 * async_rebuild_sched_domains() below for why the rebuild cannot be
 * done inline while holding cgroup_mutex.
 */
69static struct workqueue_struct *cpuset_wq;
70

/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
76int number_of_cpusets __read_mostly;

/* Forward declare cgroup structures */
79struct cgroup_subsys cpuset_subsys;
80struct cpuset;
81

/* See "Frequency meter" comments, below. */
84struct fmeter {
85 int cnt;
86 int val;
87 time_t time;
88 spinlock_t lock;
89};
90
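/*
 * Per-cpuset state.  One of these is allocated for every cpuset in the
 * hierarchy; css embeds it in the cgroup layer's subsystem state, and
 * cpus_allowed/mems_allowed record the resources granted to tasks in
 * this cpuset.
 */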
struct cpuset {
 struct cgroup_subsys_state css;

 unsigned long flags;		/* "unsigned long" so bitops work */
 cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
 nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

 struct cpuset *parent;		/* my parent */

 /*
  * Copy of global cpuset_mems_generation as of the most
  * recent time this cpuset changed its mems_allowed.
  */
 int mems_generation;

 struct fmeter fmeter;		/* memory_pressure filter */

 /* partition number for rebuild_sched_domains() */
 int pn;

 /* for custom sched domain */
 int relax_domain_level;

 /* used while walking the cpuset hierarchy */
 struct list_head stack_list;
};
117
/* Retrieve the cpuset for a cgroup */
119static inline struct cpuset *cgroup_cs(struct cgroup *cont)
120{
121 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
122 struct cpuset, css);
123}
124
/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task)
127{
128 return container_of(task_subsys_state(task, cpuset_subsys_id),
129 struct cpuset, css);
130}
131
/* bits in struct cpuset flags field */
133typedef enum {
134 CS_CPU_EXCLUSIVE,
135 CS_MEM_EXCLUSIVE,
136 CS_MEM_HARDWALL,
137 CS_MEMORY_MIGRATE,
138 CS_SCHED_LOAD_BALANCE,
139 CS_SPREAD_PAGE,
140 CS_SPREAD_SLAB,
141} cpuset_flagbits_t;
142
/* convenient tests for these bits */
144static inline int is_cpu_exclusive(const struct cpuset *cs)
145{
146 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
147}
148
149static inline int is_mem_exclusive(const struct cpuset *cs)
150{
151 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
152}
153
154static inline int is_mem_hardwall(const struct cpuset *cs)
155{
156 return test_bit(CS_MEM_HARDWALL, &cs->flags);
157}
158
159static inline int is_sched_load_balance(const struct cpuset *cs)
160{
161 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
162}
163
164static inline int is_memory_migrate(const struct cpuset *cs)
165{
166 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
167}
168
169static inline int is_spread_page(const struct cpuset *cs)
170{
171 return test_bit(CS_SPREAD_PAGE, &cs->flags);
172}
173
174static inline int is_spread_slab(const struct cpuset *cs)
175{
176 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177}
178

/*
 * Increment this integer everytime any cpuset changes its
 * mems_allowed value.  Users of cpusets can track this generation
 * number, and avoid having to lock and reload mems_allowed unless
 * the cpuset they're using changes generation.
 *
 * A single, global generation is needed because cpuset_attach() could
 * reattach a task to a different cpuset, which must not have its
 * generation numbers aliased with those of that task's previous cpuset.
 *
 * Generations are needed for mems_allowed because one task cannot
 * modify another's memory placement; instead each task refreshes its
 * own copy in cpuset_update_task_memory_state() when it notices the
 * generations no longer match.
 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202};
203

/*
 * There are two global mutexes guarding cpuset structures.  The first
 * is the main control groups mutex, cgroup_mutex, accessed via
 * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
 * callback_mutex, below.  A task must hold both mutexes to modify a
 * cpuset; holding either one is enough to read stable values.
 *
 * The cpuset hooks called from performance-critical code, such as the
 * page allocator, take only callback_mutex, and only for as long as it
 * takes to read cpus_allowed or mems_allowed.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * The task_struct fields mems_allowed and mems_generation may only be
 * modified by the task itself, while holding task_lock(); other readers
 * of these fields must tolerate slightly stale values.
 */
242static DEFINE_MUTEX(callback_mutex);
243
/*
 * cpuset_buffer_lock protects the static name and nodelist buffers
 * below, which are shared by cpuset_print_task_mems_allowed().
 */
249#define CPUSET_NAME_LEN (128)
250#define CPUSET_NODELIST_LEN (256)
251static char cpuset_name[CPUSET_NAME_LEN];
252static char cpuset_nodelist[CPUSET_NODELIST_LEN];
253static DEFINE_SPINLOCK(cpuset_buffer_lock);
254

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
260static int cpuset_get_sb(struct file_system_type *fs_type,
261 int flags, const char *unused_dev_name,
262 void *data, struct vfsmount *mnt)
263{
264 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
265 int ret = -ENODEV;
266 if (cgroup_fs) {
267 char mountopts[] =
268 "cpuset,noprefix,"
269 "release_agent=/sbin/cpuset_release_agent";
270 ret = cgroup_fs->get_sb(cgroup_fs, flags,
271 unused_dev_name, mountopts, mnt);
272 put_filesystem(cgroup_fs);
273 }
274 return ret;
275}
276
277static struct file_system_type cpuset_fs_type = {
278 .name = "cpuset",
279 .get_sb = cpuset_get_sb,
280};
281

/*
 * Return in *pmask the portion of a cpusets's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_mask.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_mutex held.
 */
296static void guarantee_online_cpus(const struct cpuset *cs,
297 struct cpumask *pmask)
298{
299 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
300 cs = cs->parent;
301 if (cs)
302 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
303 else
304 cpumask_copy(pmask, cpu_online_mask);
305 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
306}
307

/*
 * Return in *pmask the portion of a cpusets's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */
321static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
322{
323 while (cs && !nodes_intersects(cs->mems_allowed,
324 node_states[N_HIGH_MEMORY]))
325 cs = cs->parent;
326 if (cs)
327 nodes_and(*pmask, cs->mems_allowed,
328 node_states[N_HIGH_MEMORY]);
329 else
330 *pmask = node_states[N_HIGH_MEMORY];
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332}
333

/**
 * cpuset_update_task_memory_state - update task memory placement
 *
 * If the current task's cpuset's mems_allowed changed behind our
 * backs, update current->mems_allowed, mems_generation and task NUMA
 * mempolicy to the new values.
 *
 * Do not call this routine if in_interrupt().
 *
 * Call without callback_mutex or task_lock() held.  May be called
 * with or without cgroup_mutex held.  Reading the cpuset's
 * mems_generation does not need task_lock, because the cpuset is
 * guarded against concurrent freeing by RCU; if the generation matches
 * the cached task copy, nothing further is done.  Otherwise we take
 * callback_mutex and task_lock() to reload mems_allowed, the spread
 * flags and the task mempolicy from the (possibly new) cpuset.
 */
375void cpuset_update_task_memory_state(void)
376{
377 int my_cpusets_mem_gen;
378 struct task_struct *tsk = current;
379 struct cpuset *cs;
380
381 rcu_read_lock();
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
383 rcu_read_unlock();
384
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk);
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403}
404

/*
 * is_cpuset_subset(p, q) - Return 1 if p is a subset of q
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cgroup_mutex.
 */
413static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
414{
415 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
416 nodes_subset(p->mems_allowed, q->mems_allowed) &&
417 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
418 is_mem_exclusive(p) <= is_mem_exclusive(q);
419}
420

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
425static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
426{
427 struct cpuset *trial;
428
429 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
430 if (!trial)
431 return NULL;
432
433 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
434 kfree(trial);
435 return NULL;
436 }
437 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
438
439 return trial;
440}
441

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
446static void free_trial_cpuset(struct cpuset *trial)
447{
448 free_cpumask_var(trial->cpus_allowed);
449 kfree(trial);
450}
451

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cgroup_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
472static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
473{
474 struct cgroup *cont;
475 struct cpuset *c, *par;
476
477
478 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
479 if (!is_cpuset_subset(cgroup_cs(cont), trial))
480 return -EBUSY;
481 }
482
483
484 if (cur == &top_cpuset)
485 return 0;
486
487 par = cur->parent;
488
489
490 if (!is_cpuset_subset(trial, par))
491 return -EACCES;
492
493
494
495
496
497 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
498 c = cgroup_cs(cont);
499 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
500 c != cur &&
501 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
502 return -EINVAL;
503 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
504 c != cur &&
505 nodes_intersects(trial->mems_allowed, c->mems_allowed))
506 return -EINVAL;
507 }
508
509
510 if (cgroup_task_count(cur->css.cgroup)) {
511 if (cpumask_empty(trial->cpus_allowed) ||
512 nodes_empty(trial->mems_allowed)) {
513 return -ENOSPC;
514 }
515 }
516
517 return 0;
518}
519
520#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
525static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
526{
527 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
528}
529
530static void
531update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
532{
533 if (dattr->relax_domain_level < c->relax_domain_level)
534 dattr->relax_domain_level = c->relax_domain_level;
535 return;
536}
537
538static void
539update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
540{
541 LIST_HEAD(q);
542
543 list_add(&c->stack_list, &q);
544 while (!list_empty(&q)) {
545 struct cpuset *cp;
546 struct cgroup *cont;
547 struct cpuset *child;
548
549 cp = list_first_entry(&q, struct cpuset, stack_list);
550 list_del(q.next);
551
552 if (cpumask_empty(cp->cpus_allowed))
553 continue;
554
555 if (is_sched_load_balance(cp))
556 update_domain_attr(dattr, cp);
557
558 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
559 child = cgroup_cs(cont);
560 list_add_tail(&child->stack_list, &q);
561 }
562 }
563}
564

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs: a set
 * of non-overlapping cpumasks, one per sched domain, whose union is a
 * subset of the online CPUs.  The result is handed to the scheduler's
 * partition_sched_domains() routine to rebuild its load balancing
 * domains.  See "What is sched_load_balance" in Documentation/cpusets.txt
 * for background.
 *
 * The algorithm, in brief:
 *  (1) If the top cpuset still load balances everything, emit a single
 *      domain covering its cpus_allowed and return.
 *  (2) Otherwise walk the hierarchy top-down, collecting every cpuset
 *      with a non-empty cpus_allowed that has sched_load_balance set
 *      (children of such cpusets need not be scanned further).
 *  (3) Merge collected cpusets whose cpus_allowed overlap into the same
 *      partition number (pn), then emit one cpumask and one
 *      sched_domain_attr per resulting partition.
 *
 * Does not return errors: on allocation failure it returns ndoms == 1
 * with a NULL mask, which the scheduler treats as the default single
 * domain, on the theory that callers would rather not worry about
 * rebuild failures during CPU hotplug.
 *
 * Must be called with cgroup_lock held.
 */
620static int generate_sched_domains(struct cpumask **domains,
621 struct sched_domain_attr **attributes)
622{
623 LIST_HEAD(q);
624 struct cpuset *cp;
625 struct cpuset **csa;
626 int csn;
627 int i, j, k;
628 struct cpumask *doms;
629 struct sched_domain_attr *dattr;
630 int ndoms = 0;
631 int nslot;
632
633 doms = NULL;
634 dattr = NULL;
635 csa = NULL;
636
637
638 if (is_sched_load_balance(&top_cpuset)) {
639 doms = kmalloc(cpumask_size(), GFP_KERNEL);
640 if (!doms)
641 goto done;
642
643 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
644 if (dattr) {
645 *dattr = SD_ATTR_INIT;
646 update_domain_attr_tree(dattr, &top_cpuset);
647 }
648 cpumask_copy(doms, top_cpuset.cpus_allowed);
649
650 ndoms = 1;
651 goto done;
652 }
653
654 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
655 if (!csa)
656 goto done;
657 csn = 0;
658
659 list_add(&top_cpuset.stack_list, &q);
660 while (!list_empty(&q)) {
661 struct cgroup *cont;
662 struct cpuset *child;
663
664 cp = list_first_entry(&q, struct cpuset, stack_list);
665 list_del(q.next);
666
667 if (cpumask_empty(cp->cpus_allowed))
668 continue;
669
670
671
672
673
674
675
676 if (is_sched_load_balance(cp)) {
677 csa[csn++] = cp;
678 continue;
679 }
680
681 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
682 child = cgroup_cs(cont);
683 list_add_tail(&child->stack_list, &q);
684 }
685 }
686
687 for (i = 0; i < csn; i++)
688 csa[i]->pn = i;
689 ndoms = csn;
690
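 /*
  * Merge overlapping cpusets into the same partition: whenever two
  * cpusets in different partitions share a CPU, fold one partition
  * number (pn) into the other and restart the scan.  Each merge
  * reduces the final domain count by one.
  */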
691restart:
692
693 for (i = 0; i < csn; i++) {
694 struct cpuset *a = csa[i];
695 int apn = a->pn;
696
697 for (j = 0; j < csn; j++) {
698 struct cpuset *b = csa[j];
699 int bpn = b->pn;
700
701 if (apn != bpn && cpusets_overlap(a, b)) {
702 for (k = 0; k < csn; k++) {
703 struct cpuset *c = csa[k];
704
705 if (c->pn == bpn)
706 c->pn = apn;
707 }
708 ndoms--;
709 goto restart;
710 }
711 }
712 }
713
714
715
716
717
718 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
719 if (!doms)
720 goto done;
721
722
723
724
725
726 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
727
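 /*
  * Second pass: for each remaining partition, OR together the
  * cpus_allowed of every cpuset in that partition into one sched
  * domain mask, and accumulate the strongest relax_domain_level.
  */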
728 for (nslot = 0, i = 0; i < csn; i++) {
729 struct cpuset *a = csa[i];
730 struct cpumask *dp;
731 int apn = a->pn;
732
733 if (apn < 0) {
734
735 continue;
736 }
737
738 dp = doms + nslot;
739
740 if (nslot == ndoms) {
741 static int warnings = 10;
742 if (warnings) {
743 printk(KERN_WARNING
744 "rebuild_sched_domains confused:"
745 " nslot %d, ndoms %d, csn %d, i %d,"
746 " apn %d\n",
747 nslot, ndoms, csn, i, apn);
748 warnings--;
749 }
750 continue;
751 }
752
753 cpumask_clear(dp);
754 if (dattr)
755 *(dattr + nslot) = SD_ATTR_INIT;
756 for (j = i; j < csn; j++) {
757 struct cpuset *b = csa[j];
758
759 if (apn == b->pn) {
760 cpumask_or(dp, dp, b->cpus_allowed);
761 if (dattr)
762 update_domain_attr_tree(dattr + nslot, b);
763
764
765 b->pn = -1;
766 }
767 }
768 nslot++;
769 }
770 BUG_ON(nslot != ndoms);
771
772done:
773 kfree(csa);
774
775
776
777
778
779 if (doms == NULL)
780 ndoms = 1;
781
782 *domains = doms;
783 *attributes = dattr;
784 return ndoms;
785}
786

/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both cgroup_mutex and get_online_cpus().
 *
 * Cannot be directly called from cpuset code handling changes
 * to the cpuset pseudo-filesystem, because it cannot be called
 * from code that already holds cgroup_mutex.
 */
797static void do_rebuild_sched_domains(struct work_struct *unused)
798{
799 struct sched_domain_attr *attr;
800 struct cpumask *doms;
801 int ndoms;
802
803 get_online_cpus();
804
805
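 /* Generate domain masks and attrs */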
806 cgroup_lock();
807 ndoms = generate_sched_domains(&doms, &attr);
808 cgroup_unlock();
809
810
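 /* Have scheduler rebuild the domains */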
811 partition_sched_domains(ndoms, doms, attr);
812
813 put_online_cpus();
814}
815#else
816static void do_rebuild_sched_domains(struct work_struct *unused)
817{
818}
819
820static int generate_sched_domains(struct cpumask **domains,
821 struct sched_domain_attr **attributes)
822{
823 *domains = NULL;
824 return 1;
825}
826#endif
827
828static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
829

/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * The rebuild_sched_domains() and partition_sched_domains()
 * routines must nest cgroup_lock() inside get_online_cpus(), but the
 * cpuset changes above hold cgroup_lock() for much of their duration,
 * so the rebuild cannot be done inline.  Instead it is queued to the
 * separate cpuset_wq workqueue thread, which can take the locks in the
 * required order.
 */
849static void async_rebuild_sched_domains(void)
850{
851 queue_work(cpuset_wq, &rebuild_sched_domains_work);
852}
853

/*
 * Accomplishes the same scheduler domain rebuild as the above
 * async_rebuild_sched_domains(), however it directly calls the
 * rebuild routine synchronously rather than calling it via an
 * asynchronous work queue.
 *
 * This can only be called from code that is not holding
 * cgroup_mutex (not nested in a cgroup_lock() call.)
 */
863void rebuild_sched_domains(void)
864{
865 do_rebuild_sched_domains(NULL);
866}
867

/**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner for the cpuset being updated
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
878static int cpuset_test_cpumask(struct task_struct *tsk,
879 struct cgroup_scanner *scan)
880{
881 return !cpumask_equal(&tsk->cpus_allowed,
882 (cgroup_cs(scan->cg))->cpus_allowed);
883}
884

/**
 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
 * @tsk: task to modify
 * @scan: struct cgroup_scanner for the cpuset whose mask is being applied
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
896static void cpuset_change_cpumask(struct task_struct *tsk,
897 struct cgroup_scanner *scan)
898{
899 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
900}
901

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
915static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
916{
917 struct cgroup_scanner scan;
918
919 scan.cg = cs->css.cgroup;
920 scan.test_task = cpuset_test_cpumask;
921 scan.process_task = cpuset_change_cpumask;
922 scan.heap = heap;
923 cgroup_scan_tasks(&scan);
924}
925

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset holding the proposed new mask
 * @buf: buffer of cpu numbers written to this cpuset
 */
931static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
932 const char *buf)
933{
934 struct ptr_heap heap;
935 int retval;
936 int is_load_balanced;
937
938
939 if (cs == &top_cpuset)
940 return -EACCES;
941
942
943
944
945
946
947
948 if (!*buf) {
949 cpumask_clear(trialcs->cpus_allowed);
950 } else {
951 retval = cpulist_parse(buf, trialcs->cpus_allowed);
952 if (retval < 0)
953 return retval;
954
955 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
956 return -EINVAL;
957 }
958 retval = validate_change(cs, trialcs);
959 if (retval < 0)
960 return retval;
961
962
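 /* Nothing to do if the cpus didn't change */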
963 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
964 return 0;
965
966 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
967 if (retval)
968 return retval;
969
970 is_load_balanced = is_sched_load_balance(trialcs);
971
972 mutex_lock(&callback_mutex);
973 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
974 mutex_unlock(&callback_mutex);
975
976
977
978
979
980 update_tasks_cpumask(cs, &heap);
981
982 heap_free(&heap);
983
984 if (is_load_balanced)
985 async_rebuild_sched_domains();
986 return 0;
987}
988

/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    Call holding cgroup_mutex, so current's cpuset won't change
 *    during this call, and so we don't need to take task_lock around
 *    the call to guarantee_online_mems().
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1021 const nodemask_t *to)
1022{
1023 struct task_struct *tsk = current;
1024
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032
1033 mutex_lock(&callback_mutex);
 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036}
1037

/*
 * Called by cgroup_scan_tasks() for each task in a cpuset whose
 * mems_allowed changed: rebind the task's mm mempolicy and, if the
 * cpuset is marked memory_migrate, migrate the mm's pages to the
 * cpuset's new mems_allowed.
 */
1042static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan)
1044{
1045 struct mm_struct *mm;
1046 struct cpuset *cs;
1047 int migrate;
1048 const nodemask_t *oldmem = scan->data;
1049
1050 mm = get_task_mm(p);
1051 if (!mm)
1052 return;
1053
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs);
1056
1057 mpol_rebind_mm(mm, &cs->mems_allowed);
1058 if (migrate)
1059 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1060 mmput(mm);
1061}
1062
1063static void *cpuset_being_rebound;
1064

/*
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 */
1075static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1076 struct ptr_heap *heap)
1077{
1078 struct cgroup_scanner scan;
1079
1080 cpuset_being_rebound = cs;
1081
1082 scan.cg = cs->css.cgroup;
1083 scan.test_task = NULL;
1084 scan.process_task = cpuset_change_nodemask;
1085 scan.heap = heap;
1086 scan.data = (nodemask_t *)oldmem;
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098 cgroup_scan_tasks(&scan);
1099
1100
1101 cpuset_being_rebound = NULL;
1102}
1103

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 */
1117static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1118 const char *buf)
1119{
1120 nodemask_t oldmem;
1121 int retval;
1122 struct ptr_heap heap;
1123
1124
1125
1126
1127
1128 if (cs == &top_cpuset)
1129 return -EACCES;
1130
1131
1132
1133
1134
1135
1136
1137 if (!*buf) {
1138 nodes_clear(trialcs->mems_allowed);
1139 } else {
1140 retval = nodelist_parse(buf, trialcs->mems_allowed);
1141 if (retval < 0)
1142 goto done;
1143
1144 if (!nodes_subset(trialcs->mems_allowed,
1145 node_states[N_HIGH_MEMORY]))
1146 return -EINVAL;
1147 }
1148 oldmem = cs->mems_allowed;
1149 if (nodes_equal(oldmem, trialcs->mems_allowed)) {
1150 retval = 0;
1151 goto done;
1152 }
1153 retval = validate_change(cs, trialcs);
1154 if (retval < 0)
1155 goto done;
1156
1157 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1158 if (retval < 0)
1159 goto done;
1160
1161 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex);
1165
1166 update_tasks_nodemask(cs, &oldmem, &heap);
1167
1168 heap_free(&heap);
1169done:
1170 return retval;
1171}
1172
1173int current_cpuset_is_being_rebound(void)
1174{
1175 return task_cs(current) == cpuset_being_rebound;
1176}
1177
1178static int update_relax_domain_level(struct cpuset *cs, s64 val)
1179{
1180#ifdef CONFIG_SMP
1181 if (val < -1 || val >= SD_LV_MAX)
1182 return -EINVAL;
1183#endif
1184
1185 if (val != cs->relax_domain_level) {
1186 cs->relax_domain_level = val;
1187 if (!cpumask_empty(cs->cpus_allowed) &&
1188 is_sched_load_balance(cs))
1189 async_rebuild_sched_domains();
1190 }
1191
1192 return 0;
1193}
1194

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */
1204static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on)
1206{
1207 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed;
1210
1211 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs)
1213 return -ENOMEM;
1214
1215 if (turning_on)
1216 set_bit(bit, &trialcs->flags);
1217 else
1218 clear_bit(bit, &trialcs->flags);
1219
1220 err = validate_change(cs, trialcs);
1221 if (err < 0)
1222 goto out;
1223
1224 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs));
1226
1227 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex);
1230
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains();
1233
1234out:
1235 free_trial_cpuset(trialcs);
1236 return err;
1237}
1238

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * The filter is a single pole, low-pass recursive (IIR) digital
 * filter with a time unit of one second.  The count of events in the
 * current second (cnt) is folded into the running value (val) once
 * per second; val then decays by the ratio FM_COEF/FM_SCALE for each
 * further second that passes without events, so the reported rate
 * converges smoothly toward zero after the events stop.
 *
 * FM_MAXTICKS bounds how many seconds of decay are applied in one
 * update, and FM_MAXCNT caps the per-second event count so the
 * 32-bit, FM_SCALE-scaled arithmetic cannot overflow.
 */
1284#define FM_COEF 933
1285#define FM_MAXTICKS ((time_t)99)
1286#define FM_MAXCNT 1000000
1287#define FM_SCALE 1000
1288
1289
1290static void fmeter_init(struct fmeter *fmp)
1291{
1292 fmp->cnt = 0;
1293 fmp->val = 0;
1294 fmp->time = 0;
1295 spin_lock_init(&fmp->lock);
1296}
1297
1298
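/* Internal meter update - process cnt events and update value */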
1299static void fmeter_update(struct fmeter *fmp)
1300{
1301 time_t now = get_seconds();
1302 time_t ticks = now - fmp->time;
1303
1304 if (ticks == 0)
1305 return;
1306
1307 ticks = min(FM_MAXTICKS, ticks);
1308 while (ticks-- > 0)
1309 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1310 fmp->time = now;
1311
1312 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1313 fmp->cnt = 0;
1314}
1315
1316
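/* Process any previous ticks, then bump cnt by one (times scale). */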
1317static void fmeter_markevent(struct fmeter *fmp)
1318{
1319 spin_lock(&fmp->lock);
1320 fmeter_update(fmp);
1321 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1322 spin_unlock(&fmp->lock);
1323}
1324
1325
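/* Process any previous ticks, then return current value. */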
1326static int fmeter_getrate(struct fmeter *fmp)
1327{
1328 int val;
1329
1330 spin_lock(&fmp->lock);
1331 fmeter_update(fmp);
1332 val = fmp->val;
1333 spin_unlock(&fmp->lock);
1334 return val;
1335}
1336
1337
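/*
 * Scratch cpumask used by cpuset_attach(); the attach callbacks are
 * serialized by cgroup_mutex, so a single static mask is safe.
 */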
1338static cpumask_var_t cpus_attach;
1339
/* Called by the cgroup layer to check whether @tsk may be attached to this cpuset */
1341static int cpuset_can_attach(struct cgroup_subsys *ss,
1342 struct cgroup *cont, struct task_struct *tsk)
1343{
1344 struct cpuset *cs = cgroup_cs(cont);
1345
1346 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1347 return -ENOSPC;
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357 if (tsk->flags & PF_THREAD_BOUND)
1358 return -EINVAL;
1359
1360 return security_task_setscheduler(tsk, 0, NULL);
1361}
1362
1363static void cpuset_attach(struct cgroup_subsys *ss,
1364 struct cgroup *cont, struct cgroup *oldcont,
1365 struct task_struct *tsk)
1366{
1367 nodemask_t from, to;
1368 struct mm_struct *mm;
1369 struct cpuset *cs = cgroup_cs(cont);
1370 struct cpuset *oldcs = cgroup_cs(oldcont);
1371 int err;
1372
1373 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask);
1375 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex);
1379 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err)
1382 return;
1383
1384 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk);
1387 if (mm) {
1388 mpol_rebind_mm(mm, &to);
1389 if (is_memory_migrate(cs))
1390 cpuset_migrate_mm(mm, &from, &to);
1391 mmput(mm);
1392 }
1393}
1394
1395
/* The various types of files and directories in a cpuset file system */
1397typedef enum {
1398 FILE_MEMORY_MIGRATE,
1399 FILE_CPULIST,
1400 FILE_MEMLIST,
1401 FILE_CPU_EXCLUSIVE,
1402 FILE_MEM_EXCLUSIVE,
1403 FILE_MEM_HARDWALL,
1404 FILE_SCHED_LOAD_BALANCE,
1405 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1406 FILE_MEMORY_PRESSURE_ENABLED,
1407 FILE_MEMORY_PRESSURE,
1408 FILE_SPREAD_PAGE,
1409 FILE_SPREAD_SLAB,
1410} cpuset_filetype_t;
1411
1412static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1413{
1414 int retval = 0;
1415 struct cpuset *cs = cgroup_cs(cgrp);
1416 cpuset_filetype_t type = cft->private;
1417
1418 if (!cgroup_lock_live_group(cgrp))
1419 return -ENODEV;
1420
1421 switch (type) {
1422 case FILE_CPU_EXCLUSIVE:
1423 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1424 break;
1425 case FILE_MEM_EXCLUSIVE:
1426 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1427 break;
1428 case FILE_MEM_HARDWALL:
1429 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1430 break;
1431 case FILE_SCHED_LOAD_BALANCE:
1432 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1433 break;
1434 case FILE_MEMORY_MIGRATE:
1435 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1436 break;
1437 case FILE_MEMORY_PRESSURE_ENABLED:
1438 cpuset_memory_pressure_enabled = !!val;
1439 break;
1440 case FILE_MEMORY_PRESSURE:
1441 retval = -EACCES;
1442 break;
1443 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break;
1447 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break;
1451 default:
1452 retval = -EINVAL;
1453 break;
1454 }
1455 cgroup_unlock();
1456 return retval;
1457}
1458
1459static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1460{
1461 int retval = 0;
1462 struct cpuset *cs = cgroup_cs(cgrp);
1463 cpuset_filetype_t type = cft->private;
1464
1465 if (!cgroup_lock_live_group(cgrp))
1466 return -ENODEV;
1467
1468 switch (type) {
1469 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1470 retval = update_relax_domain_level(cs, val);
1471 break;
1472 default:
1473 retval = -EINVAL;
1474 break;
1475 }
1476 cgroup_unlock();
1477 return retval;
1478}
1479
1480
1481
1482
1483static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1484 const char *buf)
1485{
1486 int retval = 0;
1487 struct cpuset *cs = cgroup_cs(cgrp);
1488 struct cpuset *trialcs;
1489
1490 if (!cgroup_lock_live_group(cgrp))
1491 return -ENODEV;
1492
1493 trialcs = alloc_trial_cpuset(cs);
1494 if (!trialcs)
1495 return -ENOMEM;
1496
1497 switch (cft->private) {
1498 case FILE_CPULIST:
1499 retval = update_cpumask(cs, trialcs, buf);
1500 break;
1501 case FILE_MEMLIST:
1502 retval = update_nodemask(cs, trialcs, buf);
1503 break;
1504 default:
1505 retval = -EINVAL;
1506 break;
1507 }
1508
1509 free_trial_cpuset(trialcs);
1510 cgroup_unlock();
1511 return retval;
1512}
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1527{
1528 int ret;
1529
1530 mutex_lock(&callback_mutex);
1531 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1532 mutex_unlock(&callback_mutex);
1533
1534 return ret;
1535}
1536
1537static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1538{
1539 nodemask_t mask;
1540
1541 mutex_lock(&callback_mutex);
1542 mask = cs->mems_allowed;
1543 mutex_unlock(&callback_mutex);
1544
1545 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1546}
1547
1548static ssize_t cpuset_common_file_read(struct cgroup *cont,
1549 struct cftype *cft,
1550 struct file *file,
1551 char __user *buf,
1552 size_t nbytes, loff_t *ppos)
1553{
1554 struct cpuset *cs = cgroup_cs(cont);
1555 cpuset_filetype_t type = cft->private;
1556 char *page;
1557 ssize_t retval = 0;
1558 char *s;
1559
1560 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1561 return -ENOMEM;
1562
1563 s = page;
1564
1565 switch (type) {
1566 case FILE_CPULIST:
1567 s += cpuset_sprintf_cpulist(s, cs);
1568 break;
1569 case FILE_MEMLIST:
1570 s += cpuset_sprintf_memlist(s, cs);
1571 break;
1572 default:
1573 retval = -EINVAL;
1574 goto out;
1575 }
1576 *s++ = '\n';
1577
1578 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1579out:
1580 free_page((unsigned long)page);
1581 return retval;
1582}
1583
1584static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1585{
1586 struct cpuset *cs = cgroup_cs(cont);
1587 cpuset_filetype_t type = cft->private;
1588 switch (type) {
1589 case FILE_CPU_EXCLUSIVE:
1590 return is_cpu_exclusive(cs);
1591 case FILE_MEM_EXCLUSIVE:
1592 return is_mem_exclusive(cs);
1593 case FILE_MEM_HARDWALL:
1594 return is_mem_hardwall(cs);
1595 case FILE_SCHED_LOAD_BALANCE:
1596 return is_sched_load_balance(cs);
1597 case FILE_MEMORY_MIGRATE:
1598 return is_memory_migrate(cs);
1599 case FILE_MEMORY_PRESSURE_ENABLED:
1600 return cpuset_memory_pressure_enabled;
1601 case FILE_MEMORY_PRESSURE:
1602 return fmeter_getrate(&cs->fmeter);
1603 case FILE_SPREAD_PAGE:
1604 return is_spread_page(cs);
1605 case FILE_SPREAD_SLAB:
1606 return is_spread_slab(cs);
1607 default:
1608 BUG();
1609 }
1610
1611
1612 return 0;
1613}
1614
1615static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1616{
1617 struct cpuset *cs = cgroup_cs(cont);
1618 cpuset_filetype_t type = cft->private;
1619 switch (type) {
1620 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1621 return cs->relax_domain_level;
1622 default:
1623 BUG();
1624 }
1625
1626
1627 return 0;
1628}
1629
1630
1631
1632
1633
1634
1635static struct cftype files[] = {
1636 {
1637 .name = "cpus",
1638 .read = cpuset_common_file_read,
1639 .write_string = cpuset_write_resmask,
1640 .max_write_len = (100U + 6 * NR_CPUS),
1641 .private = FILE_CPULIST,
1642 },
1643
1644 {
1645 .name = "mems",
1646 .read = cpuset_common_file_read,
1647 .write_string = cpuset_write_resmask,
1648 .max_write_len = (100U + 6 * MAX_NUMNODES),
1649 .private = FILE_MEMLIST,
1650 },
1651
1652 {
1653 .name = "cpu_exclusive",
1654 .read_u64 = cpuset_read_u64,
1655 .write_u64 = cpuset_write_u64,
1656 .private = FILE_CPU_EXCLUSIVE,
1657 },
1658
1659 {
1660 .name = "mem_exclusive",
1661 .read_u64 = cpuset_read_u64,
1662 .write_u64 = cpuset_write_u64,
1663 .private = FILE_MEM_EXCLUSIVE,
1664 },
1665
1666 {
1667 .name = "mem_hardwall",
1668 .read_u64 = cpuset_read_u64,
1669 .write_u64 = cpuset_write_u64,
1670 .private = FILE_MEM_HARDWALL,
1671 },
1672
1673 {
1674 .name = "sched_load_balance",
1675 .read_u64 = cpuset_read_u64,
1676 .write_u64 = cpuset_write_u64,
1677 .private = FILE_SCHED_LOAD_BALANCE,
1678 },
1679
1680 {
1681 .name = "sched_relax_domain_level",
1682 .read_s64 = cpuset_read_s64,
1683 .write_s64 = cpuset_write_s64,
1684 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1685 },
1686
1687 {
1688 .name = "memory_migrate",
1689 .read_u64 = cpuset_read_u64,
1690 .write_u64 = cpuset_write_u64,
1691 .private = FILE_MEMORY_MIGRATE,
1692 },
1693
1694 {
1695 .name = "memory_pressure",
1696 .read_u64 = cpuset_read_u64,
1697 .write_u64 = cpuset_write_u64,
1698 .private = FILE_MEMORY_PRESSURE,
1699 .mode = S_IRUGO,
1700 },
1701
1702 {
1703 .name = "memory_spread_page",
1704 .read_u64 = cpuset_read_u64,
1705 .write_u64 = cpuset_write_u64,
1706 .private = FILE_SPREAD_PAGE,
1707 },
1708
1709 {
1710 .name = "memory_spread_slab",
1711 .read_u64 = cpuset_read_u64,
1712 .write_u64 = cpuset_write_u64,
1713 .private = FILE_SPREAD_SLAB,
1714 },
1715};
1716
1717static struct cftype cft_memory_pressure_enabled = {
1718 .name = "memory_pressure_enabled",
1719 .read_u64 = cpuset_read_u64,
1720 .write_u64 = cpuset_write_u64,
1721 .private = FILE_MEMORY_PRESSURE_ENABLED,
1722};
1723
1724static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1725{
1726 int err;
1727
1728 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1729 if (err)
1730 return err;
1731
1732 if (!cont->parent)
1733 err = cgroup_add_file(cont, ss,
1734 &cft_memory_pressure_enabled);
1735 return err;
1736}
1737

/*
 * post_clone() is called at the end of cgroup_clone().
 * 'cgroup' was just created automatically as a result of
 * a cgroup_clone(), and the current task is about to be
 * moved into 'cgroup'.
 *
 * Currently we refuse to set up the cgroup - thereby
 * refusing the task to be entered, and as a result refusing
 * the sys_unshare() or clone() which initiated it - if any
 * sibling cpusets have exclusive cpus or mem.
 *
 * If this becomes a problem for some users who wish to
 * allow that scenario, then cpuset_post_clone() could be
 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
 * (and likewise for mems) to the new cgroup.
 */
1755static void cpuset_post_clone(struct cgroup_subsys *ss,
1756 struct cgroup *cgroup)
1757{
1758 struct cgroup *parent, *child;
1759 struct cpuset *cs, *parent_cs;
1760
1761 parent = cgroup->parent;
1762 list_for_each_entry(child, &parent->children, sibling) {
1763 cs = cgroup_cs(child);
1764 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1765 return;
1766 }
1767 cs = cgroup_cs(cgroup);
1768 parent_cs = cgroup_cs(parent);
1769
1770 cs->mems_allowed = parent_cs->mems_allowed;
1771 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1772 return;
1773}
1774

/*
 *	cpuset_create - create a cpuset
 *	ss:	cpuset cgroup subsystem
 *	cont:	control group that the new cpuset will be part of
 */
1781static struct cgroup_subsys_state *cpuset_create(
1782 struct cgroup_subsys *ss,
1783 struct cgroup *cont)
1784{
1785 struct cpuset *cs;
1786 struct cpuset *parent;
1787
1788 if (!cont->parent) {
1789
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css;
1792 }
1793 parent = cgroup_cs(cont->parent);
1794 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1795 if (!cs)
1796 return ERR_PTR(-ENOMEM);
1797 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1798 kfree(cs);
1799 return ERR_PTR(-ENOMEM);
1800 }
1801
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0;
1804 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags);
1806 if (is_spread_slab(parent))
1807 set_bit(CS_SPREAD_SLAB, &cs->flags);
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1;
1814
1815 cs->parent = parent;
1816 number_of_cpusets++;
 return &cs->css;
1818}
1819

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call async_rebuild_sched_domains().
 */
1826static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{
1828 struct cpuset *cs = cgroup_cs(cont);
1829
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834
1835 number_of_cpusets--;
1836 free_cpumask_var(cs->cpus_allowed);
1837 kfree(cs);
1838}
1839
1840struct cgroup_subsys cpuset_subsys = {
1841 .name = "cpuset",
1842 .create = cpuset_create,
1843 .destroy = cpuset_destroy,
1844 .can_attach = cpuset_can_attach,
1845 .attach = cpuset_attach,
1846 .populate = cpuset_populate,
1847 .post_clone = cpuset_post_clone,
1848 .subsys_id = cpuset_subsys_id,
1849 .early_init = 1,
1850};
1851

/**
 * cpuset_init_early - just enough so that the calls to
 * cpuset_update_task_memory_state() in early init code
 * are harmless.
 **/
1858int __init cpuset_init_early(void)
1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 **/
1873int __init cpuset_init(void)
1874{
1875 int err = 0;
1876
1877 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed);
1879
1880 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1;
1884
1885 err = register_filesystem(&cpuset_fs_type);
1886 if (err < 0)
1887 return err;
1888
1889 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1890 BUG();
1891
1892 number_of_cpusets = 1;
1893 return 0;
1894}
1895

/*
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct the task to move
 * @scan: struct cgroup_scanner carrying the destination cgroup in its data
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 */
1904static void cpuset_do_move_task(struct task_struct *tsk,
1905 struct cgroup_scanner *scan)
1906{
1907 struct cgroup *new_cgroup = scan->data;
1908
1909 cgroup_attach_task(new_cgroup, tsk);
1910}
1911

/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
1923static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1924{
1925 struct cgroup_scanner scan;
1926
1927 scan.cg = from->css.cgroup;
1928 scan.test_task = NULL;
1929 scan.process_task = cpuset_do_move_task;
1930 scan.heap = NULL;
1931 scan.data = to->css.cgroup;
1932
1933 if (cgroup_scan_tasks(&scan))
1934 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1935 "cgroup_scan_tasks failed\n");
1936}
1937

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
1948static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1949{
1950 struct cpuset *parent;
1951
1952
1953
1954
1955
1956
1957 if (list_empty(&cs->css.cgroup->css_sets))
1958 return;
1959
1960
1961
1962
1963
1964 parent = cs->parent;
1965 while (cpumask_empty(parent->cpus_allowed) ||
1966 nodes_empty(parent->mems_allowed))
1967 parent = parent->parent;
1968
1969 move_member_tasks_to_cpuset(cs, parent);
1970}
1971

/*
 * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 */
1987static void scan_for_empty_cpusets(struct cpuset *root)
1988{
1989 LIST_HEAD(queue);
1990 struct cpuset *cp;
1991 struct cpuset *child;
1992 struct cgroup *cont;
1993 nodemask_t oldmems;
1994
1995 list_add_tail((struct list_head *)&root->stack_list, &queue);
1996
1997 while (!list_empty(&queue)) {
1998 cp = list_first_entry(&queue, struct cpuset, stack_list);
1999 list_del(queue.next);
2000 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2001 child = cgroup_cs(cont);
2002 list_add_tail(&child->stack_list, &queue);
2003 }
2004
2005
2006 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
2007 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2008 continue;
2009
2010 oldmems = cp->mems_allowed;
2011
2012
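 /* Remove offline cpus and mems from this cpuset. */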
2013 mutex_lock(&callback_mutex);
2014 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2015 cpu_online_mask);
2016 nodes_and(cp->mems_allowed, cp->mems_allowed,
2017 node_states[N_HIGH_MEMORY]);
2018 mutex_unlock(&callback_mutex);
2019
2020
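 /* Move tasks from the empty cpuset to a parent */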
2021 if (cpumask_empty(cp->cpus_allowed) ||
2022 nodes_empty(cp->mems_allowed))
2023 remove_tasks_in_empty_cpuset(cp);
2024 else {
2025 update_tasks_cpumask(cp, NULL);
2026 update_tasks_nodemask(cp, &oldmems, NULL);
2027 }
2028 }
2029}
2030

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no affect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within the CPU hotplug lock.  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
2043static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2044 unsigned long phase, void *unused_cpu)
2045{
2046 struct sched_domain_attr *attr;
2047 struct cpumask *doms;
2048 int ndoms;
2049
2050 switch (phase) {
2051 case CPU_ONLINE:
2052 case CPU_ONLINE_FROZEN:
2053 case CPU_DEAD:
2054 case CPU_DEAD_FROZEN:
2055 break;
2056
2057 default:
2058 return NOTIFY_DONE;
2059 }
2060
2061 cgroup_lock();
2062 mutex_lock(&callback_mutex);
2063 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2064 mutex_unlock(&callback_mutex);
2065 scan_for_empty_cpusets(&top_cpuset);
2066 ndoms = generate_sched_domains(&doms, &attr);
2067 cgroup_unlock();
2068
2069
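 /* Have scheduler rebuild the domains */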
2070 partition_sched_domains(ndoms, doms, attr);
2071
2072 return NOTIFY_OK;
2073}
2074
2075#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_track_online_cpus().
 */
2081static int cpuset_track_online_nodes(struct notifier_block *self,
2082 unsigned long action, void *arg)
2083{
2084 cgroup_lock();
2085 switch (action) {
2086 case MEM_ONLINE:
2087 case MEM_OFFLINE:
2088 mutex_lock(&callback_mutex);
2089 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2090 mutex_unlock(&callback_mutex);
2091 if (action == MEM_OFFLINE)
2092 scan_for_empty_cpusets(&top_cpuset);
2093 break;
2094 default:
2095 break;
2096 }
2097 cgroup_unlock();
2098 return NOTIFY_OK;
2099}
2100#endif
2101

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/
2108void __init cpuset_init_smp(void)
2109{
2110 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2111 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2112
2113 hotcpu_notifier(cpuset_track_online_cpus, 0);
2114 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2115
2116 cpuset_wq = create_singlethread_workqueue("cpuset");
2117 BUG_ON(!cpuset_wq);
2118}
2119

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/
2131void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2132{
2133 mutex_lock(&callback_mutex);
2134 cpuset_cpus_allowed_locked(tsk, pmask);
2135 mutex_unlock(&callback_mutex);
2136}
2137

/*
 * Same as cpuset_cpus_allowed(), but must be called with
 * callback_mutex already held.
 */
2142void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2143{
2144 task_lock(tsk);
2145 guarantee_online_cpus(task_cs(tsk), pmask);
2146 task_unlock(tsk);
2147}
2148
2149void cpuset_init_current_mems_allowed(void)
2150{
2151 nodes_setall(current->mems_allowed);
2152}
2153

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside
 * the tasks cpuset.
 **/
2164nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2165{
2166 nodemask_t mask;
2167
2168 mutex_lock(&callback_mutex);
2169 task_lock(tsk);
2170 guarantee_online_mems(task_cs(tsk), &mask);
2171 task_unlock(tsk);
2172 mutex_unlock(&callback_mutex);
2173
2174 return mask;
2175}
2176

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
2183int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2184{
2185 return nodes_intersects(*nodemask, current->mems_allowed);
2186}
2187

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
2194static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2195{
2196 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2197 cs = cs->parent;
2198 return cs;
2199}
2200

/**
 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
 * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has
 * been OOM killed and has access to memory reserves as specified by the
 * TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, and do not
 * allow allocations outside the current task's cpuset.  GFP_KERNEL
 * allocations are not so marked, so can escape to the nearest enclosing
 * hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex, so this routine
 * might sleep.  Don't call it if you cannot sleep, unless you pass in
 * __GFP_HARDWALL, which takes the early, lock-free exits above.
 */
2262int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2263{
2264 const struct cpuset *cs;
2265 int allowed;
2266
2267 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2268 return 1;
2269 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2270 if (node_isset(node, current->mems_allowed))
2271 return 1;
2272
2273
2274
2275
2276 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2277 return 1;
2278 if (gfp_mask & __GFP_HARDWALL)
2279 return 0;
2280
2281 if (current->flags & PF_EXITING)
2282 return 1;
2283
2284
2285 mutex_lock(&callback_mutex);
2286
2287 task_lock(current);
2288 cs = nearest_hardwall_ancestor(task_cs(current));
2289 task_unlock(current);
2290
2291 allowed = node_isset(node, cs->mems_allowed);
2292 mutex_unlock(&callback_mutex);
2293 return allowed;
2294}
2295

/*
 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If the task has been OOM killed and has access to memory reserves as
 * specified by the TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * Unlike the cpuset_node_allowed_softwall() variant, above,
 * this variant requires that the node be in the current task's
 * mems_allowed or that we're in interrupt.  It does not scan up the
 * cpuset hierarchy for the nearest enclosing hardwalled cpuset.
 * It never sleeps.
 */
2319int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2320{
2321 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2322 return 1;
2323 if (node_isset(node, current->mems_allowed))
2324 return 1;
2325
2326
2327
2328
2329 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2330 return 1;
2331 return 0;
2332}
2333

/**
 * cpuset_lock - lock out any changes to cpuset structures
 *
 * The out of memory (oom) code needs to lock out cpusets
 * from being changed while it scans the tasklist looking for a
 * task in an overlapping cpuset.  Expose callback_mutex via this
 * cpuset_lock() routine, so the oom code can lock it, before
 * locking the task list.  The tasklist_lock is a spinlock, so
 * must be taken inside callback_mutex.
 */
2345void cpuset_lock(void)
2346{
2347 mutex_lock(&callback_mutex);
2348}
2349

/**
 * cpuset_unlock - release lock on cpuset changes
 *
 * Undo the lock taken in a previous cpuset_lock() call.
 */
2356void cpuset_unlock(void)
2357{
2358 mutex_unlock(&callback_mutex);
2359}
2360

/**
 * cpuset_mem_spread_node() - On which node to begin search for a page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for tasks
 * in a cpuset with is_spread_page or is_spread_slab set), then instead
 * of starting page cache or slab allocations on the local node, rotate
 * the starting node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 */
2387int cpuset_mem_spread_node(void)
2388{
2389 int node;
2390
2391 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2392 if (node == MAX_NUMNODES)
2393 node = first_node(current->mems_allowed);
2394 current->cpuset_mem_spread_rotor = node;
2395 return node;
2396}
2397EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2398

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/
2410int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2411 const struct task_struct *tsk2)
2412{
2413 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2414}
2415

/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
2424void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2425{
2426 struct dentry *dentry;
2427
2428 dentry = task_cs(tsk)->css.cgroup->dentry;
2429 spin_lock(&cpuset_buffer_lock);
2430 snprintf(cpuset_name, CPUSET_NAME_LEN,
2431 dentry ? (const char *)dentry->d_name.name : "/");
2432 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2433 tsk->mems_allowed);
2434 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2435 tsk->comm, cpuset_name, cpuset_nodelist);
2436 spin_unlock(&cpuset_buffer_lock);
2437}
2438

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
2445int cpuset_memory_pressure_enabled __read_mostly;
2446

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Displayed to user space in the per-cpuset read-only file
 * "memory_pressure" as an integer representing the recent rate
 * of entry into direct page reclaim by any task attached to
 * the cpuset.
 **/
2465void __cpuset_memory_pressure_bump(void)
2466{
2467 task_lock(current);
2468 fmeter_markevent(&task_cs(current)->fmeter);
2469 task_unlock(current);
2470}
2471
2472#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
2482static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2483{
2484 struct pid *pid;
2485 struct task_struct *tsk;
2486 char *buf;
2487 struct cgroup_subsys_state *css;
2488 int retval;
2489
2490 retval = -ENOMEM;
2491 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2492 if (!buf)
2493 goto out;
2494
2495 retval = -ESRCH;
2496 pid = m->private;
2497 tsk = get_pid_task(pid, PIDTYPE_PID);
2498 if (!tsk)
2499 goto out_free;
2500
2501 retval = -EINVAL;
2502 cgroup_lock();
2503 css = task_subsys_state(tsk, cpuset_subsys_id);
2504 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2505 if (retval < 0)
2506 goto out_unlock;
2507 seq_puts(m, buf);
2508 seq_putc(m, '\n');
2509out_unlock:
2510 cgroup_unlock();
2511 put_task_struct(tsk);
2512out_free:
2513 kfree(buf);
2514out:
2515 return retval;
2516}
2517
2518static int cpuset_open(struct inode *inode, struct file *file)
2519{
2520 struct pid *pid = PROC_I(inode)->pid;
2521 return single_open(file, proc_cpuset_show, pid);
2522}
2523
2524const struct file_operations proc_cpuset_operations = {
2525 .open = cpuset_open,
2526 .read = seq_read,
2527 .llseek = seq_lseek,
2528 .release = single_release,
2529};
2530#endif
2531
/* Display task cpus_allowed and mems_allowed in /proc/<pid>/status file. */
2533void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2534{
2535 seq_printf(m, "Cpus_allowed:\t");
2536 seq_cpumask(m, &task->cpus_allowed);
2537 seq_printf(m, "\n");
2538 seq_printf(m, "Cpus_allowed_list:\t");
2539 seq_cpumask_list(m, &task->cpus_allowed);
2540 seq_printf(m, "\n");
2541 seq_printf(m, "Mems_allowed:\t");
2542 seq_nodemask(m, &task->mems_allowed);
2543 seq_printf(m, "\n");
2544 seq_printf(m, "Mems_allowed_list:\t");
2545 seq_nodemask_list(m, &task->mems_allowed);
2546 seq_printf(m, "\n");
2547}
2548