1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/export.h>
41#include <linux/mount.h>
42#include <linux/namei.h>
43#include <linux/pagemap.h>
44#include <linux/proc_fs.h>
45#include <linux/rcupdate.h>
46#include <linux/sched.h>
47#include <linux/seq_file.h>
48#include <linux/security.h>
49#include <linux/slab.h>
50#include <linux/spinlock.h>
51#include <linux/stat.h>
52#include <linux/string.h>
53#include <linux/time.h>
54#include <linux/backing-dev.h>
55#include <linux/sort.h>
56
57#include <asm/uaccess.h>
58#include <linux/atomic.h>
59#include <linux/mutex.h>
60#include <linux/workqueue.h>
61#include <linux/cgroup.h>
62
63
64
65
66
67
68
/*
 * Workqueue on which rebuild_sched_domains_work is queued; see
 * async_rebuild_sched_domains().
 */
static struct workqueue_struct *cpuset_wq;

/*
 * Count of cpusets.  cpuset_create() increments it for every non-root
 * cpuset, and generate_sched_domains() uses it to size its scratch array.
 */
int number_of_cpusets __read_mostly;

/* Forward declarations: the subsystem descriptor and the cpuset type. */
struct cgroup_subsys cpuset_subsys;
struct cpuset;
81
82
83
/*
 * Frequency meter: a decaying event-rate estimate maintained by
 * fmeter_update(), fmeter_markevent() and fmeter_getrate().
 */
struct fmeter {
	int cnt;		/* events (in FM_SCALE units) not yet folded into val */
	int val;		/* current decayed rate value */
	time_t time;		/* get_seconds() value at the last fmeter_update() */
	spinlock_t lock;	/* held by fmeter_markevent() and fmeter_getrate() */
};
90
/*
 * Per-cgroup cpuset state.  The embedded css links the cpuset to its
 * cgroup; cgroup_cs() and task_cs() recover the cpuset from it.
 */
struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* bit numbers are cpuset_flagbits_t values */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in this cpuset */
	nodemask_t mems_allowed;	/* memory nodes allowed to tasks in this cpuset */

	struct cpuset *parent;		/* parent cpuset; walked upward by guarantee_online_*() */

	struct fmeter fmeter;		/* rate meter read through the memory_pressure file */

	/* partition number; scratch used only by generate_sched_domains() */
	int pn;

	/* value of the sched_relax_domain_level file; cpuset_create() sets -1 */
	int relax_domain_level;

	/* list linkage used by the queue-based hierarchy walks in this file */
	struct list_head stack_list;
};
111
112
113static inline struct cpuset *cgroup_cs(struct cgroup *cont)
114{
115 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
116 struct cpuset, css);
117}
118
119
120static inline struct cpuset *task_cs(struct task_struct *task)
121{
122 return container_of(task_subsys_state(task, cpuset_subsys_id),
123 struct cpuset, css);
124}
125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
139
/* Bit numbers within cpuset->flags; tested by the is_*() helpers below. */
typedef enum {
	CS_CPU_EXCLUSIVE,	/* siblings may not overlap cpus_allowed (validate_change) */
	CS_MEM_EXCLUSIVE,	/* siblings may not overlap mems_allowed (validate_change) */
	CS_MEM_HARDWALL,	/* NOTE(review): users of this bit are outside this chunk */
	CS_MEMORY_MIGRATE,	/* migrate pages when a task's allowed nodes change */
	CS_SCHED_LOAD_BALANCE,	/* cpuset takes part in sched domain generation */
	CS_SPREAD_PAGE,		/* tasks get PF_SPREAD_PAGE */
	CS_SPREAD_SLAB,		/* tasks get PF_SPREAD_SLAB */
} cpuset_flagbits_t;
149
150
/*
 * Kind of hotplug event being handled.
 * NOTE(review): the users of this enum are not visible in this part of
 * the file; presumably CPU offline vs. memory-node offline - confirm.
 */
enum hotplug_event {
	CPUSET_CPU_OFFLINE,
	CPUSET_MEM_OFFLINE,
};
155
156
157static inline int is_cpu_exclusive(const struct cpuset *cs)
158{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
160}
161
162static inline int is_mem_exclusive(const struct cpuset *cs)
163{
164 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
165}
166
167static inline int is_mem_hardwall(const struct cpuset *cs)
168{
169 return test_bit(CS_MEM_HARDWALL, &cs->flags);
170}
171
172static inline int is_sched_load_balance(const struct cpuset *cs)
173{
174 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
175}
176
177static inline int is_memory_migrate(const struct cpuset *cs)
178{
179 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
180}
181
182static inline int is_spread_page(const struct cpuset *cs)
183{
184 return test_bit(CS_SPREAD_PAGE, &cs->flags);
185}
186
187static inline int is_spread_slab(const struct cpuset *cs)
188{
189 return test_bit(CS_SPREAD_SLAB, &cs->flags);
190}
191
/*
 * The root cpuset.  cpuset_create() returns its css for the cgroup that
 * has no parent.  It starts out both CPU- and memory-exclusive.
 */
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/*
 * Held around every store to a cpuset's cpus_allowed, mems_allowed or
 * flags in this file, and around the reads done when formatting the
 * "cpus" and "mems" files.
 */
static DEFINE_MUTEX(callback_mutex);
236
237
238
239
240
241
/*
 * Static scratch buffers for a cpuset name and a node list, together
 * with a spinlock.
 * NOTE(review): the users are not visible in this part of the file;
 * presumably cpuset_buffer_lock serialises access to both buffers - confirm.
 */
#define CPUSET_NAME_LEN		(128)
#define	CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
247
248
249
250
251
252
253static struct dentry *cpuset_mount(struct file_system_type *fs_type,
254 int flags, const char *unused_dev_name, void *data)
255{
256 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
257 struct dentry *ret = ERR_PTR(-ENODEV);
258 if (cgroup_fs) {
259 char mountopts[] =
260 "cpuset,noprefix,"
261 "release_agent=/sbin/cpuset_release_agent";
262 ret = cgroup_fs->mount(cgroup_fs, flags,
263 unused_dev_name, mountopts);
264 put_filesystem(cgroup_fs);
265 }
266 return ret;
267}
268
/* Filesystem type "cpuset"; mounting it goes through cpuset_mount(). */
static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask)
290{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent;
293 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else
296 cpumask_copy(pmask, cpu_online_mask);
297 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
298}
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{
315 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY]))
317 cs = cs->parent;
318 if (cs)
319 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]);
321 else
322 *pmask = node_states[N_HIGH_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
324}
325
326
327
328
329
330
331static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk)
333{
334 if (is_spread_page(cs))
335 tsk->flags |= PF_SPREAD_PAGE;
336 else
337 tsk->flags &= ~PF_SPREAD_PAGE;
338 if (is_spread_slab(cs))
339 tsk->flags |= PF_SPREAD_SLAB;
340 else
341 tsk->flags &= ~PF_SPREAD_SLAB;
342}
343
344
345
346
347
348
349
350
351
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
353{
354 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
355 nodes_subset(p->mems_allowed, q->mems_allowed) &&
356 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
357 is_mem_exclusive(p) <= is_mem_exclusive(q);
358}
359
360
361
362
363
364static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
365{
366 struct cpuset *trial;
367
368 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
369 if (!trial)
370 return NULL;
371
372 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
373 kfree(trial);
374 return NULL;
375 }
376 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
377
378 return trial;
379}
380
381
382
383
384
385static void free_trial_cpuset(struct cpuset *trial)
386{
387 free_cpumask_var(trial->cpus_allowed);
388 kfree(trial);
389}
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{
413 struct cgroup *cont;
414 struct cpuset *c, *par;
415
416
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
418 if (!is_cpuset_subset(cgroup_cs(cont), trial))
419 return -EBUSY;
420 }
421
422
423 if (cur == &top_cpuset)
424 return 0;
425
426 par = cur->parent;
427
428
429 if (!is_cpuset_subset(trial, par))
430 return -EACCES;
431
432
433
434
435
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
437 c = cgroup_cs(cont);
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL;
446 }
447
448
449 if (cgroup_task_count(cur->css.cgroup)) {
450 if (cpumask_empty(trial->cpus_allowed) ||
451 nodes_empty(trial->mems_allowed)) {
452 return -ENOSPC;
453 }
454 }
455
456 return 0;
457}
458
459#ifdef CONFIG_SMP
460
461
462
463
464static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
465{
466 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
467}
468
469static void
470update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
471{
472 if (dattr->relax_domain_level < c->relax_domain_level)
473 dattr->relax_domain_level = c->relax_domain_level;
474 return;
475}
476
477static void
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
479{
480 LIST_HEAD(q);
481
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490
491 if (cpumask_empty(cp->cpus_allowed))
492 continue;
493
494 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 }
502}
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes)
560{
561 LIST_HEAD(q);
562 struct cpuset *cp;
563 struct cpuset **csa;
564 int csn;
565 int i, j, k;
566 cpumask_var_t *doms;
567 struct sched_domain_attr *dattr;
568 int ndoms = 0;
569 int nslot;
570
571 doms = NULL;
572 dattr = NULL;
573 csa = NULL;
574
575
576 if (is_sched_load_balance(&top_cpuset)) {
577 ndoms = 1;
578 doms = alloc_sched_domains(ndoms);
579 if (!doms)
580 goto done;
581
582 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
583 if (dattr) {
584 *dattr = SD_ATTR_INIT;
585 update_domain_attr_tree(dattr, &top_cpuset);
586 }
587 cpumask_copy(doms[0], top_cpuset.cpus_allowed);
588
589 goto done;
590 }
591
592 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
593 if (!csa)
594 goto done;
595 csn = 0;
596
597 list_add(&top_cpuset.stack_list, &q);
598 while (!list_empty(&q)) {
599 struct cgroup *cont;
600 struct cpuset *child;
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608
609
610
611
612
613
614 if (is_sched_load_balance(cp)) {
615 csa[csn++] = cp;
616 continue;
617 }
618
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
620 child = cgroup_cs(cont);
621 list_add_tail(&child->stack_list, &q);
622 }
623 }
624
625 for (i = 0; i < csn; i++)
626 csa[i]->pn = i;
627 ndoms = csn;
628
629restart:
630
631 for (i = 0; i < csn; i++) {
632 struct cpuset *a = csa[i];
633 int apn = a->pn;
634
635 for (j = 0; j < csn; j++) {
636 struct cpuset *b = csa[j];
637 int bpn = b->pn;
638
639 if (apn != bpn && cpusets_overlap(a, b)) {
640 for (k = 0; k < csn; k++) {
641 struct cpuset *c = csa[k];
642
643 if (c->pn == bpn)
644 c->pn = apn;
645 }
646 ndoms--;
647 goto restart;
648 }
649 }
650 }
651
652
653
654
655
656 doms = alloc_sched_domains(ndoms);
657 if (!doms)
658 goto done;
659
660
661
662
663
664 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
665
666 for (nslot = 0, i = 0; i < csn; i++) {
667 struct cpuset *a = csa[i];
668 struct cpumask *dp;
669 int apn = a->pn;
670
671 if (apn < 0) {
672
673 continue;
674 }
675
676 dp = doms[nslot];
677
678 if (nslot == ndoms) {
679 static int warnings = 10;
680 if (warnings) {
681 printk(KERN_WARNING
682 "rebuild_sched_domains confused:"
683 " nslot %d, ndoms %d, csn %d, i %d,"
684 " apn %d\n",
685 nslot, ndoms, csn, i, apn);
686 warnings--;
687 }
688 continue;
689 }
690
691 cpumask_clear(dp);
692 if (dattr)
693 *(dattr + nslot) = SD_ATTR_INIT;
694 for (j = i; j < csn; j++) {
695 struct cpuset *b = csa[j];
696
697 if (apn == b->pn) {
698 cpumask_or(dp, dp, b->cpus_allowed);
699 if (dattr)
700 update_domain_attr_tree(dattr + nslot, b);
701
702
703 b->pn = -1;
704 }
705 }
706 nslot++;
707 }
708 BUG_ON(nslot != ndoms);
709
710done:
711 kfree(csa);
712
713
714
715
716
717 if (doms == NULL)
718 ndoms = 1;
719
720 *domains = doms;
721 *attributes = dattr;
722 return ndoms;
723}
724
725
726
727
728
729
730
731
732
733
734
735static void do_rebuild_sched_domains(struct work_struct *unused)
736{
737 struct sched_domain_attr *attr;
738 cpumask_var_t *doms;
739 int ndoms;
740
741 get_online_cpus();
742
743
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747
748
749 partition_sched_domains(ndoms, doms, attr);
750
751 put_online_cpus();
752}
753#else
/* !CONFIG_SMP: there are no sched domains to rebuild. */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	/* nothing to do */
}
757
758static int generate_sched_domains(cpumask_var_t **domains,
759 struct sched_domain_attr **attributes)
760{
761 *domains = NULL;
762 return 1;
763}
764#endif
765
/* Work item that runs do_rebuild_sched_domains(); queued on cpuset_wq. */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792
793
794
795
796
797
798
799
800
801void rebuild_sched_domains(void)
802{
803 do_rebuild_sched_domains(NULL);
804}
805
806
807
808
809
810
811
812
813
814
815
816static int cpuset_test_cpumask(struct task_struct *tsk,
817 struct cgroup_scanner *scan)
818{
819 return !cpumask_equal(&tsk->cpus_allowed,
820 (cgroup_cs(scan->cg))->cpus_allowed);
821}
822
823
824
825
826
827
828
829
830
831
832
833
834static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan)
836{
837 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
838}
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
854{
855 struct cgroup_scanner scan;
856
857 scan.cg = cs->css.cgroup;
858 scan.test_task = cpuset_test_cpumask;
859 scan.process_task = cpuset_change_cpumask;
860 scan.heap = heap;
861 cgroup_scan_tasks(&scan);
862}
863
864
865
866
867
868
869static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
870 const char *buf)
871{
872 struct ptr_heap heap;
873 int retval;
874 int is_load_balanced;
875
876
877 if (cs == &top_cpuset)
878 return -EACCES;
879
880
881
882
883
884
885
886 if (!*buf) {
887 cpumask_clear(trialcs->cpus_allowed);
888 } else {
889 retval = cpulist_parse(buf, trialcs->cpus_allowed);
890 if (retval < 0)
891 return retval;
892
893 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
894 return -EINVAL;
895 }
896 retval = validate_change(cs, trialcs);
897 if (retval < 0)
898 return retval;
899
900
901 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
902 return 0;
903
904 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
905 if (retval)
906 return retval;
907
908 is_load_balanced = is_sched_load_balance(trialcs);
909
910 mutex_lock(&callback_mutex);
911 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
912 mutex_unlock(&callback_mutex);
913
914
915
916
917
918 update_tasks_cpumask(cs, &heap);
919
920 heap_free(&heap);
921
922 if (is_load_balanced)
923 async_rebuild_sched_domains();
924 return 0;
925}
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
948 const nodemask_t *to)
949{
950 struct task_struct *tsk = current;
951
952 tsk->mems_allowed = *to;
953
954 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
955
956 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
957}
958
959
960
961
962
963
964
965
966
967
/*
 * Replace @tsk's mems_allowed with @newmems and rebind its mempolicy.
 *
 * Does nothing if the calling thread has TIF_MEMDIE set or is exiting.
 * Otherwise, under task_lock(@tsk), mems_allowed is first widened to the
 * union of the old and new masks, the mempolicy is rebound in two steps,
 * and only then is mems_allowed narrowed to @newmems.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

	/*
	 * NOTE(review): both checks below look at the calling thread
	 * (test_thread_flag / current), not at @tsk - confirm this is intended.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING)
		return;

	task_lock(tsk);

	/*
	 * The seqcount write section is only entered when the task has a
	 * mempolicy or when the old and new masks share no node.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);

	if (need_loop)
		write_seqcount_begin(&tsk->mems_allowed_seq);

	/* Widen first so that no allowed node disappears mid-update. */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;

	if (need_loop)
		write_seqcount_end(&tsk->mems_allowed_seq);

	task_unlock(tsk);
}
1006
1007
1008
1009
1010
1011
1012static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan)
1014{
1015 struct mm_struct *mm;
1016 struct cpuset *cs;
1017 int migrate;
1018 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems;
1020
1021 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems);
1023
1024 cpuset_change_task_nodemask(p, &newmems);
1025
1026 mm = get_task_mm(p);
1027 if (!mm)
1028 return;
1029
1030 migrate = is_memory_migrate(cs);
1031
1032 mpol_rebind_mm(mm, &cs->mems_allowed);
1033 if (migrate)
1034 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1035 mmput(mm);
1036}
1037
/*
 * The cpuset whose tasks update_tasks_nodemask() is currently scanning,
 * or NULL.  Read by current_cpuset_is_being_rebound().
 */
static void *cpuset_being_rebound;
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051 struct ptr_heap *heap)
1052{
1053 struct cgroup_scanner scan;
1054
1055 cpuset_being_rebound = cs;
1056
1057 scan.cg = cs->css.cgroup;
1058 scan.test_task = NULL;
1059 scan.process_task = cpuset_change_nodemask;
1060 scan.heap = heap;
1061 scan.data = (nodemask_t *)oldmem;
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073 cgroup_scan_tasks(&scan);
1074
1075
1076 cpuset_being_rebound = NULL;
1077}
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1093 const char *buf)
1094{
1095 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1096 int retval;
1097 struct ptr_heap heap;
1098
1099 if (!oldmem)
1100 return -ENOMEM;
1101
1102
1103
1104
1105
1106 if (cs == &top_cpuset) {
1107 retval = -EACCES;
1108 goto done;
1109 }
1110
1111
1112
1113
1114
1115
1116
1117 if (!*buf) {
1118 nodes_clear(trialcs->mems_allowed);
1119 } else {
1120 retval = nodelist_parse(buf, trialcs->mems_allowed);
1121 if (retval < 0)
1122 goto done;
1123
1124 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) {
1126 retval = -EINVAL;
1127 goto done;
1128 }
1129 }
1130 *oldmem = cs->mems_allowed;
1131 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1132 retval = 0;
1133 goto done;
1134 }
1135 retval = validate_change(cs, trialcs);
1136 if (retval < 0)
1137 goto done;
1138
1139 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1140 if (retval < 0)
1141 goto done;
1142
1143 mutex_lock(&callback_mutex);
1144 cs->mems_allowed = trialcs->mems_allowed;
1145 mutex_unlock(&callback_mutex);
1146
1147 update_tasks_nodemask(cs, oldmem, &heap);
1148
1149 heap_free(&heap);
1150done:
1151 NODEMASK_FREE(oldmem);
1152 return retval;
1153}
1154
1155int current_cpuset_is_being_rebound(void)
1156{
1157 return task_cs(current) == cpuset_being_rebound;
1158}
1159
1160static int update_relax_domain_level(struct cpuset *cs, s64 val)
1161{
1162#ifdef CONFIG_SMP
1163 if (val < -1 || val >= sched_domain_level_max)
1164 return -EINVAL;
1165#endif
1166
1167 if (val != cs->relax_domain_level) {
1168 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains();
1172 }
1173
1174 return 0;
1175}
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan)
1189{
1190 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1191}
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1207{
1208 struct cgroup_scanner scan;
1209
1210 scan.cg = cs->css.cgroup;
1211 scan.test_task = NULL;
1212 scan.process_task = cpuset_change_flag;
1213 scan.heap = heap;
1214 cgroup_scan_tasks(&scan);
1215}
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1227 int turning_on)
1228{
1229 struct cpuset *trialcs;
1230 int balance_flag_changed;
1231 int spread_flag_changed;
1232 struct ptr_heap heap;
1233 int err;
1234
1235 trialcs = alloc_trial_cpuset(cs);
1236 if (!trialcs)
1237 return -ENOMEM;
1238
1239 if (turning_on)
1240 set_bit(bit, &trialcs->flags);
1241 else
1242 clear_bit(bit, &trialcs->flags);
1243
1244 err = validate_change(cs, trialcs);
1245 if (err < 0)
1246 goto out;
1247
1248 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1249 if (err < 0)
1250 goto out;
1251
1252 balance_flag_changed = (is_sched_load_balance(cs) !=
1253 is_sched_load_balance(trialcs));
1254
1255 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1256 || (is_spread_page(cs) != is_spread_page(trialcs)));
1257
1258 mutex_lock(&callback_mutex);
1259 cs->flags = trialcs->flags;
1260 mutex_unlock(&callback_mutex);
1261
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains();
1264
1265 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap);
1267 heap_free(&heap);
1268out:
1269 free_trial_cpuset(trialcs);
1270 return err;
1271}
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
/* Tuning constants of the frequency meter (struct fmeter). */
#define FM_COEF 933		/* per-second decay factor, as a fraction of FM_SCALE */
#define FM_MAXTICKS ((time_t)99) /* cap on the decay steps applied in one update */
#define FM_MAXCNT 1000000	/* cap on fmeter.cnt */
#define FM_SCALE 1000		/* fixed-point scale; one event adds FM_SCALE to cnt */
1322
1323
1324static void fmeter_init(struct fmeter *fmp)
1325{
1326 fmp->cnt = 0;
1327 fmp->val = 0;
1328 fmp->time = 0;
1329 spin_lock_init(&fmp->lock);
1330}
1331
1332
1333static void fmeter_update(struct fmeter *fmp)
1334{
1335 time_t now = get_seconds();
1336 time_t ticks = now - fmp->time;
1337
1338 if (ticks == 0)
1339 return;
1340
1341 ticks = min(FM_MAXTICKS, ticks);
1342 while (ticks-- > 0)
1343 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1344 fmp->time = now;
1345
1346 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1347 fmp->cnt = 0;
1348}
1349
1350
1351static void fmeter_markevent(struct fmeter *fmp)
1352{
1353 spin_lock(&fmp->lock);
1354 fmeter_update(fmp);
1355 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1356 spin_unlock(&fmp->lock);
1357}
1358
1359
1360static int fmeter_getrate(struct fmeter *fmp)
1361{
1362 int val;
1363
1364 spin_lock(&fmp->lock);
1365 fmeter_update(fmp);
1366 val = fmp->val;
1367 spin_unlock(&fmp->lock);
1368 return val;
1369}
1370
1371
1372
1373
1374
1375
/*
 * Scratch state shared by cpuset_can_attach() and cpuset_attach():
 * cpus_attach and cpuset_attach_nodemask_to are filled in by
 * cpuset_can_attach() and consumed by cpuset_attach();
 * cpuset_attach_nodemask_from is set inside cpuset_attach().
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_from;
static nodemask_t cpuset_attach_nodemask_to;
1379
1380
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{
1383 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task;
1385 int ret;
1386
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC;
1389
1390 cgroup_taskset_for_each(task, cgrp, tset) {
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL;
1402 if ((ret = security_task_setscheduler(task)))
1403 return ret;
1404 }
1405
1406
1407 if (cs == &top_cpuset)
1408 cpumask_copy(cpus_attach, cpu_possible_mask);
1409 else
1410 guarantee_online_cpus(cs, cpus_attach);
1411
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1413
1414 return 0;
1415}
1416
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{
1419 struct mm_struct *mm;
1420 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset);
1422 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1423 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425
1426 cgroup_taskset_for_each(task, cgrp, tset) {
1427
1428
1429
1430
1431 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1432
1433 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1434 cpuset_update_task_spread_flag(cs, task);
1435 }
1436
1437
1438
1439
1440
1441 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1442 cpuset_attach_nodemask_to = cs->mems_allowed;
1443 mm = get_task_mm(leader);
1444 if (mm) {
1445 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1446 if (is_memory_migrate(cs))
1447 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1448 &cpuset_attach_nodemask_to);
1449 mmput(mm);
1450 }
1451}
1452
1453
1454
/*
 * Identifies a cpuset control file.  Stored in cftype.private (see
 * files[]) and switched on by the read/write handlers.
 */
typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;
1469
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private;
1475
1476 if (!cgroup_lock_live_group(cgrp))
1477 return -ENODEV;
1478
1479 switch (type) {
1480 case FILE_CPU_EXCLUSIVE:
1481 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1482 break;
1483 case FILE_MEM_EXCLUSIVE:
1484 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1485 break;
1486 case FILE_MEM_HARDWALL:
1487 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1488 break;
1489 case FILE_SCHED_LOAD_BALANCE:
1490 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1491 break;
1492 case FILE_MEMORY_MIGRATE:
1493 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1494 break;
1495 case FILE_MEMORY_PRESSURE_ENABLED:
1496 cpuset_memory_pressure_enabled = !!val;
1497 break;
1498 case FILE_MEMORY_PRESSURE:
1499 retval = -EACCES;
1500 break;
1501 case FILE_SPREAD_PAGE:
1502 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1503 break;
1504 case FILE_SPREAD_SLAB:
1505 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1506 break;
1507 default:
1508 retval = -EINVAL;
1509 break;
1510 }
1511 cgroup_unlock();
1512 return retval;
1513}
1514
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private;
1520
1521 if (!cgroup_lock_live_group(cgrp))
1522 return -ENODEV;
1523
1524 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1526 retval = update_relax_domain_level(cs, val);
1527 break;
1528 default:
1529 retval = -EINVAL;
1530 break;
1531 }
1532 cgroup_unlock();
1533 return retval;
1534}
1535
1536
1537
1538
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf)
1541{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs;
1545
1546 if (!cgroup_lock_live_group(cgrp))
1547 return -ENODEV;
1548
1549 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) {
1551 retval = -ENOMEM;
1552 goto out;
1553 }
1554
1555 switch (cft->private) {
1556 case FILE_CPULIST:
1557 retval = update_cpumask(cs, trialcs, buf);
1558 break;
1559 case FILE_MEMLIST:
1560 retval = update_nodemask(cs, trialcs, buf);
1561 break;
1562 default:
1563 retval = -EINVAL;
1564 break;
1565 }
1566
1567 free_trial_cpuset(trialcs);
1568out:
1569 cgroup_unlock();
1570 return retval;
1571}
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1586{
1587 size_t count;
1588
1589 mutex_lock(&callback_mutex);
1590 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1591 mutex_unlock(&callback_mutex);
1592
1593 return count;
1594}
1595
1596static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1597{
1598 size_t count;
1599
1600 mutex_lock(&callback_mutex);
1601 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1602 mutex_unlock(&callback_mutex);
1603
1604 return count;
1605}
1606
1607static ssize_t cpuset_common_file_read(struct cgroup *cont,
1608 struct cftype *cft,
1609 struct file *file,
1610 char __user *buf,
1611 size_t nbytes, loff_t *ppos)
1612{
1613 struct cpuset *cs = cgroup_cs(cont);
1614 cpuset_filetype_t type = cft->private;
1615 char *page;
1616 ssize_t retval = 0;
1617 char *s;
1618
1619 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1620 return -ENOMEM;
1621
1622 s = page;
1623
1624 switch (type) {
1625 case FILE_CPULIST:
1626 s += cpuset_sprintf_cpulist(s, cs);
1627 break;
1628 case FILE_MEMLIST:
1629 s += cpuset_sprintf_memlist(s, cs);
1630 break;
1631 default:
1632 retval = -EINVAL;
1633 goto out;
1634 }
1635 *s++ = '\n';
1636
1637 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1638out:
1639 free_page((unsigned long)page);
1640 return retval;
1641}
1642
1643static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1644{
1645 struct cpuset *cs = cgroup_cs(cont);
1646 cpuset_filetype_t type = cft->private;
1647 switch (type) {
1648 case FILE_CPU_EXCLUSIVE:
1649 return is_cpu_exclusive(cs);
1650 case FILE_MEM_EXCLUSIVE:
1651 return is_mem_exclusive(cs);
1652 case FILE_MEM_HARDWALL:
1653 return is_mem_hardwall(cs);
1654 case FILE_SCHED_LOAD_BALANCE:
1655 return is_sched_load_balance(cs);
1656 case FILE_MEMORY_MIGRATE:
1657 return is_memory_migrate(cs);
1658 case FILE_MEMORY_PRESSURE_ENABLED:
1659 return cpuset_memory_pressure_enabled;
1660 case FILE_MEMORY_PRESSURE:
1661 return fmeter_getrate(&cs->fmeter);
1662 case FILE_SPREAD_PAGE:
1663 return is_spread_page(cs);
1664 case FILE_SPREAD_SLAB:
1665 return is_spread_slab(cs);
1666 default:
1667 BUG();
1668 }
1669
1670
1671 return 0;
1672}
1673
1674static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1675{
1676 struct cpuset *cs = cgroup_cs(cont);
1677 cpuset_filetype_t type = cft->private;
1678 switch (type) {
1679 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1680 return cs->relax_domain_level;
1681 default:
1682 BUG();
1683 }
1684
1685
1686 return 0;
1687}
1688
1689
1690
1691
1692
1693
/*
 * Control files of the cpuset subsystem.  Each entry's .private holds
 * the cpuset_filetype_t value that the handlers switch on.  The array
 * ends with an empty entry.
 */
static struct cftype files[] = {
	/* CPU list: string read/write */
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	/* memory node list: string read/write */
	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	/* flag files: u64 read/write */
	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	/* the only s64-valued file */
	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	/* read-only mode; cpuset_write_u64() also rejects writes with -EACCES */
	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	/* flagged CFTYPE_ONLY_ON_ROOT */
	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823
1824
1825
1826
1827
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1829{
1830 struct cpuset *cs;
1831 struct cpuset *parent;
1832
1833 if (!cont->parent) {
1834 return &top_cpuset.css;
1835 }
1836 parent = cgroup_cs(cont->parent);
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs)
1839 return ERR_PTR(-ENOMEM);
1840 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1841 kfree(cs);
1842 return ERR_PTR(-ENOMEM);
1843 }
1844
1845 cs->flags = 0;
1846 if (is_spread_page(parent))
1847 set_bit(CS_SPREAD_PAGE, &cs->flags);
1848 if (is_spread_slab(parent))
1849 set_bit(CS_SPREAD_SLAB, &cs->flags);
1850 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1851 cpumask_clear(cs->cpus_allowed);
1852 nodes_clear(cs->mems_allowed);
1853 fmeter_init(&cs->fmeter);
1854 cs->relax_domain_level = -1;
1855
1856 cs->parent = parent;
1857 number_of_cpusets++;
1858 return &cs->css ;
1859}
1860
1861
1862
1863
1864
1865
1866
1867static void cpuset_destroy(struct cgroup *cont)
1868{
1869 struct cpuset *cs = cgroup_cs(cont);
1870
1871 if (is_sched_load_balance(cs))
1872 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1873
1874 number_of_cpusets--;
1875 free_cpumask_var(cs->cpus_allowed);
1876 kfree(cs);
1877}
1878
1879struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset",
1881 .create = cpuset_create,
1882 .destroy = cpuset_destroy,
1883 .can_attach = cpuset_can_attach,
1884 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files,
1888 .early_init = 1,
1889};
1890
1891
1892
1893
1894
1895
1896
1897int __init cpuset_init(void)
1898{
1899 int err = 0;
1900
1901 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1902 BUG();
1903
1904 cpumask_setall(top_cpuset.cpus_allowed);
1905 nodes_setall(top_cpuset.mems_allowed);
1906
1907 fmeter_init(&top_cpuset.fmeter);
1908 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1909 top_cpuset.relax_domain_level = -1;
1910
1911 err = register_filesystem(&cpuset_fs_type);
1912 if (err < 0)
1913 return err;
1914
1915 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1916 BUG();
1917
1918 number_of_cpusets = 1;
1919 return 0;
1920}
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930static void cpuset_do_move_task(struct task_struct *tsk,
1931 struct cgroup_scanner *scan)
1932{
1933 struct cgroup *new_cgroup = scan->data;
1934
1935 cgroup_attach_task(new_cgroup, tsk);
1936}
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1950{
1951 struct cgroup_scanner scan;
1952
1953 scan.cg = from->css.cgroup;
1954 scan.test_task = NULL;
1955 scan.process_task = cpuset_do_move_task;
1956 scan.heap = NULL;
1957 scan.data = to->css.cgroup;
1958
1959 if (cgroup_scan_tasks(&scan))
1960 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1961 "cgroup_scan_tasks failed\n");
1962}
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1975{
1976 struct cpuset *parent;
1977
1978
1979
1980
1981
1982
1983 if (list_empty(&cs->css.cgroup->css_sets))
1984 return;
1985
1986
1987
1988
1989
1990 parent = cs->parent;
1991 while (cpumask_empty(parent->cpus_allowed) ||
1992 nodes_empty(parent->mems_allowed))
1993 parent = parent->parent;
1994
1995 move_member_tasks_to_cpuset(cs, parent);
1996}
1997
1998
1999
2000
2001
2002
2003
2004static struct cpuset *cpuset_next(struct list_head *queue)
2005{
2006 struct cpuset *cp;
2007 struct cpuset *child;
2008 struct cgroup *cont;
2009
2010 if (list_empty(queue))
2011 return NULL;
2012
2013 cp = list_first_entry(queue, struct cpuset, stack_list);
2014 list_del(queue->next);
2015 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2016 child = cgroup_cs(cont);
2017 list_add_tail(&child->stack_list, queue);
2018 }
2019
2020 return cp;
2021}
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040static void
2041scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2042{
2043 LIST_HEAD(queue);
2044 struct cpuset *cp;
2045 static nodemask_t oldmems;
2046
2047 list_add_tail((struct list_head *)&root->stack_list, &queue);
2048
2049 switch (event) {
2050 case CPUSET_CPU_OFFLINE:
2051 while ((cp = cpuset_next(&queue)) != NULL) {
2052
2053
2054 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
2055 continue;
2056
2057
2058 mutex_lock(&callback_mutex);
2059 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2060 cpu_active_mask);
2061 mutex_unlock(&callback_mutex);
2062
2063
2064 if (cpumask_empty(cp->cpus_allowed))
2065 remove_tasks_in_empty_cpuset(cp);
2066 else
2067 update_tasks_cpumask(cp, NULL);
2068 }
2069 break;
2070
2071 case CPUSET_MEM_OFFLINE:
2072 while ((cp = cpuset_next(&queue)) != NULL) {
2073
2074
2075 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY]))
2077 continue;
2078
2079 oldmems = cp->mems_allowed;
2080
2081
2082 mutex_lock(&callback_mutex);
2083 nodes_and(cp->mems_allowed, cp->mems_allowed,
2084 node_states[N_HIGH_MEMORY]);
2085 mutex_unlock(&callback_mutex);
2086
2087
2088 if (nodes_empty(cp->mems_allowed))
2089 remove_tasks_in_empty_cpuset(cp);
2090 else
2091 update_tasks_nodemask(cp, &oldmems, NULL);
2092 }
2093 }
2094}
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114void cpuset_update_active_cpus(bool cpu_online)
2115{
2116 struct sched_domain_attr *attr;
2117 cpumask_var_t *doms;
2118 int ndoms;
2119
2120 cgroup_lock();
2121 mutex_lock(&callback_mutex);
2122 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2123 mutex_unlock(&callback_mutex);
2124
2125 if (!cpu_online)
2126 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
2127
2128 ndoms = generate_sched_domains(&doms, &attr);
2129 cgroup_unlock();
2130
2131
2132 partition_sched_domains(ndoms, doms, attr);
2133}
2134
2135#ifdef CONFIG_MEMORY_HOTPLUG
2136
2137
2138
2139
2140
2141static int cpuset_track_online_nodes(struct notifier_block *self,
2142 unsigned long action, void *arg)
2143{
2144 static nodemask_t oldmems;
2145
2146 cgroup_lock();
2147 switch (action) {
2148 case MEM_ONLINE:
2149 oldmems = top_cpuset.mems_allowed;
2150 mutex_lock(&callback_mutex);
2151 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2152 mutex_unlock(&callback_mutex);
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2154 break;
2155 case MEM_OFFLINE:
2156
2157
2158
2159
2160 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2161 break;
2162 default:
2163 break;
2164 }
2165 cgroup_unlock();
2166
2167 return NOTIFY_OK;
2168}
2169#endif
2170
2171
2172
2173
2174
2175
2176
2177void __init cpuset_init_smp(void)
2178{
2179 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2180 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2181
2182 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2183
2184 cpuset_wq = create_singlethread_workqueue("cpuset");
2185 BUG_ON(!cpuset_wq);
2186}
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2200{
2201 mutex_lock(&callback_mutex);
2202 task_lock(tsk);
2203 guarantee_online_cpus(task_cs(tsk), pmask);
2204 task_unlock(tsk);
2205 mutex_unlock(&callback_mutex);
2206}
2207
2208void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2209{
2210 const struct cpuset *cs;
2211
2212 rcu_read_lock();
2213 cs = task_cs(tsk);
2214 if (cs)
2215 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2216 rcu_read_unlock();
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235}
2236
/*
 * cpuset_init_current_mems_allowed - allow every node for the current task
 *
 * Sets all bits in current->mems_allowed.
 */
void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2253{
2254 nodemask_t mask;
2255
2256 mutex_lock(&callback_mutex);
2257 task_lock(tsk);
2258 guarantee_online_mems(task_cs(tsk), &mask);
2259 task_unlock(tsk);
2260 mutex_unlock(&callback_mutex);
2261
2262 return mask;
2263}
2264
2265
2266
2267
2268
2269
2270
/*
 * cpuset_nodemask_valid_mems_allowed - does a nodemask overlap ours?
 * @nodemask: mask to test
 *
 * Returns the result of nodes_intersects() between @nodemask and
 * current->mems_allowed.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
2275
2276
2277
2278
2279
2280
2281
2282static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2283{
2284 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
2285 cs = cs->parent;
2286 return cs;
2287}
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2351{
2352 const struct cpuset *cs;
2353 int allowed;
2354
2355 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2356 return 1;
2357 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2358 if (node_isset(node, current->mems_allowed))
2359 return 1;
2360
2361
2362
2363
2364 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2365 return 1;
2366 if (gfp_mask & __GFP_HARDWALL)
2367 return 0;
2368
2369 if (current->flags & PF_EXITING)
2370 return 1;
2371
2372
2373 mutex_lock(&callback_mutex);
2374
2375 task_lock(current);
2376 cs = nearest_hardwall_ancestor(task_cs(current));
2377 task_unlock(current);
2378
2379 allowed = node_isset(node, cs->mems_allowed);
2380 mutex_unlock(&callback_mutex);
2381 return allowed;
2382}
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2408{
2409 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2410 return 1;
2411 if (node_isset(node, current->mems_allowed))
2412 return 1;
2413
2414
2415
2416
2417 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2418 return 1;
2419 return 0;
2420}
2421
2422
2423
2424
2425
2426
2427
/*
 * cpuset_unlock - release callback_mutex
 *
 * Exported wrapper so that code outside this file can drop callback_mutex.
 */
void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460static int cpuset_spread_node(int *rotor)
2461{
2462 int node;
2463
2464 node = next_node(*rotor, current->mems_allowed);
2465 if (node == MAX_NUMNODES)
2466 node = first_node(current->mems_allowed);
2467 *rotor = node;
2468 return node;
2469}
2470
2471int cpuset_mem_spread_node(void)
2472{
2473 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2474 current->cpuset_mem_spread_rotor =
2475 node_random(¤t->mems_allowed);
2476
2477 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
2478}
2479
2480int cpuset_slab_spread_node(void)
2481{
2482 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2483 current->cpuset_slab_spread_rotor =
2484 node_random(¤t->mems_allowed);
2485
2486 return cpuset_spread_node(¤t->cpuset_slab_spread_rotor);
2487}
2488
2489EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
/*
 * cpuset_mems_allowed_intersects - do two tasks share an allowed node?
 * @tsk1: first task
 * @tsk2: second task
 *
 * Returns the result of nodes_intersects() on the two tasks' mems_allowed
 * masks.  No locks are taken here.
 */
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2517{
2518 struct dentry *dentry;
2519
2520 dentry = task_cs(tsk)->css.cgroup->dentry;
2521 spin_lock(&cpuset_buffer_lock);
2522 snprintf(cpuset_name, CPUSET_NAME_LEN,
2523 dentry ? (const char *)dentry->d_name.name : "/");
2524 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2525 tsk->mems_allowed);
2526 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2527 tsk->comm, cpuset_name, cpuset_nodelist);
2528 spin_unlock(&cpuset_buffer_lock);
2529}
2530
2531
2532
2533
2534
2535
2536
/*
 * Global switch for memory-pressure accounting.
 * NOTE(review): presumably backed by the "memory_pressure_enabled" control
 * file and tested by callers of __cpuset_memory_pressure_bump() - confirm.
 */
int cpuset_memory_pressure_enabled __read_mostly;
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557void __cpuset_memory_pressure_bump(void)
2558{
2559 task_lock(current);
2560 fmeter_markevent(&task_cs(current)->fmeter);
2561 task_unlock(current);
2562}
2563
2564#ifdef CONFIG_PROC_PID_CPUSET
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2575{
2576 struct pid *pid;
2577 struct task_struct *tsk;
2578 char *buf;
2579 struct cgroup_subsys_state *css;
2580 int retval;
2581
2582 retval = -ENOMEM;
2583 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2584 if (!buf)
2585 goto out;
2586
2587 retval = -ESRCH;
2588 pid = m->private;
2589 tsk = get_pid_task(pid, PIDTYPE_PID);
2590 if (!tsk)
2591 goto out_free;
2592
2593 retval = -EINVAL;
2594 cgroup_lock();
2595 css = task_subsys_state(tsk, cpuset_subsys_id);
2596 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2597 if (retval < 0)
2598 goto out_unlock;
2599 seq_puts(m, buf);
2600 seq_putc(m, '\n');
2601out_unlock:
2602 cgroup_unlock();
2603 put_task_struct(tsk);
2604out_free:
2605 kfree(buf);
2606out:
2607 return retval;
2608}
2609
2610static int cpuset_open(struct inode *inode, struct file *file)
2611{
2612 struct pid *pid = PROC_I(inode)->pid;
2613 return single_open(file, proc_cpuset_show, pid);
2614}
2615
2616const struct file_operations proc_cpuset_operations = {
2617 .open = cpuset_open,
2618 .read = seq_read,
2619 .llseek = seq_lseek,
2620 .release = single_release,
2621};
2622#endif
2623
2624
/*
 * cpuset_task_status_allowed - print a task's mems_allowed to a seq_file
 * @m:    seq_file to write to
 * @task: task whose mems_allowed is reported
 *
 * Emits two lines: "Mems_allowed:" followed by the mask as printed by
 * seq_nodemask(), and "Mems_allowed_list:" followed by the same mask as
 * printed by seq_nodemask_list().
 */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t");
	seq_nodemask(m, &task->mems_allowed);
	seq_printf(m, "\n");
	seq_printf(m, "Mems_allowed_list:\t");
	seq_nodemask_list(m, &task->mems_allowed);
	seq_printf(m, "\n");
}
2634