1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include "cgroup-internal.h"
32
33#include <linux/bpf-cgroup.h>
34#include <linux/cred.h>
35#include <linux/errno.h>
36#include <linux/init_task.h>
37#include <linux/kernel.h>
38#include <linux/magic.h>
39#include <linux/mutex.h>
40#include <linux/mount.h>
41#include <linux/pagemap.h>
42#include <linux/proc_fs.h>
43#include <linux/rcupdate.h>
44#include <linux/sched.h>
45#include <linux/sched/task.h>
46#include <linux/slab.h>
47#include <linux/spinlock.h>
48#include <linux/percpu-rwsem.h>
49#include <linux/string.h>
50#include <linux/hashtable.h>
51#include <linux/idr.h>
52#include <linux/kthread.h>
53#include <linux/atomic.h>
54#include <linux/cpuset.h>
55#include <linux/proc_ns.h>
56#include <linux/nsproxy.h>
57#include <linux/file.h>
58#include <linux/fs_parser.h>
59#include <linux/sched/cputime.h>
60#include <linux/psi.h>
61#include <net/sock.h>
62
63#define CREATE_TRACE_POINTS
64#include <trace/events/cgroup.h>
65
66#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
67 MAX_CFTYPE_NAME + 2)
68
69#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
70
71
72
73
74
75
76
77#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
78
79
80
81
82
83
84
85
86
87
88
89DEFINE_MUTEX(cgroup_mutex);
90DEFINE_SPINLOCK(css_set_lock);
91
92#ifdef CONFIG_PROVE_RCU
93EXPORT_SYMBOL_GPL(cgroup_mutex);
94EXPORT_SYMBOL_GPL(css_set_lock);
95#endif
96
97DEFINE_SPINLOCK(trace_cgroup_path_lock);
98char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
99static bool cgroup_debug __read_mostly;
100
101
102
103
104
105static DEFINE_SPINLOCK(cgroup_idr_lock);
106
107
108
109
110
111static DEFINE_SPINLOCK(cgroup_file_kn_lock);
112
113DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
114
115#define cgroup_assert_mutex_or_rcu_locked() \
116 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
117 !lockdep_is_held(&cgroup_mutex), \
118 "cgroup_mutex or RCU read lock required");
119
120
121
122
123
124
125
126static struct workqueue_struct *cgroup_destroy_wq;
127
128
129#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
130struct cgroup_subsys *cgroup_subsys[] = {
131#include <linux/cgroup_subsys.h>
132};
133#undef SUBSYS
134
135
136#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
137static const char *cgroup_subsys_name[] = {
138#include <linux/cgroup_subsys.h>
139};
140#undef SUBSYS
141
142
143#define SUBSYS(_x) \
144 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
145 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
146 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
147 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
148#include <linux/cgroup_subsys.h>
149#undef SUBSYS
150
151#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
152static struct static_key_true *cgroup_subsys_enabled_key[] = {
153#include <linux/cgroup_subsys.h>
154};
155#undef SUBSYS
156
157#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
158static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
159#include <linux/cgroup_subsys.h>
160};
161#undef SUBSYS
162
163static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
164
165
166struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
167EXPORT_SYMBOL_GPL(cgrp_dfl_root);
168
169
170
171
172
173static bool cgrp_dfl_visible;
174
175
176static u16 cgrp_dfl_inhibit_ss_mask;
177
178
179static u16 cgrp_dfl_implicit_ss_mask;
180
181
182static u16 cgrp_dfl_threaded_ss_mask;
183
184
185LIST_HEAD(cgroup_roots);
186static int cgroup_root_count;
187
188
189static DEFINE_IDR(cgroup_hierarchy_idr);
190
191
192
193
194
195
196
197
198static u64 css_serial_nr_next = 1;
199
200
201
202
203
204static u16 have_fork_callback __read_mostly;
205static u16 have_exit_callback __read_mostly;
206static u16 have_release_callback __read_mostly;
207static u16 have_canfork_callback __read_mostly;
208
209
210struct cgroup_namespace init_cgroup_ns = {
211 .ns.count = REFCOUNT_INIT(2),
212 .user_ns = &init_user_ns,
213 .ns.ops = &cgroupns_operations,
214 .ns.inum = PROC_CGROUP_INIT_INO,
215 .root_cset = &init_css_set,
216};
217
218static struct file_system_type cgroup2_fs_type;
219static struct cftype cgroup_base_files[];
220
221
222enum cgroup_opt_features {
223#ifdef CONFIG_PSI
224 OPT_FEATURE_PRESSURE,
225#endif
226 OPT_FEATURE_COUNT
227};
228
229static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
230#ifdef CONFIG_PSI
231 "pressure",
232#endif
233};
234
235static u16 cgroup_feature_disable_mask __read_mostly;
236
237static int cgroup_apply_control(struct cgroup *cgrp);
238static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
239static void css_task_iter_skip(struct css_task_iter *it,
240 struct task_struct *task);
241static int cgroup_destroy_locked(struct cgroup *cgrp);
242static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
243 struct cgroup_subsys *ss);
244static void css_release(struct percpu_ref *ref);
245static void kill_css(struct cgroup_subsys_state *css);
246static int cgroup_addrm_files(struct cgroup_subsys_state *css,
247 struct cgroup *cgrp, struct cftype cfts[],
248 bool is_add);
249
250
251
252
253
254
255
256
257
258bool cgroup_ssid_enabled(int ssid)
259{
260 if (!CGROUP_HAS_SUBSYS_CONFIG)
261 return false;
262
263 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
264}
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311bool cgroup_on_dfl(const struct cgroup *cgrp)
312{
313 return cgrp->root == &cgrp_dfl_root;
314}
315
316
317static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
318 gfp_t gfp_mask)
319{
320 int ret;
321
322 idr_preload(gfp_mask);
323 spin_lock_bh(&cgroup_idr_lock);
324 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
325 spin_unlock_bh(&cgroup_idr_lock);
326 idr_preload_end();
327 return ret;
328}
329
330static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
331{
332 void *ret;
333
334 spin_lock_bh(&cgroup_idr_lock);
335 ret = idr_replace(idr, ptr, id);
336 spin_unlock_bh(&cgroup_idr_lock);
337 return ret;
338}
339
340static void cgroup_idr_remove(struct idr *idr, int id)
341{
342 spin_lock_bh(&cgroup_idr_lock);
343 idr_remove(idr, id);
344 spin_unlock_bh(&cgroup_idr_lock);
345}
346
347static bool cgroup_has_tasks(struct cgroup *cgrp)
348{
349 return cgrp->nr_populated_csets;
350}
351
352bool cgroup_is_threaded(struct cgroup *cgrp)
353{
354 return cgrp->dom_cgrp != cgrp;
355}
356
357
358static bool cgroup_is_mixable(struct cgroup *cgrp)
359{
360
361
362
363
364
365 return !cgroup_parent(cgrp);
366}
367
368
369static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
370{
371
372 if (cgroup_is_mixable(cgrp))
373 return true;
374
375
376 if (cgroup_is_threaded(cgrp))
377 return false;
378
379
380 if (cgrp->nr_populated_domain_children)
381 return false;
382
383
384 if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
385 return false;
386
387 return true;
388}
389
390
391bool cgroup_is_thread_root(struct cgroup *cgrp)
392{
393
394 if (cgroup_is_threaded(cgrp))
395 return false;
396
397
398 if (cgrp->nr_threaded_children)
399 return true;
400
401
402
403
404
405 if (cgroup_has_tasks(cgrp) &&
406 (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
407 return true;
408
409 return false;
410}
411
412
413static bool cgroup_is_valid_domain(struct cgroup *cgrp)
414{
415
416 if (cgroup_is_threaded(cgrp))
417 return false;
418
419
420 while ((cgrp = cgroup_parent(cgrp))) {
421 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
422 return false;
423 if (cgroup_is_threaded(cgrp))
424 return false;
425 }
426
427 return true;
428}
429
430
431static u16 cgroup_control(struct cgroup *cgrp)
432{
433 struct cgroup *parent = cgroup_parent(cgrp);
434 u16 root_ss_mask = cgrp->root->subsys_mask;
435
436 if (parent) {
437 u16 ss_mask = parent->subtree_control;
438
439
440 if (cgroup_is_threaded(cgrp))
441 ss_mask &= cgrp_dfl_threaded_ss_mask;
442 return ss_mask;
443 }
444
445 if (cgroup_on_dfl(cgrp))
446 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
447 cgrp_dfl_implicit_ss_mask);
448 return root_ss_mask;
449}
450
451
452static u16 cgroup_ss_mask(struct cgroup *cgrp)
453{
454 struct cgroup *parent = cgroup_parent(cgrp);
455
456 if (parent) {
457 u16 ss_mask = parent->subtree_ss_mask;
458
459
460 if (cgroup_is_threaded(cgrp))
461 ss_mask &= cgrp_dfl_threaded_ss_mask;
462 return ss_mask;
463 }
464
465 return cgrp->root->subsys_mask;
466}
467
468
469
470
471
472
473
474
475
476
477
478
479static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
480 struct cgroup_subsys *ss)
481{
482 if (CGROUP_HAS_SUBSYS_CONFIG && ss)
483 return rcu_dereference_check(cgrp->subsys[ss->id],
484 lockdep_is_held(&cgroup_mutex));
485 else
486 return &cgrp->self;
487}
488
489
490
491
492
493
494
495
496
497static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
498 struct cgroup_subsys *ss)
499{
500 struct cgroup_subsys_state *css;
501
502 rcu_read_lock();
503 css = cgroup_css(cgrp, ss);
504 if (css && !css_tryget_online(css))
505 css = NULL;
506 rcu_read_unlock();
507
508 return css;
509}
510
511
512
513
514
515
516
517
518
519
520
521static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
522 struct cgroup_subsys *ss)
523{
524 lockdep_assert_held(&cgroup_mutex);
525
526 if (!ss)
527 return &cgrp->self;
528
529
530
531
532
533 while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
534 cgrp = cgroup_parent(cgrp);
535 if (!cgrp)
536 return NULL;
537 }
538
539 return cgroup_css(cgrp, ss);
540}
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
556 struct cgroup_subsys *ss)
557{
558 struct cgroup_subsys_state *css;
559
560 if (!CGROUP_HAS_SUBSYS_CONFIG)
561 return NULL;
562
563 do {
564 css = cgroup_css(cgrp, ss);
565
566 if (css)
567 return css;
568 cgrp = cgroup_parent(cgrp);
569 } while (cgrp);
570
571 return init_css_set.subsys[ss->id];
572}
573
574
575
576
577
578
579
580
581
582
583
584
585struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
586 struct cgroup_subsys *ss)
587{
588 struct cgroup_subsys_state *css;
589
590 if (!CGROUP_HAS_SUBSYS_CONFIG)
591 return NULL;
592
593 rcu_read_lock();
594
595 do {
596 css = cgroup_css(cgrp, ss);
597
598 if (css && css_tryget_online(css))
599 goto out_unlock;
600 cgrp = cgroup_parent(cgrp);
601 } while (cgrp);
602
603 css = init_css_set.subsys[ss->id];
604 css_get(css);
605out_unlock:
606 rcu_read_unlock();
607 return css;
608}
609EXPORT_SYMBOL_GPL(cgroup_get_e_css);
610
611static void cgroup_get_live(struct cgroup *cgrp)
612{
613 WARN_ON_ONCE(cgroup_is_dead(cgrp));
614 css_get(&cgrp->self);
615}
616
617
618
619
620
621
622int __cgroup_task_count(const struct cgroup *cgrp)
623{
624 int count = 0;
625 struct cgrp_cset_link *link;
626
627 lockdep_assert_held(&css_set_lock);
628
629 list_for_each_entry(link, &cgrp->cset_links, cset_link)
630 count += link->cset->nr_tasks;
631
632 return count;
633}
634
635
636
637
638
639int cgroup_task_count(const struct cgroup *cgrp)
640{
641 int count;
642
643 spin_lock_irq(&css_set_lock);
644 count = __cgroup_task_count(cgrp);
645 spin_unlock_irq(&css_set_lock);
646
647 return count;
648}
649
650struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
651{
652 struct cgroup *cgrp = of->kn->parent->priv;
653 struct cftype *cft = of_cft(of);
654
655
656
657
658
659
660
661
662
663 if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
664 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
665 else
666 return &cgrp->self;
667}
668EXPORT_SYMBOL_GPL(of_css);
669
670
671
672
673
674
675
676
677
678#define for_each_css(css, ssid, cgrp) \
679 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
680 if (!((css) = rcu_dereference_check( \
681 (cgrp)->subsys[(ssid)], \
682 lockdep_is_held(&cgroup_mutex)))) { } \
683 else
684
685
686
687
688
689
690
691
692
693#define for_each_e_css(css, ssid, cgrp) \
694 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
695 if (!((css) = cgroup_e_css_by_mask(cgrp, \
696 cgroup_subsys[(ssid)]))) \
697 ; \
698 else
699
700
701
702
703
704
705
706
707
708
709#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
710 unsigned long __ss_mask = (ss_mask); \
711 if (!CGROUP_HAS_SUBSYS_CONFIG) { \
712 (ssid) = 0; \
713 break; \
714 } \
715 for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
716 (ss) = cgroup_subsys[ssid]; \
717 {
718
719#define while_each_subsys_mask() \
720 } \
721 } \
722} while (false)
723
724
725#define cgroup_for_each_live_child(child, cgrp) \
726 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
727 if (({ lockdep_assert_held(&cgroup_mutex); \
728 cgroup_is_dead(child); })) \
729 ; \
730 else
731
732
733#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
734 css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
735 if (({ lockdep_assert_held(&cgroup_mutex); \
736 (dsct) = (d_css)->cgroup; \
737 cgroup_is_dead(dsct); })) \
738 ; \
739 else
740
741
742#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
743 css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
744 if (({ lockdep_assert_held(&cgroup_mutex); \
745 (dsct) = (d_css)->cgroup; \
746 cgroup_is_dead(dsct); })) \
747 ; \
748 else
749
750
751
752
753
754
755
756
757struct css_set init_css_set = {
758 .refcount = REFCOUNT_INIT(1),
759 .dom_cset = &init_css_set,
760 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
761 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
762 .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
763 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
764 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
765 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
766 .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
767 .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
768 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
769
770
771
772
773
774
775
776 .dfl_cgrp = &cgrp_dfl_root.cgrp,
777};
778
779static int css_set_count = 1;
780
781static bool css_set_threaded(struct css_set *cset)
782{
783 return cset->dom_cset != cset;
784}
785
786
787
788
789
790
791
792
793
794
795static bool css_set_populated(struct css_set *cset)
796{
797 lockdep_assert_held(&css_set_lock);
798
799 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
800}
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
820{
821 struct cgroup *child = NULL;
822 int adj = populated ? 1 : -1;
823
824 lockdep_assert_held(&css_set_lock);
825
826 do {
827 bool was_populated = cgroup_is_populated(cgrp);
828
829 if (!child) {
830 cgrp->nr_populated_csets += adj;
831 } else {
832 if (cgroup_is_threaded(child))
833 cgrp->nr_populated_threaded_children += adj;
834 else
835 cgrp->nr_populated_domain_children += adj;
836 }
837
838 if (was_populated == cgroup_is_populated(cgrp))
839 break;
840
841 cgroup1_check_for_release(cgrp);
842 TRACE_CGROUP_PATH(notify_populated, cgrp,
843 cgroup_is_populated(cgrp));
844 cgroup_file_notify(&cgrp->events_file);
845
846 child = cgrp;
847 cgrp = cgroup_parent(cgrp);
848 } while (cgrp);
849}
850
851
852
853
854
855
856
857
858
859static void css_set_update_populated(struct css_set *cset, bool populated)
860{
861 struct cgrp_cset_link *link;
862
863 lockdep_assert_held(&css_set_lock);
864
865 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
866 cgroup_update_populated(link->cgrp, populated);
867}
868
869
870
871
872
873
874
875static void css_set_skip_task_iters(struct css_set *cset,
876 struct task_struct *task)
877{
878 struct css_task_iter *it, *pos;
879
880 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
881 css_task_iter_skip(it, task);
882}
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899static void css_set_move_task(struct task_struct *task,
900 struct css_set *from_cset, struct css_set *to_cset,
901 bool use_mg_tasks)
902{
903 lockdep_assert_held(&css_set_lock);
904
905 if (to_cset && !css_set_populated(to_cset))
906 css_set_update_populated(to_cset, true);
907
908 if (from_cset) {
909 WARN_ON_ONCE(list_empty(&task->cg_list));
910
911 css_set_skip_task_iters(from_cset, task);
912 list_del_init(&task->cg_list);
913 if (!css_set_populated(from_cset))
914 css_set_update_populated(from_cset, false);
915 } else {
916 WARN_ON_ONCE(!list_empty(&task->cg_list));
917 }
918
919 if (to_cset) {
920
921
922
923
924
925 WARN_ON_ONCE(task->flags & PF_EXITING);
926
927 cgroup_move_task(task, to_cset);
928 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
929 &to_cset->tasks);
930 }
931}
932
933
934
935
936
937
938#define CSS_SET_HASH_BITS 7
939static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
940
941static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
942{
943 unsigned long key = 0UL;
944 struct cgroup_subsys *ss;
945 int i;
946
947 for_each_subsys(ss, i)
948 key += (unsigned long)css[i];
949 key = (key >> 16) ^ key;
950
951 return key;
952}
953
954void put_css_set_locked(struct css_set *cset)
955{
956 struct cgrp_cset_link *link, *tmp_link;
957 struct cgroup_subsys *ss;
958 int ssid;
959
960 lockdep_assert_held(&css_set_lock);
961
962 if (!refcount_dec_and_test(&cset->refcount))
963 return;
964
965 WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
966
967
968 for_each_subsys(ss, ssid) {
969 list_del(&cset->e_cset_node[ssid]);
970 css_put(cset->subsys[ssid]);
971 }
972 hash_del(&cset->hlist);
973 css_set_count--;
974
975 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
976 list_del(&link->cset_link);
977 list_del(&link->cgrp_link);
978 if (cgroup_parent(link->cgrp))
979 cgroup_put(link->cgrp);
980 kfree(link);
981 }
982
983 if (css_set_threaded(cset)) {
984 list_del(&cset->threaded_csets_node);
985 put_css_set_locked(cset->dom_cset);
986 }
987
988 kfree_rcu(cset, rcu_head);
989}
990
991
992
993
994
995
996
997
998
999
1000
1001static bool compare_css_sets(struct css_set *cset,
1002 struct css_set *old_cset,
1003 struct cgroup *new_cgrp,
1004 struct cgroup_subsys_state *template[])
1005{
1006 struct cgroup *new_dfl_cgrp;
1007 struct list_head *l1, *l2;
1008
1009
1010
1011
1012
1013
1014 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
1015 return false;
1016
1017
1018
1019 if (cgroup_on_dfl(new_cgrp))
1020 new_dfl_cgrp = new_cgrp;
1021 else
1022 new_dfl_cgrp = old_cset->dfl_cgrp;
1023
1024 if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
1025 return false;
1026
1027
1028
1029
1030
1031
1032
1033 l1 = &cset->cgrp_links;
1034 l2 = &old_cset->cgrp_links;
1035 while (1) {
1036 struct cgrp_cset_link *link1, *link2;
1037 struct cgroup *cgrp1, *cgrp2;
1038
1039 l1 = l1->next;
1040 l2 = l2->next;
1041
1042 if (l1 == &cset->cgrp_links) {
1043 BUG_ON(l2 != &old_cset->cgrp_links);
1044 break;
1045 } else {
1046 BUG_ON(l2 == &old_cset->cgrp_links);
1047 }
1048
1049 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
1050 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
1051 cgrp1 = link1->cgrp;
1052 cgrp2 = link2->cgrp;
1053
1054 BUG_ON(cgrp1->root != cgrp2->root);
1055
1056
1057
1058
1059
1060
1061
1062
1063 if (cgrp1->root == new_cgrp->root) {
1064 if (cgrp1 != new_cgrp)
1065 return false;
1066 } else {
1067 if (cgrp1 != cgrp2)
1068 return false;
1069 }
1070 }
1071 return true;
1072}
1073
1074
1075
1076
1077
1078
1079
1080static struct css_set *find_existing_css_set(struct css_set *old_cset,
1081 struct cgroup *cgrp,
1082 struct cgroup_subsys_state *template[])
1083{
1084 struct cgroup_root *root = cgrp->root;
1085 struct cgroup_subsys *ss;
1086 struct css_set *cset;
1087 unsigned long key;
1088 int i;
1089
1090
1091
1092
1093
1094
1095 for_each_subsys(ss, i) {
1096 if (root->subsys_mask & (1UL << i)) {
1097
1098
1099
1100
1101 template[i] = cgroup_e_css_by_mask(cgrp, ss);
1102 } else {
1103
1104
1105
1106
1107 template[i] = old_cset->subsys[i];
1108 }
1109 }
1110
1111 key = css_set_hash(template);
1112 hash_for_each_possible(css_set_table, cset, hlist, key) {
1113 if (!compare_css_sets(cset, old_cset, cgrp, template))
1114 continue;
1115
1116
1117 return cset;
1118 }
1119
1120
1121 return NULL;
1122}
1123
1124static void free_cgrp_cset_links(struct list_head *links_to_free)
1125{
1126 struct cgrp_cset_link *link, *tmp_link;
1127
1128 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
1129 list_del(&link->cset_link);
1130 kfree(link);
1131 }
1132}
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1143{
1144 struct cgrp_cset_link *link;
1145 int i;
1146
1147 INIT_LIST_HEAD(tmp_links);
1148
1149 for (i = 0; i < count; i++) {
1150 link = kzalloc(sizeof(*link), GFP_KERNEL);
1151 if (!link) {
1152 free_cgrp_cset_links(tmp_links);
1153 return -ENOMEM;
1154 }
1155 list_add(&link->cset_link, tmp_links);
1156 }
1157 return 0;
1158}
1159
1160
1161
1162
1163
1164
1165
1166static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1167 struct cgroup *cgrp)
1168{
1169 struct cgrp_cset_link *link;
1170
1171 BUG_ON(list_empty(tmp_links));
1172
1173 if (cgroup_on_dfl(cgrp))
1174 cset->dfl_cgrp = cgrp;
1175
1176 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1177 link->cset = cset;
1178 link->cgrp = cgrp;
1179
1180
1181
1182
1183
1184 list_move_tail(&link->cset_link, &cgrp->cset_links);
1185 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1186
1187 if (cgroup_parent(cgrp))
1188 cgroup_get_live(cgrp);
1189}
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199static struct css_set *find_css_set(struct css_set *old_cset,
1200 struct cgroup *cgrp)
1201{
1202 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1203 struct css_set *cset;
1204 struct list_head tmp_links;
1205 struct cgrp_cset_link *link;
1206 struct cgroup_subsys *ss;
1207 unsigned long key;
1208 int ssid;
1209
1210 lockdep_assert_held(&cgroup_mutex);
1211
1212
1213
1214 spin_lock_irq(&css_set_lock);
1215 cset = find_existing_css_set(old_cset, cgrp, template);
1216 if (cset)
1217 get_css_set(cset);
1218 spin_unlock_irq(&css_set_lock);
1219
1220 if (cset)
1221 return cset;
1222
1223 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1224 if (!cset)
1225 return NULL;
1226
1227
1228 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1229 kfree(cset);
1230 return NULL;
1231 }
1232
1233 refcount_set(&cset->refcount, 1);
1234 cset->dom_cset = cset;
1235 INIT_LIST_HEAD(&cset->tasks);
1236 INIT_LIST_HEAD(&cset->mg_tasks);
1237 INIT_LIST_HEAD(&cset->dying_tasks);
1238 INIT_LIST_HEAD(&cset->task_iters);
1239 INIT_LIST_HEAD(&cset->threaded_csets);
1240 INIT_HLIST_NODE(&cset->hlist);
1241 INIT_LIST_HEAD(&cset->cgrp_links);
1242 INIT_LIST_HEAD(&cset->mg_src_preload_node);
1243 INIT_LIST_HEAD(&cset->mg_dst_preload_node);
1244 INIT_LIST_HEAD(&cset->mg_node);
1245
1246
1247
1248 memcpy(cset->subsys, template, sizeof(cset->subsys));
1249
1250 spin_lock_irq(&css_set_lock);
1251
1252 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1253 struct cgroup *c = link->cgrp;
1254
1255 if (c->root == cgrp->root)
1256 c = cgrp;
1257 link_css_set(&tmp_links, cset, c);
1258 }
1259
1260 BUG_ON(!list_empty(&tmp_links));
1261
1262 css_set_count++;
1263
1264
1265 key = css_set_hash(cset->subsys);
1266 hash_add(css_set_table, &cset->hlist, key);
1267
1268 for_each_subsys(ss, ssid) {
1269 struct cgroup_subsys_state *css = cset->subsys[ssid];
1270
1271 list_add_tail(&cset->e_cset_node[ssid],
1272 &css->cgroup->e_csets[ssid]);
1273 css_get(css);
1274 }
1275
1276 spin_unlock_irq(&css_set_lock);
1277
1278
1279
1280
1281
1282
1283
1284 if (cgroup_is_threaded(cset->dfl_cgrp)) {
1285 struct css_set *dcset;
1286
1287 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1288 if (!dcset) {
1289 put_css_set(cset);
1290 return NULL;
1291 }
1292
1293 spin_lock_irq(&css_set_lock);
1294 cset->dom_cset = dcset;
1295 list_add_tail(&cset->threaded_csets_node,
1296 &dcset->threaded_csets);
1297 spin_unlock_irq(&css_set_lock);
1298 }
1299
1300 return cset;
1301}
1302
1303struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1304{
1305 struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
1306
1307 return root_cgrp->root;
1308}
1309
1310void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
1311{
1312 bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
1313
1314
1315 if (favor && !favoring) {
1316 rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
1317 root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
1318 } else if (!favor && favoring) {
1319 rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
1320 root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
1321 }
1322}
1323
1324static int cgroup_init_root_id(struct cgroup_root *root)
1325{
1326 int id;
1327
1328 lockdep_assert_held(&cgroup_mutex);
1329
1330 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1331 if (id < 0)
1332 return id;
1333
1334 root->hierarchy_id = id;
1335 return 0;
1336}
1337
1338static void cgroup_exit_root_id(struct cgroup_root *root)
1339{
1340 lockdep_assert_held(&cgroup_mutex);
1341
1342 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1343}
1344
1345void cgroup_free_root(struct cgroup_root *root)
1346{
1347 kfree(root);
1348}
1349
1350static void cgroup_destroy_root(struct cgroup_root *root)
1351{
1352 struct cgroup *cgrp = &root->cgrp;
1353 struct cgrp_cset_link *link, *tmp_link;
1354
1355 trace_cgroup_destroy_root(root);
1356
1357 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1358
1359 BUG_ON(atomic_read(&root->nr_cgrps));
1360 BUG_ON(!list_empty(&cgrp->self.children));
1361
1362
1363 WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1364
1365
1366
1367
1368
1369 spin_lock_irq(&css_set_lock);
1370
1371 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1372 list_del(&link->cset_link);
1373 list_del(&link->cgrp_link);
1374 kfree(link);
1375 }
1376
1377 spin_unlock_irq(&css_set_lock);
1378
1379 if (!list_empty(&root->root_list)) {
1380 list_del(&root->root_list);
1381 cgroup_root_count--;
1382 }
1383
1384 cgroup_favor_dynmods(root, false);
1385 cgroup_exit_root_id(root);
1386
1387 mutex_unlock(&cgroup_mutex);
1388
1389 cgroup_rstat_exit(cgrp);
1390 kernfs_destroy_root(root->kf_root);
1391 cgroup_free_root(root);
1392}
1393
1394static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
1395 struct cgroup_root *root)
1396{
1397 struct cgroup *res_cgroup = NULL;
1398
1399 if (cset == &init_css_set) {
1400 res_cgroup = &root->cgrp;
1401 } else if (root == &cgrp_dfl_root) {
1402 res_cgroup = cset->dfl_cgrp;
1403 } else {
1404 struct cgrp_cset_link *link;
1405
1406 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1407 struct cgroup *c = link->cgrp;
1408
1409 if (c->root == root) {
1410 res_cgroup = c;
1411 break;
1412 }
1413 }
1414 }
1415
1416 return res_cgroup;
1417}
1418
1419
1420
1421
1422
1423static struct cgroup *
1424current_cgns_cgroup_from_root(struct cgroup_root *root)
1425{
1426 struct cgroup *res = NULL;
1427 struct css_set *cset;
1428
1429 lockdep_assert_held(&css_set_lock);
1430
1431 rcu_read_lock();
1432
1433 cset = current->nsproxy->cgroup_ns->root_cset;
1434 res = __cset_cgroup_from_root(cset, root);
1435
1436 rcu_read_unlock();
1437
1438 BUG_ON(!res);
1439 return res;
1440}
1441
1442
1443static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1444 struct cgroup_root *root)
1445{
1446 struct cgroup *res = NULL;
1447
1448 lockdep_assert_held(&cgroup_mutex);
1449 lockdep_assert_held(&css_set_lock);
1450
1451 res = __cset_cgroup_from_root(cset, root);
1452
1453 BUG_ON(!res);
1454 return res;
1455}
1456
1457
1458
1459
1460
1461struct cgroup *task_cgroup_from_root(struct task_struct *task,
1462 struct cgroup_root *root)
1463{
1464
1465
1466
1467
1468 return cset_cgroup_from_root(task_css_set(task), root);
1469}
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1498
1499static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1500 char *buf)
1501{
1502 struct cgroup_subsys *ss = cft->ss;
1503
1504 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1505 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1506 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1507
1508 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1509 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1510 cft->name);
1511 } else {
1512 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1513 }
1514 return buf;
1515}
1516
1517
1518
1519
1520
1521
1522
1523static umode_t cgroup_file_mode(const struct cftype *cft)
1524{
1525 umode_t mode = 0;
1526
1527 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1528 mode |= S_IRUGO;
1529
1530 if (cft->write_u64 || cft->write_s64 || cft->write) {
1531 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1532 mode |= S_IWUGO;
1533 else
1534 mode |= S_IWUSR;
1535 }
1536
1537 return mode;
1538}
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1553{
1554 u16 cur_ss_mask = subtree_control;
1555 struct cgroup_subsys *ss;
1556 int ssid;
1557
1558 lockdep_assert_held(&cgroup_mutex);
1559
1560 cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1561
1562 while (true) {
1563 u16 new_ss_mask = cur_ss_mask;
1564
1565 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1566 new_ss_mask |= ss->depends_on;
1567 } while_each_subsys_mask();
1568
1569
1570
1571
1572
1573
1574 new_ss_mask &= this_ss_mask;
1575
1576 if (new_ss_mask == cur_ss_mask)
1577 break;
1578 cur_ss_mask = new_ss_mask;
1579 }
1580
1581 return cur_ss_mask;
1582}
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594void cgroup_kn_unlock(struct kernfs_node *kn)
1595{
1596 struct cgroup *cgrp;
1597
1598 if (kernfs_type(kn) == KERNFS_DIR)
1599 cgrp = kn->priv;
1600 else
1601 cgrp = kn->parent->priv;
1602
1603 mutex_unlock(&cgroup_mutex);
1604
1605 kernfs_unbreak_active_protection(kn);
1606 cgroup_put(cgrp);
1607}
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1627{
1628 struct cgroup *cgrp;
1629
1630 if (kernfs_type(kn) == KERNFS_DIR)
1631 cgrp = kn->priv;
1632 else
1633 cgrp = kn->parent->priv;
1634
1635
1636
1637
1638
1639
1640
1641 if (!cgroup_tryget(cgrp))
1642 return NULL;
1643 kernfs_break_active_protection(kn);
1644
1645 if (drain_offline)
1646 cgroup_lock_and_drain_offline(cgrp);
1647 else
1648 mutex_lock(&cgroup_mutex);
1649
1650 if (!cgroup_is_dead(cgrp))
1651 return cgrp;
1652
1653 cgroup_kn_unlock(kn);
1654 return NULL;
1655}
1656
1657static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1658{
1659 char name[CGROUP_FILE_NAME_MAX];
1660
1661 lockdep_assert_held(&cgroup_mutex);
1662
1663 if (cft->file_offset) {
1664 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1665 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1666
1667 spin_lock_irq(&cgroup_file_kn_lock);
1668 cfile->kn = NULL;
1669 spin_unlock_irq(&cgroup_file_kn_lock);
1670
1671 del_timer_sync(&cfile->notify_timer);
1672 }
1673
1674 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1675}
1676
1677
1678
1679
1680
1681static void css_clear_dir(struct cgroup_subsys_state *css)
1682{
1683 struct cgroup *cgrp = css->cgroup;
1684 struct cftype *cfts;
1685
1686 if (!(css->flags & CSS_VISIBLE))
1687 return;
1688
1689 css->flags &= ~CSS_VISIBLE;
1690
1691 if (!css->ss) {
1692 if (cgroup_on_dfl(cgrp))
1693 cfts = cgroup_base_files;
1694 else
1695 cfts = cgroup1_base_files;
1696
1697 cgroup_addrm_files(css, cgrp, cfts, false);
1698 } else {
1699 list_for_each_entry(cfts, &css->ss->cfts, node)
1700 cgroup_addrm_files(css, cgrp, cfts, false);
1701 }
1702}
1703
1704
1705
1706
1707
1708
1709
1710static int css_populate_dir(struct cgroup_subsys_state *css)
1711{
1712 struct cgroup *cgrp = css->cgroup;
1713 struct cftype *cfts, *failed_cfts;
1714 int ret;
1715
1716 if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1717 return 0;
1718
1719 if (!css->ss) {
1720 if (cgroup_on_dfl(cgrp))
1721 cfts = cgroup_base_files;
1722 else
1723 cfts = cgroup1_base_files;
1724
1725 ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1726 if (ret < 0)
1727 return ret;
1728 } else {
1729 list_for_each_entry(cfts, &css->ss->cfts, node) {
1730 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1731 if (ret < 0) {
1732 failed_cfts = cfts;
1733 goto err;
1734 }
1735 }
1736 }
1737
1738 css->flags |= CSS_VISIBLE;
1739
1740 return 0;
1741err:
1742 list_for_each_entry(cfts, &css->ss->cfts, node) {
1743 if (cfts == failed_cfts)
1744 break;
1745 cgroup_addrm_files(css, cgrp, cfts, false);
1746 }
1747 return ret;
1748}
1749
1750int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1751{
1752 struct cgroup *dcgrp = &dst_root->cgrp;
1753 struct cgroup_subsys *ss;
1754 int ssid, i, ret;
1755 u16 dfl_disable_ss_mask = 0;
1756
1757 lockdep_assert_held(&cgroup_mutex);
1758
1759 do_each_subsys_mask(ss, ssid, ss_mask) {
1760
1761
1762
1763
1764
1765 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
1766 !ss->implicit_on_dfl)
1767 return -EBUSY;
1768
1769
1770 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1771 return -EBUSY;
1772
1773
1774
1775
1776
1777 if (ss->root == &cgrp_dfl_root)
1778 dfl_disable_ss_mask |= 1 << ssid;
1779
1780 } while_each_subsys_mask();
1781
1782 if (dfl_disable_ss_mask) {
1783 struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
1784
1785
1786
1787
1788
1789 cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
1790 WARN_ON(cgroup_apply_control(scgrp));
1791 cgroup_finalize_control(scgrp, 0);
1792 }
1793
1794 do_each_subsys_mask(ss, ssid, ss_mask) {
1795 struct cgroup_root *src_root = ss->root;
1796 struct cgroup *scgrp = &src_root->cgrp;
1797 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1798 struct css_set *cset;
1799
1800 WARN_ON(!css || cgroup_css(dcgrp, ss));
1801
1802 if (src_root != &cgrp_dfl_root) {
1803
1804 src_root->subsys_mask &= ~(1 << ssid);
1805 WARN_ON(cgroup_apply_control(scgrp));
1806 cgroup_finalize_control(scgrp, 0);
1807 }
1808
1809
1810 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1811 rcu_assign_pointer(dcgrp->subsys[ssid], css);
1812 ss->root = dst_root;
1813 css->cgroup = dcgrp;
1814
1815 spin_lock_irq(&css_set_lock);
1816 hash_for_each(css_set_table, i, cset, hlist)
1817 list_move_tail(&cset->e_cset_node[ss->id],
1818 &dcgrp->e_csets[ss->id]);
1819 spin_unlock_irq(&css_set_lock);
1820
1821 if (ss->css_rstat_flush) {
1822 list_del_rcu(&css->rstat_css_node);
1823 synchronize_rcu();
1824 list_add_rcu(&css->rstat_css_node,
1825 &dcgrp->rstat_css_list);
1826 }
1827
1828
1829 dst_root->subsys_mask |= 1 << ssid;
1830 if (dst_root == &cgrp_dfl_root) {
1831 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1832 } else {
1833 dcgrp->subtree_control |= 1 << ssid;
1834 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1835 }
1836
1837 ret = cgroup_apply_control(dcgrp);
1838 if (ret)
1839 pr_warn("partial failure to rebind %s controller (err=%d)\n",
1840 ss->name, ret);
1841
1842 if (ss->bind)
1843 ss->bind(css);
1844 } while_each_subsys_mask();
1845
1846 kernfs_activate(dcgrp->kn);
1847 return 0;
1848}
1849
1850int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1851 struct kernfs_root *kf_root)
1852{
1853 int len = 0;
1854 char *buf = NULL;
1855 struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1856 struct cgroup *ns_cgroup;
1857
1858 buf = kmalloc(PATH_MAX, GFP_KERNEL);
1859 if (!buf)
1860 return -ENOMEM;
1861
1862 spin_lock_irq(&css_set_lock);
1863 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1864 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1865 spin_unlock_irq(&css_set_lock);
1866
1867 if (len >= PATH_MAX)
1868 len = -ERANGE;
1869 else if (len > 0) {
1870 seq_escape(sf, buf, " \t\n\\");
1871 len = 0;
1872 }
1873 kfree(buf);
1874 return len;
1875}
1876
1877enum cgroup2_param {
1878 Opt_nsdelegate,
1879 Opt_favordynmods,
1880 Opt_memory_localevents,
1881 Opt_memory_recursiveprot,
1882 nr__cgroup2_params
1883};
1884
1885static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
1886 fsparam_flag("nsdelegate", Opt_nsdelegate),
1887 fsparam_flag("favordynmods", Opt_favordynmods),
1888 fsparam_flag("memory_localevents", Opt_memory_localevents),
1889 fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
1890 {}
1891};
1892
1893static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1894{
1895 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1896 struct fs_parse_result result;
1897 int opt;
1898
1899 opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1900 if (opt < 0)
1901 return opt;
1902
1903 switch (opt) {
1904 case Opt_nsdelegate:
1905 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1906 return 0;
1907 case Opt_favordynmods:
1908 ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
1909 return 0;
1910 case Opt_memory_localevents:
1911 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1912 return 0;
1913 case Opt_memory_recursiveprot:
1914 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1915 return 0;
1916 }
1917 return -EINVAL;
1918}
1919
1920static void apply_cgroup_root_flags(unsigned int root_flags)
1921{
1922 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1923 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1924 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1925 else
1926 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1927
1928 cgroup_favor_dynmods(&cgrp_dfl_root,
1929 root_flags & CGRP_ROOT_FAVOR_DYNMODS);
1930
1931 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1932 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1933 else
1934 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1935
1936 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1937 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1938 else
1939 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1940 }
1941}
1942
1943static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1944{
1945 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1946 seq_puts(seq, ",nsdelegate");
1947 if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
1948 seq_puts(seq, ",favordynmods");
1949 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1950 seq_puts(seq, ",memory_localevents");
1951 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1952 seq_puts(seq, ",memory_recursiveprot");
1953 return 0;
1954}
1955
1956static int cgroup_reconfigure(struct fs_context *fc)
1957{
1958 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1959
1960 apply_cgroup_root_flags(ctx->flags);
1961 return 0;
1962}
1963
1964static void init_cgroup_housekeeping(struct cgroup *cgrp)
1965{
1966 struct cgroup_subsys *ss;
1967 int ssid;
1968
1969 INIT_LIST_HEAD(&cgrp->self.sibling);
1970 INIT_LIST_HEAD(&cgrp->self.children);
1971 INIT_LIST_HEAD(&cgrp->cset_links);
1972 INIT_LIST_HEAD(&cgrp->pidlists);
1973 mutex_init(&cgrp->pidlist_mutex);
1974 cgrp->self.cgroup = cgrp;
1975 cgrp->self.flags |= CSS_ONLINE;
1976 cgrp->dom_cgrp = cgrp;
1977 cgrp->max_descendants = INT_MAX;
1978 cgrp->max_depth = INT_MAX;
1979 INIT_LIST_HEAD(&cgrp->rstat_css_list);
1980 prev_cputime_init(&cgrp->prev_cputime);
1981
1982 for_each_subsys(ss, ssid)
1983 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1984
1985 init_waitqueue_head(&cgrp->offline_waitq);
1986 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1987}
1988
1989void init_cgroup_root(struct cgroup_fs_context *ctx)
1990{
1991 struct cgroup_root *root = ctx->root;
1992 struct cgroup *cgrp = &root->cgrp;
1993
1994 INIT_LIST_HEAD(&root->root_list);
1995 atomic_set(&root->nr_cgrps, 1);
1996 cgrp->root = root;
1997 init_cgroup_housekeeping(cgrp);
1998
1999
2000 root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
2001 if (ctx->release_agent)
2002 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
2003 if (ctx->name)
2004 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
2005 if (ctx->cpuset_clone_children)
2006 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
2007}
2008
2009int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
2010{
2011 LIST_HEAD(tmp_links);
2012 struct cgroup *root_cgrp = &root->cgrp;
2013 struct kernfs_syscall_ops *kf_sops;
2014 struct css_set *cset;
2015 int i, ret;
2016
2017 lockdep_assert_held(&cgroup_mutex);
2018
2019 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
2020 0, GFP_KERNEL);
2021 if (ret)
2022 goto out;
2023
2024
2025
2026
2027
2028
2029
2030
2031 ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
2032 if (ret)
2033 goto cancel_ref;
2034
2035 ret = cgroup_init_root_id(root);
2036 if (ret)
2037 goto cancel_ref;
2038
2039 kf_sops = root == &cgrp_dfl_root ?
2040 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
2041
2042 root->kf_root = kernfs_create_root(kf_sops,
2043 KERNFS_ROOT_CREATE_DEACTIVATED |
2044 KERNFS_ROOT_SUPPORT_EXPORTOP |
2045 KERNFS_ROOT_SUPPORT_USER_XATTR,
2046 root_cgrp);
2047 if (IS_ERR(root->kf_root)) {
2048 ret = PTR_ERR(root->kf_root);
2049 goto exit_root_id;
2050 }
2051 root_cgrp->kn = kernfs_root_to_node(root->kf_root);
2052 WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
2053 root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
2054
2055 ret = css_populate_dir(&root_cgrp->self);
2056 if (ret)
2057 goto destroy_root;
2058
2059 ret = cgroup_rstat_init(root_cgrp);
2060 if (ret)
2061 goto destroy_root;
2062
2063 ret = rebind_subsystems(root, ss_mask);
2064 if (ret)
2065 goto exit_stats;
2066
2067 ret = cgroup_bpf_inherit(root_cgrp);
2068 WARN_ON_ONCE(ret);
2069
2070 trace_cgroup_setup_root(root);
2071
2072
2073
2074
2075
2076
2077 list_add(&root->root_list, &cgroup_roots);
2078 cgroup_root_count++;
2079
2080
2081
2082
2083
2084 spin_lock_irq(&css_set_lock);
2085 hash_for_each(css_set_table, i, cset, hlist) {
2086 link_css_set(&tmp_links, cset, root_cgrp);
2087 if (css_set_populated(cset))
2088 cgroup_update_populated(root_cgrp, true);
2089 }
2090 spin_unlock_irq(&css_set_lock);
2091
2092 BUG_ON(!list_empty(&root_cgrp->self.children));
2093 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
2094
2095 ret = 0;
2096 goto out;
2097
2098exit_stats:
2099 cgroup_rstat_exit(root_cgrp);
2100destroy_root:
2101 kernfs_destroy_root(root->kf_root);
2102 root->kf_root = NULL;
2103exit_root_id:
2104 cgroup_exit_root_id(root);
2105cancel_ref:
2106 percpu_ref_exit(&root_cgrp->self.refcnt);
2107out:
2108 free_cgrp_cset_links(&tmp_links);
2109 return ret;
2110}
2111
2112int cgroup_do_get_tree(struct fs_context *fc)
2113{
2114 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2115 int ret;
2116
2117 ctx->kfc.root = ctx->root->kf_root;
2118 if (fc->fs_type == &cgroup2_fs_type)
2119 ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
2120 else
2121 ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2122 ret = kernfs_get_tree(fc);
2123
2124
2125
2126
2127
2128 if (!ret && ctx->ns != &init_cgroup_ns) {
2129 struct dentry *nsdentry;
2130 struct super_block *sb = fc->root->d_sb;
2131 struct cgroup *cgrp;
2132
2133 mutex_lock(&cgroup_mutex);
2134 spin_lock_irq(&css_set_lock);
2135
2136 cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
2137
2138 spin_unlock_irq(&css_set_lock);
2139 mutex_unlock(&cgroup_mutex);
2140
2141 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2142 dput(fc->root);
2143 if (IS_ERR(nsdentry)) {
2144 deactivate_locked_super(sb);
2145 ret = PTR_ERR(nsdentry);
2146 nsdentry = NULL;
2147 }
2148 fc->root = nsdentry;
2149 }
2150
2151 if (!ctx->kfc.new_sb_created)
2152 cgroup_put(&ctx->root->cgrp);
2153
2154 return ret;
2155}
2156
2157
2158
2159
2160static void cgroup_fs_context_free(struct fs_context *fc)
2161{
2162 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2163
2164 kfree(ctx->name);
2165 kfree(ctx->release_agent);
2166 put_cgroup_ns(ctx->ns);
2167 kernfs_free_fs_context(fc);
2168 kfree(ctx);
2169}
2170
2171static int cgroup_get_tree(struct fs_context *fc)
2172{
2173 struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2174 int ret;
2175
2176 cgrp_dfl_visible = true;
2177 cgroup_get_live(&cgrp_dfl_root.cgrp);
2178 ctx->root = &cgrp_dfl_root;
2179
2180 ret = cgroup_do_get_tree(fc);
2181 if (!ret)
2182 apply_cgroup_root_flags(ctx->flags);
2183 return ret;
2184}
2185
2186static const struct fs_context_operations cgroup_fs_context_ops = {
2187 .free = cgroup_fs_context_free,
2188 .parse_param = cgroup2_parse_param,
2189 .get_tree = cgroup_get_tree,
2190 .reconfigure = cgroup_reconfigure,
2191};
2192
2193static const struct fs_context_operations cgroup1_fs_context_ops = {
2194 .free = cgroup_fs_context_free,
2195 .parse_param = cgroup1_parse_param,
2196 .get_tree = cgroup1_get_tree,
2197 .reconfigure = cgroup1_reconfigure,
2198};
2199
2200
2201
2202
2203
2204static int cgroup_init_fs_context(struct fs_context *fc)
2205{
2206 struct cgroup_fs_context *ctx;
2207
2208 ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2209 if (!ctx)
2210 return -ENOMEM;
2211
2212 ctx->ns = current->nsproxy->cgroup_ns;
2213 get_cgroup_ns(ctx->ns);
2214 fc->fs_private = &ctx->kfc;
2215 if (fc->fs_type == &cgroup2_fs_type)
2216 fc->ops = &cgroup_fs_context_ops;
2217 else
2218 fc->ops = &cgroup1_fs_context_ops;
2219 put_user_ns(fc->user_ns);
2220 fc->user_ns = get_user_ns(ctx->ns->user_ns);
2221 fc->global = true;
2222
2223#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
2224 ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
2225#endif
2226 return 0;
2227}
2228
2229static void cgroup_kill_sb(struct super_block *sb)
2230{
2231 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2232 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2233
2234
2235
2236
2237
2238
2239
2240 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2241 !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
2242 cgroup_bpf_offline(&root->cgrp);
2243 percpu_ref_kill(&root->cgrp.self.refcnt);
2244 }
2245 cgroup_put(&root->cgrp);
2246 kernfs_kill_sb(sb);
2247}
2248
2249struct file_system_type cgroup_fs_type = {
2250 .name = "cgroup",
2251 .init_fs_context = cgroup_init_fs_context,
2252 .parameters = cgroup1_fs_parameters,
2253 .kill_sb = cgroup_kill_sb,
2254 .fs_flags = FS_USERNS_MOUNT,
2255};
2256
2257static struct file_system_type cgroup2_fs_type = {
2258 .name = "cgroup2",
2259 .init_fs_context = cgroup_init_fs_context,
2260 .parameters = cgroup2_fs_parameters,
2261 .kill_sb = cgroup_kill_sb,
2262 .fs_flags = FS_USERNS_MOUNT,
2263};
2264
2265#ifdef CONFIG_CPUSETS
2266static const struct fs_context_operations cpuset_fs_context_ops = {
2267 .get_tree = cgroup1_get_tree,
2268 .free = cgroup_fs_context_free,
2269};
2270
2271
2272
2273
2274
2275
2276static int cpuset_init_fs_context(struct fs_context *fc)
2277{
2278 char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2279 struct cgroup_fs_context *ctx;
2280 int err;
2281
2282 err = cgroup_init_fs_context(fc);
2283 if (err) {
2284 kfree(agent);
2285 return err;
2286 }
2287
2288 fc->ops = &cpuset_fs_context_ops;
2289
2290 ctx = cgroup_fc2context(fc);
2291 ctx->subsys_mask = 1 << cpuset_cgrp_id;
2292 ctx->flags |= CGRP_ROOT_NOPREFIX;
2293 ctx->release_agent = agent;
2294
2295 get_filesystem(&cgroup_fs_type);
2296 put_filesystem(fc->fs_type);
2297 fc->fs_type = &cgroup_fs_type;
2298
2299 return 0;
2300}
2301
2302static struct file_system_type cpuset_fs_type = {
2303 .name = "cpuset",
2304 .init_fs_context = cpuset_init_fs_context,
2305 .fs_flags = FS_USERNS_MOUNT,
2306};
2307#endif
2308
2309int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2310 struct cgroup_namespace *ns)
2311{
2312 struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2313
2314 return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2315}
2316
2317int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2318 struct cgroup_namespace *ns)
2319{
2320 int ret;
2321
2322 mutex_lock(&cgroup_mutex);
2323 spin_lock_irq(&css_set_lock);
2324
2325 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2326
2327 spin_unlock_irq(&css_set_lock);
2328 mutex_unlock(&cgroup_mutex);
2329
2330 return ret;
2331}
2332EXPORT_SYMBOL_GPL(cgroup_path_ns);
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2348{
2349 struct cgroup_root *root;
2350 struct cgroup *cgrp;
2351 int hierarchy_id = 1;
2352 int ret;
2353
2354 mutex_lock(&cgroup_mutex);
2355 spin_lock_irq(&css_set_lock);
2356
2357 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2358
2359 if (root) {
2360 cgrp = task_cgroup_from_root(task, root);
2361 ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2362 } else {
2363
2364 ret = strlcpy(buf, "/", buflen);
2365 }
2366
2367 spin_unlock_irq(&css_set_lock);
2368 mutex_unlock(&cgroup_mutex);
2369 return ret;
2370}
2371EXPORT_SYMBOL_GPL(task_cgroup_path);
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396static void cgroup_attach_lock(bool lock_threadgroup)
2397{
2398 cpus_read_lock();
2399 if (lock_threadgroup)
2400 percpu_down_write(&cgroup_threadgroup_rwsem);
2401}
2402
2403
2404
2405
2406
2407static void cgroup_attach_unlock(bool lock_threadgroup)
2408{
2409 if (lock_threadgroup)
2410 percpu_up_write(&cgroup_threadgroup_rwsem);
2411 cpus_read_unlock();
2412}
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424static void cgroup_migrate_add_task(struct task_struct *task,
2425 struct cgroup_mgctx *mgctx)
2426{
2427 struct css_set *cset;
2428
2429 lockdep_assert_held(&css_set_lock);
2430
2431
2432 if (task->flags & PF_EXITING)
2433 return;
2434
2435
2436 WARN_ON_ONCE(list_empty(&task->cg_list));
2437
2438 cset = task_css_set(task);
2439 if (!cset->mg_src_cgrp)
2440 return;
2441
2442 mgctx->tset.nr_tasks++;
2443
2444 list_move_tail(&task->cg_list, &cset->mg_tasks);
2445 if (list_empty(&cset->mg_node))
2446 list_add_tail(&cset->mg_node,
2447 &mgctx->tset.src_csets);
2448 if (list_empty(&cset->mg_dst_cset->mg_node))
2449 list_add_tail(&cset->mg_dst_cset->mg_node,
2450 &mgctx->tset.dst_csets);
2451}
2452
2453
2454
2455
2456
2457
2458
2459
2460struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2461 struct cgroup_subsys_state **dst_cssp)
2462{
2463 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2464 tset->cur_task = NULL;
2465
2466 return cgroup_taskset_next(tset, dst_cssp);
2467}
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2478 struct cgroup_subsys_state **dst_cssp)
2479{
2480 struct css_set *cset = tset->cur_cset;
2481 struct task_struct *task = tset->cur_task;
2482
2483 while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
2484 if (!task)
2485 task = list_first_entry(&cset->mg_tasks,
2486 struct task_struct, cg_list);
2487 else
2488 task = list_next_entry(task, cg_list);
2489
2490 if (&task->cg_list != &cset->mg_tasks) {
2491 tset->cur_cset = cset;
2492 tset->cur_task = task;
2493
2494
2495
2496
2497
2498
2499
2500 if (cset->mg_dst_cset)
2501 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2502 else
2503 *dst_cssp = cset->subsys[tset->ssid];
2504
2505 return task;
2506 }
2507
2508 cset = list_next_entry(cset, mg_node);
2509 task = NULL;
2510 }
2511
2512 return NULL;
2513}
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2525{
2526 struct cgroup_taskset *tset = &mgctx->tset;
2527 struct cgroup_subsys *ss;
2528 struct task_struct *task, *tmp_task;
2529 struct css_set *cset, *tmp_cset;
2530 int ssid, failed_ssid, ret;
2531
2532
2533 if (tset->nr_tasks) {
2534 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2535 if (ss->can_attach) {
2536 tset->ssid = ssid;
2537 ret = ss->can_attach(tset);
2538 if (ret) {
2539 failed_ssid = ssid;
2540 goto out_cancel_attach;
2541 }
2542 }
2543 } while_each_subsys_mask();
2544 }
2545
2546
2547
2548
2549
2550
2551 spin_lock_irq(&css_set_lock);
2552 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2553 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2554 struct css_set *from_cset = task_css_set(task);
2555 struct css_set *to_cset = cset->mg_dst_cset;
2556
2557 get_css_set(to_cset);
2558 to_cset->nr_tasks++;
2559 css_set_move_task(task, from_cset, to_cset, true);
2560 from_cset->nr_tasks--;
2561
2562
2563
2564
2565 cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
2566 to_cset->dfl_cgrp);
2567 put_css_set_locked(from_cset);
2568
2569 }
2570 }
2571 spin_unlock_irq(&css_set_lock);
2572
2573
2574
2575
2576
2577
2578 tset->csets = &tset->dst_csets;
2579
2580 if (tset->nr_tasks) {
2581 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2582 if (ss->attach) {
2583 tset->ssid = ssid;
2584 ss->attach(tset);
2585 }
2586 } while_each_subsys_mask();
2587 }
2588
2589 ret = 0;
2590 goto out_release_tset;
2591
2592out_cancel_attach:
2593 if (tset->nr_tasks) {
2594 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2595 if (ssid == failed_ssid)
2596 break;
2597 if (ss->cancel_attach) {
2598 tset->ssid = ssid;
2599 ss->cancel_attach(tset);
2600 }
2601 } while_each_subsys_mask();
2602 }
2603out_release_tset:
2604 spin_lock_irq(&css_set_lock);
2605 list_splice_init(&tset->dst_csets, &tset->src_csets);
2606 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2607 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2608 list_del_init(&cset->mg_node);
2609 }
2610 spin_unlock_irq(&css_set_lock);
2611
2612
2613
2614
2615
2616
2617 tset->nr_tasks = 0;
2618 tset->csets = &tset->src_csets;
2619 return ret;
2620}
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2632{
2633
2634 if (!cgroup_on_dfl(dst_cgrp))
2635 return 0;
2636
2637
2638 if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2639 return -EOPNOTSUPP;
2640
2641
2642
2643
2644
2645 if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2646 return 0;
2647
2648
2649 if (dst_cgrp->subtree_control)
2650 return -EBUSY;
2651
2652 return 0;
2653}
2654
2655
2656
2657
2658
2659
2660
2661
2662void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2663{
2664 struct css_set *cset, *tmp_cset;
2665
2666 lockdep_assert_held(&cgroup_mutex);
2667
2668 spin_lock_irq(&css_set_lock);
2669
2670 list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2671 mg_src_preload_node) {
2672 cset->mg_src_cgrp = NULL;
2673 cset->mg_dst_cgrp = NULL;
2674 cset->mg_dst_cset = NULL;
2675 list_del_init(&cset->mg_src_preload_node);
2676 put_css_set_locked(cset);
2677 }
2678
2679 list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2680 mg_dst_preload_node) {
2681 cset->mg_src_cgrp = NULL;
2682 cset->mg_dst_cgrp = NULL;
2683 cset->mg_dst_cset = NULL;
2684 list_del_init(&cset->mg_dst_preload_node);
2685 put_css_set_locked(cset);
2686 }
2687
2688 spin_unlock_irq(&css_set_lock);
2689}
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707void cgroup_migrate_add_src(struct css_set *src_cset,
2708 struct cgroup *dst_cgrp,
2709 struct cgroup_mgctx *mgctx)
2710{
2711 struct cgroup *src_cgrp;
2712
2713 lockdep_assert_held(&cgroup_mutex);
2714 lockdep_assert_held(&css_set_lock);
2715
2716
2717
2718
2719
2720
2721 if (src_cset->dead)
2722 return;
2723
2724 if (!list_empty(&src_cset->mg_src_preload_node))
2725 return;
2726
2727 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2728
2729 WARN_ON(src_cset->mg_src_cgrp);
2730 WARN_ON(src_cset->mg_dst_cgrp);
2731 WARN_ON(!list_empty(&src_cset->mg_tasks));
2732 WARN_ON(!list_empty(&src_cset->mg_node));
2733
2734 src_cset->mg_src_cgrp = src_cgrp;
2735 src_cset->mg_dst_cgrp = dst_cgrp;
2736 get_css_set(src_cset);
2737 list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
2738}
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2755{
2756 struct css_set *src_cset, *tmp_cset;
2757
2758 lockdep_assert_held(&cgroup_mutex);
2759
2760
2761 list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2762 mg_src_preload_node) {
2763 struct css_set *dst_cset;
2764 struct cgroup_subsys *ss;
2765 int ssid;
2766
2767 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2768 if (!dst_cset)
2769 return -ENOMEM;
2770
2771 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2772
2773
2774
2775
2776
2777
2778 if (src_cset == dst_cset) {
2779 src_cset->mg_src_cgrp = NULL;
2780 src_cset->mg_dst_cgrp = NULL;
2781 list_del_init(&src_cset->mg_src_preload_node);
2782 put_css_set(src_cset);
2783 put_css_set(dst_cset);
2784 continue;
2785 }
2786
2787 src_cset->mg_dst_cset = dst_cset;
2788
2789 if (list_empty(&dst_cset->mg_dst_preload_node))
2790 list_add_tail(&dst_cset->mg_dst_preload_node,
2791 &mgctx->preloaded_dst_csets);
2792 else
2793 put_css_set(dst_cset);
2794
2795 for_each_subsys(ss, ssid)
2796 if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2797 mgctx->ss_mask |= 1 << ssid;
2798 }
2799
2800 return 0;
2801}
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2822 struct cgroup_mgctx *mgctx)
2823{
2824 struct task_struct *task;
2825
2826
2827
2828
2829
2830
2831 spin_lock_irq(&css_set_lock);
2832 rcu_read_lock();
2833 task = leader;
2834 do {
2835 cgroup_migrate_add_task(task, mgctx);
2836 if (!threadgroup)
2837 break;
2838 } while_each_thread(leader, task);
2839 rcu_read_unlock();
2840 spin_unlock_irq(&css_set_lock);
2841
2842 return cgroup_migrate_execute(mgctx);
2843}
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2854 bool threadgroup)
2855{
2856 DEFINE_CGROUP_MGCTX(mgctx);
2857 struct task_struct *task;
2858 int ret = 0;
2859
2860
2861 spin_lock_irq(&css_set_lock);
2862 rcu_read_lock();
2863 task = leader;
2864 do {
2865 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2866 if (!threadgroup)
2867 break;
2868 } while_each_thread(leader, task);
2869 rcu_read_unlock();
2870 spin_unlock_irq(&css_set_lock);
2871
2872
2873 ret = cgroup_migrate_prepare_dst(&mgctx);
2874 if (!ret)
2875 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2876
2877 cgroup_migrate_finish(&mgctx);
2878
2879 if (!ret)
2880 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2881
2882 return ret;
2883}
2884
2885struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2886 bool *threadgroup_locked)
2887{
2888 struct task_struct *tsk;
2889 pid_t pid;
2890
2891 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2892 return ERR_PTR(-EINVAL);
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902 lockdep_assert_held(&cgroup_mutex);
2903 *threadgroup_locked = pid || threadgroup;
2904 cgroup_attach_lock(*threadgroup_locked);
2905
2906 rcu_read_lock();
2907 if (pid) {
2908 tsk = find_task_by_vpid(pid);
2909 if (!tsk) {
2910 tsk = ERR_PTR(-ESRCH);
2911 goto out_unlock_threadgroup;
2912 }
2913 } else {
2914 tsk = current;
2915 }
2916
2917 if (threadgroup)
2918 tsk = tsk->group_leader;
2919
2920
2921
2922
2923
2924
2925
2926 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2927 tsk = ERR_PTR(-EINVAL);
2928 goto out_unlock_threadgroup;
2929 }
2930
2931 get_task_struct(tsk);
2932 goto out_unlock_rcu;
2933
2934out_unlock_threadgroup:
2935 cgroup_attach_unlock(*threadgroup_locked);
2936 *threadgroup_locked = false;
2937out_unlock_rcu:
2938 rcu_read_unlock();
2939 return tsk;
2940}
2941
2942void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
2943{
2944 struct cgroup_subsys *ss;
2945 int ssid;
2946
2947
2948 put_task_struct(task);
2949
2950 cgroup_attach_unlock(threadgroup_locked);
2951
2952 for_each_subsys(ss, ssid)
2953 if (ss->post_attach)
2954 ss->post_attach();
2955}
2956
2957static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2958{
2959 struct cgroup_subsys *ss;
2960 bool printed = false;
2961 int ssid;
2962
2963 do_each_subsys_mask(ss, ssid, ss_mask) {
2964 if (printed)
2965 seq_putc(seq, ' ');
2966 seq_puts(seq, ss->name);
2967 printed = true;
2968 } while_each_subsys_mask();
2969 if (printed)
2970 seq_putc(seq, '\n');
2971}
2972
2973
2974static int cgroup_controllers_show(struct seq_file *seq, void *v)
2975{
2976 struct cgroup *cgrp = seq_css(seq)->cgroup;
2977
2978 cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2979 return 0;
2980}
2981
2982
2983static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2984{
2985 struct cgroup *cgrp = seq_css(seq)->cgroup;
2986
2987 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2988 return 0;
2989}
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3001{
3002 DEFINE_CGROUP_MGCTX(mgctx);
3003 struct cgroup_subsys_state *d_css;
3004 struct cgroup *dsct;
3005 struct css_set *src_cset;
3006 bool has_tasks;
3007 int ret;
3008
3009 lockdep_assert_held(&cgroup_mutex);
3010
3011
3012 spin_lock_irq(&css_set_lock);
3013 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3014 struct cgrp_cset_link *link;
3015
3016
3017
3018
3019
3020
3021
3022 if (dsct == cgrp)
3023 continue;
3024
3025 list_for_each_entry(link, &dsct->cset_links, cset_link)
3026 cgroup_migrate_add_src(link->cset, dsct, &mgctx);
3027 }
3028 spin_unlock_irq(&css_set_lock);
3029
3030
3031
3032
3033
3034
3035
3036 has_tasks = !list_empty(&mgctx.preloaded_src_csets);
3037 cgroup_attach_lock(has_tasks);
3038
3039
3040 ret = cgroup_migrate_prepare_dst(&mgctx);
3041 if (ret)
3042 goto out_finish;
3043
3044 spin_lock_irq(&css_set_lock);
3045 list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
3046 mg_src_preload_node) {
3047 struct task_struct *task, *ntask;
3048
3049
3050 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3051 cgroup_migrate_add_task(task, &mgctx);
3052 }
3053 spin_unlock_irq(&css_set_lock);
3054
3055 ret = cgroup_migrate_execute(&mgctx);
3056out_finish:
3057 cgroup_migrate_finish(&mgctx);
3058 cgroup_attach_unlock(has_tasks);
3059 return ret;
3060}
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
3071 __acquires(&cgroup_mutex)
3072{
3073 struct cgroup *dsct;
3074 struct cgroup_subsys_state *d_css;
3075 struct cgroup_subsys *ss;
3076 int ssid;
3077
3078restart:
3079 mutex_lock(&cgroup_mutex);
3080
3081 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3082 for_each_subsys(ss, ssid) {
3083 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3084 DEFINE_WAIT(wait);
3085
3086 if (!css || !percpu_ref_is_dying(&css->refcnt))
3087 continue;
3088
3089 cgroup_get_live(dsct);
3090 prepare_to_wait(&dsct->offline_waitq, &wait,
3091 TASK_UNINTERRUPTIBLE);
3092
3093 mutex_unlock(&cgroup_mutex);
3094 schedule();
3095 finish_wait(&dsct->offline_waitq, &wait);
3096
3097 cgroup_put(dsct);
3098 goto restart;
3099 }
3100 }
3101}
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111static void cgroup_save_control(struct cgroup *cgrp)
3112{
3113 struct cgroup *dsct;
3114 struct cgroup_subsys_state *d_css;
3115
3116 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3117 dsct->old_subtree_control = dsct->subtree_control;
3118 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
3119 dsct->old_dom_cgrp = dsct->dom_cgrp;
3120 }
3121}
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131static void cgroup_propagate_control(struct cgroup *cgrp)
3132{
3133 struct cgroup *dsct;
3134 struct cgroup_subsys_state *d_css;
3135
3136 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3137 dsct->subtree_control &= cgroup_control(dsct);
3138 dsct->subtree_ss_mask =
3139 cgroup_calc_subtree_ss_mask(dsct->subtree_control,
3140 cgroup_ss_mask(dsct));
3141 }
3142}
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152static void cgroup_restore_control(struct cgroup *cgrp)
3153{
3154 struct cgroup *dsct;
3155 struct cgroup_subsys_state *d_css;
3156
3157 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3158 dsct->subtree_control = dsct->old_subtree_control;
3159 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3160 dsct->dom_cgrp = dsct->old_dom_cgrp;
3161 }
3162}
3163
3164static bool css_visible(struct cgroup_subsys_state *css)
3165{
3166 struct cgroup_subsys *ss = css->ss;
3167 struct cgroup *cgrp = css->cgroup;
3168
3169 if (cgroup_control(cgrp) & (1 << ss->id))
3170 return true;
3171 if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3172 return false;
3173 return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3174}
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189static int cgroup_apply_control_enable(struct cgroup *cgrp)
3190{
3191 struct cgroup *dsct;
3192 struct cgroup_subsys_state *d_css;
3193 struct cgroup_subsys *ss;
3194 int ssid, ret;
3195
3196 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3197 for_each_subsys(ss, ssid) {
3198 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3199
3200 if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3201 continue;
3202
3203 if (!css) {
3204 css = css_create(dsct, ss);
3205 if (IS_ERR(css))
3206 return PTR_ERR(css);
3207 }
3208
3209 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3210
3211 if (css_visible(css)) {
3212 ret = css_populate_dir(css);
3213 if (ret)
3214 return ret;
3215 }
3216 }
3217 }
3218
3219 return 0;
3220}
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235static void cgroup_apply_control_disable(struct cgroup *cgrp)
3236{
3237 struct cgroup *dsct;
3238 struct cgroup_subsys_state *d_css;
3239 struct cgroup_subsys *ss;
3240 int ssid;
3241
3242 cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3243 for_each_subsys(ss, ssid) {
3244 struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3245
3246 if (!css)
3247 continue;
3248
3249 WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3250
3251 if (css->parent &&
3252 !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3253 kill_css(css);
3254 } else if (!css_visible(css)) {
3255 css_clear_dir(css);
3256 if (ss->css_reset)
3257 ss->css_reset(css);
3258 }
3259 }
3260 }
3261}
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280static int cgroup_apply_control(struct cgroup *cgrp)
3281{
3282 int ret;
3283
3284 cgroup_propagate_control(cgrp);
3285
3286 ret = cgroup_apply_control_enable(cgrp);
3287 if (ret)
3288 return ret;
3289
3290
3291
3292
3293
3294
3295 ret = cgroup_update_dfl_csses(cgrp);
3296 if (ret)
3297 return ret;
3298
3299 return 0;
3300}
3301
3302
3303
3304
3305
3306
3307
3308
3309static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
3310{
3311 if (ret) {
3312 cgroup_restore_control(cgrp);
3313 cgroup_propagate_control(cgrp);
3314 }
3315
3316 cgroup_apply_control_disable(cgrp);
3317}
3318
3319static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3320{
3321 u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3322
3323
3324 if (!enable)
3325 return 0;
3326
3327
3328 if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3329 return -EOPNOTSUPP;
3330
3331
3332 if (cgroup_is_mixable(cgrp))
3333 return 0;
3334
3335 if (domain_enable) {
3336
3337 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3338 return -EOPNOTSUPP;
3339 } else {
3340
3341
3342
3343
3344
3345 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3346 return 0;
3347 }
3348
3349
3350
3351
3352
3353 if (cgroup_has_tasks(cgrp))
3354 return -EBUSY;
3355
3356 return 0;
3357}
3358
3359
3360static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3361 char *buf, size_t nbytes,
3362 loff_t off)
3363{
3364 u16 enable = 0, disable = 0;
3365 struct cgroup *cgrp, *child;
3366 struct cgroup_subsys *ss;
3367 char *tok;
3368 int ssid, ret;
3369
3370
3371
3372
3373
3374 buf = strstrip(buf);
3375 while ((tok = strsep(&buf, " "))) {
3376 if (tok[0] == '\0')
3377 continue;
3378 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3379 if (!cgroup_ssid_enabled(ssid) ||
3380 strcmp(tok + 1, ss->name))
3381 continue;
3382
3383 if (*tok == '+') {
3384 enable |= 1 << ssid;
3385 disable &= ~(1 << ssid);
3386 } else if (*tok == '-') {
3387 disable |= 1 << ssid;
3388 enable &= ~(1 << ssid);
3389 } else {
3390 return -EINVAL;
3391 }
3392 break;
3393 } while_each_subsys_mask();
3394 if (ssid == CGROUP_SUBSYS_COUNT)
3395 return -EINVAL;
3396 }
3397
3398 cgrp = cgroup_kn_lock_live(of->kn, true);
3399 if (!cgrp)
3400 return -ENODEV;
3401
3402 for_each_subsys(ss, ssid) {
3403 if (enable & (1 << ssid)) {
3404 if (cgrp->subtree_control & (1 << ssid)) {
3405 enable &= ~(1 << ssid);
3406 continue;
3407 }
3408
3409 if (!(cgroup_control(cgrp) & (1 << ssid))) {
3410 ret = -ENOENT;
3411 goto out_unlock;
3412 }
3413 } else if (disable & (1 << ssid)) {
3414 if (!(cgrp->subtree_control & (1 << ssid))) {
3415 disable &= ~(1 << ssid);
3416 continue;
3417 }
3418
3419
3420 cgroup_for_each_live_child(child, cgrp) {
3421 if (child->subtree_control & (1 << ssid)) {
3422 ret = -EBUSY;
3423 goto out_unlock;
3424 }
3425 }
3426 }
3427 }
3428
3429 if (!enable && !disable) {
3430 ret = 0;
3431 goto out_unlock;
3432 }
3433
3434 ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3435 if (ret)
3436 goto out_unlock;
3437
3438
3439 cgroup_save_control(cgrp);
3440
3441 cgrp->subtree_control |= enable;
3442 cgrp->subtree_control &= ~disable;
3443
3444 ret = cgroup_apply_control(cgrp);
3445 cgroup_finalize_control(cgrp, ret);
3446 if (ret)
3447 goto out_unlock;
3448
3449 kernfs_activate(cgrp->kn);
3450out_unlock:
3451 cgroup_kn_unlock(of->kn);
3452 return ret ?: nbytes;
3453}
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464static int cgroup_enable_threaded(struct cgroup *cgrp)
3465{
3466 struct cgroup *parent = cgroup_parent(cgrp);
3467 struct cgroup *dom_cgrp = parent->dom_cgrp;
3468 struct cgroup *dsct;
3469 struct cgroup_subsys_state *d_css;
3470 int ret;
3471
3472 lockdep_assert_held(&cgroup_mutex);
3473
3474
3475 if (cgroup_is_threaded(cgrp))
3476 return 0;
3477
3478
3479
3480
3481
3482
3483
3484 if (cgroup_is_populated(cgrp) ||
3485 cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3486 return -EOPNOTSUPP;
3487
3488
3489 if (!cgroup_is_valid_domain(dom_cgrp) ||
3490 !cgroup_can_be_thread_root(dom_cgrp))
3491 return -EOPNOTSUPP;
3492
3493
3494
3495
3496
3497 cgroup_save_control(cgrp);
3498
3499 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3500 if (dsct == cgrp || cgroup_is_threaded(dsct))
3501 dsct->dom_cgrp = dom_cgrp;
3502
3503 ret = cgroup_apply_control(cgrp);
3504 if (!ret)
3505 parent->nr_threaded_children++;
3506
3507 cgroup_finalize_control(cgrp, ret);
3508 return ret;
3509}
3510
3511static int cgroup_type_show(struct seq_file *seq, void *v)
3512{
3513 struct cgroup *cgrp = seq_css(seq)->cgroup;
3514
3515 if (cgroup_is_threaded(cgrp))
3516 seq_puts(seq, "threaded\n");
3517 else if (!cgroup_is_valid_domain(cgrp))
3518 seq_puts(seq, "domain invalid\n");
3519 else if (cgroup_is_thread_root(cgrp))
3520 seq_puts(seq, "domain threaded\n");
3521 else
3522 seq_puts(seq, "domain\n");
3523
3524 return 0;
3525}
3526
3527static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3528 size_t nbytes, loff_t off)
3529{
3530 struct cgroup *cgrp;
3531 int ret;
3532
3533
3534 if (strcmp(strstrip(buf), "threaded"))
3535 return -EINVAL;
3536
3537
3538 cgrp = cgroup_kn_lock_live(of->kn, true);
3539 if (!cgrp)
3540 return -ENOENT;
3541
3542
3543 ret = cgroup_enable_threaded(cgrp);
3544
3545 cgroup_kn_unlock(of->kn);
3546 return ret ?: nbytes;
3547}
3548
3549static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3550{
3551 struct cgroup *cgrp = seq_css(seq)->cgroup;
3552 int descendants = READ_ONCE(cgrp->max_descendants);
3553
3554 if (descendants == INT_MAX)
3555 seq_puts(seq, "max\n");
3556 else
3557 seq_printf(seq, "%d\n", descendants);
3558
3559 return 0;
3560}
3561
3562static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3563 char *buf, size_t nbytes, loff_t off)
3564{
3565 struct cgroup *cgrp;
3566 int descendants;
3567 ssize_t ret;
3568
3569 buf = strstrip(buf);
3570 if (!strcmp(buf, "max")) {
3571 descendants = INT_MAX;
3572 } else {
3573 ret = kstrtoint(buf, 0, &descendants);
3574 if (ret)
3575 return ret;
3576 }
3577
3578 if (descendants < 0)
3579 return -ERANGE;
3580
3581 cgrp = cgroup_kn_lock_live(of->kn, false);
3582 if (!cgrp)
3583 return -ENOENT;
3584
3585 cgrp->max_descendants = descendants;
3586
3587 cgroup_kn_unlock(of->kn);
3588
3589 return nbytes;
3590}
3591
3592static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3593{
3594 struct cgroup *cgrp = seq_css(seq)->cgroup;
3595 int depth = READ_ONCE(cgrp->max_depth);
3596
3597 if (depth == INT_MAX)
3598 seq_puts(seq, "max\n");
3599 else
3600 seq_printf(seq, "%d\n", depth);
3601
3602 return 0;
3603}
3604
3605static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3606 char *buf, size_t nbytes, loff_t off)
3607{
3608 struct cgroup *cgrp;
3609 ssize_t ret;
3610 int depth;
3611
3612 buf = strstrip(buf);
3613 if (!strcmp(buf, "max")) {
3614 depth = INT_MAX;
3615 } else {
3616 ret = kstrtoint(buf, 0, &depth);
3617 if (ret)
3618 return ret;
3619 }
3620
3621 if (depth < 0)
3622 return -ERANGE;
3623
3624 cgrp = cgroup_kn_lock_live(of->kn, false);
3625 if (!cgrp)
3626 return -ENOENT;
3627
3628 cgrp->max_depth = depth;
3629
3630 cgroup_kn_unlock(of->kn);
3631
3632 return nbytes;
3633}
3634
3635static int cgroup_events_show(struct seq_file *seq, void *v)
3636{
3637 struct cgroup *cgrp = seq_css(seq)->cgroup;
3638
3639 seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3640 seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3641
3642 return 0;
3643}
3644
3645static int cgroup_stat_show(struct seq_file *seq, void *v)
3646{
3647 struct cgroup *cgroup = seq_css(seq)->cgroup;
3648
3649 seq_printf(seq, "nr_descendants %d\n",
3650 cgroup->nr_descendants);
3651 seq_printf(seq, "nr_dying_descendants %d\n",
3652 cgroup->nr_dying_descendants);
3653
3654 return 0;
3655}
3656
3657static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3658 struct cgroup *cgrp, int ssid)
3659{
3660 struct cgroup_subsys *ss = cgroup_subsys[ssid];
3661 struct cgroup_subsys_state *css;
3662 int ret;
3663
3664 if (!ss->css_extra_stat_show)
3665 return 0;
3666
3667 css = cgroup_tryget_css(cgrp, ss);
3668 if (!css)
3669 return 0;
3670
3671 ret = ss->css_extra_stat_show(seq, css);
3672 css_put(css);
3673 return ret;
3674}
3675
3676static int cpu_stat_show(struct seq_file *seq, void *v)
3677{
3678 struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3679 int ret = 0;
3680
3681 cgroup_base_stat_cputime_show(seq);
3682#ifdef CONFIG_CGROUP_SCHED
3683 ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3684#endif
3685 return ret;
3686}
3687
3688#ifdef CONFIG_PSI
3689static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3690{
3691 struct cgroup *cgrp = seq_css(seq)->cgroup;
3692 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3693
3694 return psi_show(seq, psi, PSI_IO);
3695}
3696static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3697{
3698 struct cgroup *cgrp = seq_css(seq)->cgroup;
3699 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3700
3701 return psi_show(seq, psi, PSI_MEM);
3702}
3703static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3704{
3705 struct cgroup *cgrp = seq_css(seq)->cgroup;
3706 struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3707
3708 return psi_show(seq, psi, PSI_CPU);
3709}
3710
3711static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3712 size_t nbytes, enum psi_res res)
3713{
3714 struct cgroup_file_ctx *ctx = of->priv;
3715 struct psi_trigger *new;
3716 struct cgroup *cgrp;
3717 struct psi_group *psi;
3718
3719 cgrp = cgroup_kn_lock_live(of->kn, false);
3720 if (!cgrp)
3721 return -ENODEV;
3722
3723 cgroup_get(cgrp);
3724 cgroup_kn_unlock(of->kn);
3725
3726
3727 if (ctx->psi.trigger) {
3728 cgroup_put(cgrp);
3729 return -EBUSY;
3730 }
3731
3732 psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3733 new = psi_trigger_create(psi, buf, res);
3734 if (IS_ERR(new)) {
3735 cgroup_put(cgrp);
3736 return PTR_ERR(new);
3737 }
3738
3739 smp_store_release(&ctx->psi.trigger, new);
3740 cgroup_put(cgrp);
3741
3742 return nbytes;
3743}
3744
3745static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3746 char *buf, size_t nbytes,
3747 loff_t off)
3748{
3749 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3750}
3751
3752static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3753 char *buf, size_t nbytes,
3754 loff_t off)
3755{
3756 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3757}
3758
3759static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3760 char *buf, size_t nbytes,
3761 loff_t off)
3762{
3763 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3764}
3765
3766static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3767 poll_table *pt)
3768{
3769 struct cgroup_file_ctx *ctx = of->priv;
3770
3771 return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
3772}
3773
3774static void cgroup_pressure_release(struct kernfs_open_file *of)
3775{
3776 struct cgroup_file_ctx *ctx = of->priv;
3777
3778 psi_trigger_destroy(ctx->psi.trigger);
3779}
3780
3781bool cgroup_psi_enabled(void)
3782{
3783 return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
3784}
3785
3786#else
3787bool cgroup_psi_enabled(void)
3788{
3789 return false;
3790}
3791
3792#endif
3793
3794static int cgroup_freeze_show(struct seq_file *seq, void *v)
3795{
3796 struct cgroup *cgrp = seq_css(seq)->cgroup;
3797
3798 seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3799
3800 return 0;
3801}
3802
3803static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3804 char *buf, size_t nbytes, loff_t off)
3805{
3806 struct cgroup *cgrp;
3807 ssize_t ret;
3808 int freeze;
3809
3810 ret = kstrtoint(strstrip(buf), 0, &freeze);
3811 if (ret)
3812 return ret;
3813
3814 if (freeze < 0 || freeze > 1)
3815 return -ERANGE;
3816
3817 cgrp = cgroup_kn_lock_live(of->kn, false);
3818 if (!cgrp)
3819 return -ENOENT;
3820
3821 cgroup_freeze(cgrp, freeze);
3822
3823 cgroup_kn_unlock(of->kn);
3824
3825 return nbytes;
3826}
3827
3828static void __cgroup_kill(struct cgroup *cgrp)
3829{
3830 struct css_task_iter it;
3831 struct task_struct *task;
3832
3833 lockdep_assert_held(&cgroup_mutex);
3834
3835 spin_lock_irq(&css_set_lock);
3836 set_bit(CGRP_KILL, &cgrp->flags);
3837 spin_unlock_irq(&css_set_lock);
3838
3839 css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
3840 while ((task = css_task_iter_next(&it))) {
3841
3842 if (task->flags & PF_KTHREAD)
3843 continue;
3844
3845
3846 if (__fatal_signal_pending(task))
3847 continue;
3848
3849 send_sig(SIGKILL, task, 0);
3850 }
3851 css_task_iter_end(&it);
3852
3853 spin_lock_irq(&css_set_lock);
3854 clear_bit(CGRP_KILL, &cgrp->flags);
3855 spin_unlock_irq(&css_set_lock);
3856}
3857
3858static void cgroup_kill(struct cgroup *cgrp)
3859{
3860 struct cgroup_subsys_state *css;
3861 struct cgroup *dsct;
3862
3863 lockdep_assert_held(&cgroup_mutex);
3864
3865 cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
3866 __cgroup_kill(dsct);
3867}
3868
3869static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
3870 size_t nbytes, loff_t off)
3871{
3872 ssize_t ret = 0;
3873 int kill;
3874 struct cgroup *cgrp;
3875
3876 ret = kstrtoint(strstrip(buf), 0, &kill);
3877 if (ret)
3878 return ret;
3879
3880 if (kill != 1)
3881 return -ERANGE;
3882
3883 cgrp = cgroup_kn_lock_live(of->kn, false);
3884 if (!cgrp)
3885 return -ENOENT;
3886
3887
3888
3889
3890
3891
3892 if (cgroup_is_threaded(cgrp))
3893 ret = -EOPNOTSUPP;
3894 else
3895 cgroup_kill(cgrp);
3896
3897 cgroup_kn_unlock(of->kn);
3898
3899 return ret ?: nbytes;
3900}
3901
3902static int cgroup_file_open(struct kernfs_open_file *of)
3903{
3904 struct cftype *cft = of_cft(of);
3905 struct cgroup_file_ctx *ctx;
3906 int ret;
3907
3908 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
3909 if (!ctx)
3910 return -ENOMEM;
3911
3912 ctx->ns = current->nsproxy->cgroup_ns;
3913 get_cgroup_ns(ctx->ns);
3914 of->priv = ctx;
3915
3916 if (!cft->open)
3917 return 0;
3918
3919 ret = cft->open(of);
3920 if (ret) {
3921 put_cgroup_ns(ctx->ns);
3922 kfree(ctx);
3923 }
3924 return ret;
3925}
3926
3927static void cgroup_file_release(struct kernfs_open_file *of)
3928{
3929 struct cftype *cft = of_cft(of);
3930 struct cgroup_file_ctx *ctx = of->priv;
3931
3932 if (cft->release)
3933 cft->release(of);
3934 put_cgroup_ns(ctx->ns);
3935 kfree(ctx);
3936}
3937
3938static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3939 size_t nbytes, loff_t off)
3940{
3941 struct cgroup_file_ctx *ctx = of->priv;
3942 struct cgroup *cgrp = of->kn->parent->priv;
3943 struct cftype *cft = of_cft(of);
3944 struct cgroup_subsys_state *css;
3945 int ret;
3946
3947 if (!nbytes)
3948 return 0;
3949
3950
3951
3952
3953
3954
3955
3956 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3957 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3958 ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
3959 return -EPERM;
3960
3961 if (cft->write)
3962 return cft->write(of, buf, nbytes, off);
3963
3964
3965
3966
3967
3968
3969
3970 rcu_read_lock();
3971 css = cgroup_css(cgrp, cft->ss);
3972 rcu_read_unlock();
3973
3974 if (cft->write_u64) {
3975 unsigned long long v;
3976 ret = kstrtoull(buf, 0, &v);
3977 if (!ret)
3978 ret = cft->write_u64(css, cft, v);
3979 } else if (cft->write_s64) {
3980 long long v;
3981 ret = kstrtoll(buf, 0, &v);
3982 if (!ret)
3983 ret = cft->write_s64(css, cft, v);
3984 } else {
3985 ret = -EINVAL;
3986 }
3987
3988 return ret ?: nbytes;
3989}
3990
3991static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
3992{
3993 struct cftype *cft = of_cft(of);
3994
3995 if (cft->poll)
3996 return cft->poll(of, pt);
3997
3998 return kernfs_generic_poll(of, pt);
3999}
4000
4001static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
4002{
4003 return seq_cft(seq)->seq_start(seq, ppos);
4004}
4005
4006static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
4007{
4008 return seq_cft(seq)->seq_next(seq, v, ppos);
4009}
4010
4011static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
4012{
4013 if (seq_cft(seq)->seq_stop)
4014 seq_cft(seq)->seq_stop(seq, v);
4015}
4016
4017static int cgroup_seqfile_show(struct seq_file *m, void *arg)
4018{
4019 struct cftype *cft = seq_cft(m);
4020 struct cgroup_subsys_state *css = seq_css(m);
4021
4022 if (cft->seq_show)
4023 return cft->seq_show(m, arg);
4024
4025 if (cft->read_u64)
4026 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
4027 else if (cft->read_s64)
4028 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
4029 else
4030 return -EINVAL;
4031 return 0;
4032}
4033
4034static struct kernfs_ops cgroup_kf_single_ops = {
4035 .atomic_write_len = PAGE_SIZE,
4036 .open = cgroup_file_open,
4037 .release = cgroup_file_release,
4038 .write = cgroup_file_write,
4039 .poll = cgroup_file_poll,
4040 .seq_show = cgroup_seqfile_show,
4041};
4042
4043static struct kernfs_ops cgroup_kf_ops = {
4044 .atomic_write_len = PAGE_SIZE,
4045 .open = cgroup_file_open,
4046 .release = cgroup_file_release,
4047 .write = cgroup_file_write,
4048 .poll = cgroup_file_poll,
4049 .seq_start = cgroup_seqfile_start,
4050 .seq_next = cgroup_seqfile_next,
4051 .seq_stop = cgroup_seqfile_stop,
4052 .seq_show = cgroup_seqfile_show,
4053};
4054
4055
4056static int cgroup_kn_set_ugid(struct kernfs_node *kn)
4057{
4058 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
4059 .ia_uid = current_fsuid(),
4060 .ia_gid = current_fsgid(), };
4061
4062 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
4063 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
4064 return 0;
4065
4066 return kernfs_setattr(kn, &iattr);
4067}
4068
4069static void cgroup_file_notify_timer(struct timer_list *timer)
4070{
4071 cgroup_file_notify(container_of(timer, struct cgroup_file,
4072 notify_timer));
4073}
4074
4075static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
4076 struct cftype *cft)
4077{
4078 char name[CGROUP_FILE_NAME_MAX];
4079 struct kernfs_node *kn;
4080 struct lock_class_key *key = NULL;
4081 int ret;
4082
4083#ifdef CONFIG_DEBUG_LOCK_ALLOC
4084 key = &cft->lockdep_key;
4085#endif
4086 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
4087 cgroup_file_mode(cft),
4088 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
4089 0, cft->kf_ops, cft,
4090 NULL, key);
4091 if (IS_ERR(kn))
4092 return PTR_ERR(kn);
4093
4094 ret = cgroup_kn_set_ugid(kn);
4095 if (ret) {
4096 kernfs_remove(kn);
4097 return ret;
4098 }
4099
4100 if (cft->file_offset) {
4101 struct cgroup_file *cfile = (void *)css + cft->file_offset;
4102
4103 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
4104
4105 spin_lock_irq(&cgroup_file_kn_lock);
4106 cfile->kn = kn;
4107 spin_unlock_irq(&cgroup_file_kn_lock);
4108 }
4109
4110 return 0;
4111}
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123static int cgroup_addrm_files(struct cgroup_subsys_state *css,
4124 struct cgroup *cgrp, struct cftype cfts[],
4125 bool is_add)
4126{
4127 struct cftype *cft, *cft_end = NULL;
4128 int ret = 0;
4129
4130 lockdep_assert_held(&cgroup_mutex);
4131
4132restart:
4133 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
4134
4135 if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
4136 continue;
4137 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
4138 continue;
4139 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
4140 continue;
4141 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
4142 continue;
4143 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
4144 continue;
4145 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
4146 continue;
4147 if (is_add) {
4148 ret = cgroup_add_file(css, cgrp, cft);
4149 if (ret) {
4150 pr_warn("%s: failed to add %s, err=%d\n",
4151 __func__, cft->name, ret);
4152 cft_end = cft;
4153 is_add = false;
4154 goto restart;
4155 }
4156 } else {
4157 cgroup_rm_file(cgrp, cft);
4158 }
4159 }
4160 return ret;
4161}
4162
4163static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
4164{
4165 struct cgroup_subsys *ss = cfts[0].ss;
4166 struct cgroup *root = &ss->root->cgrp;
4167 struct cgroup_subsys_state *css;
4168 int ret = 0;
4169
4170 lockdep_assert_held(&cgroup_mutex);
4171
4172
4173 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
4174 struct cgroup *cgrp = css->cgroup;
4175
4176 if (!(css->flags & CSS_VISIBLE))
4177 continue;
4178
4179 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
4180 if (ret)
4181 break;
4182 }
4183
4184 if (is_add && !ret)
4185 kernfs_activate(root->kn);
4186 return ret;
4187}
4188
4189static void cgroup_exit_cftypes(struct cftype *cfts)
4190{
4191 struct cftype *cft;
4192
4193 for (cft = cfts; cft->name[0] != '\0'; cft++) {
4194
4195 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
4196 kfree(cft->kf_ops);
4197 cft->kf_ops = NULL;
4198 cft->ss = NULL;
4199
4200
4201 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
4202 }
4203}
4204
4205static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4206{
4207 struct cftype *cft;
4208
4209 for (cft = cfts; cft->name[0] != '\0'; cft++) {
4210 struct kernfs_ops *kf_ops;
4211
4212 WARN_ON(cft->ss || cft->kf_ops);
4213
4214 if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
4215 continue;
4216
4217 if (cft->seq_start)
4218 kf_ops = &cgroup_kf_ops;
4219 else
4220 kf_ops = &cgroup_kf_single_ops;
4221
4222
4223
4224
4225
4226 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
4227 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
4228 if (!kf_ops) {
4229 cgroup_exit_cftypes(cfts);
4230 return -ENOMEM;
4231 }
4232 kf_ops->atomic_write_len = cft->max_write_len;
4233 }
4234
4235 cft->kf_ops = kf_ops;
4236 cft->ss = ss;
4237 }
4238
4239 return 0;
4240}
4241
4242static int cgroup_rm_cftypes_locked(struct cftype *cfts)
4243{
4244 lockdep_assert_held(&cgroup_mutex);
4245
4246 if (!cfts || !cfts[0].ss)
4247 return -ENOENT;
4248
4249 list_del(&cfts->node);
4250 cgroup_apply_cftypes(cfts, false);
4251 cgroup_exit_cftypes(cfts);
4252 return 0;
4253}
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266int cgroup_rm_cftypes(struct cftype *cfts)
4267{
4268 int ret;
4269
4270 mutex_lock(&cgroup_mutex);
4271 ret = cgroup_rm_cftypes_locked(cfts);
4272 mutex_unlock(&cgroup_mutex);
4273 return ret;
4274}
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4291{
4292 int ret;
4293
4294 if (!cgroup_ssid_enabled(ss->id))
4295 return 0;
4296
4297 if (!cfts || cfts[0].name[0] == '\0')
4298 return 0;
4299
4300 ret = cgroup_init_cftypes(ss, cfts);
4301 if (ret)
4302 return ret;
4303
4304 mutex_lock(&cgroup_mutex);
4305
4306 list_add_tail(&cfts->node, &ss->cfts);
4307 ret = cgroup_apply_cftypes(cfts, true);
4308 if (ret)
4309 cgroup_rm_cftypes_locked(cfts);
4310
4311 mutex_unlock(&cgroup_mutex);
4312 return ret;
4313}
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4324{
4325 struct cftype *cft;
4326
4327 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4328 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4329 return cgroup_add_cftypes(ss, cfts);
4330}
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4341{
4342 struct cftype *cft;
4343
4344 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4345 cft->flags |= __CFTYPE_NOT_ON_DFL;
4346 return cgroup_add_cftypes(ss, cfts);
4347}
4348
4349
4350
4351
4352
4353
4354
4355void cgroup_file_notify(struct cgroup_file *cfile)
4356{
4357 unsigned long flags;
4358
4359 spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4360 if (cfile->kn) {
4361 unsigned long last = cfile->notified_at;
4362 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4363
4364 if (time_in_range(jiffies, last, next)) {
4365 timer_reduce(&cfile->notify_timer, next);
4366 } else {
4367 kernfs_notify(cfile->kn);
4368 cfile->notified_at = jiffies;
4369 }
4370 }
4371 spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4372}
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4392 struct cgroup_subsys_state *parent)
4393{
4394 struct cgroup_subsys_state *next;
4395
4396 cgroup_assert_mutex_or_rcu_locked();
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418 if (!pos) {
4419 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4420 } else if (likely(!(pos->flags & CSS_RELEASED))) {
4421 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
4422 } else {
4423 list_for_each_entry_rcu(next, &parent->children, sibling,
4424 lockdep_is_held(&cgroup_mutex))
4425 if (next->serial_nr > pos->serial_nr)
4426 break;
4427 }
4428
4429
4430
4431
4432
4433 if (&next->sibling != &parent->children)
4434 return next;
4435 return NULL;
4436}
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459struct cgroup_subsys_state *
4460css_next_descendant_pre(struct cgroup_subsys_state *pos,
4461 struct cgroup_subsys_state *root)
4462{
4463 struct cgroup_subsys_state *next;
4464
4465 cgroup_assert_mutex_or_rcu_locked();
4466
4467
4468 if (!pos)
4469 return root;
4470
4471
4472 next = css_next_child(NULL, pos);
4473 if (next)
4474 return next;
4475
4476
4477 while (pos != root) {
4478 next = css_next_child(pos, pos->parent);
4479 if (next)
4480 return next;
4481 pos = pos->parent;
4482 }
4483
4484 return NULL;
4485}
4486EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501struct cgroup_subsys_state *
4502css_rightmost_descendant(struct cgroup_subsys_state *pos)
4503{
4504 struct cgroup_subsys_state *last, *tmp;
4505
4506 cgroup_assert_mutex_or_rcu_locked();
4507
4508 do {
4509 last = pos;
4510
4511 pos = NULL;
4512 css_for_each_child(tmp, last)
4513 pos = tmp;
4514 } while (pos);
4515
4516 return last;
4517}
4518
4519static struct cgroup_subsys_state *
4520css_leftmost_descendant(struct cgroup_subsys_state *pos)
4521{
4522 struct cgroup_subsys_state *last;
4523
4524 do {
4525 last = pos;
4526 pos = css_next_child(NULL, pos);
4527 } while (pos);
4528
4529 return last;
4530}
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554struct cgroup_subsys_state *
4555css_next_descendant_post(struct cgroup_subsys_state *pos,
4556 struct cgroup_subsys_state *root)
4557{
4558 struct cgroup_subsys_state *next;
4559
4560 cgroup_assert_mutex_or_rcu_locked();
4561
4562
4563 if (!pos)
4564 return css_leftmost_descendant(root);
4565
4566
4567 if (pos == root)
4568 return NULL;
4569
4570
4571 next = css_next_child(pos, pos->parent);
4572 if (next)
4573 return css_leftmost_descendant(next);
4574
4575
4576 return pos->parent;
4577}
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587bool css_has_online_children(struct cgroup_subsys_state *css)
4588{
4589 struct cgroup_subsys_state *child;
4590 bool ret = false;
4591
4592 rcu_read_lock();
4593 css_for_each_child(child, css) {
4594 if (child->flags & CSS_ONLINE) {
4595 ret = true;
4596 break;
4597 }
4598 }
4599 rcu_read_unlock();
4600 return ret;
4601}
4602
4603static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4604{
4605 struct list_head *l;
4606 struct cgrp_cset_link *link;
4607 struct css_set *cset;
4608
4609 lockdep_assert_held(&css_set_lock);
4610
4611
4612 if (it->tcset_pos) {
4613 l = it->tcset_pos->next;
4614
4615 if (l != it->tcset_head) {
4616 it->tcset_pos = l;
4617 return container_of(l, struct css_set,
4618 threaded_csets_node);
4619 }
4620
4621 it->tcset_pos = NULL;
4622 }
4623
4624
4625 l = it->cset_pos;
4626 l = l->next;
4627 if (l == it->cset_head) {
4628 it->cset_pos = NULL;
4629 return NULL;
4630 }
4631
4632 if (it->ss) {
4633 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4634 } else {
4635 link = list_entry(l, struct cgrp_cset_link, cset_link);
4636 cset = link->cset;
4637 }
4638
4639 it->cset_pos = l;
4640
4641
4642 if (it->flags & CSS_TASK_ITER_THREADED) {
4643 if (it->cur_dcset)
4644 put_css_set_locked(it->cur_dcset);
4645 it->cur_dcset = cset;
4646 get_css_set(cset);
4647
4648 it->tcset_head = &cset->threaded_csets;
4649 it->tcset_pos = &cset->threaded_csets;
4650 }
4651
4652 return cset;
4653}
4654
4655
4656
4657
4658
4659
4660
4661static void css_task_iter_advance_css_set(struct css_task_iter *it)
4662{
4663 struct css_set *cset;
4664
4665 lockdep_assert_held(&css_set_lock);
4666
4667
4668 while ((cset = css_task_iter_next_css_set(it))) {
4669 if (!list_empty(&cset->tasks)) {
4670 it->cur_tasks_head = &cset->tasks;
4671 break;
4672 } else if (!list_empty(&cset->mg_tasks)) {
4673 it->cur_tasks_head = &cset->mg_tasks;
4674 break;
4675 } else if (!list_empty(&cset->dying_tasks)) {
4676 it->cur_tasks_head = &cset->dying_tasks;
4677 break;
4678 }
4679 }
4680 if (!cset) {
4681 it->task_pos = NULL;
4682 return;
4683 }
4684 it->task_pos = it->cur_tasks_head->next;
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701 if (it->cur_cset) {
4702 list_del(&it->iters_node);
4703 put_css_set_locked(it->cur_cset);
4704 }
4705 get_css_set(cset);
4706 it->cur_cset = cset;
4707 list_add(&it->iters_node, &cset->task_iters);
4708}
4709
4710static void css_task_iter_skip(struct css_task_iter *it,
4711 struct task_struct *task)
4712{
4713 lockdep_assert_held(&css_set_lock);
4714
4715 if (it->task_pos == &task->cg_list) {
4716 it->task_pos = it->task_pos->next;
4717 it->flags |= CSS_TASK_ITER_SKIPPED;
4718 }
4719}
4720
4721static void css_task_iter_advance(struct css_task_iter *it)
4722{
4723 struct task_struct *task;
4724
4725 lockdep_assert_held(&css_set_lock);
4726repeat:
4727 if (it->task_pos) {
4728
4729
4730
4731
4732
4733 if (it->flags & CSS_TASK_ITER_SKIPPED)
4734 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4735 else
4736 it->task_pos = it->task_pos->next;
4737
4738 if (it->task_pos == &it->cur_cset->tasks) {
4739 it->cur_tasks_head = &it->cur_cset->mg_tasks;
4740 it->task_pos = it->cur_tasks_head->next;
4741 }
4742 if (it->task_pos == &it->cur_cset->mg_tasks) {
4743 it->cur_tasks_head = &it->cur_cset->dying_tasks;
4744 it->task_pos = it->cur_tasks_head->next;
4745 }
4746 if (it->task_pos == &it->cur_cset->dying_tasks)
4747 css_task_iter_advance_css_set(it);
4748 } else {
4749
4750 css_task_iter_advance_css_set(it);
4751 }
4752
4753 if (!it->task_pos)
4754 return;
4755
4756 task = list_entry(it->task_pos, struct task_struct, cg_list);
4757
4758 if (it->flags & CSS_TASK_ITER_PROCS) {
4759
4760 if (!thread_group_leader(task))
4761 goto repeat;
4762
4763
4764 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
4765 !atomic_read(&task->signal->live))
4766 goto repeat;
4767 } else {
4768
4769 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
4770 goto repeat;
4771 }
4772}
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4786 struct css_task_iter *it)
4787{
4788 memset(it, 0, sizeof(*it));
4789
4790 spin_lock_irq(&css_set_lock);
4791
4792 it->ss = css->ss;
4793 it->flags = flags;
4794
4795 if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
4796 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4797 else
4798 it->cset_pos = &css->cgroup->cset_links;
4799
4800 it->cset_head = it->cset_pos;
4801
4802 css_task_iter_advance(it);
4803
4804 spin_unlock_irq(&css_set_lock);
4805}
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815struct task_struct *css_task_iter_next(struct css_task_iter *it)
4816{
4817 if (it->cur_task) {
4818 put_task_struct(it->cur_task);
4819 it->cur_task = NULL;
4820 }
4821
4822 spin_lock_irq(&css_set_lock);
4823
4824
4825 if (it->flags & CSS_TASK_ITER_SKIPPED)
4826 css_task_iter_advance(it);
4827
4828 if (it->task_pos) {
4829 it->cur_task = list_entry(it->task_pos, struct task_struct,
4830 cg_list);
4831 get_task_struct(it->cur_task);
4832 css_task_iter_advance(it);
4833 }
4834
4835 spin_unlock_irq(&css_set_lock);
4836
4837 return it->cur_task;
4838}
4839
4840
4841
4842
4843
4844
4845
4846void css_task_iter_end(struct css_task_iter *it)
4847{
4848 if (it->cur_cset) {
4849 spin_lock_irq(&css_set_lock);
4850 list_del(&it->iters_node);
4851 put_css_set_locked(it->cur_cset);
4852 spin_unlock_irq(&css_set_lock);
4853 }
4854
4855 if (it->cur_dcset)
4856 put_css_set(it->cur_dcset);
4857
4858 if (it->cur_task)
4859 put_task_struct(it->cur_task);
4860}
4861
4862static void cgroup_procs_release(struct kernfs_open_file *of)
4863{
4864 struct cgroup_file_ctx *ctx = of->priv;
4865
4866 if (ctx->procs.started)
4867 css_task_iter_end(&ctx->procs.iter);
4868}
4869
4870static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4871{
4872 struct kernfs_open_file *of = s->private;
4873 struct cgroup_file_ctx *ctx = of->priv;
4874
4875 if (pos)
4876 (*pos)++;
4877
4878 return css_task_iter_next(&ctx->procs.iter);
4879}
4880
4881static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4882 unsigned int iter_flags)
4883{
4884 struct kernfs_open_file *of = s->private;
4885 struct cgroup *cgrp = seq_css(s)->cgroup;
4886 struct cgroup_file_ctx *ctx = of->priv;
4887 struct css_task_iter *it = &ctx->procs.iter;
4888
4889
4890
4891
4892
4893 if (!ctx->procs.started) {
4894 if (WARN_ON_ONCE((*pos)))
4895 return ERR_PTR(-EINVAL);
4896 css_task_iter_start(&cgrp->self, iter_flags, it);
4897 ctx->procs.started = true;
4898 } else if (!(*pos)) {
4899 css_task_iter_end(it);
4900 css_task_iter_start(&cgrp->self, iter_flags, it);
4901 } else
4902 return it->cur_task;
4903
4904 return cgroup_procs_next(s, NULL, NULL);
4905}
4906
4907static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
4908{
4909 struct cgroup *cgrp = seq_css(s)->cgroup;
4910
4911
4912
4913
4914
4915
4916
4917 if (cgroup_is_threaded(cgrp))
4918 return ERR_PTR(-EOPNOTSUPP);
4919
4920 return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
4921 CSS_TASK_ITER_THREADED);
4922}
4923
4924static int cgroup_procs_show(struct seq_file *s, void *v)
4925{
4926 seq_printf(s, "%d\n", task_pid_vnr(v));
4927 return 0;
4928}
4929
4930static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4931{
4932 int ret;
4933 struct inode *inode;
4934
4935 lockdep_assert_held(&cgroup_mutex);
4936
4937 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4938 if (!inode)
4939 return -ENOMEM;
4940
4941 ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
4942 iput(inode);
4943 return ret;
4944}
4945
4946static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
4947 struct cgroup *dst_cgrp,
4948 struct super_block *sb,
4949 struct cgroup_namespace *ns)
4950{
4951 struct cgroup *com_cgrp = src_cgrp;
4952 int ret;
4953
4954 lockdep_assert_held(&cgroup_mutex);
4955
4956
4957 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
4958 com_cgrp = cgroup_parent(com_cgrp);
4959
4960
4961 ret = cgroup_may_write(com_cgrp, sb);
4962 if (ret)
4963 return ret;
4964
4965
4966
4967
4968
4969 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
4970 (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
4971 !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
4972 return -ENOENT;
4973
4974 return 0;
4975}
4976
4977static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4978 struct cgroup *dst_cgrp,
4979 struct super_block *sb, bool threadgroup,
4980 struct cgroup_namespace *ns)
4981{
4982 int ret = 0;
4983
4984 ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
4985 if (ret)
4986 return ret;
4987
4988 ret = cgroup_migrate_vet_dst(dst_cgrp);
4989 if (ret)
4990 return ret;
4991
4992 if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4993 ret = -EOPNOTSUPP;
4994
4995 return ret;
4996}
4997
4998static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
4999 bool threadgroup)
5000{
5001 struct cgroup_file_ctx *ctx = of->priv;
5002 struct cgroup *src_cgrp, *dst_cgrp;
5003 struct task_struct *task;
5004 const struct cred *saved_cred;
5005 ssize_t ret;
5006 bool threadgroup_locked;
5007
5008 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
5009 if (!dst_cgrp)
5010 return -ENODEV;
5011
5012 task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
5013 ret = PTR_ERR_OR_ZERO(task);
5014 if (ret)
5015 goto out_unlock;
5016
5017
5018 spin_lock_irq(&css_set_lock);
5019 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
5020 spin_unlock_irq(&css_set_lock);
5021
5022
5023
5024
5025
5026
5027 saved_cred = override_creds(of->file->f_cred);
5028 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
5029 of->file->f_path.dentry->d_sb,
5030 threadgroup, ctx->ns);
5031 revert_creds(saved_cred);
5032 if (ret)
5033 goto out_finish;
5034
5035 ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
5036
5037out_finish:
5038 cgroup_procs_write_finish(task, threadgroup_locked);
5039out_unlock:
5040 cgroup_kn_unlock(of->kn);
5041
5042 return ret;
5043}
5044
5045static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
5046 char *buf, size_t nbytes, loff_t off)
5047{
5048 return __cgroup_procs_write(of, buf, true) ?: nbytes;
5049}
5050
5051static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
5052{
5053 return __cgroup_procs_start(s, pos, 0);
5054}
5055
5056static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
5057 char *buf, size_t nbytes, loff_t off)
5058{
5059 return __cgroup_procs_write(of, buf, false) ?: nbytes;
5060}
5061
5062
5063static struct cftype cgroup_base_files[] = {
5064 {
5065 .name = "cgroup.type",
5066 .flags = CFTYPE_NOT_ON_ROOT,
5067 .seq_show = cgroup_type_show,
5068 .write = cgroup_type_write,
5069 },
5070 {
5071 .name = "cgroup.procs",
5072 .flags = CFTYPE_NS_DELEGATABLE,
5073 .file_offset = offsetof(struct cgroup, procs_file),
5074 .release = cgroup_procs_release,
5075 .seq_start = cgroup_procs_start,
5076 .seq_next = cgroup_procs_next,
5077 .seq_show = cgroup_procs_show,
5078 .write = cgroup_procs_write,
5079 },
5080 {
5081 .name = "cgroup.threads",
5082 .flags = CFTYPE_NS_DELEGATABLE,
5083 .release = cgroup_procs_release,
5084 .seq_start = cgroup_threads_start,
5085 .seq_next = cgroup_procs_next,
5086 .seq_show = cgroup_procs_show,
5087 .write = cgroup_threads_write,
5088 },
5089 {
5090 .name = "cgroup.controllers",
5091 .seq_show = cgroup_controllers_show,
5092 },
5093 {
5094 .name = "cgroup.subtree_control",
5095 .flags = CFTYPE_NS_DELEGATABLE,
5096 .seq_show = cgroup_subtree_control_show,
5097 .write = cgroup_subtree_control_write,
5098 },
5099 {
5100 .name = "cgroup.events",
5101 .flags = CFTYPE_NOT_ON_ROOT,
5102 .file_offset = offsetof(struct cgroup, events_file),
5103 .seq_show = cgroup_events_show,
5104 },
5105 {
5106 .name = "cgroup.max.descendants",
5107 .seq_show = cgroup_max_descendants_show,
5108 .write = cgroup_max_descendants_write,
5109 },
5110 {
5111 .name = "cgroup.max.depth",
5112 .seq_show = cgroup_max_depth_show,
5113 .write = cgroup_max_depth_write,
5114 },
5115 {
5116 .name = "cgroup.stat",
5117 .seq_show = cgroup_stat_show,
5118 },
5119 {
5120 .name = "cgroup.freeze",
5121 .flags = CFTYPE_NOT_ON_ROOT,
5122 .seq_show = cgroup_freeze_show,
5123 .write = cgroup_freeze_write,
5124 },
5125 {
5126 .name = "cgroup.kill",
5127 .flags = CFTYPE_NOT_ON_ROOT,
5128 .write = cgroup_kill_write,
5129 },
5130 {
5131 .name = "cpu.stat",
5132 .seq_show = cpu_stat_show,
5133 },
5134#ifdef CONFIG_PSI
5135 {
5136 .name = "io.pressure",
5137 .flags = CFTYPE_PRESSURE,
5138 .seq_show = cgroup_io_pressure_show,
5139 .write = cgroup_io_pressure_write,
5140 .poll = cgroup_pressure_poll,
5141 .release = cgroup_pressure_release,
5142 },
5143 {
5144 .name = "memory.pressure",
5145 .flags = CFTYPE_PRESSURE,
5146 .seq_show = cgroup_memory_pressure_show,
5147 .write = cgroup_memory_pressure_write,
5148 .poll = cgroup_pressure_poll,
5149 .release = cgroup_pressure_release,
5150 },
5151 {
5152 .name = "cpu.pressure",
5153 .flags = CFTYPE_PRESSURE,
5154 .seq_show = cgroup_cpu_pressure_show,
5155 .write = cgroup_cpu_pressure_write,
5156 .poll = cgroup_pressure_poll,
5157 .release = cgroup_pressure_release,
5158 },
5159#endif
5160 { }
5161};
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185static void css_free_rwork_fn(struct work_struct *work)
5186{
5187 struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
5188 struct cgroup_subsys_state, destroy_rwork);
5189 struct cgroup_subsys *ss = css->ss;
5190 struct cgroup *cgrp = css->cgroup;
5191
5192 percpu_ref_exit(&css->refcnt);
5193
5194 if (ss) {
5195
5196 struct cgroup_subsys_state *parent = css->parent;
5197 int id = css->id;
5198
5199 ss->css_free(css);
5200 cgroup_idr_remove(&ss->css_idr, id);
5201 cgroup_put(cgrp);
5202
5203 if (parent)
5204 css_put(parent);
5205 } else {
5206
5207 atomic_dec(&cgrp->root->nr_cgrps);
5208 cgroup1_pidlist_destroy_all(cgrp);
5209 cancel_work_sync(&cgrp->release_agent_work);
5210
5211 if (cgroup_parent(cgrp)) {
5212
5213
5214
5215
5216
5217
5218 cgroup_put(cgroup_parent(cgrp));
5219 kernfs_put(cgrp->kn);
5220 psi_cgroup_free(cgrp);
5221 cgroup_rstat_exit(cgrp);
5222 kfree(cgrp);
5223 } else {
5224
5225
5226
5227
5228
5229 cgroup_destroy_root(cgrp->root);
5230 }
5231 }
5232}
5233
5234static void css_release_work_fn(struct work_struct *work)
5235{
5236 struct cgroup_subsys_state *css =
5237 container_of(work, struct cgroup_subsys_state, destroy_work);
5238 struct cgroup_subsys *ss = css->ss;
5239 struct cgroup *cgrp = css->cgroup;
5240
5241 mutex_lock(&cgroup_mutex);
5242
5243 css->flags |= CSS_RELEASED;
5244 list_del_rcu(&css->sibling);
5245
5246 if (ss) {
5247
5248 if (!list_empty(&css->rstat_css_node)) {
5249 cgroup_rstat_flush(cgrp);
5250 list_del_rcu(&css->rstat_css_node);
5251 }
5252
5253 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
5254 if (ss->css_released)
5255 ss->css_released(css);
5256 } else {
5257 struct cgroup *tcgrp;
5258
5259
5260 TRACE_CGROUP_PATH(release, cgrp);
5261
5262 cgroup_rstat_flush(cgrp);
5263
5264 spin_lock_irq(&css_set_lock);
5265 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5266 tcgrp = cgroup_parent(tcgrp))
5267 tcgrp->nr_dying_descendants--;
5268 spin_unlock_irq(&css_set_lock);
5269
5270
5271
5272
5273
5274
5275
5276
5277 if (cgrp->kn)
5278 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5279 NULL);
5280 }
5281
5282 mutex_unlock(&cgroup_mutex);
5283
5284 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5285 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5286}
5287
5288static void css_release(struct percpu_ref *ref)
5289{
5290 struct cgroup_subsys_state *css =
5291 container_of(ref, struct cgroup_subsys_state, refcnt);
5292
5293 INIT_WORK(&css->destroy_work, css_release_work_fn);
5294 queue_work(cgroup_destroy_wq, &css->destroy_work);
5295}
5296
5297static void init_and_link_css(struct cgroup_subsys_state *css,
5298 struct cgroup_subsys *ss, struct cgroup *cgrp)
5299{
5300 lockdep_assert_held(&cgroup_mutex);
5301
5302 cgroup_get_live(cgrp);
5303
5304 memset(css, 0, sizeof(*css));
5305 css->cgroup = cgrp;
5306 css->ss = ss;
5307 css->id = -1;
5308 INIT_LIST_HEAD(&css->sibling);
5309 INIT_LIST_HEAD(&css->children);
5310 INIT_LIST_HEAD(&css->rstat_css_node);
5311 css->serial_nr = css_serial_nr_next++;
5312 atomic_set(&css->online_cnt, 0);
5313
5314 if (cgroup_parent(cgrp)) {
5315 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5316 css_get(css->parent);
5317 }
5318
5319 if (ss->css_rstat_flush)
5320 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5321
5322 BUG_ON(cgroup_css(cgrp, ss));
5323}
5324
5325
5326static int online_css(struct cgroup_subsys_state *css)
5327{
5328 struct cgroup_subsys *ss = css->ss;
5329 int ret = 0;
5330
5331 lockdep_assert_held(&cgroup_mutex);
5332
5333 if (ss->css_online)
5334 ret = ss->css_online(css);
5335 if (!ret) {
5336 css->flags |= CSS_ONLINE;
5337 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5338
5339 atomic_inc(&css->online_cnt);
5340 if (css->parent)
5341 atomic_inc(&css->parent->online_cnt);
5342 }
5343 return ret;
5344}
5345
5346
5347static void offline_css(struct cgroup_subsys_state *css)
5348{
5349 struct cgroup_subsys *ss = css->ss;
5350
5351 lockdep_assert_held(&cgroup_mutex);
5352
5353 if (!(css->flags & CSS_ONLINE))
5354 return;
5355
5356 if (ss->css_offline)
5357 ss->css_offline(css);
5358
5359 css->flags &= ~CSS_ONLINE;
5360 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5361
5362 wake_up_all(&css->cgroup->offline_waitq);
5363}
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5375 struct cgroup_subsys *ss)
5376{
5377 struct cgroup *parent = cgroup_parent(cgrp);
5378 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5379 struct cgroup_subsys_state *css;
5380 int err;
5381
5382 lockdep_assert_held(&cgroup_mutex);
5383
5384 css = ss->css_alloc(parent_css);
5385 if (!css)
5386 css = ERR_PTR(-ENOMEM);
5387 if (IS_ERR(css))
5388 return css;
5389
5390 init_and_link_css(css, ss, cgrp);
5391
5392 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5393 if (err)
5394 goto err_free_css;
5395
5396 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5397 if (err < 0)
5398 goto err_free_css;
5399 css->id = err;
5400
5401
5402 list_add_tail_rcu(&css->sibling, &parent_css->children);
5403 cgroup_idr_replace(&ss->css_idr, css, css->id);
5404
5405 err = online_css(css);
5406 if (err)
5407 goto err_list_del;
5408
5409 return css;
5410
5411err_list_del:
5412 list_del_rcu(&css->sibling);
5413err_free_css:
5414 list_del_rcu(&css->rstat_css_node);
5415 INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5416 queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5417 return ERR_PTR(err);
5418}
5419
5420
5421
5422
5423
5424
5425static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5426 umode_t mode)
5427{
5428 struct cgroup_root *root = parent->root;
5429 struct cgroup *cgrp, *tcgrp;
5430 struct kernfs_node *kn;
5431 int level = parent->level + 1;
5432 int ret;
5433
5434
5435 cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
5436 GFP_KERNEL);
5437 if (!cgrp)
5438 return ERR_PTR(-ENOMEM);
5439
5440 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5441 if (ret)
5442 goto out_free_cgrp;
5443
5444 ret = cgroup_rstat_init(cgrp);
5445 if (ret)
5446 goto out_cancel_ref;
5447
5448
5449 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5450 if (IS_ERR(kn)) {
5451 ret = PTR_ERR(kn);
5452 goto out_stat_exit;
5453 }
5454 cgrp->kn = kn;
5455
5456 init_cgroup_housekeeping(cgrp);
5457
5458 cgrp->self.parent = &parent->self;
5459 cgrp->root = root;
5460 cgrp->level = level;
5461
5462 ret = psi_cgroup_alloc(cgrp);
5463 if (ret)
5464 goto out_kernfs_remove;
5465
5466 ret = cgroup_bpf_inherit(cgrp);
5467 if (ret)
5468 goto out_psi_free;
5469
5470
5471
5472
5473
5474 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5475 if (cgrp->freezer.e_freeze) {
5476
5477
5478
5479
5480
5481
5482 set_bit(CGRP_FREEZE, &cgrp->flags);
5483 set_bit(CGRP_FROZEN, &cgrp->flags);
5484 }
5485
5486 spin_lock_irq(&css_set_lock);
5487 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5488 cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
5489
5490 if (tcgrp != cgrp) {
5491 tcgrp->nr_descendants++;
5492
5493
5494
5495
5496
5497
5498 if (cgrp->freezer.e_freeze)
5499 tcgrp->freezer.nr_frozen_descendants++;
5500 }
5501 }
5502 spin_unlock_irq(&css_set_lock);
5503
5504 if (notify_on_release(parent))
5505 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5506
5507 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5508 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5509
5510 cgrp->self.serial_nr = css_serial_nr_next++;
5511
5512
5513 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5514 atomic_inc(&root->nr_cgrps);
5515 cgroup_get_live(parent);
5516
5517
5518
5519
5520
5521 if (!cgroup_on_dfl(cgrp))
5522 cgrp->subtree_control = cgroup_control(cgrp);
5523
5524 cgroup_propagate_control(cgrp);
5525
5526 return cgrp;
5527
5528out_psi_free:
5529 psi_cgroup_free(cgrp);
5530out_kernfs_remove:
5531 kernfs_remove(cgrp->kn);
5532out_stat_exit:
5533 cgroup_rstat_exit(cgrp);
5534out_cancel_ref:
5535 percpu_ref_exit(&cgrp->self.refcnt);
5536out_free_cgrp:
5537 kfree(cgrp);
5538 return ERR_PTR(ret);
5539}
5540
5541static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5542{
5543 struct cgroup *cgroup;
5544 int ret = false;
5545 int level = 1;
5546
5547 lockdep_assert_held(&cgroup_mutex);
5548
5549 for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5550 if (cgroup->nr_descendants >= cgroup->max_descendants)
5551 goto fail;
5552
5553 if (level > cgroup->max_depth)
5554 goto fail;
5555
5556 level++;
5557 }
5558
5559 ret = true;
5560fail:
5561 return ret;
5562}
5563
5564int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5565{
5566 struct cgroup *parent, *cgrp;
5567 int ret;
5568
5569
5570 if (strchr(name, '\n'))
5571 return -EINVAL;
5572
5573 parent = cgroup_kn_lock_live(parent_kn, false);
5574 if (!parent)
5575 return -ENODEV;
5576
5577 if (!cgroup_check_hierarchy_limits(parent)) {
5578 ret = -EAGAIN;
5579 goto out_unlock;
5580 }
5581
5582 cgrp = cgroup_create(parent, name, mode);
5583 if (IS_ERR(cgrp)) {
5584 ret = PTR_ERR(cgrp);
5585 goto out_unlock;
5586 }
5587
5588
5589
5590
5591
5592 kernfs_get(cgrp->kn);
5593
5594 ret = cgroup_kn_set_ugid(cgrp->kn);
5595 if (ret)
5596 goto out_destroy;
5597
5598 ret = css_populate_dir(&cgrp->self);
5599 if (ret)
5600 goto out_destroy;
5601
5602 ret = cgroup_apply_control_enable(cgrp);
5603 if (ret)
5604 goto out_destroy;
5605
5606 TRACE_CGROUP_PATH(mkdir, cgrp);
5607
5608
5609 kernfs_activate(cgrp->kn);
5610
5611 ret = 0;
5612 goto out_unlock;
5613
5614out_destroy:
5615 cgroup_destroy_locked(cgrp);
5616out_unlock:
5617 cgroup_kn_unlock(parent_kn);
5618 return ret;
5619}
5620
5621
5622
5623
5624
5625
5626static void css_killed_work_fn(struct work_struct *work)
5627{
5628 struct cgroup_subsys_state *css =
5629 container_of(work, struct cgroup_subsys_state, destroy_work);
5630
5631 mutex_lock(&cgroup_mutex);
5632
5633 do {
5634 offline_css(css);
5635 css_put(css);
5636
5637 css = css->parent;
5638 } while (css && atomic_dec_and_test(&css->online_cnt));
5639
5640 mutex_unlock(&cgroup_mutex);
5641}
5642
5643
5644static void css_killed_ref_fn(struct percpu_ref *ref)
5645{
5646 struct cgroup_subsys_state *css =
5647 container_of(ref, struct cgroup_subsys_state, refcnt);
5648
5649 if (atomic_dec_and_test(&css->online_cnt)) {
5650 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5651 queue_work(cgroup_destroy_wq, &css->destroy_work);
5652 }
5653}
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664static void kill_css(struct cgroup_subsys_state *css)
5665{
5666 lockdep_assert_held(&cgroup_mutex);
5667
5668 if (css->flags & CSS_DYING)
5669 return;
5670
5671 css->flags |= CSS_DYING;
5672
5673
5674
5675
5676
5677 css_clear_dir(css);
5678
5679
5680
5681
5682
5683 css_get(css);
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5696}
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722static int cgroup_destroy_locked(struct cgroup *cgrp)
5723 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5724{
5725 struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5726 struct cgroup_subsys_state *css;
5727 struct cgrp_cset_link *link;
5728 int ssid;
5729
5730 lockdep_assert_held(&cgroup_mutex);
5731
5732
5733
5734
5735
5736 if (cgroup_is_populated(cgrp))
5737 return -EBUSY;
5738
5739
5740
5741
5742
5743
5744 if (css_has_online_children(&cgrp->self))
5745 return -EBUSY;
5746
5747
5748
5749
5750
5751
5752
5753 cgrp->self.flags &= ~CSS_ONLINE;
5754
5755 spin_lock_irq(&css_set_lock);
5756 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5757 link->cset->dead = true;
5758 spin_unlock_irq(&css_set_lock);
5759
5760
5761 for_each_css(css, ssid, cgrp)
5762 kill_css(css);
5763
5764
5765 css_clear_dir(&cgrp->self);
5766 kernfs_remove(cgrp->kn);
5767
5768 if (cgroup_is_threaded(cgrp))
5769 parent->nr_threaded_children--;
5770
5771 spin_lock_irq(&css_set_lock);
5772 for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5773 tcgrp->nr_descendants--;
5774 tcgrp->nr_dying_descendants++;
5775
5776
5777
5778
5779 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5780 tcgrp->freezer.nr_frozen_descendants--;
5781 }
5782 spin_unlock_irq(&css_set_lock);
5783
5784 cgroup1_check_for_release(parent);
5785
5786 cgroup_bpf_offline(cgrp);
5787
5788
5789 percpu_ref_kill(&cgrp->self.refcnt);
5790
5791 return 0;
5792};
5793
5794int cgroup_rmdir(struct kernfs_node *kn)
5795{
5796 struct cgroup *cgrp;
5797 int ret = 0;
5798
5799 cgrp = cgroup_kn_lock_live(kn, false);
5800 if (!cgrp)
5801 return 0;
5802
5803 ret = cgroup_destroy_locked(cgrp);
5804 if (!ret)
5805 TRACE_CGROUP_PATH(rmdir, cgrp);
5806
5807 cgroup_kn_unlock(kn);
5808 return ret;
5809}
5810
5811static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5812 .show_options = cgroup_show_options,
5813 .mkdir = cgroup_mkdir,
5814 .rmdir = cgroup_rmdir,
5815 .show_path = cgroup_show_path,
5816};
5817
5818static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5819{
5820 struct cgroup_subsys_state *css;
5821
5822 pr_debug("Initializing cgroup subsys %s\n", ss->name);
5823
5824 mutex_lock(&cgroup_mutex);
5825
5826 idr_init(&ss->css_idr);
5827 INIT_LIST_HEAD(&ss->cfts);
5828
5829
5830 ss->root = &cgrp_dfl_root;
5831 css = ss->css_alloc(NULL);
5832
5833 BUG_ON(IS_ERR(css));
5834 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5835
5836
5837
5838
5839
5840 css->flags |= CSS_NO_REF;
5841
5842 if (early) {
5843
5844 css->id = 1;
5845 } else {
5846 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5847 BUG_ON(css->id < 0);
5848 }
5849
5850
5851
5852
5853
5854 init_css_set.subsys[ss->id] = css;
5855
5856 have_fork_callback |= (bool)ss->fork << ss->id;
5857 have_exit_callback |= (bool)ss->exit << ss->id;
5858 have_release_callback |= (bool)ss->release << ss->id;
5859 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5860
5861
5862
5863
5864 BUG_ON(!list_empty(&init_task.tasks));
5865
5866 BUG_ON(online_css(css));
5867
5868 mutex_unlock(&cgroup_mutex);
5869}
5870
5871
5872
5873
5874
5875
5876
5877int __init cgroup_init_early(void)
5878{
5879 static struct cgroup_fs_context __initdata ctx;
5880 struct cgroup_subsys *ss;
5881 int i;
5882
5883 ctx.root = &cgrp_dfl_root;
5884 init_cgroup_root(&ctx);
5885 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5886
5887 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5888
5889 for_each_subsys(ss, i) {
5890 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5891 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5892 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5893 ss->id, ss->name);
5894 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5895 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5896
5897 ss->id = i;
5898 ss->name = cgroup_subsys_name[i];
5899 if (!ss->legacy_name)
5900 ss->legacy_name = cgroup_subsys_name[i];
5901
5902 if (ss->early_init)
5903 cgroup_init_subsys(ss, true);
5904 }
5905 return 0;
5906}
5907
5908
5909
5910
5911
5912
5913
5914int __init cgroup_init(void)
5915{
5916 struct cgroup_subsys *ss;
5917 int ssid;
5918
5919 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5920 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5921 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5922
5923 cgroup_rstat_boot();
5924
5925 get_user_ns(init_cgroup_ns.user_ns);
5926
5927 mutex_lock(&cgroup_mutex);
5928
5929
5930
5931
5932
5933 hash_add(css_set_table, &init_css_set.hlist,
5934 css_set_hash(init_css_set.subsys));
5935
5936 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5937
5938 mutex_unlock(&cgroup_mutex);
5939
5940 for_each_subsys(ss, ssid) {
5941 if (ss->early_init) {
5942 struct cgroup_subsys_state *css =
5943 init_css_set.subsys[ss->id];
5944
5945 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5946 GFP_KERNEL);
5947 BUG_ON(css->id < 0);
5948 } else {
5949 cgroup_init_subsys(ss, false);
5950 }
5951
5952 list_add_tail(&init_css_set.e_cset_node[ssid],
5953 &cgrp_dfl_root.cgrp.e_csets[ssid]);
5954
5955
5956
5957
5958
5959
5960 if (!cgroup_ssid_enabled(ssid))
5961 continue;
5962
5963 if (cgroup1_ssid_disabled(ssid))
5964 printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5965 ss->name);
5966
5967 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5968
5969
5970 WARN_ON(ss->implicit_on_dfl && !ss->threaded);
5971
5972 if (ss->implicit_on_dfl)
5973 cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
5974 else if (!ss->dfl_cftypes)
5975 cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5976
5977 if (ss->threaded)
5978 cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
5979
5980 if (ss->dfl_cftypes == ss->legacy_cftypes) {
5981 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5982 } else {
5983 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5984 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5985 }
5986
5987 if (ss->bind)
5988 ss->bind(init_css_set.subsys[ssid]);
5989
5990 mutex_lock(&cgroup_mutex);
5991 css_populate_dir(init_css_set.subsys[ssid]);
5992 mutex_unlock(&cgroup_mutex);
5993 }
5994
5995
5996 hash_del(&init_css_set.hlist);
5997 hash_add(css_set_table, &init_css_set.hlist,
5998 css_set_hash(init_css_set.subsys));
5999
6000 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
6001 WARN_ON(register_filesystem(&cgroup_fs_type));
6002 WARN_ON(register_filesystem(&cgroup2_fs_type));
6003 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
6004#ifdef CONFIG_CPUSETS
6005 WARN_ON(register_filesystem(&cpuset_fs_type));
6006#endif
6007
6008 return 0;
6009}
6010
6011static int __init cgroup_wq_init(void)
6012{
6013
6014
6015
6016
6017
6018
6019
6020
6021 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
6022 BUG_ON(!cgroup_destroy_wq);
6023 return 0;
6024}
6025core_initcall(cgroup_wq_init);
6026
6027void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
6028{
6029 struct kernfs_node *kn;
6030
6031 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
6032 if (!kn)
6033 return;
6034 kernfs_path(kn, buf, buflen);
6035 kernfs_put(kn);
6036}
6037
6038
6039
6040
6041
6042
6043struct cgroup *cgroup_get_from_id(u64 id)
6044{
6045 struct kernfs_node *kn;
6046 struct cgroup *cgrp = NULL;
6047
6048 kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
6049 if (!kn)
6050 goto out;
6051
6052 if (kernfs_type(kn) != KERNFS_DIR)
6053 goto put;
6054
6055 rcu_read_lock();
6056
6057 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6058 if (cgrp && !cgroup_tryget(cgrp))
6059 cgrp = NULL;
6060
6061 rcu_read_unlock();
6062put:
6063 kernfs_put(kn);
6064out:
6065 return cgrp;
6066}
6067EXPORT_SYMBOL_GPL(cgroup_get_from_id);
6068
6069
6070
6071
6072
6073
6074int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
6075 struct pid *pid, struct task_struct *tsk)
6076{
6077 char *buf;
6078 int retval;
6079 struct cgroup_root *root;
6080
6081 retval = -ENOMEM;
6082 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6083 if (!buf)
6084 goto out;
6085
6086 mutex_lock(&cgroup_mutex);
6087 spin_lock_irq(&css_set_lock);
6088
6089 for_each_root(root) {
6090 struct cgroup_subsys *ss;
6091 struct cgroup *cgrp;
6092 int ssid, count = 0;
6093
6094 if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
6095 continue;
6096
6097 seq_printf(m, "%d:", root->hierarchy_id);
6098 if (root != &cgrp_dfl_root)
6099 for_each_subsys(ss, ssid)
6100 if (root->subsys_mask & (1 << ssid))
6101 seq_printf(m, "%s%s", count++ ? "," : "",
6102 ss->legacy_name);
6103 if (strlen(root->name))
6104 seq_printf(m, "%sname=%s", count ? "," : "",
6105 root->name);
6106 seq_putc(m, ':');
6107
6108 cgrp = task_cgroup_from_root(tsk, root);
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
6120 retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
6121 current->nsproxy->cgroup_ns);
6122 if (retval >= PATH_MAX)
6123 retval = -ENAMETOOLONG;
6124 if (retval < 0)
6125 goto out_unlock;
6126
6127 seq_puts(m, buf);
6128 } else {
6129 seq_puts(m, "/");
6130 }
6131
6132 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
6133 seq_puts(m, " (deleted)\n");
6134 else
6135 seq_putc(m, '\n');
6136 }
6137
6138 retval = 0;
6139out_unlock:
6140 spin_unlock_irq(&css_set_lock);
6141 mutex_unlock(&cgroup_mutex);
6142 kfree(buf);
6143out:
6144 return retval;
6145}
6146
6147
6148
6149
6150
6151
6152
6153
6154void cgroup_fork(struct task_struct *child)
6155{
6156 RCU_INIT_POINTER(child->cgroups, &init_css_set);
6157 INIT_LIST_HEAD(&child->cg_list);
6158}
6159
6160static struct cgroup *cgroup_get_from_file(struct file *f)
6161{
6162 struct cgroup_subsys_state *css;
6163 struct cgroup *cgrp;
6164
6165 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6166 if (IS_ERR(css))
6167 return ERR_CAST(css);
6168
6169 cgrp = css->cgroup;
6170 if (!cgroup_on_dfl(cgrp)) {
6171 cgroup_put(cgrp);
6172 return ERR_PTR(-EBADF);
6173 }
6174
6175 return cgrp;
6176}
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
6195 __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
6196{
6197 int ret;
6198 struct cgroup *dst_cgrp = NULL;
6199 struct css_set *cset;
6200 struct super_block *sb;
6201 struct file *f;
6202
6203 if (kargs->flags & CLONE_INTO_CGROUP)
6204 mutex_lock(&cgroup_mutex);
6205
6206 cgroup_threadgroup_change_begin(current);
6207
6208 spin_lock_irq(&css_set_lock);
6209 cset = task_css_set(current);
6210 get_css_set(cset);
6211 spin_unlock_irq(&css_set_lock);
6212
6213 if (!(kargs->flags & CLONE_INTO_CGROUP)) {
6214 kargs->cset = cset;
6215 return 0;
6216 }
6217
6218 f = fget_raw(kargs->cgroup);
6219 if (!f) {
6220 ret = -EBADF;
6221 goto err;
6222 }
6223 sb = f->f_path.dentry->d_sb;
6224
6225 dst_cgrp = cgroup_get_from_file(f);
6226 if (IS_ERR(dst_cgrp)) {
6227 ret = PTR_ERR(dst_cgrp);
6228 dst_cgrp = NULL;
6229 goto err;
6230 }
6231
6232 if (cgroup_is_dead(dst_cgrp)) {
6233 ret = -ENODEV;
6234 goto err;
6235 }
6236
6237
6238
6239
6240
6241
6242 ret = cgroup_may_write(dst_cgrp, sb);
6243 if (ret)
6244 goto err;
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260 ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
6261 !(kargs->flags & CLONE_THREAD),
6262 current->nsproxy->cgroup_ns);
6263 if (ret)
6264 goto err;
6265
6266 kargs->cset = find_css_set(cset, dst_cgrp);
6267 if (!kargs->cset) {
6268 ret = -ENOMEM;
6269 goto err;
6270 }
6271
6272 put_css_set(cset);
6273 fput(f);
6274 kargs->cgrp = dst_cgrp;
6275 return ret;
6276
6277err:
6278 cgroup_threadgroup_change_end(current);
6279 mutex_unlock(&cgroup_mutex);
6280 if (f)
6281 fput(f);
6282 if (dst_cgrp)
6283 cgroup_put(dst_cgrp);
6284 put_css_set(cset);
6285 if (kargs->cset)
6286 put_css_set(kargs->cset);
6287 return ret;
6288}
6289
6290
6291
6292
6293
6294
6295
6296
6297static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6298 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6299{
6300 cgroup_threadgroup_change_end(current);
6301
6302 if (kargs->flags & CLONE_INTO_CGROUP) {
6303 struct cgroup *cgrp = kargs->cgrp;
6304 struct css_set *cset = kargs->cset;
6305
6306 mutex_unlock(&cgroup_mutex);
6307
6308 if (cset) {
6309 put_css_set(cset);
6310 kargs->cset = NULL;
6311 }
6312
6313 if (cgrp) {
6314 cgroup_put(cgrp);
6315 kargs->cgrp = NULL;
6316 }
6317 }
6318}
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
6332{
6333 struct cgroup_subsys *ss;
6334 int i, j, ret;
6335
6336 ret = cgroup_css_set_fork(kargs);
6337 if (ret)
6338 return ret;
6339
6340 do_each_subsys_mask(ss, i, have_canfork_callback) {
6341 ret = ss->can_fork(child, kargs->cset);
6342 if (ret)
6343 goto out_revert;
6344 } while_each_subsys_mask();
6345
6346 return 0;
6347
6348out_revert:
6349 for_each_subsys(ss, j) {
6350 if (j >= i)
6351 break;
6352 if (ss->cancel_fork)
6353 ss->cancel_fork(child, kargs->cset);
6354 }
6355
6356 cgroup_css_set_put_fork(kargs);
6357
6358 return ret;
6359}
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370void cgroup_cancel_fork(struct task_struct *child,
6371 struct kernel_clone_args *kargs)
6372{
6373 struct cgroup_subsys *ss;
6374 int i;
6375
6376 for_each_subsys(ss, i)
6377 if (ss->cancel_fork)
6378 ss->cancel_fork(child, kargs->cset);
6379
6380 cgroup_css_set_put_fork(kargs);
6381}
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391void cgroup_post_fork(struct task_struct *child,
6392 struct kernel_clone_args *kargs)
6393 __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6394{
6395 unsigned long cgrp_flags = 0;
6396 bool kill = false;
6397 struct cgroup_subsys *ss;
6398 struct css_set *cset;
6399 int i;
6400
6401 cset = kargs->cset;
6402 kargs->cset = NULL;
6403
6404 spin_lock_irq(&css_set_lock);
6405
6406
6407 if (likely(child->pid)) {
6408 if (kargs->cgrp)
6409 cgrp_flags = kargs->cgrp->flags;
6410 else
6411 cgrp_flags = cset->dfl_cgrp->flags;
6412
6413 WARN_ON_ONCE(!list_empty(&child->cg_list));
6414 cset->nr_tasks++;
6415 css_set_move_task(child, NULL, cset, false);
6416 } else {
6417 put_css_set(cset);
6418 cset = NULL;
6419 }
6420
6421 if (!(child->flags & PF_KTHREAD)) {
6422 if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
6423
6424
6425
6426
6427
6428 spin_lock(&child->sighand->siglock);
6429 WARN_ON_ONCE(child->frozen);
6430 child->jobctl |= JOBCTL_TRAP_FREEZE;
6431 spin_unlock(&child->sighand->siglock);
6432
6433
6434
6435
6436
6437
6438
6439 }
6440
6441
6442
6443
6444
6445
6446 kill = test_bit(CGRP_KILL, &cgrp_flags);
6447 }
6448
6449 spin_unlock_irq(&css_set_lock);
6450
6451
6452
6453
6454
6455
6456 do_each_subsys_mask(ss, i, have_fork_callback) {
6457 ss->fork(child);
6458 } while_each_subsys_mask();
6459
6460
6461 if (kargs->flags & CLONE_NEWCGROUP) {
6462 struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6463
6464 get_css_set(cset);
6465 child->nsproxy->cgroup_ns->root_cset = cset;
6466 put_css_set(rcset);
6467 }
6468
6469
6470 if (unlikely(kill))
6471 do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
6472
6473 cgroup_css_set_put_fork(kargs);
6474}
6475
6476
6477
6478
6479
6480
6481
6482
6483void cgroup_exit(struct task_struct *tsk)
6484{
6485 struct cgroup_subsys *ss;
6486 struct css_set *cset;
6487 int i;
6488
6489 spin_lock_irq(&css_set_lock);
6490
6491 WARN_ON_ONCE(list_empty(&tsk->cg_list));
6492 cset = task_css_set(tsk);
6493 css_set_move_task(tsk, cset, NULL, false);
6494 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6495 cset->nr_tasks--;
6496
6497 WARN_ON_ONCE(cgroup_task_frozen(tsk));
6498 if (unlikely(!(tsk->flags & PF_KTHREAD) &&
6499 test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
6500 cgroup_update_frozen(task_dfl_cgroup(tsk));
6501
6502 spin_unlock_irq(&css_set_lock);
6503
6504
6505 do_each_subsys_mask(ss, i, have_exit_callback) {
6506 ss->exit(tsk);
6507 } while_each_subsys_mask();
6508}
6509
6510void cgroup_release(struct task_struct *task)
6511{
6512 struct cgroup_subsys *ss;
6513 int ssid;
6514
6515 do_each_subsys_mask(ss, ssid, have_release_callback) {
6516 ss->release(task);
6517 } while_each_subsys_mask();
6518
6519 spin_lock_irq(&css_set_lock);
6520 css_set_skip_task_iters(task_css_set(task), task);
6521 list_del_init(&task->cg_list);
6522 spin_unlock_irq(&css_set_lock);
6523}
6524
6525void cgroup_free(struct task_struct *task)
6526{
6527 struct css_set *cset = task_css_set(task);
6528 put_css_set(cset);
6529}
6530
6531static int __init cgroup_disable(char *str)
6532{
6533 struct cgroup_subsys *ss;
6534 char *token;
6535 int i;
6536
6537 while ((token = strsep(&str, ",")) != NULL) {
6538 if (!*token)
6539 continue;
6540
6541 for_each_subsys(ss, i) {
6542 if (strcmp(token, ss->name) &&
6543 strcmp(token, ss->legacy_name))
6544 continue;
6545
6546 static_branch_disable(cgroup_subsys_enabled_key[i]);
6547 pr_info("Disabling %s control group subsystem\n",
6548 ss->name);
6549 }
6550
6551 for (i = 0; i < OPT_FEATURE_COUNT; i++) {
6552 if (strcmp(token, cgroup_opt_feature_names[i]))
6553 continue;
6554 cgroup_feature_disable_mask |= 1 << i;
6555 pr_info("Disabling %s control group feature\n",
6556 cgroup_opt_feature_names[i]);
6557 break;
6558 }
6559 }
6560 return 1;
6561}
6562__setup("cgroup_disable=", cgroup_disable);
6563
6564void __init __weak enable_debug_cgroup(void) { }
6565
6566static int __init enable_cgroup_debug(char *str)
6567{
6568 cgroup_debug = true;
6569 enable_debug_cgroup();
6570 return 1;
6571}
6572__setup("cgroup_debug", enable_cgroup_debug);
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6584 struct cgroup_subsys *ss)
6585{
6586 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6587 struct file_system_type *s_type = dentry->d_sb->s_type;
6588 struct cgroup_subsys_state *css = NULL;
6589 struct cgroup *cgrp;
6590
6591
6592 if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6593 !kn || kernfs_type(kn) != KERNFS_DIR)
6594 return ERR_PTR(-EBADF);
6595
6596 rcu_read_lock();
6597
6598
6599
6600
6601
6602
6603 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6604 if (cgrp)
6605 css = cgroup_css(cgrp, ss);
6606
6607 if (!css || !css_tryget_online(css))
6608 css = ERR_PTR(-ENOENT);
6609
6610 rcu_read_unlock();
6611 return css;
6612}
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6623{
6624 WARN_ON_ONCE(!rcu_read_lock_held());
6625 return idr_find(&ss->css_idr, id);
6626}
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637struct cgroup *cgroup_get_from_path(const char *path)
6638{
6639 struct kernfs_node *kn;
6640 struct cgroup *cgrp = ERR_PTR(-ENOENT);
6641 struct cgroup *root_cgrp;
6642
6643 spin_lock_irq(&css_set_lock);
6644 root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
6645 kn = kernfs_walk_and_get(root_cgrp->kn, path);
6646 spin_unlock_irq(&css_set_lock);
6647 if (!kn)
6648 goto out;
6649
6650 if (kernfs_type(kn) != KERNFS_DIR) {
6651 cgrp = ERR_PTR(-ENOTDIR);
6652 goto out_kernfs;
6653 }
6654
6655 rcu_read_lock();
6656
6657 cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6658 if (!cgrp || !cgroup_tryget(cgrp))
6659 cgrp = ERR_PTR(-ENOENT);
6660
6661 rcu_read_unlock();
6662
6663out_kernfs:
6664 kernfs_put(kn);
6665out:
6666 return cgrp;
6667}
6668EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679struct cgroup *cgroup_get_from_fd(int fd)
6680{
6681 struct cgroup *cgrp;
6682 struct file *f;
6683
6684 f = fget_raw(fd);
6685 if (!f)
6686 return ERR_PTR(-EBADF);
6687
6688 cgrp = cgroup_get_from_file(f);
6689 fput(f);
6690 return cgrp;
6691}
6692EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6693
6694static u64 power_of_ten(int power)
6695{
6696 u64 v = 1;
6697 while (power--)
6698 v *= 10;
6699 return v;
6700}
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6717{
6718 s64 whole, frac = 0;
6719 int fstart = 0, fend = 0, flen;
6720
6721 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6722 return -EINVAL;
6723 if (frac < 0)
6724 return -EINVAL;
6725
6726 flen = fend > fstart ? fend - fstart : 0;
6727 if (flen < dec_shift)
6728 frac *= power_of_ten(dec_shift - flen);
6729 else
6730 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6731
6732 *v = whole * power_of_ten(dec_shift) + frac;
6733 return 0;
6734}
6735
6736
6737
6738
6739
6740#ifdef CONFIG_SOCK_CGROUP_DATA
6741
6742void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6743{
6744 struct cgroup *cgroup;
6745
6746 rcu_read_lock();
6747
6748 if (in_interrupt()) {
6749 cgroup = &cgrp_dfl_root.cgrp;
6750 cgroup_get(cgroup);
6751 goto out;
6752 }
6753
6754 while (true) {
6755 struct css_set *cset;
6756
6757 cset = task_css_set(current);
6758 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6759 cgroup = cset->dfl_cgrp;
6760 break;
6761 }
6762 cpu_relax();
6763 }
6764out:
6765 skcd->cgroup = cgroup;
6766 cgroup_bpf_get(cgroup);
6767 rcu_read_unlock();
6768}
6769
6770void cgroup_sk_clone(struct sock_cgroup_data *skcd)
6771{
6772 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6773
6774
6775
6776
6777
6778
6779 cgroup_get(cgrp);
6780 cgroup_bpf_get(cgrp);
6781}
6782
6783void cgroup_sk_free(struct sock_cgroup_data *skcd)
6784{
6785 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6786
6787 cgroup_bpf_put(cgrp);
6788 cgroup_put(cgrp);
6789}
6790
6791#endif
6792
6793#ifdef CONFIG_SYSFS
6794static ssize_t show_delegatable_files(struct cftype *files, char *buf,
6795 ssize_t size, const char *prefix)
6796{
6797 struct cftype *cft;
6798 ssize_t ret = 0;
6799
6800 for (cft = files; cft && cft->name[0] != '\0'; cft++) {
6801 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
6802 continue;
6803
6804 if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
6805 continue;
6806
6807 if (prefix)
6808 ret += snprintf(buf + ret, size - ret, "%s.", prefix);
6809
6810 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
6811
6812 if (WARN_ON(ret >= size))
6813 break;
6814 }
6815
6816 return ret;
6817}
6818
6819static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
6820 char *buf)
6821{
6822 struct cgroup_subsys *ss;
6823 int ssid;
6824 ssize_t ret = 0;
6825
6826 ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
6827 NULL);
6828
6829 for_each_subsys(ss, ssid)
6830 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
6831 PAGE_SIZE - ret,
6832 cgroup_subsys_name[ssid]);
6833
6834 return ret;
6835}
6836static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6837
6838static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6839 char *buf)
6840{
6841 return snprintf(buf, PAGE_SIZE,
6842 "nsdelegate\n"
6843 "favordynmods\n"
6844 "memory_localevents\n"
6845 "memory_recursiveprot\n");
6846}
6847static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6848
6849static struct attribute *cgroup_sysfs_attrs[] = {
6850 &cgroup_delegate_attr.attr,
6851 &cgroup_features_attr.attr,
6852 NULL,
6853};
6854
6855static const struct attribute_group cgroup_sysfs_attr_group = {
6856 .attrs = cgroup_sysfs_attrs,
6857 .name = "cgroup",
6858};
6859
6860static int __init cgroup_sysfs_init(void)
6861{
6862 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6863}
6864subsys_initcall(cgroup_sysfs_init);
6865
6866#endif
6867