1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include <linux/cgroup.h>
26#include <linux/errno.h>
27#include <linux/fs.h>
28#include <linux/kernel.h>
29#include <linux/list.h>
30#include <linux/mm.h>
31#include <linux/mutex.h>
32#include <linux/mount.h>
33#include <linux/pagemap.h>
34#include <linux/proc_fs.h>
35#include <linux/rcupdate.h>
36#include <linux/sched.h>
37#include <linux/backing-dev.h>
38#include <linux/seq_file.h>
39#include <linux/slab.h>
40#include <linux/magic.h>
41#include <linux/spinlock.h>
42#include <linux/string.h>
43#include <linux/sort.h>
44#include <linux/kmod.h>
45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h>
47#include <linux/hash.h>
48#include <linux/namei.h>
49
50#include <asm/atomic.h>
51
/*
 * Master lock for this file: taken around hierarchy mounts/remounts,
 * subsystem rebinding, and cgroup creation/removal paths below.
 */
static DEFINE_MUTEX(cgroup_mutex);

/* Generate an array of pointers to every registered cgroup subsystem. */
#define SUBSYS(_x) &_x ## _subsys,

static struct cgroup_subsys *subsys[] = {
#include <linux/cgroup_subsys.h>
};
60
61
62
63
64
65
/*
 * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
 * associated with a superblock to form an active hierarchy.
 */
struct cgroupfs_root {
	struct super_block *sb;

	/*
	 * The bitmask of subsystems intended to be attached to this
	 * hierarchy.
	 */
	unsigned long subsys_bits;

	/* The bitmask of subsystems currently attached to this hierarchy */
	unsigned long actual_subsys_bits;

	/* A list running through the attached subsystems */
	struct list_head subsys_list;

	/* The root cgroup for this hierarchy */
	struct cgroup top_cgroup;

	/* Tracks how many cgroups are currently defined in the hierarchy */
	int number_of_cgroups;

	/* A list running through the active hierarchies (see "roots" below) */
	struct list_head root_list;

	/* Hierarchy-specific flags (e.g. ROOT_NOPREFIX) */
	unsigned long flags;

	/* The path to use for release notifications */
	char release_agent_path[PATH_MAX];
};
96
97
98
99
100
101
/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup (its top_cgroup, aliased as "dummytop" below).
 */
static struct cgroupfs_root rootnode;
103
104
105
106
107
/*
 * CSS ID -- a small, unique id attached to a cgroup_subsys_state.
 * Ids fit in an unsigned short, hence the 65535 maximum.
 */
#define CSS_ID_MAX (65535)
struct css_id {
	/*
	 * The css to which this ID points. NOTE(review): lifetime rules
	 * (when this may be cleared/read under RCU) are established by
	 * code outside this chunk -- confirm before relying on them.
	 */
	struct cgroup_subsys_state *css;

	/* ID of this css */
	unsigned short id;

	/* Depth in the hierarchy (the root is at depth zero) */
	unsigned short depth;

	/* Deferred-free handle; freed via RCU */
	struct rcu_head rcu_head;

	/*
	 * Flexible array: ancestor IDs indexed by depth, presumably used
	 * for fast is-descendant tests -- users are outside this chunk.
	 */
	unsigned short stack[0];
};
135
136
137
138
/* The list of hierarchy roots */
static LIST_HEAD(roots);
static int root_count;

/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)

/*
 * This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. NOTE(review): it is only
 * declared here; the code that sets it is outside this chunk.
 */
static int need_forkexit_callback __read_mostly;
151
152
/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
	return test_bit(CGRP_REMOVED, &cgrp->flags);
}
157
158
/* bits in struct cgroupfs_root flags field */
enum {
	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};
162
163static int cgroup_is_releasable(const struct cgroup *cgrp)
164{
165 const int bits =
166 (1 << CGRP_RELEASABLE) |
167 (1 << CGRP_NOTIFY_ON_RELEASE);
168 return (cgrp->flags & bits) == bits;
169}
170
/* Has userspace asked for a release notification on this cgroup? */
static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
175
176
177
178
179
/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy.
 */
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)

/*
 * The deferred release machinery: cgroups queued on release_list are
 * handled by the release-agent workqueue callback.
 */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
194
195
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
	/*
	 * List running through cg_cgroup_links associated with a
	 * cgroup, anchored on cgroup->css_sets
	 */
	struct list_head cgrp_link_list;
	/*
	 * List running through cg_cgroup_links pointing at a
	 * single css_set object, anchored on css_set->cg_links
	 */
	struct list_head cg_link_list;
	struct css_set *cg;
};
209
210
211
212
213
214
215
216
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);

/*
 * css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set.
 */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set.
 */
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
233
234static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
235{
236 int i;
237 int index;
238 unsigned long tmp = 0UL;
239
240 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
241 tmp += (unsigned long)css[i];
242 tmp = (tmp >> 16) ^ tmp;
243
244 index = hash_long(tmp, CSS_SET_HASH_BITS);
245
246 return &css_set_table[index];
247}
248
249
250
251
252
/*
 * We don't maintain the lists running through each css_set to its task
 * until after the first mount, to reduce the fork()/exit() overhead for
 * systems that never use cgroups -- this flag records whether the lists
 * are in use.
 */
static int use_task_css_set_links __read_mostly;
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/*
 * unlink a css_set from the global list and from each cgroup it is
 * attached to, freeing the link objects. Caller must hold
 * css_set_lock for writing.
 */
static void unlink_css_set(struct css_set *cg)
{
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	hlist_del(&cg->hlist);
	css_set_count--;

	/* _safe variant: links are freed as we walk the list */
	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
				 cg_link_list) {
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
}
287
/*
 * Drop a reference on a css_set; when the last reference goes away the
 * css_set is unhashed and freed, and each cgroup it pointed at loses a
 * count (possibly triggering a release notification).
 * @taskexit: this drop comes from a task exiting (sets CGRP_RELEASABLE).
 */
static void __put_css_set(struct css_set *cg, int taskexit)
{
	int i;
	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for an
	 * rwlock: the common case avoids taking css_set_lock at all.
	 */
	if (atomic_add_unless(&cg->refcount, -1, 1))
		return;
	write_lock(&css_set_lock);
	/* Re-check under the lock -- someone may have re-got a reference. */
	if (!atomic_dec_and_test(&cg->refcount)) {
		write_unlock(&css_set_lock);
		return;
	}
	unlink_css_set(cg);
	write_unlock(&css_set_lock);

	rcu_read_lock();
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
		/* A now-empty cgroup may need a release notification. */
		if (atomic_dec_and_test(&cgrp->count) &&
		    notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
	}
	rcu_read_unlock();
	kfree(cg);
}
319
320
321
322
/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cg)
{
	atomic_inc(&cg->refcount);
}

static inline void put_css_set(struct css_set *cg)
{
	__put_css_set(cg, 0);
}

/* put variant used from the task-exit path (marks cgroups releasable) */
static inline void put_css_set_taskexit(struct css_set *cg)
{
	__put_css_set(cg, 1);
}
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/*
 * find_existing_css_set() is a helper for find_css_set(), and checks
 * whether a css_set matching the desired set of subsystem states
 * already exists.
 *
 * oldcg: the cgroup group that we're using before the cgroup transition
 * cgrp: the cgroup that we're moving into
 * template: location in which to build the desired set of subsystem
 * state objects (filled in as a side effect)
 *
 * Returns the matching css_set (without taking a reference) or NULL.
 * Caller must hold css_set_lock for reading.
 */
static struct css_set *find_existing_css_set(
	struct css_set *oldcg,
	struct cgroup *cgrp,
	struct cgroup_subsys_state *template[])
{
	int i;
	struct cgroupfs_root *root = cgrp->root;
	struct hlist_head *hhead;
	struct hlist_node *node;
	struct css_set *cg;

	/* Built the set of subsystem state objects that we want to
	 * see in the new css_set */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		if (root->subsys_bits & (1UL << i)) {
			/* Subsystem is in this hierarchy. So we want
			 * the subsystem state from the new
			 * cgroup */
			template[i] = cgrp->subsys[i];
		} else {
			/* Subsystem is not in this hierarchy, so we
			 * don't want to change the subsystem state */
			template[i] = oldcg->subsys[i];
		}
	}

	hhead = css_set_hash(template);
	hlist_for_each_entry(cg, node, hhead, hlist) {
		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
			/* All subsystems matched */
			return cg;
		}
	}

	/* No existing cgroup group matched */
	return NULL;
}
388
389static void free_cg_links(struct list_head *tmp)
390{
391 struct cg_cgroup_link *link;
392 struct cg_cgroup_link *saved_link;
393
394 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
395 list_del(&link->cgrp_link_list);
396 kfree(link);
397 }
398}
399
400
401
402
403
404
405static int allocate_cg_links(int count, struct list_head *tmp)
406{
407 struct cg_cgroup_link *link;
408 int i;
409 INIT_LIST_HEAD(tmp);
410 for (i = 0; i < count; i++) {
411 link = kmalloc(sizeof(*link), GFP_KERNEL);
412 if (!link) {
413 free_cg_links(tmp);
414 return -ENOMEM;
415 }
416 list_add(&link->cgrp_link_list, tmp);
417 }
418 return 0;
419}
420
421
422
423
424
425
426
/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_cg_links: cg_cgroup_link objects previously allocated by
 *		  allocate_cg_links(); one is consumed from the front
 * @cg: the css_set to be linked
 * @cgrp: the destination cgroup
 *
 * Caller is expected to hold css_set_lock for writing.
 */
static void link_css_set(struct list_head *tmp_cg_links,
			 struct css_set *cg, struct cgroup *cgrp)
{
	struct cg_cgroup_link *link;

	BUG_ON(list_empty(tmp_cg_links));
	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
				cgrp_link_list);
	link->cg = cg;
	list_move(&link->cgrp_link_list, &cgrp->css_sets);
	list_add(&link->cg_link_list, &cg->cg_links);
}
439
440
441
442
443
444
445
446
/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held.
 */
static struct css_set *find_css_set(
	struct css_set *oldcg, struct cgroup *cgrp)
{
	struct css_set *res;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
	int i;

	struct list_head tmp_cg_links;

	struct hlist_head *hhead;

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	read_lock(&css_set_lock);
	res = find_existing_css_set(oldcg, cgrp, template);
	if (res)
		get_css_set(res);
	read_unlock(&css_set_lock);

	if (res)
		return res;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return NULL;

	/* Allocate all the cg_cgroup_link objects that we'll need */
	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
		kfree(res);
		return NULL;
	}

	atomic_set(&res->refcount, 1);
	INIT_LIST_HEAD(&res->cg_links);
	INIT_LIST_HEAD(&res->tasks);
	INIT_HLIST_NODE(&res->hlist);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(res->subsys, template, sizeof(res->subsys));

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup *cgrp = res->subsys[i]->cgroup;
		struct cgroup_subsys *ss = subsys[i];
		atomic_inc(&cgrp->count);
		/*
		 * We want to add a link once per cgroup, so we
		 * only do it for the first subsystem in each
		 * hierarchy
		 */
		if (ss->root->subsys_list.next == &ss->sibling)
			link_css_set(&tmp_cg_links, res, cgrp);
	}
	if (list_empty(&rootnode.subsys_list))
		link_css_set(&tmp_cg_links, res, dummytop);

	BUG_ON(!list_empty(&tmp_cg_links));

	css_set_count++;

	/* Add this cgroup group to the hash table */
	hhead = css_set_hash(res->subsys);
	hlist_add_head(&res->hlist, hhead);

	write_unlock(&css_set_lock);

	return res;
}
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/**
 * cgroup_lock - lock out any changes to cgroup structures
 */
void cgroup_lock(void)
{
	mutex_lock(&cgroup_mutex);
}

/**
 * cgroup_unlock - release lock on cgroup changes
 *
 * Undo the lock taken in a previous cgroup_lock() call.
 */
void cgroup_unlock(void)
{
	mutex_unlock(&cgroup_mutex);
}
586
587
588
589
590
591
592
593
/* Forward declarations for the cgroupfs directory machinery below. */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static struct inode_operations cgroup_dir_inode_operations;
static struct file_operations proc_cgroupstats_operations;

/* cgroupfs does no writeback accounting */
static struct backing_dev_info cgroup_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static int alloc_css_id(struct cgroup_subsys *ss,
			struct cgroup *parent, struct cgroup *child);
606
607static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
608{
609 struct inode *inode = new_inode(sb);
610
611 if (inode) {
612 inode->i_mode = mode;
613 inode->i_uid = current_fsuid();
614 inode->i_gid = current_fsgid();
615 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
616 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
617 }
618 return inode;
619}
620
621
622
623
624
625static int cgroup_call_pre_destroy(struct cgroup *cgrp)
626{
627 struct cgroup_subsys *ss;
628 int ret = 0;
629
630 for_each_subsys(cgrp->root, ss)
631 if (ss->pre_destroy) {
632 ret = ss->pre_destroy(ss, cgrp);
633 if (ret)
634 break;
635 }
636 return ret;
637}
638
/* RCU callback: actually free a cgroup after the grace period. */
static void free_cgroup_rcu(struct rcu_head *obj)
{
	struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);

	kfree(cgrp);
}
645
/*
 * dentry iput callback: final teardown of a cgroup directory.
 * Runs when the last dentry reference to a removed cgroup goes away.
 */
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;
		struct cgroup_subsys *ss;
		BUG_ON(!(cgroup_is_removed(cgrp)));
		/*
		 * Release the subsystem state objects; wait for any
		 * in-flight RCU readers of the cgroup first.
		 */
		synchronize_rcu();

		mutex_lock(&cgroup_mutex);
		/*
		 * Release the subsystem state objects.
		 */
		for_each_subsys(cgrp->root, ss)
			ss->destroy(ss, cgrp);

		cgrp->root->number_of_cgroups--;
		mutex_unlock(&cgroup_mutex);

		/*
		 * Drop the active superblock reference that we took when
		 * we created the cgroup
		 */
		deactivate_super(cgrp->root->sb);

		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
	}
	iput(inode);
}
681
/* Unhash a cgroup directory dentry and remove it from its parent. */
static void remove_dir(struct dentry *d)
{
	/* pin the parent across the rmdir */
	struct dentry *parent = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(parent->d_inode, d);
	dput(parent);
}
690
/*
 * Unlink every (non-directory) control file in a cgroup directory.
 * dcache_lock is dropped and retaken around each unlink, so the child
 * list is re-read from the head on every iteration.
 */
static void cgroup_clear_directory(struct dentry *dentry)
{
	struct list_head *node;

	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
	spin_lock(&dcache_lock);
	node = dentry->d_subdirs.next;
	while (node != &dentry->d_subdirs) {
		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
		list_del_init(node);
		if (d->d_inode) {
			/* This should never be called on a cgroup
			 * directory with child cgroups */
			BUG_ON(d->d_inode->i_mode & S_IFDIR);
			d = dget_locked(d);
			spin_unlock(&dcache_lock);
			d_delete(d);
			simple_unlink(dentry->d_inode, d);
			dput(d);
			spin_lock(&dcache_lock);
		}
		node = dentry->d_subdirs.next;
	}
	spin_unlock(&dcache_lock);
}
716
717
718
719
/*
 * NOTE : the dentry must have been dget()'ed
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
	/* first remove all the control files inside the directory */
	cgroup_clear_directory(dentry);

	spin_lock(&dcache_lock);
	list_del_init(&dentry->d_u.d_child);
	spin_unlock(&dcache_lock);
	remove_dir(dentry);
}
729
730
731
732
733
734
735
736
737
/*
 * Waitqueue used by rmdir() to wait for in-flight operations on a
 * cgroup to drain; wakers only bother when a sleeper announced itself
 * via CGRP_WAIT_ON_RMDIR.
 */
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);

static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
{
	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
		wake_up_all(&cgroup_rmdir_waitq);
}
745
/*
 * Rebind the given hierarchy root so that exactly the subsystems in
 * final_bits are attached to it. Newly added subsystems must currently
 * live on the dummy hierarchy, and the root may not yet have child
 * cgroups. Called with cgroup_mutex held.
 */
static int rebind_subsystems(struct cgroupfs_root *root,
			      unsigned long final_bits)
{
	unsigned long added_bits, removed_bits;
	struct cgroup *cgrp = &root->top_cgroup;
	int i;

	removed_bits = root->actual_subsys_bits & ~final_bits;
	added_bits = final_bits & ~root->actual_subsys_bits;
	/* Check that any added subsystems are currently free */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		unsigned long bit = 1UL << i;
		struct cgroup_subsys *ss = subsys[i];
		if (!(bit & added_bits))
			continue;
		if (ss->root != &rootnode) {
			/* Subsystem isn't free */
			return -EBUSY;
		}
	}

	/* Currently we don't handle adding/removing subsystems when
	 * any child cgroups exist. This is theoretically supportable
	 * but involves complex error handling, so it's being left until
	 * later */
	if (root->number_of_cgroups > 1)
		return -EBUSY;

	/* Process each subsystem */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		unsigned long bit = 1UL << i;
		if (bit & added_bits) {
			/* We're binding this subsystem to this hierarchy */
			BUG_ON(cgrp->subsys[i]);
			BUG_ON(!dummytop->subsys[i]);
			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
			mutex_lock(&ss->hierarchy_mutex);
			cgrp->subsys[i] = dummytop->subsys[i];
			cgrp->subsys[i]->cgroup = cgrp;
			list_move(&ss->sibling, &root->subsys_list);
			ss->root = root;
			if (ss->bind)
				ss->bind(ss, cgrp);
			mutex_unlock(&ss->hierarchy_mutex);
		} else if (bit & removed_bits) {
			/* We're removing this subsystem */
			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
			mutex_lock(&ss->hierarchy_mutex);
			if (ss->bind)
				ss->bind(ss, dummytop);
			dummytop->subsys[i]->cgroup = dummytop;
			cgrp->subsys[i] = NULL;
			subsys[i]->root = &rootnode;
			list_move(&ss->sibling, &rootnode.subsys_list);
			mutex_unlock(&ss->hierarchy_mutex);
		} else if (bit & final_bits) {
			/* Subsystem state should already exist */
			BUG_ON(!cgrp->subsys[i]);
		} else {
			/* Subsystem state shouldn't exist */
			BUG_ON(cgrp->subsys[i]);
		}
	}
	root->subsys_bits = root->actual_subsys_bits = final_bits;
	synchronize_rcu();

	return 0;
}
816
/* ->show_options: emit mount options for /proc/mounts et al. */
static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
	struct cgroup_subsys *ss;

	/* cgroup_mutex stabilizes subsys_list and release_agent_path */
	mutex_lock(&cgroup_mutex);
	for_each_subsys(root, ss)
		seq_printf(seq, ",%s", ss->name);
	if (test_bit(ROOT_NOPREFIX, &root->flags))
		seq_puts(seq, ",noprefix");
	if (strlen(root->release_agent_path))
		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
	mutex_unlock(&cgroup_mutex);
	return 0;
}
832
/* Parsed mount options, filled in by parse_cgroupfs_options(). */
struct cgroup_sb_opts {
	unsigned long subsys_bits;	/* requested subsystems */
	unsigned long flags;		/* ROOT_* flags */
	char *release_agent;		/* kmalloc'd; owned by the caller */
};
838
839
840
/*
 * Convert a hierarchy specifier into a bitmask of subsystems and
 * flags. On success opts->release_agent (if any) is a kmalloc'd
 * buffer the caller must free; on every return path the opts fields
 * have been initialized, so callers may unconditionally kfree it.
 */
static int parse_cgroupfs_options(char *data,
				     struct cgroup_sb_opts *opts)
{
	/* NULL mount data means "all subsystems" */
	char *token, *o = data ?: "all";

	opts->subsys_bits = 0;
	opts->flags = 0;
	opts->release_agent = NULL;

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "all")) {
			/* Add all non-disabled subsystems */
			int i;
			opts->subsys_bits = 0;
			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				struct cgroup_subsys *ss = subsys[i];
				if (!ss->disabled)
					opts->subsys_bits |= 1ul << i;
			}
		} else if (!strcmp(token, "noprefix")) {
			set_bit(ROOT_NOPREFIX, &opts->flags);
		} else if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
			opts->release_agent[PATH_MAX - 1] = 0;
		} else {
			/* try to match a subsystem by name */
			struct cgroup_subsys *ss;
			int i;
			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				ss = subsys[i];
				if (!strcmp(token, ss->name)) {
					if (!ss->disabled)
						set_bit(i, &opts->subsys_bits);
					break;
				}
			}
			if (i == CGROUP_SUBSYS_COUNT)
				return -ENOENT;
		}
	}

	/* We can't have an empty hierarchy */
	if (!opts->subsys_bits)
		return -EINVAL;

	return 0;
}
895
/*
 * ->remount_fs: re-parse the mount options and rebind subsystems.
 * Changing flags at remount time is not supported.
 */
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
	int ret = 0;
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgroup_sb_opts opts;

	/* lock order: i_mutex of the root dir, then cgroup_mutex */
	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
	mutex_lock(&cgroup_mutex);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/* Don't allow flags to change at remount */
	if (opts.flags != root->flags) {
		ret = -EINVAL;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, opts.subsys_bits);
	if (ret)
		goto out_unlock;

	/* (re)populate subsystem files */
	cgroup_populate_dir(cgrp);

	if (opts.release_agent)
		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
	/* release_agent may be NULL; kfree(NULL) is a no-op */
	kfree(opts.release_agent);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
	return ret;
}
932
/* Superblock operations for cgroupfs. */
static struct super_operations cgroup_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
	.show_options = cgroup_show_options,
	.remount_fs = cgroup_remount,
};
939
940static void init_cgroup_housekeeping(struct cgroup *cgrp)
941{
942 INIT_LIST_HEAD(&cgrp->sibling);
943 INIT_LIST_HEAD(&cgrp->children);
944 INIT_LIST_HEAD(&cgrp->css_sets);
945 INIT_LIST_HEAD(&cgrp->release_list);
946 init_rwsem(&cgrp->pids_mutex);
947}
948static void init_cgroup_root(struct cgroupfs_root *root)
949{
950 struct cgroup *cgrp = &root->top_cgroup;
951 INIT_LIST_HEAD(&root->subsys_list);
952 INIT_LIST_HEAD(&root->root_list);
953 root->number_of_cgroups = 1;
954 cgrp->root = root;
955 cgrp->top_cgroup = cgrp;
956 init_cgroup_housekeeping(cgrp);
957}
958
959static int cgroup_test_super(struct super_block *sb, void *data)
960{
961 struct cgroupfs_root *new = data;
962 struct cgroupfs_root *root = sb->s_fs_info;
963
964
965 if (new->subsys_bits != root->subsys_bits)
966 return 0;
967
968
969 if (new->flags != root->flags)
970 return 0;
971
972 return 1;
973}
974
975static int cgroup_set_super(struct super_block *sb, void *data)
976{
977 int ret;
978 struct cgroupfs_root *root = data;
979
980 ret = set_anon_super(sb, NULL);
981 if (ret)
982 return ret;
983
984 sb->s_fs_info = root;
985 root->sb = sb;
986
987 sb->s_blocksize = PAGE_CACHE_SIZE;
988 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
989 sb->s_magic = CGROUP_SUPER_MAGIC;
990 sb->s_op = &cgroup_ops;
991
992 return 0;
993}
994
/* Allocate the root directory inode/dentry for a new cgroupfs mount. */
static int cgroup_get_rootdir(struct super_block *sb)
{
	struct inode *inode =
		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
	struct dentry *dentry;

	if (!inode)
		return -ENOMEM;

	inode->i_fop = &simple_dir_operations;
	inode->i_op = &cgroup_dir_inode_operations;
	/* directories start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	dentry = d_alloc_root(inode);
	if (!dentry) {
		iput(inode);
		return -ENOMEM;
	}
	sb->s_root = dentry;
	return 0;
}
1016
/*
 * Mount entry point: parse the options, find or create a matching
 * superblock, and (for a new superblock) activate the hierarchy -
 * binding subsystems and linking every existing css_set to the new
 * root cgroup.
 */
static int cgroup_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct cgroup_sb_opts opts;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *root;
	struct list_head tmp_cg_links;

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret) {
		kfree(opts.release_agent);
		return ret;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		kfree(opts.release_agent);
		return -ENOMEM;
	}

	init_cgroup_root(root);
	root->subsys_bits = opts.subsys_bits;
	root->flags = opts.flags;
	if (opts.release_agent) {
		strcpy(root->release_agent_path, opts.release_agent);
		kfree(opts.release_agent);
	}

	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);

	if (IS_ERR(sb)) {
		kfree(root);
		return PTR_ERR(sb);
	}

	if (sb->s_fs_info != root) {
		/* Reusing an existing superblock; our candidate root
		 * wasn't adopted by cgroup_set_super(), so free it. */
		BUG_ON(sb->s_root == NULL);
		kfree(root);
		root = NULL;
	} else {
		/* New superblock */
		struct cgroup *root_cgrp = &root->top_cgroup;
		struct inode *inode;
		int i;

		BUG_ON(sb->s_root != NULL);

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		/* lock order: root-dir i_mutex, then cgroup_mutex */
		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
		if (ret) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto drop_new_super;
		}

		ret = rebind_subsystems(root, root->subsys_bits);
		if (ret == -EBUSY) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto free_cg_links;
		}

		/* EBUSY should be the only error here */
		BUG_ON(ret);

		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = root_cgrp;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
			struct hlist_head *hhead = &css_set_table[i];
			struct hlist_node *node;
			struct css_set *cg;

			hlist_for_each_entry(cg, node, hhead, hlist)
				link_css_set(&tmp_cg_links, cg, root_cgrp);
		}
		write_unlock(&css_set_lock);

		free_cg_links(&tmp_cg_links);

		BUG_ON(!list_empty(&root_cgrp->sibling));
		BUG_ON(!list_empty(&root_cgrp->children));
		BUG_ON(root->number_of_cgroups != 1);

		cgroup_populate_dir(root_cgrp);
		mutex_unlock(&inode->i_mutex);
		mutex_unlock(&cgroup_mutex);
	}

	simple_set_mnt(mnt, sb);
	return 0;

 free_cg_links:
	free_cg_links(&tmp_cg_links);
 drop_new_super:
	deactivate_locked_super(sb);
	return ret;
}
1139
/*
 * Unmount entry point: detach all subsystems from the hierarchy,
 * drop every css_set link to the (only remaining) top cgroup, and
 * free the root. The hierarchy must already be empty of child
 * cgroups when we get here.
 */
static void cgroup_kill_sb(struct super_block *sb) {
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	int ret;
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	BUG_ON(!root);

	BUG_ON(root->number_of_cgroups != 1);
	BUG_ON(!list_empty(&cgrp->children));
	BUG_ON(!list_empty(&cgrp->sibling));

	mutex_lock(&cgroup_mutex);

	/* Rebind all subsystems back to the default hierarchy */
	ret = rebind_subsystems(root, 0);
	/* Shouldn't be able to fail ... */
	BUG_ON(ret);

	/*
	 * Release all the links from css_sets to this hierarchy's
	 * root cgroup
	 */
	write_lock(&css_set_lock);

	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
				 cgrp_link_list) {
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
	write_unlock(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		root_count--;
	}

	mutex_unlock(&cgroup_mutex);

	kill_litter_super(sb);
	kfree(root);
}
1184
/* The cgroup filesystem type, registered elsewhere in this file. */
static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.get_sb = cgroup_get_sb,
	.kill_sb = cgroup_kill_sb,
};
1190
/* d_fsdata of a cgroup *directory* dentry holds the struct cgroup */
static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

/* d_fsdata of a cgroup *control file* dentry holds its struct cftype */
static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return dentry->d_fsdata;
}
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1212{
1213 char *start;
1214 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1215
1216 if (!dentry || cgrp == dummytop) {
1217
1218
1219
1220
1221 strcpy(buf, "/");
1222 return 0;
1223 }
1224
1225 start = buf + buflen;
1226
1227 *--start = '\0';
1228 for (;;) {
1229 int len = dentry->d_name.len;
1230 if ((start -= len) < buf)
1231 return -ENAMETOOLONG;
1232 memcpy(start, cgrp->dentry->d_name.name, len);
1233 cgrp = cgrp->parent;
1234 if (!cgrp)
1235 break;
1236 dentry = rcu_dereference(cgrp->dentry);
1237 if (!cgrp->parent)
1238 continue;
1239 if (--start < buf)
1240 return -ENAMETOOLONG;
1241 *start = '/';
1242 }
1243 memmove(buf, start, buf + buflen - start);
1244 return 0;
1245}
1246
1247
1248
1249
1250
1251
/*
 * Return the first subsystem attached to a cgroup's hierarchy, and
 * its subsystem id. Either output pointer may be NULL if unwanted.
 */
static void get_first_subsys(const struct cgroup *cgrp,
			struct cgroup_subsys_state **css, int *subsys_id)
{
	const struct cgroupfs_root *root = cgrp->root;
	const struct cgroup_subsys *test_ss;
	BUG_ON(list_empty(&root->subsys_list));
	test_ss = list_entry(root->subsys_list.next,
			     struct cgroup_subsys, sibling);
	if (css) {
		*css = cgrp->subsys[test_ss->subsys_id];
		BUG_ON(!*css);
	}
	if (subsys_id)
		*subsys_id = test_ss->subsys_id;
}
1267
1268
1269
1270
1271
1272
1273
1274
1275
/**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
 * @cgrp: the cgroup the task is attaching to
 * @tsk: the task to be attached
 *
 * Call holding cgroup_mutex. May take task_lock of
 * the task 'tsk' during call.
 */
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	int retval = 0;
	struct cgroup_subsys *ss;
	struct cgroup *oldcgrp;
	struct css_set *cg;
	struct css_set *newcg;
	struct cgroupfs_root *root = cgrp->root;
	int subsys_id;

	get_first_subsys(cgrp, NULL, &subsys_id);

	/* Nothing to do if the task is already in that cgroup */
	oldcgrp = task_cgroup(tsk, subsys_id);
	if (cgrp == oldcgrp)
		return 0;

	/* give every subsystem a chance to veto the move */
	for_each_subsys(root, ss) {
		if (ss->can_attach) {
			retval = ss->can_attach(ss, cgrp, tsk);
			if (retval)
				return retval;
		}
	}

	task_lock(tsk);
	cg = tsk->cgroups;
	get_css_set(cg);
	task_unlock(tsk);
	/*
	 * Locate or allocate a new css_set for this task,
	 * based on its final set of cgroups
	 */
	newcg = find_css_set(cg, cgrp);
	put_css_set(cg);
	if (!newcg)
		return -ENOMEM;

	task_lock(tsk);
	if (tsk->flags & PF_EXITING) {
		task_unlock(tsk);
		put_css_set(newcg);
		return -ESRCH;
	}
	rcu_assign_pointer(tsk->cgroups, newcg);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list)) {
		list_del(&tsk->cg_list);
		list_add(&tsk->cg_list, &newcg->tasks);
	}
	write_unlock(&css_set_lock);

	for_each_subsys(root, ss) {
		if (ss->attach)
			ss->attach(ss, cgrp, oldcgrp, tsk);
	}
	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
	synchronize_rcu();
	put_css_set(cg);

	/*
	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
	 * is no longer empty.
	 */
	cgroup_wakeup_rmdir_waiters(cgrp);
	return 0;
}
1346
1347
1348
1349
1350
/*
 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
 * held. May take task_lock of task. A pid of zero means "current".
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	int ret;

	if (pid) {
		rcu_read_lock();
		tsk = find_task_by_vpid(pid);
		if (!tsk || tsk->flags & PF_EXITING) {
			rcu_read_unlock();
			return -ESRCH;
		}
		/*
		 * Only root (euid 0) may move a task it doesn't own:
		 * otherwise the caller's euid must match the target's
		 * real or saved uid.
		 */
		tcred = __task_cred(tsk);
		if (cred->euid &&
		    cred->euid != tcred->uid &&
		    cred->euid != tcred->suid) {
			rcu_read_unlock();
			return -EACCES;
		}
		get_task_struct(tsk);
		rcu_read_unlock();
	} else {
		tsk = current;
		get_task_struct(tsk);
	}

	ret = cgroup_attach_task(cgrp, tsk);
	put_task_struct(tsk);
	return ret;
}
1383
1384static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1385{
1386 int ret;
1387 if (!cgroup_lock_live_group(cgrp))
1388 return -ENODEV;
1389 ret = attach_task_by_pid(cgrp, pid);
1390 cgroup_unlock();
1391 return ret;
1392}
1393
1394
/* The various types of files and directories in a cgroup file system */
enum cgroup_filetype {
	FILE_ROOT,
	FILE_DIR,
	FILE_TASKLIST,
	FILE_NOTIFY_ON_RELEASE,
	FILE_RELEASE_AGENT,
};
1402
1403
1404
1405
1406
1407
1408
1409
/**
 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
 * @cgrp: the cgroup to be checked for liveness
 *
 * On success, returns true; the lock should be later released with
 * cgroup_unlock(). On failure returns false with no lock held.
 */
bool cgroup_lock_live_group(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_mutex);
	if (cgroup_is_removed(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		return false;
	}
	return true;
}
1419
1420static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1421 const char *buffer)
1422{
1423 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1424 if (!cgroup_lock_live_group(cgrp))
1425 return -ENODEV;
1426 strcpy(cgrp->root->release_agent_path, buffer);
1427 cgroup_unlock();
1428 return 0;
1429}
1430
1431static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
1432 struct seq_file *seq)
1433{
1434 if (!cgroup_lock_live_group(cgrp))
1435 return -ENODEV;
1436 seq_puts(seq, cgrp->root->release_agent_path);
1437 seq_putc(seq, '\n');
1438 cgroup_unlock();
1439 return 0;
1440}
1441
1442
/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64
1444
1445static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1446 struct file *file,
1447 const char __user *userbuf,
1448 size_t nbytes, loff_t *unused_ppos)
1449{
1450 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
1451 int retval = 0;
1452 char *end;
1453
1454 if (!nbytes)
1455 return -EINVAL;
1456 if (nbytes >= sizeof(buffer))
1457 return -E2BIG;
1458 if (copy_from_user(buffer, userbuf, nbytes))
1459 return -EFAULT;
1460
1461 buffer[nbytes] = 0;
1462 strstrip(buffer);
1463 if (cft->write_u64) {
1464 u64 val = simple_strtoull(buffer, &end, 0);
1465 if (*end)
1466 return -EINVAL;
1467 retval = cft->write_u64(cgrp, cft, val);
1468 } else {
1469 s64 val = simple_strtoll(buffer, &end, 0);
1470 if (*end)
1471 return -EINVAL;
1472 retval = cft->write_s64(cgrp, cft, val);
1473 }
1474 if (!retval)
1475 retval = nbytes;
1476 return retval;
1477}
1478
1479static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1480 struct file *file,
1481 const char __user *userbuf,
1482 size_t nbytes, loff_t *unused_ppos)
1483{
1484 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
1485 int retval = 0;
1486 size_t max_bytes = cft->max_write_len;
1487 char *buffer = local_buffer;
1488
1489 if (!max_bytes)
1490 max_bytes = sizeof(local_buffer) - 1;
1491 if (nbytes >= max_bytes)
1492 return -E2BIG;
1493
1494 if (nbytes >= sizeof(local_buffer)) {
1495 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1496 if (buffer == NULL)
1497 return -ENOMEM;
1498 }
1499 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
1500 retval = -EFAULT;
1501 goto out;
1502 }
1503
1504 buffer[nbytes] = 0;
1505 strstrip(buffer);
1506 retval = cft->write_string(cgrp, cft, buffer);
1507 if (!retval)
1508 retval = nbytes;
1509out:
1510 if (buffer != local_buffer)
1511 kfree(buffer);
1512 return retval;
1513}
1514
/*
 * Top-level write dispatch for cgroup control files: route to the
 * appropriate cftype handler (raw write, integer, string or trigger).
 */
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
						size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cgroup_is_removed(cgrp))
		return -ENODEV;
	if (cft->write)
		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_u64 || cft->write_s64)
		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_string)
		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->trigger) {
		/* a trigger file ignores the written data entirely */
		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
		return ret ? ret : nbytes;
	}
	return -EINVAL;
}
1535
1536static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1537 struct file *file,
1538 char __user *buf, size_t nbytes,
1539 loff_t *ppos)
1540{
1541 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1542 u64 val = cft->read_u64(cgrp, cft);
1543 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1544
1545 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1546}
1547
1548static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1549 struct file *file,
1550 char __user *buf, size_t nbytes,
1551 loff_t *ppos)
1552{
1553 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1554 s64 val = cft->read_s64(cgrp, cft);
1555 int len = sprintf(tmp, "%lld\n", (long long) val);
1556
1557 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1558}
1559
/*
 * Top-level read dispatch for cgroup control files: route to the
 * cftype's raw read, u64 or s64 handler.
 */
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
				   size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cgroup_is_removed(cgrp))
		return -ENODEV;

	if (cft->read)
		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_u64)
		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_s64)
		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
	return -EINVAL;
}
1577
1578
1579
1580
1581
1582
/*
 * Per-open state for seq_file-backed control files: records which
 * cftype and cgroup the open file refers to.  Allocated in
 * cgroup_file_open(), freed in cgroup_seqfile_release().
 */
struct cgroup_seqfile_state {
	struct cftype *cft;
	struct cgroup *cgroup;
};
1587
/* read_map() fill callback: emit one "key value" line into the seq_file. */
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
{
	struct seq_file *sf = cb->state;
	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
}
1593
1594static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1595{
1596 struct cgroup_seqfile_state *state = m->private;
1597 struct cftype *cft = state->cft;
1598 if (cft->read_map) {
1599 struct cgroup_map_cb cb = {
1600 .fill = cgroup_map_add,
1601 .state = m,
1602 };
1603 return cft->read_map(state->cgroup, cft, &cb);
1604 }
1605 return cft->read_seq_string(state->cgroup, cft, m);
1606}
1607
/*
 * Release for seq_file-backed control files: free the
 * cgroup_seqfile_state allocated at open time, then let single_release
 * tear down the seq_file itself.
 */
static int cgroup_seqfile_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	kfree(seq->private);
	return single_release(inode, file);
}
1614
/*
 * File ops installed by cgroup_file_open() for files backed by
 * read_map/read_seq_string; reads flow through seq_file.
 */
static struct file_operations cgroup_seqfile_operations = {
	.read = seq_read,
	.write = cgroup_file_write,
	.llseek = seq_lseek,
	.release = cgroup_seqfile_release,
};
1621
/*
 * Open handler for cgroup control files.  Files backed by read_map or
 * read_seq_string are rerouted to the seq_file machinery with a
 * cgroup_seqfile_state as the seq_file private data; everything else
 * defers to the cftype's own open method, if any.
 */
static int cgroup_file_open(struct inode *inode, struct file *file)
{
	int err;
	struct cftype *cft;

	err = generic_file_open(inode, file);
	if (err)
		return err;
	cft = __d_cft(file->f_dentry);

	if (cft->read_map || cft->read_seq_string) {
		struct cgroup_seqfile_state *state =
			kzalloc(sizeof(*state), GFP_USER);
		if (!state)
			return -ENOMEM;
		state->cft = cft;
		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
		/* Swap f_op before single_open() so all subsequent
		 * read/release calls use the seq_file path. */
		file->f_op = &cgroup_seqfile_operations;
		err = single_open(file, cgroup_seqfile_show, state);
		if (err < 0)
			kfree(state);
	} else if (cft->open)
		err = cft->open(inode, file);
	else
		err = 0;

	return err;
}
1650
1651static int cgroup_file_release(struct inode *inode, struct file *file)
1652{
1653 struct cftype *cft = __d_cft(file->f_dentry);
1654 if (cft->release)
1655 return cft->release(inode, file);
1656 return 0;
1657}
1658
1659
1660
1661
/*
 * cgroup_rename - only allow simple renames: cgroup directories only,
 * no existing target, and no moves between parent directories.
 */
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
			    struct inode *new_dir, struct dentry *new_dentry)
{
	if (!S_ISDIR(old_dentry->d_inode->i_mode))
		return -ENOTDIR;
	if (new_dentry->d_inode)
		return -EEXIST;
	if (old_dir != new_dir)
		return -EIO;
	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
}
1673
/* Default file ops for regular (non-seq_file) cgroup control files. */
static struct file_operations cgroup_file_operations = {
	.read = cgroup_file_read,
	.write = cgroup_file_write,
	.llseek = generic_file_llseek,
	.open = cgroup_file_open,
	.release = cgroup_file_release,
};
1681
/* Inode ops for cgroup directories; mkdir/rmdir create/remove cgroups. */
static struct inode_operations cgroup_dir_inode_operations = {
	.lookup = simple_lookup,
	.mkdir = cgroup_mkdir,
	.rmdir = cgroup_rmdir,
	.rename = cgroup_rename,
};
1688
/*
 * Allocate an inode for @dentry and wire it up as either a cgroup
 * directory or a control file.  On success a new directory's i_mutex
 * is left held (I_MUTEX_CHILD nesting class) and an extra dentry
 * reference is taken.
 */
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
			       struct super_block *sb)
{
	static const struct dentry_operations cgroup_dops = {
		.d_iput = cgroup_diput,
	};

	struct inode *inode;

	if (!dentry)
		return -ENOENT;
	if (dentry->d_inode)
		return -EEXIST;

	inode = cgroup_new_inode(mode, sb);
	if (!inode)
		return -ENOMEM;

	if (S_ISDIR(mode)) {
		inode->i_op = &cgroup_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;

		/* Directories start off with an extra link for "." */
		inc_nlink(inode);

		/* Hold the new directory's i_mutex so it can be
		 * populated before anyone can look inside it. */
		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
	} else if (S_ISREG(mode)) {
		inode->i_size = 0;
		inode->i_fop = &cgroup_file_operations;
	}
	dentry->d_op = &cgroup_dops;
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}
1726
1727
1728
1729
1730
1731
1732
1733
/*
 * cgroup_create_dir - create a directory for a cgroup.
 * @cgrp: the cgroup to create the directory for; must have a valid
 *        ->parent, and its ->dentry is published here.
 * @dentry: dentry of the new directory
 * @mode: mode to set on the new directory
 *
 * Drops the caller's reference on @dentry in all cases.
 */
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
				mode_t mode)
{
	struct dentry *parent;
	int error = 0;

	parent = cgrp->parent->dentry;
	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
	if (!error) {
		dentry->d_fsdata = cgrp;
		inc_nlink(parent->d_inode);	/* new subdir adds a ".." link */
		rcu_assign_pointer(cgrp->dentry, dentry);
		dget(dentry);
	}
	dput(dentry);

	return error;
}
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762static mode_t cgroup_file_mode(const struct cftype *cft)
1763{
1764 mode_t mode = 0;
1765
1766 if (cft->mode)
1767 return cft->mode;
1768
1769 if (cft->read || cft->read_u64 || cft->read_s64 ||
1770 cft->read_map || cft->read_seq_string)
1771 mode |= S_IRUGO;
1772
1773 if (cft->write || cft->write_u64 || cft->write_s64 ||
1774 cft->write_string || cft->trigger)
1775 mode |= S_IWUSR;
1776
1777 return mode;
1778}
1779
1780int cgroup_add_file(struct cgroup *cgrp,
1781 struct cgroup_subsys *subsys,
1782 const struct cftype *cft)
1783{
1784 struct dentry *dir = cgrp->dentry;
1785 struct dentry *dentry;
1786 int error;
1787 mode_t mode;
1788
1789 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1790 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
1791 strcpy(name, subsys->name);
1792 strcat(name, ".");
1793 }
1794 strcat(name, cft->name);
1795 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1796 dentry = lookup_one_len(name, dir, strlen(name));
1797 if (!IS_ERR(dentry)) {
1798 mode = cgroup_file_mode(cft);
1799 error = cgroup_create_file(dentry, mode | S_IFREG,
1800 cgrp->root->sb);
1801 if (!error)
1802 dentry->d_fsdata = (void *)cft;
1803 dput(dentry);
1804 } else
1805 error = PTR_ERR(dentry);
1806 return error;
1807}
1808
1809int cgroup_add_files(struct cgroup *cgrp,
1810 struct cgroup_subsys *subsys,
1811 const struct cftype cft[],
1812 int count)
1813{
1814 int i, err;
1815 for (i = 0; i < count; i++) {
1816 err = cgroup_add_file(cgrp, subsys, &cft[i]);
1817 if (err)
1818 return err;
1819 }
1820 return 0;
1821}
1822
1823
1824
1825
1826
1827
1828
/*
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Sums the refcounts of every css_set linked to this cgroup under
 * css_set_lock.  The result is only a snapshot - tasks may be moving
 * concurrently - so callers treat it as approximate.
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cg_cgroup_link *link;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
		count += atomic_read(&link->cg->refcount);
	}
	read_unlock(&css_set_lock);
	return count;
}
1841
1842
1843
1844
1845
/*
 * Advance a cgroup task iterator past it->cg_link to the next css_set
 * with a non-empty task list; sets it->cg_link to NULL when the
 * cgroup's css_set list is exhausted.  Called with css_set_lock held.
 */
static void cgroup_advance_iter(struct cgroup *cgrp,
					  struct cgroup_iter *it)
{
	struct list_head *l = it->cg_link;
	struct cg_cgroup_link *link;
	struct css_set *cg;

	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
		if (l == &cgrp->css_sets) {
			it->cg_link = NULL;
			return;
		}
		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
		cg = link->cg;
	} while (list_empty(&cg->tasks));
	it->cg_link = l;
	it->task = cg->tasks.next;
}
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
/*
 * Link every thread in the system into its css_set's task list.  Done
 * lazily on first use of a cgroup iterator so that fork/exit don't pay
 * the list-maintenance cost until somebody actually needs per-cgroup
 * task lists.
 */
static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;
	write_lock(&css_set_lock);
	use_task_css_set_links = 1;
	do_each_thread(g, p) {
		task_lock(p);
		/*
		 * Skip tasks already on a list, and tasks past
		 * cgroup_exit() (PF_EXITING) whose ->cgroups pointer
		 * is no longer a cgroup-attached css_set.
		 */
		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
			list_add(&p->cg_list, &p->cgroups->tasks);
		task_unlock(p);
	} while_each_thread(g, p);
	write_unlock(&css_set_lock);
}
1894
/*
 * Begin iterating over the tasks in @cgrp.  Takes css_set_lock for
 * reading; it stays held until cgroup_iter_end().
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
{
	/*
	 * The first time anyone tries to iterate across a cgroup,
	 * we need to enable the list linking each css_set to its
	 * tasks, and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	read_lock(&css_set_lock);
	it->cg_link = &cgrp->css_sets;
	cgroup_advance_iter(cgrp, it);
}
1909
/*
 * Return the current task of the iteration and advance the iterator,
 * or NULL when all tasks have been visited.  css_set_lock is held by
 * the caller (taken in cgroup_iter_start()).
 */
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
					struct cgroup_iter *it)
{
	struct task_struct *res;
	struct list_head *l = it->task;
	struct cg_cgroup_link *link;

	/* If the iterator cg_link is NULL, we have no tasks left */
	if (!it->cg_link)
		return NULL;
	res = list_entry(l, struct task_struct, cg_list);
	/* Advance iterator to find next entry */
	l = l->next;
	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
	if (l == &link->cg->tasks) {
		/* We reached the end of this task list - move on to
		 * the next cg_cgroup_link */
		cgroup_advance_iter(cgrp, it);
	} else {
		it->task = l;
	}
	return res;
}
1933
/* Finish an iteration started by cgroup_iter_start(): drop css_set_lock. */
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
{
	read_unlock(&css_set_lock);
}
1938
1939static inline int started_after_time(struct task_struct *t1,
1940 struct timespec *time,
1941 struct task_struct *t2)
1942{
1943 int start_diff = timespec_compare(&t1->start_time, time);
1944 if (start_diff > 0) {
1945 return 1;
1946 } else if (start_diff < 0) {
1947 return 0;
1948 } else {
1949
1950
1951
1952
1953
1954
1955
1956
1957 return t1 > t2;
1958 }
1959}
1960
1961
1962
1963
1964
1965
1966static inline int started_after(void *p1, void *p2)
1967{
1968 struct task_struct *t1 = p1;
1969 struct task_struct *t2 = p2;
1970 return started_after_time(t1, &t2->start_time, t2);
1971}
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
/**
 * cgroup_scan_tasks - iterate over the tasks of a cgroup scanner
 * @scan: the scanner, carrying the cgroup plus test/process callbacks
 *
 * Runs scan->process_task() on every task in scan->cg that passes
 * scan->test_task().  Tasks are gathered into a priority heap keyed
 * on start time so that no locks need to be held while processing;
 * repeated passes pick up tasks that forked during earlier passes.
 */
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
	int retval, i;
	struct cgroup_iter it;
	struct task_struct *p, *dropped;
	/* Never dereference latest_task, since it's not refcounted */
	struct task_struct *latest_task = NULL;
	struct ptr_heap tmp_heap;
	struct ptr_heap *heap;
	struct timespec latest_time = { 0, 0 };

	if (scan->heap) {
		/* The caller supplied our heap and pre-allocated its memory */
		heap = scan->heap;
		heap->gt = &started_after;
	} else {
		/* We need to allocate our own heap memory */
		heap = &tmp_heap;
		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		if (retval)
			/* cannot allocate the heap */
			return retval;
	}

 again:
	/*
	 * Scan tasks in the cgroup, using the scanner's "test_task"
	 * callback to decide which are of interest.  Since we don't want
	 * to hold locks during the task updates, gather tasks in the heap
	 * (sorted by descending start time).  If the heap overflows,
	 * later-started tasks are dropped and picked up by a subsequent
	 * pass, which only considers tasks started after latest_time -
	 * this guarantees forward progress without missing any task.
	 */
	heap->size = 0;
	cgroup_iter_start(scan->cg, &it);
	while ((p = cgroup_iter_next(scan->cg, &it))) {
		/*
		 * Only affect tasks that qualify per the caller's
		 * callback, if one was provided.
		 */
		if (scan->test_task && !scan->test_task(p, scan))
			continue;
		/*
		 * Only process tasks that started after the last task
		 * we processed.
		 */
		if (!started_after_time(p, &latest_time, latest_task))
			continue;
		dropped = heap_insert(heap, p);
		if (dropped == NULL) {
			/* Inserted; the heap wasn't previously full. */
			get_task_struct(p);
		} else if (dropped != p) {
			/* Inserted, pushing out a different task. */
			get_task_struct(p);
			put_task_struct(dropped);
		}
		/*
		 * Else p was newer than everything already in the heap
		 * and wasn't inserted - no refcount to manage.
		 */
	}
	cgroup_iter_end(scan->cg, &it);

	if (heap->size) {
		for (i = 0; i < heap->size; i++) {
			struct task_struct *q = heap->ptrs[i];
			if (i == 0) {
				/* ptrs[0] is the oldest entry - remember
				 * it as the cutoff for the next pass. */
				latest_time = q->start_time;
				latest_task = q;
			}
			/* Process the task per the caller's callback */
			scan->process_task(q, scan);
			put_task_struct(q);
		}
		/*
		 * If we processed anything, scan again in case some of
		 * those tasks were mid-fork with children that didn't
		 * get processed.
		 */
		goto again;
	}
	if (heap == &tmp_heap)
		heap_free(&tmp_heap);
	return 0;
}
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2117{
2118 int n = 0, pid;
2119 struct cgroup_iter it;
2120 struct task_struct *tsk;
2121 cgroup_iter_start(cgrp, &it);
2122 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2123 if (unlikely(n == npids))
2124 break;
2125 pid = task_pid_vnr(tsk);
2126 if (pid > 0)
2127 pidarray[n++] = pid;
2128 }
2129 cgroup_iter_end(cgrp, &it);
2130 return n;
2131}
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: a dentry belonging to the cgroup for which stats were
 *          requested
 *
 * Counts the cgroup's tasks by scheduler state so taskstats can export
 * the numbers to user space.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	int ret = -EINVAL;
	struct cgroup *cgrp;
	struct cgroup_iter it;
	struct task_struct *tsk;

	/*
	 * Validate the dentry: it must come from a cgroup superblock
	 * and be a directory (i.e. an actual cgroup).
	 */
	if (dentry->d_sb->s_op != &cgroup_ops ||
	    !S_ISDIR(dentry->d_inode->i_mode))
		 goto err;

	ret = 0;
	cgrp = dentry->d_fsdata;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	cgroup_iter_end(cgrp, &it);

err:
	return ret;
}
2186
/* sort() comparator for pid_t values, ascending. */
static int cmppid(const void *a, const void *b)
{
	pid_t x = *(const pid_t *)a;
	pid_t y = *(const pid_t *)b;

	return x - y;
}
2191
2192
2193
2194
2195
2196
2197
2198
/*
 * seq_file ->start for the "tasks" file.
 *
 * The seq_file position is interpreted as the last pid shown (0 on the
 * first call), not as an array index, so reads stay consistent even if
 * the cached pid array is replaced between calls.  A binary search
 * locates the first entry at or after *pos in the sorted array.  The
 * array is pinned via pids_mutex until cgroup_tasks_stop().
 */
static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
{
	struct cgroup *cgrp = s->private;
	int index = 0, pid = *pos;
	int *iter;

	down_read(&cgrp->pids_mutex);
	if (pid) {
		int end = cgrp->pids_length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (cgrp->tasks_pids[mid] == pid) {
				index = mid;
				break;
			} else if (cgrp->tasks_pids[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= cgrp->pids_length)
		return NULL;
	/* Update the abstract position to the pid we actually found */
	iter = cgrp->tasks_pids + index;
	*pos = *iter;
	return iter;
}
2234
/* seq_file ->stop: drop the read lock taken in cgroup_tasks_start(). */
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
	struct cgroup *cgrp = s->private;
	up_read(&cgrp->pids_mutex);
}
2240
/*
 * seq_file ->next: step to the following pid in the cached array, or
 * NULL at the end.  The position is advanced to the pid value itself
 * (not an index) - see cgroup_tasks_start().
 */
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct cgroup *cgrp = s->private;
	int *p = v;
	int *end = cgrp->tasks_pids + cgrp->pids_length;

	/*
	 * Advance to the next pid in the array.  If this goes off the
	 * end, we're done.
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
		*pos = *p;
		return p;
	}
}
2259
/* seq_file ->show for the "tasks" file: print a single pid per line. */
static int cgroup_tasks_show(struct seq_file *s, void *v)
{
	int *pid = v;

	return seq_printf(s, "%d\n", *pid);
}
2264
/* seq_file iterator for the per-cgroup "tasks" file. */
static struct seq_operations cgroup_tasks_seq_operations = {
	.start = cgroup_tasks_start,
	.stop = cgroup_tasks_stop,
	.next = cgroup_tasks_next,
	.show = cgroup_tasks_show,
};
2271
/*
 * Drop one reference on the cgroup's cached pid array; the last
 * dropper frees it.  pids_mutex serializes against concurrent opens
 * refreshing the array.
 */
static void release_cgroup_pid_array(struct cgroup *cgrp)
{
	down_write(&cgrp->pids_mutex);
	BUG_ON(!cgrp->pids_use_count);
	if (!--cgrp->pids_use_count) {
		kfree(cgrp->tasks_pids);
		cgrp->tasks_pids = NULL;
		cgrp->pids_length = 0;
	}
	up_write(&cgrp->pids_mutex);
}
2283
/*
 * Release for the "tasks" file: write-only opens never pinned the pid
 * array or a seq_file, so only readers have anything to undo.
 */
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (!(file->f_mode & FMODE_READ))
		return 0;

	release_cgroup_pid_array(cgrp);
	return seq_release(inode, file);
}
2294
/*
 * File ops installed by cgroup_tasks_open() once the pid array is
 * cached: reads go through seq_file, writes attach tasks.
 */
static struct file_operations cgroup_tasks_operations = {
	.read = seq_read,
	.llseek = seq_lseek,
	.write = cgroup_file_write,
	.release = cgroup_tasks_release,
};
2301
2302
2303
2304
2305
2306
/*
 * Open handler for the "tasks" file: snapshot the cgroup's pids into a
 * freshly sorted array shared by all readers, then hand the file over
 * to the seq_file machinery.
 */
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
	pid_t *pidarray;
	int npids;
	int retval;

	/* Nothing to do for write-only files */
	if (!(file->f_mode & FMODE_READ))
		return 0;

	/*
	 * If the cgroup gains users after we read the count, we won't
	 * have enough space - tough.  This race is indistinguishable to
	 * the caller from the case where the extra users showed up a
	 * little later.
	 */
	npids = cgroup_task_count(cgrp);
	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
	if (!pidarray)
		return -ENOMEM;
	npids = pid_array_load(pidarray, npids, cgrp);
	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);

	/*
	 * Store the array in the cgroup, freeing the old
	 * array if necessary.
	 */
	down_write(&cgrp->pids_mutex);
	kfree(cgrp->tasks_pids);
	cgrp->tasks_pids = pidarray;
	cgrp->pids_length = npids;
	cgrp->pids_use_count++;
	up_write(&cgrp->pids_mutex);

	file->f_op = &cgroup_tasks_operations;

	retval = seq_open(file, &cgroup_tasks_seq_operations);
	if (retval) {
		release_cgroup_pid_array(cgrp);
		return retval;
	}
	((struct seq_file *)file->private_data)->private = cgrp;
	return 0;
}
2352
/* Report the cgroup's notify_on_release flag. */
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
					    struct cftype *cft)
{
	return notify_on_release(cgrp);
}
2358
/*
 * Set or clear the cgroup's notify_on_release flag.  Writing the file
 * also cancels any pending releasability.
 */
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
					     struct cftype *cft,
					     u64 val)
{
	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	return 0;
}
2370
2371
2372
2373
/*
 * Control files created in every cgroup directory.
 */
static struct cftype files[] = {
	{
		.name = "tasks",
		.open = cgroup_tasks_open,
		.write_u64 = cgroup_tasks_write,
		.release = cgroup_tasks_release,
		.private = FILE_TASKLIST,
		.mode = S_IRUGO | S_IWUSR,
	},

	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
		.private = FILE_NOTIFY_ON_RELEASE,
	},
};
2391
/* "release_agent" exists only in each hierarchy's root directory. */
static struct cftype cft_release_agent = {
	.name = "release_agent",
	.read_seq_string = cgroup_release_agent_show,
	.write_string = cgroup_release_agent_write,
	.max_write_len = PATH_MAX,
	.private = FILE_RELEASE_AGENT,
};
2399
/*
 * Populate a cgroup directory: base files, the root-only
 * release_agent file, and each bound subsystem's files.  Finally
 * publish css->id->css pointers so css lookup by ID works.
 */
static int cgroup_populate_dir(struct cgroup *cgrp)
{
	int err;
	struct cgroup_subsys *ss;

	/* First clear out any existing files */
	cgroup_clear_directory(cgrp->dentry);

	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
	if (err < 0)
		return err;

	if (cgrp == cgrp->top_cgroup) {
		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
			return err;
	}

	for_each_subsys(cgrp->root, ss) {
		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
			return err;
	}
	/* This cgroup is ready now */
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		/*
		 * Update id->css pointer and make this css visible from
		 * the CSS ID functions.  This pointer is dereferenced
		 * from RCU read side without locks, hence the
		 * rcu_assign_pointer().
		 */
		if (css->id)
			rcu_assign_pointer(css->id->css, css);
	}

	return 0;
}
2435
/*
 * Attach a newly created css to its slot in the cgroup.  The initial
 * refcount of 1 is the cgroup's own base reference, cleared at rmdir
 * time by cgroup_clear_css_refs().
 */
static void init_cgroup_css(struct cgroup_subsys_state *css,
			       struct cgroup_subsys *ss,
			       struct cgroup *cgrp)
{
	css->cgroup = cgrp;
	atomic_set(&css->refcnt, 1);
	css->flags = 0;
	css->id = NULL;
	if (cgrp == dummytop)
		set_bit(CSS_ROOT, &css->flags);
	BUG_ON(cgrp->subsys[ss->subsys_id]);
	cgrp->subsys[ss->subsys_id] = css;
}
2449
/*
 * Take the hierarchy_mutex of every subsystem bound to @root, always
 * in ascending subsys-index order so lock ordering is consistent.
 */
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
	/* We need to take each hierarchy_mutex in a consistent order */
	int i;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss->root == root)
			mutex_lock(&ss->hierarchy_mutex);
	}
}
2461
/* Release the per-subsystem locks taken by cgroup_lock_hierarchy(). */
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
{
	int i;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss->root == root)
			mutex_unlock(&ss->hierarchy_mutex);
	}
}
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2482 mode_t mode)
2483{
2484 struct cgroup *cgrp;
2485 struct cgroupfs_root *root = parent->root;
2486 int err = 0;
2487 struct cgroup_subsys *ss;
2488 struct super_block *sb = root->sb;
2489
2490 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
2491 if (!cgrp)
2492 return -ENOMEM;
2493
2494
2495
2496
2497
2498
2499 atomic_inc(&sb->s_active);
2500
2501 mutex_lock(&cgroup_mutex);
2502
2503 init_cgroup_housekeeping(cgrp);
2504
2505 cgrp->parent = parent;
2506 cgrp->root = parent->root;
2507 cgrp->top_cgroup = parent->top_cgroup;
2508
2509 if (notify_on_release(parent))
2510 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2511
2512 for_each_subsys(root, ss) {
2513 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2514 if (IS_ERR(css)) {
2515 err = PTR_ERR(css);
2516 goto err_destroy;
2517 }
2518 init_cgroup_css(css, ss, cgrp);
2519 if (ss->use_id)
2520 if (alloc_css_id(ss, parent, cgrp))
2521 goto err_destroy;
2522
2523 }
2524
2525 cgroup_lock_hierarchy(root);
2526 list_add(&cgrp->sibling, &cgrp->parent->children);
2527 cgroup_unlock_hierarchy(root);
2528 root->number_of_cgroups++;
2529
2530 err = cgroup_create_dir(cgrp, dentry, mode);
2531 if (err < 0)
2532 goto err_remove;
2533
2534
2535 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
2536
2537 err = cgroup_populate_dir(cgrp);
2538
2539
2540 mutex_unlock(&cgroup_mutex);
2541 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
2542
2543 return 0;
2544
2545 err_remove:
2546
2547 cgroup_lock_hierarchy(root);
2548 list_del(&cgrp->sibling);
2549 cgroup_unlock_hierarchy(root);
2550 root->number_of_cgroups--;
2551
2552 err_destroy:
2553
2554 for_each_subsys(root, ss) {
2555 if (cgrp->subsys[ss->subsys_id])
2556 ss->destroy(ss, cgrp);
2557 }
2558
2559 mutex_unlock(&cgroup_mutex);
2560
2561
2562 deactivate_super(sb);
2563
2564 kfree(cgrp);
2565 return err;
2566}
2567
/* VFS mkdir entry point: create a child cgroup under the parent dir. */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct cgroup *c_parent = dentry->d_parent->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
2575
/*
 * Check whether any css of this cgroup holds an external reference
 * (refcnt > 1; each css keeps one base reference owned by the cgroup
 * itself).  The reads are unsynchronized, so the result is only a
 * hint - NOTE(review): callers appear to tolerate races here; confirm
 * against the rmdir/check_for_release paths.
 */
static int cgroup_has_css_refs(struct cgroup *cgrp)
{
	int i;
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		struct cgroup_subsys_state *css;
		/* Skip subsystems bound to other hierarchies */
		if (ss->root != cgrp->root)
			continue;
		css = cgrp->subsys[ss->subsys_id];
		/*
		 * css may be NULL - guard before reading the refcount.
		 */
		if (css && (atomic_read(&css->refcnt) > 1))
			return 1;
	}
	return 0;
}
2606
2607
2608
2609
2610
2611
2612
/*
 * Atomically drop every css's base reference from 1 to 0, or roll
 * back.  Pass 1: cmpxchg each refcnt 1 -> 0; any css with extra
 * references aborts the whole operation.  Pass 2: on failure restore
 * the base reference on css's already zeroed; on success mark all
 * css's CSS_REMOVED.  IRQs stay disabled for the duration.
 * Returns nonzero on success.
 */
static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	unsigned long flags;
	bool failed = false;
	local_irq_save(flags);
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		int refcnt;
		while (1) {
			/* We can only remove a CSS with a refcnt of 1 */
			refcnt = atomic_read(&css->refcnt);
			if (refcnt > 1) {
				failed = true;
				goto done;
			}
			BUG_ON(!refcnt);
			/*
			 * Drop the refcnt to 0 while we check other
			 * subsystems; racing getters spin (or fail)
			 * until we either set CSS_REMOVED or abort.
			 */
			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
				break;
			cpu_relax();
		}
	}
 done:
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		if (failed) {
			/*
			 * Restore the old refcnt if we previously
			 * managed to clear it from 1 to 0.
			 */
			if (!atomic_read(&css->refcnt))
				atomic_set(&css->refcnt, 1);
		} else {
			/* Commit to removal */
			set_bit(CSS_REMOVED, &css->flags);
		}
	}
	local_irq_restore(flags);
	return !failed;
}
2659
/*
 * VFS rmdir for a cgroup directory.  A cgroup can only be removed when
 * it has no tasks (count == 0) and no children; the checks are done
 * once optimistically, then repeated after the sleeping
 * ->pre_destroy() callbacks (which may re-populate the group).  If the
 * css refcounts can't be cleared, wait on cgroup_rmdir_waitq for the
 * holders to drop them and retry from the top.
 */
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cgrp = dentry->d_fsdata;
	struct dentry *d;
	struct cgroup *parent;
	DEFINE_WAIT(wait);
	int ret;

	/* the vfs holds both inode->i_mutex already */
again:
	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	if (!list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	mutex_unlock(&cgroup_mutex);

	/*
	 * Call pre_destroy handlers of subsystems.  May sleep, so it
	 * must run without cgroup_mutex held.
	 */
	ret = cgroup_call_pre_destroy(cgrp);
	if (ret)
		return ret;

	mutex_lock(&cgroup_mutex);
	parent = cgrp->parent;
	/* Re-check: pre_destroy may have re-populated the group */
	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	/*
	 * Subsystems may hold temporary css references after
	 * pre_destroy().  Rather than returning -EBUSY, arm the
	 * waitqueue: css_put() wakes us when the refcount drops,
	 * and we retry the removal from the top.
	 */
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);

	if (!cgroup_clear_css_refs(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		schedule();
		finish_wait(&cgroup_rmdir_waitq, &wait);
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		if (signal_pending(current))
			return -EINTR;
		goto again;
	}
	/* No css_tryget() can succeed from here on. */
	finish_wait(&cgroup_rmdir_waitq, &wait);
	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
		list_del(&cgrp->release_list);
	spin_unlock(&release_list_lock);

	cgroup_lock_hierarchy(cgrp->root);
	/* delete this cgroup from parent->children */
	list_del(&cgrp->sibling);
	cgroup_unlock_hierarchy(cgrp->root);

	spin_lock(&cgrp->dentry->d_lock);
	d = dget(cgrp->dentry);
	spin_unlock(&d->d_lock);

	cgroup_d_remove_dir(d);
	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	mutex_unlock(&cgroup_mutex);
	return 0;
}
2743
/*
 * Register @ss on the dummy (unattached) hierarchy and create its root
 * css on dummytop.  Boot-time only; failures are fatal.
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;
	css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);

	/*
	 * Update init_css_set to contain a pointer to this state -
	 * since the subsystem is newly registered, all tasks (and
	 * hence init_css_set) are in its top cgroup.
	 */
	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

	need_forkexit_callback |= ss->fork || ss->exit;

	/*
	 * At system boot, before all subsystems have been registered,
	 * no tasks have been forked, so we don't need to invoke fork
	 * callbacks here.
	 */
	BUG_ON(!list_empty(&init_task.tasks));

	mutex_init(&ss->hierarchy_mutex);
	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
	ss->active = 1;
}
2775
2776
2777
2778
2779
2780
2781
/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Set up the init css_set, the dummy hierarchy root, and any
 * subsystems that requested early init.
 */
int __init cgroup_init_early(void)
{
	int i;
	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&rootnode);
	root_count = 1;
	init_task.cgroups = &init_css_set;

	/* Link init_css_set into the dummy top cgroup */
	init_css_set_link.cg = &init_css_set;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&css_set_table[i]);

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		/* Sanity-check the statically built subsys table */
		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->create);
		BUG_ON(!ss->destroy);
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}
2821
2822
2823
2824
2825
2826
2827
/**
 * cgroup_init - cgroup initialization
 *
 * Register the cgroup filesystem and /proc file, and initialize any
 * subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	int err;
	int i;
	struct hlist_head *hhead;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (!ss->early_init)
			cgroup_init_subsys(ss);
		if (ss->use_id)
			cgroup_subsys_init_idr(ss);
	}

	/* Add init_css_set to the hash table */
	hhead = css_set_hash(init_css_set.subsys);
	hlist_add_head(&init_css_set.hlist, hhead);

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0)
		goto out;

	/* Best effort; /proc/cgroups is informational only */
	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

out:
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);

	return err;
}
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
/*
 * proc_cgroup_show - back /proc/<pid>/cgroup: one line per active
 * hierarchy, formatted "subsys_bits:subsys,names:/cgroup/path".
 * cgroup_mutex keeps the hierarchies stable while we walk them.
 */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);

	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int subsys_id;
		int count = 0;

		seq_printf(m, "%lu:", root->subsys_bits);
		for_each_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		seq_putc(m, ':');
		/* Any subsystem of the root will do for the path lookup */
		get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
		cgrp = task_cgroup(tsk, subsys_id);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}
2926
2927static int cgroup_open(struct inode *inode, struct file *file)
2928{
2929 struct pid *pid = PROC_I(inode)->pid;
2930 return single_open(file, proc_cgroup_show, pid);
2931}
2932
/* File ops for /proc/<pid>/cgroup. */
struct file_operations proc_cgroup_operations = {
	.open = cgroup_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
2939
2940
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * cgroup_mutex keeps subsys_bits and number_of_cgroups
	 * consistent while we print them.
	 */
	mutex_lock(&cgroup_mutex);
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		seq_printf(m, "%s\t%lu\t%d\t%d\n",
			   ss->name, ss->root->subsys_bits,
			   ss->root->number_of_cgroups, !ss->disabled);
	}
	mutex_unlock(&cgroup_mutex);
	return 0;
}
2956
/* Open handler for /proc/cgroups. */
static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}
2961
/* File ops for /proc/cgroups. */
static struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
/**
 * cgroup_fork - attach a newly forked task to its parent's cgroups
 * @child: the forked task
 *
 * The child shares its parent's css_set (reference taken here under
 * the parent's task lock, so ->cgroups can't change under us).  The
 * child's cg_list starts empty; linking happens in cgroup_post_fork().
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	child->cgroups = current->cgroups;
	get_css_set(child->cgroups);
	task_unlock(current);
	INIT_LIST_HEAD(&child->cg_list);
}
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002void cgroup_fork_callbacks(struct task_struct *child)
3003{
3004 if (need_forkexit_callback) {
3005 int i;
3006 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3007 struct cgroup_subsys *ss = subsys[i];
3008 if (ss->fork)
3009 ss->fork(ss, child);
3010 }
3011 }
3012}
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Links the task into its css_set's task list, but only if the
 * lazily-enabled per-cgroup task lists are in use.
 */
void cgroup_post_fork(struct task_struct *child)
{
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &child->cgroups->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}
}
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
/**
 * cgroup_exit - detach cgroup state from an exiting task
 * @tsk: the exiting task
 * @run_callbacks: whether to run subsystem exit callbacks
 *
 * Runs subsystem exit callbacks (if requested), unlinks the task from
 * its css_set's task list, and reparents it to init_css_set, dropping
 * the reference on the old css_set.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	int i;
	struct css_set *cg;

	if (run_callbacks && need_forkexit_callback) {
		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->exit)
				ss->exit(ss, tsk);
		}
	}

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking css_set_lock.
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cg = tsk->cgroups;
	tsk->cgroups = &init_css_set;
	task_unlock(tsk);
	if (cg)
		put_css_set_taskexit(cg);
}
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
/**
 * cgroup_clone - clone the cgroup the given subsystem is attached to
 * @tsk: the task to be moved
 * @subsys: the given subsystem
 * @nodename: the name for the new cgroup
 *
 * Duplicate @tsk's cgroup in the hierarchy @subsys is attached to and
 * move @tsk into the new child.  Remount/move races are detected after
 * the mkdir and handled by retrying (deliberately leaking the stale
 * directory).
 */
int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
		     char *nodename)
{
	struct dentry *dentry;
	int ret = 0;
	struct cgroup *parent, *child;
	struct inode *inode;
	struct css_set *cg;
	struct cgroupfs_root *root;
	struct cgroup_subsys *ss;

	/* We shouldn't be called by an unregistered subsystem */
	BUG_ON(!subsys->active);

	/* First figure out what hierarchy and cgroup we're dealing
	 * with, and pin them so we can drop cgroup_mutex */
	mutex_lock(&cgroup_mutex);
 again:
	root = subsys->root;
	if (root == &rootnode) {
		/* Not bound to a real hierarchy: nothing to clone */
		mutex_unlock(&cgroup_mutex);
		return 0;
	}

	/* Pin the hierarchy */
	if (!atomic_inc_not_zero(&root->sb->s_active)) {
		/* We race with the final deactivate_super() */
		mutex_unlock(&cgroup_mutex);
		return 0;
	}

	/* Keep the cgroup alive */
	task_lock(tsk);
	parent = task_cgroup(tsk, subsys->subsys_id);
	cg = tsk->cgroups;
	get_css_set(cg);
	task_unlock(tsk);

	mutex_unlock(&cgroup_mutex);

	/* Now do the VFS work to create a cgroup */
	inode = parent->dentry->d_inode;

	/* Hold the parent directory mutex across this operation to
	 * stop any rmdir of the cgroup underneath us */
	mutex_lock(&inode->i_mutex);
	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
	if (IS_ERR(dentry)) {
		printk(KERN_INFO
		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
		       PTR_ERR(dentry));
		ret = PTR_ERR(dentry);
		goto out_release;
	}

	/* Create the cgroup directory, which also creates the cgroup */
	ret = vfs_mkdir(inode, dentry, 0755);
	child = __d_cgrp(dentry);
	dput(dentry);
	if (ret) {
		printk(KERN_INFO
		       "Failed to create cgroup %s: %d\n", nodename,
		       ret);
		goto out_release;
	}

	/* The cgroup now exists.  Retake cgroup_mutex and check that
	 * we're still in the same state we thought we were. */
	mutex_lock(&cgroup_mutex);
	if ((root != subsys->root) ||
	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
		/* Aargh, we raced ... */
		mutex_unlock(&inode->i_mutex);
		put_css_set(cg);

		deactivate_super(root->sb);
		/* The cgroup is still accessible in the VFS, but
		 * we're not going to try to rmdir() it at this
		 * point. */
		printk(KERN_INFO
		       "Race in cgroup_clone() - leaking cgroup %s\n",
		       nodename);
		goto again;
	}

	/* do any required auto-setup */
	for_each_subsys(root, ss) {
		if (ss->post_clone)
			ss->post_clone(ss, child);
	}

	/* All seems fine.  Finish by moving the task into the new cgroup */
	ret = cgroup_attach_task(child, tsk);
	mutex_unlock(&cgroup_mutex);

 out_release:
	mutex_unlock(&inode->i_mutex);

	mutex_lock(&cgroup_mutex);
	put_css_set(cg);
	mutex_unlock(&cgroup_mutex);
	deactivate_super(root->sb);
	return ret;
}
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3233{
3234 int ret;
3235 struct cgroup *target;
3236 int subsys_id;
3237
3238 if (cgrp == dummytop)
3239 return 1;
3240
3241 get_first_subsys(cgrp, NULL, &subsys_id);
3242 target = task_cgroup(task, subsys_id);
3243 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3244 cgrp = cgrp->parent;
3245 ret = (cgrp == target);
3246 return ret;
3247}
3248
3249static void check_for_release(struct cgroup *cgrp)
3250{
3251
3252
3253 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
3254 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
3255
3256
3257
3258 int need_schedule_work = 0;
3259 spin_lock(&release_list_lock);
3260 if (!cgroup_is_removed(cgrp) &&
3261 list_empty(&cgrp->release_list)) {
3262 list_add(&cgrp->release_list, &release_list);
3263 need_schedule_work = 1;
3264 }
3265 spin_unlock(&release_list_lock);
3266 if (need_schedule_work)
3267 schedule_work(&release_agent_work);
3268 }
3269}
3270
/*
 * __css_put - drop a reference on a cgroup_subsys_state.
 * @css: the state whose refcount is decremented
 *
 * NOTE(review): the "== 1" test implies the refcount carries a base
 * reference (dropping to 1 means no external users remain) — confirm
 * against the matching css_get()/css_tryget() implementations, which are
 * outside this view.  When the last external ref goes away, the cgroup
 * is flagged releasable (if notify_on_release) and rmdir waiters are
 * woken.  rcu_read_lock protects @cgrp across the flag/wakeup work.
 */
void __css_put(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	rcu_read_lock();
	if (atomic_dec_return(&css->refcnt) == 1) {
		if (notify_on_release(cgrp)) {
			set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		cgroup_wakeup_rmdir_waiters(cgrp);
	}
	rcu_read_unlock();
}
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
/*
 * cgroup_release_agent - workqueue handler that notifies userspace of
 * released cgroups by invoking the hierarchy's configured release agent
 * with the cgroup's path as its single argument.
 *
 * Drains release_list one entry at a time.  release_list_lock is dropped
 * while allocating buffers and while the usermode helper runs, and
 * cgroup_mutex is dropped around the (potentially slow) exec.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
		list_del_init(&cgrp->release_list);
		/* Drop the spinlock while we allocate/build the argument
		 * strings; it is re-taken at continue_free below. */
		spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment for the helper */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop cgroup_mutex while the helper runs: the exec can
		 * hit disk and must not be done under the mutex. */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}
3356
3357static int __init cgroup_disable(char *str)
3358{
3359 int i;
3360 char *token;
3361
3362 while ((token = strsep(&str, ",")) != NULL) {
3363 if (!*token)
3364 continue;
3365
3366 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3367 struct cgroup_subsys *ss = subsys[i];
3368
3369 if (!strcmp(token, ss->name)) {
3370 ss->disabled = 1;
3371 printk(KERN_INFO "Disabling %s control group"
3372 " subsystem\n", ss->name);
3373 break;
3374 }
3375 }
3376 }
3377 return 1;
3378}
3379__setup("cgroup_disable=", cgroup_disable);
3380
3381
3382
3383
3384
3385
3386
3387
3388unsigned short css_id(struct cgroup_subsys_state *css)
3389{
3390 struct css_id *cssid = rcu_dereference(css->id);
3391
3392 if (cssid)
3393 return cssid->id;
3394 return 0;
3395}
3396
3397unsigned short css_depth(struct cgroup_subsys_state *css)
3398{
3399 struct css_id *cssid = rcu_dereference(css->id);
3400
3401 if (cssid)
3402 return cssid->depth;
3403 return 0;
3404}
3405
3406bool css_is_ancestor(struct cgroup_subsys_state *child,
3407 const struct cgroup_subsys_state *root)
3408{
3409 struct css_id *child_id = rcu_dereference(child->id);
3410 struct css_id *root_id = rcu_dereference(root->id);
3411
3412 if (!child_id || !root_id || (child_id->depth < root_id->depth))
3413 return false;
3414 return child_id->stack[root_id->depth] == root_id->id;
3415}
3416
3417static void __free_css_id_cb(struct rcu_head *head)
3418{
3419 struct css_id *id;
3420
3421 id = container_of(head, struct css_id, rcu_head);
3422 kfree(id);
3423}
3424
/*
 * free_css_id - detach and free the css_id attached to @css.
 * @ss: the subsystem owning the idr the id lives in
 * @css: the state being torn down
 *
 * Unpublishes both directions of the css <-> id link before removing the
 * id from the idr, then defers the actual kfree via RCU so concurrent
 * readers holding rcu_read_lock can still dereference the id safely.
 */
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
	struct css_id *id = css->id;

	/* May be called before an id was ever assigned. */
	if (!id)
		return;

	BUG_ON(!ss->use_id);

	/* Unpublish first, remove from the idr second, free last (RCU). */
	rcu_assign_pointer(id->css, NULL);
	rcu_assign_pointer(css->id, NULL);
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	spin_unlock(&ss->id_lock);
	call_rcu(&id->rcu_head, __free_css_id_cb);
}
3441
3442
3443
3444
3445
3446
/*
 * get_new_cssid - allocate a css_id with room for @depth + 1 ancestor
 * entries on its stack and register it in @ss's idr.
 *
 * Returns the new id (zeroed, not yet published to any css) or an
 * ERR_PTR on failure.  IDs are confined to 1..CSS_ID_MAX; 0 is reserved
 * to mean "no id".
 */
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
	struct css_id *newid;
	int myid, error, size;

	BUG_ON(!ss->use_id);

	/* Flexible stack[] gets depth + 1 slots: ancestors plus self. */
	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
	newid = kzalloc(size, GFP_KERNEL);
	if (!newid)
		return ERR_PTR(-ENOMEM);

	/* Preallocate idr memory outside the spinlock. */
	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
		error = -ENOMEM;
		goto err_out;
	}
	spin_lock(&ss->id_lock);
	/* Don't use 0: allocate an id in the range 1..65535. */
	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
	spin_unlock(&ss->id_lock);

	/* idr_get_new_above() failing here means no free id slots. */
	if (error) {
		error = -ENOSPC;
		goto err_out;
	}
	if (myid > CSS_ID_MAX)
		goto remove_idr;

	newid->id = myid;
	newid->depth = depth;
	return newid;
remove_idr:
	/* id exceeded the unsigned-short range: back it out of the idr. */
	error = -ENOSPC;
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, myid);
	spin_unlock(&ss->id_lock);
err_out:
	kfree(newid);
	return ERR_PTR(error);

}
3489
3490static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
3491{
3492 struct css_id *newid;
3493 struct cgroup_subsys_state *rootcss;
3494
3495 spin_lock_init(&ss->id_lock);
3496 idr_init(&ss->idr);
3497
3498 rootcss = init_css_set.subsys[ss->subsys_id];
3499 newid = get_new_cssid(ss, 0);
3500 if (IS_ERR(newid))
3501 return PTR_ERR(newid);
3502
3503 newid->stack[0] = newid->id;
3504 newid->css = rootcss;
3505 rootcss->id = newid;
3506 return 0;
3507}
3508
3509static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3510 struct cgroup *child)
3511{
3512 int subsys_id, i, depth = 0;
3513 struct cgroup_subsys_state *parent_css, *child_css;
3514 struct css_id *child_id, *parent_id = NULL;
3515
3516 subsys_id = ss->subsys_id;
3517 parent_css = parent->subsys[subsys_id];
3518 child_css = child->subsys[subsys_id];
3519 depth = css_depth(parent_css) + 1;
3520 parent_id = parent_css->id;
3521
3522 child_id = get_new_cssid(ss, depth);
3523 if (IS_ERR(child_id))
3524 return PTR_ERR(child_id);
3525
3526 for (i = 0; i < depth; i++)
3527 child_id->stack[i] = parent_id->stack[i];
3528 child_id->stack[depth] = child_id->id;
3529
3530
3531
3532
3533 rcu_assign_pointer(child_css->id, child_id);
3534
3535 return 0;
3536}
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3547{
3548 struct css_id *cssid = NULL;
3549
3550 BUG_ON(!ss->use_id);
3551 cssid = idr_find(&ss->idr, id);
3552
3553 if (unlikely(!cssid))
3554 return NULL;
3555
3556 return rcu_dereference(cssid->css);
3557}
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
/*
 * css_get_next - look up the next css under a given hierarchy root.
 * @ss: the subsystem to search
 * @id: search start point (ids >= @id are scanned, @id included)
 * @root: css whose subtree bounds the search
 * @foundid: out-param receiving the id of the returned css
 *
 * Scans @ss's idr for the first live css at id >= @id whose ancestor at
 * @root's depth is @root itself.  Returns NULL when none is found.
 */
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
	     struct cgroup_subsys_state *root, int *foundid)
{
	struct cgroup_subsys_state *ret = NULL;
	struct css_id *tmp;
	int tmpid;
	int rootid = css_id(root);
	int depth = css_depth(root);

	/* A root with no id (id 0) has nothing under it. */
	if (!rootid)
		return NULL;

	BUG_ON(!ss->use_id);

	tmpid = id;
	while (1) {
		/*
		 * Fetch the next populated entry at id >= tmpid; the lock
		 * is only held across the idr walk itself.
		 */
		spin_lock(&ss->id_lock);
		tmp = idr_get_next(&ss->idr, &tmpid);
		spin_unlock(&ss->id_lock);

		if (!tmp)
			break;
		/* Accept only entries whose ancestor at @depth is @root,
		 * and whose css is still published (non-NULL). */
		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
			ret = rcu_dereference(tmp->css);
			if (ret) {
				*foundid = tmpid;
				break;
			}
		}
		/* Entry rejected or unpublished: resume scan past it. */
		tmpid = tmpid + 1;
	}
	return ret;
}
3608
3609