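/*
 * memcontrol.c - memory resource controller ("memcg")
 *
 * Accounts page cache, anonymous memory and swap per cgroup through
 * res_counters, keeps per-memcg LRU lists and a soft-limit RB-tree, and
 * implements usage thresholds, OOM notification/handling and moving of
 * charges between cgroups when a task migrates.
 */
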
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1;	/* remembers the boot option */
#else
#define do_swap_account		(0)
#endif

/*
 * The per-memcg event counter (MEM_CGROUP_EVENTS) is bumped on every
 * page-in/page-out; periodic work is triggered whenever its low bits
 * wrap, which is cheaper than consulting jiffies.
 */
#define THRESHOLDS_EVENTS_THRESH (7)	/* re-check thresholds every 2^7 events */
#define SOFTLIMIT_EVENTS_THRESH (10)	/* update soft-limit tree every 2^10 events */

/*
 * Per-memcg statistics.
 */
enum mem_cgroup_stat_index {
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as page cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of mapped file pages */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages swapped out */
	MEM_CGROUP_EVENTS,		/* incremented at every pagein/pageout */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * per-cgroup LRU lists and their page counts, manipulated under
	 * the zone's lru_lock.
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* usage above the soft limit
						   when queued on the tree */
	bool			on_tree;	/* queued on the soft-limit tree? */
	struct mem_cgroup	*mem;		/* back pointer to the owning memcg */
};
/* Macro for accessing the per-zone LRU counters */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their soft limit are maintained in per-node, per-zone
 * RB-trees, independent of their hierarchy representation.
 */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For thresholds */
struct mem_cgroup_threshold_ary {
	/* An array index pointing to the threshold just below current usage */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare array kept around so that replacing the primary array on
	 * registration/unregistration can usually avoid a fresh allocation.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM notifier */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

/*
 * The memory controller data structure. The memory controller controls
 * both page cache and RSS (anonymous memory) per cgroup.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive lists, similar to the
	 * per-zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protects the reclaim-related members below
	 */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from.
	 */
	int last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	atomic_t	oom_lock;
	atomic_t	refcnt;

	unsigned int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For OOM notifier event fds */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when the task is moved into
	 * this mem_cgroup? And what type of charges should we move?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * percpu statistics counters
	 */
	struct mem_cgroup_stat_cpu *stat;
};

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous pages and their swap */
	MOVE_CHARGE_TYPE_FILE,	/* file pages */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* the task moving charges */
	wait_queue_head_t waitq;		/* waitq for other contexts */
} mc = {
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
			&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
			&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* shorthands for the page_cgroup flag bits used in this file */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
/* Not used, kept for completeness */
#define PCGF_ACCT	(1UL << PCG_ACCT)

/*
 * For encoding cft->private values: the resource type goes in the upper
 * 16 bits and the attribute in the lower 16 bits, e.g.
 * MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT).
 */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for the OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}

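/*
 * Re-queue @mem and each of its ancestors on the soft-limit RB-tree of
 * the zone that @page belongs to, keyed by how far usage exceeds the
 * soft limit.  Called periodically from memcg_check_events().
 */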
438static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
439{
440 unsigned long long excess;
441 struct mem_cgroup_per_zone *mz;
442 struct mem_cgroup_tree_per_zone *mctz;
443 int nid = page_to_nid(page);
444 int zid = page_zonenum(page);
445 mctz = soft_limit_tree_from_page(page);
446
447
448
449
450
451 for (; mem; mem = parent_mem_cgroup(mem)) {
452 mz = mem_cgroup_zoneinfo(mem, nid, zid);
453 excess = res_counter_soft_limit_excess(&mem->res);
454
455
456
457
458 if (excess || mz->on_tree) {
459 spin_lock(&mctz->lock);
460
461 if (mz->on_tree)
462 __mem_cgroup_remove_exceeded(mem, mz, mctz);
463
464
465
466
467 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
468 spin_unlock(&mctz->lock);
469 }
470 }
471}
472
473static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
474{
475 int node, zone;
476 struct mem_cgroup_per_zone *mz;
477 struct mem_cgroup_tree_per_zone *mctz;
478
479 for_each_node_state(node, N_POSSIBLE) {
480 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
481 mz = mem_cgroup_zoneinfo(mem, node, zone);
482 mctz = soft_limit_tree_node_zone(node, zone);
483 mem_cgroup_remove_exceeded(mem, mz, mctz);
484 }
485 }
486}
487
488static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
489{
490 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
491}
492
493static struct mem_cgroup_per_zone *
494__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
495{
496 struct rb_node *rightmost = NULL;
497 struct mem_cgroup_per_zone *mz;
498
499retry:
500 mz = NULL;
501 rightmost = rb_last(&mctz->rb_root);
502 if (!rightmost)
503 goto done;
504
505 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
506
507
508
509
510
511 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
512 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
513 !css_tryget(&mz->mem->css))
514 goto retry;
515done:
516 return mz;
517}
518
519static struct mem_cgroup_per_zone *
520mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
521{
522 struct mem_cgroup_per_zone *mz;
523
524 spin_lock(&mctz->lock);
525 mz = __mem_cgroup_largest_soft_limit_node(mctz);
526 spin_unlock(&mctz->lock);
527 return mz;
528}
529
530static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
531 enum mem_cgroup_stat_index idx)
532{
533 int cpu;
534 s64 val = 0;
535
536 for_each_possible_cpu(cpu)
537 val += per_cpu(mem->stat->count[idx], cpu);
538 return val;
539}
540
541static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
542{
543 s64 ret;
544
545 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
546 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
547 return ret;
548}
549
550static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
551 bool charge)
552{
553 int val = (charge) ? 1 : -1;
554 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
555}
556
557static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
558 struct page_cgroup *pc,
559 bool charge)
560{
561 int val = (charge) ? 1 : -1;
562
563 preempt_disable();
564
565 if (PageCgroupCache(pc))
566 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
567 else
568 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
569
570 if (charge)
571 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
572 else
573 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
574 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
575
576 preempt_enable();
577}
578
579static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
580 enum lru_list idx)
581{
582 int nid, zid;
583 struct mem_cgroup_per_zone *mz;
584 u64 total = 0;
585
586 for_each_online_node(nid)
587 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
588 mz = mem_cgroup_zoneinfo(mem, nid, zid);
589 total += MEM_CGROUP_ZSTAT(mz, idx);
590 }
591 return total;
592}
593
594static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
595{
596 s64 val;
597
598 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
599
600 return !(val & ((1 << event_mask_shift) - 1));
601}
602
603
604
605
606
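/*
 * Called on every charged/uncharged page.  The per-cpu event counter is
 * tested against its low bits: roughly every 2^THRESHOLDS_EVENTS_THRESH
 * events the usage thresholds are re-evaluated, and (nested inside that,
 * since its mask is a superset) roughly every 2^SOFTLIMIT_EVENTS_THRESH
 * events the soft-limit tree is updated for this page's zone.
 */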
607static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
608{
609
610 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
611 mem_cgroup_threshold(mem);
612 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
613 mem_cgroup_update_tree(mem, page);
614 }
615}
616
617static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
618{
619 return container_of(cgroup_subsys_state(cont,
620 mem_cgroup_subsys_id), struct mem_cgroup,
621 css);
622}
623
624struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
625{
626
627
628
629
630
631 if (unlikely(!p))
632 return NULL;
633
634 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
635 struct mem_cgroup, css);
636}
637
638static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
639{
640 struct mem_cgroup *mem = NULL;
641
642 if (!mm)
643 return NULL;
644
645
646
647
648
649 rcu_read_lock();
650 do {
651 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
652 if (unlikely(!mem))
653 break;
654 } while (!css_tryget(&mem->css));
655 rcu_read_unlock();
656 return mem;
657}
658
659
660
661
662static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
663 int (*func)(struct mem_cgroup *, void *))
664{
665 int found, ret, nextid;
666 struct cgroup_subsys_state *css;
667 struct mem_cgroup *mem;
668
669 if (!root->use_hierarchy)
670 return (*func)(root, data);
671
672 nextid = 1;
673 do {
674 ret = 0;
675 mem = NULL;
676
677 rcu_read_lock();
678 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
679 &found);
680 if (css && css_tryget(css))
681 mem = container_of(css, struct mem_cgroup, css);
682 rcu_read_unlock();
683
684 if (mem) {
685 ret = (*func)(mem, data);
686 css_put(&mem->css);
687 }
688 nextid = found + 1;
689 } while (!ret && css);
690
691 return ret;
692}
693
694static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
695{
696 return (mem == root_mem_cgroup);
697}
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
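/*
 * The LRU hooks below are called from the global LRU code under
 * zone->lru_lock, without the page_cgroup lock, so they must be careful
 * about the validity of pc->mem_cgroup: a page_cgroup whose PCG_USED bit
 * is clear is not accounted to any private LRU, and the AcctLRU bit
 * records whether the page has been added to a memcg LRU (and counted in
 * the per-zone MEM_CGROUP_ZSTAT counters).
 */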
713void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
714{
715 struct page_cgroup *pc;
716 struct mem_cgroup_per_zone *mz;
717
718 if (mem_cgroup_disabled())
719 return;
720 pc = lookup_page_cgroup(page);
721
722 if (!TestClearPageCgroupAcctLRU(pc))
723 return;
724 VM_BUG_ON(!pc->mem_cgroup);
725
726
727
728
729 mz = page_cgroup_zoneinfo(pc);
730 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
731 if (mem_cgroup_is_root(pc->mem_cgroup))
732 return;
733 VM_BUG_ON(list_empty(&pc->lru));
734 list_del_init(&pc->lru);
735 return;
736}
737
738void mem_cgroup_del_lru(struct page *page)
739{
740 mem_cgroup_del_lru_list(page, page_lru(page));
741}
742
743void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
744{
745 struct mem_cgroup_per_zone *mz;
746 struct page_cgroup *pc;
747
748 if (mem_cgroup_disabled())
749 return;
750
751 pc = lookup_page_cgroup(page);
752
753
754
755
756 smp_rmb();
757
758 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
759 return;
760 mz = page_cgroup_zoneinfo(pc);
761 list_move(&pc->lru, &mz->lists[lru]);
762}
763
764void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
765{
766 struct page_cgroup *pc;
767 struct mem_cgroup_per_zone *mz;
768
769 if (mem_cgroup_disabled())
770 return;
771 pc = lookup_page_cgroup(page);
772 VM_BUG_ON(PageCgroupAcctLRU(pc));
773
774
775
776
777 smp_rmb();
778 if (!PageCgroupUsed(pc))
779 return;
780
781 mz = page_cgroup_zoneinfo(pc);
782 MEM_CGROUP_ZSTAT(mz, lru) += 1;
783 SetPageCgroupAcctLRU(pc);
784 if (mem_cgroup_is_root(pc->mem_cgroup))
785 return;
786 list_add(&pc->lru, &mz->lists[lru]);
787}
788
789
790
791
792
793
794
795
796static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
797{
798 unsigned long flags;
799 struct zone *zone = page_zone(page);
800 struct page_cgroup *pc = lookup_page_cgroup(page);
801
802 spin_lock_irqsave(&zone->lru_lock, flags);
803
804
805
806
807 if (!PageCgroupUsed(pc))
808 mem_cgroup_del_lru_list(page, page_lru(page));
809 spin_unlock_irqrestore(&zone->lru_lock, flags);
810}
811
812static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
813{
814 unsigned long flags;
815 struct zone *zone = page_zone(page);
816 struct page_cgroup *pc = lookup_page_cgroup(page);
817
818 spin_lock_irqsave(&zone->lru_lock, flags);
819
820 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
821 mem_cgroup_add_lru_list(page, page_lru(page));
822 spin_unlock_irqrestore(&zone->lru_lock, flags);
823}
824
825
826void mem_cgroup_move_lists(struct page *page,
827 enum lru_list from, enum lru_list to)
828{
829 if (mem_cgroup_disabled())
830 return;
831 mem_cgroup_del_lru_list(page, from);
832 mem_cgroup_add_lru_list(page, to);
833}
834
835int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
836{
837 int ret;
838 struct mem_cgroup *curr = NULL;
839
840 task_lock(task);
841 rcu_read_lock();
842 curr = try_get_mem_cgroup_from_mm(task->mm);
843 rcu_read_unlock();
844 task_unlock(task);
845 if (!curr)
846 return 0;
847
848
849
850
851
852
853 if (mem->use_hierarchy)
854 ret = css_is_ancestor(&curr->css, &mem->css);
855 else
856 ret = (curr == mem);
857 css_put(&curr->css);
858 return ret;
859}
860
861
862
863
864int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
865{
866 int prev_priority;
867
868 spin_lock(&mem->reclaim_param_lock);
869 prev_priority = mem->prev_priority;
870 spin_unlock(&mem->reclaim_param_lock);
871
872 return prev_priority;
873}
874
875void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
876{
877 spin_lock(&mem->reclaim_param_lock);
878 if (priority < mem->prev_priority)
879 mem->prev_priority = priority;
880 spin_unlock(&mem->reclaim_param_lock);
881}
882
883void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
884{
885 spin_lock(&mem->reclaim_param_lock);
886 mem->prev_priority = priority;
887 spin_unlock(&mem->reclaim_param_lock);
888}
889
890static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
891{
892 unsigned long active;
893 unsigned long inactive;
894 unsigned long gb;
895 unsigned long inactive_ratio;
896
897 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
898 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
899
900 gb = (inactive + active) >> (30 - PAGE_SHIFT);
901 if (gb)
902 inactive_ratio = int_sqrt(10 * gb);
903 else
904 inactive_ratio = 1;
905
906 if (present_pages) {
907 present_pages[0] = inactive;
908 present_pages[1] = active;
909 }
910
911 return inactive_ratio;
912}
913
914int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
915{
916 unsigned long active;
917 unsigned long inactive;
918 unsigned long present_pages[2];
919 unsigned long inactive_ratio;
920
921 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
922
923 inactive = present_pages[0];
924 active = present_pages[1];
925
926 if (inactive * inactive_ratio < active)
927 return 1;
928
929 return 0;
930}
931
932int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
933{
934 unsigned long active;
935 unsigned long inactive;
936
937 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
938 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
939
940 return (active > inactive);
941}
942
943unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
944 struct zone *zone,
945 enum lru_list lru)
946{
947 int nid = zone->zone_pgdat->node_id;
948 int zid = zone_idx(zone);
949 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
950
951 return MEM_CGROUP_ZSTAT(mz, lru);
952}
953
954struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
955 struct zone *zone)
956{
957 int nid = zone->zone_pgdat->node_id;
958 int zid = zone_idx(zone);
959 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
960
961 return &mz->reclaim_stat;
962}
963
964struct zone_reclaim_stat *
965mem_cgroup_get_reclaim_stat_from_page(struct page *page)
966{
967 struct page_cgroup *pc;
968 struct mem_cgroup_per_zone *mz;
969
970 if (mem_cgroup_disabled())
971 return NULL;
972
973 pc = lookup_page_cgroup(page);
974
975
976
977
978 smp_rmb();
979 if (!PageCgroupUsed(pc))
980 return NULL;
981
982 mz = page_cgroup_zoneinfo(pc);
983 if (!mz)
984 return NULL;
985
986 return &mz->reclaim_stat;
987}
988
989unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
990 struct list_head *dst,
991 unsigned long *scanned, int order,
992 int mode, struct zone *z,
993 struct mem_cgroup *mem_cont,
994 int active, int file)
995{
996 unsigned long nr_taken = 0;
997 struct page *page;
998 unsigned long scan;
999 LIST_HEAD(pc_list);
1000 struct list_head *src;
1001 struct page_cgroup *pc, *tmp;
1002 int nid = z->zone_pgdat->node_id;
1003 int zid = zone_idx(z);
1004 struct mem_cgroup_per_zone *mz;
1005 int lru = LRU_FILE * file + active;
1006 int ret;
1007
1008 BUG_ON(!mem_cont);
1009 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1010 src = &mz->lists[lru];
1011
1012 scan = 0;
1013 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1014 if (scan >= nr_to_scan)
1015 break;
1016
1017 page = pc->page;
1018 if (unlikely(!PageCgroupUsed(pc)))
1019 continue;
1020 if (unlikely(!PageLRU(page)))
1021 continue;
1022
1023 scan++;
1024 ret = __isolate_lru_page(page, mode, file);
1025 switch (ret) {
1026 case 0:
1027 list_move(&page->lru, dst);
1028 mem_cgroup_del_lru(page);
1029 nr_taken++;
1030 break;
1031 case -EBUSY:
1032
1033 mem_cgroup_rotate_lru_list(page, page_lru(page));
1034 break;
1035 default:
1036 break;
1037 }
1038 }
1039
1040 *scanned = scan;
1041 return nr_taken;
1042}
1043
1044#define mem_cgroup_from_res_counter(counter, member) \
1045 container_of(counter, struct mem_cgroup, member)
1046
1047static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1048{
1049 if (do_swap_account) {
1050 if (res_counter_check_under_limit(&mem->res) &&
1051 res_counter_check_under_limit(&mem->memsw))
1052 return true;
1053 } else
1054 if (res_counter_check_under_limit(&mem->res))
1055 return true;
1056 return false;
1057}
1058
1059static unsigned int get_swappiness(struct mem_cgroup *memcg)
1060{
1061 struct cgroup *cgrp = memcg->css.cgroup;
1062 unsigned int swappiness;
1063
1064
1065 if (cgrp->parent == NULL)
1066 return vm_swappiness;
1067
1068 spin_lock(&memcg->reclaim_param_lock);
1069 swappiness = memcg->swappiness;
1070 spin_unlock(&memcg->reclaim_param_lock);
1071
1072 return swappiness;
1073}
1074
1075static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1076{
1077 int *val = data;
1078 (*val)++;
1079 return 0;
1080}
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1091{
1092 struct cgroup *task_cgrp;
1093 struct cgroup *mem_cgrp;
1094
1095
1096
1097
1098
1099 static char memcg_name[PATH_MAX];
1100 int ret;
1101
1102 if (!memcg || !p)
1103 return;
1104
1105
1106 rcu_read_lock();
1107
1108 mem_cgrp = memcg->css.cgroup;
1109 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1110
1111 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1112 if (ret < 0) {
1113
1114
1115
1116
1117 rcu_read_unlock();
1118 goto done;
1119 }
1120 rcu_read_unlock();
1121
1122 printk(KERN_INFO "Task in %s killed", memcg_name);
1123
1124 rcu_read_lock();
1125 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1126 if (ret < 0) {
1127 rcu_read_unlock();
1128 goto done;
1129 }
1130 rcu_read_unlock();
1131
1132
1133
1134
1135 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1136done:
1137
1138 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1139 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1140 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1141 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1142 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1143 "failcnt %llu\n",
1144 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1145 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1146 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1147}
1148
1149
1150
1151
1152
1153static int mem_cgroup_count_children(struct mem_cgroup *mem)
1154{
1155 int num = 0;
1156 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1157 return num;
1158}
1159
1160
1161
1162
1163
1164
1165static struct mem_cgroup *
1166mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1167{
1168 struct mem_cgroup *ret = NULL;
1169 struct cgroup_subsys_state *css;
1170 int nextid, found;
1171
1172 if (!root_mem->use_hierarchy) {
1173 css_get(&root_mem->css);
1174 ret = root_mem;
1175 }
1176
1177 while (!ret) {
1178 rcu_read_lock();
1179 nextid = root_mem->last_scanned_child + 1;
1180 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1181 &found);
1182 if (css && css_tryget(css))
1183 ret = container_of(css, struct mem_cgroup, css);
1184
1185 rcu_read_unlock();
1186
1187 spin_lock(&root_mem->reclaim_param_lock);
1188 if (!css) {
1189
1190 root_mem->last_scanned_child = 0;
1191 } else
1192 root_mem->last_scanned_child = found;
1193 spin_unlock(&root_mem->reclaim_param_lock);
1194 }
1195
1196 return ret;
1197}
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
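/*
 * Walk the hierarchy under @root_mem and reclaim from one child at a
 * time, remembering the last child visited so no single child is
 * penalized for its position in the list.  Gives up after visiting
 * @root_mem itself twice (or, for soft-limit reclaim, after reclaiming a
 * quarter of the excess or MEM_CGROUP_MAX_RECLAIM_LOOPS iterations).
 * When MEM_CGROUP_RECLAIM_SHRINK is set, the result of the first reclaim
 * attempt is returned immediately.
 */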
1211static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1212 struct zone *zone,
1213 gfp_t gfp_mask,
1214 unsigned long reclaim_options)
1215{
1216 struct mem_cgroup *victim;
1217 int ret, total = 0;
1218 int loop = 0;
1219 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1220 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1221 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1222 unsigned long excess = mem_cgroup_get_excess(root_mem);
1223
1224
1225 if (root_mem->memsw_is_minimum)
1226 noswap = true;
1227
1228 while (1) {
1229 victim = mem_cgroup_select_victim(root_mem);
1230 if (victim == root_mem) {
1231 loop++;
1232 if (loop >= 1)
1233 drain_all_stock_async();
1234 if (loop >= 2) {
1235
1236
1237
1238
1239
1240 if (!check_soft || !total) {
1241 css_put(&victim->css);
1242 break;
1243 }
1244
1245
1246
1247
1248
1249
1250 if (total >= (excess >> 2) ||
1251 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1252 css_put(&victim->css);
1253 break;
1254 }
1255 }
1256 }
1257 if (!mem_cgroup_local_usage(victim)) {
1258
1259 css_put(&victim->css);
1260 continue;
1261 }
1262
1263 if (check_soft)
1264 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1265 noswap, get_swappiness(victim), zone,
1266 zone->zone_pgdat->node_id);
1267 else
1268 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1269 noswap, get_swappiness(victim));
1270 css_put(&victim->css);
1271
1272
1273
1274
1275
1276 if (shrink)
1277 return ret;
1278 total += ret;
1279 if (check_soft) {
1280 if (res_counter_check_under_soft_limit(&root_mem->res))
1281 return total;
1282 } else if (mem_cgroup_check_under_limit(root_mem))
1283 return 1 + total;
1284 }
1285 return total;
1286}
1287
1288static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1289{
1290 int *val = (int *)data;
1291 int x;
1292
1293
1294
1295
1296
1297 x = atomic_inc_return(&mem->oom_lock);
1298 *val = max(x, *val);
1299 return 0;
1300}
1301
1302
1303
1304
1305static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1306{
1307 int lock_count = 0;
1308
1309 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1310
1311 if (lock_count == 1)
1312 return true;
1313 return false;
1314}
1315
1316static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1317{
1318
1319
1320
1321
1322
1323 atomic_add_unless(&mem->oom_lock, -1, 0);
1324 return 0;
1325}
1326
1327static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1328{
1329 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1330}
1331
1332static DEFINE_MUTEX(memcg_oom_mutex);
1333static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1334
1335struct oom_wait_info {
1336 struct mem_cgroup *mem;
1337 wait_queue_t wait;
1338};
1339
1340static int memcg_oom_wake_function(wait_queue_t *wait,
1341 unsigned mode, int sync, void *arg)
1342{
1343 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1344 struct oom_wait_info *oom_wait_info;
1345
1346 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1347
1348 if (oom_wait_info->mem == wake_mem)
1349 goto wakeup;
1350
1351 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1352 return 0;
1353
1354
1355
1356
1357 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1358 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1359 return 0;
1360
1361wakeup:
1362 return autoremove_wake_function(wait, mode, sync, arg);
1363}
1364
1365static void memcg_wakeup_oom(struct mem_cgroup *mem)
1366{
1367
1368 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1369}
1370
1371static void memcg_oom_recover(struct mem_cgroup *mem)
1372{
1373 if (atomic_read(&mem->oom_lock))
1374 memcg_wakeup_oom(mem);
1375}
1376
1377
1378
1379
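/*
 * OOM handling for a failed memcg charge.  Only one context per hierarchy
 * owns the OOM lock; the owner either invokes the memcg OOM killer or
 * (with oom_kill_disable set) sleeps, while everyone else waits on
 * memcg_oom_waitq.  Returns true if the caller should retry the charge,
 * false if the current task was killed or has a fatal signal pending.
 */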
1380bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1381{
1382 struct oom_wait_info owait;
1383 bool locked, need_to_kill;
1384
1385 owait.mem = mem;
1386 owait.wait.flags = 0;
1387 owait.wait.func = memcg_oom_wake_function;
1388 owait.wait.private = current;
1389 INIT_LIST_HEAD(&owait.wait.task_list);
1390 need_to_kill = true;
1391
1392 mutex_lock(&memcg_oom_mutex);
1393 locked = mem_cgroup_oom_lock(mem);
1394
1395
1396
1397
1398
1399 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1400 if (!locked || mem->oom_kill_disable)
1401 need_to_kill = false;
1402 if (locked)
1403 mem_cgroup_oom_notify(mem);
1404 mutex_unlock(&memcg_oom_mutex);
1405
1406 if (need_to_kill) {
1407 finish_wait(&memcg_oom_waitq, &owait.wait);
1408 mem_cgroup_out_of_memory(mem, mask);
1409 } else {
1410 schedule();
1411 finish_wait(&memcg_oom_waitq, &owait.wait);
1412 }
1413 mutex_lock(&memcg_oom_mutex);
1414 mem_cgroup_oom_unlock(mem);
1415 memcg_wakeup_oom(mem);
1416 mutex_unlock(&memcg_oom_mutex);
1417
1418 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1419 return false;
1420
1421 schedule_timeout(1);
1422 return true;
1423}
1424
1425
1426
1427
1428
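/*
 * Keep the FILE_MAPPED statistic of the owning memcg in sync when a file
 * page is mapped (@val > 0) or unmapped.  Runs under the page_cgroup
 * lock so the page cannot move to another memcg while the counter is
 * being updated.
 */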
1429void mem_cgroup_update_file_mapped(struct page *page, int val)
1430{
1431 struct mem_cgroup *mem;
1432 struct page_cgroup *pc;
1433
1434 pc = lookup_page_cgroup(page);
1435 if (unlikely(!pc))
1436 return;
1437
1438 lock_page_cgroup(pc);
1439 mem = pc->mem_cgroup;
1440 if (!mem || !PageCgroupUsed(pc))
1441 goto done;
1442
1443
1444
1445
1446 if (val > 0) {
1447 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1448 SetPageCgroupFileMapped(pc);
1449 } else {
1450 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1451 ClearPageCgroupFileMapped(pc);
1452 }
1453
1454done:
1455 unlock_page_cgroup(pc);
1456}
1457
1458
1459
1460
1461
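/*
 * Per-CPU charge cache ("stock"): each CPU may hold up to CHARGE_SIZE
 * bytes already charged against one memcg's res_counters, so that most
 * single-page charges can be served locally without touching the shared
 * counters.
 */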
1462#define CHARGE_SIZE (32 * PAGE_SIZE)
1463struct memcg_stock_pcp {
1464 struct mem_cgroup *cached;
1465 int charge;
1466 struct work_struct work;
1467};
1468static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1469static atomic_t memcg_drain_count;
1470
1471
1472
1473
1474
1475
1476
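/*
 * Try to serve a single page charge from this CPU's stock.  Returns true
 * (and consumes PAGE_SIZE from the stock) only if the stock is non-empty
 * and already belongs to @mem.
 */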
1477static bool consume_stock(struct mem_cgroup *mem)
1478{
1479 struct memcg_stock_pcp *stock;
1480 bool ret = true;
1481
1482 stock = &get_cpu_var(memcg_stock);
1483 if (mem == stock->cached && stock->charge)
1484 stock->charge -= PAGE_SIZE;
1485 else
1486 ret = false;
1487 put_cpu_var(memcg_stock);
1488 return ret;
1489}
1490
1491
1492
1493
1494static void drain_stock(struct memcg_stock_pcp *stock)
1495{
1496 struct mem_cgroup *old = stock->cached;
1497
1498 if (stock->charge) {
1499 res_counter_uncharge(&old->res, stock->charge);
1500 if (do_swap_account)
1501 res_counter_uncharge(&old->memsw, stock->charge);
1502 }
1503 stock->cached = NULL;
1504 stock->charge = 0;
1505}
1506
1507
1508
1509
1510
1511static void drain_local_stock(struct work_struct *dummy)
1512{
1513 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1514 drain_stock(stock);
1515}
1516
1517
1518
1519
1520
1521static void refill_stock(struct mem_cgroup *mem, int val)
1522{
1523 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1524
1525 if (stock->cached != mem) {
1526 drain_stock(stock);
1527 stock->cached = mem;
1528 }
1529 stock->charge += val;
1530 put_cpu_var(memcg_stock);
1531}
1532
1533
1534
1535
1536
1537
1538
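/*
 * Return every CPU's cached stock to the res_counters.  Asynchronous:
 * work is scheduled on each online CPU and the function returns without
 * waiting.  memcg_drain_count keeps concurrent drains from piling up.
 */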
1539static void drain_all_stock_async(void)
1540{
1541 int cpu;
1542
1543
1544
1545
1546
1547
1548 if (atomic_read(&memcg_drain_count))
1549 return;
1550
1551 atomic_inc(&memcg_drain_count);
1552 get_online_cpus();
1553 for_each_online_cpu(cpu) {
1554 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1555 schedule_work_on(cpu, &stock->work);
1556 }
1557 put_online_cpus();
1558 atomic_dec(&memcg_drain_count);
1559
1560}
1561
1562
1563static void drain_all_stock_sync(void)
1564{
1565
1566 atomic_inc(&memcg_drain_count);
1567 schedule_on_each_cpu(drain_local_stock);
1568 atomic_dec(&memcg_drain_count);
1569}
1570
1571static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1572 unsigned long action,
1573 void *hcpu)
1574{
1575 int cpu = (unsigned long)hcpu;
1576 struct memcg_stock_pcp *stock;
1577
1578 if (action != CPU_DEAD)
1579 return NOTIFY_OK;
1580 stock = &per_cpu(memcg_stock, cpu);
1581 drain_stock(stock);
1582 return NOTIFY_OK;
1583}
1584
1585
1586
1587
1588
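/*
 * Charge one page against *memcg (or the memcg of @mm when *memcg is
 * NULL), batching CHARGE_SIZE at a time through the per-CPU stock.  On
 * success 0 is returned and *memcg holds a css reference; reclaim and,
 * if @oom is true, the memcg OOM handler are tried before giving up with
 * -ENOMEM.  Tasks being OOM-killed bypass the charge (*memcg set NULL).
 */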
1589static int __mem_cgroup_try_charge(struct mm_struct *mm,
1590 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1591{
1592 struct mem_cgroup *mem, *mem_over_limit;
1593 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1594 struct res_counter *fail_res;
1595 int csize = CHARGE_SIZE;
1596
1597
1598
1599
1600
1601
1602 if (unlikely(test_thread_flag(TIF_MEMDIE)
1603 || fatal_signal_pending(current)))
1604 goto bypass;
1605
1606
1607
1608
1609
1610
1611
1612 mem = *memcg;
1613 if (likely(!mem)) {
1614 mem = try_get_mem_cgroup_from_mm(mm);
1615 *memcg = mem;
1616 } else {
1617 css_get(&mem->css);
1618 }
1619 if (unlikely(!mem))
1620 return 0;
1621
1622 VM_BUG_ON(css_is_removed(&mem->css));
1623 if (mem_cgroup_is_root(mem))
1624 goto done;
1625
1626 while (1) {
1627 int ret = 0;
1628 unsigned long flags = 0;
1629
1630 if (consume_stock(mem))
1631 goto done;
1632
1633 ret = res_counter_charge(&mem->res, csize, &fail_res);
1634 if (likely(!ret)) {
1635 if (!do_swap_account)
1636 break;
1637 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1638 if (likely(!ret))
1639 break;
1640
1641 res_counter_uncharge(&mem->res, csize);
1642 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1643 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1644 memsw);
1645 } else
1646
1647 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1648 res);
1649
1650
1651 if (csize > PAGE_SIZE) {
1652 csize = PAGE_SIZE;
1653 continue;
1654 }
1655 if (!(gfp_mask & __GFP_WAIT))
1656 goto nomem;
1657
1658 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1659 gfp_mask, flags);
1660 if (ret)
1661 continue;
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671 if (mem_cgroup_check_under_limit(mem_over_limit))
1672 continue;
1673
1674
1675 if (mc.moving_task && current != mc.moving_task) {
1676 struct mem_cgroup *from, *to;
1677 bool do_continue = false;
1678
1679
1680
1681
1682 from = mc.from;
1683 to = mc.to;
1684 if (from && css_tryget(&from->css)) {
1685 if (mem_over_limit->use_hierarchy)
1686 do_continue = css_is_ancestor(
1687 &from->css,
1688 &mem_over_limit->css);
1689 else
1690 do_continue = (from == mem_over_limit);
1691 css_put(&from->css);
1692 }
1693 if (!do_continue && to && css_tryget(&to->css)) {
1694 if (mem_over_limit->use_hierarchy)
1695 do_continue = css_is_ancestor(
1696 &to->css,
1697 &mem_over_limit->css);
1698 else
1699 do_continue = (to == mem_over_limit);
1700 css_put(&to->css);
1701 }
1702 if (do_continue) {
1703 DEFINE_WAIT(wait);
1704 prepare_to_wait(&mc.waitq, &wait,
1705 TASK_INTERRUPTIBLE);
1706
1707 if (mc.moving_task)
1708 schedule();
1709 finish_wait(&mc.waitq, &wait);
1710 continue;
1711 }
1712 }
1713
1714 if (!nr_retries--) {
1715 if (!oom)
1716 goto nomem;
1717 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1718 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1719 continue;
1720 }
1721
1722 css_put(&mem->css);
1723 goto bypass;
1724 }
1725 }
1726 if (csize > PAGE_SIZE)
1727 refill_stock(mem, csize - PAGE_SIZE);
1728done:
1729 return 0;
1730nomem:
1731 css_put(&mem->css);
1732 return -ENOMEM;
1733bypass:
1734 *memcg = NULL;
1735 return 0;
1736}
1737
1738
1739
1740
1741
1742
1743static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1744 unsigned long count)
1745{
1746 if (!mem_cgroup_is_root(mem)) {
1747 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1748 if (do_swap_account)
1749 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1750 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1751 WARN_ON_ONCE(count > INT_MAX);
1752 __css_put(&mem->css, (int)count);
1753 }
1754
1755}
1756
1757static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1758{
1759 __mem_cgroup_cancel_charge(mem, 1);
1760}
1761
1762
1763
1764
1765
1766
1767
1768static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1769{
1770 struct cgroup_subsys_state *css;
1771
1772
1773 if (!id)
1774 return NULL;
1775 css = css_lookup(&mem_cgroup_subsys, id);
1776 if (!css)
1777 return NULL;
1778 return container_of(css, struct mem_cgroup, css);
1779}
1780
1781struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1782{
1783 struct mem_cgroup *mem = NULL;
1784 struct page_cgroup *pc;
1785 unsigned short id;
1786 swp_entry_t ent;
1787
1788 VM_BUG_ON(!PageLocked(page));
1789
1790 pc = lookup_page_cgroup(page);
1791 lock_page_cgroup(pc);
1792 if (PageCgroupUsed(pc)) {
1793 mem = pc->mem_cgroup;
1794 if (mem && !css_tryget(&mem->css))
1795 mem = NULL;
1796 } else if (PageSwapCache(page)) {
1797 ent.val = page_private(page);
1798 id = lookup_swap_cgroup(ent);
1799 rcu_read_lock();
1800 mem = mem_cgroup_lookup(id);
1801 if (mem && !css_tryget(&mem->css))
1802 mem = NULL;
1803 rcu_read_unlock();
1804 }
1805 unlock_page_cgroup(pc);
1806 return mem;
1807}
1808
1809
1810
1811
1812
1813
1814static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1815 struct page_cgroup *pc,
1816 enum charge_type ctype)
1817{
1818
1819 if (!mem)
1820 return;
1821
1822 lock_page_cgroup(pc);
1823 if (unlikely(PageCgroupUsed(pc))) {
1824 unlock_page_cgroup(pc);
1825 mem_cgroup_cancel_charge(mem);
1826 return;
1827 }
1828
1829 pc->mem_cgroup = mem;
1830
1831
1832
1833
1834
1835
1836
1837 smp_wmb();
1838 switch (ctype) {
1839 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1840 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1841 SetPageCgroupCache(pc);
1842 SetPageCgroupUsed(pc);
1843 break;
1844 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1845 ClearPageCgroupCache(pc);
1846 SetPageCgroupUsed(pc);
1847 break;
1848 default:
1849 break;
1850 }
1851
1852 mem_cgroup_charge_statistics(mem, pc, true);
1853
1854 unlock_page_cgroup(pc);
1855
1856
1857
1858
1859
1860 memcg_check_events(mem, pc->page);
1861}
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
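/*
 * Move the accounting of pc's page from @from to @to.  The caller must
 * hold the page_cgroup lock, the page must be off the LRU, and
 * pc->mem_cgroup must be @from.  No charge is taken against @to here;
 * when @uncharge is true the old charge against @from is dropped,
 * otherwise the caller remains responsible for it.
 */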
1880static void __mem_cgroup_move_account(struct page_cgroup *pc,
1881 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1882{
1883 VM_BUG_ON(from == to);
1884 VM_BUG_ON(PageLRU(pc->page));
1885 VM_BUG_ON(!PageCgroupLocked(pc));
1886 VM_BUG_ON(!PageCgroupUsed(pc));
1887 VM_BUG_ON(pc->mem_cgroup != from);
1888
1889 if (PageCgroupFileMapped(pc)) {
1890
1891 preempt_disable();
1892 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1893 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1894 preempt_enable();
1895 }
1896 mem_cgroup_charge_statistics(from, pc, false);
1897 if (uncharge)
1898
1899 mem_cgroup_cancel_charge(from);
1900
1901
1902 pc->mem_cgroup = to;
1903 mem_cgroup_charge_statistics(to, pc, true);
1904
1905
1906
1907
1908
1909
1910
1911}
1912
1913
1914
1915
1916
1917static int mem_cgroup_move_account(struct page_cgroup *pc,
1918 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1919{
1920 int ret = -EINVAL;
1921 lock_page_cgroup(pc);
1922 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1923 __mem_cgroup_move_account(pc, from, to, uncharge);
1924 ret = 0;
1925 }
1926 unlock_page_cgroup(pc);
1927
1928
1929
1930 memcg_check_events(to, pc->page);
1931 memcg_check_events(from, pc->page);
1932 return ret;
1933}
1934
1935
1936
1937
1938
1939static int mem_cgroup_move_parent(struct page_cgroup *pc,
1940 struct mem_cgroup *child,
1941 gfp_t gfp_mask)
1942{
1943 struct page *page = pc->page;
1944 struct cgroup *cg = child->css.cgroup;
1945 struct cgroup *pcg = cg->parent;
1946 struct mem_cgroup *parent;
1947 int ret;
1948
1949
1950 if (!pcg)
1951 return -EINVAL;
1952
1953 ret = -EBUSY;
1954 if (!get_page_unless_zero(page))
1955 goto out;
1956 if (isolate_lru_page(page))
1957 goto put;
1958
1959 parent = mem_cgroup_from_cont(pcg);
1960 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1961 if (ret || !parent)
1962 goto put_back;
1963
1964 ret = mem_cgroup_move_account(pc, child, parent, true);
1965 if (ret)
1966 mem_cgroup_cancel_charge(parent);
1967put_back:
1968 putback_lru_page(page);
1969put:
1970 put_page(page);
1971out:
1972 return ret;
1973}
1974
1975
1976
1977
1978
1979
1980
1981static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1982 gfp_t gfp_mask, enum charge_type ctype,
1983 struct mem_cgroup *memcg)
1984{
1985 struct mem_cgroup *mem;
1986 struct page_cgroup *pc;
1987 int ret;
1988
1989 pc = lookup_page_cgroup(page);
1990
1991 if (unlikely(!pc))
1992 return 0;
1993 prefetchw(pc);
1994
1995 mem = memcg;
1996 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1997 if (ret || !mem)
1998 return ret;
1999
2000 __mem_cgroup_commit_charge(mem, pc, ctype);
2001 return 0;
2002}
2003
2004int mem_cgroup_newpage_charge(struct page *page,
2005 struct mm_struct *mm, gfp_t gfp_mask)
2006{
2007 if (mem_cgroup_disabled())
2008 return 0;
2009 if (PageCompound(page))
2010 return 0;
2011
2012
2013
2014
2015
2016
2017
2018 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2019 return 0;
2020 if (unlikely(!mm))
2021 mm = &init_mm;
2022 return mem_cgroup_charge_common(page, mm, gfp_mask,
2023 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
2024}
2025
2026static void
2027__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2028 enum charge_type ctype);
2029
2030int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2031 gfp_t gfp_mask)
2032{
2033 struct mem_cgroup *mem = NULL;
2034 int ret;
2035
2036 if (mem_cgroup_disabled())
2037 return 0;
2038 if (PageCompound(page))
2039 return 0;
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051 if (!(gfp_mask & __GFP_WAIT)) {
2052 struct page_cgroup *pc;
2053
2054
2055 pc = lookup_page_cgroup(page);
2056 if (!pc)
2057 return 0;
2058 lock_page_cgroup(pc);
2059 if (PageCgroupUsed(pc)) {
2060 unlock_page_cgroup(pc);
2061 return 0;
2062 }
2063 unlock_page_cgroup(pc);
2064 }
2065
2066 if (unlikely(!mm && !mem))
2067 mm = &init_mm;
2068
2069 if (page_is_file_cache(page))
2070 return mem_cgroup_charge_common(page, mm, gfp_mask,
2071 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
2072
2073
2074 if (PageSwapCache(page)) {
2075 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2076 if (!ret)
2077 __mem_cgroup_commit_charge_swapin(page, mem,
2078 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2079 } else
2080 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2081 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
2082
2083 return ret;
2084}
2085
2086
2087
2088
2089
2090
2091
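/*
 * Charge for a swap-cache page at swap-in.  If a memcg is recorded for
 * the page or its swap entry, that memcg is charged; otherwise the
 * charge falls back to the current mm's memcg.  The css reference taken
 * here is consumed by the later commit or cancel.
 */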
2092int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2093 struct page *page,
2094 gfp_t mask, struct mem_cgroup **ptr)
2095{
2096 struct mem_cgroup *mem;
2097 int ret;
2098
2099 if (mem_cgroup_disabled())
2100 return 0;
2101
2102 if (!do_swap_account)
2103 goto charge_cur_mm;
2104
2105
2106
2107
2108
2109
2110 if (!PageSwapCache(page))
2111 goto charge_cur_mm;
2112 mem = try_get_mem_cgroup_from_page(page);
2113 if (!mem)
2114 goto charge_cur_mm;
2115 *ptr = mem;
2116 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2117
2118 css_put(&mem->css);
2119 return ret;
2120charge_cur_mm:
2121 if (unlikely(!mm))
2122 mm = &init_mm;
2123 return __mem_cgroup_try_charge(mm, mask, ptr, true);
2124}
2125
2126static void
2127__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2128 enum charge_type ctype)
2129{
2130 struct page_cgroup *pc;
2131
2132 if (mem_cgroup_disabled())
2133 return;
2134 if (!ptr)
2135 return;
2136 cgroup_exclude_rmdir(&ptr->css);
2137 pc = lookup_page_cgroup(page);
2138 mem_cgroup_lru_del_before_commit_swapcache(page);
2139 __mem_cgroup_commit_charge(ptr, pc, ctype);
2140 mem_cgroup_lru_add_after_commit_swapcache(page);
2141
2142
2143
2144
2145
2146
2147
2148 if (do_swap_account && PageSwapCache(page)) {
2149 swp_entry_t ent = {.val = page_private(page)};
2150 unsigned short id;
2151 struct mem_cgroup *memcg;
2152
2153 id = swap_cgroup_record(ent, 0);
2154 rcu_read_lock();
2155 memcg = mem_cgroup_lookup(id);
2156 if (memcg) {
2157
2158
2159
2160
2161 if (!mem_cgroup_is_root(memcg))
2162 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2163 mem_cgroup_swap_statistics(memcg, false);
2164 mem_cgroup_put(memcg);
2165 }
2166 rcu_read_unlock();
2167 }
2168
2169
2170
2171
2172
2173 cgroup_release_and_wakeup_rmdir(&ptr->css);
2174}
2175
2176void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2177{
2178 __mem_cgroup_commit_charge_swapin(page, ptr,
2179 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2180}
2181
2182void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2183{
2184 if (mem_cgroup_disabled())
2185 return;
2186 if (!mem)
2187 return;
2188 mem_cgroup_cancel_charge(mem);
2189}
2190
2191static void
2192__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2193{
2194 struct memcg_batch_info *batch = NULL;
2195 bool uncharge_memsw = true;
2196
2197 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2198 uncharge_memsw = false;
2199
	batch = &current->memcg_batch;
2201
2202
2203
2204
2205
2206 if (!batch->memcg)
2207 batch->memcg = mem;
2208
2209
2210
2211
2212
2213
2214
2215
2216 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2217 goto direct_uncharge;
2218
2219
2220
2221
2222
2223
2224 if (batch->memcg != mem)
2225 goto direct_uncharge;
2226
2227 batch->bytes += PAGE_SIZE;
2228 if (uncharge_memsw)
2229 batch->memsw_bytes += PAGE_SIZE;
2230 return;
2231direct_uncharge:
2232 res_counter_uncharge(&mem->res, PAGE_SIZE);
2233 if (uncharge_memsw)
2234 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2235 if (unlikely(batch->memcg != mem))
2236 memcg_oom_recover(mem);
2237 return;
2238}
2239
2240
2241
2242
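/*
 * Uncharge a page and return the mem_cgroup it was charged to, or NULL
 * if there was nothing to uncharge (still mapped, under migration, ...).
 * The res_counter part of the uncharge may be deferred into
 * current->memcg_batch between uncharge_start() and uncharge_end().
 */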
2243static struct mem_cgroup *
2244__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2245{
2246 struct page_cgroup *pc;
2247 struct mem_cgroup *mem = NULL;
2248 struct mem_cgroup_per_zone *mz;
2249
2250 if (mem_cgroup_disabled())
2251 return NULL;
2252
2253 if (PageSwapCache(page))
2254 return NULL;
2255
2256
2257
2258
2259 pc = lookup_page_cgroup(page);
2260 if (unlikely(!pc || !PageCgroupUsed(pc)))
2261 return NULL;
2262
2263 lock_page_cgroup(pc);
2264
2265 mem = pc->mem_cgroup;
2266
2267 if (!PageCgroupUsed(pc))
2268 goto unlock_out;
2269
2270 switch (ctype) {
2271 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2272 case MEM_CGROUP_CHARGE_TYPE_DROP:
2273
2274 if (page_mapped(page) || PageCgroupMigration(pc))
2275 goto unlock_out;
2276 break;
2277 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2278 if (!PageAnon(page)) {
2279 if (page->mapping && !page_is_file_cache(page))
2280 goto unlock_out;
2281 } else if (page_mapped(page))
2282 goto unlock_out;
2283 break;
2284 default:
2285 break;
2286 }
2287
2288 if (!mem_cgroup_is_root(mem))
2289 __do_uncharge(mem, ctype);
2290 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2291 mem_cgroup_swap_statistics(mem, true);
2292 mem_cgroup_charge_statistics(mem, pc, false);
2293
2294 ClearPageCgroupUsed(pc);
2295
2296
2297
2298
2299
2300
2301
2302 mz = page_cgroup_zoneinfo(pc);
2303 unlock_page_cgroup(pc);
2304
2305 memcg_check_events(mem, page);
2306
2307 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2308 css_put(&mem->css);
2309
2310 return mem;
2311
2312unlock_out:
2313 unlock_page_cgroup(pc);
2314 return NULL;
2315}
2316
2317void mem_cgroup_uncharge_page(struct page *page)
2318{
2319
2320 if (page_mapped(page))
2321 return;
2322 if (page->mapping && !PageAnon(page))
2323 return;
2324 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2325}
2326
2327void mem_cgroup_uncharge_cache_page(struct page *page)
2328{
2329 VM_BUG_ON(page_mapped(page));
2330 VM_BUG_ON(page->mapping);
2331 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2332}
2333
2334
2335
2336
2337
2338
2339
2340
2341
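/*
 * mem_cgroup_uncharge_start()/end() bracket bulk page freeing (truncate,
 * unmap, ...).  Pages freed in between usually belong to the same memcg,
 * so their uncharges are accumulated in current->memcg_batch and applied
 * to the res_counters once, when the outermost uncharge_end() runs.
 */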
2342void mem_cgroup_uncharge_start(void)
2343{
2344 current->memcg_batch.do_batch++;
2345
2346 if (current->memcg_batch.do_batch == 1) {
2347 current->memcg_batch.memcg = NULL;
2348 current->memcg_batch.bytes = 0;
2349 current->memcg_batch.memsw_bytes = 0;
2350 }
2351}
2352
2353void mem_cgroup_uncharge_end(void)
2354{
	struct memcg_batch_info *batch = &current->memcg_batch;
2356
2357 if (!batch->do_batch)
2358 return;
2359
2360 batch->do_batch--;
2361 if (batch->do_batch)
2362 return;
2363
2364 if (!batch->memcg)
2365 return;
2366
2367
2368
2369
2370 if (batch->bytes)
2371 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2372 if (batch->memsw_bytes)
2373 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2374 memcg_oom_recover(batch->memcg);
2375
2376 batch->memcg = NULL;
2377}
2378
2379#ifdef CONFIG_SWAP
2380
2381
2382
2383
2384void
2385mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2386{
2387 struct mem_cgroup *memcg;
2388 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2389
2390 if (!swapout)
2391 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2392
2393 memcg = __mem_cgroup_uncharge_common(page, ctype);
2394
2395
2396 if (do_swap_account && swapout && memcg) {
2397 swap_cgroup_record(ent, css_id(&memcg->css));
2398 mem_cgroup_get(memcg);
2399 }
2400 if (swapout && memcg)
2401 css_put(&memcg->css);
2402}
2403#endif
2404
2405#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2406
2407
2408
2409
2410void mem_cgroup_uncharge_swap(swp_entry_t ent)
2411{
2412 struct mem_cgroup *memcg;
2413 unsigned short id;
2414
2415 if (!do_swap_account)
2416 return;
2417
2418 id = swap_cgroup_record(ent, 0);
2419 rcu_read_lock();
2420 memcg = mem_cgroup_lookup(id);
2421 if (memcg) {
2422
2423
2424
2425
2426 if (!mem_cgroup_is_root(memcg))
2427 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2428 mem_cgroup_swap_statistics(memcg, false);
2429 mem_cgroup_put(memcg);
2430 }
2431 rcu_read_unlock();
2432}
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
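/*
 * Move the swap charge of @entry from @from to @to by swapping the
 * swap_cgroup record.  Succeeds (returns 0) only if the record still
 * points at @from; the caller is expected to have pre-charged @to.
 * With @need_fixup, @from's memsw counter and the extra res charge and
 * css reference held on @to are corrected as well.  Returns -EINVAL on
 * failure.
 */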
2449static int mem_cgroup_move_swap_account(swp_entry_t entry,
2450 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2451{
2452 unsigned short old_id, new_id;
2453
2454 old_id = css_id(&from->css);
2455 new_id = css_id(&to->css);
2456
2457 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2458 mem_cgroup_swap_statistics(from, false);
2459 mem_cgroup_swap_statistics(to, true);
2460
2461
2462
2463
2464
2465
2466
2467
2468 mem_cgroup_get(to);
2469 if (need_fixup) {
2470 if (!mem_cgroup_is_root(from))
2471 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2472 mem_cgroup_put(from);
2473
2474
2475
2476
2477 if (!mem_cgroup_is_root(to))
2478 res_counter_uncharge(&to->res, PAGE_SIZE);
2479 css_put(&to->css);
2480 }
2481 return 0;
2482 }
2483 return -EINVAL;
2484}
2485#else
2486static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2487 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2488{
2489 return -EINVAL;
2490}
2491#endif
2492
2493
2494
2495
2496
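/*
 * Called before migrating @page to @newpage: take a pre-charge against
 * the owning memcg (returned through @ptr) so the charge can be
 * committed to whichever page survives in mem_cgroup_end_migration(),
 * and mark anonymous pages with the migration flag so they are not
 * uncharged while in flight.
 */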
2497int mem_cgroup_prepare_migration(struct page *page,
2498 struct page *newpage, struct mem_cgroup **ptr)
2499{
2500 struct page_cgroup *pc;
2501 struct mem_cgroup *mem = NULL;
2502 enum charge_type ctype;
2503 int ret = 0;
2504
2505 if (mem_cgroup_disabled())
2506 return 0;
2507
2508 pc = lookup_page_cgroup(page);
2509 lock_page_cgroup(pc);
2510 if (PageCgroupUsed(pc)) {
2511 mem = pc->mem_cgroup;
2512 css_get(&mem->css);
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542 if (PageAnon(page))
2543 SetPageCgroupMigration(pc);
2544 }
2545 unlock_page_cgroup(pc);
2546
2547
2548
2549
2550 if (!mem)
2551 return 0;
2552
2553 *ptr = mem;
2554 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2555 css_put(&mem->css);
2556 if (ret || *ptr == NULL) {
2557 if (PageAnon(page)) {
2558 lock_page_cgroup(pc);
2559 ClearPageCgroupMigration(pc);
2560 unlock_page_cgroup(pc);
2561
2562
2563
2564 mem_cgroup_uncharge_page(page);
2565 }
2566 return -ENOMEM;
2567 }
2568
2569
2570
2571
2572
2573
2574 pc = lookup_page_cgroup(newpage);
2575 if (PageAnon(page))
2576 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2577 else if (page_is_file_cache(page))
2578 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2579 else
2580 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2581 __mem_cgroup_commit_charge(mem, pc, ctype);
2582 return ret;
2583}
2584
2585
2586void mem_cgroup_end_migration(struct mem_cgroup *mem,
2587 struct page *oldpage, struct page *newpage)
2588{
2589 struct page *used, *unused;
2590 struct page_cgroup *pc;
2591
2592 if (!mem)
2593 return;
2594
2595 cgroup_exclude_rmdir(&mem->css);
2596
2597 if (oldpage->mapping) {
2598 used = oldpage;
2599 unused = newpage;
2600 } else {
2601 used = newpage;
2602 unused = oldpage;
2603 }
2604
2605
2606
2607
2608
2609 pc = lookup_page_cgroup(oldpage);
2610 lock_page_cgroup(pc);
2611 ClearPageCgroupMigration(pc);
2612 unlock_page_cgroup(pc);
2613
2614 if (unused != oldpage)
2615 pc = lookup_page_cgroup(unused);
2616 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2617
2618 pc = lookup_page_cgroup(used);
2619
2620
2621
2622
2623
2624
2625
2626
2627 if (PageAnon(used))
2628 mem_cgroup_uncharge_page(used);
2629
2630
2631
2632
2633
2634
2635 cgroup_release_and_wakeup_rmdir(&mem->css);
2636}
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646int mem_cgroup_shmem_charge_fallback(struct page *page,
2647 struct mm_struct *mm,
2648 gfp_t gfp_mask)
2649{
2650 struct mem_cgroup *mem = NULL;
2651 int ret;
2652
2653 if (mem_cgroup_disabled())
2654 return 0;
2655
2656 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2657 if (!ret)
2658 mem_cgroup_cancel_charge_swapin(mem);
2659
2660 return ret;
2661}
2662
2663static DEFINE_MUTEX(set_limit_mutex);
2664
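/*
 * Update memory.limit_in_bytes.  The new limit must not exceed the
 * current mem+swap limit; while usage is above the new value,
 * hierarchical reclaim is retried up to MEM_CGROUP_RECLAIM_RETRIES times
 * per child before giving up.
 */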
2665static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2666 unsigned long long val)
2667{
2668 int retry_count;
2669 u64 memswlimit, memlimit;
2670 int ret = 0;
2671 int children = mem_cgroup_count_children(memcg);
2672 u64 curusage, oldusage;
2673 int enlarge;
2674
2675
2676
2677
2678
2679
2680 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2681
2682 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2683
2684 enlarge = 0;
2685 while (retry_count) {
2686 if (signal_pending(current)) {
2687 ret = -EINTR;
2688 break;
2689 }
2690
2691
2692
2693
2694
2695 mutex_lock(&set_limit_mutex);
2696 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2697 if (memswlimit < val) {
2698 ret = -EINVAL;
2699 mutex_unlock(&set_limit_mutex);
2700 break;
2701 }
2702
2703 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2704 if (memlimit < val)
2705 enlarge = 1;
2706
2707 ret = res_counter_set_limit(&memcg->res, val);
2708 if (!ret) {
2709 if (memswlimit == val)
2710 memcg->memsw_is_minimum = true;
2711 else
2712 memcg->memsw_is_minimum = false;
2713 }
2714 mutex_unlock(&set_limit_mutex);
2715
2716 if (!ret)
2717 break;
2718
2719 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2720 MEM_CGROUP_RECLAIM_SHRINK);
2721 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2722
2723 if (curusage >= oldusage)
2724 retry_count--;
2725 else
2726 oldusage = curusage;
2727 }
2728 if (!ret && enlarge)
2729 memcg_oom_recover(memcg);
2730
2731 return ret;
2732}
2733
2734static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2735 unsigned long long val)
2736{
2737 int retry_count;
2738 u64 memlimit, memswlimit, oldusage, curusage;
2739 int children = mem_cgroup_count_children(memcg);
2740 int ret = -EBUSY;
2741 int enlarge = 0;
2742
2743
2744 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2745 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2746 while (retry_count) {
2747 if (signal_pending(current)) {
2748 ret = -EINTR;
2749 break;
2750 }
2751
		/*
		 * Open coded for the same reason as in
		 * mem_cgroup_resize_limit(): keep res.limit <= memsw.limit.
		 */
2756 mutex_lock(&set_limit_mutex);
2757 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2758 if (memlimit > val) {
2759 ret = -EINVAL;
2760 mutex_unlock(&set_limit_mutex);
2761 break;
2762 }
2763 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2764 if (memswlimit < val)
2765 enlarge = 1;
2766 ret = res_counter_set_limit(&memcg->memsw, val);
2767 if (!ret) {
2768 if (memlimit == val)
2769 memcg->memsw_is_minimum = true;
2770 else
2771 memcg->memsw_is_minimum = false;
2772 }
2773 mutex_unlock(&set_limit_mutex);
2774
2775 if (!ret)
2776 break;
2777
2778 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2779 MEM_CGROUP_RECLAIM_NOSWAP |
2780 MEM_CGROUP_RECLAIM_SHRINK);
2781 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2782
2783 if (curusage >= oldusage)
2784 retry_count--;
2785 else
2786 oldusage = curusage;
2787 }
2788 if (!ret && enlarge)
2789 memcg_oom_recover(memcg);
2790 return ret;
2791}
2792
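/*
 * Soft limit reclaim for one zone (order-0 only): repeatedly pick the memcg
 * that exceeds its soft limit the most on this zone, reclaim from it, and
 * put it back on the soft limit tree with its updated excess.
 */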
2793unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2794 gfp_t gfp_mask, int nid,
2795 int zid)
2796{
2797 unsigned long nr_reclaimed = 0;
2798 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2799 unsigned long reclaimed;
2800 int loop = 0;
2801 struct mem_cgroup_tree_per_zone *mctz;
2802 unsigned long long excess;
2803
2804 if (order > 0)
2805 return 0;
2806
2807 mctz = soft_limit_tree_node_zone(nid, zid);
	/*
	 * This loop can run for a while, especially if memory cgroups
	 * continuously exceed their soft limit and keep the system under
	 * pressure.
	 */
2813 do {
2814 if (next_mz)
2815 mz = next_mz;
2816 else
2817 mz = mem_cgroup_largest_soft_limit_node(mctz);
2818 if (!mz)
2819 break;
2820
2821 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2822 gfp_mask,
2823 MEM_CGROUP_RECLAIM_SOFT);
2824 nr_reclaimed += reclaimed;
2825 spin_lock(&mctz->lock);
		/*
		 * If we failed to reclaim anything from this memory cgroup,
		 * it is time to move on to the next one.
		 */
2831 next_mz = NULL;
2832 if (!reclaimed) {
2833 do {
			/*
			 * Loop until we find yet another one.
			 *
			 * By the time we retake the soft_limit lock, someone
			 * may have added this group back on the rb-tree, so
			 * iterate to make sure we get a different memcg;
			 * __mem_cgroup_largest_soft_limit_node() returns NULL
			 * if no other cgroup is present on the tree.
			 */
2845 next_mz =
2846 __mem_cgroup_largest_soft_limit_node(mctz);
2847 if (next_mz == mz) {
2848 css_put(&next_mz->mem->css);
2849 next_mz = NULL;
2850 } else
2851 break;
2852 } while (1);
2853 }
2854 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2855 excess = res_counter_soft_limit_excess(&mz->mem->res);
		/*
		 * Put the node back on the tree even if reclaim returned 0:
		 * reclaim may have returned 0 only because priority exposed a
		 * smaller subset of memory to scan, not because the group
		 * stopped exceeding its soft limit.
		 * (If excess == 0, no tree ops are performed.)
		 */
2865 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2866 spin_unlock(&mctz->lock);
2867 css_put(&mz->mem->css);
2868 loop++;
		/*
		 * Could not reclaim anything and there are no more memory
		 * cgroups to try, or we seem to be looping without reclaiming
		 * anything: give up.
		 */
2874 if (!nr_reclaimed &&
2875 (next_mz == NULL ||
2876 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2877 break;
2878 } while (!nr_reclaimed);
2879 if (next_mz)
2880 css_put(&next_mz->mem->css);
2881 return nr_reclaimed;
2882}
2883
/*
 * Traverse the page_cgroups on the given LRU list and move each charge to the
 * parent. This does not reclaim the pages themselves, it only detaches their
 * accounting from this memcg.
 */
2888static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2889 int node, int zid, enum lru_list lru)
2890{
2891 struct zone *zone;
2892 struct mem_cgroup_per_zone *mz;
2893 struct page_cgroup *pc, *busy;
2894 unsigned long flags, loop;
2895 struct list_head *list;
2896 int ret = 0;
2897
2898 zone = &NODE_DATA(node)->node_zones[zid];
2899 mz = mem_cgroup_zoneinfo(mem, node, zid);
2900 list = &mz->lists[lru];
2901
2902 loop = MEM_CGROUP_ZSTAT(mz, lru);
2903
2904 loop += 256;
2905 busy = NULL;
2906 while (loop--) {
2907 ret = 0;
2908 spin_lock_irqsave(&zone->lru_lock, flags);
2909 if (list_empty(list)) {
2910 spin_unlock_irqrestore(&zone->lru_lock, flags);
2911 break;
2912 }
2913 pc = list_entry(list->prev, struct page_cgroup, lru);
2914 if (busy == pc) {
2915 list_move(&pc->lru, list);
2916 busy = NULL;
2917 spin_unlock_irqrestore(&zone->lru_lock, flags);
2918 continue;
2919 }
2920 spin_unlock_irqrestore(&zone->lru_lock, flags);
2921
2922 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2923 if (ret == -ENOMEM)
2924 break;
2925
2926 if (ret == -EBUSY || ret == -EINVAL) {
2927
2928 busy = pc;
2929 cond_resched();
2930 } else
2931 busy = NULL;
2932 }
2933
2934 if (!ret && !list_empty(list))
2935 return -EBUSY;
2936 return ret;
2937}
2938
/*
 * Make this mem_cgroup's charge drop to zero if no task is attached, which
 * allows the cgroup directory to be removed.
 */
2943static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2944{
2945 int ret;
2946 int node, zid, shrink;
2947 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2948 struct cgroup *cgrp = mem->css.cgroup;
2949
2950 css_get(&mem->css);
2951
2952 shrink = 0;
2953
2954 if (free_all)
2955 goto try_to_free;
2956move_account:
2957 do {
2958 ret = -EBUSY;
2959 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2960 goto out;
2961 ret = -EINTR;
2962 if (signal_pending(current))
2963 goto out;
2964
2965 lru_add_drain_all();
2966 drain_all_stock_sync();
2967 ret = 0;
2968 for_each_node_state(node, N_HIGH_MEMORY) {
2969 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2970 enum lru_list l;
2971 for_each_lru(l) {
2972 ret = mem_cgroup_force_empty_list(mem,
2973 node, zid, l);
2974 if (ret)
2975 break;
2976 }
2977 }
2978 if (ret)
2979 break;
2980 }
2981 memcg_oom_recover(mem);
2982
2983 if (ret == -ENOMEM)
2984 goto try_to_free;
2985 cond_resched();
2986
2987 } while (mem->res.usage > 0 || ret);
2988out:
2989 css_put(&mem->css);
2990 return ret;
2991
2992try_to_free:
2993
2994 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2995 ret = -EBUSY;
2996 goto out;
2997 }
2998
2999 lru_add_drain_all();
3000
3001 shrink = 1;
3002 while (nr_retries && mem->res.usage > 0) {
3003 int progress;
3004
3005 if (signal_pending(current)) {
3006 ret = -EINTR;
3007 goto out;
3008 }
3009 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3010 false, get_swappiness(mem));
3011 if (!progress) {
3012 nr_retries--;
3013
3014 congestion_wait(BLK_RW_ASYNC, HZ/10);
3015 }
3016
3017 }
3018 lru_add_drain();
3019
3020 goto move_account;
3021}
3022
3023int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3024{
3025 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3026}
3027
3028
3029static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3030{
3031 return mem_cgroup_from_cont(cont)->use_hierarchy;
3032}
3033
3034static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3035 u64 val)
3036{
3037 int retval = 0;
3038 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3039 struct cgroup *parent = cont->parent;
3040 struct mem_cgroup *parent_mem = NULL;
3041
3042 if (parent)
3043 parent_mem = mem_cgroup_from_cont(parent);
3044
3045 cgroup_lock();
	/*
	 * If the parent's use_hierarchy is set, we can't make any
	 * modifications in the child subtrees; if it is unset, the value can
	 * change, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, so the value may be set if
	 * there are no children.
	 */
3054 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3055 (val == 1 || val == 0)) {
3056 if (list_empty(&cont->children))
3057 mem->use_hierarchy = val;
3058 else
3059 retval = -EBUSY;
3060 } else
3061 retval = -EINVAL;
3062 cgroup_unlock();
3063
3064 return retval;
3065}
3066
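/*
 * Helper for mem_cgroup_get_recursive_idx_stat(): accumulate one statistics
 * counter over a whole memcg hierarchy via mem_cgroup_walk_tree().
 */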
3067struct mem_cgroup_idx_data {
3068 s64 val;
3069 enum mem_cgroup_stat_index idx;
3070};
3071
3072static int
3073mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
3074{
3075 struct mem_cgroup_idx_data *d = data;
3076 d->val += mem_cgroup_read_stat(mem, d->idx);
3077 return 0;
3078}
3079
3080static void
3081mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3082 enum mem_cgroup_stat_index idx, s64 *val)
3083{
3084 struct mem_cgroup_idx_data d;
3085 d.idx = idx;
3086 d.val = 0;
3087 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3088 *val = d.val;
3089}
3090
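/*
 * Usage of a non-root memcg is read straight from its res_counter. For the
 * root memcg the value is reconstructed from the hierarchy-wide CACHE, RSS
 * (and, with swap accounting, SWAPOUT) statistics instead.
 */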
3091static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3092{
3093 u64 idx_val, val;
3094
3095 if (!mem_cgroup_is_root(mem)) {
3096 if (!swap)
3097 return res_counter_read_u64(&mem->res, RES_USAGE);
3098 else
3099 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3100 }
3101
3102 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
3103 val = idx_val;
3104 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3105 val += idx_val;
3106
3107 if (swap) {
3108 mem_cgroup_get_recursive_idx_stat(mem,
3109 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3110 val += idx_val;
3111 }
3112
3113 return val << PAGE_SHIFT;
3114}
3115
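/*
 * Read handler shared by the memory.* and memory.memsw.* counter files:
 * RES_USAGE goes through mem_cgroup_usage(), everything else is read from
 * the res_counter directly.
 */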
3116static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3117{
3118 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3119 u64 val;
3120 int type, name;
3121
3122 type = MEMFILE_TYPE(cft->private);
3123 name = MEMFILE_ATTR(cft->private);
3124 switch (type) {
3125 case _MEM:
3126 if (name == RES_USAGE)
3127 val = mem_cgroup_usage(mem, false);
3128 else
3129 val = res_counter_read_u64(&mem->res, name);
3130 break;
3131 case _MEMSWAP:
3132 if (name == RES_USAGE)
3133 val = mem_cgroup_usage(mem, true);
3134 else
3135 val = res_counter_read_u64(&mem->memsw, name);
3136 break;
3137 default:
3138 BUG();
3139 break;
3140 }
3141 return val;
3142}
3143
/*
 * Write handler for limit_in_bytes, memsw.limit_in_bytes and
 * soft_limit_in_bytes.
 */
3147static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3148 const char *buffer)
3149{
3150 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3151 int type, name;
3152 unsigned long long val;
3153 int ret;
3154
3155 type = MEMFILE_TYPE(cft->private);
3156 name = MEMFILE_ATTR(cft->private);
3157 switch (name) {
3158 case RES_LIMIT:
3159 if (mem_cgroup_is_root(memcg)) {
3160 ret = -EINVAL;
3161 break;
3162 }
3163
3164 ret = res_counter_memparse_write_strategy(buffer, &val);
3165 if (ret)
3166 break;
3167 if (type == _MEM)
3168 ret = mem_cgroup_resize_limit(memcg, val);
3169 else
3170 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3171 break;
3172 case RES_SOFT_LIMIT:
3173 ret = res_counter_memparse_write_strategy(buffer, &val);
3174 if (ret)
3175 break;
		/*
		 * For memsw, soft limits are hard to implement in terms of
		 * semantics; for now, soft limits are only supported for
		 * memory controlled without swap.
		 */
3181 if (type == _MEM)
3182 ret = res_counter_set_soft_limit(&memcg->res, val);
3183 else
3184 ret = -EINVAL;
3185 break;
3186 default:
3187 ret = -EINVAL;
3188 break;
3189 }
3190 return ret;
3191}
3192
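/*
 * Walk up the hierarchy and report the smallest memory and memsw limits
 * that effectively apply to @memcg.
 */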
3193static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3194 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3195{
3196 struct cgroup *cgroup;
3197 unsigned long long min_limit, min_memsw_limit, tmp;
3198
3199 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3200 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3201 cgroup = memcg->css.cgroup;
3202 if (!memcg->use_hierarchy)
3203 goto out;
3204
3205 while (cgroup->parent) {
3206 cgroup = cgroup->parent;
3207 memcg = mem_cgroup_from_cont(cgroup);
3208 if (!memcg->use_hierarchy)
3209 break;
3210 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3211 min_limit = min(min_limit, tmp);
3212 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3213 min_memsw_limit = min(min_memsw_limit, tmp);
3214 }
3215out:
3216 *mem_limit = min_limit;
3217 *memsw_limit = min_memsw_limit;
3218 return;
3219}
3220
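/* trigger handler: reset max_usage or failcnt for the mem or memsw counter */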
3221static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3222{
3223 struct mem_cgroup *mem;
3224 int type, name;
3225
3226 mem = mem_cgroup_from_cont(cont);
3227 type = MEMFILE_TYPE(event);
3228 name = MEMFILE_ATTR(event);
3229 switch (name) {
3230 case RES_MAX_USAGE:
3231 if (type == _MEM)
3232 res_counter_reset_max(&mem->res);
3233 else
3234 res_counter_reset_max(&mem->memsw);
3235 break;
3236 case RES_FAILCNT:
3237 if (type == _MEM)
3238 res_counter_reset_failcnt(&mem->res);
3239 else
3240 res_counter_reset_failcnt(&mem->memsw);
3241 break;
3242 }
3243
3244 return 0;
3245}
3246
3247static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3248 struct cftype *cft)
3249{
3250 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3251}
3252
3253#ifdef CONFIG_MMU
3254static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3255 struct cftype *cft, u64 val)
3256{
3257 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3258
3259 if (val >= (1 << NR_MOVE_TYPE))
3260 return -EINVAL;
3261
	/*
	 * This value is checked several times in both can_attach() and
	 * attach(), so the cgroup lock is needed to keep it from changing in
	 * between.
	 */
3266 cgroup_lock();
3267 mem->move_charge_at_immigrate = val;
3268 cgroup_unlock();
3269
3270 return 0;
3271}
3272#else
3273static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3274 struct cftype *cft, u64 val)
3275{
3276 return -ENOSYS;
3277}
3278#endif
3279
3280
3281
3282enum {
3283 MCS_CACHE,
3284 MCS_RSS,
3285 MCS_FILE_MAPPED,
3286 MCS_PGPGIN,
3287 MCS_PGPGOUT,
3288 MCS_SWAP,
3289 MCS_INACTIVE_ANON,
3290 MCS_ACTIVE_ANON,
3291 MCS_INACTIVE_FILE,
3292 MCS_ACTIVE_FILE,
3293 MCS_UNEVICTABLE,
3294 NR_MCS_STAT,
3295};
3296
3297struct mcs_total_stat {
3298 s64 stat[NR_MCS_STAT];
3299};
3300
3301struct {
3302 char *local_name;
3303 char *total_name;
3304} memcg_stat_strings[NR_MCS_STAT] = {
3305 {"cache", "total_cache"},
3306 {"rss", "total_rss"},
3307 {"mapped_file", "total_mapped_file"},
3308 {"pgpgin", "total_pgpgin"},
3309 {"pgpgout", "total_pgpgout"},
3310 {"swap", "total_swap"},
3311 {"inactive_anon", "total_inactive_anon"},
3312 {"active_anon", "total_active_anon"},
3313 {"inactive_file", "total_inactive_file"},
3314 {"active_file", "total_active_file"},
3315 {"unevictable", "total_unevictable"}
3316};
3317
3318
3319static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3320{
3321 struct mcs_total_stat *s = data;
3322 s64 val;
3323
3324
3325 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3326 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3327 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3328 s->stat[MCS_RSS] += val * PAGE_SIZE;
3329 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3330 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3331 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3332 s->stat[MCS_PGPGIN] += val;
3333 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3334 s->stat[MCS_PGPGOUT] += val;
3335 if (do_swap_account) {
3336 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3337 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3338 }
3339
3340
3341 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3342 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3343 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3344 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3345 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3346 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3347 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3348 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3349 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3350 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3351 return 0;
3352}
3353
3354static void
3355mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3356{
3357 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3358}
3359
3360static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3361 struct cgroup_map_cb *cb)
3362{
3363 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3364 struct mcs_total_stat mystat;
3365 int i;
3366
3367 memset(&mystat, 0, sizeof(mystat));
3368 mem_cgroup_get_local_stat(mem_cont, &mystat);
3369
3370 for (i = 0; i < NR_MCS_STAT; i++) {
3371 if (i == MCS_SWAP && !do_swap_account)
3372 continue;
3373 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3374 }
3375
3376
3377 {
3378 unsigned long long limit, memsw_limit;
3379 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3380 cb->fill(cb, "hierarchical_memory_limit", limit);
3381 if (do_swap_account)
3382 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3383 }
3384
3385 memset(&mystat, 0, sizeof(mystat));
3386 mem_cgroup_get_total_stat(mem_cont, &mystat);
3387 for (i = 0; i < NR_MCS_STAT; i++) {
3388 if (i == MCS_SWAP && !do_swap_account)
3389 continue;
3390 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3391 }
3392
3393#ifdef CONFIG_DEBUG_VM
3394 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3395
3396 {
3397 int nid, zid;
3398 struct mem_cgroup_per_zone *mz;
3399 unsigned long recent_rotated[2] = {0, 0};
3400 unsigned long recent_scanned[2] = {0, 0};
3401
3402 for_each_online_node(nid)
3403 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3404 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3405
3406 recent_rotated[0] +=
3407 mz->reclaim_stat.recent_rotated[0];
3408 recent_rotated[1] +=
3409 mz->reclaim_stat.recent_rotated[1];
3410 recent_scanned[0] +=
3411 mz->reclaim_stat.recent_scanned[0];
3412 recent_scanned[1] +=
3413 mz->reclaim_stat.recent_scanned[1];
3414 }
3415 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3416 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3417 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3418 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3419 }
3420#endif
3421
3422 return 0;
3423}
3424
3425static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3426{
3427 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3428
3429 return get_swappiness(memcg);
3430}
3431
3432static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3433 u64 val)
3434{
3435 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3436 struct mem_cgroup *parent;
3437
3438 if (val > 100)
3439 return -EINVAL;
3440
3441 if (cgrp->parent == NULL)
3442 return -EINVAL;
3443
3444 parent = mem_cgroup_from_cont(cgrp->parent);
3445
3446 cgroup_lock();
3447
3448
3449 if ((parent->use_hierarchy) ||
3450 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3451 cgroup_unlock();
3452 return -EINVAL;
3453 }
3454
3455 spin_lock(&memcg->reclaim_param_lock);
3456 memcg->swappiness = val;
3457 spin_unlock(&memcg->reclaim_param_lock);
3458
3459 cgroup_unlock();
3460
3461 return 0;
3462}
3463
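/*
 * Compare current usage against the sorted threshold array and signal the
 * eventfd of every threshold crossed since the previous check, in either
 * direction; current_threshold is left pointing just below the usage.
 */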
3464static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3465{
3466 struct mem_cgroup_threshold_ary *t;
3467 u64 usage;
3468 int i;
3469
3470 rcu_read_lock();
3471 if (!swap)
3472 t = rcu_dereference(memcg->thresholds.primary);
3473 else
3474 t = rcu_dereference(memcg->memsw_thresholds.primary);
3475
3476 if (!t)
3477 goto unlock;
3478
3479 usage = mem_cgroup_usage(memcg, swap);
3480
	/*
	 * current_threshold points to the threshold just below usage. If
	 * that is no longer true, a threshold was crossed after the last
	 * call of __mem_cgroup_threshold().
	 */
3486 i = t->current_threshold;
3487
	/*
	 * Iterate backward over the array of thresholds starting from
	 * current_threshold and signal every threshold that was crossed.
	 * If no threshold below usage was crossed, only one element of the
	 * array is read here.
	 */
3494 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3495 eventfd_signal(t->entries[i].eventfd, 1);
3496
3497
3498 i++;
3499
	/*
	 * Iterate forward over the array of thresholds starting from
	 * current_threshold+1 and signal every threshold that was crossed.
	 * If no threshold above usage was crossed, only one element of the
	 * array is read here.
	 */
3506 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3507 eventfd_signal(t->entries[i].eventfd, 1);
3508
3509
3510 t->current_threshold = i - 1;
3511unlock:
3512 rcu_read_unlock();
3513}
3514
3515static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3516{
3517 __mem_cgroup_threshold(memcg, false);
3518 if (do_swap_account)
3519 __mem_cgroup_threshold(memcg, true);
3520}
3521
3522static int compare_thresholds(const void *a, const void *b)
3523{
3524 const struct mem_cgroup_threshold *_a = a;
3525 const struct mem_cgroup_threshold *_b = b;

	/* thresholds are u64; a plain subtraction can truncate and misorder */
	if (_a->threshold > _b->threshold)
		return 1;
	if (_a->threshold < _b->threshold)
		return -1;
	return 0;
3528}
3529
3530static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3531{
3532 struct mem_cgroup_eventfd_list *ev;
3533
3534 list_for_each_entry(ev, &mem->oom_notify, list)
3535 eventfd_signal(ev->eventfd, 1);
3536 return 0;
3537}
3538
3539static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3540{
3541 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3542}
3543
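/*
 * Register an eventfd to be signalled when usage (or usage+swap) crosses the
 * given threshold: build a new, larger sorted array and publish it via RCU.
 */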
3544static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3545 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3546{
3547 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3548 struct mem_cgroup_thresholds *thresholds;
3549 struct mem_cgroup_threshold_ary *new;
3550 int type = MEMFILE_TYPE(cft->private);
3551 u64 threshold, usage;
3552 int i, size, ret;
3553
3554 ret = res_counter_memparse_write_strategy(args, &threshold);
3555 if (ret)
3556 return ret;
3557
3558 mutex_lock(&memcg->thresholds_lock);
3559
3560 if (type == _MEM)
3561 thresholds = &memcg->thresholds;
3562 else if (type == _MEMSWAP)
3563 thresholds = &memcg->memsw_thresholds;
3564 else
3565 BUG();
3566
3567 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3568
3569
3570 if (thresholds->primary)
3571 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3572
3573 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3574
3575
3576 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3577 GFP_KERNEL);
3578 if (!new) {
3579 ret = -ENOMEM;
3580 goto unlock;
3581 }
3582 new->size = size;
3583
3584
3585 if (thresholds->primary) {
3586 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3587 sizeof(struct mem_cgroup_threshold));
3588 }
3589
3590
3591 new->entries[size - 1].eventfd = eventfd;
3592 new->entries[size - 1].threshold = threshold;
3593
3594
3595 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3596 compare_thresholds, NULL);
3597
3598
3599 new->current_threshold = -1;
3600 for (i = 0; i < size; i++) {
3601 if (new->entries[i].threshold < usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
3607 ++new->current_threshold;
3608 }
3609 }
3610
3611
3612 kfree(thresholds->spare);
3613 thresholds->spare = thresholds->primary;
3614
3615 rcu_assign_pointer(thresholds->primary, new);
3616
3617
3618 synchronize_rcu();
3619
3620unlock:
3621 mutex_unlock(&memcg->thresholds_lock);
3622
3623 return ret;
3624}
3625
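/*
 * Remove every threshold tied to @eventfd by rebuilding the remaining
 * entries into the spare buffer and publishing it via RCU.
 */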
3626static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3627 struct cftype *cft, struct eventfd_ctx *eventfd)
3628{
3629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3630 struct mem_cgroup_thresholds *thresholds;
3631 struct mem_cgroup_threshold_ary *new;
3632 int type = MEMFILE_TYPE(cft->private);
3633 u64 usage;
3634 int i, j, size;
3635
3636 mutex_lock(&memcg->thresholds_lock);
3637 if (type == _MEM)
3638 thresholds = &memcg->thresholds;
3639 else if (type == _MEMSWAP)
3640 thresholds = &memcg->memsw_thresholds;
3641 else
3642 BUG();
3643
	/*
	 * Something went wrong if we are trying to unregister a threshold
	 * while no thresholds are registered.
	 */
	BUG_ON(!thresholds->primary);
3649
3650 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3651
3652
3653 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3654
3655
3656 size = 0;
3657 for (i = 0; i < thresholds->primary->size; i++) {
3658 if (thresholds->primary->entries[i].eventfd != eventfd)
3659 size++;
3660 }
3661
3662 new = thresholds->spare;
3663
3664
3665 if (!size) {
3666 kfree(new);
3667 new = NULL;
3668 goto swap_buffers;
3669 }
3670
3671 new->size = size;
3672
3673
3674 new->current_threshold = -1;
3675 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3676 if (thresholds->primary->entries[i].eventfd == eventfd)
3677 continue;
3678
3679 new->entries[j] = thresholds->primary->entries[i];
3680 if (new->entries[j].threshold < usage) {
3681
3682
3683
3684
3685
3686 ++new->current_threshold;
3687 }
3688 j++;
3689 }
3690
3691swap_buffers:
3692
3693 thresholds->spare = thresholds->primary;
3694 rcu_assign_pointer(thresholds->primary, new);
3695
3696
3697 synchronize_rcu();
3698
3699 mutex_unlock(&memcg->thresholds_lock);
3700}
3701
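/*
 * Register an eventfd on memory.oom_control; it is signalled whenever the
 * group goes under OOM, and immediately if it already is.
 */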
3702static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3703 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3704{
3705 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3706 struct mem_cgroup_eventfd_list *event;
3707 int type = MEMFILE_TYPE(cft->private);
3708
3709 BUG_ON(type != _OOM_TYPE);
3710 event = kmalloc(sizeof(*event), GFP_KERNEL);
3711 if (!event)
3712 return -ENOMEM;
3713
3714 mutex_lock(&memcg_oom_mutex);
3715
3716 event->eventfd = eventfd;
3717 list_add(&event->list, &memcg->oom_notify);
3718
3719
3720 if (atomic_read(&memcg->oom_lock))
3721 eventfd_signal(eventfd, 1);
3722 mutex_unlock(&memcg_oom_mutex);
3723
3724 return 0;
3725}
3726
3727static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3728 struct cftype *cft, struct eventfd_ctx *eventfd)
3729{
3730 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3731 struct mem_cgroup_eventfd_list *ev, *tmp;
3732 int type = MEMFILE_TYPE(cft->private);
3733
3734 BUG_ON(type != _OOM_TYPE);
3735
3736 mutex_lock(&memcg_oom_mutex);
3737
3738 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3739 if (ev->eventfd == eventfd) {
3740 list_del(&ev->list);
3741 kfree(ev);
3742 }
3743 }
3744
3745 mutex_unlock(&memcg_oom_mutex);
3746}
3747
3748static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3749 struct cftype *cft, struct cgroup_map_cb *cb)
3750{
3751 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3752
3753 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
3754
3755 if (atomic_read(&mem->oom_lock))
3756 cb->fill(cb, "under_oom", 1);
3757 else
3758 cb->fill(cb, "under_oom", 0);
3759 return 0;
3760}
3761
3762
3763
3764static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3765 struct cftype *cft, u64 val)
3766{
3767 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3768 struct mem_cgroup *parent;
3769
	/* cannot be set on the root cgroup; only 0 and 1 are allowed */
3771 if (!cgrp->parent || !((val == 0) || (val == 1)))
3772 return -EINVAL;
3773
3774 parent = mem_cgroup_from_cont(cgrp->parent);
3775
3776 cgroup_lock();
3777
3778 if ((parent->use_hierarchy) ||
3779 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
3780 cgroup_unlock();
3781 return -EINVAL;
3782 }
3783 mem->oom_kill_disable = val;
3784 if (!val)
3785 memcg_oom_recover(mem);
3786 cgroup_unlock();
3787 return 0;
3788}
3789
3790static struct cftype mem_cgroup_files[] = {
3791 {
3792 .name = "usage_in_bytes",
3793 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3794 .read_u64 = mem_cgroup_read,
3795 .register_event = mem_cgroup_usage_register_event,
3796 .unregister_event = mem_cgroup_usage_unregister_event,
3797 },
3798 {
3799 .name = "max_usage_in_bytes",
3800 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3801 .trigger = mem_cgroup_reset,
3802 .read_u64 = mem_cgroup_read,
3803 },
3804 {
3805 .name = "limit_in_bytes",
3806 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3807 .write_string = mem_cgroup_write,
3808 .read_u64 = mem_cgroup_read,
3809 },
3810 {
3811 .name = "soft_limit_in_bytes",
3812 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3813 .write_string = mem_cgroup_write,
3814 .read_u64 = mem_cgroup_read,
3815 },
3816 {
3817 .name = "failcnt",
3818 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3819 .trigger = mem_cgroup_reset,
3820 .read_u64 = mem_cgroup_read,
3821 },
3822 {
3823 .name = "stat",
3824 .read_map = mem_control_stat_show,
3825 },
3826 {
3827 .name = "force_empty",
3828 .trigger = mem_cgroup_force_empty_write,
3829 },
3830 {
3831 .name = "use_hierarchy",
3832 .write_u64 = mem_cgroup_hierarchy_write,
3833 .read_u64 = mem_cgroup_hierarchy_read,
3834 },
3835 {
3836 .name = "swappiness",
3837 .read_u64 = mem_cgroup_swappiness_read,
3838 .write_u64 = mem_cgroup_swappiness_write,
3839 },
3840 {
3841 .name = "move_charge_at_immigrate",
3842 .read_u64 = mem_cgroup_move_charge_read,
3843 .write_u64 = mem_cgroup_move_charge_write,
3844 },
3845 {
3846 .name = "oom_control",
3847 .read_map = mem_cgroup_oom_control_read,
3848 .write_u64 = mem_cgroup_oom_control_write,
3849 .register_event = mem_cgroup_oom_register_event,
3850 .unregister_event = mem_cgroup_oom_unregister_event,
3851 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3852 },
3853};
3854
3855#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3856static struct cftype memsw_cgroup_files[] = {
3857 {
3858 .name = "memsw.usage_in_bytes",
3859 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3860 .read_u64 = mem_cgroup_read,
3861 .register_event = mem_cgroup_usage_register_event,
3862 .unregister_event = mem_cgroup_usage_unregister_event,
3863 },
3864 {
3865 .name = "memsw.max_usage_in_bytes",
3866 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3867 .trigger = mem_cgroup_reset,
3868 .read_u64 = mem_cgroup_read,
3869 },
3870 {
3871 .name = "memsw.limit_in_bytes",
3872 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3873 .write_string = mem_cgroup_write,
3874 .read_u64 = mem_cgroup_read,
3875 },
3876 {
3877 .name = "memsw.failcnt",
3878 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3879 .trigger = mem_cgroup_reset,
3880 .read_u64 = mem_cgroup_read,
3881 },
3882};
3883
3884static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3885{
3886 if (!do_swap_account)
3887 return 0;
3888 return cgroup_add_files(cont, ss, memsw_cgroup_files,
3889 ARRAY_SIZE(memsw_cgroup_files));
3890};
3891#else
3892static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3893{
3894 return 0;
3895}
3896#endif
3897
3898static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3899{
3900 struct mem_cgroup_per_node *pn;
3901 struct mem_cgroup_per_zone *mz;
3902 enum lru_list l;
3903 int zone, tmp = node;
3904
	/*
	 * This routine is called against possible nodes, but it's a BUG to
	 * call kmalloc() against an offline node, so fall back to any node
	 * in that case.
	 *
	 * TODO: this can waste memory for nodes that will never be onlined;
	 * a memory-hotplug callback would be better.
	 */
3912 if (!node_state(node, N_NORMAL_MEMORY))
3913 tmp = -1;
3914 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3915 if (!pn)
3916 return 1;
3917
3918 mem->info.nodeinfo[node] = pn;
3919 memset(pn, 0, sizeof(*pn));
3920
3921 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3922 mz = &pn->zoneinfo[zone];
3923 for_each_lru(l)
3924 INIT_LIST_HEAD(&mz->lists[l]);
3925 mz->usage_in_excess = 0;
3926 mz->on_tree = false;
3927 mz->mem = mem;
3928 }
3929 return 0;
3930}
3931
3932static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3933{
3934 kfree(mem->info.nodeinfo[node]);
3935}
3936
3937static struct mem_cgroup *mem_cgroup_alloc(void)
3938{
3939 struct mem_cgroup *mem;
3940 int size = sizeof(struct mem_cgroup);
3941
3942
3943 if (size < PAGE_SIZE)
3944 mem = kmalloc(size, GFP_KERNEL);
3945 else
3946 mem = vmalloc(size);
3947
3948 if (!mem)
3949 return NULL;
3950
3951 memset(mem, 0, size);
3952 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3953 if (!mem->stat) {
3954 if (size < PAGE_SIZE)
3955 kfree(mem);
3956 else
3957 vfree(mem);
3958 mem = NULL;
3959 }
3960 return mem;
3961}
3962
/*
 * At mem_cgroup destruction, references from swap_cgroup may remain
 * (scanning everything at force_empty would be too costly).
 *
 * Instead of clearing all references at force_empty, we remember the number
 * of references from swap_cgroup and free the mem_cgroup only when that
 * count drops to zero.
 *
 * Removal of the cgroup directory itself succeeds regardless of any
 * remaining references from swap.
 */
3974static void __mem_cgroup_free(struct mem_cgroup *mem)
3975{
3976 int node;
3977
3978 mem_cgroup_remove_from_trees(mem);
3979 free_css_id(&mem_cgroup_subsys, &mem->css);
3980
3981 for_each_node_state(node, N_POSSIBLE)
3982 free_mem_cgroup_per_zone_info(mem, node);
3983
3984 free_percpu(mem->stat);
3985 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3986 kfree(mem);
3987 else
3988 vfree(mem);
3989}
3990
3991static void mem_cgroup_get(struct mem_cgroup *mem)
3992{
3993 atomic_inc(&mem->refcnt);
3994}
3995
3996static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3997{
3998 if (atomic_sub_and_test(count, &mem->refcnt)) {
3999 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4000 __mem_cgroup_free(mem);
4001 if (parent)
4002 mem_cgroup_put(parent);
4003 }
4004}
4005
4006static void mem_cgroup_put(struct mem_cgroup *mem)
4007{
4008 __mem_cgroup_put(mem, 1);
4009}
4010
4011
4012
4013
4014static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4015{
4016 if (!mem->res.parent)
4017 return NULL;
4018 return mem_cgroup_from_res_counter(mem->res.parent, res);
4019}
4020
4021#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4022static void __init enable_swap_cgroup(void)
4023{
4024 if (!mem_cgroup_disabled() && really_do_swap_account)
4025 do_swap_account = 1;
4026}
4027#else
4028static void __init enable_swap_cgroup(void)
4029{
4030}
4031#endif
4032
4033static int mem_cgroup_soft_limit_tree_init(void)
4034{
4035 struct mem_cgroup_tree_per_node *rtpn;
4036 struct mem_cgroup_tree_per_zone *rtpz;
4037 int tmp, node, zone;
4038
4039 for_each_node_state(node, N_POSSIBLE) {
4040 tmp = node;
4041 if (!node_state(node, N_NORMAL_MEMORY))
4042 tmp = -1;
4043 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4044 if (!rtpn)
4045 return 1;
4046
4047 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4048
4049 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4050 rtpz = &rtpn->rb_tree_per_zone[zone];
4051 rtpz->rb_root = RB_ROOT;
4052 spin_lock_init(&rtpz->lock);
4053 }
4054 }
4055 return 0;
4056}
4057
4058static struct cgroup_subsys_state * __ref
4059mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4060{
4061 struct mem_cgroup *mem, *parent;
4062 long error = -ENOMEM;
4063 int node;
4064
4065 mem = mem_cgroup_alloc();
4066 if (!mem)
4067 return ERR_PTR(error);
4068
4069 for_each_node_state(node, N_POSSIBLE)
4070 if (alloc_mem_cgroup_per_zone_info(mem, node))
4071 goto free_out;
4072
4073
4074 if (cont->parent == NULL) {
4075 int cpu;
4076 enable_swap_cgroup();
4077 parent = NULL;
4078 root_mem_cgroup = mem;
4079 if (mem_cgroup_soft_limit_tree_init())
4080 goto free_out;
4081 for_each_possible_cpu(cpu) {
4082 struct memcg_stock_pcp *stock =
4083 &per_cpu(memcg_stock, cpu);
4084 INIT_WORK(&stock->work, drain_local_stock);
4085 }
4086 hotcpu_notifier(memcg_stock_cpu_callback, 0);
4087 } else {
4088 parent = mem_cgroup_from_cont(cont->parent);
4089 mem->use_hierarchy = parent->use_hierarchy;
4090 mem->oom_kill_disable = parent->oom_kill_disable;
4091 }
4092
4093 if (parent && parent->use_hierarchy) {
4094 res_counter_init(&mem->res, &parent->res);
4095 res_counter_init(&mem->memsw, &parent->memsw);
		/*
		 * We increment the refcount of the parent to ensure that we
		 * can safely access it on res_counter_charge/uncharge.
		 * This refcount will be decremented when this mem_cgroup is
		 * freed (see mem_cgroup_put()).
		 */
4102 mem_cgroup_get(parent);
4103 } else {
4104 res_counter_init(&mem->res, NULL);
4105 res_counter_init(&mem->memsw, NULL);
4106 }
4107 mem->last_scanned_child = 0;
4108 spin_lock_init(&mem->reclaim_param_lock);
4109 INIT_LIST_HEAD(&mem->oom_notify);
4110
4111 if (parent)
4112 mem->swappiness = get_swappiness(parent);
4113 atomic_set(&mem->refcnt, 1);
4114 mem->move_charge_at_immigrate = 0;
4115 mutex_init(&mem->thresholds_lock);
4116 return &mem->css;
4117free_out:
4118 __mem_cgroup_free(mem);
4119 root_mem_cgroup = NULL;
4120 return ERR_PTR(error);
4121}
4122
4123static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4124 struct cgroup *cont)
4125{
4126 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4127
4128 return mem_cgroup_force_empty(mem, false);
4129}
4130
4131static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4132 struct cgroup *cont)
4133{
4134 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4135
4136 mem_cgroup_put(mem);
4137}
4138
4139static int mem_cgroup_populate(struct cgroup_subsys *ss,
4140 struct cgroup *cont)
4141{
4142 int ret;
4143
4144 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4145 ARRAY_SIZE(mem_cgroup_files));
4146
4147 if (!ret)
4148 ret = register_memsw_files(cont, ss);
4149 return ret;
4150}
4151
4152#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
4154#define PRECHARGE_COUNT_AT_ONCE 256
4155static int mem_cgroup_do_precharge(unsigned long count)
4156{
4157 int ret = 0;
4158 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4159 struct mem_cgroup *mem = mc.to;
4160
4161 if (mem_cgroup_is_root(mem)) {
4162 mc.precharge += count;
4163
4164 return ret;
4165 }
4166
4167 if (count > 1) {
4168 struct res_counter *dummy;
		/*
		 * "mem" cannot be under rmdir() here because we have already
		 * checked, under cgroup_mutex, that it is not being removed,
		 * and we still hold that mutex; so css_get() can be postponed
		 * until the batched charge succeeds.
		 */
4175 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4176 goto one_by_one;
4177 if (do_swap_account && res_counter_charge(&mem->memsw,
4178 PAGE_SIZE * count, &dummy)) {
4179 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4180 goto one_by_one;
4181 }
4182 mc.precharge += count;
4183 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4184 WARN_ON_ONCE(count > INT_MAX);
4185 __css_get(&mem->css, (int)count);
4186 return ret;
4187 }
4188one_by_one:
4189
4190 while (count--) {
4191 if (signal_pending(current)) {
4192 ret = -EINTR;
4193 break;
4194 }
4195 if (!batch_count--) {
4196 batch_count = PRECHARGE_COUNT_AT_ONCE;
4197 cond_resched();
4198 }
4199 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
4200 if (ret || !mem)
4201
4202 return -ENOMEM;
4203 mc.precharge++;
4204 }
4205 return ret;
4206}
4207
/**
 * is_target_pte_for_mc - check a pte whether it is valid for move charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (may be NULL)
 *
 * Returns
 *   0 (MC_TARGET_NONE): the pte is not a target for move charge.
 *   1 (MC_TARGET_PAGE): the page corresponding to this pte is a target for
 *     move charge; if @target is not NULL, the page is stored in
 *     target->page with an extra refcount taken (callers must handle it).
 *   2 (MC_TARGET_SWAP): the swap entry corresponding to this pte is a
 *     target for charge migration; if @target is not NULL, the entry is
 *     stored in target->ent.
 *
 * Called with the pte lock held.
 */
4226union mc_target {
4227 struct page *page;
4228 swp_entry_t ent;
4229};
4230
4231enum mc_target_type {
4232 MC_TARGET_NONE,
4233 MC_TARGET_PAGE,
4234 MC_TARGET_SWAP,
4235};
4236
4237static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4238 unsigned long addr, pte_t ptent)
4239{
4240 struct page *page = vm_normal_page(vma, addr, ptent);
4241
4242 if (!page || !page_mapped(page))
4243 return NULL;
4244 if (PageAnon(page)) {
		/* we don't move shared anon pages */
4246 if (!move_anon() || page_mapcount(page) > 2)
4247 return NULL;
4248 } else if (!move_file())
		/* mapcount is ignored for file pages */
4250 return NULL;
4251 if (!get_page_unless_zero(page))
4252 return NULL;
4253
4254 return page;
4255}
4256
4257static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4258 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4259{
4260 int usage_count;
4261 struct page *page = NULL;
4262 swp_entry_t ent = pte_to_swp_entry(ptent);
4263
4264 if (!move_anon() || non_swap_entry(ent))
4265 return NULL;
4266 usage_count = mem_cgroup_count_swap_user(ent, &page);
4267 if (usage_count > 1) {
4268 if (page)
4269 put_page(page);
4270 return NULL;
4271 }
4272 if (do_swap_account)
4273 entry->val = ent.val;
4274
4275 return page;
4276}
4277
4278static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4279 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4280{
4281 struct page *page = NULL;
4282 struct inode *inode;
4283 struct address_space *mapping;
4284 pgoff_t pgoff;
4285
4286 if (!vma->vm_file)
4287 return NULL;
4288 if (!move_file())
4289 return NULL;
4290
4291 inode = vma->vm_file->f_path.dentry->d_inode;
4292 mapping = vma->vm_file->f_mapping;
4293 if (pte_none(ptent))
4294 pgoff = linear_page_index(vma, addr);
4295 else
4296 pgoff = pte_to_pgoff(ptent);
4297
4298
4299 if (!mapping_cap_swap_backed(mapping)) {
4300 page = find_get_page(mapping, pgoff);
4301 } else {
4302 swp_entry_t ent;
4303 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4304 if (do_swap_account)
4305 entry->val = ent.val;
4306 }
4307
4308 return page;
4309}
4310
4311static int is_target_pte_for_mc(struct vm_area_struct *vma,
4312 unsigned long addr, pte_t ptent, union mc_target *target)
4313{
4314 struct page *page = NULL;
4315 struct page_cgroup *pc;
4316 int ret = 0;
4317 swp_entry_t ent = { .val = 0 };
4318
4319 if (pte_present(ptent))
4320 page = mc_handle_present_pte(vma, addr, ptent);
4321 else if (is_swap_pte(ptent))
4322 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4323 else if (pte_none(ptent) || pte_file(ptent))
4324 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4325
4326 if (!page && !ent.val)
4327 return 0;
4328 if (page) {
4329 pc = lookup_page_cgroup(page);
		/*
		 * Only a loose check without the page_cgroup lock;
		 * mem_cgroup_move_account() re-checks the pc under the lock.
		 */
4335 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4336 ret = MC_TARGET_PAGE;
4337 if (target)
4338 target->page = page;
4339 }
4340 if (!ret || !target)
4341 put_page(page);
4342 }
	/* there is a swap entry and the page doesn't exist or isn't charged */
4344 if (ent.val && !ret &&
4345 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4346 ret = MC_TARGET_SWAP;
4347 if (target)
4348 target->ent = ent;
4349 }
4350 return ret;
4351}
4352
4353static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4354 unsigned long addr, unsigned long end,
4355 struct mm_walk *walk)
4356{
4357 struct vm_area_struct *vma = walk->private;
4358 pte_t *pte;
4359 spinlock_t *ptl;
4360
4361 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4362 for (; addr != end; pte++, addr += PAGE_SIZE)
4363 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4364 mc.precharge++;
4365 pte_unmap_unlock(pte - 1, ptl);
4366 cond_resched();
4367
4368 return 0;
4369}
4370
4371static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4372{
4373 unsigned long precharge;
4374 struct vm_area_struct *vma;
4375
4376 down_read(&mm->mmap_sem);
4377 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4378 struct mm_walk mem_cgroup_count_precharge_walk = {
4379 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4380 .mm = mm,
4381 .private = vma,
4382 };
4383 if (is_vm_hugetlb_page(vma))
4384 continue;
4385 walk_page_range(vma->vm_start, vma->vm_end,
4386 &mem_cgroup_count_precharge_walk);
4387 }
4388 up_read(&mm->mmap_sem);
4389
4390 precharge = mc.precharge;
4391 mc.precharge = 0;
4392
4393 return precharge;
4394}
4395
4396static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4397{
4398 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4399}
4400
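/*
 * Undo whatever precharges and partially-moved charges are left over when a
 * charge move is cancelled or finished.
 */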
4401static void mem_cgroup_clear_mc(void)
4402{
4403
4404 if (mc.precharge) {
4405 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4406 mc.precharge = 0;
4407 memcg_oom_recover(mc.to);
4408 }
	/*
	 * We didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we have to do it here.
	 */
4413 if (mc.moved_charge) {
4414 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4415 mc.moved_charge = 0;
4416 memcg_oom_recover(mc.from);
4417 }
4418
4419 if (mc.moved_swap) {
4420 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4421
4422 if (!mem_cgroup_is_root(mc.from))
4423 res_counter_uncharge(&mc.from->memsw,
4424 PAGE_SIZE * mc.moved_swap);
4425 __mem_cgroup_put(mc.from, mc.moved_swap);
4426
4427 if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * We charged both to->res and to->memsw, so we should
			 * uncharge to->res here.
			 */
4432 res_counter_uncharge(&mc.to->res,
4433 PAGE_SIZE * mc.moved_swap);
4434 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4435 __css_put(&mc.to->css, mc.moved_swap);
4436 }
4437
4438
4439 mc.moved_swap = 0;
4440 }
4441 mc.from = NULL;
4442 mc.to = NULL;
4443 mc.moving_task = NULL;
4444 wake_up_all(&mc.waitq);
4445}
4446
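/*
 * If the destination memcg has move_charge_at_immigrate set, record the
 * source/destination pair and precharge for every movable page of the task's
 * mm; the actual move happens in the attach callback.
 */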
4447static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4448 struct cgroup *cgroup,
4449 struct task_struct *p,
4450 bool threadgroup)
4451{
4452 int ret = 0;
4453 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4454
4455 if (mem->move_charge_at_immigrate) {
4456 struct mm_struct *mm;
4457 struct mem_cgroup *from = mem_cgroup_from_task(p);
4458
4459 VM_BUG_ON(from == mem);
4460
4461 mm = get_task_mm(p);
4462 if (!mm)
4463 return 0;
4464
4465 if (mm->owner == p) {
4466 VM_BUG_ON(mc.from);
4467 VM_BUG_ON(mc.to);
4468 VM_BUG_ON(mc.precharge);
4469 VM_BUG_ON(mc.moved_charge);
4470 VM_BUG_ON(mc.moved_swap);
4471 VM_BUG_ON(mc.moving_task);
4472 mc.from = from;
4473 mc.to = mem;
4474 mc.precharge = 0;
4475 mc.moved_charge = 0;
4476 mc.moved_swap = 0;
4477 mc.moving_task = current;
4478
4479 ret = mem_cgroup_precharge_mc(mm);
4480 if (ret)
4481 mem_cgroup_clear_mc();
4482 }
4483 mmput(mm);
4484 }
4485 return ret;
4486}
4487
4488static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4489 struct cgroup *cgroup,
4490 struct task_struct *p,
4491 bool threadgroup)
4492{
4493 mem_cgroup_clear_mc();
4494}
4495
4496static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4497 unsigned long addr, unsigned long end,
4498 struct mm_walk *walk)
4499{
4500 int ret = 0;
4501 struct vm_area_struct *vma = walk->private;
4502 pte_t *pte;
4503 spinlock_t *ptl;
4504
4505retry:
4506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4507 for (; addr != end; addr += PAGE_SIZE) {
4508 pte_t ptent = *(pte++);
4509 union mc_target target;
4510 int type;
4511 struct page *page;
4512 struct page_cgroup *pc;
4513 swp_entry_t ent;
4514
4515 if (!mc.precharge)
4516 break;
4517
4518 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4519 switch (type) {
4520 case MC_TARGET_PAGE:
4521 page = target.page;
4522 if (isolate_lru_page(page))
4523 goto put;
4524 pc = lookup_page_cgroup(page);
4525 if (!mem_cgroup_move_account(pc,
4526 mc.from, mc.to, false)) {
4527 mc.precharge--;
4528
4529 mc.moved_charge++;
4530 }
4531 putback_lru_page(page);
4532put:
4533 put_page(page);
4534 break;
4535 case MC_TARGET_SWAP:
4536 ent = target.ent;
4537 if (!mem_cgroup_move_swap_account(ent,
4538 mc.from, mc.to, false)) {
4539 mc.precharge--;
4540
4541 mc.moved_swap++;
4542 }
4543 break;
4544 default:
4545 break;
4546 }
4547 }
4548 pte_unmap_unlock(pte - 1, ptl);
4549 cond_resched();
4550
4551 if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * Try charging one page at a time, but don't do any
		 * additional charges to mc.to if we already failed once in
		 * the attach phase.
		 */
4558 ret = mem_cgroup_do_precharge(1);
4559 if (!ret)
4560 goto retry;
4561 }
4562
4563 return ret;
4564}
4565
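/*
 * Walk all VMAs of @mm and move the charges found by the pte walker above
 * from mc.from to mc.to, consuming the precharges taken in can_attach().
 */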
4566static void mem_cgroup_move_charge(struct mm_struct *mm)
4567{
4568 struct vm_area_struct *vma;
4569
4570 lru_add_drain_all();
4571 down_read(&mm->mmap_sem);
4572 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4573 int ret;
4574 struct mm_walk mem_cgroup_move_charge_walk = {
4575 .pmd_entry = mem_cgroup_move_charge_pte_range,
4576 .mm = mm,
4577 .private = vma,
4578 };
4579 if (is_vm_hugetlb_page(vma))
4580 continue;
4581 ret = walk_page_range(vma->vm_start, vma->vm_end,
4582 &mem_cgroup_move_charge_walk);
4583 if (ret)
			/*
			 * Means we consumed all precharges and failed to do
			 * an additional charge; just abandon the walk here.
			 */
4588 break;
4589 }
4590 up_read(&mm->mmap_sem);
4591}
4592
4593static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4594 struct cgroup *cont,
4595 struct cgroup *old_cont,
4596 struct task_struct *p,
4597 bool threadgroup)
4598{
4599 struct mm_struct *mm;
4600
4601 if (!mc.to)
4602
4603 return;
4604
4605 mm = get_task_mm(p);
4606 if (mm) {
4607 mem_cgroup_move_charge(mm);
4608 mmput(mm);
4609 }
4610 mem_cgroup_clear_mc();
4611}
4612#else
4613static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4614 struct cgroup *cgroup,
4615 struct task_struct *p,
4616 bool threadgroup)
4617{
4618 return 0;
4619}
4620static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4621 struct cgroup *cgroup,
4622 struct task_struct *p,
4623 bool threadgroup)
4624{
4625}
4626static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4627 struct cgroup *cont,
4628 struct cgroup *old_cont,
4629 struct task_struct *p,
4630 bool threadgroup)
4631{
4632}
4633#endif
4634
4635struct cgroup_subsys mem_cgroup_subsys = {
4636 .name = "memory",
4637 .subsys_id = mem_cgroup_subsys_id,
4638 .create = mem_cgroup_create,
4639 .pre_destroy = mem_cgroup_pre_destroy,
4640 .destroy = mem_cgroup_destroy,
4641 .populate = mem_cgroup_populate,
4642 .can_attach = mem_cgroup_can_attach,
4643 .cancel_attach = mem_cgroup_cancel_attach,
4644 .attach = mem_cgroup_move_task,
4645 .early_init = 0,
4646 .use_id = 1,
4647};
4648
4649#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4650
4651static int __init disable_swap_account(char *s)
4652{
4653 really_do_swap_account = 0;
4654 return 1;
4655}
4656__setup("noswapaccount", disable_swap_account);
4657#endif
4658