1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/mutex.h>
37#include <linux/rbtree.h>
38#include <linux/slab.h>
39#include <linux/swap.h>
40#include <linux/swapops.h>
41#include <linux/spinlock.h>
42#include <linux/eventfd.h>
43#include <linux/sort.h>
44#include <linux/fs.h>
45#include <linux/seq_file.h>
46#include <linux/vmalloc.h>
47#include <linux/mm_inline.h>
48#include <linux/page_cgroup.h>
49#include <linux/cpu.h>
50#include <linux/oom.h>
51#include "internal.h"
52
53#include <asm/uaccess.h>
54
55#include <trace/events/vmscan.h>
56
57struct cgroup_subsys mem_cgroup_subsys __read_mostly;
58#define MEM_CGROUP_RECLAIM_RETRIES 5
59struct mem_cgroup *root_mem_cgroup __read_mostly;
60
61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62
63int do_swap_account __read_mostly;
64static int really_do_swap_account __initdata = 1;
65#else
66#define do_swap_account (0)
67#endif
68
69
70
71
72
73
74
75
/*
 * Shift counts passed to __memcg_event_check(): the check succeeds when the
 * low N bits of the per-cpu MEM_CGROUP_EVENTS counter are all zero.
 */
#define THRESHOLDS_EVENTS_THRESH	(7)	/* low 7 bits clear */
#define SOFTLIMIT_EVENTS_THRESH		(10)	/* low 10 bits clear */
78
79
80
81
/* Indices into mem_cgroup_stat_cpu.count[]. */
enum mem_cgroup_stat_index {
	MEM_CGROUP_STAT_CACHE,		/* pages charged with PageCgroupCache set */
	MEM_CGROUP_STAT_RSS,		/* pages charged without PageCgroupCache */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* see mem_cgroup_update_file_mapped() */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* incremented on every charge */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* incremented on every uncharge */
	MEM_CGROUP_STAT_SWAPOUT,	/* adjusted by mem_cgroup_swap_statistics() */
	MEM_CGROUP_EVENTS,		/* charge + uncharge events, polled by __memcg_event_check() */

	MEM_CGROUP_STAT_NSTATS,		/* number of counters; keep last */
};
96
97struct mem_cgroup_stat_cpu {
98 s64 count[MEM_CGROUP_STAT_NSTATS];
99};
100
101
102
103
104struct mem_cgroup_per_zone {
105
106
107
108 struct list_head lists[NR_LRU_LISTS];
109 unsigned long count[NR_LRU_LISTS];
110
111 struct zone_reclaim_stat reclaim_stat;
112 struct rb_node tree_node;
113 unsigned long long usage_in_excess;
114
115 bool on_tree;
116 struct mem_cgroup *mem;
117
118};
119
/* Lvalue accessor for the per-zone counter of LRU list @idx. */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
121
122struct mem_cgroup_per_node {
123 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
124};
125
126struct mem_cgroup_lru_info {
127 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
128};
129
130
131
132
133
134
135struct mem_cgroup_tree_per_zone {
136 struct rb_root rb_root;
137 spinlock_t lock;
138};
139
140struct mem_cgroup_tree_per_node {
141 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
142};
143
144struct mem_cgroup_tree {
145 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
146};
147
148static struct mem_cgroup_tree soft_limit_tree __read_mostly;
149
150struct mem_cgroup_threshold {
151 struct eventfd_ctx *eventfd;
152 u64 threshold;
153};
154
155
156struct mem_cgroup_threshold_ary {
157
158 int current_threshold;
159
160 unsigned int size;
161
162 struct mem_cgroup_threshold entries[0];
163};
164
/* A pair of threshold arrays kept per counter type. */
struct mem_cgroup_thresholds {
	/* Array currently in use. */
	struct mem_cgroup_threshold_ary	*primary;
	/* NOTE(review): presumably a pre-allocated replacement array - confirm against users. */
	struct mem_cgroup_threshold_ary	*spare;
};
175
176
177struct mem_cgroup_eventfd_list {
178 struct list_head list;
179 struct eventfd_ctx *eventfd;
180};
181
/*
 * Forward declarations: memcg_check_events() and mem_cgroup_handle_oom()
 * call these before their definitions appear.
 */
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
184
185
186
187
188
189
190
191
192
193
194
195
196struct mem_cgroup {
197 struct cgroup_subsys_state css;
198
199
200
201 struct res_counter res;
202
203
204
205 struct res_counter memsw;
206
207
208
209
210 struct mem_cgroup_lru_info info;
211
212
213
214
215 spinlock_t reclaim_param_lock;
216
217
218
219
220
221 int last_scanned_child;
222
223
224
225 bool use_hierarchy;
226 atomic_t oom_lock;
227 atomic_t refcnt;
228
229 unsigned int swappiness;
230
231 int oom_kill_disable;
232
233
234 bool memsw_is_minimum;
235
236
237 struct mutex thresholds_lock;
238
239
240 struct mem_cgroup_thresholds thresholds;
241
242
243 struct mem_cgroup_thresholds memsw_thresholds;
244
245
246 struct list_head oom_notify;
247
248
249
250
251
252 unsigned long move_charge_at_immigrate;
253
254
255
256 struct mem_cgroup_stat_cpu *stat;
257};
258
259
260
261
262
263
/* Bit numbers within mem_cgroup.move_charge_at_immigrate. */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* tested by move_anon() */
	MOVE_CHARGE_TYPE_FILE,	/* tested by move_file() */
	NR_MOVE_TYPE,		/* number of types; keep last */
};
269
270
271static struct move_charge_struct {
272 spinlock_t lock;
273 struct mem_cgroup *from;
274 struct mem_cgroup *to;
275 unsigned long precharge;
276 unsigned long moved_charge;
277 unsigned long moved_swap;
278 struct task_struct *moving_task;
279 struct mm_struct *mm;
280 wait_queue_head_t waitq;
281} mc = {
282 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
283 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
284};
285
286static bool move_anon(void)
287{
288 return test_bit(MOVE_CHARGE_TYPE_ANON,
289 &mc.to->move_charge_at_immigrate);
290}
291
292static bool move_file(void)
293{
294 return test_bit(MOVE_CHARGE_TYPE_FILE,
295 &mc.to->move_charge_at_immigrate);
296}
297
298
299
300
301
/* Upper bound on full passes made by mem_cgroup_hierarchical_reclaim(). */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
304
/* Kinds of charge/uncharge operations; values are part of the numbering. */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,
	MEM_CGROUP_CHARGE_TYPE_FORCE,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
	MEM_CGROUP_CHARGE_TYPE_DROP,
	NR_CHARGE_TYPE,			/* number of types; keep last */
};
314
315
/* Mask forms of the PCG_* page_cgroup flag bit numbers. */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)

#define PCGF_ACCT	(1UL << PCG_ACCT)
321
322
/*
 * A control file's private value packs a type in the upper 16 bits and an
 * attribute in the lower 16 bits.
 */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

#define OOM_CONTROL		(0)
331
332
333
334
/* Option bits for mem_cgroup_hierarchical_reclaim()'s reclaim_options. */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
341
/* Forward declarations for helpers used before their definitions appear. */
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);
346
347static struct mem_cgroup_per_zone *
348mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
349{
350 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
351}
352
353struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
354{
355 return &mem->css;
356}
357
358static struct mem_cgroup_per_zone *
359page_cgroup_zoneinfo(struct page_cgroup *pc)
360{
361 struct mem_cgroup *mem = pc->mem_cgroup;
362 int nid = page_cgroup_nid(pc);
363 int zid = page_cgroup_zid(pc);
364
365 if (!mem)
366 return NULL;
367
368 return mem_cgroup_zoneinfo(mem, nid, zid);
369}
370
371static struct mem_cgroup_tree_per_zone *
372soft_limit_tree_node_zone(int nid, int zid)
373{
374 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
375}
376
377static struct mem_cgroup_tree_per_zone *
378soft_limit_tree_from_page(struct page *page)
379{
380 int nid = page_to_nid(page);
381 int zid = page_zonenum(page);
382
383 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
384}
385
386static void
387__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
388 struct mem_cgroup_per_zone *mz,
389 struct mem_cgroup_tree_per_zone *mctz,
390 unsigned long long new_usage_in_excess)
391{
392 struct rb_node **p = &mctz->rb_root.rb_node;
393 struct rb_node *parent = NULL;
394 struct mem_cgroup_per_zone *mz_node;
395
396 if (mz->on_tree)
397 return;
398
399 mz->usage_in_excess = new_usage_in_excess;
400 if (!mz->usage_in_excess)
401 return;
402 while (*p) {
403 parent = *p;
404 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
405 tree_node);
406 if (mz->usage_in_excess < mz_node->usage_in_excess)
407 p = &(*p)->rb_left;
408
409
410
411
412 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
413 p = &(*p)->rb_right;
414 }
415 rb_link_node(&mz->tree_node, parent, p);
416 rb_insert_color(&mz->tree_node, &mctz->rb_root);
417 mz->on_tree = true;
418}
419
420static void
421__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
422 struct mem_cgroup_per_zone *mz,
423 struct mem_cgroup_tree_per_zone *mctz)
424{
425 if (!mz->on_tree)
426 return;
427 rb_erase(&mz->tree_node, &mctz->rb_root);
428 mz->on_tree = false;
429}
430
431static void
432mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
433 struct mem_cgroup_per_zone *mz,
434 struct mem_cgroup_tree_per_zone *mctz)
435{
436 spin_lock(&mctz->lock);
437 __mem_cgroup_remove_exceeded(mem, mz, mctz);
438 spin_unlock(&mctz->lock);
439}
440
441
442static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
443{
444 unsigned long long excess;
445 struct mem_cgroup_per_zone *mz;
446 struct mem_cgroup_tree_per_zone *mctz;
447 int nid = page_to_nid(page);
448 int zid = page_zonenum(page);
449 mctz = soft_limit_tree_from_page(page);
450
451
452
453
454
455 for (; mem; mem = parent_mem_cgroup(mem)) {
456 mz = mem_cgroup_zoneinfo(mem, nid, zid);
457 excess = res_counter_soft_limit_excess(&mem->res);
458
459
460
461
462 if (excess || mz->on_tree) {
463 spin_lock(&mctz->lock);
464
465 if (mz->on_tree)
466 __mem_cgroup_remove_exceeded(mem, mz, mctz);
467
468
469
470
471 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
472 spin_unlock(&mctz->lock);
473 }
474 }
475}
476
477static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
478{
479 int node, zone;
480 struct mem_cgroup_per_zone *mz;
481 struct mem_cgroup_tree_per_zone *mctz;
482
483 for_each_node_state(node, N_POSSIBLE) {
484 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
485 mz = mem_cgroup_zoneinfo(mem, node, zone);
486 mctz = soft_limit_tree_node_zone(node, zone);
487 mem_cgroup_remove_exceeded(mem, mz, mctz);
488 }
489 }
490}
491
492static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
493{
494 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
495}
496
497static struct mem_cgroup_per_zone *
498__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
499{
500 struct rb_node *rightmost = NULL;
501 struct mem_cgroup_per_zone *mz;
502
503retry:
504 mz = NULL;
505 rightmost = rb_last(&mctz->rb_root);
506 if (!rightmost)
507 goto done;
508
509 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
510
511
512
513
514
515 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
516 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
517 !css_tryget(&mz->mem->css))
518 goto retry;
519done:
520 return mz;
521}
522
523static struct mem_cgroup_per_zone *
524mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
525{
526 struct mem_cgroup_per_zone *mz;
527
528 spin_lock(&mctz->lock);
529 mz = __mem_cgroup_largest_soft_limit_node(mctz);
530 spin_unlock(&mctz->lock);
531 return mz;
532}
533
534static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
535 enum mem_cgroup_stat_index idx)
536{
537 int cpu;
538 s64 val = 0;
539
540 for_each_possible_cpu(cpu)
541 val += per_cpu(mem->stat->count[idx], cpu);
542 return val;
543}
544
545static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
546{
547 s64 ret;
548
549 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
550 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
551 return ret;
552}
553
554static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
555 bool charge)
556{
557 int val = (charge) ? 1 : -1;
558 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
559}
560
561static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
562 struct page_cgroup *pc,
563 bool charge)
564{
565 int val = (charge) ? 1 : -1;
566
567 preempt_disable();
568
569 if (PageCgroupCache(pc))
570 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
571 else
572 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
573
574 if (charge)
575 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
576 else
577 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
578 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
579
580 preempt_enable();
581}
582
583static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
584 enum lru_list idx)
585{
586 int nid, zid;
587 struct mem_cgroup_per_zone *mz;
588 u64 total = 0;
589
590 for_each_online_node(nid)
591 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
592 mz = mem_cgroup_zoneinfo(mem, nid, zid);
593 total += MEM_CGROUP_ZSTAT(mz, idx);
594 }
595 return total;
596}
597
598static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
599{
600 s64 val;
601
602 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
603
604 return !(val & ((1 << event_mask_shift) - 1));
605}
606
607
608
609
610
611static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
612{
613
614 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
615 mem_cgroup_threshold(mem);
616 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
617 mem_cgroup_update_tree(mem, page);
618 }
619}
620
621static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
622{
623 return container_of(cgroup_subsys_state(cont,
624 mem_cgroup_subsys_id), struct mem_cgroup,
625 css);
626}
627
628struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
629{
630
631
632
633
634
635 if (unlikely(!p))
636 return NULL;
637
638 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
639 struct mem_cgroup, css);
640}
641
642static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
643{
644 struct mem_cgroup *mem = NULL;
645
646 if (!mm)
647 return NULL;
648
649
650
651
652
653 rcu_read_lock();
654 do {
655 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
656 if (unlikely(!mem))
657 break;
658 } while (!css_tryget(&mem->css));
659 rcu_read_unlock();
660 return mem;
661}
662
663
664
665
666static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
667 int (*func)(struct mem_cgroup *, void *))
668{
669 int found, ret, nextid;
670 struct cgroup_subsys_state *css;
671 struct mem_cgroup *mem;
672
673 if (!root->use_hierarchy)
674 return (*func)(root, data);
675
676 nextid = 1;
677 do {
678 ret = 0;
679 mem = NULL;
680
681 rcu_read_lock();
682 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
683 &found);
684 if (css && css_tryget(css))
685 mem = container_of(css, struct mem_cgroup, css);
686 rcu_read_unlock();
687
688 if (mem) {
689 ret = (*func)(mem, data);
690 css_put(&mem->css);
691 }
692 nextid = found + 1;
693 } while (!ret && css);
694
695 return ret;
696}
697
698static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
699{
700 return (mem == root_mem_cgroup);
701}
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
718{
719 struct page_cgroup *pc;
720 struct mem_cgroup_per_zone *mz;
721
722 if (mem_cgroup_disabled())
723 return;
724 pc = lookup_page_cgroup(page);
725
726 if (!TestClearPageCgroupAcctLRU(pc))
727 return;
728 VM_BUG_ON(!pc->mem_cgroup);
729
730
731
732
733 mz = page_cgroup_zoneinfo(pc);
734 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
735 if (mem_cgroup_is_root(pc->mem_cgroup))
736 return;
737 VM_BUG_ON(list_empty(&pc->lru));
738 list_del_init(&pc->lru);
739 return;
740}
741
742void mem_cgroup_del_lru(struct page *page)
743{
744 mem_cgroup_del_lru_list(page, page_lru(page));
745}
746
747void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
748{
749 struct mem_cgroup_per_zone *mz;
750 struct page_cgroup *pc;
751
752 if (mem_cgroup_disabled())
753 return;
754
755 pc = lookup_page_cgroup(page);
756
757
758
759
760 smp_rmb();
761
762 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
763 return;
764 mz = page_cgroup_zoneinfo(pc);
765 list_move(&pc->lru, &mz->lists[lru]);
766}
767
768void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
769{
770 struct page_cgroup *pc;
771 struct mem_cgroup_per_zone *mz;
772
773 if (mem_cgroup_disabled())
774 return;
775 pc = lookup_page_cgroup(page);
776 VM_BUG_ON(PageCgroupAcctLRU(pc));
777
778
779
780
781 smp_rmb();
782 if (!PageCgroupUsed(pc))
783 return;
784
785 mz = page_cgroup_zoneinfo(pc);
786 MEM_CGROUP_ZSTAT(mz, lru) += 1;
787 SetPageCgroupAcctLRU(pc);
788 if (mem_cgroup_is_root(pc->mem_cgroup))
789 return;
790 list_add(&pc->lru, &mz->lists[lru]);
791}
792
793
794
795
796
797
798
799
800static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
801{
802 unsigned long flags;
803 struct zone *zone = page_zone(page);
804 struct page_cgroup *pc = lookup_page_cgroup(page);
805
806 spin_lock_irqsave(&zone->lru_lock, flags);
807
808
809
810
811 if (!PageCgroupUsed(pc))
812 mem_cgroup_del_lru_list(page, page_lru(page));
813 spin_unlock_irqrestore(&zone->lru_lock, flags);
814}
815
816static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
817{
818 unsigned long flags;
819 struct zone *zone = page_zone(page);
820 struct page_cgroup *pc = lookup_page_cgroup(page);
821
822 spin_lock_irqsave(&zone->lru_lock, flags);
823
824 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
825 mem_cgroup_add_lru_list(page, page_lru(page));
826 spin_unlock_irqrestore(&zone->lru_lock, flags);
827}
828
829
830void mem_cgroup_move_lists(struct page *page,
831 enum lru_list from, enum lru_list to)
832{
833 if (mem_cgroup_disabled())
834 return;
835 mem_cgroup_del_lru_list(page, from);
836 mem_cgroup_add_lru_list(page, to);
837}
838
839int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
840{
841 int ret;
842 struct mem_cgroup *curr = NULL;
843 struct task_struct *p;
844
845 p = find_lock_task_mm(task);
846 if (!p)
847 return 0;
848 curr = try_get_mem_cgroup_from_mm(p->mm);
849 task_unlock(p);
850 if (!curr)
851 return 0;
852
853
854
855
856
857
858 if (mem->use_hierarchy)
859 ret = css_is_ancestor(&curr->css, &mem->css);
860 else
861 ret = (curr == mem);
862 css_put(&curr->css);
863 return ret;
864}
865
866static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
867{
868 unsigned long active;
869 unsigned long inactive;
870 unsigned long gb;
871 unsigned long inactive_ratio;
872
873 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
874 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
875
876 gb = (inactive + active) >> (30 - PAGE_SHIFT);
877 if (gb)
878 inactive_ratio = int_sqrt(10 * gb);
879 else
880 inactive_ratio = 1;
881
882 if (present_pages) {
883 present_pages[0] = inactive;
884 present_pages[1] = active;
885 }
886
887 return inactive_ratio;
888}
889
/* Return 1 when inactive_anon * ratio < active_anon for @memcg, else 0. */
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long counts[2];	/* [0] = inactive, [1] = active */
	unsigned long ratio;

	ratio = calc_inactive_ratio(memcg, counts);

	return counts[0] * ratio < counts[1];
}
907
908int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
909{
910 unsigned long active;
911 unsigned long inactive;
912
913 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
914 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
915
916 return (active > inactive);
917}
918
919unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
920 struct zone *zone,
921 enum lru_list lru)
922{
923 int nid = zone_to_nid(zone);
924 int zid = zone_idx(zone);
925 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
926
927 return MEM_CGROUP_ZSTAT(mz, lru);
928}
929
930struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
931 struct zone *zone)
932{
933 int nid = zone_to_nid(zone);
934 int zid = zone_idx(zone);
935 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
936
937 return &mz->reclaim_stat;
938}
939
940struct zone_reclaim_stat *
941mem_cgroup_get_reclaim_stat_from_page(struct page *page)
942{
943 struct page_cgroup *pc;
944 struct mem_cgroup_per_zone *mz;
945
946 if (mem_cgroup_disabled())
947 return NULL;
948
949 pc = lookup_page_cgroup(page);
950
951
952
953
954 smp_rmb();
955 if (!PageCgroupUsed(pc))
956 return NULL;
957
958 mz = page_cgroup_zoneinfo(pc);
959 if (!mz)
960 return NULL;
961
962 return &mz->reclaim_stat;
963}
964
965unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
966 struct list_head *dst,
967 unsigned long *scanned, int order,
968 int mode, struct zone *z,
969 struct mem_cgroup *mem_cont,
970 int active, int file)
971{
972 unsigned long nr_taken = 0;
973 struct page *page;
974 unsigned long scan;
975 LIST_HEAD(pc_list);
976 struct list_head *src;
977 struct page_cgroup *pc, *tmp;
978 int nid = zone_to_nid(z);
979 int zid = zone_idx(z);
980 struct mem_cgroup_per_zone *mz;
981 int lru = LRU_FILE * file + active;
982 int ret;
983
984 BUG_ON(!mem_cont);
985 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
986 src = &mz->lists[lru];
987
988 scan = 0;
989 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
990 if (scan >= nr_to_scan)
991 break;
992
993 page = pc->page;
994 if (unlikely(!PageCgroupUsed(pc)))
995 continue;
996 if (unlikely(!PageLRU(page)))
997 continue;
998
999 scan++;
1000 ret = __isolate_lru_page(page, mode, file);
1001 switch (ret) {
1002 case 0:
1003 list_move(&page->lru, dst);
1004 mem_cgroup_del_lru(page);
1005 nr_taken++;
1006 break;
1007 case -EBUSY:
1008
1009 mem_cgroup_rotate_lru_list(page, page_lru(page));
1010 break;
1011 default:
1012 break;
1013 }
1014 }
1015
1016 *scanned = scan;
1017
1018 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1019 0, 0, 0, mode);
1020
1021 return nr_taken;
1022}
1023
/* Recover the mem_cgroup that embeds res_counter @counter as field @member. */
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)
1026
1027static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1028{
1029 if (do_swap_account) {
1030 if (res_counter_check_under_limit(&mem->res) &&
1031 res_counter_check_under_limit(&mem->memsw))
1032 return true;
1033 } else
1034 if (res_counter_check_under_limit(&mem->res))
1035 return true;
1036 return false;
1037}
1038
1039static unsigned int get_swappiness(struct mem_cgroup *memcg)
1040{
1041 struct cgroup *cgrp = memcg->css.cgroup;
1042 unsigned int swappiness;
1043
1044
1045 if (cgrp->parent == NULL)
1046 return vm_swappiness;
1047
1048 spin_lock(&memcg->reclaim_param_lock);
1049 swappiness = memcg->swappiness;
1050 spin_unlock(&memcg->reclaim_param_lock);
1051
1052 return swappiness;
1053}
1054
1055
1056
1057static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1058{
1059 struct mem_cgroup *from;
1060 struct mem_cgroup *to;
1061 bool ret = false;
1062
1063
1064
1065
1066 spin_lock(&mc.lock);
1067 from = mc.from;
1068 to = mc.to;
1069 if (!from)
1070 goto unlock;
1071 if (from == mem || to == mem
1072 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1073 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1074 ret = true;
1075unlock:
1076 spin_unlock(&mc.lock);
1077 return ret;
1078}
1079
1080static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1081{
1082 if (mc.moving_task && current != mc.moving_task) {
1083 if (mem_cgroup_under_move(mem)) {
1084 DEFINE_WAIT(wait);
1085 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1086
1087 if (mc.moving_task)
1088 schedule();
1089 finish_wait(&mc.waitq, &wait);
1090 return true;
1091 }
1092 }
1093 return false;
1094}
1095
/* mem_cgroup_walk_tree() callback: bump the int counter behind @data. */
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
	int *counter = data;

	*counter += 1;
	return 0;
}
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1112{
1113 struct cgroup *task_cgrp;
1114 struct cgroup *mem_cgrp;
1115
1116
1117
1118
1119
1120 static char memcg_name[PATH_MAX];
1121 int ret;
1122
1123 if (!memcg || !p)
1124 return;
1125
1126
1127 rcu_read_lock();
1128
1129 mem_cgrp = memcg->css.cgroup;
1130 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1131
1132 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1133 if (ret < 0) {
1134
1135
1136
1137
1138 rcu_read_unlock();
1139 goto done;
1140 }
1141 rcu_read_unlock();
1142
1143 printk(KERN_INFO "Task in %s killed", memcg_name);
1144
1145 rcu_read_lock();
1146 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1147 if (ret < 0) {
1148 rcu_read_unlock();
1149 goto done;
1150 }
1151 rcu_read_unlock();
1152
1153
1154
1155
1156 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1157done:
1158
1159 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1160 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1161 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1162 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1163 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1164 "failcnt %llu\n",
1165 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1166 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1167 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1168}
1169
1170
1171
1172
1173
1174static int mem_cgroup_count_children(struct mem_cgroup *mem)
1175{
1176 int num = 0;
1177 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1178 return num;
1179}
1180
1181
1182
1183
1184u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1185{
1186 u64 limit;
1187 u64 memsw;
1188
1189 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1190 total_swap_pages;
1191 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1192
1193
1194
1195
1196 return min(limit, memsw);
1197}
1198
1199
1200
1201
1202
1203
1204static struct mem_cgroup *
1205mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1206{
1207 struct mem_cgroup *ret = NULL;
1208 struct cgroup_subsys_state *css;
1209 int nextid, found;
1210
1211 if (!root_mem->use_hierarchy) {
1212 css_get(&root_mem->css);
1213 ret = root_mem;
1214 }
1215
1216 while (!ret) {
1217 rcu_read_lock();
1218 nextid = root_mem->last_scanned_child + 1;
1219 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1220 &found);
1221 if (css && css_tryget(css))
1222 ret = container_of(css, struct mem_cgroup, css);
1223
1224 rcu_read_unlock();
1225
1226 spin_lock(&root_mem->reclaim_param_lock);
1227 if (!css) {
1228
1229 root_mem->last_scanned_child = 0;
1230 } else
1231 root_mem->last_scanned_child = found;
1232 spin_unlock(&root_mem->reclaim_param_lock);
1233 }
1234
1235 return ret;
1236}
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1251 struct zone *zone,
1252 gfp_t gfp_mask,
1253 unsigned long reclaim_options)
1254{
1255 struct mem_cgroup *victim;
1256 int ret, total = 0;
1257 int loop = 0;
1258 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1259 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1260 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1261 unsigned long excess = mem_cgroup_get_excess(root_mem);
1262
1263
1264 if (root_mem->memsw_is_minimum)
1265 noswap = true;
1266
1267 while (1) {
1268 victim = mem_cgroup_select_victim(root_mem);
1269 if (victim == root_mem) {
1270 loop++;
1271 if (loop >= 1)
1272 drain_all_stock_async();
1273 if (loop >= 2) {
1274
1275
1276
1277
1278
1279 if (!check_soft || !total) {
1280 css_put(&victim->css);
1281 break;
1282 }
1283
1284
1285
1286
1287
1288
1289 if (total >= (excess >> 2) ||
1290 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1291 css_put(&victim->css);
1292 break;
1293 }
1294 }
1295 }
1296 if (!mem_cgroup_local_usage(victim)) {
1297
1298 css_put(&victim->css);
1299 continue;
1300 }
1301
1302 if (check_soft)
1303 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1304 noswap, get_swappiness(victim), zone);
1305 else
1306 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1307 noswap, get_swappiness(victim));
1308 css_put(&victim->css);
1309
1310
1311
1312
1313
1314 if (shrink)
1315 return ret;
1316 total += ret;
1317 if (check_soft) {
1318 if (res_counter_check_under_soft_limit(&root_mem->res))
1319 return total;
1320 } else if (mem_cgroup_check_under_limit(root_mem))
1321 return 1 + total;
1322 }
1323 return total;
1324}
1325
1326static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1327{
1328 int *val = (int *)data;
1329 int x;
1330
1331
1332
1333
1334
1335 x = atomic_inc_return(&mem->oom_lock);
1336 *val = max(x, *val);
1337 return 0;
1338}
1339
1340
1341
1342
1343static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1344{
1345 int lock_count = 0;
1346
1347 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1348
1349 if (lock_count == 1)
1350 return true;
1351 return false;
1352}
1353
1354static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1355{
1356
1357
1358
1359
1360
1361 atomic_add_unless(&mem->oom_lock, -1, 0);
1362 return 0;
1363}
1364
1365static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1366{
1367 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1368}
1369
/* Held by mem_cgroup_handle_oom() around OOM lock/unlock and wakeups. */
static DEFINE_MUTEX(memcg_oom_mutex);
/* Tasks waiting for a memcg OOM to be resolved sleep here. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1372
1373struct oom_wait_info {
1374 struct mem_cgroup *mem;
1375 wait_queue_t wait;
1376};
1377
1378static int memcg_oom_wake_function(wait_queue_t *wait,
1379 unsigned mode, int sync, void *arg)
1380{
1381 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1382 struct oom_wait_info *oom_wait_info;
1383
1384 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1385
1386 if (oom_wait_info->mem == wake_mem)
1387 goto wakeup;
1388
1389 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1390 return 0;
1391
1392
1393
1394
1395 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1396 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1397 return 0;
1398
1399wakeup:
1400 return autoremove_wake_function(wait, mode, sync, arg);
1401}
1402
1403static void memcg_wakeup_oom(struct mem_cgroup *mem)
1404{
1405
1406 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1407}
1408
1409static void memcg_oom_recover(struct mem_cgroup *mem)
1410{
1411 if (mem && atomic_read(&mem->oom_lock))
1412 memcg_wakeup_oom(mem);
1413}
1414
1415
1416
1417
1418bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1419{
1420 struct oom_wait_info owait;
1421 bool locked, need_to_kill;
1422
1423 owait.mem = mem;
1424 owait.wait.flags = 0;
1425 owait.wait.func = memcg_oom_wake_function;
1426 owait.wait.private = current;
1427 INIT_LIST_HEAD(&owait.wait.task_list);
1428 need_to_kill = true;
1429
1430 mutex_lock(&memcg_oom_mutex);
1431 locked = mem_cgroup_oom_lock(mem);
1432
1433
1434
1435
1436
1437 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1438 if (!locked || mem->oom_kill_disable)
1439 need_to_kill = false;
1440 if (locked)
1441 mem_cgroup_oom_notify(mem);
1442 mutex_unlock(&memcg_oom_mutex);
1443
1444 if (need_to_kill) {
1445 finish_wait(&memcg_oom_waitq, &owait.wait);
1446 mem_cgroup_out_of_memory(mem, mask);
1447 } else {
1448 schedule();
1449 finish_wait(&memcg_oom_waitq, &owait.wait);
1450 }
1451 mutex_lock(&memcg_oom_mutex);
1452 mem_cgroup_oom_unlock(mem);
1453 memcg_wakeup_oom(mem);
1454 mutex_unlock(&memcg_oom_mutex);
1455
1456 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1457 return false;
1458
1459 schedule_timeout(1);
1460 return true;
1461}
1462
1463
1464
1465
1466
1467void mem_cgroup_update_file_mapped(struct page *page, int val)
1468{
1469 struct mem_cgroup *mem;
1470 struct page_cgroup *pc;
1471
1472 pc = lookup_page_cgroup(page);
1473 if (unlikely(!pc))
1474 return;
1475
1476 lock_page_cgroup(pc);
1477 mem = pc->mem_cgroup;
1478 if (!mem || !PageCgroupUsed(pc))
1479 goto done;
1480
1481
1482
1483
1484 if (val > 0) {
1485 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1486 SetPageCgroupFileMapped(pc);
1487 } else {
1488 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1489 ClearPageCgroupFileMapped(pc);
1490 }
1491
1492done:
1493 unlock_page_cgroup(pc);
1494}
1495
1496
1497
1498
1499
/* Bytes charged in one go by __mem_cgroup_try_charge(): 32 pages. */
#define CHARGE_SIZE (32 * PAGE_SIZE)
/* Per-CPU stock of charge that was taken from a memcg but not yet consumed. */
struct memcg_stock_pcp {
	struct mem_cgroup *cached;	/* memcg the stocked charge belongs to */
	int charge;			/* stocked amount, in bytes */
	struct work_struct work;	/* queued by drain_all_stock_async() */
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Non-zero while a drain_all_stock_{sync,async}() call is in progress. */
static atomic_t memcg_drain_count;
1508
1509
1510
1511
1512
1513
1514
1515static bool consume_stock(struct mem_cgroup *mem)
1516{
1517 struct memcg_stock_pcp *stock;
1518 bool ret = true;
1519
1520 stock = &get_cpu_var(memcg_stock);
1521 if (mem == stock->cached && stock->charge)
1522 stock->charge -= PAGE_SIZE;
1523 else
1524 ret = false;
1525 put_cpu_var(memcg_stock);
1526 return ret;
1527}
1528
1529
1530
1531
1532static void drain_stock(struct memcg_stock_pcp *stock)
1533{
1534 struct mem_cgroup *old = stock->cached;
1535
1536 if (stock->charge) {
1537 res_counter_uncharge(&old->res, stock->charge);
1538 if (do_swap_account)
1539 res_counter_uncharge(&old->memsw, stock->charge);
1540 }
1541 stock->cached = NULL;
1542 stock->charge = 0;
1543}
1544
1545
1546
1547
1548
1549static void drain_local_stock(struct work_struct *dummy)
1550{
1551 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1552 drain_stock(stock);
1553}
1554
1555
1556
1557
1558
1559static void refill_stock(struct mem_cgroup *mem, int val)
1560{
1561 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1562
1563 if (stock->cached != mem) {
1564 drain_stock(stock);
1565 stock->cached = mem;
1566 }
1567 stock->charge += val;
1568 put_cpu_var(memcg_stock);
1569}
1570
1571
1572
1573
1574
1575
1576
1577static void drain_all_stock_async(void)
1578{
1579 int cpu;
1580
1581
1582
1583
1584
1585
1586 if (atomic_read(&memcg_drain_count))
1587 return;
1588
1589 atomic_inc(&memcg_drain_count);
1590 get_online_cpus();
1591 for_each_online_cpu(cpu) {
1592 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1593 schedule_work_on(cpu, &stock->work);
1594 }
1595 put_online_cpus();
1596 atomic_dec(&memcg_drain_count);
1597
1598}
1599
1600
/*
 * Run drain_local_stock() on every CPU via schedule_on_each_cpu().
 * memcg_drain_count is raised for the duration so that
 * drain_all_stock_async() callers back off meanwhile.
 */
static void drain_all_stock_sync(void)
{

	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}
1608
1609static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1610 unsigned long action,
1611 void *hcpu)
1612{
1613 int cpu = (unsigned long)hcpu;
1614 struct memcg_stock_pcp *stock;
1615
1616 if (action != CPU_DEAD)
1617 return NOTIFY_OK;
1618 stock = &per_cpu(memcg_stock, cpu);
1619 drain_stock(stock);
1620 return NOTIFY_OK;
1621}
1622
1623
1624
/* Result codes returned by __mem_cgroup_do_charge(). */
enum {
	CHARGE_OK,		/* the charge was accounted successfully */
	CHARGE_RETRY,		/* not charged; caller should try again */
	CHARGE_NOMEM,		/* reclaim did not help and OOM handling was not requested */
	CHARGE_WOULDBLOCK,	/* over limit and gfp mask lacks __GFP_WAIT */
	CHARGE_OOM_DIE,		/* mem_cgroup_handle_oom() returned false */
};
1632
/*
 * Make one attempt to charge @csize bytes to @mem.
 *
 * @mem:       memcg to charge
 * @gfp_mask:  allocation context; without __GFP_WAIT no reclaim is attempted
 * @csize:     number of bytes to charge
 * @oom_check: when true, fall through to mem_cgroup_handle_oom() if reclaim
 *             does not bring the group under its limit
 *
 * Returns one of the CHARGE_* codes.
 */
static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
				int csize, bool oom_check)
{
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&mem->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		/* memsw charge failed: undo the res charge taken above. */
		res_counter_uncharge(&mem->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);

	/* A multi-page request is not reclaimed for; the caller retries. */
	if (csize > PAGE_SIZE)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
					gfp_mask, flags);

	/* Either reclaim reported progress or the group is now under limit. */
	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
		return CHARGE_RETRY;

	/* mem_cgroup_wait_acct_move() returned non-zero: retry the charge. */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;


	if (!oom_check)
		return CHARGE_NOMEM;

	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}
1690
1691
1692
1693
1694
/*
 * Charge one page to a memcg, retrying and reclaiming as needed.
 *
 * @mm:       mm whose owner task selects the memcg when *@memcg is NULL
 * @gfp_mask: allocation context passed to __mem_cgroup_do_charge()
 * @memcg:    in: memcg to charge, or NULL to derive it from @mm;
 *            out: the charged memcg, or NULL on failure/bypass
 * @oom:      whether OOM handling may be invoked when reclaim fails
 *
 * Returns 0 with *@memcg set on success (including the root and
 * stock-hit shortcuts), 0 with *@memcg == NULL when the charge is
 * bypassed, and -ENOMEM with *@memcg == NULL on failure.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem = NULL;
	int ret;
	int csize = CHARGE_SIZE;

	/* A task that is being killed is not charged at all. */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/* Neither a memcg nor an mm to derive one from: nothing to charge. */
	if (!*memcg && !mm)
		goto bypass;
again:
	if (*memcg) {
		mem = *memcg;
		VM_BUG_ON(css_is_removed(&mem->css));
		if (mem_cgroup_is_root(mem))
			goto done;
		if (consume_stock(mem))
			goto done;
		css_get(&mem->css);
	} else {
		struct task_struct *p;

		rcu_read_lock();
		p = rcu_dereference(mm->owner);

		mem = mem_cgroup_from_task(p);
		if (!mem || mem_cgroup_is_root(mem)) {
			rcu_read_unlock();
			goto done;
		}
		if (consume_stock(mem)) {
			/* Satisfied from the per-CPU stock; no css reference taken. */
			rcu_read_unlock();
			goto done;
		}
		/* css_tryget() failed: look the memcg up again. */
		if (!css_tryget(&mem->css)) {
			rcu_read_unlock();
			goto again;
		}
		rcu_read_unlock();
	}

	do {
		bool oom_check;

		/* Killed while looping: drop our reference and bypass. */
		if (fatal_signal_pending(current)) {
			css_put(&mem->css);
			goto bypass;
		}

		/* Request OOM handling only once the retry budget is used up. */
		oom_check = false;
		if (oom && !nr_oom_retries) {
			oom_check = true;
			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
		}

		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);

		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:
			/* Fall back to a single page and restart the lookup. */
			csize = PAGE_SIZE;
			css_put(&mem->css);
			mem = NULL;
			goto again;
		case CHARGE_WOULDBLOCK:
			css_put(&mem->css);
			goto nomem;
		case CHARGE_NOMEM:
			if (!oom) {
				css_put(&mem->css);
				goto nomem;
			}
			/* With @oom set, keep looping and count down the retries. */
			nr_oom_retries--;
			break;
		case CHARGE_OOM_DIE:
			css_put(&mem->css);
			goto bypass;
		}
	} while (ret != CHARGE_OK);

	/* Charged more than one page: park the surplus in the per-CPU stock. */
	if (csize > PAGE_SIZE)
		refill_stock(mem, csize - PAGE_SIZE);
	css_put(&mem->css);
done:
	*memcg = mem;
	return 0;
nomem:
	*memcg = NULL;
	return -ENOMEM;
bypass:
	*memcg = NULL;
	return 0;
}
1824
1825
1826
1827
1828
1829
1830static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1831 unsigned long count)
1832{
1833 if (!mem_cgroup_is_root(mem)) {
1834 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1835 if (do_swap_account)
1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1837 }
1838}
1839
/* Cancel a single-page charge against @mem. */
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
	__mem_cgroup_cancel_charge(mem, 1);
}
1844
1845
1846
1847
1848
1849
1850
1851static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1852{
1853 struct cgroup_subsys_state *css;
1854
1855
1856 if (!id)
1857 return NULL;
1858 css = css_lookup(&mem_cgroup_subsys, id);
1859 if (!css)
1860 return NULL;
1861 return container_of(css, struct mem_cgroup, css);
1862}
1863
1864struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1865{
1866 struct mem_cgroup *mem = NULL;
1867 struct page_cgroup *pc;
1868 unsigned short id;
1869 swp_entry_t ent;
1870
1871 VM_BUG_ON(!PageLocked(page));
1872
1873 pc = lookup_page_cgroup(page);
1874 lock_page_cgroup(pc);
1875 if (PageCgroupUsed(pc)) {
1876 mem = pc->mem_cgroup;
1877 if (mem && !css_tryget(&mem->css))
1878 mem = NULL;
1879 } else if (PageSwapCache(page)) {
1880 ent.val = page_private(page);
1881 id = lookup_swap_cgroup(ent);
1882 rcu_read_lock();
1883 mem = mem_cgroup_lookup(id);
1884 if (mem && !css_tryget(&mem->css))
1885 mem = NULL;
1886 rcu_read_unlock();
1887 }
1888 unlock_page_cgroup(pc);
1889 return mem;
1890}
1891
1892
1893
1894
1895
1896
/*
 * Bind an already-charged page_cgroup to @mem and mark it Used.
 *
 * @mem:   memcg that was charged by __mem_cgroup_try_charge(); NULL is a no-op
 * @pc:    page_cgroup of the page being committed
 * @ctype: decides whether the Cache flag is set or cleared
 *
 * If @pc is already marked Used the pre-charge is cancelled instead.
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{

	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		/* Someone committed this page first: give the charge back. */
		unlock_page_cgroup(pc);
		mem_cgroup_cancel_charge(mem);
		return;
	}

	pc->mem_cgroup = mem;

	/*
	 * Write barrier between the pc->mem_cgroup store above and the flag
	 * updates below.
	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);

	/* Event checking is done after the page_cgroup lock is dropped. */
	memcg_check_events(mem, pc->page);
}
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
/*
 * Move the accounting of @pc from @from to @to.
 *
 * Preconditions, enforced by the VM_BUG_ONs below: @from != @to, the page
 * is not on the LRU, @pc is locked and Used, and @pc currently belongs to
 * @from.
 *
 * @uncharge: when true, one page of charge is cancelled from @from.
 */
static void __mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));
	VM_BUG_ON(!PageCgroupLocked(pc));
	VM_BUG_ON(!PageCgroupUsed(pc));
	VM_BUG_ON(pc->mem_cgroup != from);

	if (PageCgroupFileMapped(pc)) {
		/* Transfer the FILE_MAPPED count; per-cpu ops need preemption off. */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, pc, false);
	if (uncharge)
		/* Drop the page's charge from the old owner. */
		mem_cgroup_cancel_charge(from);

	/* Switch ownership, then account the page to the new owner. */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
}
1995
1996
1997
1998
1999
2000static int mem_cgroup_move_account(struct page_cgroup *pc,
2001 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2002{
2003 int ret = -EINVAL;
2004 lock_page_cgroup(pc);
2005 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2006 __mem_cgroup_move_account(pc, from, to, uncharge);
2007 ret = 0;
2008 }
2009 unlock_page_cgroup(pc);
2010
2011
2012
2013 memcg_check_events(to, pc->page);
2014 memcg_check_events(from, pc->page);
2015 return ret;
2016}
2017
2018
2019
2020
2021
/*
 * Move the charge of @pc from @child to the memcg of @child's parent cgroup.
 *
 * Returns -EINVAL when @child's cgroup has no parent, -EBUSY when the page
 * reference cannot be taken or isolate_lru_page() fails, and otherwise the
 * result of charging the parent / moving the account.
 */
static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;


	if (!pcg)
		return -EINVAL;

	ret = -EBUSY;
	/* Pin the page and take it off the LRU for the duration of the move. */
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	parent = mem_cgroup_from_cont(pcg);
	/* Charge the parent first; OOM handling is not requested (oom=false). */
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		goto put_back;

	ret = mem_cgroup_move_account(pc, child, parent, true);
	if (ret)
		/* Move failed: undo the charge just taken on the parent. */
		mem_cgroup_cancel_charge(parent);
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}
2057
2058
2059
2060
2061
2062
2063
2064static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2065 gfp_t gfp_mask, enum charge_type ctype)
2066{
2067 struct mem_cgroup *mem = NULL;
2068 struct page_cgroup *pc;
2069 int ret;
2070
2071 pc = lookup_page_cgroup(page);
2072
2073 if (unlikely(!pc))
2074 return 0;
2075 prefetchw(pc);
2076
2077 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
2078 if (ret || !mem)
2079 return ret;
2080
2081 __mem_cgroup_commit_charge(mem, pc, ctype);
2082 return 0;
2083}
2084
2085int mem_cgroup_newpage_charge(struct page *page,
2086 struct mm_struct *mm, gfp_t gfp_mask)
2087{
2088 if (mem_cgroup_disabled())
2089 return 0;
2090 if (PageCompound(page))
2091 return 0;
2092
2093
2094
2095
2096
2097
2098
2099 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2100 return 0;
2101 if (unlikely(!mm))
2102 mm = &init_mm;
2103 return mem_cgroup_charge_common(page, mm, gfp_mask,
2104 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2105}
2106
/* Forward declaration: defined further down, used by mem_cgroup_cache_charge(). */
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype);
2110
2111int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2112 gfp_t gfp_mask)
2113{
2114 int ret;
2115
2116 if (mem_cgroup_disabled())
2117 return 0;
2118 if (PageCompound(page))
2119 return 0;
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131 if (!(gfp_mask & __GFP_WAIT)) {
2132 struct page_cgroup *pc;
2133
2134 pc = lookup_page_cgroup(page);
2135 if (!pc)
2136 return 0;
2137 lock_page_cgroup(pc);
2138 if (PageCgroupUsed(pc)) {
2139 unlock_page_cgroup(pc);
2140 return 0;
2141 }
2142 unlock_page_cgroup(pc);
2143 }
2144
2145 if (unlikely(!mm))
2146 mm = &init_mm;
2147
2148 if (page_is_file_cache(page))
2149 return mem_cgroup_charge_common(page, mm, gfp_mask,
2150 MEM_CGROUP_CHARGE_TYPE_CACHE);
2151
2152
2153 if (PageSwapCache(page)) {
2154 struct mem_cgroup *mem = NULL;
2155
2156 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2157 if (!ret)
2158 __mem_cgroup_commit_charge_swapin(page, mem,
2159 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2160 } else
2161 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2162 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2163
2164 return ret;
2165}
2166
2167
2168
2169
2170
2171
2172
2173int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2174 struct page *page,
2175 gfp_t mask, struct mem_cgroup **ptr)
2176{
2177 struct mem_cgroup *mem;
2178 int ret;
2179
2180 if (mem_cgroup_disabled())
2181 return 0;
2182
2183 if (!do_swap_account)
2184 goto charge_cur_mm;
2185
2186
2187
2188
2189
2190
2191 if (!PageSwapCache(page))
2192 goto charge_cur_mm;
2193 mem = try_get_mem_cgroup_from_page(page);
2194 if (!mem)
2195 goto charge_cur_mm;
2196 *ptr = mem;
2197 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2198 css_put(&mem->css);
2199 return ret;
2200charge_cur_mm:
2201 if (unlikely(!mm))
2202 mm = &init_mm;
2203 return __mem_cgroup_try_charge(mm, mask, ptr, true);
2204}
2205
/*
 * Commit a swap-in pre-charge of @page to @ptr with charge type @ctype.
 *
 * No-op when memcg is disabled or @ptr is NULL.  The commit is bracketed by
 * cgroup_exclude_rmdir() / cgroup_release_and_wakeup_rmdir() on @ptr's css.
 */
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	cgroup_exclude_rmdir(&ptr->css);
	pc = lookup_page_cgroup(page);
	/* The commit itself is wrapped by the swapcache LRU del/add helpers. */
	mem_cgroup_lru_del_before_commit_swapcache(page);
	__mem_cgroup_commit_charge(ptr, pc, ctype);
	mem_cgroup_lru_add_after_commit_swapcache(page);

	/*
	 * With swap accounting on, clear the owner id recorded for the swap
	 * entry and, if that owner still resolves to a memcg, drop its swap
	 * charge.
	 */
	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		unsigned short id;
		struct mem_cgroup *memcg;

		id = swap_cgroup_record(ent, 0);
		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);
		if (memcg) {
			/* The root memcg's memsw counter is not touched. */
			if (!mem_cgroup_is_root(memcg))
				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_swap_statistics(memcg, false);
			mem_cgroup_put(memcg);
		}
		rcu_read_unlock();
	}

	cgroup_release_and_wakeup_rmdir(&ptr->css);
}
2255
/* Commit a swap-in pre-charge of @page to @ptr as a MAPPED charge. */
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	__mem_cgroup_commit_charge_swapin(page, ptr,
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
2261
/*
 * Undo a swap-in pre-charge against @mem.  Does nothing when memcg is
 * disabled or @mem is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled() || !mem)
		return;

	mem_cgroup_cancel_charge(mem);
}
2270
2271static void
2272__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2273{
2274 struct memcg_batch_info *batch = NULL;
2275 bool uncharge_memsw = true;
2276
2277 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2278 uncharge_memsw = false;
2279
2280 batch = ¤t->memcg_batch;
2281
2282
2283
2284
2285
2286 if (!batch->memcg)
2287 batch->memcg = mem;
2288
2289
2290
2291
2292
2293
2294
2295
2296 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2297 goto direct_uncharge;
2298
2299
2300
2301
2302
2303
2304 if (batch->memcg != mem)
2305 goto direct_uncharge;
2306
2307 batch->bytes += PAGE_SIZE;
2308 if (uncharge_memsw)
2309 batch->memsw_bytes += PAGE_SIZE;
2310 return;
2311direct_uncharge:
2312 res_counter_uncharge(&mem->res, PAGE_SIZE);
2313 if (uncharge_memsw)
2314 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2315 if (unlikely(batch->memcg != mem))
2316 memcg_oom_recover(mem);
2317 return;
2318}
2319
2320
2321
2322
/*
 * Uncharge @page from the memcg recorded in its page_cgroup.
 *
 * @page:  page to uncharge
 * @ctype: charge type, selecting the conditions under which the uncharge
 *         is skipped
 *
 * Returns the memcg that was uncharged, or NULL when nothing was done
 * (memcg disabled, swap cache page, no Used page_cgroup, or the
 * type-specific checks below rejected the page).
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (PageSwapCache(page))
		return NULL;

	/* Cheap unlocked pre-check; Used is tested again under the lock. */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	mem = pc->mem_cgroup;

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
	case MEM_CGROUP_CHARGE_TYPE_DROP:
		/* Keep the charge while the page is mapped or under migration. */
		if (page_mapped(page) || PageCgroupMigration(pc))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {
			/* Non-anon: skip if it has a mapping and is not file cache. */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page))
			goto unlock_out;
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, pc, false);

	ClearPageCgroupUsed(pc);

	unlock_page_cgroup(pc);

	/* From here on the page_cgroup lock is no longer held. */
	memcg_check_events(mem, page);
	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		mem_cgroup_swap_statistics(mem, true);
		mem_cgroup_get(mem);
	}
	if (!mem_cgroup_is_root(mem))
		__do_uncharge(mem, ctype);

	return mem;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}
2396
2397void mem_cgroup_uncharge_page(struct page *page)
2398{
2399
2400 if (page_mapped(page))
2401 return;
2402 if (page->mapping && !PageAnon(page))
2403 return;
2404 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2405}
2406
/*
 * Uncharge a page cache page.  The page must be unmapped and must no longer
 * have a mapping (both asserted with VM_BUG_ON).
 */
void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422void mem_cgroup_uncharge_start(void)
2423{
2424 current->memcg_batch.do_batch++;
2425
2426 if (current->memcg_batch.do_batch == 1) {
2427 current->memcg_batch.memcg = NULL;
2428 current->memcg_batch.bytes = 0;
2429 current->memcg_batch.memsw_bytes = 0;
2430 }
2431}
2432
2433void mem_cgroup_uncharge_end(void)
2434{
2435 struct memcg_batch_info *batch = ¤t->memcg_batch;
2436
2437 if (!batch->do_batch)
2438 return;
2439
2440 batch->do_batch--;
2441 if (batch->do_batch)
2442 return;
2443
2444 if (!batch->memcg)
2445 return;
2446
2447
2448
2449
2450 if (batch->bytes)
2451 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2452 if (batch->memsw_bytes)
2453 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2454 memcg_oom_recover(batch->memcg);
2455
2456 batch->memcg = NULL;
2457}
2458
2459#ifdef CONFIG_SWAP
2460
2461
2462
2463
2464void
2465mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2466{
2467 struct mem_cgroup *memcg;
2468 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2469
2470 if (!swapout)
2471 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2472
2473 memcg = __mem_cgroup_uncharge_common(page, ctype);
2474
2475
2476
2477
2478
2479 if (do_swap_account && swapout && memcg)
2480 swap_cgroup_record(ent, css_id(&memcg->css));
2481}
2482#endif
2483
2484#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2485
2486
2487
2488
2489void mem_cgroup_uncharge_swap(swp_entry_t ent)
2490{
2491 struct mem_cgroup *memcg;
2492 unsigned short id;
2493
2494 if (!do_swap_account)
2495 return;
2496
2497 id = swap_cgroup_record(ent, 0);
2498 rcu_read_lock();
2499 memcg = mem_cgroup_lookup(id);
2500 if (memcg) {
2501
2502
2503
2504
2505 if (!mem_cgroup_is_root(memcg))
2506 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2507 mem_cgroup_swap_statistics(memcg, false);
2508 mem_cgroup_put(memcg);
2509 }
2510 rcu_read_unlock();
2511}
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528static int mem_cgroup_move_swap_account(swp_entry_t entry,
2529 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2530{
2531 unsigned short old_id, new_id;
2532
2533 old_id = css_id(&from->css);
2534 new_id = css_id(&to->css);
2535
2536 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2537 mem_cgroup_swap_statistics(from, false);
2538 mem_cgroup_swap_statistics(to, true);
2539
2540
2541
2542
2543
2544
2545
2546
2547 mem_cgroup_get(to);
2548 if (need_fixup) {
2549 if (!mem_cgroup_is_root(from))
2550 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2551 mem_cgroup_put(from);
2552
2553
2554
2555
2556 if (!mem_cgroup_is_root(to))
2557 res_counter_uncharge(&to->res, PAGE_SIZE);
2558 }
2559 return 0;
2560 }
2561 return -EINVAL;
2562}
2563#else
/*
 * Stub used when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set: swap charges
 * cannot be moved, so always fail with -EINVAL.
 */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
	return -EINVAL;
}
2569#endif
2570
2571
2572
2573
2574
/*
 * Prepare memcg accounting for migrating @page to @newpage.
 *
 * If @page is charged, its memcg is charged once more on behalf of
 * @newpage and the charge is committed to @newpage's page_cgroup.  The
 * memcg is returned through @ptr.
 *
 * Returns 0 when memcg is disabled, when @page is not charged, or on
 * success; -ENOMEM when the extra charge fails or is bypassed.
 */
int mem_cgroup_prepare_migration(struct page *page,
	struct page *newpage, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype;
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
		/*
		 * For anonymous pages, flag the old page_cgroup as under
		 * migration; __mem_cgroup_uncharge_common() refuses to uncharge
		 * MAPPED/DROP charges while this flag is set.
		 */
		if (PageAnon(page))
			SetPageCgroupMigration(pc);
	}
	unlock_page_cgroup(pc);

	/* The old page was not charged: nothing to prepare. */
	if (!mem)
		return 0;

	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
	/* Drop the reference taken under the page_cgroup lock above. */
	css_put(&mem->css);
	if (ret || *ptr == NULL) {
		if (PageAnon(page)) {
			lock_page_cgroup(pc);
			ClearPageCgroupMigration(pc);
			unlock_page_cgroup(pc);
			/* Flag cleared: let the old page be uncharged if unmapped. */
			mem_cgroup_uncharge_page(page);
		}
		return -ENOMEM;
	}

	/* Commit the new charge to @newpage, typed after the old page. */
	pc = lookup_page_cgroup(newpage);
	if (PageAnon(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
	__mem_cgroup_commit_charge(mem, pc, ctype);
	return ret;
}
2662
2663
2664void mem_cgroup_end_migration(struct mem_cgroup *mem,
2665 struct page *oldpage, struct page *newpage)
2666{
2667 struct page *used, *unused;
2668 struct page_cgroup *pc;
2669
2670 if (!mem)
2671 return;
2672
2673 cgroup_exclude_rmdir(&mem->css);
2674
2675 if (oldpage->mapping) {
2676 used = oldpage;
2677 unused = newpage;
2678 } else {
2679 used = newpage;
2680 unused = oldpage;
2681 }
2682
2683
2684
2685
2686
2687 pc = lookup_page_cgroup(oldpage);
2688 lock_page_cgroup(pc);
2689 ClearPageCgroupMigration(pc);
2690 unlock_page_cgroup(pc);
2691
2692 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702 if (PageAnon(used))
2703 mem_cgroup_uncharge_page(used);
2704
2705
2706
2707
2708
2709
2710 cgroup_release_and_wakeup_rmdir(&mem->css);
2711}
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721int mem_cgroup_shmem_charge_fallback(struct page *page,
2722 struct mm_struct *mm,
2723 gfp_t gfp_mask)
2724{
2725 struct mem_cgroup *mem = NULL;
2726 int ret;
2727
2728 if (mem_cgroup_disabled())
2729 return 0;
2730
2731 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2732 if (!ret)
2733 mem_cgroup_cancel_charge_swapin(mem);
2734
2735 return ret;
2736}
2737
/*
 * Held by mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit()
 * while they compare the res and memsw limits, set the new limit and update
 * memsw_is_minimum.
 */
static DEFINE_MUTEX(set_limit_mutex);
2739
/*
 * Set the memory (res) limit of @memcg to @val, reclaiming if needed.
 *
 * Returns 0 on success, -EINVAL when @val exceeds the current memsw limit,
 * -EINTR when a signal is pending, or the last res_counter_set_limit()
 * error once the retry budget is exhausted.
 */
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count;
	u64 memswlimit, memlimit;
	int ret = 0;
	int children = mem_cgroup_count_children(memcg);
	u64 curusage, oldusage;
	int enlarge;

	/* The retry budget scales with the value of mem_cgroup_count_children(). */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

	enlarge = 0;
	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		/* Compare against memsw and set the limit under set_limit_mutex. */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}

		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit < val)
			enlarge = 1;

		ret = res_counter_set_limit(&memcg->res, val);
		if (!ret) {
			/* Remember whether the memsw limit now equals the res limit. */
			if (memswlimit == val)
				memcg->memsw_is_minimum = true;
			else
				memcg->memsw_is_minimum = false;
		}
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		/* Setting the limit failed: reclaim, then try again. */
		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
						MEM_CGROUP_RECLAIM_SHRINK);
		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);

		/* Only passes that made no progress consume the retry budget. */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	}
	/* The limit was raised: tasks waiting in memcg OOM may proceed. */
	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}
2808
/*
 * Set the memory+swap (memsw) limit of @memcg to @val, reclaiming if needed.
 *
 * Returns 0 on success, -EINVAL when @val is below the current res limit,
 * -EINTR when a signal is pending, or the last res_counter_set_limit()
 * error once the retry budget is exhausted.
 */
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
					unsigned long long val)
{
	int retry_count;
	u64 memlimit, memswlimit, oldusage, curusage;
	int children = mem_cgroup_count_children(memcg);
	int ret = -EBUSY;
	int enlarge = 0;

	/* The retry budget scales with the value of mem_cgroup_count_children(). */
	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		/* Compare against res and set the limit under set_limit_mutex. */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val)
			enlarge = 1;
		ret = res_counter_set_limit(&memcg->memsw, val);
		if (!ret) {
			/* Remember whether the memsw limit now equals the res limit. */
			if (memlimit == val)
				memcg->memsw_is_minimum = true;
			else
				memcg->memsw_is_minimum = false;
		}
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		/* Setting the limit failed: reclaim without swapping, then retry. */
		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
						MEM_CGROUP_RECLAIM_NOSWAP |
						MEM_CGROUP_RECLAIM_SHRINK);
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

		/* Only passes that made no progress consume the retry budget. */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	}
	/* The limit was raised: tasks waiting in memcg OOM may proceed. */
	if (!ret && enlarge)
		memcg_oom_recover(memcg);
	return ret;
}
2867
/*
 * Reclaim from memcgs that exceed their soft limit in @zone.
 *
 * @zone:     zone to reclaim from
 * @order:    allocation order; only order 0 is handled, others return 0
 * @gfp_mask: allocation context for the reclaim
 *
 * Returns the number of pages reclaimed as reported by
 * mem_cgroup_hierarchical_reclaim().
 */
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
						gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;
	unsigned long long excess;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	/*
	 * Loop until something is reclaimed, the tree has no further candidate,
	 * or the loop cap is exceeded.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
						gfp_mask,
						MEM_CGROUP_RECLAIM_SOFT);
		nr_reclaimed += reclaimed;
		spin_lock(&mctz->lock);

		/* Nothing reclaimed from this node: pick a different candidate. */
		next_mz = NULL;
		if (!reclaimed) {
			do {
				/*
				 * NOTE(review): the css_put() calls here and at the
				 * end of the function imply the lookup takes a css
				 * reference - confirm against
				 * __mem_cgroup_largest_soft_limit_node().
				 */
				next_mz =
				__mem_cgroup_largest_soft_limit_node(mctz);
				if (next_mz == mz) {
					css_put(&next_mz->mem->css);
					next_mz = NULL;
				} else
					break;
			} while (1);
		}
		/* Re-position mz in the tree according to its current excess. */
		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
		excess = res_counter_soft_limit_excess(&mz->mem->res);
		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
		spin_unlock(&mctz->lock);
		css_put(&mz->mem->css);
		loop++;
		/* Give up when there is no next candidate or the loop cap is hit. */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* Release the candidate that was selected but never processed. */
	if (next_mz)
		css_put(&next_mz->mem->css);
	return nr_reclaimed;
}
2957
2958
2959
2960
2961
/*
 * Move every page on one LRU list of @mem to @mem's parent.
 *
 * @mem:  memcg being emptied
 * @node: NUMA node of the list
 * @zid:  zone index within @node
 * @lru:  which LRU list to drain
 *
 * Returns 0 when the list was emptied, -EBUSY when pages remain after the
 * loop budget, or the last error from mem_cgroup_move_parent()
 * (the loop stops immediately on -ENOMEM).
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
				int node, int zid, enum lru_list lru)
{
	struct zone *zone;
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc, *busy;
	unsigned long flags, loop;
	struct list_head *list;
	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(mem, node, zid);
	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* Allow some extra iterations beyond the current list length. */
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			/* Same page failed last time: rotate it to the list head. */
			list_move(&pc->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* Remember the page so the next pass can skip over it. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}

	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}
3012
3013
3014
3015
3016
/*
 * Bring the usage of @mem down to zero.
 *
 * @mem:      memcg to empty
 * @free_all: when true, first try to free pages by reclaim before moving
 *            the remaining charges to the parent
 *
 * Returns 0 on success, -EBUSY when the cgroup still has tasks or children
 * (or reclaim was already tried once), -EINTR when a signal is pending.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = mem->css.cgroup;

	css_get(&mem->css);

	/* 'shrink' records that the reclaim pass below has already run once. */
	shrink = 0;

	if (free_all)
		goto try_to_free;
move_account:
	do {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;

		/* Drain LRU pagevecs and per-CPU stocks before walking the lists. */
		lru_add_drain_all();
		drain_all_stock_sync();
		ret = 0;
		for_each_node_state(node, N_HIGH_MEMORY) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list l;
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
							node, zid, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		memcg_oom_recover(mem);

		/* Moving to the parent hit -ENOMEM: fall back to reclaim. */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();

	} while (mem->res.usage > 0 || ret);
out:
	css_put(&mem->css);
	return ret;

try_to_free:

	/* Reclaim runs at most once, and only on a cgroup without tasks/children. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}

	lru_add_drain_all();

	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
						false, get_swappiness(mem));
		if (!progress) {
			nr_retries--;
			/* No progress: wait before the next attempt. */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

	}
	lru_add_drain();

	/* Whatever could not be freed is moved to the parent. */
	goto move_account;
}
3096
/*
 * Write handler that empties the memcg of @cont with free_all=true.
 * @event is unused.  Returns the result of mem_cgroup_force_empty().
 */
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
3101
3102
3103static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3104{
3105 return mem_cgroup_from_cont(cont)->use_hierarchy;
3106}
3107
3108static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3109 u64 val)
3110{
3111 int retval = 0;
3112 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3113 struct cgroup *parent = cont->parent;
3114 struct mem_cgroup *parent_mem = NULL;
3115
3116 if (parent)
3117 parent_mem = mem_cgroup_from_cont(parent);
3118
3119 cgroup_lock();
3120
3121
3122
3123
3124
3125
3126
3127
3128 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3129 (val == 1 || val == 0)) {
3130 if (list_empty(&cont->children))
3131 mem->use_hierarchy = val;
3132 else
3133 retval = -EBUSY;
3134 } else
3135 retval = -EINVAL;
3136 cgroup_unlock();
3137
3138 return retval;
3139}
3140
3141struct mem_cgroup_idx_data {
3142 s64 val;
3143 enum mem_cgroup_stat_index idx;
3144};
3145
3146static int
3147mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
3148{
3149 struct mem_cgroup_idx_data *d = data;
3150 d->val += mem_cgroup_read_stat(mem, d->idx);
3151 return 0;
3152}
3153
3154static void
3155mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3156 enum mem_cgroup_stat_index idx, s64 *val)
3157{
3158 struct mem_cgroup_idx_data d;
3159 d.idx = idx;
3160 d.val = 0;
3161 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3162 *val = d.val;
3163}
3164
3165static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3166{
3167 u64 idx_val, val;
3168
3169 if (!mem_cgroup_is_root(mem)) {
3170 if (!swap)
3171 return res_counter_read_u64(&mem->res, RES_USAGE);
3172 else
3173 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3174 }
3175
3176 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
3177 val = idx_val;
3178 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3179 val += idx_val;
3180
3181 if (swap) {
3182 mem_cgroup_get_recursive_idx_stat(mem,
3183 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3184 val += idx_val;
3185 }
3186
3187 return val << PAGE_SHIFT;
3188}
3189
3190static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3191{
3192 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3193 u64 val;
3194 int type, name;
3195
3196 type = MEMFILE_TYPE(cft->private);
3197 name = MEMFILE_ATTR(cft->private);
3198 switch (type) {
3199 case _MEM:
3200 if (name == RES_USAGE)
3201 val = mem_cgroup_usage(mem, false);
3202 else
3203 val = res_counter_read_u64(&mem->res, name);
3204 break;
3205 case _MEMSWAP:
3206 if (name == RES_USAGE)
3207 val = mem_cgroup_usage(mem, true);
3208 else
3209 val = res_counter_read_u64(&mem->memsw, name);
3210 break;
3211 default:
3212 BUG();
3213 break;
3214 }
3215 return val;
3216}
3217
3218
3219
3220
3221static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3222 const char *buffer)
3223{
3224 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3225 int type, name;
3226 unsigned long long val;
3227 int ret;
3228
3229 type = MEMFILE_TYPE(cft->private);
3230 name = MEMFILE_ATTR(cft->private);
3231 switch (name) {
3232 case RES_LIMIT:
3233 if (mem_cgroup_is_root(memcg)) {
3234 ret = -EINVAL;
3235 break;
3236 }
3237
3238 ret = res_counter_memparse_write_strategy(buffer, &val);
3239 if (ret)
3240 break;
3241 if (type == _MEM)
3242 ret = mem_cgroup_resize_limit(memcg, val);
3243 else
3244 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3245 break;
3246 case RES_SOFT_LIMIT:
3247 ret = res_counter_memparse_write_strategy(buffer, &val);
3248 if (ret)
3249 break;
3250
3251
3252
3253
3254
3255 if (type == _MEM)
3256 ret = res_counter_set_soft_limit(&memcg->res, val);
3257 else
3258 ret = -EINVAL;
3259 break;
3260 default:
3261 ret = -EINVAL;
3262 break;
3263 }
3264 return ret;
3265}
3266
3267static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3268 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3269{
3270 struct cgroup *cgroup;
3271 unsigned long long min_limit, min_memsw_limit, tmp;
3272
3273 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3274 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3275 cgroup = memcg->css.cgroup;
3276 if (!memcg->use_hierarchy)
3277 goto out;
3278
3279 while (cgroup->parent) {
3280 cgroup = cgroup->parent;
3281 memcg = mem_cgroup_from_cont(cgroup);
3282 if (!memcg->use_hierarchy)
3283 break;
3284 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3285 min_limit = min(min_limit, tmp);
3286 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3287 min_memsw_limit = min(min_memsw_limit, tmp);
3288 }
3289out:
3290 *mem_limit = min_limit;
3291 *memsw_limit = min_memsw_limit;
3292 return;
3293}
3294
3295static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3296{
3297 struct mem_cgroup *mem;
3298 int type, name;
3299
3300 mem = mem_cgroup_from_cont(cont);
3301 type = MEMFILE_TYPE(event);
3302 name = MEMFILE_ATTR(event);
3303 switch (name) {
3304 case RES_MAX_USAGE:
3305 if (type == _MEM)
3306 res_counter_reset_max(&mem->res);
3307 else
3308 res_counter_reset_max(&mem->memsw);
3309 break;
3310 case RES_FAILCNT:
3311 if (type == _MEM)
3312 res_counter_reset_failcnt(&mem->res);
3313 else
3314 res_counter_reset_failcnt(&mem->memsw);
3315 break;
3316 }
3317
3318 return 0;
3319}
3320
3321static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3322 struct cftype *cft)
3323{
3324 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3325}
3326
3327#ifdef CONFIG_MMU
3328static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3329 struct cftype *cft, u64 val)
3330{
3331 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3332
3333 if (val >= (1 << NR_MOVE_TYPE))
3334 return -EINVAL;
3335
3336
3337
3338
3339
3340 cgroup_lock();
3341 mem->move_charge_at_immigrate = val;
3342 cgroup_unlock();
3343
3344 return 0;
3345}
3346#else
3347static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3348 struct cftype *cft, u64 val)
3349{
3350 return -ENOSYS;
3351}
3352#endif
3353
3354
3355
/*
 * Indices into struct mcs_total_stat and memcg_stat_strings[]; the order
 * here must match the order of the name table.
 */
enum {
	MCS_CACHE = 0,
	MCS_RSS,
	MCS_FILE_MAPPED,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	NR_MCS_STAT,		/* number of entries, keep last */
};
3370
3371struct mcs_total_stat {
3372 s64 stat[NR_MCS_STAT];
3373};
3374
3375struct {
3376 char *local_name;
3377 char *total_name;
3378} memcg_stat_strings[NR_MCS_STAT] = {
3379 {"cache", "total_cache"},
3380 {"rss", "total_rss"},
3381 {"mapped_file", "total_mapped_file"},
3382 {"pgpgin", "total_pgpgin"},
3383 {"pgpgout", "total_pgpgout"},
3384 {"swap", "total_swap"},
3385 {"inactive_anon", "total_inactive_anon"},
3386 {"active_anon", "total_active_anon"},
3387 {"inactive_file", "total_inactive_file"},
3388 {"active_file", "total_active_file"},
3389 {"unevictable", "total_unevictable"}
3390};
3391
3392
3393static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3394{
3395 struct mcs_total_stat *s = data;
3396 s64 val;
3397
3398
3399 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3400 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3401 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3402 s->stat[MCS_RSS] += val * PAGE_SIZE;
3403 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3404 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3405 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3406 s->stat[MCS_PGPGIN] += val;
3407 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3408 s->stat[MCS_PGPGOUT] += val;
3409 if (do_swap_account) {
3410 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3411 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3412 }
3413
3414
3415 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3416 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3417 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3418 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3419 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3420 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3421 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3422 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3423 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3424 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3425 return 0;
3426}
3427
3428static void
3429mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3430{
3431 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3432}
3433
3434static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3435 struct cgroup_map_cb *cb)
3436{
3437 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3438 struct mcs_total_stat mystat;
3439 int i;
3440
3441 memset(&mystat, 0, sizeof(mystat));
3442 mem_cgroup_get_local_stat(mem_cont, &mystat);
3443
3444 for (i = 0; i < NR_MCS_STAT; i++) {
3445 if (i == MCS_SWAP && !do_swap_account)
3446 continue;
3447 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3448 }
3449
3450
3451 {
3452 unsigned long long limit, memsw_limit;
3453 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3454 cb->fill(cb, "hierarchical_memory_limit", limit);
3455 if (do_swap_account)
3456 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3457 }
3458
3459 memset(&mystat, 0, sizeof(mystat));
3460 mem_cgroup_get_total_stat(mem_cont, &mystat);
3461 for (i = 0; i < NR_MCS_STAT; i++) {
3462 if (i == MCS_SWAP && !do_swap_account)
3463 continue;
3464 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3465 }
3466
3467#ifdef CONFIG_DEBUG_VM
3468 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3469
3470 {
3471 int nid, zid;
3472 struct mem_cgroup_per_zone *mz;
3473 unsigned long recent_rotated[2] = {0, 0};
3474 unsigned long recent_scanned[2] = {0, 0};
3475
3476 for_each_online_node(nid)
3477 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3478 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3479
3480 recent_rotated[0] +=
3481 mz->reclaim_stat.recent_rotated[0];
3482 recent_rotated[1] +=
3483 mz->reclaim_stat.recent_rotated[1];
3484 recent_scanned[0] +=
3485 mz->reclaim_stat.recent_scanned[0];
3486 recent_scanned[1] +=
3487 mz->reclaim_stat.recent_scanned[1];
3488 }
3489 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3490 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3491 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3492 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3493 }
3494#endif
3495
3496 return 0;
3497}
3498
3499static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3500{
3501 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3502
3503 return get_swappiness(memcg);
3504}
3505
3506static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3507 u64 val)
3508{
3509 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3510 struct mem_cgroup *parent;
3511
3512 if (val > 100)
3513 return -EINVAL;
3514
3515 if (cgrp->parent == NULL)
3516 return -EINVAL;
3517
3518 parent = mem_cgroup_from_cont(cgrp->parent);
3519
3520 cgroup_lock();
3521
3522
3523 if ((parent->use_hierarchy) ||
3524 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3525 cgroup_unlock();
3526 return -EINVAL;
3527 }
3528
3529 spin_lock(&memcg->reclaim_param_lock);
3530 memcg->swappiness = val;
3531 spin_unlock(&memcg->reclaim_param_lock);
3532
3533 cgroup_unlock();
3534
3535 return 0;
3536}
3537
3538static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3539{
3540 struct mem_cgroup_threshold_ary *t;
3541 u64 usage;
3542 int i;
3543
3544 rcu_read_lock();
3545 if (!swap)
3546 t = rcu_dereference(memcg->thresholds.primary);
3547 else
3548 t = rcu_dereference(memcg->memsw_thresholds.primary);
3549
3550 if (!t)
3551 goto unlock;
3552
3553 usage = mem_cgroup_usage(memcg, swap);
3554
3555
3556
3557
3558
3559
3560 i = t->current_threshold;
3561
3562
3563
3564
3565
3566
3567
3568 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3569 eventfd_signal(t->entries[i].eventfd, 1);
3570
3571
3572 i++;
3573
3574
3575
3576
3577
3578
3579
3580 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3581 eventfd_signal(t->entries[i].eventfd, 1);
3582
3583
3584 t->current_threshold = i - 1;
3585unlock:
3586 rcu_read_unlock();
3587}
3588
3589static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3590{
3591 while (memcg) {
3592 __mem_cgroup_threshold(memcg, false);
3593 if (do_swap_account)
3594 __mem_cgroup_threshold(memcg, true);
3595
3596 memcg = parent_mem_cgroup(memcg);
3597 }
3598}
3599
3600static int compare_thresholds(const void *a, const void *b)
3601{
3602 const struct mem_cgroup_threshold *_a = a;
3603 const struct mem_cgroup_threshold *_b = b;
3604
3605 return _a->threshold - _b->threshold;
3606}
3607
3608static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3609{
3610 struct mem_cgroup_eventfd_list *ev;
3611
3612 list_for_each_entry(ev, &mem->oom_notify, list)
3613 eventfd_signal(ev->eventfd, 1);
3614 return 0;
3615}
3616
3617static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3618{
3619 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3620}
3621
3622static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3623 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3624{
3625 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3626 struct mem_cgroup_thresholds *thresholds;
3627 struct mem_cgroup_threshold_ary *new;
3628 int type = MEMFILE_TYPE(cft->private);
3629 u64 threshold, usage;
3630 int i, size, ret;
3631
3632 ret = res_counter_memparse_write_strategy(args, &threshold);
3633 if (ret)
3634 return ret;
3635
3636 mutex_lock(&memcg->thresholds_lock);
3637
3638 if (type == _MEM)
3639 thresholds = &memcg->thresholds;
3640 else if (type == _MEMSWAP)
3641 thresholds = &memcg->memsw_thresholds;
3642 else
3643 BUG();
3644
3645 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3646
3647
3648 if (thresholds->primary)
3649 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3650
3651 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3652
3653
3654 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3655 GFP_KERNEL);
3656 if (!new) {
3657 ret = -ENOMEM;
3658 goto unlock;
3659 }
3660 new->size = size;
3661
3662
3663 if (thresholds->primary) {
3664 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3665 sizeof(struct mem_cgroup_threshold));
3666 }
3667
3668
3669 new->entries[size - 1].eventfd = eventfd;
3670 new->entries[size - 1].threshold = threshold;
3671
3672
3673 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3674 compare_thresholds, NULL);
3675
3676
3677 new->current_threshold = -1;
3678 for (i = 0; i < size; i++) {
3679 if (new->entries[i].threshold < usage) {
3680
3681
3682
3683
3684
3685 ++new->current_threshold;
3686 }
3687 }
3688
3689
3690 kfree(thresholds->spare);
3691 thresholds->spare = thresholds->primary;
3692
3693 rcu_assign_pointer(thresholds->primary, new);
3694
3695
3696 synchronize_rcu();
3697
3698unlock:
3699 mutex_unlock(&memcg->thresholds_lock);
3700
3701 return ret;
3702}
3703
3704static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3705 struct cftype *cft, struct eventfd_ctx *eventfd)
3706{
3707 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3708 struct mem_cgroup_thresholds *thresholds;
3709 struct mem_cgroup_threshold_ary *new;
3710 int type = MEMFILE_TYPE(cft->private);
3711 u64 usage;
3712 int i, j, size;
3713
3714 mutex_lock(&memcg->thresholds_lock);
3715 if (type == _MEM)
3716 thresholds = &memcg->thresholds;
3717 else if (type == _MEMSWAP)
3718 thresholds = &memcg->memsw_thresholds;
3719 else
3720 BUG();
3721
3722
3723
3724
3725
3726 BUG_ON(!thresholds);
3727
3728 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3729
3730
3731 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3732
3733
3734 size = 0;
3735 for (i = 0; i < thresholds->primary->size; i++) {
3736 if (thresholds->primary->entries[i].eventfd != eventfd)
3737 size++;
3738 }
3739
3740 new = thresholds->spare;
3741
3742
3743 if (!size) {
3744 kfree(new);
3745 new = NULL;
3746 goto swap_buffers;
3747 }
3748
3749 new->size = size;
3750
3751
3752 new->current_threshold = -1;
3753 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3754 if (thresholds->primary->entries[i].eventfd == eventfd)
3755 continue;
3756
3757 new->entries[j] = thresholds->primary->entries[i];
3758 if (new->entries[j].threshold < usage) {
3759
3760
3761
3762
3763
3764 ++new->current_threshold;
3765 }
3766 j++;
3767 }
3768
3769swap_buffers:
3770
3771 thresholds->spare = thresholds->primary;
3772 rcu_assign_pointer(thresholds->primary, new);
3773
3774
3775 synchronize_rcu();
3776
3777 mutex_unlock(&memcg->thresholds_lock);
3778}
3779
3780static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3781 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3782{
3783 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3784 struct mem_cgroup_eventfd_list *event;
3785 int type = MEMFILE_TYPE(cft->private);
3786
3787 BUG_ON(type != _OOM_TYPE);
3788 event = kmalloc(sizeof(*event), GFP_KERNEL);
3789 if (!event)
3790 return -ENOMEM;
3791
3792 mutex_lock(&memcg_oom_mutex);
3793
3794 event->eventfd = eventfd;
3795 list_add(&event->list, &memcg->oom_notify);
3796
3797
3798 if (atomic_read(&memcg->oom_lock))
3799 eventfd_signal(eventfd, 1);
3800 mutex_unlock(&memcg_oom_mutex);
3801
3802 return 0;
3803}
3804
3805static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3806 struct cftype *cft, struct eventfd_ctx *eventfd)
3807{
3808 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3809 struct mem_cgroup_eventfd_list *ev, *tmp;
3810 int type = MEMFILE_TYPE(cft->private);
3811
3812 BUG_ON(type != _OOM_TYPE);
3813
3814 mutex_lock(&memcg_oom_mutex);
3815
3816 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3817 if (ev->eventfd == eventfd) {
3818 list_del(&ev->list);
3819 kfree(ev);
3820 }
3821 }
3822
3823 mutex_unlock(&memcg_oom_mutex);
3824}
3825
3826static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3827 struct cftype *cft, struct cgroup_map_cb *cb)
3828{
3829 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3830
3831 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
3832
3833 if (atomic_read(&mem->oom_lock))
3834 cb->fill(cb, "under_oom", 1);
3835 else
3836 cb->fill(cb, "under_oom", 0);
3837 return 0;
3838}
3839
3840static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3841 struct cftype *cft, u64 val)
3842{
3843 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3844 struct mem_cgroup *parent;
3845
3846
3847 if (!cgrp->parent || !((val == 0) || (val == 1)))
3848 return -EINVAL;
3849
3850 parent = mem_cgroup_from_cont(cgrp->parent);
3851
3852 cgroup_lock();
3853
3854 if ((parent->use_hierarchy) ||
3855 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
3856 cgroup_unlock();
3857 return -EINVAL;
3858 }
3859 mem->oom_kill_disable = val;
3860 if (!val)
3861 memcg_oom_recover(mem);
3862 cgroup_unlock();
3863 return 0;
3864}
3865
3866static struct cftype mem_cgroup_files[] = {
3867 {
3868 .name = "usage_in_bytes",
3869 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3870 .read_u64 = mem_cgroup_read,
3871 .register_event = mem_cgroup_usage_register_event,
3872 .unregister_event = mem_cgroup_usage_unregister_event,
3873 },
3874 {
3875 .name = "max_usage_in_bytes",
3876 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3877 .trigger = mem_cgroup_reset,
3878 .read_u64 = mem_cgroup_read,
3879 },
3880 {
3881 .name = "limit_in_bytes",
3882 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3883 .write_string = mem_cgroup_write,
3884 .read_u64 = mem_cgroup_read,
3885 },
3886 {
3887 .name = "soft_limit_in_bytes",
3888 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3889 .write_string = mem_cgroup_write,
3890 .read_u64 = mem_cgroup_read,
3891 },
3892 {
3893 .name = "failcnt",
3894 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3895 .trigger = mem_cgroup_reset,
3896 .read_u64 = mem_cgroup_read,
3897 },
3898 {
3899 .name = "stat",
3900 .read_map = mem_control_stat_show,
3901 },
3902 {
3903 .name = "force_empty",
3904 .trigger = mem_cgroup_force_empty_write,
3905 },
3906 {
3907 .name = "use_hierarchy",
3908 .write_u64 = mem_cgroup_hierarchy_write,
3909 .read_u64 = mem_cgroup_hierarchy_read,
3910 },
3911 {
3912 .name = "swappiness",
3913 .read_u64 = mem_cgroup_swappiness_read,
3914 .write_u64 = mem_cgroup_swappiness_write,
3915 },
3916 {
3917 .name = "move_charge_at_immigrate",
3918 .read_u64 = mem_cgroup_move_charge_read,
3919 .write_u64 = mem_cgroup_move_charge_write,
3920 },
3921 {
3922 .name = "oom_control",
3923 .read_map = mem_cgroup_oom_control_read,
3924 .write_u64 = mem_cgroup_oom_control_write,
3925 .register_event = mem_cgroup_oom_register_event,
3926 .unregister_event = mem_cgroup_oom_unregister_event,
3927 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3928 },
3929};
3930
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Control files for the memory+swap counter. */
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

/*
 * Add the memsw.* files to @cont when swap accounting is active.
 * Returns 0 when swap accounting is off, otherwise the result of
 * cgroup_add_files().
 */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
/* Swap accounting is not built in: there is nothing to register. */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif
3973
3974static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3975{
3976 struct mem_cgroup_per_node *pn;
3977 struct mem_cgroup_per_zone *mz;
3978 enum lru_list l;
3979 int zone, tmp = node;
3980
3981
3982
3983
3984
3985
3986
3987
3988 if (!node_state(node, N_NORMAL_MEMORY))
3989 tmp = -1;
3990 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3991 if (!pn)
3992 return 1;
3993
3994 mem->info.nodeinfo[node] = pn;
3995 memset(pn, 0, sizeof(*pn));
3996
3997 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3998 mz = &pn->zoneinfo[zone];
3999 for_each_lru(l)
4000 INIT_LIST_HEAD(&mz->lists[l]);
4001 mz->usage_in_excess = 0;
4002 mz->on_tree = false;
4003 mz->mem = mem;
4004 }
4005 return 0;
4006}
4007
4008static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4009{
4010 kfree(mem->info.nodeinfo[node]);
4011}
4012
4013static struct mem_cgroup *mem_cgroup_alloc(void)
4014{
4015 struct mem_cgroup *mem;
4016 int size = sizeof(struct mem_cgroup);
4017
4018
4019 if (size < PAGE_SIZE)
4020 mem = kmalloc(size, GFP_KERNEL);
4021 else
4022 mem = vmalloc(size);
4023
4024 if (!mem)
4025 return NULL;
4026
4027 memset(mem, 0, size);
4028 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4029 if (!mem->stat) {
4030 if (size < PAGE_SIZE)
4031 kfree(mem);
4032 else
4033 vfree(mem);
4034 mem = NULL;
4035 }
4036 return mem;
4037}
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050static void __mem_cgroup_free(struct mem_cgroup *mem)
4051{
4052 int node;
4053
4054 mem_cgroup_remove_from_trees(mem);
4055 free_css_id(&mem_cgroup_subsys, &mem->css);
4056
4057 for_each_node_state(node, N_POSSIBLE)
4058 free_mem_cgroup_per_zone_info(mem, node);
4059
4060 free_percpu(mem->stat);
4061 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4062 kfree(mem);
4063 else
4064 vfree(mem);
4065}
4066
4067static void mem_cgroup_get(struct mem_cgroup *mem)
4068{
4069 atomic_inc(&mem->refcnt);
4070}
4071
4072static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4073{
4074 if (atomic_sub_and_test(count, &mem->refcnt)) {
4075 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4076 __mem_cgroup_free(mem);
4077 if (parent)
4078 mem_cgroup_put(parent);
4079 }
4080}
4081
/* Drop a single reference on @mem. */
static void mem_cgroup_put(struct mem_cgroup *mem)
{
	const int one_ref = 1;

	__mem_cgroup_put(mem, one_ref);
}
4086
4087
4088
4089
4090static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4091{
4092 if (!mem->res.parent)
4093 return NULL;
4094 return mem_cgroup_from_res_counter(mem->res.parent, res);
4095}
4096
4097#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4098static void __init enable_swap_cgroup(void)
4099{
4100 if (!mem_cgroup_disabled() && really_do_swap_account)
4101 do_swap_account = 1;
4102}
4103#else
4104static void __init enable_swap_cgroup(void)
4105{
4106}
4107#endif
4108
4109static int mem_cgroup_soft_limit_tree_init(void)
4110{
4111 struct mem_cgroup_tree_per_node *rtpn;
4112 struct mem_cgroup_tree_per_zone *rtpz;
4113 int tmp, node, zone;
4114
4115 for_each_node_state(node, N_POSSIBLE) {
4116 tmp = node;
4117 if (!node_state(node, N_NORMAL_MEMORY))
4118 tmp = -1;
4119 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4120 if (!rtpn)
4121 return 1;
4122
4123 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4124
4125 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4126 rtpz = &rtpn->rb_tree_per_zone[zone];
4127 rtpz->rb_root = RB_ROOT;
4128 spin_lock_init(&rtpz->lock);
4129 }
4130 }
4131 return 0;
4132}
4133
4134static struct cgroup_subsys_state * __ref
4135mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4136{
4137 struct mem_cgroup *mem, *parent;
4138 long error = -ENOMEM;
4139 int node;
4140
4141 mem = mem_cgroup_alloc();
4142 if (!mem)
4143 return ERR_PTR(error);
4144
4145 for_each_node_state(node, N_POSSIBLE)
4146 if (alloc_mem_cgroup_per_zone_info(mem, node))
4147 goto free_out;
4148
4149
4150 if (cont->parent == NULL) {
4151 int cpu;
4152 enable_swap_cgroup();
4153 parent = NULL;
4154 root_mem_cgroup = mem;
4155 if (mem_cgroup_soft_limit_tree_init())
4156 goto free_out;
4157 for_each_possible_cpu(cpu) {
4158 struct memcg_stock_pcp *stock =
4159 &per_cpu(memcg_stock, cpu);
4160 INIT_WORK(&stock->work, drain_local_stock);
4161 }
4162 hotcpu_notifier(memcg_stock_cpu_callback, 0);
4163 } else {
4164 parent = mem_cgroup_from_cont(cont->parent);
4165 mem->use_hierarchy = parent->use_hierarchy;
4166 mem->oom_kill_disable = parent->oom_kill_disable;
4167 }
4168
4169 if (parent && parent->use_hierarchy) {
4170 res_counter_init(&mem->res, &parent->res);
4171 res_counter_init(&mem->memsw, &parent->memsw);
4172
4173
4174
4175
4176
4177
4178 mem_cgroup_get(parent);
4179 } else {
4180 res_counter_init(&mem->res, NULL);
4181 res_counter_init(&mem->memsw, NULL);
4182 }
4183 mem->last_scanned_child = 0;
4184 spin_lock_init(&mem->reclaim_param_lock);
4185 INIT_LIST_HEAD(&mem->oom_notify);
4186
4187 if (parent)
4188 mem->swappiness = get_swappiness(parent);
4189 atomic_set(&mem->refcnt, 1);
4190 mem->move_charge_at_immigrate = 0;
4191 mutex_init(&mem->thresholds_lock);
4192 return &mem->css;
4193free_out:
4194 __mem_cgroup_free(mem);
4195 root_mem_cgroup = NULL;
4196 return ERR_PTR(error);
4197}
4198
4199static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4200 struct cgroup *cont)
4201{
4202 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4203
4204 return mem_cgroup_force_empty(mem, false);
4205}
4206
/* destroy callback: drop one reference on the cgroup's mem_cgroup. */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_put(mem_cgroup_from_cont(cont));
}
4214
4215static int mem_cgroup_populate(struct cgroup_subsys *ss,
4216 struct cgroup *cont)
4217{
4218 int ret;
4219
4220 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4221 ARRAY_SIZE(mem_cgroup_files));
4222
4223 if (!ret)
4224 ret = register_memsw_files(cont, ss);
4225 return ret;
4226}
4227
4228#ifdef CONFIG_MMU
4229
4230#define PRECHARGE_COUNT_AT_ONCE 256
4231static int mem_cgroup_do_precharge(unsigned long count)
4232{
4233 int ret = 0;
4234 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4235 struct mem_cgroup *mem = mc.to;
4236
4237 if (mem_cgroup_is_root(mem)) {
4238 mc.precharge += count;
4239
4240 return ret;
4241 }
4242
4243 if (count > 1) {
4244 struct res_counter *dummy;
4245
4246
4247
4248
4249
4250
4251 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4252 goto one_by_one;
4253 if (do_swap_account && res_counter_charge(&mem->memsw,
4254 PAGE_SIZE * count, &dummy)) {
4255 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4256 goto one_by_one;
4257 }
4258 mc.precharge += count;
4259 return ret;
4260 }
4261one_by_one:
4262
4263 while (count--) {
4264 if (signal_pending(current)) {
4265 ret = -EINTR;
4266 break;
4267 }
4268 if (!batch_count--) {
4269 batch_count = PRECHARGE_COUNT_AT_ONCE;
4270 cond_resched();
4271 }
4272 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
4273 if (ret || !mem)
4274
4275 return -ENOMEM;
4276 mc.precharge++;
4277 }
4278 return ret;
4279}
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299union mc_target {
4300 struct page *page;
4301 swp_entry_t ent;
4302};
4303
/* Return values of is_target_pte_for_mc(). */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* nothing to move; must stay 0 */
	MC_TARGET_PAGE,		/* union mc_target holds a page */
	MC_TARGET_SWAP,		/* union mc_target holds a swap entry */
};
4309
4310static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4311 unsigned long addr, pte_t ptent)
4312{
4313 struct page *page = vm_normal_page(vma, addr, ptent);
4314
4315 if (!page || !page_mapped(page))
4316 return NULL;
4317 if (PageAnon(page)) {
4318
4319 if (!move_anon() || page_mapcount(page) > 2)
4320 return NULL;
4321 } else if (!move_file())
4322
4323 return NULL;
4324 if (!get_page_unless_zero(page))
4325 return NULL;
4326
4327 return page;
4328}
4329
4330static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4331 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4332{
4333 int usage_count;
4334 struct page *page = NULL;
4335 swp_entry_t ent = pte_to_swp_entry(ptent);
4336
4337 if (!move_anon() || non_swap_entry(ent))
4338 return NULL;
4339 usage_count = mem_cgroup_count_swap_user(ent, &page);
4340 if (usage_count > 1) {
4341 if (page)
4342 put_page(page);
4343 return NULL;
4344 }
4345 if (do_swap_account)
4346 entry->val = ent.val;
4347
4348 return page;
4349}
4350
4351static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4352 unsigned long addr, pte_t ptent, swp_entry_t *entry)
4353{
4354 struct page *page = NULL;
4355 struct inode *inode;
4356 struct address_space *mapping;
4357 pgoff_t pgoff;
4358
4359 if (!vma->vm_file)
4360 return NULL;
4361 if (!move_file())
4362 return NULL;
4363
4364 inode = vma->vm_file->f_path.dentry->d_inode;
4365 mapping = vma->vm_file->f_mapping;
4366 if (pte_none(ptent))
4367 pgoff = linear_page_index(vma, addr);
4368 else
4369 pgoff = pte_to_pgoff(ptent);
4370
4371
4372 if (!mapping_cap_swap_backed(mapping)) {
4373 page = find_get_page(mapping, pgoff);
4374 } else {
4375 swp_entry_t ent;
4376 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4377 if (do_swap_account)
4378 entry->val = ent.val;
4379 }
4380
4381 return page;
4382}
4383
4384static int is_target_pte_for_mc(struct vm_area_struct *vma,
4385 unsigned long addr, pte_t ptent, union mc_target *target)
4386{
4387 struct page *page = NULL;
4388 struct page_cgroup *pc;
4389 int ret = 0;
4390 swp_entry_t ent = { .val = 0 };
4391
4392 if (pte_present(ptent))
4393 page = mc_handle_present_pte(vma, addr, ptent);
4394 else if (is_swap_pte(ptent))
4395 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4396 else if (pte_none(ptent) || pte_file(ptent))
4397 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4398
4399 if (!page && !ent.val)
4400 return 0;
4401 if (page) {
4402 pc = lookup_page_cgroup(page);
4403
4404
4405
4406
4407
4408 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4409 ret = MC_TARGET_PAGE;
4410 if (target)
4411 target->page = page;
4412 }
4413 if (!ret || !target)
4414 put_page(page);
4415 }
4416
4417 if (ent.val && !ret &&
4418 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4419 ret = MC_TARGET_SWAP;
4420 if (target)
4421 target->ent = ent;
4422 }
4423 return ret;
4424}
4425
4426static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4427 unsigned long addr, unsigned long end,
4428 struct mm_walk *walk)
4429{
4430 struct vm_area_struct *vma = walk->private;
4431 pte_t *pte;
4432 spinlock_t *ptl;
4433
4434 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4435 for (; addr != end; pte++, addr += PAGE_SIZE)
4436 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4437 mc.precharge++;
4438 pte_unmap_unlock(pte - 1, ptl);
4439 cond_resched();
4440
4441 return 0;
4442}
4443
4444static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4445{
4446 unsigned long precharge;
4447 struct vm_area_struct *vma;
4448
4449
4450 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4451 struct mm_walk mem_cgroup_count_precharge_walk = {
4452 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4453 .mm = mm,
4454 .private = vma,
4455 };
4456 if (is_vm_hugetlb_page(vma))
4457 continue;
4458 walk_page_range(vma->vm_start, vma->vm_end,
4459 &mem_cgroup_count_precharge_walk);
4460 }
4461
4462 precharge = mc.precharge;
4463 mc.precharge = 0;
4464
4465 return precharge;
4466}
4467
/*
 * Count the move targets in @mm and hand that count to
 * mem_cgroup_do_precharge(), returning its result.
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long nr_targets = mem_cgroup_count_precharge(mm);

	return mem_cgroup_do_precharge(nr_targets);
}
4472
4473static void mem_cgroup_clear_mc(void)
4474{
4475 struct mem_cgroup *from = mc.from;
4476 struct mem_cgroup *to = mc.to;
4477
4478
4479 if (mc.precharge) {
4480 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4481 mc.precharge = 0;
4482 }
4483
4484
4485
4486
4487 if (mc.moved_charge) {
4488 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4489 mc.moved_charge = 0;
4490 }
4491
4492 if (mc.moved_swap) {
4493
4494 if (!mem_cgroup_is_root(mc.from))
4495 res_counter_uncharge(&mc.from->memsw,
4496 PAGE_SIZE * mc.moved_swap);
4497 __mem_cgroup_put(mc.from, mc.moved_swap);
4498
4499 if (!mem_cgroup_is_root(mc.to)) {
4500
4501
4502
4503
4504 res_counter_uncharge(&mc.to->res,
4505 PAGE_SIZE * mc.moved_swap);
4506 }
4507
4508
4509 mc.moved_swap = 0;
4510 }
4511 if (mc.mm) {
4512 up_read(&mc.mm->mmap_sem);
4513 mmput(mc.mm);
4514 }
4515 spin_lock(&mc.lock);
4516 mc.from = NULL;
4517 mc.to = NULL;
4518 spin_unlock(&mc.lock);
4519 mc.moving_task = NULL;
4520 mc.mm = NULL;
4521 memcg_oom_recover(from);
4522 memcg_oom_recover(to);
4523 wake_up_all(&mc.waitq);
4524}
4525
4526static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4527 struct cgroup *cgroup,
4528 struct task_struct *p,
4529 bool threadgroup)
4530{
4531 int ret = 0;
4532 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4533
4534 if (mem->move_charge_at_immigrate) {
4535 struct mm_struct *mm;
4536 struct mem_cgroup *from = mem_cgroup_from_task(p);
4537
4538 VM_BUG_ON(from == mem);
4539
4540 mm = get_task_mm(p);
4541 if (!mm)
4542 return 0;
4543
4544 if (mm->owner == p) {
4545
4546
4547
4548
4549
4550 down_read(&mm->mmap_sem);
4551
4552 VM_BUG_ON(mc.from);
4553 VM_BUG_ON(mc.to);
4554 VM_BUG_ON(mc.precharge);
4555 VM_BUG_ON(mc.moved_charge);
4556 VM_BUG_ON(mc.moved_swap);
4557 VM_BUG_ON(mc.moving_task);
4558 VM_BUG_ON(mc.mm);
4559
4560 spin_lock(&mc.lock);
4561 mc.from = from;
4562 mc.to = mem;
4563 mc.precharge = 0;
4564 mc.moved_charge = 0;
4565 mc.moved_swap = 0;
4566 spin_unlock(&mc.lock);
4567 mc.moving_task = current;
4568 mc.mm = mm;
4569
4570 ret = mem_cgroup_precharge_mc(mm);
4571 if (ret)
4572 mem_cgroup_clear_mc();
4573
4574 } else
4575 mmput(mm);
4576 }
4577 return ret;
4578}
4579
4580static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4581 struct cgroup *cgroup,
4582 struct task_struct *p,
4583 bool threadgroup)
4584{
4585 mem_cgroup_clear_mc();
4586}
4587
4588static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4589 unsigned long addr, unsigned long end,
4590 struct mm_walk *walk)
4591{
4592 int ret = 0;
4593 struct vm_area_struct *vma = walk->private;
4594 pte_t *pte;
4595 spinlock_t *ptl;
4596
4597retry:
4598 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4599 for (; addr != end; addr += PAGE_SIZE) {
4600 pte_t ptent = *(pte++);
4601 union mc_target target;
4602 int type;
4603 struct page *page;
4604 struct page_cgroup *pc;
4605 swp_entry_t ent;
4606
4607 if (!mc.precharge)
4608 break;
4609
4610 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4611 switch (type) {
4612 case MC_TARGET_PAGE:
4613 page = target.page;
4614 if (isolate_lru_page(page))
4615 goto put;
4616 pc = lookup_page_cgroup(page);
4617 if (!mem_cgroup_move_account(pc,
4618 mc.from, mc.to, false)) {
4619 mc.precharge--;
4620
4621 mc.moved_charge++;
4622 }
4623 putback_lru_page(page);
4624put:
4625 put_page(page);
4626 break;
4627 case MC_TARGET_SWAP:
4628 ent = target.ent;
4629 if (!mem_cgroup_move_swap_account(ent,
4630 mc.from, mc.to, false)) {
4631 mc.precharge--;
4632
4633 mc.moved_swap++;
4634 }
4635 break;
4636 default:
4637 break;
4638 }
4639 }
4640 pte_unmap_unlock(pte - 1, ptl);
4641 cond_resched();
4642
4643 if (addr != end) {
4644
4645
4646
4647
4648
4649
4650 ret = mem_cgroup_do_precharge(1);
4651 if (!ret)
4652 goto retry;
4653 }
4654
4655 return ret;
4656}
4657
4658static void mem_cgroup_move_charge(struct mm_struct *mm)
4659{
4660 struct vm_area_struct *vma;
4661
4662 lru_add_drain_all();
4663
4664 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4665 int ret;
4666 struct mm_walk mem_cgroup_move_charge_walk = {
4667 .pmd_entry = mem_cgroup_move_charge_pte_range,
4668 .mm = mm,
4669 .private = vma,
4670 };
4671 if (is_vm_hugetlb_page(vma))
4672 continue;
4673 ret = walk_page_range(vma->vm_start, vma->vm_end,
4674 &mem_cgroup_move_charge_walk);
4675 if (ret)
4676
4677
4678
4679
4680 break;
4681 }
4682}
4683
4684static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4685 struct cgroup *cont,
4686 struct cgroup *old_cont,
4687 struct task_struct *p,
4688 bool threadgroup)
4689{
4690 if (!mc.mm)
4691
4692 return;
4693
4694 mem_cgroup_move_charge(mc.mm);
4695 mem_cgroup_clear_mc();
4696}
4697#else
4698static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4699 struct cgroup *cgroup,
4700 struct task_struct *p,
4701 bool threadgroup)
4702{
4703 return 0;
4704}
4705static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4706 struct cgroup *cgroup,
4707 struct task_struct *p,
4708 bool threadgroup)
4709{
4710}
4711static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4712 struct cgroup *cont,
4713 struct cgroup *old_cont,
4714 struct task_struct *p,
4715 bool threadgroup)
4716{
4717}
4718#endif
4719
4720struct cgroup_subsys mem_cgroup_subsys = {
4721 .name = "memory",
4722 .subsys_id = mem_cgroup_subsys_id,
4723 .create = mem_cgroup_create,
4724 .pre_destroy = mem_cgroup_pre_destroy,
4725 .destroy = mem_cgroup_destroy,
4726 .populate = mem_cgroup_populate,
4727 .can_attach = mem_cgroup_can_attach,
4728 .cancel_attach = mem_cgroup_cancel_attach,
4729 .attach = mem_cgroup_move_task,
4730 .early_init = 0,
4731 .use_id = 1,
4732};
4733
4734#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4735
4736static int __init disable_swap_account(char *s)
4737{
4738 really_do_swap_account = 0;
4739 return 1;
4740}
4741__setup("noswapaccount", disable_swap_account);
4742#endif
4743