1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/mutex.h>
37#include <linux/rbtree.h>
38#include <linux/slab.h>
39#include <linux/swap.h>
40#include <linux/swapops.h>
41#include <linux/spinlock.h>
42#include <linux/eventfd.h>
43#include <linux/sort.h>
44#include <linux/fs.h>
45#include <linux/seq_file.h>
46#include <linux/vmalloc.h>
47#include <linux/mm_inline.h>
48#include <linux/page_cgroup.h>
49#include <linux/cpu.h>
50#include <linux/oom.h>
51#include "internal.h"
52
53#include <asm/uaccess.h>
54
55#include <trace/events/vmscan.h>
56
/* cgroup subsystem descriptor for the memory controller. */
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
/* NOTE(review): presumably the retry budget for reclaim on a failed charge; the user lies outside this chunk. */
#define MEM_CGROUP_RECLAIM_RETRIES 5
/* The root memory cgroup; mem_cgroup_is_root() compares against it. */
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

/* Non-zero when memory+swap accounting is in effect (see mem_cgroup_margin()). */
int do_swap_account __read_mostly;


/* Init-time default for swap accounting, selected by the Kconfig option below. */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
/* Swap accounting compiled out: constant zero lets dependent code be dropped. */
#define do_swap_account (0)
#endif
75
76
77
78
79
/*
 * Indexes into mem_cgroup_stat_cpu.count[] (per-cpu signed counters).
 */
enum mem_cgroup_stat_index {



	MEM_CGROUP_STAT_CACHE,		/* pages charged as file/cache */
	MEM_CGROUP_STAT_RSS,		/* pages charged as non-file (see mem_cgroup_charge_statistics()) */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* NOTE(review): presumably mapped file pages; users not in this chunk */
	MEM_CGROUP_STAT_SWAPOUT,	/* adjusted by +/-1 in mem_cgroup_swap_statistics() */
	MEM_CGROUP_STAT_DATA,		/* NOTE(review): looks like a marker separating data counters from the entries below */
	MEM_CGROUP_ON_MOVE,		/* raised between mem_cgroup_start_move() and mem_cgroup_end_move() */
	MEM_CGROUP_STAT_NSTATS,		/* number of entries; sizes count[] */
};
92
/*
 * Indexes into mem_cgroup_stat_cpu.events[] (per-cpu unsigned counters).
 */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* incremented on each charge (nr_pages > 0) */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* incremented on each uncharge */
	MEM_CGROUP_EVENTS_COUNT,	/* running sum of |nr_pages|; drives the event targets */
	MEM_CGROUP_EVENTS_PGFAULT,	/* bumped by mem_cgroup_pgfault() */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* bumped by mem_cgroup_pgmajfault() */
	MEM_CGROUP_EVENTS_NSTATS,	/* number of entries; sizes events[] */
};
101
102
103
104
105
106
/*
 * Periodic checks driven by MEM_CGROUP_EVENTS_COUNT. Each target stores the
 * counter value at which its check should next run (see
 * __memcg_event_check() and __mem_cgroup_target_update()).
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,	/* threshold notification check */
	MEM_CGROUP_TARGET_SOFTLIMIT,	/* soft limit tree update */
	MEM_CGROUP_TARGET_NUMAINFO,	/* NUMA scan-node information refresh */
	MEM_CGROUP_NTARGETS,		/* number of entries; sizes targets[] */
};
/* Event-count intervals added to the current count when a target is re-armed. */
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
#define NUMAINFO_EVENTS_TARGET (1024)
116
/*
 * Statistics block. One instance exists per cpu (mem_cgroup.stat) and one
 * more is embedded in mem_cgroup as nocpu_base.
 */
struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];		/* indexed by enum mem_cgroup_stat_index */
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];	/* indexed by enum mem_cgroup_events_index */
	unsigned long targets[MEM_CGROUP_NTARGETS];	/* indexed by enum mem_cgroup_events_target */
};
122
123
124
125
/*
 * Per-(node, zone) state of one memory cgroup.
 */
struct mem_cgroup_per_zone {



	struct list_head lists[NR_LRU_LISTS];	/* per-LRU lists of page_cgroup entries (linked via pc->lru) */
	unsigned long count[NR_LRU_LISTS];	/* pages accounted on each LRU; see MEM_CGROUP_ZSTAT() */

	struct zone_reclaim_stat reclaim_stat;	/* returned by mem_cgroup_get_reclaim_stat() */
	struct rb_node tree_node;		/* linkage in the soft limit rb-tree */
	unsigned long long usage_in_excess;	/* sort key in the soft limit tree */

	bool on_tree;				/* true while tree_node is linked into the tree */
	struct mem_cgroup *mem;			/* owning cgroup; read when walking the tree */

};

/* Page count of @mz for LRU list @idx (an lvalue). */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
143
/* Per-node container for the per-zone state of one memory cgroup. */
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/* Per-node pointers to the structures above, indexed by node id. */
struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
151
152
153
154
155
156
/*
 * Soft limit tree: one rb-tree per (node, zone), holding the
 * mem_cgroup_per_zone entries whose cgroup exceeds its soft limit,
 * ordered by usage_in_excess.
 */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;	/* protects rb_root and the on_tree state of its members */
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

/* The single global soft limit tree. */
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
171
/* One threshold registration: an eventfd paired with a threshold value. */
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};


/* Array of thresholds with a trailing variable-length entries[] member. */
struct mem_cgroup_threshold_ary {

	/* NOTE(review): presumably an index into entries[] relative to current usage; the user is outside this chunk. */
	int current_threshold;

	/* Number of elements in entries[] (presumably; confirm against the allocator). */
	unsigned int size;

	struct mem_cgroup_threshold entries[0];
};

/* A primary threshold array plus a spare one. */
struct mem_cgroup_thresholds {

	struct mem_cgroup_threshold_ary *primary;





	/* NOTE(review): looks like a pre-allocated replacement array; confirm in the registration code. */
	struct mem_cgroup_threshold_ary *spare;
};
197
198
/* List node holding one eventfd (the list head `oom_notify` lives in struct mem_cgroup). */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/* Forward declarations; both are defined later in the file. */
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
206
207
208
209
210
211
212
213
214
215
216
217
/*
 * The memory controller's per-cgroup state.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;	/* embedded css; container_of() maps it back to the mem_cgroup */



	struct res_counter res;		/* memory usage/limit counter */



	struct res_counter memsw;	/* memory+swap usage/limit counter */




	struct mem_cgroup_lru_info info;	/* per-node, per-zone LRU state */




	int last_scanned_child;		/* css id cursor used by mem_cgroup_select_victim() */
	int last_scanned_node;		/* node cursor used by mem_cgroup_select_victim_node() */
#if MAX_NUMNODES > 1
	nodemask_t scan_nodes;		/* nodes considered reclaimable for this cgroup */
	atomic_t numainfo_events;	/* non-zero requests a scan_nodes refresh */
	atomic_t numainfo_updating;	/* guards against concurrent scan_nodes refreshes */
#endif



	bool use_hierarchy;		/* when set, descendants are treated as part of this cgroup's subtree */

	/* NOTE(review): OOM handling state; users are outside this chunk. */
	bool oom_lock;
	atomic_t under_oom;

	/* NOTE(review): presumably managed by mem_cgroup_get()/mem_cgroup_put(); confirm. */
	atomic_t refcnt;

	int swappiness;			/* returned by mem_cgroup_swappiness() for non-root cgroups */

	int oom_kill_disable;		/* NOTE(review): users not visible here */


	bool memsw_is_minimum;		/* NOTE(review): users not visible here */


	/* NOTE(review): presumably serialises updates of the two threshold sets below. */
	struct mutex thresholds_lock;


	struct mem_cgroup_thresholds thresholds;	/* thresholds on memory usage */


	struct mem_cgroup_thresholds memsw_thresholds;	/* thresholds on memory+swap usage */


	struct list_head oom_notify;	/* list of mem_cgroup_eventfd_list entries */





	/* Bitmask indexed by enum move_type; tested by move_anon()/move_file(). */
	unsigned long move_charge_at_immigrate;



	struct mem_cgroup_stat_cpu *stat;	/* per-cpu statistics */




	/*
	 * Extra statistics base added to the online-cpu sums under
	 * pcp_counter_lock when CONFIG_HOTPLUG_CPU is set.
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
};
289
290
291
292
293
294
/* Bit numbers within mem_cgroup.move_charge_at_immigrate. */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* tested by move_anon() */
	MOVE_CHARGE_TYPE_FILE,	/* tested by move_file() */
	NR_MOVE_TYPE,
};
300
301
/* Global state describing the charge move currently in flight, if any. */
static struct move_charge_struct {
	spinlock_t lock;		/* taken around reads of from/to in mem_cgroup_under_move() */
	struct mem_cgroup *from;	/* source cgroup; NULL means no move is recorded */
	struct mem_cgroup *to;		/* destination cgroup */
	unsigned long precharge;	/* NOTE(review): users not visible in this chunk */
	unsigned long moved_charge;	/* NOTE(review): users not visible in this chunk */
	unsigned long moved_swap;	/* NOTE(review): users not visible in this chunk */
	struct task_struct *moving_task;	/* task performing the move */
	wait_queue_head_t waitq;	/* waited on in mem_cgroup_wait_acct_move() */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
315
316static bool move_anon(void)
317{
318 return test_bit(MOVE_CHARGE_TYPE_ANON,
319 &mc.to->move_charge_at_immigrate);
320}
321
322static bool move_file(void)
323{
324 return test_bit(MOVE_CHARGE_TYPE_FILE,
325 &mc.to->move_charge_at_immigrate);
326}
327
328
329
330
331
/* NOTE(review): loop bounds for reclaim; the loops that use them are outside this chunk. */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)

/*
 * Kinds of charge/uncharge operation.
 * NOTE(review): the per-value meaning is inferred from the names only; the
 * charge paths that consume them are not in this chunk.
 */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,
	MEM_CGROUP_CHARGE_TYPE_FORCE,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
	MEM_CGROUP_CHARGE_TYPE_DROP,
	NR_CHARGE_TYPE,
};
344
345
/*
 * Encoding of a (type, attribute) pair in one integer: the type occupies
 * bits 16..31 and the attribute bits 0..15.
 */
#define _MEM (0)
#define _MEMSWAP (1)
#define _OOM_TYPE (2)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)

/* NOTE(review): presumably the attribute value used together with _OOM_TYPE. */
#define OOM_CONTROL (0)




/* Reclaim option flags: a bit number and the corresponding mask for each. */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
364
/* Forward declarations of helpers defined later in the file. */
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(struct mem_cgroup *mem);
369
370static struct mem_cgroup_per_zone *
371mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
372{
373 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
374}
375
376struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
377{
378 return &mem->css;
379}
380
/* Return the per-zone state of @mem for the node and zone that @page belongs to. */
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
	return mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
}
389
390static struct mem_cgroup_tree_per_zone *
391soft_limit_tree_node_zone(int nid, int zid)
392{
393 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
394}
395
396static struct mem_cgroup_tree_per_zone *
397soft_limit_tree_from_page(struct page *page)
398{
399 int nid = page_to_nid(page);
400 int zid = page_zonenum(page);
401
402 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
403}
404
405static void
406__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
407 struct mem_cgroup_per_zone *mz,
408 struct mem_cgroup_tree_per_zone *mctz,
409 unsigned long long new_usage_in_excess)
410{
411 struct rb_node **p = &mctz->rb_root.rb_node;
412 struct rb_node *parent = NULL;
413 struct mem_cgroup_per_zone *mz_node;
414
415 if (mz->on_tree)
416 return;
417
418 mz->usage_in_excess = new_usage_in_excess;
419 if (!mz->usage_in_excess)
420 return;
421 while (*p) {
422 parent = *p;
423 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
424 tree_node);
425 if (mz->usage_in_excess < mz_node->usage_in_excess)
426 p = &(*p)->rb_left;
427
428
429
430
431 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
432 p = &(*p)->rb_right;
433 }
434 rb_link_node(&mz->tree_node, parent, p);
435 rb_insert_color(&mz->tree_node, &mctz->rb_root);
436 mz->on_tree = true;
437}
438
439static void
440__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
441 struct mem_cgroup_per_zone *mz,
442 struct mem_cgroup_tree_per_zone *mctz)
443{
444 if (!mz->on_tree)
445 return;
446 rb_erase(&mz->tree_node, &mctz->rb_root);
447 mz->on_tree = false;
448}
449
/*
 * Locked wrapper around __mem_cgroup_remove_exceeded(): removes @mz from
 * the tree @mctz while holding mctz->lock.
 */
static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}
459
460
461static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
462{
463 unsigned long long excess;
464 struct mem_cgroup_per_zone *mz;
465 struct mem_cgroup_tree_per_zone *mctz;
466 int nid = page_to_nid(page);
467 int zid = page_zonenum(page);
468 mctz = soft_limit_tree_from_page(page);
469
470
471
472
473
474 for (; mem; mem = parent_mem_cgroup(mem)) {
475 mz = mem_cgroup_zoneinfo(mem, nid, zid);
476 excess = res_counter_soft_limit_excess(&mem->res);
477
478
479
480
481 if (excess || mz->on_tree) {
482 spin_lock(&mctz->lock);
483
484 if (mz->on_tree)
485 __mem_cgroup_remove_exceeded(mem, mz, mctz);
486
487
488
489
490 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
491 spin_unlock(&mctz->lock);
492 }
493 }
494}
495
496static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
497{
498 int node, zone;
499 struct mem_cgroup_per_zone *mz;
500 struct mem_cgroup_tree_per_zone *mctz;
501
502 for_each_node_state(node, N_POSSIBLE) {
503 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
504 mz = mem_cgroup_zoneinfo(mem, node, zone);
505 mctz = soft_limit_tree_node_zone(node, zone);
506 mem_cgroup_remove_exceeded(mem, mz, mctz);
507 }
508 }
509}
510
511static struct mem_cgroup_per_zone *
512__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
513{
514 struct rb_node *rightmost = NULL;
515 struct mem_cgroup_per_zone *mz;
516
517retry:
518 mz = NULL;
519 rightmost = rb_last(&mctz->rb_root);
520 if (!rightmost)
521 goto done;
522
523 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
524
525
526
527
528
529 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
530 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
531 !css_tryget(&mz->mem->css))
532 goto retry;
533done:
534 return mz;
535}
536
/*
 * Locked wrapper around __mem_cgroup_largest_soft_limit_node(): runs it
 * under mctz->lock and returns its result (possibly NULL).
 */
static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567static long mem_cgroup_read_stat(struct mem_cgroup *mem,
568 enum mem_cgroup_stat_index idx)
569{
570 long val = 0;
571 int cpu;
572
573 get_online_cpus();
574 for_each_online_cpu(cpu)
575 val += per_cpu(mem->stat->count[idx], cpu);
576#ifdef CONFIG_HOTPLUG_CPU
577 spin_lock(&mem->pcp_counter_lock);
578 val += mem->nocpu_base.count[idx];
579 spin_unlock(&mem->pcp_counter_lock);
580#endif
581 put_online_cpus();
582 return val;
583}
584
585static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
586 bool charge)
587{
588 int val = (charge) ? 1 : -1;
589 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
590}
591
/* Add @val to this cpu's page fault event counter of @mem. */
void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
}
596
/* Add @val to this cpu's major page fault event counter of @mem. */
void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
}
601
602static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
603 enum mem_cgroup_events_index idx)
604{
605 unsigned long val = 0;
606 int cpu;
607
608 for_each_online_cpu(cpu)
609 val += per_cpu(mem->stat->events[idx], cpu);
610#ifdef CONFIG_HOTPLUG_CPU
611 spin_lock(&mem->pcp_counter_lock);
612 val += mem->nocpu_base.events[idx];
613 spin_unlock(&mem->pcp_counter_lock);
614#endif
615 return val;
616}
617
618static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
619 bool file, int nr_pages)
620{
621 preempt_disable();
622
623 if (file)
624 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
625 else
626 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
627
628
629 if (nr_pages > 0)
630 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
631 else {
632 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
633 nr_pages = -nr_pages;
634 }
635
636 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
637
638 preempt_enable();
639}
640
641unsigned long
642mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
643 unsigned int lru_mask)
644{
645 struct mem_cgroup_per_zone *mz;
646 enum lru_list l;
647 unsigned long ret = 0;
648
649 mz = mem_cgroup_zoneinfo(mem, nid, zid);
650
651 for_each_lru(l) {
652 if (BIT(l) & lru_mask)
653 ret += MEM_CGROUP_ZSTAT(mz, l);
654 }
655 return ret;
656}
657
658static unsigned long
659mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
660 int nid, unsigned int lru_mask)
661{
662 u64 total = 0;
663 int zid;
664
665 for (zid = 0; zid < MAX_NR_ZONES; zid++)
666 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
667
668 return total;
669}
670
671static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
672 unsigned int lru_mask)
673{
674 int nid;
675 u64 total = 0;
676
677 for_each_node_state(nid, N_HIGH_MEMORY)
678 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
679 return total;
680}
681
682static bool __memcg_event_check(struct mem_cgroup *mem, int target)
683{
684 unsigned long val, next;
685
686 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
687 next = this_cpu_read(mem->stat->targets[target]);
688
689 return ((long)next - (long)val < 0);
690}
691
692static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
693{
694 unsigned long val, next;
695
696 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
697
698 switch (target) {
699 case MEM_CGROUP_TARGET_THRESH:
700 next = val + THRESHOLDS_EVENTS_TARGET;
701 break;
702 case MEM_CGROUP_TARGET_SOFTLIMIT:
703 next = val + SOFTLIMIT_EVENTS_TARGET;
704 break;
705 case MEM_CGROUP_TARGET_NUMAINFO:
706 next = val + NUMAINFO_EVENTS_TARGET;
707 break;
708 default:
709 return;
710 }
711
712 this_cpu_write(mem->stat->targets[target], next);
713}
714
715
716
717
718
719static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
720{
721
722 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
723 mem_cgroup_threshold(mem);
724 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
725 if (unlikely(__memcg_event_check(mem,
726 MEM_CGROUP_TARGET_SOFTLIMIT))) {
727 mem_cgroup_update_tree(mem, page);
728 __mem_cgroup_target_update(mem,
729 MEM_CGROUP_TARGET_SOFTLIMIT);
730 }
731#if MAX_NUMNODES > 1
732 if (unlikely(__memcg_event_check(mem,
733 MEM_CGROUP_TARGET_NUMAINFO))) {
734 atomic_inc(&mem->numainfo_events);
735 __mem_cgroup_target_update(mem,
736 MEM_CGROUP_TARGET_NUMAINFO);
737 }
738#endif
739 }
740}
741
742static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
743{
744 return container_of(cgroup_subsys_state(cont,
745 mem_cgroup_subsys_id), struct mem_cgroup,
746 css);
747}
748
749struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
750{
751
752
753
754
755
756 if (unlikely(!p))
757 return NULL;
758
759 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
760 struct mem_cgroup, css);
761}
762
763struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
764{
765 struct mem_cgroup *mem = NULL;
766
767 if (!mm)
768 return NULL;
769
770
771
772
773
774 rcu_read_lock();
775 do {
776 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
777 if (unlikely(!mem))
778 break;
779 } while (!css_tryget(&mem->css));
780 rcu_read_unlock();
781 return mem;
782}
783
784
785static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
786{
787 struct cgroup_subsys_state *css;
788 int found;
789
790 if (!mem)
791 return root_mem_cgroup;
792 if (!mem->use_hierarchy) {
793 if (css_tryget(&mem->css))
794 return mem;
795 return NULL;
796 }
797 rcu_read_lock();
798
799
800
801
802 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
803 if (css && css_tryget(css))
804 mem = container_of(css, struct mem_cgroup, css);
805 else
806 mem = NULL;
807 rcu_read_unlock();
808 return mem;
809}
810
/*
 * Advance a cgroup walk: drop the reference held on @iter and return the
 * next cgroup under @root with a css reference held, or NULL when the
 * walk is over.
 *
 * The walk stops (NULL) when @cond is false, or when a @root was given
 * and @iter did not use hierarchy. A NULL @root means "walk everything
 * below root_mem_cgroup".
 */
static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
					struct mem_cgroup *root,
					bool cond)
{
	int nextid = css_id(&iter->css) + 1;
	int found;
	int hierarchy_used;
	struct cgroup_subsys_state *css;

	/* Sample before css_put(): @iter must not be touched afterwards. */
	hierarchy_used = iter->use_hierarchy;

	css_put(&iter->css);

	if (!cond || (root && !hierarchy_used))
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	do {
		iter = NULL;
		rcu_read_lock();

		css = css_get_next(&mem_cgroup_subsys, nextid,
				&root->css, &found);
		if (css && css_tryget(css))
			iter = container_of(css, struct mem_cgroup, css);
		rcu_read_unlock();
		/* A css was found but could not be pinned: continue after it. */
		nextid = found + 1;
	} while (css && !iter);

	return iter;
}
845
846
847
848
849
/*
 * Iterate over the cgroups under @root while @cond holds. @cond is
 * evaluated on each advance by mem_cgroup_get_next(), which also drops
 * the reference on the current element. Leaving the loop early with
 * `break` therefore leaves that reference held by the caller.
 */
#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
	for (iter = mem_cgroup_start_loop(root);\
	     iter != NULL;\
	     iter = mem_cgroup_get_next(iter, root, cond))

/* Unconditional walk over the tree under @root. */
#define for_each_mem_cgroup_tree(iter, root) \
	for_each_mem_cgroup_tree_cond(iter, root, true)

/* Walk starting from root_mem_cgroup (a NULL root). */
#define for_each_mem_cgroup_all(iter) \
	for_each_mem_cgroup_tree_cond(iter, NULL, true)
860
861
862static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
863{
864 return (mem == root_mem_cgroup);
865}
866
867void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
868{
869 struct mem_cgroup *mem;
870
871 if (!mm)
872 return;
873
874 rcu_read_lock();
875 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
876 if (unlikely(!mem))
877 goto out;
878
879 switch (idx) {
880 case PGMAJFAULT:
881 mem_cgroup_pgmajfault(mem, 1);
882 break;
883 case PGFAULT:
884 mem_cgroup_pgfault(mem, 1);
885 break;
886 default:
887 BUG();
888 }
889out:
890 rcu_read_unlock();
891}
892EXPORT_SYMBOL(mem_cgroup_count_vm_event);
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
/*
 * Remove @page from the memcg accounting of LRU list @lru.
 *
 * Nothing happens if the controller is disabled or the page's
 * page_cgroup was not flagged AcctLRU (the flag is test-and-cleared
 * here). The per-zone count drops by the page's compound size. Root
 * cgroup pages are only counted, never unlinked from a list here.
 */
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* Not accounted on a memcg LRU: nothing to undo. */
	if (!TestClearPageCgroupAcctLRU(pc))
		return;
	VM_BUG_ON(!pc->mem_cgroup);



	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	/* A compound page is accounted as 1 << order base pages. */
	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	VM_BUG_ON(list_empty(&pc->lru));
	list_del_init(&pc->lru);
}
932
933void mem_cgroup_del_lru(struct page *page)
934{
935 mem_cgroup_del_lru_list(page, page_lru(page));
936}
937
938
939
940
941
942
/*
 * Move @page's page_cgroup to the tail of its cgroup's list for the
 * page's current LRU. Skipped when the controller is disabled, the
 * page_cgroup is unused, or the page belongs to the root cgroup.
 */
void mem_cgroup_rotate_reclaimable_page(struct page *page)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;
	enum lru_list lru = page_lru(page);

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);

	if (!PageCgroupUsed(pc))
		return;
	/* Order the read of pc->mem_cgroup after the Used flag test. */
	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move_tail(&pc->lru, &mz->lists[lru]);
}
963
/*
 * Move @page's page_cgroup to the head of its cgroup's list for @lru.
 * Skipped when the controller is disabled, the page_cgroup is unused, or
 * the page belongs to the root cgroup.
 */
void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);

	if (!PageCgroupUsed(pc))
		return;
	/* Order the read of pc->mem_cgroup after the Used flag test. */
	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move(&pc->lru, &mz->lists[lru]);
}
983
/*
 * Account @page on LRU list @lru of its memory cgroup.
 *
 * The page_cgroup must not already be flagged AcctLRU. Unused
 * page_cgroups are skipped. The per-zone count grows by the page's
 * compound size and the AcctLRU flag is set. Root cgroup pages are only
 * counted, never linked into a list here.
 */
void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	VM_BUG_ON(PageCgroupAcctLRU(pc));
	if (!PageCgroupUsed(pc))
		return;
	/* Order the read of pc->mem_cgroup after the Used flag test. */
	smp_rmb();
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	/* A compound page is accounted as 1 << order base pages. */
	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}
1005
1006
1007
1008
1009
1010
1011
/*
 * If @page is on an LRU and its page_cgroup is not in use, drop the page
 * from memcg LRU accounting under zone->lru_lock.
 * NOTE(review): judging by the name this runs before a charge is
 * committed to a page already on the LRU; the caller is outside this chunk.
 */
static void mem_cgroup_lru_del_before_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);









	/* Lockless fast path: most pages are not on the LRU here. */
	if (likely(!PageLRU(page)))
		return;

	spin_lock_irqsave(&zone->lru_lock, flags);




	/* Only drop stale accounting; a used page_cgroup is left untouched. */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}
1038
/*
 * If @page is on an LRU but its page_cgroup is not yet flagged AcctLRU,
 * add it to memcg LRU accounting under zone->lru_lock.
 */
static void mem_cgroup_lru_add_after_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);


	/* Lockless fast path: most pages are not on the LRU here. */
	if (likely(!PageLRU(page)))
		return;
	spin_lock_irqsave(&zone->lru_lock, flags);
	/* Re-check PageLRU under the lock before touching the lists. */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}
1054
1055
1056void mem_cgroup_move_lists(struct page *page,
1057 enum lru_list from, enum lru_list to)
1058{
1059 if (mem_cgroup_disabled())
1060 return;
1061 mem_cgroup_del_lru_list(page, from);
1062 mem_cgroup_add_lru_list(page, to);
1063}
1064
1065
1066
1067
1068
1069static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1070 struct mem_cgroup *mem)
1071{
1072 if (root_mem != mem) {
1073 return (root_mem->use_hierarchy &&
1074 css_is_ancestor(&mem->css, &root_mem->css));
1075 }
1076
1077 return true;
1078}
1079
1080int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1081{
1082 int ret;
1083 struct mem_cgroup *curr = NULL;
1084 struct task_struct *p;
1085
1086 p = find_lock_task_mm(task);
1087 if (!p)
1088 return 0;
1089 curr = try_get_mem_cgroup_from_mm(p->mm);
1090 task_unlock(p);
1091 if (!curr)
1092 return 0;
1093
1094
1095
1096
1097
1098
1099 ret = mem_cgroup_same_or_subtree(mem, curr);
1100 css_put(&curr->css);
1101 return ret;
1102}
1103
1104static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
1105{
1106 unsigned long active;
1107 unsigned long inactive;
1108 unsigned long gb;
1109 unsigned long inactive_ratio;
1110
1111 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1112 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1113
1114 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1115 if (gb)
1116 inactive_ratio = int_sqrt(10 * gb);
1117 else
1118 inactive_ratio = 1;
1119
1120 if (present_pages) {
1121 present_pages[0] = inactive;
1122 present_pages[1] = active;
1123 }
1124
1125 return inactive_ratio;
1126}
1127
/*
 * Return 1 if @memcg's inactive anon list is small relative to its
 * active anon list (inactive * ratio < active), 0 otherwise.
 */
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long pages[2];	/* [0] = inactive, [1] = active */
	unsigned long ratio;

	ratio = calc_inactive_ratio(memcg, pages);

	return pages[0] * ratio < pages[1];
}
1145
1146int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1147{
1148 unsigned long active;
1149 unsigned long inactive;
1150
1151 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1152 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1153
1154 return (active > inactive);
1155}
1156
1157struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1158 struct zone *zone)
1159{
1160 int nid = zone_to_nid(zone);
1161 int zid = zone_idx(zone);
1162 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1163
1164 return &mz->reclaim_stat;
1165}
1166
/*
 * Return the reclaim statistics of the cgroup that @page is charged to,
 * for the page's zone. Returns NULL when the controller is disabled or
 * the page's page_cgroup is not in use.
 */
struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	if (!PageCgroupUsed(pc))
		return NULL;
	/* Order the read of pc->mem_cgroup after the Used flag test. */
	smp_rmb();
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	return &mz->reclaim_stat;
}
1184
1185unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1186 struct list_head *dst,
1187 unsigned long *scanned, int order,
1188 int mode, struct zone *z,
1189 struct mem_cgroup *mem_cont,
1190 int active, int file)
1191{
1192 unsigned long nr_taken = 0;
1193 struct page *page;
1194 unsigned long scan;
1195 LIST_HEAD(pc_list);
1196 struct list_head *src;
1197 struct page_cgroup *pc, *tmp;
1198 int nid = zone_to_nid(z);
1199 int zid = zone_idx(z);
1200 struct mem_cgroup_per_zone *mz;
1201 int lru = LRU_FILE * file + active;
1202 int ret;
1203
1204 BUG_ON(!mem_cont);
1205 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1206 src = &mz->lists[lru];
1207
1208 scan = 0;
1209 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1210 if (scan >= nr_to_scan)
1211 break;
1212
1213 if (unlikely(!PageCgroupUsed(pc)))
1214 continue;
1215
1216 page = lookup_cgroup_page(pc);
1217
1218 if (unlikely(!PageLRU(page)))
1219 continue;
1220
1221 scan++;
1222 ret = __isolate_lru_page(page, mode, file);
1223 switch (ret) {
1224 case 0:
1225 list_move(&page->lru, dst);
1226 mem_cgroup_del_lru(page);
1227 nr_taken += hpage_nr_pages(page);
1228 break;
1229 case -EBUSY:
1230
1231 mem_cgroup_rotate_lru_list(page, page_lru(page));
1232 break;
1233 default:
1234 break;
1235 }
1236 }
1237
1238 *scanned = scan;
1239
1240 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1241 0, 0, 0, mode);
1242
1243 return nr_taken;
1244}
1245
/* Map a res_counter embedded in struct mem_cgroup as @member back to its mem_cgroup. */
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)
1248
1249
1250
1251
1252
1253
1254
1255
1256static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1257{
1258 unsigned long long margin;
1259
1260 margin = res_counter_margin(&mem->res);
1261 if (do_swap_account)
1262 margin = min(margin, res_counter_margin(&mem->memsw));
1263 return margin >> PAGE_SHIFT;
1264}
1265
1266int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1267{
1268 struct cgroup *cgrp = memcg->css.cgroup;
1269
1270
1271 if (cgrp->parent == NULL)
1272 return vm_swappiness;
1273
1274 return memcg->swappiness;
1275}
1276
/*
 * Mark @mem as having a charge move in progress: MEM_CGROUP_ON_MOVE is
 * incremented on every online cpu and in nocpu_base, under
 * pcp_counter_lock. synchronize_rcu() then waits for an RCU grace period
 * before returning. mem_cgroup_end_move() undoes the increments.
 */
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
	int cpu;

	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();

	/* mem_cgroup_stealed() reads the counter with the RCU read lock held. */
	synchronize_rcu();
}
1291
/*
 * Undo mem_cgroup_start_move(): decrement MEM_CGROUP_ON_MOVE on every
 * online cpu and in nocpu_base. A NULL @mem is accepted and ignored.
 */
static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
	int cpu;

	if (!mem)
		return;
	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();
}
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
/*
 * True if this cpu's MEM_CGROUP_ON_MOVE counter for @mem is positive,
 * i.e. a move started by mem_cgroup_start_move() has not been ended yet.
 * Must be called with the RCU read lock held (checked by VM_BUG_ON).
 */
static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
1323
1324static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1325{
1326 struct mem_cgroup *from;
1327 struct mem_cgroup *to;
1328 bool ret = false;
1329
1330
1331
1332
1333 spin_lock(&mc.lock);
1334 from = mc.from;
1335 to = mc.to;
1336 if (!from)
1337 goto unlock;
1338
1339 ret = mem_cgroup_same_or_subtree(mem, from)
1340 || mem_cgroup_same_or_subtree(mem, to);
1341unlock:
1342 spin_unlock(&mc.lock);
1343 return ret;
1344}
1345
/*
 * If another task is moving charges that involve @mem, sleep
 * (interruptibly) on mc.waitq until woken, and return true. Returns false
 * without sleeping if no move is in progress, the current task is the
 * mover, or the move does not involve @mem.
 */
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(mem)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* Re-check after queueing so a move that just ended is not slept on. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}
1361
1362
1363
1364
1365
1366
1367
1368
1369
/*
 * Log OOM context for @memcg: the cgroup path of task @p, the path of
 * the cgroup whose limit was hit, and the usage/limit/failcnt of both
 * the memory and the memory+swap counters.
 *
 * Does nothing if @memcg or @p is NULL. If a path lookup fails, the rest
 * of the path message is skipped and only the counters are printed.
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Static PATH_MAX buffer shared by all callers; this function does
	 * no locking of its own.
	 * NOTE(review): presumably callers serialise OOM reporting; confirm.
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;


	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/* Path unavailable: fall through to the counter dump. */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/* Continues the line started by the "Task in ..." message above. */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	/* The counters are read in bytes; >> 10 converts to kB. */
	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
1428
1429
1430
1431
1432
1433static int mem_cgroup_count_children(struct mem_cgroup *mem)
1434{
1435 int num = 0;
1436 struct mem_cgroup *iter;
1437
1438 for_each_mem_cgroup_tree(iter, mem)
1439 num++;
1440 return num;
1441}
1442
1443
1444
1445
1446u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1447{
1448 u64 limit;
1449 u64 memsw;
1450
1451 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1452 limit += total_swap_pages << PAGE_SHIFT;
1453
1454 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1455
1456
1457
1458
1459 return min(limit, memsw);
1460}
1461
1462
1463
1464
1465
1466
/*
 * Pick the next cgroup to work on under @root_mem, round-robin.
 *
 * Without hierarchy the result is @root_mem itself. Otherwise the walk
 * resumes after root_mem->last_scanned_child and wraps to the start when
 * the end of the id space is reached. The loop repeats until a css
 * reference could be taken, so the result is never NULL. The caller must
 * drop the reference with css_put().
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Update the cursor for the next call (or the next iteration). */
		if (!css) {
			/* Ran off the end: restart from the beginning. */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
	}

	return ret;
}
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1510 int nid, bool noswap)
1511{
1512 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1513 return true;
1514 if (noswap || !total_swap_pages)
1515 return false;
1516 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1517 return true;
1518 return false;
1519
1520}
1521#if MAX_NUMNODES > 1
1522
1523
1524
1525
1526
1527
1528
1529static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1530{
1531 int nid;
1532
1533
1534
1535
1536 if (!atomic_read(&mem->numainfo_events))
1537 return;
1538 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1539 return;
1540
1541
1542 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1543
1544 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1545
1546 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1547 node_clear(nid, mem->scan_nodes);
1548 }
1549
1550 atomic_set(&mem->numainfo_events, 0);
1551 atomic_set(&mem->numainfo_updating, 0);
1552}
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1567{
1568 int node;
1569
1570 mem_cgroup_may_update_nodemask(mem);
1571 node = mem->last_scanned_node;
1572
1573 node = next_node(node, mem->scan_nodes);
1574 if (node == MAX_NUMNODES)
1575 node = first_node(mem->scan_nodes);
1576
1577
1578
1579
1580
1581
1582 if (unlikely(node == MAX_NUMNODES))
1583 node = numa_node_id();
1584
1585 mem->last_scanned_node = node;
1586 return node;
1587}
1588
1589
1590
1591
1592
1593
1594
1595bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1596{
1597 int nid;
1598
1599
1600
1601
1602
1603 if (!nodes_empty(mem->scan_nodes)) {
1604 for (nid = first_node(mem->scan_nodes);
1605 nid < MAX_NUMNODES;
1606 nid = next_node(nid, mem->scan_nodes)) {
1607
1608 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1609 return true;
1610 }
1611 }
1612
1613
1614
1615 for_each_node_state(nid, N_HIGH_MEMORY) {
1616 if (node_isset(nid, mem->scan_nodes))
1617 continue;
1618 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1619 return true;
1620 }
1621 return false;
1622}
1623
1624#else
/* Variant built when MAX_NUMNODES <= 1: node 0 is the only candidate. */
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
	return 0;
}
1629
/*
 * Variant built when MAX_NUMNODES <= 1: @mem is reclaimable exactly when
 * node 0 has reclaimable pages for it.
 */
bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
{
	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
}
1634#endif
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
/*
 * Reclaim pages on behalf of @root_mem by repeatedly picking a victim group
 * with mem_cgroup_select_victim() and reclaiming from it.
 *
 * @root_mem:        group whose limit or soft limit triggered the reclaim
 * @zone:            zone passed to mem_cgroup_shrink_node_zone(); used only
 *                   when MEM_CGROUP_RECLAIM_SOFT is set
 * @gfp_mask:        allocation flags forwarded to the reclaim helpers
 * @reclaim_options: MEM_CGROUP_RECLAIM_{NOSWAP,SHRINK,SOFT} bits
 * @total_scanned:   accumulates the scanned count; dereferenced only for
 *                   soft-limit reclaim
 *
 * Returns the result of the first reclaim call when SHRINK is set, otherwise
 * the running sum of the reclaim helpers' return values.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options,
						unsigned long *total_scanned)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess;
	unsigned long nr_scanned;

	/* Soft-limit excess of the root, converted from bytes to pages. */
	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;

	/* For plain limit reclaim, memsw_is_minimum forces noswap. */
	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		/* The victim comes back with a css reference held. */
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			/* Seeing the root again counts as one more pass. */
			loop++;
			/*
			 * Outside soft-limit reclaim, kick an asynchronous
			 * drain of the per-cpu charge caches on every pass.
			 */
			if (!check_soft && loop >= 1)
				drain_all_stock_async(root_mem);
			if (loop >= 2) {
				/*
				 * Limit reclaim stops at the second pass;
				 * soft reclaim stops there only if nothing
				 * has been reclaimed so far.
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * Soft reclaim also stops once a quarter of
				 * the excess is reclaimed or the pass count
				 * exceeds MEM_CGROUP_MAX_RECLAIM_LOOPS.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_reclaimable(victim, noswap)) {
			/* Nothing to reclaim here; try the next victim. */
			css_put(&victim->css);
			continue;
		}

		if (check_soft) {
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, zone, &nr_scanned);
			*total_scanned += nr_scanned;
		} else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap);
		css_put(&victim->css);

		/* In shrink mode one reclaim attempt per call is made. */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			/* Done once the root is back under its soft limit. */
			if (!res_counter_soft_limit_excess(&root_mem->res))
				return total;
		} else if (mem_cgroup_margin(root_mem))
			/* Done once the root reports a non-zero margin. */
			return total;
	}
	return total;
}
1734
1735
1736
1737
1738
1739
/*
 * Try to set oom_lock on every group visited by the tree walk from @mem.
 *
 * Returns true when no visited group had oom_lock set; all of them are then
 * marked.  Otherwise the marks set by this call are cleared again and false
 * is returned.  mem_cgroup_handle_oom() calls this under memcg_oom_lock.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter, *failed = NULL;
	bool cond = true;

	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
		if (iter->oom_lock) {
			/*
			 * Somebody already holds this group: remember it and
			 * clear cond (presumably this ends the walk -- confirm
			 * against the iterator macro).
			 */
			failed = iter;
			cond = false;
		} else
			iter->oom_lock = true;
	}

	if (!failed)
		return true;

	/*
	 * Roll back: walk again and clear the flag on every group visited
	 * before the one that was already locked, leaving that one untouched.
	 */
	cond = true;
	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
		if (iter == failed) {
			cond = false;
			continue;
		}
		iter->oom_lock = false;
	}
	return false;
}
1774
1775
1776
1777
1778static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1779{
1780 struct mem_cgroup *iter;
1781
1782 for_each_mem_cgroup_tree(iter, mem)
1783 iter->oom_lock = false;
1784 return 0;
1785}
1786
1787static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1788{
1789 struct mem_cgroup *iter;
1790
1791 for_each_mem_cgroup_tree(iter, mem)
1792 atomic_inc(&iter->under_oom);
1793}
1794
1795static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1796{
1797 struct mem_cgroup *iter;
1798
1799
1800
1801
1802
1803
1804 for_each_mem_cgroup_tree(iter, mem)
1805 atomic_add_unless(&iter->under_oom, -1, 0);
1806}
1807
/* Held by mem_cgroup_handle_oom() around OOM lock/notify/unlock/wakeup. */
static DEFINE_SPINLOCK(memcg_oom_lock);
/* Wait queue on which tasks sleep in mem_cgroup_handle_oom(). */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/* Wait-queue entry paired with the group the sleeper is waiting on. */
struct oom_wait_info {
	struct mem_cgroup *mem;	/* group passed to mem_cgroup_handle_oom() */
	wait_queue_t	wait;	/* entry queued on memcg_oom_waitq */
};
1815
1816static int memcg_oom_wake_function(wait_queue_t *wait,
1817 unsigned mode, int sync, void *arg)
1818{
1819 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1820 *oom_wait_mem;
1821 struct oom_wait_info *oom_wait_info;
1822
1823 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1824 oom_wait_mem = oom_wait_info->mem;
1825
1826
1827
1828
1829
1830 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1831 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1832 return 0;
1833 return autoremove_wake_function(wait, mode, sync, arg);
1834}
1835
/*
 * Wake the sleepers on memcg_oom_waitq, passing @mem as the wake key so that
 * memcg_oom_wake_function() can filter out unrelated waiters.
 */
static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* The key @mem reaches the wake function as its 'arg' parameter. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}
1841
1842static void memcg_oom_recover(struct mem_cgroup *mem)
1843{
1844 if (mem && atomic_read(&mem->under_oom))
1845 memcg_wakeup_oom(mem);
1846}
1847
1848
1849
1850
1851bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1852{
1853 struct oom_wait_info owait;
1854 bool locked, need_to_kill;
1855
1856 owait.mem = mem;
1857 owait.wait.flags = 0;
1858 owait.wait.func = memcg_oom_wake_function;
1859 owait.wait.private = current;
1860 INIT_LIST_HEAD(&owait.wait.task_list);
1861 need_to_kill = true;
1862 mem_cgroup_mark_under_oom(mem);
1863
1864
1865 spin_lock(&memcg_oom_lock);
1866 locked = mem_cgroup_oom_lock(mem);
1867
1868
1869
1870
1871
1872 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1873 if (!locked || mem->oom_kill_disable)
1874 need_to_kill = false;
1875 if (locked)
1876 mem_cgroup_oom_notify(mem);
1877 spin_unlock(&memcg_oom_lock);
1878
1879 if (need_to_kill) {
1880 finish_wait(&memcg_oom_waitq, &owait.wait);
1881 mem_cgroup_out_of_memory(mem, mask);
1882 } else {
1883 schedule();
1884 finish_wait(&memcg_oom_waitq, &owait.wait);
1885 }
1886 spin_lock(&memcg_oom_lock);
1887 if (locked)
1888 mem_cgroup_oom_unlock(mem);
1889 memcg_wakeup_oom(mem);
1890 spin_unlock(&memcg_oom_lock);
1891
1892 mem_cgroup_unmark_under_oom(mem);
1893
1894 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1895 return false;
1896
1897 schedule_timeout(1);
1898 return true;
1899}
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
/*
 * Add @val to the per-cpu statistic selected by @idx for the group that
 * @page is charged to.
 *
 * @page: page whose owning group's counter is updated
 * @idx:  statistic to update; only MEMCG_NR_FILE_MAPPED is handled, any
 *        other value hits BUG()
 * @val:  signed delta added to the counter
 *
 * Nothing is done when the page has no page_cgroup, no owning group, or is
 * not marked as used.
 */
void mem_cgroup_update_page_stat(struct page *page,
				 enum mem_cgroup_page_stat_item idx, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc = lookup_page_cgroup(page);
	bool need_unlock = false;
	unsigned long uninitialized_var(flags);

	if (unlikely(!pc))
		return;

	rcu_read_lock();
	mem = pc->mem_cgroup;
	if (unlikely(!mem || !PageCgroupUsed(pc)))
		goto out;

	/*
	 * When mem_cgroup_stealed() reports true or the page is a transparent
	 * huge page, take the move lock and re-read the owner under it.
	 */
	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {

		move_lock_page_cgroup(pc, &flags);
		need_unlock = true;
		mem = pc->mem_cgroup;
		if (!mem || !PageCgroupUsed(pc))
			goto out;
	}

	switch (idx) {
	case MEMCG_NR_FILE_MAPPED:
		/*
		 * Keep the FileMapped flag in step: set on a positive delta,
		 * cleared once the page is no longer mapped.
		 */
		if (val > 0)
			SetPageCgroupFileMapped(pc);
		else if (!page_mapped(page))
			ClearPageCgroupFileMapped(pc);
		/* Translate the public index into the internal stat index. */
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;
	default:
		BUG();
	}

	this_cpu_add(mem->stat->count[idx], val);

out:
	if (unlikely(need_unlock))
		move_unlock_page_cgroup(pc, &flags);
	rcu_read_unlock();
	return;
}
EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1971
1972
1973
1974
1975
/*
 * Number of pages __mem_cgroup_try_charge() charges at once (when the request
 * is smaller) so that the surplus can be cached per cpu.
 */
#define CHARGE_BATCH	32U
/* Per-cpu cache of pages that are already charged to one group. */
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* group the cached charge belongs to */
	unsigned int nr_pages;	/* pre-charged pages still available */
	struct work_struct work;	/* queued by drain_all_stock() for remote cpus */
	unsigned long flags;	/* bit field, see FLUSHING_CACHED_CHARGE */
#define FLUSHING_CACHED_CHARGE	(0)	/* bit number: a drain has been requested */
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Taken by drain_all_stock_async()/_sync() around drain_all_stock(). */
static DEFINE_MUTEX(percpu_charge_mutex);
1986
1987
1988
1989
1990
1991
1992
1993static bool consume_stock(struct mem_cgroup *mem)
1994{
1995 struct memcg_stock_pcp *stock;
1996 bool ret = true;
1997
1998 stock = &get_cpu_var(memcg_stock);
1999 if (mem == stock->cached && stock->nr_pages)
2000 stock->nr_pages--;
2001 else
2002 ret = false;
2003 put_cpu_var(memcg_stock);
2004 return ret;
2005}
2006
2007
2008
2009
2010static void drain_stock(struct memcg_stock_pcp *stock)
2011{
2012 struct mem_cgroup *old = stock->cached;
2013
2014 if (stock->nr_pages) {
2015 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2016
2017 res_counter_uncharge(&old->res, bytes);
2018 if (do_swap_account)
2019 res_counter_uncharge(&old->memsw, bytes);
2020 stock->nr_pages = 0;
2021 }
2022 stock->cached = NULL;
2023}
2024
2025
2026
2027
2028
2029static void drain_local_stock(struct work_struct *dummy)
2030{
2031 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2032 drain_stock(stock);
2033 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2034}
2035
2036
2037
2038
2039
2040static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2041{
2042 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2043
2044 if (stock->cached != mem) {
2045 drain_stock(stock);
2046 stock->cached = mem;
2047 }
2048 stock->nr_pages += nr_pages;
2049 put_cpu_var(memcg_stock);
2050}
2051
2052
2053
2054
2055
2056
2057static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2058{
2059 int cpu, curcpu;
2060
2061
2062 get_online_cpus();
2063 curcpu = get_cpu();
2064 for_each_online_cpu(cpu) {
2065 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2066 struct mem_cgroup *mem;
2067
2068 mem = stock->cached;
2069 if (!mem || !stock->nr_pages)
2070 continue;
2071 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2072 continue;
2073 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2074 if (cpu == curcpu)
2075 drain_local_stock(&stock->work);
2076 else
2077 schedule_work_on(cpu, &stock->work);
2078 }
2079 }
2080 put_cpu();
2081
2082 if (!sync)
2083 goto out;
2084
2085 for_each_online_cpu(cpu) {
2086 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2087 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2088 flush_work(&stock->work);
2089 }
2090out:
2091 put_online_cpus();
2092}
2093
2094
2095
2096
2097
2098
2099
2100static void drain_all_stock_async(struct mem_cgroup *root_mem)
2101{
2102
2103
2104
2105 if (!mutex_trylock(&percpu_charge_mutex))
2106 return;
2107 drain_all_stock(root_mem, false);
2108 mutex_unlock(&percpu_charge_mutex);
2109}
2110
2111
/*
 * Drain the per-cpu charge caches for @root_mem and wait for the scheduled
 * drain work to finish.  Serialised against other drains by
 * percpu_charge_mutex.
 */
static void drain_all_stock_sync(struct mem_cgroup *root_mem)
{

	mutex_lock(&percpu_charge_mutex);
	drain_all_stock(root_mem, true);
	mutex_unlock(&percpu_charge_mutex);
}
2119
2120
2121
2122
2123
2124static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
2125{
2126 int i;
2127
2128 spin_lock(&mem->pcp_counter_lock);
2129 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2130 long x = per_cpu(mem->stat->count[i], cpu);
2131
2132 per_cpu(mem->stat->count[i], cpu) = 0;
2133 mem->nocpu_base.count[i] += x;
2134 }
2135 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2136 unsigned long x = per_cpu(mem->stat->events[i], cpu);
2137
2138 per_cpu(mem->stat->events[i], cpu) = 0;
2139 mem->nocpu_base.events[i] += x;
2140 }
2141
2142 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2143 spin_unlock(&mem->pcp_counter_lock);
2144}
2145
2146static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
2147{
2148 int idx = MEM_CGROUP_ON_MOVE;
2149
2150 spin_lock(&mem->pcp_counter_lock);
2151 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
2152 spin_unlock(&mem->pcp_counter_lock);
2153}
2154
2155static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2156 unsigned long action,
2157 void *hcpu)
2158{
2159 int cpu = (unsigned long)hcpu;
2160 struct memcg_stock_pcp *stock;
2161 struct mem_cgroup *iter;
2162
2163 if ((action == CPU_ONLINE)) {
2164 for_each_mem_cgroup_all(iter)
2165 synchronize_mem_cgroup_on_move(iter, cpu);
2166 return NOTIFY_OK;
2167 }
2168
2169 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2170 return NOTIFY_OK;
2171
2172 for_each_mem_cgroup_all(iter)
2173 mem_cgroup_drain_pcp_counter(iter, cpu);
2174
2175 stock = &per_cpu(memcg_stock, cpu);
2176 drain_stock(stock);
2177 return NOTIFY_OK;
2178}
2179
2180
2181
/* Results of mem_cgroup_do_charge(), acted on by __mem_cgroup_try_charge(). */
enum {
	CHARGE_OK,		/* the charge was accounted */
	CHARGE_RETRY,		/* not charged; the caller should try again */
	CHARGE_NOMEM,		/* over limit and OOM handling was not requested */
	CHARGE_WOULDBLOCK,	/* over limit and gfp_mask lacks __GFP_WAIT */
	CHARGE_OOM_DIE,		/* mem_cgroup_handle_oom() returned false */
};
2189
/*
 * Make one attempt to charge @nr_pages pages to @mem, reclaiming if needed.
 *
 * @mem:       group to charge
 * @gfp_mask:  allocation flags; without __GFP_WAIT no reclaim is attempted
 * @nr_pages:  number of pages to charge
 * @oom_check: when true, fall through to mem_cgroup_handle_oom() if reclaim
 *             did not help
 *
 * Returns one of the CHARGE_* codes.
 */
static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
				unsigned int nr_pages, bool oom_check)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&mem->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		/* memsw refused: undo the res charge and reclaim without swap. */
		res_counter_uncharge(&mem->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);

	/*
	 * A failed CHARGE_BATCH-sized charge is not reclaimed for; the caller
	 * retries with its real request size instead.
	 */
	if (nr_pages == CHARGE_BATCH)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
					      gfp_mask, flags, NULL);
	/* Enough room for the whole request now: retry the charge. */
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;

	/*
	 * Still short, but for a single page any reclaim progress is worth
	 * another attempt.
	 */
	if (nr_pages == 1 && ret)
		return CHARGE_RETRY;

	/* Retry when mem_cgroup_wait_acct_move() returns non-zero. */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* The caller did not ask for OOM handling on this attempt. */
	if (!oom_check)
		return CHARGE_NOMEM;

	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}
2258
2259
2260
2261
2262
/*
 * Charge @nr_pages pages to a memory cgroup.
 *
 * @mm:       mm whose owner task selects the group when *@memcg is NULL
 * @gfp_mask: allocation flags forwarded to mem_cgroup_do_charge()
 * @nr_pages: number of pages to charge
 * @memcg:    in: group to charge, or NULL to derive it from @mm;
 *            out: the group that was charged, or NULL when the charge was
 *            bypassed or failed
 * @oom:      whether OOM handling may be invoked
 *
 * Returns 0 on success or bypass and -ENOMEM on failure.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
				   gfp_t gfp_mask,
				   unsigned int nr_pages,
				   struct mem_cgroup **memcg,
				   bool oom)
{
	/* Charge at least CHARGE_BATCH pages so the surplus can be cached. */
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem = NULL;
	int ret;

	/* A task that is being killed is not charged at all. */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/* Neither a group nor an mm to derive one from: nothing to charge. */
	if (!*memcg && !mm)
		goto bypass;
again:
	if (*memcg) {
		mem = *memcg;
		VM_BUG_ON(css_is_removed(&mem->css));
		/* The root group is never charged. */
		if (mem_cgroup_is_root(mem))
			goto done;
		/* A single page may be served from the per-cpu cache. */
		if (nr_pages == 1 && consume_stock(mem))
			goto done;
		css_get(&mem->css);
	} else {
		struct task_struct *p;

		rcu_read_lock();
		p = rcu_dereference(mm->owner);

		mem = mem_cgroup_from_task(p);
		if (!mem || mem_cgroup_is_root(mem)) {
			rcu_read_unlock();
			goto done;
		}
		if (nr_pages == 1 && consume_stock(mem)) {
			rcu_read_unlock();
			goto done;
		}

		/* Pin the group before leaving RCU; start over if that fails. */
		if (!css_tryget(&mem->css)) {
			rcu_read_unlock();
			goto again;
		}
		rcu_read_unlock();
	}

	do {
		bool oom_check;

		/* A fatal signal arrived meanwhile: give up without charging. */
		if (fatal_signal_pending(current)) {
			css_put(&mem->css);
			goto bypass;
		}

		/*
		 * Request OOM handling only after nr_oom_retries CHARGE_NOMEM
		 * results have been counted down to zero.
		 */
		oom_check = false;
		if (oom && !nr_oom_retries) {
			oom_check = true;
			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
		}

		ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:
			/* Drop batching and re-resolve the group from scratch. */
			batch = nr_pages;
			css_put(&mem->css);
			mem = NULL;
			goto again;
		case CHARGE_WOULDBLOCK:
			css_put(&mem->css);
			goto nomem;
		case CHARGE_NOMEM:
			if (!oom) {
				css_put(&mem->css);
				goto nomem;
			}

			/* With @oom set keep looping until OOM handling runs. */
			nr_oom_retries--;
			break;
		case CHARGE_OOM_DIE:
			css_put(&mem->css);
			goto bypass;
		}
	} while (ret != CHARGE_OK);

	/* Stash whatever was charged beyond the request in the cpu cache. */
	if (batch > nr_pages)
		refill_stock(mem, batch - nr_pages);
	css_put(&mem->css);
done:
	*memcg = mem;
	return 0;
nomem:
	*memcg = NULL;
	return -ENOMEM;
bypass:
	*memcg = NULL;
	return 0;
}
2394
2395
2396
2397
2398
2399
2400static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2401 unsigned int nr_pages)
2402{
2403 if (!mem_cgroup_is_root(mem)) {
2404 unsigned long bytes = nr_pages * PAGE_SIZE;
2405
2406 res_counter_uncharge(&mem->res, bytes);
2407 if (do_swap_account)
2408 res_counter_uncharge(&mem->memsw, bytes);
2409 }
2410}
2411
2412
2413
2414
2415
2416
2417
2418static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2419{
2420 struct cgroup_subsys_state *css;
2421
2422
2423 if (!id)
2424 return NULL;
2425 css = css_lookup(&mem_cgroup_subsys, id);
2426 if (!css)
2427 return NULL;
2428 return container_of(css, struct mem_cgroup, css);
2429}
2430
2431struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2432{
2433 struct mem_cgroup *mem = NULL;
2434 struct page_cgroup *pc;
2435 unsigned short id;
2436 swp_entry_t ent;
2437
2438 VM_BUG_ON(!PageLocked(page));
2439
2440 pc = lookup_page_cgroup(page);
2441 lock_page_cgroup(pc);
2442 if (PageCgroupUsed(pc)) {
2443 mem = pc->mem_cgroup;
2444 if (mem && !css_tryget(&mem->css))
2445 mem = NULL;
2446 } else if (PageSwapCache(page)) {
2447 ent.val = page_private(page);
2448 id = lookup_swap_cgroup(ent);
2449 rcu_read_lock();
2450 mem = mem_cgroup_lookup(id);
2451 if (mem && !css_tryget(&mem->css))
2452 mem = NULL;
2453 rcu_read_unlock();
2454 }
2455 unlock_page_cgroup(pc);
2456 return mem;
2457}
2458
/*
 * Bind an already-taken charge of @nr_pages pages to @page.
 *
 * @mem:      group that was charged
 * @page:     page being accounted
 * @nr_pages: size of the charge in pages
 * @pc:       page_cgroup of @page
 * @ctype:    charge type; selects the Cache and Used flags set on @pc
 *
 * If @pc is already marked as used the charge is cancelled instead.
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				       struct page *page,
				       unsigned int nr_pages,
				       struct page_cgroup *pc,
				       enum charge_type ctype)
{
	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		/* Somebody else committed first: return our charge. */
		unlock_page_cgroup(pc);
		__mem_cgroup_cancel_charge(mem, nr_pages);
		return;
	}

	pc->mem_cgroup = mem;

	/*
	 * Order the mem_cgroup store before the flag updates below, so the
	 * owner is written before the Used flag is set.
	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		/* Other charge types leave the flags as they are. */
		break;
	}

	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
	unlock_page_cgroup(pc);

	/* Event checks run after the page_cgroup lock has been dropped. */
	memcg_check_events(mem, page);
}
2507
2508#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2509
/*
 * page_cgroup flag bits that mem_cgroup_split_huge_fixup() masks out when it
 * copies the head page's flags to a tail page.
 */
#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
				(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2512
2513
2514
2515
/*
 * Propagate memcg accounting from the @head page of a huge page to one of
 * its @tail pages: the tail inherits the head's owning group and its flags,
 * minus the bits in PCGF_NOCOPY_AT_SPLIT.  Does nothing when the memory
 * controller is disabled.
 */
void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
	unsigned long flags;

	if (mem_cgroup_disabled())
		return;

	/* The head's move lock is held for the whole update. */
	move_lock_page_cgroup(head_pc, &flags);

	tail_pc->mem_cgroup = head_pc->mem_cgroup;
	/* Order the owner store before the flags store further down. */
	smp_wmb();
	if (PageCgroupAcctLRU(head_pc)) {
		enum lru_list lru;
		struct mem_cgroup_per_zone *mz;

		/*
		 * The head is accounted on an LRU list: take one off that
		 * list's per-zone count for the owning group.
		 */
		lru = page_lru(head);
		mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	}
	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	move_unlock_page_cgroup(head_pc, &flags);
}
2548#endif
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
/*
 * Move the accounting of @page from group @from to group @to.
 *
 * @page:     page to move; must not be on an LRU list
 * @nr_pages: number of pages covered; more than one requires a huge page
 * @pc:       page_cgroup of @page
 * @from:     group the page is currently charged to
 * @to:       group that takes the page over; must differ from @from
 * @uncharge: when true, @from's resource counters are uncharged as well
 *
 * Returns 0 on success, -EBUSY when @nr_pages > 1 but the page is not a
 * transparent huge page, and -EINVAL when @pc is unused or not owned by
 * @from.  This function does not charge @to.
 */
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct page_cgroup *pc,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to,
				   bool uncharge)
{
	unsigned long flags;
	int ret;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(page));

	ret = -EBUSY;
	if (nr_pages > 1 && !PageTransHuge(page))
		goto out;

	lock_page_cgroup(pc);

	ret = -EINVAL;
	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
		goto unlock;

	move_lock_page_cgroup(pc, &flags);

	if (PageCgroupFileMapped(pc)) {
		/* Carry the file-mapped statistic over to the new owner. */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
	if (uncharge)
		/* Reuses the cancel helper to uncharge @from's counters. */
		__mem_cgroup_cancel_charge(from, nr_pages);

	/* Switch the owner, then account the pages to it. */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);

	move_unlock_page_cgroup(pc, &flags);
	ret = 0;
unlock:
	unlock_page_cgroup(pc);

	/* Note: the event checks also run on the -EINVAL path. */
	memcg_check_events(to, page);
	memcg_check_events(from, page);
out:
	return ret;
}
2632
2633
2634
2635
2636
/*
 * Move the charge of @page from @child to the group of its parent cgroup.
 *
 * @page:     page to move
 * @pc:       page_cgroup of @page
 * @child:    group the page is currently charged to
 * @gfp_mask: allocation flags used when charging the parent
 *
 * Returns 0 on success, -EINVAL when @child has no parent cgroup, -EBUSY
 * when the page cannot be pinned or isolated from the LRU, or the error of
 * the charge or move step.
 */
static int mem_cgroup_move_parent(struct page *page,
				  struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	unsigned int nr_pages;
	unsigned long uninitialized_var(flags);
	int ret;

	/* No parent cgroup: there is nowhere to move the charge. */
	if (!pcg)
		return -EINVAL;

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	nr_pages = hpage_nr_pages(page);

	parent = mem_cgroup_from_cont(pcg);
	/* Charge the parent first, without OOM handling. */
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
	if (ret || !parent)
		goto put_back;

	/* For a multi-page (huge) page, hold the compound lock over the move. */
	if (nr_pages > 1)
		flags = compound_lock_irqsave(page);

	ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
	if (ret)
		/* The move failed: give the parent its charge back. */
		__mem_cgroup_cancel_charge(parent, nr_pages);

	if (nr_pages > 1)
		compound_unlock_irqrestore(page, flags);
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}
2682
2683
2684
2685
2686
2687
2688
2689static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2690 gfp_t gfp_mask, enum charge_type ctype)
2691{
2692 struct mem_cgroup *mem = NULL;
2693 unsigned int nr_pages = 1;
2694 struct page_cgroup *pc;
2695 bool oom = true;
2696 int ret;
2697
2698 if (PageTransHuge(page)) {
2699 nr_pages <<= compound_order(page);
2700 VM_BUG_ON(!PageTransHuge(page));
2701
2702
2703
2704
2705 oom = false;
2706 }
2707
2708 pc = lookup_page_cgroup(page);
2709 BUG_ON(!pc);
2710
2711 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2712 if (ret || !mem)
2713 return ret;
2714
2715 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2716 return 0;
2717}
2718
2719int mem_cgroup_newpage_charge(struct page *page,
2720 struct mm_struct *mm, gfp_t gfp_mask)
2721{
2722 if (mem_cgroup_disabled())
2723 return 0;
2724
2725
2726
2727
2728
2729
2730
2731 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2732 return 0;
2733 if (unlikely(!mm))
2734 mm = &init_mm;
2735 return mem_cgroup_charge_common(page, mm, gfp_mask,
2736 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2737}
2738
/* Forward declaration: used by mem_cgroup_cache_charge(), defined later. */
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype);
2742
2743static void
2744__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2745 enum charge_type ctype)
2746{
2747 struct page_cgroup *pc = lookup_page_cgroup(page);
2748
2749
2750
2751
2752
2753 mem_cgroup_lru_del_before_commit(page);
2754 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2755 mem_cgroup_lru_add_after_commit(page);
2756 return;
2757}
2758
2759int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2760 gfp_t gfp_mask)
2761{
2762 struct mem_cgroup *mem = NULL;
2763 int ret;
2764
2765 if (mem_cgroup_disabled())
2766 return 0;
2767 if (PageCompound(page))
2768 return 0;
2769
2770 if (unlikely(!mm))
2771 mm = &init_mm;
2772
2773 if (page_is_file_cache(page)) {
2774 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2775 if (ret || !mem)
2776 return ret;
2777
2778
2779
2780
2781
2782
2783 __mem_cgroup_commit_charge_lrucare(page, mem,
2784 MEM_CGROUP_CHARGE_TYPE_CACHE);
2785 return ret;
2786 }
2787
2788 if (PageSwapCache(page)) {
2789 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2790 if (!ret)
2791 __mem_cgroup_commit_charge_swapin(page, mem,
2792 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2793 } else
2794 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2795 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2796
2797 return ret;
2798}
2799
2800
2801
2802
2803
2804
2805
/*
 * Take a one-page charge for a page being swapped in.
 *
 * @mm:   mm to charge when the page's previous owner cannot be used
 * @page: page being swapped in
 * @mask: allocation flags for the charge
 * @ptr:  out: the group that was charged, or NULL
 *
 * With swap accounting enabled and the page in the swap cache, the group
 * found by try_get_mem_cgroup_from_page() is charged; otherwise the group
 * derived from @mm is.  Returns the result of __mem_cgroup_try_charge(),
 * or 0 when the controller is disabled.
 */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	int ret;

	*ptr = NULL;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;

	/* Without a swap-cache page there is no recorded owner to look up. */
	if (!PageSwapCache(page))
		goto charge_cur_mm;
	mem = try_get_mem_cgroup_from_page(page);
	if (!mem)
		goto charge_cur_mm;
	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
	/* Drop the reference taken by try_get_mem_cgroup_from_page(). */
	css_put(&mem->css);
	return ret;
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
}
2840
/*
 * Commit a swap-in charge of @ptr to @page and, with swap accounting, release
 * the swap-side accounting recorded for the page's swap entry.
 *
 * Does nothing when the controller is disabled or @ptr is NULL.
 */
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype)
{
	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	/* Paired with cgroup_release_and_wakeup_rmdir() at the end. */
	cgroup_exclude_rmdir(&ptr->css);

	__mem_cgroup_commit_charge_lrucare(page, ptr, ctype);

	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		unsigned short id;
		struct mem_cgroup *memcg;

		/* Reset the entry's record to 0 and fetch the previous id. */
		id = swap_cgroup_record(ent, 0);
		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);
		if (memcg) {
			/*
			 * Give back the memsw charge of the recorded group
			 * (skipped for the root group) and drop the reference
			 * that was held for it.
			 */
			if (!mem_cgroup_is_root(memcg))
				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_swap_statistics(memcg, false);
			mem_cgroup_put(memcg);
		}
		rcu_read_unlock();
	}

	cgroup_release_and_wakeup_rmdir(&ptr->css);
}
2886
/* Commit a swap-in charge of @ptr to @page as MEM_CGROUP_CHARGE_TYPE_MAPPED. */
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	__mem_cgroup_commit_charge_swapin(page, ptr,
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
2892
/*
 * Cancel a one-page swap-in charge taken from @mem.
 * Does nothing when the controller is disabled or @mem is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled() || !mem)
		return;

	__mem_cgroup_cancel_charge(mem, 1);
}
2901
2902static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2903 unsigned int nr_pages,
2904 const enum charge_type ctype)
2905{
2906 struct memcg_batch_info *batch = NULL;
2907 bool uncharge_memsw = true;
2908
2909
2910 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2911 uncharge_memsw = false;
2912
2913 batch = ¤t->memcg_batch;
2914
2915
2916
2917
2918
2919 if (!batch->memcg)
2920 batch->memcg = mem;
2921
2922
2923
2924
2925
2926
2927
2928
2929 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2930 goto direct_uncharge;
2931
2932 if (nr_pages > 1)
2933 goto direct_uncharge;
2934
2935
2936
2937
2938
2939
2940 if (batch->memcg != mem)
2941 goto direct_uncharge;
2942
2943 batch->nr_pages++;
2944 if (uncharge_memsw)
2945 batch->memsw_nr_pages++;
2946 return;
2947direct_uncharge:
2948 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2949 if (uncharge_memsw)
2950 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2951 if (unlikely(batch->memcg != mem))
2952 memcg_oom_recover(mem);
2953 return;
2954}
2955
2956
2957
2958
/*
 * Uncharge @page from the group it is accounted to.
 *
 * @page:  page to uncharge
 * @ctype: reason for the uncharge; decides which page states block it
 *
 * Returns the group that was uncharged, or NULL when nothing was done
 * (controller disabled, swap-cache page, page not charged, or the page state
 * forbids an uncharge of this type).
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct mem_cgroup *mem = NULL;
	unsigned int nr_pages = 1;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return NULL;

	/* Swap-cache pages are not uncharged through this path. */
	if (PageSwapCache(page))
		return NULL;

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON(!PageTransHuge(page));
	}

	/* Unlocked pre-check; repeated below under the lock. */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	mem = pc->mem_cgroup;

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
	case MEM_CGROUP_CHARGE_TYPE_DROP:
		/* Keep the charge while the page is mapped or migrating. */
		if (page_mapped(page) || PageCgroupMigration(pc))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {
			/* Non-anon: keep it if it has a non-file-cache mapping. */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page))
			/* Anon: keep the charge while still mapped. */
			goto unlock_out;
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);

	ClearPageCgroupUsed(pc);

	unlock_page_cgroup(pc);

	memcg_check_events(mem, page);
	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		/* Account the swap-out and take a reference on the group. */
		mem_cgroup_swap_statistics(mem, true);
		mem_cgroup_get(mem);
	}
	/* The root group's resource counters are not uncharged. */
	if (!mem_cgroup_is_root(mem))
		mem_cgroup_do_uncharge(mem, nr_pages, ctype);

	return mem;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}
3037
3038void mem_cgroup_uncharge_page(struct page *page)
3039{
3040
3041 if (page_mapped(page))
3042 return;
3043 if (page->mapping && !PageAnon(page))
3044 return;
3045 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3046}
3047
/*
 * Uncharge @page as a CACHE-type charge.  The caller must pass a page that
 * is neither mapped nor attached to a mapping any more; both conditions are
 * asserted with VM_BUG_ON.
 */
void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063void mem_cgroup_uncharge_start(void)
3064{
3065 current->memcg_batch.do_batch++;
3066
3067 if (current->memcg_batch.do_batch == 1) {
3068 current->memcg_batch.memcg = NULL;
3069 current->memcg_batch.nr_pages = 0;
3070 current->memcg_batch.memsw_nr_pages = 0;
3071 }
3072}
3073
3074void mem_cgroup_uncharge_end(void)
3075{
3076 struct memcg_batch_info *batch = ¤t->memcg_batch;
3077
3078 if (!batch->do_batch)
3079 return;
3080
3081 batch->do_batch--;
3082 if (batch->do_batch)
3083 return;
3084
3085 if (!batch->memcg)
3086 return;
3087
3088
3089
3090
3091 if (batch->nr_pages)
3092 res_counter_uncharge(&batch->memcg->res,
3093 batch->nr_pages * PAGE_SIZE);
3094 if (batch->memsw_nr_pages)
3095 res_counter_uncharge(&batch->memcg->memsw,
3096 batch->memsw_nr_pages * PAGE_SIZE);
3097 memcg_oom_recover(batch->memcg);
3098
3099 batch->memcg = NULL;
3100}
3101
3102#ifdef CONFIG_SWAP
3103
3104
3105
3106
3107void
3108mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3109{
3110 struct mem_cgroup *memcg;
3111 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3112
3113 if (!swapout)
3114 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3115
3116 memcg = __mem_cgroup_uncharge_common(page, ctype);
3117
3118
3119
3120
3121
3122 if (do_swap_account && swapout && memcg)
3123 swap_cgroup_record(ent, css_id(&memcg->css));
3124}
3125#endif
3126
3127#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3128
3129
3130
3131
3132void mem_cgroup_uncharge_swap(swp_entry_t ent)
3133{
3134 struct mem_cgroup *memcg;
3135 unsigned short id;
3136
3137 if (!do_swap_account)
3138 return;
3139
3140 id = swap_cgroup_record(ent, 0);
3141 rcu_read_lock();
3142 memcg = mem_cgroup_lookup(id);
3143 if (memcg) {
3144
3145
3146
3147
3148 if (!mem_cgroup_is_root(memcg))
3149 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3150 mem_cgroup_swap_statistics(memcg, false);
3151 mem_cgroup_put(memcg);
3152 }
3153 rcu_read_unlock();
3154}
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171static int mem_cgroup_move_swap_account(swp_entry_t entry,
3172 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3173{
3174 unsigned short old_id, new_id;
3175
3176 old_id = css_id(&from->css);
3177 new_id = css_id(&to->css);
3178
3179 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3180 mem_cgroup_swap_statistics(from, false);
3181 mem_cgroup_swap_statistics(to, true);
3182
3183
3184
3185
3186
3187
3188
3189
3190 mem_cgroup_get(to);
3191 if (need_fixup) {
3192 if (!mem_cgroup_is_root(from))
3193 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3194 mem_cgroup_put(from);
3195
3196
3197
3198
3199 if (!mem_cgroup_is_root(to))
3200 res_counter_uncharge(&to->res, PAGE_SIZE);
3201 }
3202 return 0;
3203 }
3204 return -EINVAL;
3205}
3206#else
/*
 * Stub used when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set: swap
 * accounting cannot be moved, so always report -EINVAL.
 */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
	return -EINVAL;
}
3212#endif
3213
3214
3215
3216
3217
3218int mem_cgroup_prepare_migration(struct page *page,
3219 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3220{
3221 struct mem_cgroup *mem = NULL;
3222 struct page_cgroup *pc;
3223 enum charge_type ctype;
3224 int ret = 0;
3225
3226 *ptr = NULL;
3227
3228 VM_BUG_ON(PageTransHuge(page));
3229 if (mem_cgroup_disabled())
3230 return 0;
3231
3232 pc = lookup_page_cgroup(page);
3233 lock_page_cgroup(pc);
3234 if (PageCgroupUsed(pc)) {
3235 mem = pc->mem_cgroup;
3236 css_get(&mem->css);
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266 if (PageAnon(page))
3267 SetPageCgroupMigration(pc);
3268 }
3269 unlock_page_cgroup(pc);
3270
3271
3272
3273
3274 if (!mem)
3275 return 0;
3276
3277 *ptr = mem;
3278 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3279 css_put(&mem->css);
3280 if (ret || *ptr == NULL) {
3281 if (PageAnon(page)) {
3282 lock_page_cgroup(pc);
3283 ClearPageCgroupMigration(pc);
3284 unlock_page_cgroup(pc);
3285
3286
3287
3288 mem_cgroup_uncharge_page(page);
3289 }
3290 return -ENOMEM;
3291 }
3292
3293
3294
3295
3296
3297
3298 pc = lookup_page_cgroup(newpage);
3299 if (PageAnon(page))
3300 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3301 else if (page_is_file_cache(page))
3302 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3303 else
3304 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3305 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3306 return ret;
3307}
3308
3309
3310void mem_cgroup_end_migration(struct mem_cgroup *mem,
3311 struct page *oldpage, struct page *newpage, bool migration_ok)
3312{
3313 struct page *used, *unused;
3314 struct page_cgroup *pc;
3315
3316 if (!mem)
3317 return;
3318
3319 cgroup_exclude_rmdir(&mem->css);
3320 if (!migration_ok) {
3321 used = oldpage;
3322 unused = newpage;
3323 } else {
3324 used = newpage;
3325 unused = oldpage;
3326 }
3327
3328
3329
3330
3331
3332 pc = lookup_page_cgroup(oldpage);
3333 lock_page_cgroup(pc);
3334 ClearPageCgroupMigration(pc);
3335 unlock_page_cgroup(pc);
3336
3337 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347 if (PageAnon(used))
3348 mem_cgroup_uncharge_page(used);
3349
3350
3351
3352
3353
3354
3355 cgroup_release_and_wakeup_rmdir(&mem->css);
3356}
3357
3358#ifdef CONFIG_DEBUG_VM
3359static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3360{
3361 struct page_cgroup *pc;
3362
3363 pc = lookup_page_cgroup(page);
3364 if (likely(pc) && PageCgroupUsed(pc))
3365 return pc;
3366 return NULL;
3367}
3368
3369bool mem_cgroup_bad_page_check(struct page *page)
3370{
3371 if (mem_cgroup_disabled())
3372 return false;
3373
3374 return lookup_page_cgroup_used(page) != NULL;
3375}
3376
3377void mem_cgroup_print_bad_page(struct page *page)
3378{
3379 struct page_cgroup *pc;
3380
3381 pc = lookup_page_cgroup_used(page);
3382 if (pc) {
3383 int ret = -1;
3384 char *path;
3385
3386 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3387 pc, pc->flags, pc->mem_cgroup);
3388
3389 path = kmalloc(PATH_MAX, GFP_KERNEL);
3390 if (path) {
3391 rcu_read_lock();
3392 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3393 path, PATH_MAX);
3394 rcu_read_unlock();
3395 }
3396
3397 printk(KERN_CONT "(%s)\n",
3398 (ret < 0) ? "cannot get the path" : path);
3399 kfree(path);
3400 }
3401}
3402#endif
3403
3404static DEFINE_MUTEX(set_limit_mutex);
3405
3406static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3407 unsigned long long val)
3408{
3409 int retry_count;
3410 u64 memswlimit, memlimit;
3411 int ret = 0;
3412 int children = mem_cgroup_count_children(memcg);
3413 u64 curusage, oldusage;
3414 int enlarge;
3415
3416
3417
3418
3419
3420
3421 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3422
3423 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3424
3425 enlarge = 0;
3426 while (retry_count) {
3427 if (signal_pending(current)) {
3428 ret = -EINTR;
3429 break;
3430 }
3431
3432
3433
3434
3435
3436 mutex_lock(&set_limit_mutex);
3437 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3438 if (memswlimit < val) {
3439 ret = -EINVAL;
3440 mutex_unlock(&set_limit_mutex);
3441 break;
3442 }
3443
3444 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3445 if (memlimit < val)
3446 enlarge = 1;
3447
3448 ret = res_counter_set_limit(&memcg->res, val);
3449 if (!ret) {
3450 if (memswlimit == val)
3451 memcg->memsw_is_minimum = true;
3452 else
3453 memcg->memsw_is_minimum = false;
3454 }
3455 mutex_unlock(&set_limit_mutex);
3456
3457 if (!ret)
3458 break;
3459
3460 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3461 MEM_CGROUP_RECLAIM_SHRINK,
3462 NULL);
3463 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3464
3465 if (curusage >= oldusage)
3466 retry_count--;
3467 else
3468 oldusage = curusage;
3469 }
3470 if (!ret && enlarge)
3471 memcg_oom_recover(memcg);
3472
3473 return ret;
3474}
3475
3476static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3477 unsigned long long val)
3478{
3479 int retry_count;
3480 u64 memlimit, memswlimit, oldusage, curusage;
3481 int children = mem_cgroup_count_children(memcg);
3482 int ret = -EBUSY;
3483 int enlarge = 0;
3484
3485
3486 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3487 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3488 while (retry_count) {
3489 if (signal_pending(current)) {
3490 ret = -EINTR;
3491 break;
3492 }
3493
3494
3495
3496
3497
3498 mutex_lock(&set_limit_mutex);
3499 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3500 if (memlimit > val) {
3501 ret = -EINVAL;
3502 mutex_unlock(&set_limit_mutex);
3503 break;
3504 }
3505 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3506 if (memswlimit < val)
3507 enlarge = 1;
3508 ret = res_counter_set_limit(&memcg->memsw, val);
3509 if (!ret) {
3510 if (memlimit == val)
3511 memcg->memsw_is_minimum = true;
3512 else
3513 memcg->memsw_is_minimum = false;
3514 }
3515 mutex_unlock(&set_limit_mutex);
3516
3517 if (!ret)
3518 break;
3519
3520 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3521 MEM_CGROUP_RECLAIM_NOSWAP |
3522 MEM_CGROUP_RECLAIM_SHRINK,
3523 NULL);
3524 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3525
3526 if (curusage >= oldusage)
3527 retry_count--;
3528 else
3529 oldusage = curusage;
3530 }
3531 if (!ret && enlarge)
3532 memcg_oom_recover(memcg);
3533 return ret;
3534}
3535
3536unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3537 gfp_t gfp_mask,
3538 unsigned long *total_scanned)
3539{
3540 unsigned long nr_reclaimed = 0;
3541 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3542 unsigned long reclaimed;
3543 int loop = 0;
3544 struct mem_cgroup_tree_per_zone *mctz;
3545 unsigned long long excess;
3546 unsigned long nr_scanned;
3547
3548 if (order > 0)
3549 return 0;
3550
3551 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3552
3553
3554
3555
3556
3557 do {
3558 if (next_mz)
3559 mz = next_mz;
3560 else
3561 mz = mem_cgroup_largest_soft_limit_node(mctz);
3562 if (!mz)
3563 break;
3564
3565 nr_scanned = 0;
3566 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3567 gfp_mask,
3568 MEM_CGROUP_RECLAIM_SOFT,
3569 &nr_scanned);
3570 nr_reclaimed += reclaimed;
3571 *total_scanned += nr_scanned;
3572 spin_lock(&mctz->lock);
3573
3574
3575
3576
3577
3578 next_mz = NULL;
3579 if (!reclaimed) {
3580 do {
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592 next_mz =
3593 __mem_cgroup_largest_soft_limit_node(mctz);
3594 if (next_mz == mz)
3595 css_put(&next_mz->mem->css);
3596 else
3597 break;
3598 } while (1);
3599 }
3600 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3601 excess = res_counter_soft_limit_excess(&mz->mem->res);
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3612 spin_unlock(&mctz->lock);
3613 css_put(&mz->mem->css);
3614 loop++;
3615
3616
3617
3618
3619
3620 if (!nr_reclaimed &&
3621 (next_mz == NULL ||
3622 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3623 break;
3624 } while (!nr_reclaimed);
3625 if (next_mz)
3626 css_put(&next_mz->mem->css);
3627 return nr_reclaimed;
3628}
3629
3630
3631
3632
3633
3634static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3635 int node, int zid, enum lru_list lru)
3636{
3637 struct zone *zone;
3638 struct mem_cgroup_per_zone *mz;
3639 struct page_cgroup *pc, *busy;
3640 unsigned long flags, loop;
3641 struct list_head *list;
3642 int ret = 0;
3643
3644 zone = &NODE_DATA(node)->node_zones[zid];
3645 mz = mem_cgroup_zoneinfo(mem, node, zid);
3646 list = &mz->lists[lru];
3647
3648 loop = MEM_CGROUP_ZSTAT(mz, lru);
3649
3650 loop += 256;
3651 busy = NULL;
3652 while (loop--) {
3653 struct page *page;
3654
3655 ret = 0;
3656 spin_lock_irqsave(&zone->lru_lock, flags);
3657 if (list_empty(list)) {
3658 spin_unlock_irqrestore(&zone->lru_lock, flags);
3659 break;
3660 }
3661 pc = list_entry(list->prev, struct page_cgroup, lru);
3662 if (busy == pc) {
3663 list_move(&pc->lru, list);
3664 busy = NULL;
3665 spin_unlock_irqrestore(&zone->lru_lock, flags);
3666 continue;
3667 }
3668 spin_unlock_irqrestore(&zone->lru_lock, flags);
3669
3670 page = lookup_cgroup_page(pc);
3671
3672 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3673 if (ret == -ENOMEM)
3674 break;
3675
3676 if (ret == -EBUSY || ret == -EINVAL) {
3677
3678 busy = pc;
3679 cond_resched();
3680 } else
3681 busy = NULL;
3682 }
3683
3684 if (!ret && !list_empty(list))
3685 return -EBUSY;
3686 return ret;
3687}
3688
3689
3690
3691
3692
3693static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3694{
3695 int ret;
3696 int node, zid, shrink;
3697 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3698 struct cgroup *cgrp = mem->css.cgroup;
3699
3700 css_get(&mem->css);
3701
3702 shrink = 0;
3703
3704 if (free_all)
3705 goto try_to_free;
3706move_account:
3707 do {
3708 ret = -EBUSY;
3709 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3710 goto out;
3711 ret = -EINTR;
3712 if (signal_pending(current))
3713 goto out;
3714
3715 lru_add_drain_all();
3716 drain_all_stock_sync(mem);
3717 ret = 0;
3718 mem_cgroup_start_move(mem);
3719 for_each_node_state(node, N_HIGH_MEMORY) {
3720 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3721 enum lru_list l;
3722 for_each_lru(l) {
3723 ret = mem_cgroup_force_empty_list(mem,
3724 node, zid, l);
3725 if (ret)
3726 break;
3727 }
3728 }
3729 if (ret)
3730 break;
3731 }
3732 mem_cgroup_end_move(mem);
3733 memcg_oom_recover(mem);
3734
3735 if (ret == -ENOMEM)
3736 goto try_to_free;
3737 cond_resched();
3738
3739 } while (mem->res.usage > 0 || ret);
3740out:
3741 css_put(&mem->css);
3742 return ret;
3743
3744try_to_free:
3745
3746 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3747 ret = -EBUSY;
3748 goto out;
3749 }
3750
3751 lru_add_drain_all();
3752
3753 shrink = 1;
3754 while (nr_retries && mem->res.usage > 0) {
3755 int progress;
3756
3757 if (signal_pending(current)) {
3758 ret = -EINTR;
3759 goto out;
3760 }
3761 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3762 false);
3763 if (!progress) {
3764 nr_retries--;
3765
3766 congestion_wait(BLK_RW_ASYNC, HZ/10);
3767 }
3768
3769 }
3770 lru_add_drain();
3771
3772 goto move_account;
3773}
3774
3775int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3776{
3777 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3778}
3779
3780
3781static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3782{
3783 return mem_cgroup_from_cont(cont)->use_hierarchy;
3784}
3785
3786static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3787 u64 val)
3788{
3789 int retval = 0;
3790 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3791 struct cgroup *parent = cont->parent;
3792 struct mem_cgroup *parent_mem = NULL;
3793
3794 if (parent)
3795 parent_mem = mem_cgroup_from_cont(parent);
3796
3797 cgroup_lock();
3798
3799
3800
3801
3802
3803
3804
3805
3806 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3807 (val == 1 || val == 0)) {
3808 if (list_empty(&cont->children))
3809 mem->use_hierarchy = val;
3810 else
3811 retval = -EBUSY;
3812 } else
3813 retval = -EINVAL;
3814 cgroup_unlock();
3815
3816 return retval;
3817}
3818
3819
3820static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3821 enum mem_cgroup_stat_index idx)
3822{
3823 struct mem_cgroup *iter;
3824 long val = 0;
3825
3826
3827 for_each_mem_cgroup_tree(iter, mem)
3828 val += mem_cgroup_read_stat(iter, idx);
3829
3830 if (val < 0)
3831 val = 0;
3832 return val;
3833}
3834
3835static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3836{
3837 u64 val;
3838
3839 if (!mem_cgroup_is_root(mem)) {
3840 if (!swap)
3841 return res_counter_read_u64(&mem->res, RES_USAGE);
3842 else
3843 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3844 }
3845
3846 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3847 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3848
3849 if (swap)
3850 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3851
3852 return val << PAGE_SHIFT;
3853}
3854
3855static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3856{
3857 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3858 u64 val;
3859 int type, name;
3860
3861 type = MEMFILE_TYPE(cft->private);
3862 name = MEMFILE_ATTR(cft->private);
3863 switch (type) {
3864 case _MEM:
3865 if (name == RES_USAGE)
3866 val = mem_cgroup_usage(mem, false);
3867 else
3868 val = res_counter_read_u64(&mem->res, name);
3869 break;
3870 case _MEMSWAP:
3871 if (name == RES_USAGE)
3872 val = mem_cgroup_usage(mem, true);
3873 else
3874 val = res_counter_read_u64(&mem->memsw, name);
3875 break;
3876 default:
3877 BUG();
3878 break;
3879 }
3880 return val;
3881}
3882
3883
3884
3885
3886static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3887 const char *buffer)
3888{
3889 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3890 int type, name;
3891 unsigned long long val;
3892 int ret;
3893
3894 type = MEMFILE_TYPE(cft->private);
3895 name = MEMFILE_ATTR(cft->private);
3896 switch (name) {
3897 case RES_LIMIT:
3898 if (mem_cgroup_is_root(memcg)) {
3899 ret = -EINVAL;
3900 break;
3901 }
3902
3903 ret = res_counter_memparse_write_strategy(buffer, &val);
3904 if (ret)
3905 break;
3906 if (type == _MEM)
3907 ret = mem_cgroup_resize_limit(memcg, val);
3908 else
3909 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3910 break;
3911 case RES_SOFT_LIMIT:
3912 ret = res_counter_memparse_write_strategy(buffer, &val);
3913 if (ret)
3914 break;
3915
3916
3917
3918
3919
3920 if (type == _MEM)
3921 ret = res_counter_set_soft_limit(&memcg->res, val);
3922 else
3923 ret = -EINVAL;
3924 break;
3925 default:
3926 ret = -EINVAL;
3927 break;
3928 }
3929 return ret;
3930}
3931
3932static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3933 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3934{
3935 struct cgroup *cgroup;
3936 unsigned long long min_limit, min_memsw_limit, tmp;
3937
3938 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3939 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3940 cgroup = memcg->css.cgroup;
3941 if (!memcg->use_hierarchy)
3942 goto out;
3943
3944 while (cgroup->parent) {
3945 cgroup = cgroup->parent;
3946 memcg = mem_cgroup_from_cont(cgroup);
3947 if (!memcg->use_hierarchy)
3948 break;
3949 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3950 min_limit = min(min_limit, tmp);
3951 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3952 min_memsw_limit = min(min_memsw_limit, tmp);
3953 }
3954out:
3955 *mem_limit = min_limit;
3956 *memsw_limit = min_memsw_limit;
3957 return;
3958}
3959
3960static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3961{
3962 struct mem_cgroup *mem;
3963 int type, name;
3964
3965 mem = mem_cgroup_from_cont(cont);
3966 type = MEMFILE_TYPE(event);
3967 name = MEMFILE_ATTR(event);
3968 switch (name) {
3969 case RES_MAX_USAGE:
3970 if (type == _MEM)
3971 res_counter_reset_max(&mem->res);
3972 else
3973 res_counter_reset_max(&mem->memsw);
3974 break;
3975 case RES_FAILCNT:
3976 if (type == _MEM)
3977 res_counter_reset_failcnt(&mem->res);
3978 else
3979 res_counter_reset_failcnt(&mem->memsw);
3980 break;
3981 }
3982
3983 return 0;
3984}
3985
3986static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3987 struct cftype *cft)
3988{
3989 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3990}
3991
3992#ifdef CONFIG_MMU
/*
 * Write handler for move_charge_at_immigrate (CONFIG_MMU variant).
 * @val is a bitmask; any bit at or above NR_MOVE_TYPE is rejected with
 * -EINVAL.  The new value is stored under cgroup_lock().
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);

	if (val >= (1 << NR_MOVE_TYPE))
		return -EINVAL;

	/* Update under cgroup_lock(). */
	cgroup_lock();
	mem->move_charge_at_immigrate = val;
	cgroup_unlock();

	return 0;
}
4011#else
/* Stub for !CONFIG_MMU builds: moving charges is not supported. */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
4017#endif
4018
4019
4020
/*
 * Indices into struct mcs_total_stat and memcg_stat_strings[].  The order
 * here must match the order of the name table.
 */
enum {
	MCS_CACHE,
	MCS_RSS,
	MCS_FILE_MAPPED,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	MCS_PGFAULT,
	MCS_PGMAJFAULT,
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	NR_MCS_STAT,	/* number of entries, not a statistic */
};
4037
/* One accumulator per MCS_* statistic. */
struct mcs_total_stat {
	s64 stat[NR_MCS_STAT];
};
4041
4042struct {
4043 char *local_name;
4044 char *total_name;
4045} memcg_stat_strings[NR_MCS_STAT] = {
4046 {"cache", "total_cache"},
4047 {"rss", "total_rss"},
4048 {"mapped_file", "total_mapped_file"},
4049 {"pgpgin", "total_pgpgin"},
4050 {"pgpgout", "total_pgpgout"},
4051 {"swap", "total_swap"},
4052 {"pgfault", "total_pgfault"},
4053 {"pgmajfault", "total_pgmajfault"},
4054 {"inactive_anon", "total_inactive_anon"},
4055 {"active_anon", "total_active_anon"},
4056 {"inactive_file", "total_inactive_file"},
4057 {"active_file", "total_active_file"},
4058 {"unevictable", "total_unevictable"}
4059};
4060
4061
4062static void
4063mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4064{
4065 s64 val;
4066
4067
4068 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
4069 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4070 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
4071 s->stat[MCS_RSS] += val * PAGE_SIZE;
4072 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
4073 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4074 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
4075 s->stat[MCS_PGPGIN] += val;
4076 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
4077 s->stat[MCS_PGPGOUT] += val;
4078 if (do_swap_account) {
4079 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4080 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4081 }
4082 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4083 s->stat[MCS_PGFAULT] += val;
4084 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4085 s->stat[MCS_PGMAJFAULT] += val;
4086
4087
4088 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4089 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4090 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4091 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4092 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4093 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4094 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4095 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4096 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4097 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4098}
4099
4100static void
4101mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4102{
4103 struct mem_cgroup *iter;
4104
4105 for_each_mem_cgroup_tree(iter, mem)
4106 mem_cgroup_get_local_stat(iter, s);
4107}
4108
4109#ifdef CONFIG_NUMA
4110static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4111{
4112 int nid;
4113 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4114 unsigned long node_nr;
4115 struct cgroup *cont = m->private;
4116 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4117
4118 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4119 seq_printf(m, "total=%lu", total_nr);
4120 for_each_node_state(nid, N_HIGH_MEMORY) {
4121 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4122 seq_printf(m, " N%d=%lu", nid, node_nr);
4123 }
4124 seq_putc(m, '\n');
4125
4126 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4127 seq_printf(m, "file=%lu", file_nr);
4128 for_each_node_state(nid, N_HIGH_MEMORY) {
4129 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4130 LRU_ALL_FILE);
4131 seq_printf(m, " N%d=%lu", nid, node_nr);
4132 }
4133 seq_putc(m, '\n');
4134
4135 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4136 seq_printf(m, "anon=%lu", anon_nr);
4137 for_each_node_state(nid, N_HIGH_MEMORY) {
4138 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4139 LRU_ALL_ANON);
4140 seq_printf(m, " N%d=%lu", nid, node_nr);
4141 }
4142 seq_putc(m, '\n');
4143
4144 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4145 seq_printf(m, "unevictable=%lu", unevictable_nr);
4146 for_each_node_state(nid, N_HIGH_MEMORY) {
4147 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4148 BIT(LRU_UNEVICTABLE));
4149 seq_printf(m, " N%d=%lu", nid, node_nr);
4150 }
4151 seq_putc(m, '\n');
4152 return 0;
4153}
4154#endif
4155
4156static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4157 struct cgroup_map_cb *cb)
4158{
4159 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4160 struct mcs_total_stat mystat;
4161 int i;
4162
4163 memset(&mystat, 0, sizeof(mystat));
4164 mem_cgroup_get_local_stat(mem_cont, &mystat);
4165
4166
4167 for (i = 0; i < NR_MCS_STAT; i++) {
4168 if (i == MCS_SWAP && !do_swap_account)
4169 continue;
4170 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4171 }
4172
4173
4174 {
4175 unsigned long long limit, memsw_limit;
4176 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4177 cb->fill(cb, "hierarchical_memory_limit", limit);
4178 if (do_swap_account)
4179 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4180 }
4181
4182 memset(&mystat, 0, sizeof(mystat));
4183 mem_cgroup_get_total_stat(mem_cont, &mystat);
4184 for (i = 0; i < NR_MCS_STAT; i++) {
4185 if (i == MCS_SWAP && !do_swap_account)
4186 continue;
4187 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4188 }
4189
4190#ifdef CONFIG_DEBUG_VM
4191 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4192
4193 {
4194 int nid, zid;
4195 struct mem_cgroup_per_zone *mz;
4196 unsigned long recent_rotated[2] = {0, 0};
4197 unsigned long recent_scanned[2] = {0, 0};
4198
4199 for_each_online_node(nid)
4200 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4201 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4202
4203 recent_rotated[0] +=
4204 mz->reclaim_stat.recent_rotated[0];
4205 recent_rotated[1] +=
4206 mz->reclaim_stat.recent_rotated[1];
4207 recent_scanned[0] +=
4208 mz->reclaim_stat.recent_scanned[0];
4209 recent_scanned[1] +=
4210 mz->reclaim_stat.recent_scanned[1];
4211 }
4212 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4213 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4214 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4215 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4216 }
4217#endif
4218
4219 return 0;
4220}
4221
4222static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4223{
4224 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4225
4226 return mem_cgroup_swappiness(memcg);
4227}
4228
4229static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4230 u64 val)
4231{
4232 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4233 struct mem_cgroup *parent;
4234
4235 if (val > 100)
4236 return -EINVAL;
4237
4238 if (cgrp->parent == NULL)
4239 return -EINVAL;
4240
4241 parent = mem_cgroup_from_cont(cgrp->parent);
4242
4243 cgroup_lock();
4244
4245
4246 if ((parent->use_hierarchy) ||
4247 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4248 cgroup_unlock();
4249 return -EINVAL;
4250 }
4251
4252 memcg->swappiness = val;
4253
4254 cgroup_unlock();
4255
4256 return 0;
4257}
4258
4259static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4260{
4261 struct mem_cgroup_threshold_ary *t;
4262 u64 usage;
4263 int i;
4264
4265 rcu_read_lock();
4266 if (!swap)
4267 t = rcu_dereference(memcg->thresholds.primary);
4268 else
4269 t = rcu_dereference(memcg->memsw_thresholds.primary);
4270
4271 if (!t)
4272 goto unlock;
4273
4274 usage = mem_cgroup_usage(memcg, swap);
4275
4276
4277
4278
4279
4280
4281 i = t->current_threshold;
4282
4283
4284
4285
4286
4287
4288
4289 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4290 eventfd_signal(t->entries[i].eventfd, 1);
4291
4292
4293 i++;
4294
4295
4296
4297
4298
4299
4300
4301 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4302 eventfd_signal(t->entries[i].eventfd, 1);
4303
4304
4305 t->current_threshold = i - 1;
4306unlock:
4307 rcu_read_unlock();
4308}
4309
4310static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4311{
4312 while (memcg) {
4313 __mem_cgroup_threshold(memcg, false);
4314 if (do_swap_account)
4315 __mem_cgroup_threshold(memcg, true);
4316
4317 memcg = parent_mem_cgroup(memcg);
4318 }
4319}
4320
4321static int compare_thresholds(const void *a, const void *b)
4322{
4323 const struct mem_cgroup_threshold *_a = a;
4324 const struct mem_cgroup_threshold *_b = b;
4325
4326 return _a->threshold - _b->threshold;
4327}
4328
4329static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4330{
4331 struct mem_cgroup_eventfd_list *ev;
4332
4333 list_for_each_entry(ev, &mem->oom_notify, list)
4334 eventfd_signal(ev->eventfd, 1);
4335 return 0;
4336}
4337
4338static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4339{
4340 struct mem_cgroup *iter;
4341
4342 for_each_mem_cgroup_tree(iter, mem)
4343 mem_cgroup_oom_notify_cb(iter);
4344}
4345
4346static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4347 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4348{
4349 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4350 struct mem_cgroup_thresholds *thresholds;
4351 struct mem_cgroup_threshold_ary *new;
4352 int type = MEMFILE_TYPE(cft->private);
4353 u64 threshold, usage;
4354 int i, size, ret;
4355
4356 ret = res_counter_memparse_write_strategy(args, &threshold);
4357 if (ret)
4358 return ret;
4359
4360 mutex_lock(&memcg->thresholds_lock);
4361
4362 if (type == _MEM)
4363 thresholds = &memcg->thresholds;
4364 else if (type == _MEMSWAP)
4365 thresholds = &memcg->memsw_thresholds;
4366 else
4367 BUG();
4368
4369 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4370
4371
4372 if (thresholds->primary)
4373 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4374
4375 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4376
4377
4378 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4379 GFP_KERNEL);
4380 if (!new) {
4381 ret = -ENOMEM;
4382 goto unlock;
4383 }
4384 new->size = size;
4385
4386
4387 if (thresholds->primary) {
4388 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4389 sizeof(struct mem_cgroup_threshold));
4390 }
4391
4392
4393 new->entries[size - 1].eventfd = eventfd;
4394 new->entries[size - 1].threshold = threshold;
4395
4396
4397 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4398 compare_thresholds, NULL);
4399
4400
4401 new->current_threshold = -1;
4402 for (i = 0; i < size; i++) {
4403 if (new->entries[i].threshold < usage) {
4404
4405
4406
4407
4408
4409 ++new->current_threshold;
4410 }
4411 }
4412
4413
4414 kfree(thresholds->spare);
4415 thresholds->spare = thresholds->primary;
4416
4417 rcu_assign_pointer(thresholds->primary, new);
4418
4419
4420 synchronize_rcu();
4421
4422unlock:
4423 mutex_unlock(&memcg->thresholds_lock);
4424
4425 return ret;
4426}
4427
4428static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4429 struct cftype *cft, struct eventfd_ctx *eventfd)
4430{
4431 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4432 struct mem_cgroup_thresholds *thresholds;
4433 struct mem_cgroup_threshold_ary *new;
4434 int type = MEMFILE_TYPE(cft->private);
4435 u64 usage;
4436 int i, j, size;
4437
4438 mutex_lock(&memcg->thresholds_lock);
4439 if (type == _MEM)
4440 thresholds = &memcg->thresholds;
4441 else if (type == _MEMSWAP)
4442 thresholds = &memcg->memsw_thresholds;
4443 else
4444 BUG();
4445
4446
4447
4448
4449
4450 BUG_ON(!thresholds);
4451
4452 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4453
4454
4455 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4456
4457
4458 size = 0;
4459 for (i = 0; i < thresholds->primary->size; i++) {
4460 if (thresholds->primary->entries[i].eventfd != eventfd)
4461 size++;
4462 }
4463
4464 new = thresholds->spare;
4465
4466
4467 if (!size) {
4468 kfree(new);
4469 new = NULL;
4470 goto swap_buffers;
4471 }
4472
4473 new->size = size;
4474
4475
4476 new->current_threshold = -1;
4477 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4478 if (thresholds->primary->entries[i].eventfd == eventfd)
4479 continue;
4480
4481 new->entries[j] = thresholds->primary->entries[i];
4482 if (new->entries[j].threshold < usage) {
4483
4484
4485
4486
4487
4488 ++new->current_threshold;
4489 }
4490 j++;
4491 }
4492
4493swap_buffers:
4494
4495 thresholds->spare = thresholds->primary;
4496 rcu_assign_pointer(thresholds->primary, new);
4497
4498
4499 synchronize_rcu();
4500
4501 mutex_unlock(&memcg->thresholds_lock);
4502}
4503
4504static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4505 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4506{
4507 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4508 struct mem_cgroup_eventfd_list *event;
4509 int type = MEMFILE_TYPE(cft->private);
4510
4511 BUG_ON(type != _OOM_TYPE);
4512 event = kmalloc(sizeof(*event), GFP_KERNEL);
4513 if (!event)
4514 return -ENOMEM;
4515
4516 spin_lock(&memcg_oom_lock);
4517
4518 event->eventfd = eventfd;
4519 list_add(&event->list, &memcg->oom_notify);
4520
4521
4522 if (atomic_read(&memcg->under_oom))
4523 eventfd_signal(eventfd, 1);
4524 spin_unlock(&memcg_oom_lock);
4525
4526 return 0;
4527}
4528
4529static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4530 struct cftype *cft, struct eventfd_ctx *eventfd)
4531{
4532 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4533 struct mem_cgroup_eventfd_list *ev, *tmp;
4534 int type = MEMFILE_TYPE(cft->private);
4535
4536 BUG_ON(type != _OOM_TYPE);
4537
4538 spin_lock(&memcg_oom_lock);
4539
4540 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4541 if (ev->eventfd == eventfd) {
4542 list_del(&ev->list);
4543 kfree(ev);
4544 }
4545 }
4546
4547 spin_unlock(&memcg_oom_lock);
4548}
4549
4550static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4551 struct cftype *cft, struct cgroup_map_cb *cb)
4552{
4553 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4554
4555 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4556
4557 if (atomic_read(&mem->under_oom))
4558 cb->fill(cb, "under_oom", 1);
4559 else
4560 cb->fill(cb, "under_oom", 0);
4561 return 0;
4562}
4563
4564static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4565 struct cftype *cft, u64 val)
4566{
4567 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4568 struct mem_cgroup *parent;
4569
4570
4571 if (!cgrp->parent || !((val == 0) || (val == 1)))
4572 return -EINVAL;
4573
4574 parent = mem_cgroup_from_cont(cgrp->parent);
4575
4576 cgroup_lock();
4577
4578 if ((parent->use_hierarchy) ||
4579 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4580 cgroup_unlock();
4581 return -EINVAL;
4582 }
4583 mem->oom_kill_disable = val;
4584 if (!val)
4585 memcg_oom_recover(mem);
4586 cgroup_unlock();
4587 return 0;
4588}
4589
4590#ifdef CONFIG_NUMA
4591static const struct file_operations mem_control_numa_stat_file_operations = {
4592 .read = seq_read,
4593 .llseek = seq_lseek,
4594 .release = single_release,
4595};
4596
4597static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4598{
4599 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4600
4601 file->f_op = &mem_control_numa_stat_file_operations;
4602 return single_open(file, mem_control_numa_stat_show, cont);
4603}
4604#endif
4605
4606static struct cftype mem_cgroup_files[] = {
4607 {
4608 .name = "usage_in_bytes",
4609 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4610 .read_u64 = mem_cgroup_read,
4611 .register_event = mem_cgroup_usage_register_event,
4612 .unregister_event = mem_cgroup_usage_unregister_event,
4613 },
4614 {
4615 .name = "max_usage_in_bytes",
4616 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4617 .trigger = mem_cgroup_reset,
4618 .read_u64 = mem_cgroup_read,
4619 },
4620 {
4621 .name = "limit_in_bytes",
4622 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4623 .write_string = mem_cgroup_write,
4624 .read_u64 = mem_cgroup_read,
4625 },
4626 {
4627 .name = "soft_limit_in_bytes",
4628 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4629 .write_string = mem_cgroup_write,
4630 .read_u64 = mem_cgroup_read,
4631 },
4632 {
4633 .name = "failcnt",
4634 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4635 .trigger = mem_cgroup_reset,
4636 .read_u64 = mem_cgroup_read,
4637 },
4638 {
4639 .name = "stat",
4640 .read_map = mem_control_stat_show,
4641 },
4642 {
4643 .name = "force_empty",
4644 .trigger = mem_cgroup_force_empty_write,
4645 },
4646 {
4647 .name = "use_hierarchy",
4648 .write_u64 = mem_cgroup_hierarchy_write,
4649 .read_u64 = mem_cgroup_hierarchy_read,
4650 },
4651 {
4652 .name = "swappiness",
4653 .read_u64 = mem_cgroup_swappiness_read,
4654 .write_u64 = mem_cgroup_swappiness_write,
4655 },
4656 {
4657 .name = "move_charge_at_immigrate",
4658 .read_u64 = mem_cgroup_move_charge_read,
4659 .write_u64 = mem_cgroup_move_charge_write,
4660 },
4661 {
4662 .name = "oom_control",
4663 .read_map = mem_cgroup_oom_control_read,
4664 .write_u64 = mem_cgroup_oom_control_write,
4665 .register_event = mem_cgroup_oom_register_event,
4666 .unregister_event = mem_cgroup_oom_unregister_event,
4667 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4668 },
4669#ifdef CONFIG_NUMA
4670 {
4671 .name = "numa_stat",
4672 .open = mem_control_numa_stat_open,
4673 .mode = S_IRUGO,
4674 },
4675#endif
4676};
4677
4678#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4679static struct cftype memsw_cgroup_files[] = {
4680 {
4681 .name = "memsw.usage_in_bytes",
4682 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4683 .read_u64 = mem_cgroup_read,
4684 .register_event = mem_cgroup_usage_register_event,
4685 .unregister_event = mem_cgroup_usage_unregister_event,
4686 },
4687 {
4688 .name = "memsw.max_usage_in_bytes",
4689 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4690 .trigger = mem_cgroup_reset,
4691 .read_u64 = mem_cgroup_read,
4692 },
4693 {
4694 .name = "memsw.limit_in_bytes",
4695 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4696 .write_string = mem_cgroup_write,
4697 .read_u64 = mem_cgroup_read,
4698 },
4699 {
4700 .name = "memsw.failcnt",
4701 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4702 .trigger = mem_cgroup_reset,
4703 .read_u64 = mem_cgroup_read,
4704 },
4705};
4706
4707static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4708{
4709 if (!do_swap_account)
4710 return 0;
4711 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4712 ARRAY_SIZE(memsw_cgroup_files));
4713};
4714#else
4715static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4716{
4717 return 0;
4718}
4719#endif
4720
4721static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4722{
4723 struct mem_cgroup_per_node *pn;
4724 struct mem_cgroup_per_zone *mz;
4725 enum lru_list l;
4726 int zone, tmp = node;
4727
4728
4729
4730
4731
4732
4733
4734
4735 if (!node_state(node, N_NORMAL_MEMORY))
4736 tmp = -1;
4737 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4738 if (!pn)
4739 return 1;
4740
4741 mem->info.nodeinfo[node] = pn;
4742 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4743 mz = &pn->zoneinfo[zone];
4744 for_each_lru(l)
4745 INIT_LIST_HEAD(&mz->lists[l]);
4746 mz->usage_in_excess = 0;
4747 mz->on_tree = false;
4748 mz->mem = mem;
4749 }
4750 return 0;
4751}
4752
4753static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4754{
4755 kfree(mem->info.nodeinfo[node]);
4756}
4757
4758static struct mem_cgroup *mem_cgroup_alloc(void)
4759{
4760 struct mem_cgroup *mem;
4761 int size = sizeof(struct mem_cgroup);
4762
4763
4764 if (size < PAGE_SIZE)
4765 mem = kzalloc(size, GFP_KERNEL);
4766 else
4767 mem = vzalloc(size);
4768
4769 if (!mem)
4770 return NULL;
4771
4772 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4773 if (!mem->stat)
4774 goto out_free;
4775 spin_lock_init(&mem->pcp_counter_lock);
4776 return mem;
4777
4778out_free:
4779 if (size < PAGE_SIZE)
4780 kfree(mem);
4781 else
4782 vfree(mem);
4783 return NULL;
4784}
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797static void __mem_cgroup_free(struct mem_cgroup *mem)
4798{
4799 int node;
4800
4801 mem_cgroup_remove_from_trees(mem);
4802 free_css_id(&mem_cgroup_subsys, &mem->css);
4803
4804 for_each_node_state(node, N_POSSIBLE)
4805 free_mem_cgroup_per_zone_info(mem, node);
4806
4807 free_percpu(mem->stat);
4808 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4809 kfree(mem);
4810 else
4811 vfree(mem);
4812}
4813
4814static void mem_cgroup_get(struct mem_cgroup *mem)
4815{
4816 atomic_inc(&mem->refcnt);
4817}
4818
4819static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4820{
4821 if (atomic_sub_and_test(count, &mem->refcnt)) {
4822 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4823 __mem_cgroup_free(mem);
4824 if (parent)
4825 mem_cgroup_put(parent);
4826 }
4827}
4828
/* Drop a single reference on @mem; see __mem_cgroup_put(). */
static void mem_cgroup_put(struct mem_cgroup *mem)
{
	const int one_ref = 1;

	__mem_cgroup_put(mem, one_ref);
}
4833
4834
4835
4836
4837static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4838{
4839 if (!mem->res.parent)
4840 return NULL;
4841 return mem_cgroup_from_res_counter(mem->res.parent, res);
4842}
4843
4844#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4845static void __init enable_swap_cgroup(void)
4846{
4847 if (!mem_cgroup_disabled() && really_do_swap_account)
4848 do_swap_account = 1;
4849}
4850#else
4851static void __init enable_swap_cgroup(void)
4852{
4853}
4854#endif
4855
4856static int mem_cgroup_soft_limit_tree_init(void)
4857{
4858 struct mem_cgroup_tree_per_node *rtpn;
4859 struct mem_cgroup_tree_per_zone *rtpz;
4860 int tmp, node, zone;
4861
4862 for_each_node_state(node, N_POSSIBLE) {
4863 tmp = node;
4864 if (!node_state(node, N_NORMAL_MEMORY))
4865 tmp = -1;
4866 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4867 if (!rtpn)
4868 return 1;
4869
4870 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4871
4872 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4873 rtpz = &rtpn->rb_tree_per_zone[zone];
4874 rtpz->rb_root = RB_ROOT;
4875 spin_lock_init(&rtpz->lock);
4876 }
4877 }
4878 return 0;
4879}
4880
4881static struct cgroup_subsys_state * __ref
4882mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4883{
4884 struct mem_cgroup *mem, *parent;
4885 long error = -ENOMEM;
4886 int node;
4887
4888 mem = mem_cgroup_alloc();
4889 if (!mem)
4890 return ERR_PTR(error);
4891
4892 for_each_node_state(node, N_POSSIBLE)
4893 if (alloc_mem_cgroup_per_zone_info(mem, node))
4894 goto free_out;
4895
4896
4897 if (cont->parent == NULL) {
4898 int cpu;
4899 enable_swap_cgroup();
4900 parent = NULL;
4901 root_mem_cgroup = mem;
4902 if (mem_cgroup_soft_limit_tree_init())
4903 goto free_out;
4904 for_each_possible_cpu(cpu) {
4905 struct memcg_stock_pcp *stock =
4906 &per_cpu(memcg_stock, cpu);
4907 INIT_WORK(&stock->work, drain_local_stock);
4908 }
4909 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4910 } else {
4911 parent = mem_cgroup_from_cont(cont->parent);
4912 mem->use_hierarchy = parent->use_hierarchy;
4913 mem->oom_kill_disable = parent->oom_kill_disable;
4914 }
4915
4916 if (parent && parent->use_hierarchy) {
4917 res_counter_init(&mem->res, &parent->res);
4918 res_counter_init(&mem->memsw, &parent->memsw);
4919
4920
4921
4922
4923
4924
4925 mem_cgroup_get(parent);
4926 } else {
4927 res_counter_init(&mem->res, NULL);
4928 res_counter_init(&mem->memsw, NULL);
4929 }
4930 mem->last_scanned_child = 0;
4931 mem->last_scanned_node = MAX_NUMNODES;
4932 INIT_LIST_HEAD(&mem->oom_notify);
4933
4934 if (parent)
4935 mem->swappiness = mem_cgroup_swappiness(parent);
4936 atomic_set(&mem->refcnt, 1);
4937 mem->move_charge_at_immigrate = 0;
4938 mutex_init(&mem->thresholds_lock);
4939 return &mem->css;
4940free_out:
4941 __mem_cgroup_free(mem);
4942 root_mem_cgroup = NULL;
4943 return ERR_PTR(error);
4944}
4945
4946static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4947 struct cgroup *cont)
4948{
4949 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4950
4951 return mem_cgroup_force_empty(mem, false);
4952}
4953
/*
 * destroy callback: drop one reference on the group behind @cont.  The
 * group is freed by __mem_cgroup_put() once the count reaches zero.
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_put(mem_cgroup_from_cont(cont));
}
4961
4962static int mem_cgroup_populate(struct cgroup_subsys *ss,
4963 struct cgroup *cont)
4964{
4965 int ret;
4966
4967 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4968 ARRAY_SIZE(mem_cgroup_files));
4969
4970 if (!ret)
4971 ret = register_memsw_files(cont, ss);
4972 return ret;
4973}
4974
4975#ifdef CONFIG_MMU
4976
4977#define PRECHARGE_COUNT_AT_ONCE 256
4978static int mem_cgroup_do_precharge(unsigned long count)
4979{
4980 int ret = 0;
4981 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4982 struct mem_cgroup *mem = mc.to;
4983
4984 if (mem_cgroup_is_root(mem)) {
4985 mc.precharge += count;
4986
4987 return ret;
4988 }
4989
4990 if (count > 1) {
4991 struct res_counter *dummy;
4992
4993
4994
4995
4996
4997
4998 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4999 goto one_by_one;
5000 if (do_swap_account && res_counter_charge(&mem->memsw,
5001 PAGE_SIZE * count, &dummy)) {
5002 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
5003 goto one_by_one;
5004 }
5005 mc.precharge += count;
5006 return ret;
5007 }
5008one_by_one:
5009
5010 while (count--) {
5011 if (signal_pending(current)) {
5012 ret = -EINTR;
5013 break;
5014 }
5015 if (!batch_count--) {
5016 batch_count = PRECHARGE_COUNT_AT_ONCE;
5017 cond_resched();
5018 }
5019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
5020 if (ret || !mem)
5021
5022 return -ENOMEM;
5023 mc.precharge++;
5024 }
5025 return ret;
5026}
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
/*
 * Output of is_target_pte_for_mc(): which member is valid depends on the
 * mc_target_type that function returns.
 */
union mc_target {
	struct page	*page;	/* set when MC_TARGET_PAGE is returned */
	swp_entry_t	ent;	/* set when MC_TARGET_SWAP is returned */
};
5050
/* Classification of a pte as returned by is_target_pte_for_mc(). */
enum mc_target_type {
	MC_TARGET_NONE,	/* not a target; must stay 0, callers test truthiness */
	MC_TARGET_PAGE,	/* a page charged to mc.from */
	MC_TARGET_SWAP,	/* a swap entry recorded for mc.from */
};
5056
5057static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5058 unsigned long addr, pte_t ptent)
5059{
5060 struct page *page = vm_normal_page(vma, addr, ptent);
5061
5062 if (!page || !page_mapped(page))
5063 return NULL;
5064 if (PageAnon(page)) {
5065
5066 if (!move_anon() || page_mapcount(page) > 2)
5067 return NULL;
5068 } else if (!move_file())
5069
5070 return NULL;
5071 if (!get_page_unless_zero(page))
5072 return NULL;
5073
5074 return page;
5075}
5076
5077static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5078 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5079{
5080 int usage_count;
5081 struct page *page = NULL;
5082 swp_entry_t ent = pte_to_swp_entry(ptent);
5083
5084 if (!move_anon() || non_swap_entry(ent))
5085 return NULL;
5086 usage_count = mem_cgroup_count_swap_user(ent, &page);
5087 if (usage_count > 1) {
5088 if (page)
5089 put_page(page);
5090 return NULL;
5091 }
5092 if (do_swap_account)
5093 entry->val = ent.val;
5094
5095 return page;
5096}
5097
5098static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5099 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5100{
5101 struct page *page = NULL;
5102 struct inode *inode;
5103 struct address_space *mapping;
5104 pgoff_t pgoff;
5105
5106 if (!vma->vm_file)
5107 return NULL;
5108 if (!move_file())
5109 return NULL;
5110
5111 inode = vma->vm_file->f_path.dentry->d_inode;
5112 mapping = vma->vm_file->f_mapping;
5113 if (pte_none(ptent))
5114 pgoff = linear_page_index(vma, addr);
5115 else
5116 pgoff = pte_to_pgoff(ptent);
5117
5118
5119 page = find_get_page(mapping, pgoff);
5120
5121#ifdef CONFIG_SWAP
5122
5123 if (radix_tree_exceptional_entry(page)) {
5124 swp_entry_t swap = radix_to_swp_entry(page);
5125 if (do_swap_account)
5126 *entry = swap;
5127 page = find_get_page(&swapper_space, swap.val);
5128 }
5129#endif
5130 return page;
5131}
5132
5133static int is_target_pte_for_mc(struct vm_area_struct *vma,
5134 unsigned long addr, pte_t ptent, union mc_target *target)
5135{
5136 struct page *page = NULL;
5137 struct page_cgroup *pc;
5138 int ret = 0;
5139 swp_entry_t ent = { .val = 0 };
5140
5141 if (pte_present(ptent))
5142 page = mc_handle_present_pte(vma, addr, ptent);
5143 else if (is_swap_pte(ptent))
5144 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5145 else if (pte_none(ptent) || pte_file(ptent))
5146 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5147
5148 if (!page && !ent.val)
5149 return 0;
5150 if (page) {
5151 pc = lookup_page_cgroup(page);
5152
5153
5154
5155
5156
5157 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5158 ret = MC_TARGET_PAGE;
5159 if (target)
5160 target->page = page;
5161 }
5162 if (!ret || !target)
5163 put_page(page);
5164 }
5165
5166 if (ent.val && !ret &&
5167 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5168 ret = MC_TARGET_SWAP;
5169 if (target)
5170 target->ent = ent;
5171 }
5172 return ret;
5173}
5174
5175static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5176 unsigned long addr, unsigned long end,
5177 struct mm_walk *walk)
5178{
5179 struct vm_area_struct *vma = walk->private;
5180 pte_t *pte;
5181 spinlock_t *ptl;
5182
5183 split_huge_page_pmd(walk->mm, pmd);
5184
5185 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5186 for (; addr != end; pte++, addr += PAGE_SIZE)
5187 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5188 mc.precharge++;
5189 pte_unmap_unlock(pte - 1, ptl);
5190 cond_resched();
5191
5192 return 0;
5193}
5194
5195static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5196{
5197 unsigned long precharge;
5198 struct vm_area_struct *vma;
5199
5200 down_read(&mm->mmap_sem);
5201 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5202 struct mm_walk mem_cgroup_count_precharge_walk = {
5203 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5204 .mm = mm,
5205 .private = vma,
5206 };
5207 if (is_vm_hugetlb_page(vma))
5208 continue;
5209 walk_page_range(vma->vm_start, vma->vm_end,
5210 &mem_cgroup_count_precharge_walk);
5211 }
5212 up_read(&mm->mmap_sem);
5213
5214 precharge = mc.precharge;
5215 mc.precharge = 0;
5216
5217 return precharge;
5218}
5219
5220static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5221{
5222 unsigned long precharge = mem_cgroup_count_precharge(mm);
5223
5224 VM_BUG_ON(mc.moving_task);
5225 mc.moving_task = current;
5226 return mem_cgroup_do_precharge(precharge);
5227}
5228
5229
5230static void __mem_cgroup_clear_mc(void)
5231{
5232 struct mem_cgroup *from = mc.from;
5233 struct mem_cgroup *to = mc.to;
5234
5235
5236 if (mc.precharge) {
5237 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
5238 mc.precharge = 0;
5239 }
5240
5241
5242
5243
5244 if (mc.moved_charge) {
5245 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5246 mc.moved_charge = 0;
5247 }
5248
5249 if (mc.moved_swap) {
5250
5251 if (!mem_cgroup_is_root(mc.from))
5252 res_counter_uncharge(&mc.from->memsw,
5253 PAGE_SIZE * mc.moved_swap);
5254 __mem_cgroup_put(mc.from, mc.moved_swap);
5255
5256 if (!mem_cgroup_is_root(mc.to)) {
5257
5258
5259
5260
5261 res_counter_uncharge(&mc.to->res,
5262 PAGE_SIZE * mc.moved_swap);
5263 }
5264
5265 mc.moved_swap = 0;
5266 }
5267 memcg_oom_recover(from);
5268 memcg_oom_recover(to);
5269 wake_up_all(&mc.waitq);
5270}
5271
5272static void mem_cgroup_clear_mc(void)
5273{
5274 struct mem_cgroup *from = mc.from;
5275
5276
5277
5278
5279
5280 mc.moving_task = NULL;
5281 __mem_cgroup_clear_mc();
5282 spin_lock(&mc.lock);
5283 mc.from = NULL;
5284 mc.to = NULL;
5285 spin_unlock(&mc.lock);
5286 mem_cgroup_end_move(from);
5287}
5288
5289static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5290 struct cgroup *cgroup,
5291 struct task_struct *p)
5292{
5293 int ret = 0;
5294 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
5295
5296 if (mem->move_charge_at_immigrate) {
5297 struct mm_struct *mm;
5298 struct mem_cgroup *from = mem_cgroup_from_task(p);
5299
5300 VM_BUG_ON(from == mem);
5301
5302 mm = get_task_mm(p);
5303 if (!mm)
5304 return 0;
5305
5306 if (mm->owner == p) {
5307 VM_BUG_ON(mc.from);
5308 VM_BUG_ON(mc.to);
5309 VM_BUG_ON(mc.precharge);
5310 VM_BUG_ON(mc.moved_charge);
5311 VM_BUG_ON(mc.moved_swap);
5312 mem_cgroup_start_move(from);
5313 spin_lock(&mc.lock);
5314 mc.from = from;
5315 mc.to = mem;
5316 spin_unlock(&mc.lock);
5317
5318
5319 ret = mem_cgroup_precharge_mc(mm);
5320 if (ret)
5321 mem_cgroup_clear_mc();
5322 }
5323 mmput(mm);
5324 }
5325 return ret;
5326}
5327
/*
 * cancel_attach callback: the attach was aborted, so drop the move
 * context (and with it any precharge) set up by mem_cgroup_can_attach().
 */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	mem_cgroup_clear_mc();
}
5334
5335static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5336 unsigned long addr, unsigned long end,
5337 struct mm_walk *walk)
5338{
5339 int ret = 0;
5340 struct vm_area_struct *vma = walk->private;
5341 pte_t *pte;
5342 spinlock_t *ptl;
5343
5344 split_huge_page_pmd(walk->mm, pmd);
5345retry:
5346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5347 for (; addr != end; addr += PAGE_SIZE) {
5348 pte_t ptent = *(pte++);
5349 union mc_target target;
5350 int type;
5351 struct page *page;
5352 struct page_cgroup *pc;
5353 swp_entry_t ent;
5354
5355 if (!mc.precharge)
5356 break;
5357
5358 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5359 switch (type) {
5360 case MC_TARGET_PAGE:
5361 page = target.page;
5362 if (isolate_lru_page(page))
5363 goto put;
5364 pc = lookup_page_cgroup(page);
5365 if (!mem_cgroup_move_account(page, 1, pc,
5366 mc.from, mc.to, false)) {
5367 mc.precharge--;
5368
5369 mc.moved_charge++;
5370 }
5371 putback_lru_page(page);
5372put:
5373 put_page(page);
5374 break;
5375 case MC_TARGET_SWAP:
5376 ent = target.ent;
5377 if (!mem_cgroup_move_swap_account(ent,
5378 mc.from, mc.to, false)) {
5379 mc.precharge--;
5380
5381 mc.moved_swap++;
5382 }
5383 break;
5384 default:
5385 break;
5386 }
5387 }
5388 pte_unmap_unlock(pte - 1, ptl);
5389 cond_resched();
5390
5391 if (addr != end) {
5392
5393
5394
5395
5396
5397
5398 ret = mem_cgroup_do_precharge(1);
5399 if (!ret)
5400 goto retry;
5401 }
5402
5403 return ret;
5404}
5405
5406static void mem_cgroup_move_charge(struct mm_struct *mm)
5407{
5408 struct vm_area_struct *vma;
5409
5410 lru_add_drain_all();
5411retry:
5412 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5413
5414
5415
5416
5417
5418
5419
5420 __mem_cgroup_clear_mc();
5421 cond_resched();
5422 goto retry;
5423 }
5424 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5425 int ret;
5426 struct mm_walk mem_cgroup_move_charge_walk = {
5427 .pmd_entry = mem_cgroup_move_charge_pte_range,
5428 .mm = mm,
5429 .private = vma,
5430 };
5431 if (is_vm_hugetlb_page(vma))
5432 continue;
5433 ret = walk_page_range(vma->vm_start, vma->vm_end,
5434 &mem_cgroup_move_charge_walk);
5435 if (ret)
5436
5437
5438
5439
5440 break;
5441 }
5442 up_read(&mm->mmap_sem);
5443}
5444
5445static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5446 struct cgroup *cont,
5447 struct cgroup *old_cont,
5448 struct task_struct *p)
5449{
5450 struct mm_struct *mm = get_task_mm(p);
5451
5452 if (mm) {
5453 if (mc.to)
5454 mem_cgroup_move_charge(mm);
5455 put_swap_token(mm);
5456 mmput(mm);
5457 }
5458 if (mc.to)
5459 mem_cgroup_clear_mc();
5460}
5461#else
/*
 * !CONFIG_MMU variants of the attach callbacks: no charge moving is done
 * in this configuration, so they are empty.
 */

/* Always allows the attach. */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	return 0;
}
/* Nothing was prepared by can_attach, so there is nothing to cancel. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
}
/* No charges are moved when the task changes groups. */
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
}
5479#endif
5480
5481struct cgroup_subsys mem_cgroup_subsys = {
5482 .name = "memory",
5483 .subsys_id = mem_cgroup_subsys_id,
5484 .create = mem_cgroup_create,
5485 .pre_destroy = mem_cgroup_pre_destroy,
5486 .destroy = mem_cgroup_destroy,
5487 .populate = mem_cgroup_populate,
5488 .can_attach = mem_cgroup_can_attach,
5489 .cancel_attach = mem_cgroup_cancel_attach,
5490 .attach = mem_cgroup_move_task,
5491 .early_init = 0,
5492 .use_id = 1,
5493};
5494
5495#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5496static int __init enable_swap_account(char *s)
5497{
5498
5499 if (!strcmp(s, "1"))
5500 really_do_swap_account = 1;
5501 else if (!strcmp(s, "0"))
5502 really_do_swap_account = 0;
5503 return 1;
5504}
5505__setup("swapaccount=", enable_swap_account);
5506
5507#endif
5508