/*
 * memcontrol.c - memory resource controller for control groups (memcg)
 */
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/export.h>
37#include <linux/mutex.h>
38#include <linux/rbtree.h>
39#include <linux/slab.h>
40#include <linux/swap.h>
41#include <linux/swapops.h>
42#include <linux/spinlock.h>
43#include <linux/eventfd.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/seq_file.h>
47#include <linux/vmalloc.h>
48#include <linux/mm_inline.h>
49#include <linux/page_cgroup.h>
50#include <linux/cpu.h>
51#include <linux/oom.h>
52#include "internal.h"
53#include <net/sock.h>
54#include <net/tcp_memcontrol.h>
55
56#include <asm/uaccess.h>
57
58#include <trace/events/vmscan.h>
59
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Whether memory+swap accounting is active for this boot. */
int do_swap_account __read_mostly;

/* Boot-time default for memory+swap accounting. */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account (0)
#endif
78
79
80
81
82
/*
 * Per-memcg statistics, kept as per-cpu counters (see struct mem_cgroup_stat_cpu).
 */
enum mem_cgroup_stat_index {
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as page cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of file-mapped pages */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages swapped out */
	MEM_CGROUP_STAT_DATA,		/* end of data counters; see mem_cgroup_drain_pcp_counter() */
	MEM_CGROUP_ON_MOVE,		/* nonzero while charges are being moved */
	MEM_CGROUP_STAT_NSTATS,
};
95
96enum mem_cgroup_events_index {
97 MEM_CGROUP_EVENTS_PGPGIN,
98 MEM_CGROUP_EVENTS_PGPGOUT,
99 MEM_CGROUP_EVENTS_COUNT,
100 MEM_CGROUP_EVENTS_PGFAULT,
101 MEM_CGROUP_EVENTS_PGMAJFAULT,
102 MEM_CGROUP_EVENTS_NSTATS,
103};
104
/*
 * Ratelimit targets: every so many charge/uncharge events (per cpu),
 * re-check the thresholds, the soft limit tree and the NUMA info.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
#define NUMAINFO_EVENTS_TARGET (1024)
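/*
 * For example, with THRESHOLDS_EVENTS_TARGET == 128, mem_cgroup_threshold()
 * is re-run roughly once per 128 pages charged or uncharged on each cpu;
 * see mem_cgroup_event_ratelimit() and memcg_check_events() below.
 */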
119
120struct mem_cgroup_stat_cpu {
121 long count[MEM_CGROUP_STAT_NSTATS];
122 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
123 unsigned long targets[MEM_CGROUP_NTARGETS];
124};
125
126struct mem_cgroup_reclaim_iter {
127
128 int position;
129
130 unsigned int generation;
131};
132
133
134
135
136struct mem_cgroup_per_zone {
137 struct lruvec lruvec;
138 unsigned long count[NR_LRU_LISTS];
139
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141
142 struct zone_reclaim_stat reclaim_stat;
143 struct rb_node tree_node;
144 unsigned long long usage_in_excess;
145
146 bool on_tree;
147 struct mem_cgroup *mem;
148
149};
150
151#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
152
153struct mem_cgroup_per_node {
154 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
155};
156
157struct mem_cgroup_lru_info {
158 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
159};
160
161
162
163
164
165
166struct mem_cgroup_tree_per_zone {
167 struct rb_root rb_root;
168 spinlock_t lock;
169};
170
171struct mem_cgroup_tree_per_node {
172 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
173};
174
175struct mem_cgroup_tree {
176 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
177};
178
179static struct mem_cgroup_tree soft_limit_tree __read_mostly;
180
181struct mem_cgroup_threshold {
182 struct eventfd_ctx *eventfd;
183 u64 threshold;
184};
185
186
187struct mem_cgroup_threshold_ary {
188
189 int current_threshold;
190
191 unsigned int size;
192
193 struct mem_cgroup_threshold entries[0];
194};
195
196struct mem_cgroup_thresholds {
197
198 struct mem_cgroup_threshold_ary *primary;
199
200
201
202
203
204 struct mem_cgroup_threshold_ary *spare;
205};
206
207
208struct mem_cgroup_eventfd_list {
209 struct list_head list;
210 struct eventfd_ctx *eventfd;
211};
212
213static void mem_cgroup_threshold(struct mem_cgroup *memcg);
214static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
215
216
217
218
219
220
221
222
223
224
225
226
227struct mem_cgroup {
228 struct cgroup_subsys_state css;
229
230
231
232 struct res_counter res;
233
234 union {
235
236
237
238 struct res_counter memsw;
239
240
241
242
243
244
245
246
247
248
249 struct rcu_head rcu_freeing;
250
251
252
253
254 struct work_struct work_freeing;
255 };
256
257
258
259
260
261 struct mem_cgroup_lru_info info;
262 int last_scanned_node;
263#if MAX_NUMNODES > 1
264 nodemask_t scan_nodes;
265 atomic_t numainfo_events;
266 atomic_t numainfo_updating;
267#endif
268
269
270
271 bool use_hierarchy;
272
273 bool oom_lock;
274 atomic_t under_oom;
275
276 atomic_t refcnt;
277
278 int swappiness;
279
280 int oom_kill_disable;
281
282
283 bool memsw_is_minimum;
284
285
286 struct mutex thresholds_lock;
287
288
289 struct mem_cgroup_thresholds thresholds;
290
291
292 struct mem_cgroup_thresholds memsw_thresholds;
293
294
295 struct list_head oom_notify;
296
297
298
299
300
301 unsigned long move_charge_at_immigrate;
302
303
304
305 struct mem_cgroup_stat_cpu *stat;
306
307
308
309
310 struct mem_cgroup_stat_cpu nocpu_base;
311 spinlock_t pcp_counter_lock;
312
313#ifdef CONFIG_INET
314 struct tcp_memcontrol tcp_mem;
315#endif
316};
317
318
319
320
321
322
323enum move_type {
324 MOVE_CHARGE_TYPE_ANON,
325 MOVE_CHARGE_TYPE_FILE,
326 NR_MOVE_TYPE,
327};
328
329
330static struct move_charge_struct {
331 spinlock_t lock;
332 struct mem_cgroup *from;
333 struct mem_cgroup *to;
334 unsigned long precharge;
335 unsigned long moved_charge;
336 unsigned long moved_swap;
337 struct task_struct *moving_task;
338 wait_queue_head_t waitq;
339} mc = {
340 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
341 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
342};
343
344static bool move_anon(void)
345{
346 return test_bit(MOVE_CHARGE_TYPE_ANON,
347 &mc.to->move_charge_at_immigrate);
348}
349
350static bool move_file(void)
351{
352 return test_bit(MOVE_CHARGE_TYPE_FILE,
353 &mc.to->move_charge_at_immigrate);
354}
355
356
357
358
359
360#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
361#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
362
363enum charge_type {
364 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
365 MEM_CGROUP_CHARGE_TYPE_MAPPED,
366 MEM_CGROUP_CHARGE_TYPE_SHMEM,
367 MEM_CGROUP_CHARGE_TYPE_FORCE,
368 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
369 MEM_CGROUP_CHARGE_TYPE_DROP,
370 NR_CHARGE_TYPE,
371};
372
/* Packed value for memcg control files: type in high 16 bits, attribute in low 16 bits. */
#define _MEM (0)
#define _MEMSWAP (1)
#define _OOM_TYPE (2)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)

/* Attribute value for the OOM control file. */
#define OOM_CONTROL (0)

/*
 * Reclaim flags: whether reclaim may touch swap, and whether it runs on
 * behalf of a limit shrink (see mem_cgroup_reclaim()).
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
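/*
 * Worked example (illustration only): MEMFILE_PRIVATE(_MEMSWAP, 3) packs to
 * (1 << 16) | 3 == 0x10003; MEMFILE_TYPE(0x10003) recovers _MEMSWAP and
 * MEMFILE_ATTR(0x10003) recovers 3.
 */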
390
391static void mem_cgroup_get(struct mem_cgroup *memcg);
392static void mem_cgroup_put(struct mem_cgroup *memcg);
393
394
395#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
396#include <net/sock.h>
397#include <net/ip.h>
398
399static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
400void sock_update_memcg(struct sock *sk)
401{
402 if (mem_cgroup_sockets_enabled) {
403 struct mem_cgroup *memcg;
404
405 BUG_ON(!sk->sk_prot->proto_cgroup);
406
407
408
409
410
411
412
413
414
415 if (sk->sk_cgrp) {
416 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
417 mem_cgroup_get(sk->sk_cgrp->memcg);
418 return;
419 }
420
421 rcu_read_lock();
422 memcg = mem_cgroup_from_task(current);
423 if (!mem_cgroup_is_root(memcg)) {
424 mem_cgroup_get(memcg);
425 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
426 }
427 rcu_read_unlock();
428 }
429}
430EXPORT_SYMBOL(sock_update_memcg);
431
432void sock_release_memcg(struct sock *sk)
433{
434 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
435 struct mem_cgroup *memcg;
436 WARN_ON(!sk->sk_cgrp->memcg);
437 memcg = sk->sk_cgrp->memcg;
438 mem_cgroup_put(memcg);
439 }
440}
441
442#ifdef CONFIG_INET
443struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
444{
445 if (!memcg || mem_cgroup_is_root(memcg))
446 return NULL;
447
448 return &memcg->tcp_mem.cg_proto;
449}
450EXPORT_SYMBOL(tcp_proto_cgroup);
451#endif
452#endif
453
454static void drain_all_stock_async(struct mem_cgroup *memcg);
455
456static struct mem_cgroup_per_zone *
457mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
458{
459 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
460}
461
462struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
463{
464 return &memcg->css;
465}
466
467static struct mem_cgroup_per_zone *
468page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
469{
470 int nid = page_to_nid(page);
471 int zid = page_zonenum(page);
472
473 return mem_cgroup_zoneinfo(memcg, nid, zid);
474}
475
476static struct mem_cgroup_tree_per_zone *
477soft_limit_tree_node_zone(int nid, int zid)
478{
479 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
480}
481
482static struct mem_cgroup_tree_per_zone *
483soft_limit_tree_from_page(struct page *page)
484{
485 int nid = page_to_nid(page);
486 int zid = page_zonenum(page);
487
488 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
489}
490
491static void
492__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
493 struct mem_cgroup_per_zone *mz,
494 struct mem_cgroup_tree_per_zone *mctz,
495 unsigned long long new_usage_in_excess)
496{
497 struct rb_node **p = &mctz->rb_root.rb_node;
498 struct rb_node *parent = NULL;
499 struct mem_cgroup_per_zone *mz_node;
500
501 if (mz->on_tree)
502 return;
503
504 mz->usage_in_excess = new_usage_in_excess;
505 if (!mz->usage_in_excess)
506 return;
507 while (*p) {
508 parent = *p;
509 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
510 tree_node);
511 if (mz->usage_in_excess < mz_node->usage_in_excess)
512 p = &(*p)->rb_left;
513
514
515
516
517 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
518 p = &(*p)->rb_right;
519 }
520 rb_link_node(&mz->tree_node, parent, p);
521 rb_insert_color(&mz->tree_node, &mctz->rb_root);
522 mz->on_tree = true;
523}
524
525static void
526__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
527 struct mem_cgroup_per_zone *mz,
528 struct mem_cgroup_tree_per_zone *mctz)
529{
530 if (!mz->on_tree)
531 return;
532 rb_erase(&mz->tree_node, &mctz->rb_root);
533 mz->on_tree = false;
534}
535
536static void
537mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
538 struct mem_cgroup_per_zone *mz,
539 struct mem_cgroup_tree_per_zone *mctz)
540{
541 spin_lock(&mctz->lock);
542 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
543 spin_unlock(&mctz->lock);
544}
545
546
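/*
 * (Re)position @memcg and each of its ancestors in the per-zone soft limit
 * tree according to how far their usage currently exceeds the soft limit.
 */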
547static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
548{
549 unsigned long long excess;
550 struct mem_cgroup_per_zone *mz;
551 struct mem_cgroup_tree_per_zone *mctz;
552 int nid = page_to_nid(page);
553 int zid = page_zonenum(page);
554 mctz = soft_limit_tree_from_page(page);
555
556
557
558
559
560 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
561 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
562 excess = res_counter_soft_limit_excess(&memcg->res);
563
564
565
566
567 if (excess || mz->on_tree) {
568 spin_lock(&mctz->lock);
569
570 if (mz->on_tree)
571 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
572
573
574
575
576 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
577 spin_unlock(&mctz->lock);
578 }
579 }
580}
581
582static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
583{
584 int node, zone;
585 struct mem_cgroup_per_zone *mz;
586 struct mem_cgroup_tree_per_zone *mctz;
587
588 for_each_node(node) {
589 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
590 mz = mem_cgroup_zoneinfo(memcg, node, zone);
591 mctz = soft_limit_tree_node_zone(node, zone);
592 mem_cgroup_remove_exceeded(memcg, mz, mctz);
593 }
594 }
595}
596
597static struct mem_cgroup_per_zone *
598__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
599{
600 struct rb_node *rightmost = NULL;
601 struct mem_cgroup_per_zone *mz;
602
603retry:
604 mz = NULL;
605 rightmost = rb_last(&mctz->rb_root);
606 if (!rightmost)
607 goto done;
608
609 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
610
611
612
613
614
615 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
616 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
617 !css_tryget(&mz->mem->css))
618 goto retry;
619done:
620 return mz;
621}
622
623static struct mem_cgroup_per_zone *
624mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
625{
626 struct mem_cgroup_per_zone *mz;
627
628 spin_lock(&mctz->lock);
629 mz = __mem_cgroup_largest_soft_limit_node(mctz);
630 spin_unlock(&mctz->lock);
631 return mz;
632}
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
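/*
 * Sum a per-cpu statistics counter over all online cpus; counters parked
 * in nocpu_base by the cpu-hotplug callback are folded back in as well.
 */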
653static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
654 enum mem_cgroup_stat_index idx)
655{
656 long val = 0;
657 int cpu;
658
659 get_online_cpus();
660 for_each_online_cpu(cpu)
661 val += per_cpu(memcg->stat->count[idx], cpu);
662#ifdef CONFIG_HOTPLUG_CPU
663 spin_lock(&memcg->pcp_counter_lock);
664 val += memcg->nocpu_base.count[idx];
665 spin_unlock(&memcg->pcp_counter_lock);
666#endif
667 put_online_cpus();
668 return val;
669}
670
671static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
672 bool charge)
673{
674 int val = (charge) ? 1 : -1;
675 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
676}
677
678static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
679 enum mem_cgroup_events_index idx)
680{
681 unsigned long val = 0;
682 int cpu;
683
684 for_each_online_cpu(cpu)
685 val += per_cpu(memcg->stat->events[idx], cpu);
686#ifdef CONFIG_HOTPLUG_CPU
687 spin_lock(&memcg->pcp_counter_lock);
688 val += memcg->nocpu_base.events[idx];
689 spin_unlock(&memcg->pcp_counter_lock);
690#endif
691 return val;
692}
693
694static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
695 bool file, int nr_pages)
696{
697 preempt_disable();
698
699 if (file)
700 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
701 nr_pages);
702 else
703 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
704 nr_pages);
705
706
707 if (nr_pages > 0)
708 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
709 else {
710 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
711 nr_pages = -nr_pages;
712 }
713
714 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
715
716 preempt_enable();
717}
718
719unsigned long
720mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
721 unsigned int lru_mask)
722{
723 struct mem_cgroup_per_zone *mz;
724 enum lru_list l;
725 unsigned long ret = 0;
726
727 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
728
729 for_each_lru(l) {
730 if (BIT(l) & lru_mask)
731 ret += MEM_CGROUP_ZSTAT(mz, l);
732 }
733 return ret;
734}
735
736static unsigned long
737mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
738 int nid, unsigned int lru_mask)
739{
740 u64 total = 0;
741 int zid;
742
743 for (zid = 0; zid < MAX_NR_ZONES; zid++)
744 total += mem_cgroup_zone_nr_lru_pages(memcg,
745 nid, zid, lru_mask);
746
747 return total;
748}
749
750static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
751 unsigned int lru_mask)
752{
753 int nid;
754 u64 total = 0;
755
756 for_each_node_state(nid, N_HIGH_MEMORY)
757 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
758 return total;
759}
760
761static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
762 enum mem_cgroup_events_target target)
763{
764 unsigned long val, next;
765
766 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
767 next = __this_cpu_read(memcg->stat->targets[target]);
768
769 if ((long)next - (long)val < 0) {
770 switch (target) {
771 case MEM_CGROUP_TARGET_THRESH:
772 next = val + THRESHOLDS_EVENTS_TARGET;
773 break;
774 case MEM_CGROUP_TARGET_SOFTLIMIT:
775 next = val + SOFTLIMIT_EVENTS_TARGET;
776 break;
777 case MEM_CGROUP_TARGET_NUMAINFO:
778 next = val + NUMAINFO_EVENTS_TARGET;
779 break;
780 default:
781 break;
782 }
783 __this_cpu_write(memcg->stat->targets[target], next);
784 return true;
785 }
786 return false;
787}
788
789
790
791
792
793static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
794{
795 preempt_disable();
796
797 if (unlikely(mem_cgroup_event_ratelimit(memcg,
798 MEM_CGROUP_TARGET_THRESH))) {
799 bool do_softlimit;
800 bool do_numainfo __maybe_unused;
801
802 do_softlimit = mem_cgroup_event_ratelimit(memcg,
803 MEM_CGROUP_TARGET_SOFTLIMIT);
804#if MAX_NUMNODES > 1
805 do_numainfo = mem_cgroup_event_ratelimit(memcg,
806 MEM_CGROUP_TARGET_NUMAINFO);
807#endif
808 preempt_enable();
809
810 mem_cgroup_threshold(memcg);
811 if (unlikely(do_softlimit))
812 mem_cgroup_update_tree(memcg, page);
813#if MAX_NUMNODES > 1
814 if (unlikely(do_numainfo))
815 atomic_inc(&memcg->numainfo_events);
816#endif
817 } else
818 preempt_enable();
819}
820
821struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
822{
823 return container_of(cgroup_subsys_state(cont,
824 mem_cgroup_subsys_id), struct mem_cgroup,
825 css);
826}
827
828struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
829{
830
831
832
833
834
835 if (unlikely(!p))
836 return NULL;
837
838 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
839 struct mem_cgroup, css);
840}
841
842struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
843{
844 struct mem_cgroup *memcg = NULL;
845
846 if (!mm)
847 return NULL;
848
849
850
851
852
853 rcu_read_lock();
854 do {
855 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
856 if (unlikely(!memcg))
857 break;
858 } while (!css_tryget(&memcg->css));
859 rcu_read_unlock();
860 return memcg;
861}
862
/*
 * mem_cgroup_iter - iterate over the memory cgroup hierarchy
 * @root: hierarchy root (NULL means root_mem_cgroup)
 * @prev: memcg returned by the previous call, or NULL on the first call
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns the next memcg in the hierarchy below @root (including @root
 * itself), or NULL when a full round-trip has completed.  A reference is
 * held on the returned memcg; it is dropped when the memcg is passed back
 * in as @prev, or by mem_cgroup_iter_break() if the walk is aborted early.
 * Reclaimers share per-zone, per-priority iterator state through @reclaim
 * so that concurrent walks divide the hierarchy among themselves.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
884 struct mem_cgroup *memcg = NULL;
885 int id = 0;
886
887 if (mem_cgroup_disabled())
888 return NULL;
889
890 if (!root)
891 root = root_mem_cgroup;
892
893 if (prev && !reclaim)
894 id = css_id(&prev->css);
895
896 if (prev && prev != root)
897 css_put(&prev->css);
898
899 if (!root->use_hierarchy && root != root_mem_cgroup) {
900 if (prev)
901 return NULL;
902 return root;
903 }
904
905 while (!memcg) {
906 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
907 struct cgroup_subsys_state *css;
908
909 if (reclaim) {
910 int nid = zone_to_nid(reclaim->zone);
911 int zid = zone_idx(reclaim->zone);
912 struct mem_cgroup_per_zone *mz;
913
914 mz = mem_cgroup_zoneinfo(root, nid, zid);
915 iter = &mz->reclaim_iter[reclaim->priority];
916 if (prev && reclaim->generation != iter->generation)
917 return NULL;
918 id = iter->position;
919 }
920
921 rcu_read_lock();
922 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
923 if (css) {
924 if (css == &root->css || css_tryget(css))
925 memcg = container_of(css,
926 struct mem_cgroup, css);
927 } else
928 id = 0;
929 rcu_read_unlock();
930
931 if (reclaim) {
932 iter->position = id;
933 if (!css)
934 iter->generation++;
935 else if (!prev && memcg)
936 reclaim->generation = iter->generation;
937 }
938
939 if (prev && !css)
940 return NULL;
941 }
942 return memcg;
943}
944
945
946
947
948
949
950void mem_cgroup_iter_break(struct mem_cgroup *root,
951 struct mem_cgroup *prev)
952{
953 if (!root)
954 root = root_mem_cgroup;
955 if (prev && prev != root)
956 css_put(&prev->css);
957}
958
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If a loop
 * is exited early, mem_cgroup_iter_break() must be used to drop the
 * reference held on the last visited memcg.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
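/*
 * Example usage (a sketch; some_condition() is a hypothetical stand-in),
 * in the same spirit as mem_cgroup_count_children() below:
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 */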
973
974static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
975{
976 return (memcg == root_mem_cgroup);
977}
978
979void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
980{
981 struct mem_cgroup *memcg;
982
983 if (!mm)
984 return;
985
986 rcu_read_lock();
987 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
988 if (unlikely(!memcg))
989 goto out;
990
991 switch (idx) {
992 case PGFAULT:
993 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
994 break;
995 case PGMAJFAULT:
996 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
997 break;
998 default:
999 BUG();
1000 }
1001out:
1002 rcu_read_unlock();
1003}
1004EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1016 struct mem_cgroup *memcg)
1017{
1018 struct mem_cgroup_per_zone *mz;
1019
1020 if (mem_cgroup_disabled())
1021 return &zone->lruvec;
1022
1023 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1024 return &mz->lruvec;
1025}
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1054 enum lru_list lru)
1055{
1056 struct mem_cgroup_per_zone *mz;
1057 struct mem_cgroup *memcg;
1058 struct page_cgroup *pc;
1059
1060 if (mem_cgroup_disabled())
1061 return &zone->lruvec;
1062
1063 pc = lookup_page_cgroup(page);
1064 memcg = pc->mem_cgroup;
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1076 pc->mem_cgroup = memcg = root_mem_cgroup;
1077
1078 mz = page_cgroup_zoneinfo(memcg, page);
1079
1080 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1081 return &mz->lruvec;
1082}
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1095{
1096 struct mem_cgroup_per_zone *mz;
1097 struct mem_cgroup *memcg;
1098 struct page_cgroup *pc;
1099
1100 if (mem_cgroup_disabled())
1101 return;
1102
1103 pc = lookup_page_cgroup(page);
1104 memcg = pc->mem_cgroup;
1105 VM_BUG_ON(!memcg);
1106 mz = page_cgroup_zoneinfo(memcg, page);
1107
1108 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
1109 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
1110}
1111
1112void mem_cgroup_lru_del(struct page *page)
1113{
1114 mem_cgroup_lru_del_list(page, page_lru(page));
1115}
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1132 struct page *page,
1133 enum lru_list from,
1134 enum lru_list to)
1135{
1136
1137 mem_cgroup_lru_del_list(page, from);
1138 return mem_cgroup_lru_add_list(zone, page, to);
1139}
1140
1141
1142
1143
1144
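/*
 * Returns true if @memcg is @root_memcg itself or, when hierarchy is
 * enabled on @root_memcg, one of its descendants.
 */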
1145static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1146 struct mem_cgroup *memcg)
1147{
1148 if (root_memcg != memcg) {
1149 return (root_memcg->use_hierarchy &&
1150 css_is_ancestor(&memcg->css, &root_memcg->css));
1151 }
1152
1153 return true;
1154}
1155
1156int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1157{
1158 int ret;
1159 struct mem_cgroup *curr = NULL;
1160 struct task_struct *p;
1161
1162 p = find_lock_task_mm(task);
1163 if (p) {
1164 curr = try_get_mem_cgroup_from_mm(p->mm);
1165 task_unlock(p);
1166 } else {
1167
1168
1169
1170
1171
1172 task_lock(task);
1173 curr = mem_cgroup_from_task(task);
1174 if (curr)
1175 css_get(&curr->css);
1176 task_unlock(task);
1177 }
1178 if (!curr)
1179 return 0;
1180
1181
1182
1183
1184
1185
1186 ret = mem_cgroup_same_or_subtree(memcg, curr);
1187 css_put(&curr->css);
1188 return ret;
1189}
1190
1191int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1192{
1193 unsigned long inactive_ratio;
1194 int nid = zone_to_nid(zone);
1195 int zid = zone_idx(zone);
1196 unsigned long inactive;
1197 unsigned long active;
1198 unsigned long gb;
1199
1200 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1201 BIT(LRU_INACTIVE_ANON));
1202 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1203 BIT(LRU_ACTIVE_ANON));
1204
1205 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1206 if (gb)
1207 inactive_ratio = int_sqrt(10 * gb);
1208 else
1209 inactive_ratio = 1;
1210
1211 return inactive * inactive_ratio < active;
1212}
1213
1214int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1215{
1216 unsigned long active;
1217 unsigned long inactive;
1218 int zid = zone_idx(zone);
1219 int nid = zone_to_nid(zone);
1220
1221 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1222 BIT(LRU_INACTIVE_FILE));
1223 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1224 BIT(LRU_ACTIVE_FILE));
1225
1226 return (active > inactive);
1227}
1228
1229struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1230 struct zone *zone)
1231{
1232 int nid = zone_to_nid(zone);
1233 int zid = zone_idx(zone);
1234 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1235
1236 return &mz->reclaim_stat;
1237}
1238
1239struct zone_reclaim_stat *
1240mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1241{
1242 struct page_cgroup *pc;
1243 struct mem_cgroup_per_zone *mz;
1244
1245 if (mem_cgroup_disabled())
1246 return NULL;
1247
1248 pc = lookup_page_cgroup(page);
1249 if (!PageCgroupUsed(pc))
1250 return NULL;
1251
1252 smp_rmb();
1253 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1254 return &mz->reclaim_stat;
1255}
1256
1257#define mem_cgroup_from_res_counter(counter, member) \
1258 container_of(counter, struct mem_cgroup, member)
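/*
 * Example: on a failed charge, mem_cgroup_from_res_counter(fail_res, memsw)
 * maps the failing res_counter back to its owning memcg via container_of();
 * see mem_cgroup_do_charge().
 */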
1259
1260
1261
1262
1263
1264
1265
1266
1267static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1268{
1269 unsigned long long margin;
1270
1271 margin = res_counter_margin(&memcg->res);
1272 if (do_swap_account)
1273 margin = min(margin, res_counter_margin(&memcg->memsw));
1274 return margin >> PAGE_SHIFT;
1275}
1276
1277int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1278{
1279 struct cgroup *cgrp = memcg->css.cgroup;
1280
1281
1282 if (cgrp->parent == NULL)
1283 return vm_swappiness;
1284
1285 return memcg->swappiness;
1286}
1287
1288static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1289{
1290 int cpu;
1291
1292 get_online_cpus();
1293 spin_lock(&memcg->pcp_counter_lock);
1294 for_each_online_cpu(cpu)
1295 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1296 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1297 spin_unlock(&memcg->pcp_counter_lock);
1298 put_online_cpus();
1299
1300 synchronize_rcu();
1301}
1302
1303static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1304{
1305 int cpu;
1306
1307 if (!memcg)
1308 return;
1309 get_online_cpus();
1310 spin_lock(&memcg->pcp_counter_lock);
1311 for_each_online_cpu(cpu)
1312 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1313 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1314 spin_unlock(&memcg->pcp_counter_lock);
1315 put_online_cpus();
1316}
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1330{
1331 VM_BUG_ON(!rcu_read_lock_held());
1332 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1333}
1334
1335static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1336{
1337 struct mem_cgroup *from;
1338 struct mem_cgroup *to;
1339 bool ret = false;
1340
1341
1342
1343
1344 spin_lock(&mc.lock);
1345 from = mc.from;
1346 to = mc.to;
1347 if (!from)
1348 goto unlock;
1349
1350 ret = mem_cgroup_same_or_subtree(memcg, from)
1351 || mem_cgroup_same_or_subtree(memcg, to);
1352unlock:
1353 spin_unlock(&mc.lock);
1354 return ret;
1355}
1356
1357static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1358{
1359 if (mc.moving_task && current != mc.moving_task) {
1360 if (mem_cgroup_under_move(memcg)) {
1361 DEFINE_WAIT(wait);
1362 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1363
1364 if (mc.moving_task)
1365 schedule();
1366 finish_wait(&mc.waitq, &wait);
1367 return true;
1368 }
1369 }
1370 return false;
1371}
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1382{
1383 struct cgroup *task_cgrp;
1384 struct cgroup *mem_cgrp;
1385
1386
1387
1388
1389
1390 static char memcg_name[PATH_MAX];
1391 int ret;
1392
1393 if (!memcg || !p)
1394 return;
1395
1396
1397 rcu_read_lock();
1398
1399 mem_cgrp = memcg->css.cgroup;
1400 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1401
1402 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1403 if (ret < 0) {
1404
1405
1406
1407
1408 rcu_read_unlock();
1409 goto done;
1410 }
1411 rcu_read_unlock();
1412
1413 printk(KERN_INFO "Task in %s killed", memcg_name);
1414
1415 rcu_read_lock();
1416 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1417 if (ret < 0) {
1418 rcu_read_unlock();
1419 goto done;
1420 }
1421 rcu_read_unlock();
1422
1423
1424
1425
1426 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1427done:
1428
1429 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1430 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1431 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1432 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1433 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1434 "failcnt %llu\n",
1435 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1436 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1437 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1438}
1439
1440
1441
1442
1443
1444static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1445{
1446 int num = 0;
1447 struct mem_cgroup *iter;
1448
1449 for_each_mem_cgroup_tree(iter, memcg)
1450 num++;
1451 return num;
1452}
1453
1454
1455
1456
1457u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{
1459 u64 limit;
1460 u64 memsw;
1461
1462 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1463 limit += total_swap_pages << PAGE_SHIFT;
1464
1465 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1466
1467
1468
1469
1470 return min(limit, memsw);
1471}
1472
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask,
1475 unsigned long flags)
1476{
1477 unsigned long total = 0;
1478 bool noswap = false;
1479 int loop;
1480
1481 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1482 noswap = true;
1483 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1484 noswap = true;
1485
1486 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1487 if (loop)
1488 drain_all_stock_async(memcg);
1489 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1490
1491
1492
1493
1494
1495 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1496 break;
1497 if (mem_cgroup_margin(memcg))
1498 break;
1499
1500
1501
1502
1503 if (loop && !total)
1504 break;
1505 }
1506 return total;
1507}
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
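/*
 * Does @memcg have reclaimable pages on @nid?  File pages are always
 * considered reclaimable; anonymous pages only count when swap is
 * available and @noswap is not set.
 */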
1519static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1520 int nid, bool noswap)
1521{
1522 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1523 return true;
1524 if (noswap || !total_swap_pages)
1525 return false;
1526 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1527 return true;
1528 return false;
1529
1530}
1531#if MAX_NUMNODES > 1
1532
1533
1534
1535
1536
1537
1538
1539static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1540{
1541 int nid;
1542
1543
1544
1545
1546 if (!atomic_read(&memcg->numainfo_events))
1547 return;
1548 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1549 return;
1550
1551
1552 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1553
1554 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1555
1556 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1557 node_clear(nid, memcg->scan_nodes);
1558 }
1559
1560 atomic_set(&memcg->numainfo_events, 0);
1561 atomic_set(&memcg->numainfo_updating, 0);
1562}
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1577{
1578 int node;
1579
1580 mem_cgroup_may_update_nodemask(memcg);
1581 node = memcg->last_scanned_node;
1582
1583 node = next_node(node, memcg->scan_nodes);
1584 if (node == MAX_NUMNODES)
1585 node = first_node(memcg->scan_nodes);
1586
1587
1588
1589
1590
1591
1592 if (unlikely(node == MAX_NUMNODES))
1593 node = numa_node_id();
1594
1595 memcg->last_scanned_node = node;
1596 return node;
1597}
1598
1599
1600
1601
1602
1603
1604
1605bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1606{
1607 int nid;
1608
1609
1610
1611
1612
1613 if (!nodes_empty(memcg->scan_nodes)) {
1614 for (nid = first_node(memcg->scan_nodes);
1615 nid < MAX_NUMNODES;
1616 nid = next_node(nid, memcg->scan_nodes)) {
1617
1618 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1619 return true;
1620 }
1621 }
1622
1623
1624
1625 for_each_node_state(nid, N_HIGH_MEMORY) {
1626 if (node_isset(nid, memcg->scan_nodes))
1627 continue;
1628 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1629 return true;
1630 }
1631 return false;
1632}
1633
1634#else
1635int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1636{
1637 return 0;
1638}
1639
1640bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1641{
1642 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1643}
1644#endif
1645
1646static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1647 struct zone *zone,
1648 gfp_t gfp_mask,
1649 unsigned long *total_scanned)
1650{
1651 struct mem_cgroup *victim = NULL;
1652 int total = 0;
1653 int loop = 0;
1654 unsigned long excess;
1655 unsigned long nr_scanned;
1656 struct mem_cgroup_reclaim_cookie reclaim = {
1657 .zone = zone,
1658 .priority = 0,
1659 };
1660
1661 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1662
1663 while (1) {
1664 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1665 if (!victim) {
1666 loop++;
1667 if (loop >= 2) {
1668
1669
1670
1671
1672
1673 if (!total)
1674 break;
1675
1676
1677
1678
1679
1680
1681 if (total >= (excess >> 2) ||
1682 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1683 break;
1684 }
1685 continue;
1686 }
1687 if (!mem_cgroup_reclaimable(victim, false))
1688 continue;
1689 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1690 zone, &nr_scanned);
1691 *total_scanned += nr_scanned;
1692 if (!res_counter_soft_limit_excess(&root_memcg->res))
1693 break;
1694 }
1695 mem_cgroup_iter_break(root_memcg, victim);
1696 return total;
1697}
1698
1699
1700
1701
1702
1703
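/*
 * Try to take the OOM lock for the whole hierarchy under @memcg.  On
 * failure (some member already holds it), any locks taken so far are
 * rolled back and false is returned.  Called under memcg_oom_lock.
 */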
1704static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1705{
1706 struct mem_cgroup *iter, *failed = NULL;
1707
1708 for_each_mem_cgroup_tree(iter, memcg) {
1709 if (iter->oom_lock) {
1710
1711
1712
1713
1714 failed = iter;
1715 mem_cgroup_iter_break(memcg, iter);
1716 break;
1717 } else
1718 iter->oom_lock = true;
1719 }
1720
1721 if (!failed)
1722 return true;
1723
1724
1725
1726
1727
1728 for_each_mem_cgroup_tree(iter, memcg) {
1729 if (iter == failed) {
1730 mem_cgroup_iter_break(memcg, iter);
1731 break;
1732 }
1733 iter->oom_lock = false;
1734 }
1735 return false;
1736}
1737
1738
1739
1740
1741static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1742{
1743 struct mem_cgroup *iter;
1744
1745 for_each_mem_cgroup_tree(iter, memcg)
1746 iter->oom_lock = false;
1747 return 0;
1748}
1749
1750static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1751{
1752 struct mem_cgroup *iter;
1753
1754 for_each_mem_cgroup_tree(iter, memcg)
1755 atomic_inc(&iter->under_oom);
1756}
1757
1758static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1759{
1760 struct mem_cgroup *iter;
1761
1762
1763
1764
1765
1766
1767 for_each_mem_cgroup_tree(iter, memcg)
1768 atomic_add_unless(&iter->under_oom, -1, 0);
1769}
1770
1771static DEFINE_SPINLOCK(memcg_oom_lock);
1772static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1773
1774struct oom_wait_info {
1775 struct mem_cgroup *mem;
1776 wait_queue_t wait;
1777};
1778
1779static int memcg_oom_wake_function(wait_queue_t *wait,
1780 unsigned mode, int sync, void *arg)
1781{
1782 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1783 *oom_wait_memcg;
1784 struct oom_wait_info *oom_wait_info;
1785
1786 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1787 oom_wait_memcg = oom_wait_info->mem;
1788
1789
1790
1791
1792
1793 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1794 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1795 return 0;
1796 return autoremove_wake_function(wait, mode, sync, arg);
1797}
1798
1799static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1800{
1801
1802 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1803}
1804
1805static void memcg_oom_recover(struct mem_cgroup *memcg)
1806{
1807 if (memcg && atomic_read(&memcg->under_oom))
1808 memcg_wakeup_oom(memcg);
1809}
1810
1811
1812
1813
1814bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1815{
1816 struct oom_wait_info owait;
1817 bool locked, need_to_kill;
1818
1819 owait.mem = memcg;
1820 owait.wait.flags = 0;
1821 owait.wait.func = memcg_oom_wake_function;
1822 owait.wait.private = current;
1823 INIT_LIST_HEAD(&owait.wait.task_list);
1824 need_to_kill = true;
1825 mem_cgroup_mark_under_oom(memcg);
1826
1827
1828 spin_lock(&memcg_oom_lock);
1829 locked = mem_cgroup_oom_lock(memcg);
1830
1831
1832
1833
1834
1835 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1836 if (!locked || memcg->oom_kill_disable)
1837 need_to_kill = false;
1838 if (locked)
1839 mem_cgroup_oom_notify(memcg);
1840 spin_unlock(&memcg_oom_lock);
1841
1842 if (need_to_kill) {
1843 finish_wait(&memcg_oom_waitq, &owait.wait);
1844 mem_cgroup_out_of_memory(memcg, mask);
1845 } else {
1846 schedule();
1847 finish_wait(&memcg_oom_waitq, &owait.wait);
1848 }
1849 spin_lock(&memcg_oom_lock);
1850 if (locked)
1851 mem_cgroup_oom_unlock(memcg);
1852 memcg_wakeup_oom(memcg);
1853 spin_unlock(&memcg_oom_lock);
1854
1855 mem_cgroup_unmark_under_oom(memcg);
1856
1857 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1858 return false;
1859
1860 schedule_timeout_uninterruptible(1);
1861 return true;
1862}
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888void mem_cgroup_update_page_stat(struct page *page,
1889 enum mem_cgroup_page_stat_item idx, int val)
1890{
1891 struct mem_cgroup *memcg;
1892 struct page_cgroup *pc = lookup_page_cgroup(page);
1893 bool need_unlock = false;
1894 unsigned long uninitialized_var(flags);
1895
1896 if (mem_cgroup_disabled())
1897 return;
1898
1899 rcu_read_lock();
1900 memcg = pc->mem_cgroup;
1901 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1902 goto out;
1903
1904 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1905
1906 move_lock_page_cgroup(pc, &flags);
1907 need_unlock = true;
1908 memcg = pc->mem_cgroup;
1909 if (!memcg || !PageCgroupUsed(pc))
1910 goto out;
1911 }
1912
1913 switch (idx) {
1914 case MEMCG_NR_FILE_MAPPED:
1915 if (val > 0)
1916 SetPageCgroupFileMapped(pc);
1917 else if (!page_mapped(page))
1918 ClearPageCgroupFileMapped(pc);
1919 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1920 break;
1921 default:
1922 BUG();
1923 }
1924
1925 this_cpu_add(memcg->stat->count[idx], val);
1926
1927out:
1928 if (unlikely(need_unlock))
1929 move_unlock_page_cgroup(pc, &flags);
1930 rcu_read_unlock();
1931 return;
1932}
1933EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1934
/*
 * Per-cpu charge stock: __mem_cgroup_try_charge() charges up to CHARGE_BATCH
 * pages at once and caches the surplus here so that subsequent single-page
 * charges can be served without touching the res_counter.
 */
#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached;	/* memcg the stock belongs to */
	unsigned int nr_pages;		/* pre-charged pages available */
	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE (0)
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
1949
1950
1951
1952
1953
1954
1955
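/*
 * consume_stock: try to satisfy a single-page charge from this cpu's stock.
 * Returns true on success; the caller then skips the res_counter entirely.
 */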
1956static bool consume_stock(struct mem_cgroup *memcg)
1957{
1958 struct memcg_stock_pcp *stock;
1959 bool ret = true;
1960
1961 stock = &get_cpu_var(memcg_stock);
1962 if (memcg == stock->cached && stock->nr_pages)
1963 stock->nr_pages--;
1964 else
1965 ret = false;
1966 put_cpu_var(memcg_stock);
1967 return ret;
1968}
1969
1970
1971
1972
1973static void drain_stock(struct memcg_stock_pcp *stock)
1974{
1975 struct mem_cgroup *old = stock->cached;
1976
1977 if (stock->nr_pages) {
1978 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
1979
1980 res_counter_uncharge(&old->res, bytes);
1981 if (do_swap_account)
1982 res_counter_uncharge(&old->memsw, bytes);
1983 stock->nr_pages = 0;
1984 }
1985 stock->cached = NULL;
1986}
1987
1988
1989
1990
1991
1992static void drain_local_stock(struct work_struct *dummy)
1993{
1994 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1995 drain_stock(stock);
1996 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1997}
1998
1999
2000
2001
2002
2003static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2004{
2005 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2006
2007 if (stock->cached != memcg) {
2008 drain_stock(stock);
2009 stock->cached = memcg;
2010 }
2011 stock->nr_pages += nr_pages;
2012 put_cpu_var(memcg_stock);
2013}
2014
2015
2016
2017
2018
2019
2020static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2021{
2022 int cpu, curcpu;
2023
2024
2025 get_online_cpus();
2026 curcpu = get_cpu();
2027 for_each_online_cpu(cpu) {
2028 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2029 struct mem_cgroup *memcg;
2030
2031 memcg = stock->cached;
2032 if (!memcg || !stock->nr_pages)
2033 continue;
2034 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2035 continue;
2036 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2037 if (cpu == curcpu)
2038 drain_local_stock(&stock->work);
2039 else
2040 schedule_work_on(cpu, &stock->work);
2041 }
2042 }
2043 put_cpu();
2044
2045 if (!sync)
2046 goto out;
2047
2048 for_each_online_cpu(cpu) {
2049 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2050 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2051 flush_work(&stock->work);
2052 }
2053out:
2054 put_online_cpus();
2055}
2056
2057
2058
2059
2060
2061
2062
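/*
 * Asynchronously drain the per-cpu charge stocks that belong to
 * @root_memcg's hierarchy.  Uses a trylock so that concurrent drainers
 * simply skip the work instead of piling up.
 */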
2063static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2064{
2065
2066
2067
2068 if (!mutex_trylock(&percpu_charge_mutex))
2069 return;
2070 drain_all_stock(root_memcg, false);
2071 mutex_unlock(&percpu_charge_mutex);
2072}
2073
2074
2075static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2076{
2077
2078 mutex_lock(&percpu_charge_mutex);
2079 drain_all_stock(root_memcg, true);
2080 mutex_unlock(&percpu_charge_mutex);
2081}
2082
2083
2084
2085
2086
2087static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2088{
2089 int i;
2090
2091 spin_lock(&memcg->pcp_counter_lock);
2092 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2093 long x = per_cpu(memcg->stat->count[i], cpu);
2094
2095 per_cpu(memcg->stat->count[i], cpu) = 0;
2096 memcg->nocpu_base.count[i] += x;
2097 }
2098 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2099 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2100
2101 per_cpu(memcg->stat->events[i], cpu) = 0;
2102 memcg->nocpu_base.events[i] += x;
2103 }
2104
2105 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2106 spin_unlock(&memcg->pcp_counter_lock);
2107}
2108
2109static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2110{
2111 int idx = MEM_CGROUP_ON_MOVE;
2112
2113 spin_lock(&memcg->pcp_counter_lock);
2114 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2115 spin_unlock(&memcg->pcp_counter_lock);
2116}
2117
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	if (action == CPU_ONLINE) {
		for_each_mem_cgroup(iter)
			synchronize_mem_cgroup_on_move(iter, cpu);
		return NOTIFY_OK;
	}

	/* Only a dead cpu needs its counters and stock drained below. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}
2142

/* Possible outcomes of mem_cgroup_do_charge(), consumed by __mem_cgroup_try_charge(). */
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry, maybe with a smaller batch */
	CHARGE_NOMEM,		/* over the limit and reclaim did not help */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT not set, cannot reclaim */
	CHARGE_OOM_DIE,		/* the current task is being killed (OOM or fatal signal) */
};
2152
2153static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2154 unsigned int nr_pages, bool oom_check)
2155{
2156 unsigned long csize = nr_pages * PAGE_SIZE;
2157 struct mem_cgroup *mem_over_limit;
2158 struct res_counter *fail_res;
2159 unsigned long flags = 0;
2160 int ret;
2161
2162 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2163
2164 if (likely(!ret)) {
2165 if (!do_swap_account)
2166 return CHARGE_OK;
2167 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2168 if (likely(!ret))
2169 return CHARGE_OK;
2170
2171 res_counter_uncharge(&memcg->res, csize);
2172 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2173 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2174 } else
2175 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2176
2177
2178
2179
2180
2181
2182
2183 if (nr_pages == CHARGE_BATCH)
2184 return CHARGE_RETRY;
2185
2186 if (!(gfp_mask & __GFP_WAIT))
2187 return CHARGE_WOULDBLOCK;
2188
2189 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2190 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2191 return CHARGE_RETRY;
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201 if (nr_pages == 1 && ret)
2202 return CHARGE_RETRY;
2203
2204
2205
2206
2207
2208 if (mem_cgroup_wait_acct_move(mem_over_limit))
2209 return CHARGE_RETRY;
2210
2211
2212 if (!oom_check)
2213 return CHARGE_NOMEM;
2214
2215 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2216 return CHARGE_OOM_DIE;
2217
2218 return CHARGE_RETRY;
2219}
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242static int __mem_cgroup_try_charge(struct mm_struct *mm,
2243 gfp_t gfp_mask,
2244 unsigned int nr_pages,
2245 struct mem_cgroup **ptr,
2246 bool oom)
2247{
2248 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2249 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2250 struct mem_cgroup *memcg = NULL;
2251 int ret;
2252
2253
2254
2255
2256
2257
2258 if (unlikely(test_thread_flag(TIF_MEMDIE)
2259 || fatal_signal_pending(current)))
2260 goto bypass;
2261
2262
2263
2264
2265
2266
2267
2268 if (!*ptr && !mm)
2269 *ptr = root_mem_cgroup;
2270again:
2271 if (*ptr) {
2272 memcg = *ptr;
2273 VM_BUG_ON(css_is_removed(&memcg->css));
2274 if (mem_cgroup_is_root(memcg))
2275 goto done;
2276 if (nr_pages == 1 && consume_stock(memcg))
2277 goto done;
2278 css_get(&memcg->css);
2279 } else {
2280 struct task_struct *p;
2281
2282 rcu_read_lock();
2283 p = rcu_dereference(mm->owner);
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294 memcg = mem_cgroup_from_task(p);
2295 if (!memcg)
2296 memcg = root_mem_cgroup;
2297 if (mem_cgroup_is_root(memcg)) {
2298 rcu_read_unlock();
2299 goto done;
2300 }
2301 if (nr_pages == 1 && consume_stock(memcg)) {
2302
2303
2304
2305
2306
2307
2308
2309
2310 rcu_read_unlock();
2311 goto done;
2312 }
2313
2314 if (!css_tryget(&memcg->css)) {
2315 rcu_read_unlock();
2316 goto again;
2317 }
2318 rcu_read_unlock();
2319 }
2320
2321 do {
2322 bool oom_check;
2323
2324
2325 if (fatal_signal_pending(current)) {
2326 css_put(&memcg->css);
2327 goto bypass;
2328 }
2329
2330 oom_check = false;
2331 if (oom && !nr_oom_retries) {
2332 oom_check = true;
2333 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2334 }
2335
2336 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2337 switch (ret) {
2338 case CHARGE_OK:
2339 break;
2340 case CHARGE_RETRY:
2341 batch = nr_pages;
2342 css_put(&memcg->css);
2343 memcg = NULL;
2344 goto again;
2345 case CHARGE_WOULDBLOCK:
2346 css_put(&memcg->css);
2347 goto nomem;
2348 case CHARGE_NOMEM:
2349 if (!oom) {
2350 css_put(&memcg->css);
2351 goto nomem;
2352 }
2353
2354 nr_oom_retries--;
2355 break;
2356 case CHARGE_OOM_DIE:
2357 css_put(&memcg->css);
2358 goto bypass;
2359 }
2360 } while (ret != CHARGE_OK);
2361
2362 if (batch > nr_pages)
2363 refill_stock(memcg, batch - nr_pages);
2364 css_put(&memcg->css);
2365done:
2366 *ptr = memcg;
2367 return 0;
2368nomem:
2369 *ptr = NULL;
2370 return -ENOMEM;
2371bypass:
2372 *ptr = root_mem_cgroup;
2373 return -EINTR;
2374}
2375
2376
2377
2378
2379
2380
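/*
 * Undo a charge obtained from __mem_cgroup_try_charge(): give @nr_pages
 * back to the res counter (and to memsw when swap accounting is on).
 */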
2381static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2382 unsigned int nr_pages)
2383{
2384 if (!mem_cgroup_is_root(memcg)) {
2385 unsigned long bytes = nr_pages * PAGE_SIZE;
2386
2387 res_counter_uncharge(&memcg->res, bytes);
2388 if (do_swap_account)
2389 res_counter_uncharge(&memcg->memsw, bytes);
2390 }
2391}
2392
2393
2394
2395
2396
2397
2398
2399static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2400{
2401 struct cgroup_subsys_state *css;
2402
2403
2404 if (!id)
2405 return NULL;
2406 css = css_lookup(&mem_cgroup_subsys, id);
2407 if (!css)
2408 return NULL;
2409 return container_of(css, struct mem_cgroup, css);
2410}
2411
2412struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2413{
2414 struct mem_cgroup *memcg = NULL;
2415 struct page_cgroup *pc;
2416 unsigned short id;
2417 swp_entry_t ent;
2418
2419 VM_BUG_ON(!PageLocked(page));
2420
2421 pc = lookup_page_cgroup(page);
2422 lock_page_cgroup(pc);
2423 if (PageCgroupUsed(pc)) {
2424 memcg = pc->mem_cgroup;
2425 if (memcg && !css_tryget(&memcg->css))
2426 memcg = NULL;
2427 } else if (PageSwapCache(page)) {
2428 ent.val = page_private(page);
2429 id = lookup_swap_cgroup_id(ent);
2430 rcu_read_lock();
2431 memcg = mem_cgroup_lookup(id);
2432 if (memcg && !css_tryget(&memcg->css))
2433 memcg = NULL;
2434 rcu_read_unlock();
2435 }
2436 unlock_page_cgroup(pc);
2437 return memcg;
2438}
2439
2440static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2441 struct page *page,
2442 unsigned int nr_pages,
2443 struct page_cgroup *pc,
2444 enum charge_type ctype,
2445 bool lrucare)
2446{
2447 struct zone *uninitialized_var(zone);
2448 bool was_on_lru = false;
2449
2450 lock_page_cgroup(pc);
2451 if (unlikely(PageCgroupUsed(pc))) {
2452 unlock_page_cgroup(pc);
2453 __mem_cgroup_cancel_charge(memcg, nr_pages);
2454 return;
2455 }
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465 if (lrucare) {
2466 zone = page_zone(page);
2467 spin_lock_irq(&zone->lru_lock);
2468 if (PageLRU(page)) {
2469 ClearPageLRU(page);
2470 del_page_from_lru_list(zone, page, page_lru(page));
2471 was_on_lru = true;
2472 }
2473 }
2474
2475 pc->mem_cgroup = memcg;
2476
2477
2478
2479
2480
2481
2482
2483 smp_wmb();
2484 switch (ctype) {
2485 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2486 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2487 SetPageCgroupCache(pc);
2488 SetPageCgroupUsed(pc);
2489 break;
2490 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2491 ClearPageCgroupCache(pc);
2492 SetPageCgroupUsed(pc);
2493 break;
2494 default:
2495 break;
2496 }
2497
2498 if (lrucare) {
2499 if (was_on_lru) {
2500 VM_BUG_ON(PageLRU(page));
2501 SetPageLRU(page);
2502 add_page_to_lru_list(zone, page, page_lru(page));
2503 }
2504 spin_unlock_irq(&zone->lru_lock);
2505 }
2506
2507 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2508 unlock_page_cgroup(pc);
2509
2510
2511
2512
2513
2514
2515 memcg_check_events(memcg, page);
2516}
2517
2518#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2519
2520#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2521 (1 << PCG_MIGRATION))
2522
2523
2524
2525
2526
2527
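/*
 * Called when a transparent hugepage is split: propagate the head page's
 * memcg to every tail page_cgroup, copying the flags except the
 * lock/move-lock/migration bits (PCGF_NOCOPY_AT_SPLIT).
 */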
2528void mem_cgroup_split_huge_fixup(struct page *head)
2529{
2530 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2531 struct page_cgroup *pc;
2532 int i;
2533
2534 if (mem_cgroup_disabled())
2535 return;
2536 for (i = 1; i < HPAGE_PMD_NR; i++) {
2537 pc = head_pc + i;
2538 pc->mem_cgroup = head_pc->mem_cgroup;
2539 smp_wmb();
2540 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2541 }
2542}
2543#endif
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563static int mem_cgroup_move_account(struct page *page,
2564 unsigned int nr_pages,
2565 struct page_cgroup *pc,
2566 struct mem_cgroup *from,
2567 struct mem_cgroup *to,
2568 bool uncharge)
2569{
2570 unsigned long flags;
2571 int ret;
2572
2573 VM_BUG_ON(from == to);
2574 VM_BUG_ON(PageLRU(page));
2575
2576
2577
2578
2579
2580
2581 ret = -EBUSY;
2582 if (nr_pages > 1 && !PageTransHuge(page))
2583 goto out;
2584
2585 lock_page_cgroup(pc);
2586
2587 ret = -EINVAL;
2588 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2589 goto unlock;
2590
2591 move_lock_page_cgroup(pc, &flags);
2592
2593 if (PageCgroupFileMapped(pc)) {
2594
2595 preempt_disable();
2596 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2597 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2598 preempt_enable();
2599 }
2600 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2601 if (uncharge)
2602
2603 __mem_cgroup_cancel_charge(from, nr_pages);
2604
2605
2606 pc->mem_cgroup = to;
2607 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2608
2609
2610
2611
2612
2613
2614
2615 move_unlock_page_cgroup(pc, &flags);
2616 ret = 0;
2617unlock:
2618 unlock_page_cgroup(pc);
2619
2620
2621
2622 memcg_check_events(to, page);
2623 memcg_check_events(from, page);
2624out:
2625 return ret;
2626}
2627
2628
2629
2630
2631
2632static int mem_cgroup_move_parent(struct page *page,
2633 struct page_cgroup *pc,
2634 struct mem_cgroup *child,
2635 gfp_t gfp_mask)
2636{
2637 struct cgroup *cg = child->css.cgroup;
2638 struct cgroup *pcg = cg->parent;
2639 struct mem_cgroup *parent;
2640 unsigned int nr_pages;
2641 unsigned long uninitialized_var(flags);
2642 int ret;
2643
2644
2645 if (!pcg)
2646 return -EINVAL;
2647
2648 ret = -EBUSY;
2649 if (!get_page_unless_zero(page))
2650 goto out;
2651 if (isolate_lru_page(page))
2652 goto put;
2653
2654 nr_pages = hpage_nr_pages(page);
2655
2656 parent = mem_cgroup_from_cont(pcg);
2657 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2658 if (ret)
2659 goto put_back;
2660
2661 if (nr_pages > 1)
2662 flags = compound_lock_irqsave(page);
2663
2664 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2665 if (ret)
2666 __mem_cgroup_cancel_charge(parent, nr_pages);
2667
2668 if (nr_pages > 1)
2669 compound_unlock_irqrestore(page, flags);
2670put_back:
2671 putback_lru_page(page);
2672put:
2673 put_page(page);
2674out:
2675 return ret;
2676}
2677
2678
2679
2680
2681
2682
2683
2684static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2685 gfp_t gfp_mask, enum charge_type ctype)
2686{
2687 struct mem_cgroup *memcg = NULL;
2688 unsigned int nr_pages = 1;
2689 struct page_cgroup *pc;
2690 bool oom = true;
2691 int ret;
2692
2693 if (PageTransHuge(page)) {
2694 nr_pages <<= compound_order(page);
2695 VM_BUG_ON(!PageTransHuge(page));
2696
2697
2698
2699
2700 oom = false;
2701 }
2702
2703 pc = lookup_page_cgroup(page);
2704 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2705 if (ret == -ENOMEM)
2706 return ret;
2707 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
2708 return 0;
2709}
2710
2711int mem_cgroup_newpage_charge(struct page *page,
2712 struct mm_struct *mm, gfp_t gfp_mask)
2713{
2714 if (mem_cgroup_disabled())
2715 return 0;
2716 VM_BUG_ON(page_mapped(page));
2717 VM_BUG_ON(page->mapping && !PageAnon(page));
2718 VM_BUG_ON(!mm);
2719 return mem_cgroup_charge_common(page, mm, gfp_mask,
2720 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2721}
2722
2723static void
2724__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2725 enum charge_type ctype);
2726
2727int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2728 gfp_t gfp_mask)
2729{
2730 struct mem_cgroup *memcg = NULL;
2731 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2732 int ret;
2733
2734 if (mem_cgroup_disabled())
2735 return 0;
2736 if (PageCompound(page))
2737 return 0;
2738
2739 if (unlikely(!mm))
2740 mm = &init_mm;
2741 if (!page_is_file_cache(page))
2742 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2743
2744 if (!PageSwapCache(page))
2745 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2746 else {
2747 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2748 if (!ret)
2749 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2750 }
2751 return ret;
2752}
2753
2754
2755
2756
2757
2758
2759
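/*
 * Charge a page that is being swapped in.  When swap accounting is on and
 * the page is still in the swap cache, the charge is made against the
 * memcg recorded for the swap entry; otherwise it falls back to the memcg
 * of @mm.  The *memcgp returned here must then be handed to
 * mem_cgroup_commit_charge_swapin() or mem_cgroup_cancel_charge_swapin().
 *
 * Illustrative swap-in sequence (a sketch, not taken from a real caller):
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg))
 *		goto out;
 *	...map the page...
 *	mem_cgroup_commit_charge_swapin(page, memcg);
 *	(or mem_cgroup_cancel_charge_swapin(memcg) if the fault fails)
 */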
2760int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2761 struct page *page,
2762 gfp_t mask, struct mem_cgroup **memcgp)
2763{
2764 struct mem_cgroup *memcg;
2765 int ret;
2766
2767 *memcgp = NULL;
2768
2769 if (mem_cgroup_disabled())
2770 return 0;
2771
2772 if (!do_swap_account)
2773 goto charge_cur_mm;
2774
2775
2776
2777
2778
2779
2780 if (!PageSwapCache(page))
2781 goto charge_cur_mm;
2782 memcg = try_get_mem_cgroup_from_page(page);
2783 if (!memcg)
2784 goto charge_cur_mm;
2785 *memcgp = memcg;
2786 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2787 css_put(&memcg->css);
2788 if (ret == -EINTR)
2789 ret = 0;
2790 return ret;
2791charge_cur_mm:
2792 if (unlikely(!mm))
2793 mm = &init_mm;
2794 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2795 if (ret == -EINTR)
2796 ret = 0;
2797 return ret;
2798}
2799
2800static void
2801__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2802 enum charge_type ctype)
2803{
2804 struct page_cgroup *pc;
2805
2806 if (mem_cgroup_disabled())
2807 return;
2808 if (!memcg)
2809 return;
2810 cgroup_exclude_rmdir(&memcg->css);
2811
2812 pc = lookup_page_cgroup(page);
2813 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
2814
2815
2816
2817
2818
2819
2820
2821 if (do_swap_account && PageSwapCache(page)) {
2822 swp_entry_t ent = {.val = page_private(page)};
2823 struct mem_cgroup *swap_memcg;
2824 unsigned short id;
2825
2826 id = swap_cgroup_record(ent, 0);
2827 rcu_read_lock();
2828 swap_memcg = mem_cgroup_lookup(id);
2829 if (swap_memcg) {
2830
2831
2832
2833
2834 if (!mem_cgroup_is_root(swap_memcg))
2835 res_counter_uncharge(&swap_memcg->memsw,
2836 PAGE_SIZE);
2837 mem_cgroup_swap_statistics(swap_memcg, false);
2838 mem_cgroup_put(swap_memcg);
2839 }
2840 rcu_read_unlock();
2841 }
2842
2843
2844
2845
2846
2847 cgroup_release_and_wakeup_rmdir(&memcg->css);
2848}
2849
2850void mem_cgroup_commit_charge_swapin(struct page *page,
2851 struct mem_cgroup *memcg)
2852{
2853 __mem_cgroup_commit_charge_swapin(page, memcg,
2854 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2855}
2856
2857void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2858{
2859 if (mem_cgroup_disabled())
2860 return;
2861 if (!memcg)
2862 return;
2863 __mem_cgroup_cancel_charge(memcg, 1);
2864}
2865
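/*
 * Uncharge @nr_pages from @memcg.  If the caller has opened a batch with
 * mem_cgroup_uncharge_start(), single-page uncharges against the same
 * memcg are only accumulated here and the res_counters are touched once
 * in mem_cgroup_uncharge_end(); otherwise the counters are uncharged
 * directly.  On swap-out, memsw is not uncharged because swap usage does
 * not decrease in that case.
 */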
2866static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2867 unsigned int nr_pages,
2868 const enum charge_type ctype)
2869{
2870 struct memcg_batch_info *batch = NULL;
2871 bool uncharge_memsw = true;
2872
2873
2874 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2875 uncharge_memsw = false;
2876
	batch = &current->memcg_batch;
2878
2879
2880
2881
2882
2883 if (!batch->memcg)
2884 batch->memcg = memcg;
2885
2886
2887
2888
2889
2890
2891
2892
2893 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2894 goto direct_uncharge;
2895
2896 if (nr_pages > 1)
2897 goto direct_uncharge;
2898
2899
2900
2901
2902
2903
2904 if (batch->memcg != memcg)
2905 goto direct_uncharge;
2906
2907 batch->nr_pages++;
2908 if (uncharge_memsw)
2909 batch->memsw_nr_pages++;
2910 return;
2911direct_uncharge:
2912 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
2913 if (uncharge_memsw)
2914 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2915 if (unlikely(batch->memcg != memcg))
2916 memcg_oom_recover(memcg);
2917 return;
2918}
2919
2920
2921
2922
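/*
 * Core uncharge path: clear the page_cgroup's USED bit and return the
 * pages to the owning memcg's counters, unless the page is still mapped,
 * still under migration, or sitting in the swap cache.  Returns the memcg
 * that was uncharged, or NULL if nothing was done.
 */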
2923static struct mem_cgroup *
2924__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2925{
2926 struct mem_cgroup *memcg = NULL;
2927 unsigned int nr_pages = 1;
2928 struct page_cgroup *pc;
2929
2930 if (mem_cgroup_disabled())
2931 return NULL;
2932
2933 if (PageSwapCache(page))
2934 return NULL;
2935
2936 if (PageTransHuge(page)) {
2937 nr_pages <<= compound_order(page);
2938 VM_BUG_ON(!PageTransHuge(page));
2939 }
2940
2941
2942
2943 pc = lookup_page_cgroup(page);
2944 if (unlikely(!PageCgroupUsed(pc)))
2945 return NULL;
2946
2947 lock_page_cgroup(pc);
2948
2949 memcg = pc->mem_cgroup;
2950
2951 if (!PageCgroupUsed(pc))
2952 goto unlock_out;
2953
2954 switch (ctype) {
2955 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2956 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957
2958 if (page_mapped(page) || PageCgroupMigration(pc))
2959 goto unlock_out;
2960 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2962 if (!PageAnon(page)) {
2963 if (page->mapping && !page_is_file_cache(page))
2964 goto unlock_out;
2965 } else if (page_mapped(page))
2966 goto unlock_out;
2967 break;
2968 default:
2969 break;
2970 }
2971
2972 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
2973
2974 ClearPageCgroupUsed(pc);
2975
2976
2977
2978
2979
2980
2981
2982 unlock_page_cgroup(pc);
2983
2984
2985
2986
2987 memcg_check_events(memcg, page);
2988 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2989 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg);
2991 }
2992 if (!mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994
2995 return memcg;
2996
2997unlock_out:
2998 unlock_page_cgroup(pc);
2999 return NULL;
3000}
3001
3002void mem_cgroup_uncharge_page(struct page *page)
3003{
3004
3005 if (page_mapped(page))
3006 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3009}
3010
3011void mem_cgroup_uncharge_cache_page(struct page *page)
3012{
3013 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3016}
3017
3018
3019
3020
3021
3022
3023
3024
3025
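/*
 * Batched uncharge.  mem_cgroup_uncharge_start()/end() bracket code paths
 * that uncharge many pages in a row (e.g. unmap or truncate); while the
 * batch is open, mem_cgroup_do_uncharge() only accumulates page counts
 * and the res_counters are hit once in mem_cgroup_uncharge_end().  The
 * pair nests; only the outermost end flushes the batch.
 */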
3026void mem_cgroup_uncharge_start(void)
3027{
3028 current->memcg_batch.do_batch++;
3029
3030 if (current->memcg_batch.do_batch == 1) {
3031 current->memcg_batch.memcg = NULL;
3032 current->memcg_batch.nr_pages = 0;
3033 current->memcg_batch.memsw_nr_pages = 0;
3034 }
3035}
3036
3037void mem_cgroup_uncharge_end(void)
3038{
	struct memcg_batch_info *batch = &current->memcg_batch;
3040
3041 if (!batch->do_batch)
3042 return;
3043
3044 batch->do_batch--;
3045 if (batch->do_batch)
3046 return;
3047
3048 if (!batch->memcg)
3049 return;
3050
3051
3052
3053
3054 if (batch->nr_pages)
3055 res_counter_uncharge(&batch->memcg->res,
3056 batch->nr_pages * PAGE_SIZE);
3057 if (batch->memsw_nr_pages)
3058 res_counter_uncharge(&batch->memcg->memsw,
3059 batch->memsw_nr_pages * PAGE_SIZE);
3060 memcg_oom_recover(batch->memcg);
3061
3062 batch->memcg = NULL;
3063}
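/*
 * Illustrative caller pattern (a sketch, not code from this file): a path
 * that drops many page cache pages at once brackets the loop so the
 * res_counter is updated once rather than per page:
 *
 *	mem_cgroup_uncharge_start();
 *	list_for_each_entry_safe(page, next, &pages_to_free, lru)
 *		mem_cgroup_uncharge_cache_page(page);
 *	mem_cgroup_uncharge_end();
 */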
3064
3065#ifdef CONFIG_SWAP
3066
3067
3068
3069
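/*
 * Drop the page account of a page leaving the swap cache.  When the page
 * was really swapped out (@swapout) and swap accounting is on, the owning
 * memcg's css_id is recorded in the swap_cgroup of @ent so the memsw
 * charge can be given back when the swap entry is finally freed.
 */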
3070void
3071mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3072{
3073 struct mem_cgroup *memcg;
3074 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3075
3076 if (!swapout)
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078
3079 memcg = __mem_cgroup_uncharge_common(page, ctype);
3080
3081
3082
3083
3084
3085 if (do_swap_account && swapout && memcg)
3086 swap_cgroup_record(ent, css_id(&memcg->css));
3087}
3088#endif
3089
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3091
3092
3093
3094
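/*
 * Called when a swap entry is freed: clear the swap_cgroup record and
 * give back the memsw charge and the memcg reference that were left
 * behind by the swap-out path above.
 */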
3095void mem_cgroup_uncharge_swap(swp_entry_t ent)
3096{
3097 struct mem_cgroup *memcg;
3098 unsigned short id;
3099
3100 if (!do_swap_account)
3101 return;
3102
3103 id = swap_cgroup_record(ent, 0);
3104 rcu_read_lock();
3105 memcg = mem_cgroup_lookup(id);
3106 if (memcg) {
3107
3108
3109
3110
3111 if (!mem_cgroup_is_root(memcg))
3112 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3113 mem_cgroup_swap_statistics(memcg, false);
3114 mem_cgroup_put(memcg);
3115 }
3116 rcu_read_unlock();
3117}
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
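/*
 * mem_cgroup_move_swap_account - move a swap charge and its record.
 *
 * Atomically rewrite the swap_cgroup record of @entry from @from's css id
 * to @to's.  This succeeds (returns 0) only if the record still belongs
 * to @from; otherwise -EINVAL is returned.  On success the swap
 * statistics and memcg references follow the record, and with @need_fixup
 * the memsw charge is uncharged from @from while the duplicate "mem"
 * charge that the caller made on @to is dropped.
 */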
3134static int mem_cgroup_move_swap_account(swp_entry_t entry,
3135 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3136{
3137 unsigned short old_id, new_id;
3138
3139 old_id = css_id(&from->css);
3140 new_id = css_id(&to->css);
3141
3142 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3143 mem_cgroup_swap_statistics(from, false);
3144 mem_cgroup_swap_statistics(to, true);
3145
3146
3147
3148
3149
3150
3151
3152
3153 mem_cgroup_get(to);
3154 if (need_fixup) {
3155 if (!mem_cgroup_is_root(from))
3156 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3157 mem_cgroup_put(from);
3158
3159
3160
3161
3162 if (!mem_cgroup_is_root(to))
3163 res_counter_uncharge(&to->res, PAGE_SIZE);
3164 }
3165 return 0;
3166 }
3167 return -EINVAL;
3168}
3169#else
3170static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3171 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3172{
3173 return -EINVAL;
3174}
3175#endif
3176
3177
3178
3179
3180
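/*
 * Prepare the memcg side of page migration: pin the memcg that currently
 * owns @page, pre-charge it for @newpage, and mark anonymous pages with
 * the migration flag so a racing unmap does not uncharge them under us.
 * The charge is committed to @newpage here; mem_cgroup_end_migration()
 * below settles which of the two pages finally keeps it.
 *
 * Typical call order in a migration path (illustrative sketch only):
 *
 *	if (mem_cgroup_prepare_migration(page, newpage, &memcg, GFP_KERNEL))
 *		...back off...
 *	...copy the page and switch the mappings...
 *	mem_cgroup_end_migration(memcg, page, newpage, migration_ok);
 */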
3181int mem_cgroup_prepare_migration(struct page *page,
3182 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
3183{
3184 struct mem_cgroup *memcg = NULL;
3185 struct page_cgroup *pc;
3186 enum charge_type ctype;
3187 int ret = 0;
3188
3189 *memcgp = NULL;
3190
3191 VM_BUG_ON(PageTransHuge(page));
3192 if (mem_cgroup_disabled())
3193 return 0;
3194
3195 pc = lookup_page_cgroup(page);
3196 lock_page_cgroup(pc);
3197 if (PageCgroupUsed(pc)) {
3198 memcg = pc->mem_cgroup;
3199 css_get(&memcg->css);
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229 if (PageAnon(page))
3230 SetPageCgroupMigration(pc);
3231 }
3232 unlock_page_cgroup(pc);
3233
3234
3235
3236
3237 if (!memcg)
3238 return 0;
3239
3240 *memcgp = memcg;
3241 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3242 css_put(&memcg->css);
3243 if (ret) {
3244 if (PageAnon(page)) {
3245 lock_page_cgroup(pc);
3246 ClearPageCgroupMigration(pc);
3247 unlock_page_cgroup(pc);
3248
3249
3250
3251 mem_cgroup_uncharge_page(page);
3252 }
3253
3254 return -ENOMEM;
3255 }
3256
3257
3258
3259
3260
3261
3262 pc = lookup_page_cgroup(newpage);
3263 if (PageAnon(page))
3264 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3265 else if (page_is_file_cache(page))
3266 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3267 else
3268 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3269 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
3270 return ret;
3271}
3272
3273
3274void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3275 struct page *oldpage, struct page *newpage, bool migration_ok)
3276{
3277 struct page *used, *unused;
3278 struct page_cgroup *pc;
3279
3280 if (!memcg)
3281 return;
3282
3283 cgroup_exclude_rmdir(&memcg->css);
3284 if (!migration_ok) {
3285 used = oldpage;
3286 unused = newpage;
3287 } else {
3288 used = newpage;
3289 unused = oldpage;
3290 }
3291
3292
3293
3294
3295
3296 pc = lookup_page_cgroup(oldpage);
3297 lock_page_cgroup(pc);
3298 ClearPageCgroupMigration(pc);
3299 unlock_page_cgroup(pc);
3300
3301 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311 if (PageAnon(used))
3312 mem_cgroup_uncharge_page(used);
3313
3314
3315
3316
3317
3318
3319 cgroup_release_and_wakeup_rmdir(&memcg->css);
3320}
3321
3322
3323
3324
3325
3326
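/*
 * Used when a page cache page is replaced in place: transfer the existing
 * charge from @oldpage to @newpage without going through a full uncharge
 * and re-charge of the res_counters, keeping the statistics consistent.
 */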
3327void mem_cgroup_replace_page_cache(struct page *oldpage,
3328 struct page *newpage)
3329{
3330 struct mem_cgroup *memcg;
3331 struct page_cgroup *pc;
3332 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3333
3334 if (mem_cgroup_disabled())
3335 return;
3336
3337 pc = lookup_page_cgroup(oldpage);
3338
3339 lock_page_cgroup(pc);
3340 memcg = pc->mem_cgroup;
3341 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3342 ClearPageCgroupUsed(pc);
3343 unlock_page_cgroup(pc);
3344
3345 if (PageSwapBacked(oldpage))
3346 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3347
3348
3349
3350
3351
3352
3353 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
3354}
3355
3356#ifdef CONFIG_DEBUG_VM
3357static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3358{
3359 struct page_cgroup *pc;
3360
3361 pc = lookup_page_cgroup(page);
3362
3363
3364
3365
3366
3367 if (likely(pc) && PageCgroupUsed(pc))
3368 return pc;
3369 return NULL;
3370}
3371
3372bool mem_cgroup_bad_page_check(struct page *page)
3373{
3374 if (mem_cgroup_disabled())
3375 return false;
3376
3377 return lookup_page_cgroup_used(page) != NULL;
3378}
3379
3380void mem_cgroup_print_bad_page(struct page *page)
3381{
3382 struct page_cgroup *pc;
3383
3384 pc = lookup_page_cgroup_used(page);
3385 if (pc) {
3386 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3387 pc, pc->flags, pc->mem_cgroup);
3388 }
3389}
3390#endif
3391
3392static DEFINE_MUTEX(set_limit_mutex);
3393
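/*
 * Write a new memory.limit_in_bytes value.  The new limit may not exceed
 * the current memsw limit; if usage is above the new value the counter
 * cannot simply be lowered, so reclaim is retried a bounded number of
 * times (scaled by the number of child groups) until the usage fits or
 * the attempt is given up.
 */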
3394static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3395 unsigned long long val)
3396{
3397 int retry_count;
3398 u64 memswlimit, memlimit;
3399 int ret = 0;
3400 int children = mem_cgroup_count_children(memcg);
3401 u64 curusage, oldusage;
3402 int enlarge;
3403
3404
3405
3406
3407
3408
3409 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3410
3411 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3412
3413 enlarge = 0;
3414 while (retry_count) {
3415 if (signal_pending(current)) {
3416 ret = -EINTR;
3417 break;
3418 }
3419
3420
3421
3422
3423
3424 mutex_lock(&set_limit_mutex);
3425 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3426 if (memswlimit < val) {
3427 ret = -EINVAL;
3428 mutex_unlock(&set_limit_mutex);
3429 break;
3430 }
3431
3432 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3433 if (memlimit < val)
3434 enlarge = 1;
3435
3436 ret = res_counter_set_limit(&memcg->res, val);
3437 if (!ret) {
3438 if (memswlimit == val)
3439 memcg->memsw_is_minimum = true;
3440 else
3441 memcg->memsw_is_minimum = false;
3442 }
3443 mutex_unlock(&set_limit_mutex);
3444
3445 if (!ret)
3446 break;
3447
3448 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3449 MEM_CGROUP_RECLAIM_SHRINK);
3450 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3451
3452 if (curusage >= oldusage)
3453 retry_count--;
3454 else
3455 oldusage = curusage;
3456 }
3457 if (!ret && enlarge)
3458 memcg_oom_recover(memcg);
3459
3460 return ret;
3461}
3462
3463static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3464 unsigned long long val)
3465{
3466 int retry_count;
3467 u64 memlimit, memswlimit, oldusage, curusage;
3468 int children = mem_cgroup_count_children(memcg);
3469 int ret = -EBUSY;
3470 int enlarge = 0;
3471
3472
3473 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3474 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3475 while (retry_count) {
3476 if (signal_pending(current)) {
3477 ret = -EINTR;
3478 break;
3479 }
3480
3481
3482
3483
3484
3485 mutex_lock(&set_limit_mutex);
3486 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3487 if (memlimit > val) {
3488 ret = -EINVAL;
3489 mutex_unlock(&set_limit_mutex);
3490 break;
3491 }
3492 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3493 if (memswlimit < val)
3494 enlarge = 1;
3495 ret = res_counter_set_limit(&memcg->memsw, val);
3496 if (!ret) {
3497 if (memlimit == val)
3498 memcg->memsw_is_minimum = true;
3499 else
3500 memcg->memsw_is_minimum = false;
3501 }
3502 mutex_unlock(&set_limit_mutex);
3503
3504 if (!ret)
3505 break;
3506
3507 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3508 MEM_CGROUP_RECLAIM_NOSWAP |
3509 MEM_CGROUP_RECLAIM_SHRINK);
3510 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3511
3512 if (curusage >= oldusage)
3513 retry_count--;
3514 else
3515 oldusage = curusage;
3516 }
3517 if (!ret && enlarge)
3518 memcg_oom_recover(memcg);
3519 return ret;
3520}
3521
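/*
 * Soft-limit reclaim, used by global reclaim for order-0 allocations
 * only: repeatedly pick the memcg that exceeds its soft limit by the
 * largest amount from this zone's RB tree, reclaim from it, and
 * re-insert it with its updated excess so that other groups get a turn.
 */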
3522unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3523 gfp_t gfp_mask,
3524 unsigned long *total_scanned)
3525{
3526 unsigned long nr_reclaimed = 0;
3527 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3528 unsigned long reclaimed;
3529 int loop = 0;
3530 struct mem_cgroup_tree_per_zone *mctz;
3531 unsigned long long excess;
3532 unsigned long nr_scanned;
3533
3534 if (order > 0)
3535 return 0;
3536
3537 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3538
3539
3540
3541
3542
3543 do {
3544 if (next_mz)
3545 mz = next_mz;
3546 else
3547 mz = mem_cgroup_largest_soft_limit_node(mctz);
3548 if (!mz)
3549 break;
3550
3551 nr_scanned = 0;
3552 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
3553 gfp_mask, &nr_scanned);
3554 nr_reclaimed += reclaimed;
3555 *total_scanned += nr_scanned;
3556 spin_lock(&mctz->lock);
3557
3558
3559
3560
3561
3562 next_mz = NULL;
3563 if (!reclaimed) {
3564 do {
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576 next_mz =
3577 __mem_cgroup_largest_soft_limit_node(mctz);
3578 if (next_mz == mz)
3579 css_put(&next_mz->mem->css);
3580 else
3581 break;
3582 } while (1);
3583 }
3584 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3585 excess = res_counter_soft_limit_excess(&mz->mem->res);
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3596 spin_unlock(&mctz->lock);
3597 css_put(&mz->mem->css);
3598 loop++;
3599
3600
3601
3602
3603
3604 if (!nr_reclaimed &&
3605 (next_mz == NULL ||
3606 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3607 break;
3608 } while (!nr_reclaimed);
3609 if (next_mz)
3610 css_put(&next_mz->mem->css);
3611 return nr_reclaimed;
3612}
3613
3614
3615
3616
3617
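/*
 * Walk one LRU list of a zone and move every page's charge to the parent
 * memcg.  The pages themselves are not reclaimed or freed; only their
 * page_cgroup ownership changes.  Returns -EBUSY when pages remain on the
 * list so that the caller can retry, or the error from the move.
 */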
3618static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3619 int node, int zid, enum lru_list lru)
3620{
3621 struct mem_cgroup_per_zone *mz;
3622 unsigned long flags, loop;
3623 struct list_head *list;
3624 struct page *busy;
3625 struct zone *zone;
3626 int ret = 0;
3627
3628 zone = &NODE_DATA(node)->node_zones[zid];
3629 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3630 list = &mz->lruvec.lists[lru];
3631
3632 loop = MEM_CGROUP_ZSTAT(mz, lru);
3633
3634 loop += 256;
3635 busy = NULL;
3636 while (loop--) {
3637 struct page_cgroup *pc;
3638 struct page *page;
3639
3640 ret = 0;
3641 spin_lock_irqsave(&zone->lru_lock, flags);
3642 if (list_empty(list)) {
3643 spin_unlock_irqrestore(&zone->lru_lock, flags);
3644 break;
3645 }
3646 page = list_entry(list->prev, struct page, lru);
3647 if (busy == page) {
3648 list_move(&page->lru, list);
3649 busy = NULL;
3650 spin_unlock_irqrestore(&zone->lru_lock, flags);
3651 continue;
3652 }
3653 spin_unlock_irqrestore(&zone->lru_lock, flags);
3654
3655 pc = lookup_page_cgroup(page);
3656
3657 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3658 if (ret == -ENOMEM || ret == -EINTR)
3659 break;
3660
3661 if (ret == -EBUSY || ret == -EINVAL) {
3662
3663 busy = page;
3664 cond_resched();
3665 } else
3666 busy = NULL;
3667 }
3668
3669 if (!ret && !list_empty(list))
3670 return -EBUSY;
3671 return ret;
3672}
3673
3674
3675
3676
3677
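/*
 * Drive the group's charge down to zero: move every remaining charge to
 * the parent and, when @free_all is set, try direct reclaim first.
 * Fails with -EBUSY while the group still has tasks or child groups, and
 * with -EINTR if a signal is pending.
 */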
3678static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3679{
3680 int ret;
3681 int node, zid, shrink;
3682 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3683 struct cgroup *cgrp = memcg->css.cgroup;
3684
3685 css_get(&memcg->css);
3686
3687 shrink = 0;
3688
3689 if (free_all)
3690 goto try_to_free;
3691move_account:
3692 do {
3693 ret = -EBUSY;
3694 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3695 goto out;
3696 ret = -EINTR;
3697 if (signal_pending(current))
3698 goto out;
3699
3700 lru_add_drain_all();
3701 drain_all_stock_sync(memcg);
3702 ret = 0;
3703 mem_cgroup_start_move(memcg);
3704 for_each_node_state(node, N_HIGH_MEMORY) {
3705 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3706 enum lru_list l;
3707 for_each_lru(l) {
3708 ret = mem_cgroup_force_empty_list(memcg,
3709 node, zid, l);
3710 if (ret)
3711 break;
3712 }
3713 }
3714 if (ret)
3715 break;
3716 }
3717 mem_cgroup_end_move(memcg);
3718 memcg_oom_recover(memcg);
3719
3720 if (ret == -ENOMEM)
3721 goto try_to_free;
3722 cond_resched();
3723
3724 } while (memcg->res.usage > 0 || ret);
3725out:
3726 css_put(&memcg->css);
3727 return ret;
3728
3729try_to_free:
3730
3731 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3732 ret = -EBUSY;
3733 goto out;
3734 }
3735
3736 lru_add_drain_all();
3737
3738 shrink = 1;
3739 while (nr_retries && memcg->res.usage > 0) {
3740 int progress;
3741
3742 if (signal_pending(current)) {
3743 ret = -EINTR;
3744 goto out;
3745 }
3746 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3747 false);
3748 if (!progress) {
3749 nr_retries--;
3750
3751 congestion_wait(BLK_RW_ASYNC, HZ/10);
3752 }
3753
3754 }
3755 lru_add_drain();
3756
3757 goto move_account;
3758}
3759
3760int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3761{
3762 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3763}
3764
3765
3766static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3767{
3768 return mem_cgroup_from_cont(cont)->use_hierarchy;
3769}
3770
3771static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3772 u64 val)
3773{
3774 int retval = 0;
3775 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3776 struct cgroup *parent = cont->parent;
3777 struct mem_cgroup *parent_memcg = NULL;
3778
3779 if (parent)
3780 parent_memcg = mem_cgroup_from_cont(parent);
3781
3782 cgroup_lock();
3783
3784
3785
3786
3787
3788
3789
3790
3791 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3792 (val == 1 || val == 0)) {
3793 if (list_empty(&cont->children))
3794 memcg->use_hierarchy = val;
3795 else
3796 retval = -EBUSY;
3797 } else
3798 retval = -EINVAL;
3799 cgroup_unlock();
3800
3801 return retval;
3802}
3803
3804
3805static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3806 enum mem_cgroup_stat_index idx)
3807{
3808 struct mem_cgroup *iter;
3809 long val = 0;
3810
3811
3812 for_each_mem_cgroup_tree(iter, memcg)
3813 val += mem_cgroup_read_stat(iter, idx);
3814
3815 if (val < 0)
3816 val = 0;
3817 return val;
3818}
3819
3820static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3821{
3822 u64 val;
3823
3824 if (!mem_cgroup_is_root(memcg)) {
3825 if (!swap)
3826 return res_counter_read_u64(&memcg->res, RES_USAGE);
3827 else
3828 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3829 }
3830
3831 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3832 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3833
3834 if (swap)
3835 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
3836
3837 return val << PAGE_SHIFT;
3838}
3839
3840static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3841{
3842 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3843 u64 val;
3844 int type, name;
3845
3846 type = MEMFILE_TYPE(cft->private);
3847 name = MEMFILE_ATTR(cft->private);
3848 switch (type) {
3849 case _MEM:
3850 if (name == RES_USAGE)
3851 val = mem_cgroup_usage(memcg, false);
3852 else
3853 val = res_counter_read_u64(&memcg->res, name);
3854 break;
3855 case _MEMSWAP:
3856 if (name == RES_USAGE)
3857 val = mem_cgroup_usage(memcg, true);
3858 else
3859 val = res_counter_read_u64(&memcg->memsw, name);
3860 break;
3861 default:
3862 BUG();
3863 break;
3864 }
3865 return val;
3866}
3867
3868
3869
3870
3871static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3872 const char *buffer)
3873{
3874 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3875 int type, name;
3876 unsigned long long val;
3877 int ret;
3878
3879 type = MEMFILE_TYPE(cft->private);
3880 name = MEMFILE_ATTR(cft->private);
3881 switch (name) {
3882 case RES_LIMIT:
3883 if (mem_cgroup_is_root(memcg)) {
3884 ret = -EINVAL;
3885 break;
3886 }
3887
3888 ret = res_counter_memparse_write_strategy(buffer, &val);
3889 if (ret)
3890 break;
3891 if (type == _MEM)
3892 ret = mem_cgroup_resize_limit(memcg, val);
3893 else
3894 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3895 break;
3896 case RES_SOFT_LIMIT:
3897 ret = res_counter_memparse_write_strategy(buffer, &val);
3898 if (ret)
3899 break;
3900
3901
3902
3903
3904
3905 if (type == _MEM)
3906 ret = res_counter_set_soft_limit(&memcg->res, val);
3907 else
3908 ret = -EINVAL;
3909 break;
3910 default:
3911 ret = -EINVAL;
3912 break;
3913 }
3914 return ret;
3915}
3916
3917static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3918 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3919{
3920 struct cgroup *cgroup;
3921 unsigned long long min_limit, min_memsw_limit, tmp;
3922
3923 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3924 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3925 cgroup = memcg->css.cgroup;
3926 if (!memcg->use_hierarchy)
3927 goto out;
3928
3929 while (cgroup->parent) {
3930 cgroup = cgroup->parent;
3931 memcg = mem_cgroup_from_cont(cgroup);
3932 if (!memcg->use_hierarchy)
3933 break;
3934 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3935 min_limit = min(min_limit, tmp);
3936 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3937 min_memsw_limit = min(min_memsw_limit, tmp);
3938 }
3939out:
3940 *mem_limit = min_limit;
3941 *memsw_limit = min_memsw_limit;
3942 return;
3943}
3944
3945static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3946{
3947 struct mem_cgroup *memcg;
3948 int type, name;
3949
3950 memcg = mem_cgroup_from_cont(cont);
3951 type = MEMFILE_TYPE(event);
3952 name = MEMFILE_ATTR(event);
3953 switch (name) {
3954 case RES_MAX_USAGE:
3955 if (type == _MEM)
3956 res_counter_reset_max(&memcg->res);
3957 else
3958 res_counter_reset_max(&memcg->memsw);
3959 break;
3960 case RES_FAILCNT:
3961 if (type == _MEM)
3962 res_counter_reset_failcnt(&memcg->res);
3963 else
3964 res_counter_reset_failcnt(&memcg->memsw);
3965 break;
3966 }
3967
3968 return 0;
3969}
3970
3971static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3972 struct cftype *cft)
3973{
3974 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3975}
3976
3977#ifdef CONFIG_MMU
3978static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3979 struct cftype *cft, u64 val)
3980{
3981 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3982
3983 if (val >= (1 << NR_MOVE_TYPE))
3984 return -EINVAL;
3985
3986
3987
3988
3989
3990 cgroup_lock();
3991 memcg->move_charge_at_immigrate = val;
3992 cgroup_unlock();
3993
3994 return 0;
3995}
3996#else
3997static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3998 struct cftype *cft, u64 val)
3999{
4000 return -ENOSYS;
4001}
4002#endif
4003
4004
4005
4006enum {
4007 MCS_CACHE,
4008 MCS_RSS,
4009 MCS_FILE_MAPPED,
4010 MCS_PGPGIN,
4011 MCS_PGPGOUT,
4012 MCS_SWAP,
4013 MCS_PGFAULT,
4014 MCS_PGMAJFAULT,
4015 MCS_INACTIVE_ANON,
4016 MCS_ACTIVE_ANON,
4017 MCS_INACTIVE_FILE,
4018 MCS_ACTIVE_FILE,
4019 MCS_UNEVICTABLE,
4020 NR_MCS_STAT,
4021};
4022
4023struct mcs_total_stat {
4024 s64 stat[NR_MCS_STAT];
4025};
4026
4027struct {
4028 char *local_name;
4029 char *total_name;
4030} memcg_stat_strings[NR_MCS_STAT] = {
4031 {"cache", "total_cache"},
4032 {"rss", "total_rss"},
4033 {"mapped_file", "total_mapped_file"},
4034 {"pgpgin", "total_pgpgin"},
4035 {"pgpgout", "total_pgpgout"},
4036 {"swap", "total_swap"},
4037 {"pgfault", "total_pgfault"},
4038 {"pgmajfault", "total_pgmajfault"},
4039 {"inactive_anon", "total_inactive_anon"},
4040 {"active_anon", "total_active_anon"},
4041 {"inactive_file", "total_inactive_file"},
4042 {"active_file", "total_active_file"},
4043 {"unevictable", "total_unevictable"}
4044};
4045
4046
4047static void
4048mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4049{
4050 s64 val;
4051
4052
4053 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4054 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4055 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4056 s->stat[MCS_RSS] += val * PAGE_SIZE;
4057 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4058 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4059 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4060 s->stat[MCS_PGPGIN] += val;
4061 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4062 s->stat[MCS_PGPGOUT] += val;
4063 if (do_swap_account) {
4064 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4065 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4066 }
4067 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4068 s->stat[MCS_PGFAULT] += val;
4069 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4070 s->stat[MCS_PGMAJFAULT] += val;
4071
4072
4073 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4074 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4075 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4076 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4077 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4078 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4079 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4080 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4081 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4082 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4083}
4084
4085static void
4086mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4087{
4088 struct mem_cgroup *iter;
4089
4090 for_each_mem_cgroup_tree(iter, memcg)
4091 mem_cgroup_get_local_stat(iter, s);
4092}
4093
4094#ifdef CONFIG_NUMA
4095static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4096{
4097 int nid;
4098 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4099 unsigned long node_nr;
4100 struct cgroup *cont = m->private;
4101 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4102
4103 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4104 seq_printf(m, "total=%lu", total_nr);
4105 for_each_node_state(nid, N_HIGH_MEMORY) {
4106 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4107 seq_printf(m, " N%d=%lu", nid, node_nr);
4108 }
4109 seq_putc(m, '\n');
4110
4111 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4112 seq_printf(m, "file=%lu", file_nr);
4113 for_each_node_state(nid, N_HIGH_MEMORY) {
4114 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4115 LRU_ALL_FILE);
4116 seq_printf(m, " N%d=%lu", nid, node_nr);
4117 }
4118 seq_putc(m, '\n');
4119
4120 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4121 seq_printf(m, "anon=%lu", anon_nr);
4122 for_each_node_state(nid, N_HIGH_MEMORY) {
4123 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4124 LRU_ALL_ANON);
4125 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 }
4127 seq_putc(m, '\n');
4128
4129 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4130 seq_printf(m, "unevictable=%lu", unevictable_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4133 BIT(LRU_UNEVICTABLE));
4134 seq_printf(m, " N%d=%lu", nid, node_nr);
4135 }
4136 seq_putc(m, '\n');
4137 return 0;
4138}
4139#endif
4140
4141static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4142 struct cgroup_map_cb *cb)
4143{
4144 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4145 struct mcs_total_stat mystat;
4146 int i;
4147
4148 memset(&mystat, 0, sizeof(mystat));
4149 mem_cgroup_get_local_stat(mem_cont, &mystat);
4150
4151
4152 for (i = 0; i < NR_MCS_STAT; i++) {
4153 if (i == MCS_SWAP && !do_swap_account)
4154 continue;
4155 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4156 }
4157
4158
4159 {
4160 unsigned long long limit, memsw_limit;
4161 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4162 cb->fill(cb, "hierarchical_memory_limit", limit);
4163 if (do_swap_account)
4164 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4165 }
4166
4167 memset(&mystat, 0, sizeof(mystat));
4168 mem_cgroup_get_total_stat(mem_cont, &mystat);
4169 for (i = 0; i < NR_MCS_STAT; i++) {
4170 if (i == MCS_SWAP && !do_swap_account)
4171 continue;
4172 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4173 }
4174
4175#ifdef CONFIG_DEBUG_VM
4176 {
4177 int nid, zid;
4178 struct mem_cgroup_per_zone *mz;
4179 unsigned long recent_rotated[2] = {0, 0};
4180 unsigned long recent_scanned[2] = {0, 0};
4181
4182 for_each_online_node(nid)
4183 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4184 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4185
4186 recent_rotated[0] +=
4187 mz->reclaim_stat.recent_rotated[0];
4188 recent_rotated[1] +=
4189 mz->reclaim_stat.recent_rotated[1];
4190 recent_scanned[0] +=
4191 mz->reclaim_stat.recent_scanned[0];
4192 recent_scanned[1] +=
4193 mz->reclaim_stat.recent_scanned[1];
4194 }
4195 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4196 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4197 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4198 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4199 }
4200#endif
4201
4202 return 0;
4203}
4204
4205static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4206{
4207 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4208
4209 return mem_cgroup_swappiness(memcg);
4210}
4211
4212static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4213 u64 val)
4214{
4215 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4216 struct mem_cgroup *parent;
4217
4218 if (val > 100)
4219 return -EINVAL;
4220
4221 if (cgrp->parent == NULL)
4222 return -EINVAL;
4223
4224 parent = mem_cgroup_from_cont(cgrp->parent);
4225
4226 cgroup_lock();
4227
4228
4229 if ((parent->use_hierarchy) ||
4230 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4231 cgroup_unlock();
4232 return -EINVAL;
4233 }
4234
4235 memcg->swappiness = val;
4236
4237 cgroup_unlock();
4238
4239 return 0;
4240}
4241
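/*
 * Walk the sorted threshold array around the last known position and
 * signal the eventfd of every threshold that usage has crossed since the
 * previous check, in either direction.  current_threshold is left at the
 * last entry whose threshold is at or below the current usage.
 */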
4242static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4243{
4244 struct mem_cgroup_threshold_ary *t;
4245 u64 usage;
4246 int i;
4247
4248 rcu_read_lock();
4249 if (!swap)
4250 t = rcu_dereference(memcg->thresholds.primary);
4251 else
4252 t = rcu_dereference(memcg->memsw_thresholds.primary);
4253
4254 if (!t)
4255 goto unlock;
4256
4257 usage = mem_cgroup_usage(memcg, swap);
4258
4259
4260
4261
4262
4263
4264 i = t->current_threshold;
4265
4266
4267
4268
4269
4270
4271
4272 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4273 eventfd_signal(t->entries[i].eventfd, 1);
4274
4275
4276 i++;
4277
4278
4279
4280
4281
4282
4283
4284 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4285 eventfd_signal(t->entries[i].eventfd, 1);
4286
4287
4288 t->current_threshold = i - 1;
4289unlock:
4290 rcu_read_unlock();
4291}
4292
4293static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4294{
4295 while (memcg) {
4296 __mem_cgroup_threshold(memcg, false);
4297 if (do_swap_account)
4298 __mem_cgroup_threshold(memcg, true);
4299
4300 memcg = parent_mem_cgroup(memcg);
4301 }
4302}
4303
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	/* thresholds are u64: subtracting and truncating to int can flip the sign */
	if (_a->threshold > _b->threshold)
		return 1;
	if (_a->threshold < _b->threshold)
		return -1;
	return 0;
}
4311
4312static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4313{
4314 struct mem_cgroup_eventfd_list *ev;
4315
4316 list_for_each_entry(ev, &memcg->oom_notify, list)
4317 eventfd_signal(ev->eventfd, 1);
4318 return 0;
4319}
4320
4321static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4322{
4323 struct mem_cgroup *iter;
4324
4325 for_each_mem_cgroup_tree(iter, memcg)
4326 mem_cgroup_oom_notify_cb(iter);
4327}
4328
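/*
 * Register an eventfd to be signalled when memory (or memsw) usage
 * crosses the threshold given in @args.  A new threshold array with the
 * entry inserted in sorted order is published over RCU; the old array is
 * kept as a spare so that unregistering never needs to allocate.
 */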
4329static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4330 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4331{
4332 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4333 struct mem_cgroup_thresholds *thresholds;
4334 struct mem_cgroup_threshold_ary *new;
4335 int type = MEMFILE_TYPE(cft->private);
4336 u64 threshold, usage;
4337 int i, size, ret;
4338
4339 ret = res_counter_memparse_write_strategy(args, &threshold);
4340 if (ret)
4341 return ret;
4342
4343 mutex_lock(&memcg->thresholds_lock);
4344
4345 if (type == _MEM)
4346 thresholds = &memcg->thresholds;
4347 else if (type == _MEMSWAP)
4348 thresholds = &memcg->memsw_thresholds;
4349 else
4350 BUG();
4351
4352 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4353
4354
4355 if (thresholds->primary)
4356 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4357
4358 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4359
4360
4361 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4362 GFP_KERNEL);
4363 if (!new) {
4364 ret = -ENOMEM;
4365 goto unlock;
4366 }
4367 new->size = size;
4368
4369
4370 if (thresholds->primary) {
4371 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4372 sizeof(struct mem_cgroup_threshold));
4373 }
4374
4375
4376 new->entries[size - 1].eventfd = eventfd;
4377 new->entries[size - 1].threshold = threshold;
4378
4379
4380 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4381 compare_thresholds, NULL);
4382
4383
4384 new->current_threshold = -1;
4385 for (i = 0; i < size; i++) {
4386 if (new->entries[i].threshold < usage) {
4387
4388
4389
4390
4391
4392 ++new->current_threshold;
4393 }
4394 }
4395
4396
4397 kfree(thresholds->spare);
4398 thresholds->spare = thresholds->primary;
4399
4400 rcu_assign_pointer(thresholds->primary, new);
4401
4402
4403 synchronize_rcu();
4404
4405unlock:
4406 mutex_unlock(&memcg->thresholds_lock);
4407
4408 return ret;
4409}
4410
4411static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4412 struct cftype *cft, struct eventfd_ctx *eventfd)
4413{
4414 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4415 struct mem_cgroup_thresholds *thresholds;
4416 struct mem_cgroup_threshold_ary *new;
4417 int type = MEMFILE_TYPE(cft->private);
4418 u64 usage;
4419 int i, j, size;
4420
4421 mutex_lock(&memcg->thresholds_lock);
4422 if (type == _MEM)
4423 thresholds = &memcg->thresholds;
4424 else if (type == _MEMSWAP)
4425 thresholds = &memcg->memsw_thresholds;
4426 else
4427 BUG();
4428
4429
4430
4431
4432
4433 BUG_ON(!thresholds);
4434
4435 if (!thresholds->primary)
4436 goto unlock;
4437
4438 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4439
4440
4441 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4442
4443
4444 size = 0;
4445 for (i = 0; i < thresholds->primary->size; i++) {
4446 if (thresholds->primary->entries[i].eventfd != eventfd)
4447 size++;
4448 }
4449
4450 new = thresholds->spare;
4451
4452
4453 if (!size) {
4454 kfree(new);
4455 new = NULL;
4456 goto swap_buffers;
4457 }
4458
4459 new->size = size;
4460
4461
4462 new->current_threshold = -1;
4463 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4464 if (thresholds->primary->entries[i].eventfd == eventfd)
4465 continue;
4466
4467 new->entries[j] = thresholds->primary->entries[i];
4468 if (new->entries[j].threshold < usage) {
4469
4470
4471
4472
4473
4474 ++new->current_threshold;
4475 }
4476 j++;
4477 }
4478
4479swap_buffers:
4480
4481 thresholds->spare = thresholds->primary;
4482 rcu_assign_pointer(thresholds->primary, new);
4483
4484
4485 synchronize_rcu();
4486unlock:
4487 mutex_unlock(&memcg->thresholds_lock);
4488}
4489
4490static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4491 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4492{
4493 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4494 struct mem_cgroup_eventfd_list *event;
4495 int type = MEMFILE_TYPE(cft->private);
4496
4497 BUG_ON(type != _OOM_TYPE);
4498 event = kmalloc(sizeof(*event), GFP_KERNEL);
4499 if (!event)
4500 return -ENOMEM;
4501
4502 spin_lock(&memcg_oom_lock);
4503
4504 event->eventfd = eventfd;
4505 list_add(&event->list, &memcg->oom_notify);
4506
4507
4508 if (atomic_read(&memcg->under_oom))
4509 eventfd_signal(eventfd, 1);
4510 spin_unlock(&memcg_oom_lock);
4511
4512 return 0;
4513}
4514
4515static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4516 struct cftype *cft, struct eventfd_ctx *eventfd)
4517{
4518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4519 struct mem_cgroup_eventfd_list *ev, *tmp;
4520 int type = MEMFILE_TYPE(cft->private);
4521
4522 BUG_ON(type != _OOM_TYPE);
4523
4524 spin_lock(&memcg_oom_lock);
4525
4526 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4527 if (ev->eventfd == eventfd) {
4528 list_del(&ev->list);
4529 kfree(ev);
4530 }
4531 }
4532
4533 spin_unlock(&memcg_oom_lock);
4534}
4535
4536static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4537 struct cftype *cft, struct cgroup_map_cb *cb)
4538{
4539 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4540
4541 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4542
4543 if (atomic_read(&memcg->under_oom))
4544 cb->fill(cb, "under_oom", 1);
4545 else
4546 cb->fill(cb, "under_oom", 0);
4547 return 0;
4548}
4549
4550static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4551 struct cftype *cft, u64 val)
4552{
4553 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4554 struct mem_cgroup *parent;
4555
4556
4557 if (!cgrp->parent || !((val == 0) || (val == 1)))
4558 return -EINVAL;
4559
4560 parent = mem_cgroup_from_cont(cgrp->parent);
4561
4562 cgroup_lock();
4563
4564 if ((parent->use_hierarchy) ||
4565 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4566 cgroup_unlock();
4567 return -EINVAL;
4568 }
4569 memcg->oom_kill_disable = val;
4570 if (!val)
4571 memcg_oom_recover(memcg);
4572 cgroup_unlock();
4573 return 0;
4574}
4575
4576#ifdef CONFIG_NUMA
4577static const struct file_operations mem_control_numa_stat_file_operations = {
4578 .read = seq_read,
4579 .llseek = seq_lseek,
4580 .release = single_release,
4581};
4582
4583static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4584{
4585 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4586
4587 file->f_op = &mem_control_numa_stat_file_operations;
4588 return single_open(file, mem_control_numa_stat_show, cont);
4589}
4590#endif
4591
4592#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4593static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4594{
4595
4596
4597
4598
4599
4600
4601
4602 return mem_cgroup_sockets_init(cont, ss);
4603};
4604
4605static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
4606 struct cgroup *cont)
4607{
4608 mem_cgroup_sockets_destroy(cont, ss);
4609}
4610#else
4611static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4612{
4613 return 0;
4614}
4615
4616static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
4617 struct cgroup *cont)
4618{
4619}
4620#endif
4621
4622static struct cftype mem_cgroup_files[] = {
4623 {
4624 .name = "usage_in_bytes",
4625 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4626 .read_u64 = mem_cgroup_read,
4627 .register_event = mem_cgroup_usage_register_event,
4628 .unregister_event = mem_cgroup_usage_unregister_event,
4629 },
4630 {
4631 .name = "max_usage_in_bytes",
4632 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4633 .trigger = mem_cgroup_reset,
4634 .read_u64 = mem_cgroup_read,
4635 },
4636 {
4637 .name = "limit_in_bytes",
4638 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4639 .write_string = mem_cgroup_write,
4640 .read_u64 = mem_cgroup_read,
4641 },
4642 {
4643 .name = "soft_limit_in_bytes",
4644 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4645 .write_string = mem_cgroup_write,
4646 .read_u64 = mem_cgroup_read,
4647 },
4648 {
4649 .name = "failcnt",
4650 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4651 .trigger = mem_cgroup_reset,
4652 .read_u64 = mem_cgroup_read,
4653 },
4654 {
4655 .name = "stat",
4656 .read_map = mem_control_stat_show,
4657 },
4658 {
4659 .name = "force_empty",
4660 .trigger = mem_cgroup_force_empty_write,
4661 },
4662 {
4663 .name = "use_hierarchy",
4664 .write_u64 = mem_cgroup_hierarchy_write,
4665 .read_u64 = mem_cgroup_hierarchy_read,
4666 },
4667 {
4668 .name = "swappiness",
4669 .read_u64 = mem_cgroup_swappiness_read,
4670 .write_u64 = mem_cgroup_swappiness_write,
4671 },
4672 {
4673 .name = "move_charge_at_immigrate",
4674 .read_u64 = mem_cgroup_move_charge_read,
4675 .write_u64 = mem_cgroup_move_charge_write,
4676 },
4677 {
4678 .name = "oom_control",
4679 .read_map = mem_cgroup_oom_control_read,
4680 .write_u64 = mem_cgroup_oom_control_write,
4681 .register_event = mem_cgroup_oom_register_event,
4682 .unregister_event = mem_cgroup_oom_unregister_event,
4683 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4684 },
4685#ifdef CONFIG_NUMA
4686 {
4687 .name = "numa_stat",
4688 .open = mem_control_numa_stat_open,
4689 .mode = S_IRUGO,
4690 },
4691#endif
4692};
4693
4694#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4695static struct cftype memsw_cgroup_files[] = {
4696 {
4697 .name = "memsw.usage_in_bytes",
4698 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4699 .read_u64 = mem_cgroup_read,
4700 .register_event = mem_cgroup_usage_register_event,
4701 .unregister_event = mem_cgroup_usage_unregister_event,
4702 },
4703 {
4704 .name = "memsw.max_usage_in_bytes",
4705 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4706 .trigger = mem_cgroup_reset,
4707 .read_u64 = mem_cgroup_read,
4708 },
4709 {
4710 .name = "memsw.limit_in_bytes",
4711 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4712 .write_string = mem_cgroup_write,
4713 .read_u64 = mem_cgroup_read,
4714 },
4715 {
4716 .name = "memsw.failcnt",
4717 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4718 .trigger = mem_cgroup_reset,
4719 .read_u64 = mem_cgroup_read,
4720 },
4721};
4722
4723static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4724{
4725 if (!do_swap_account)
4726 return 0;
4727 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4728 ARRAY_SIZE(memsw_cgroup_files));
4729};
4730#else
4731static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4732{
4733 return 0;
4734}
4735#endif
4736
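/*
 * Allocate and initialize the per-node, per-zone bookkeeping of a memcg:
 * empty LRU lists, soft-limit tree node state and a back pointer to the
 * owning memcg.  Falls back to any node when @node has no normal memory.
 */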
4737static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4738{
4739 struct mem_cgroup_per_node *pn;
4740 struct mem_cgroup_per_zone *mz;
4741 enum lru_list l;
4742 int zone, tmp = node;
4743
4744
4745
4746
4747
4748
4749
4750
4751 if (!node_state(node, N_NORMAL_MEMORY))
4752 tmp = -1;
4753 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4754 if (!pn)
4755 return 1;
4756
4757 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4758 mz = &pn->zoneinfo[zone];
4759 for_each_lru(l)
4760 INIT_LIST_HEAD(&mz->lruvec.lists[l]);
4761 mz->usage_in_excess = 0;
4762 mz->on_tree = false;
4763 mz->mem = memcg;
4764 }
4765 memcg->info.nodeinfo[node] = pn;
4766 return 0;
4767}
4768
4769static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4770{
4771 kfree(memcg->info.nodeinfo[node]);
4772}
4773
4774static struct mem_cgroup *mem_cgroup_alloc(void)
4775{
4776 struct mem_cgroup *mem;
4777 int size = sizeof(struct mem_cgroup);
4778
4779
4780 if (size < PAGE_SIZE)
4781 mem = kzalloc(size, GFP_KERNEL);
4782 else
4783 mem = vzalloc(size);
4784
4785 if (!mem)
4786 return NULL;
4787
4788 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4789 if (!mem->stat)
4790 goto out_free;
4791 spin_lock_init(&mem->pcp_counter_lock);
4792 return mem;
4793
4794out_free:
4795 if (size < PAGE_SIZE)
4796 kfree(mem);
4797 else
4798 vfree(mem);
4799 return NULL;
4800}
4801
4802
4803
4804
4805
4806
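/*
 * Helpers for freeing a vzalloc()'d mem_cgroup after an RCU grace period.
 * vfree() must not run from the softirq context of the RCU callback, so
 * the callback only queues a work item and the actual vfree() happens in
 * process context.
 */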
4807static void vfree_work(struct work_struct *work)
4808{
4809 struct mem_cgroup *memcg;
4810
4811 memcg = container_of(work, struct mem_cgroup, work_freeing);
4812 vfree(memcg);
4813}
4814static void vfree_rcu(struct rcu_head *rcu_head)
4815{
4816 struct mem_cgroup *memcg;
4817
4818 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4819 INIT_WORK(&memcg->work_freeing, vfree_work);
4820 schedule_work(&memcg->work_freeing);
4821}
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834static void __mem_cgroup_free(struct mem_cgroup *memcg)
4835{
4836 int node;
4837
4838 mem_cgroup_remove_from_trees(memcg);
4839 free_css_id(&mem_cgroup_subsys, &memcg->css);
4840
4841 for_each_node(node)
4842 free_mem_cgroup_per_zone_info(memcg, node);
4843
4844 free_percpu(memcg->stat);
4845 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4846 kfree_rcu(memcg, rcu_freeing);
4847 else