1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/export.h>
37#include <linux/mutex.h>
38#include <linux/rbtree.h>
39#include <linux/slab.h>
40#include <linux/swap.h>
41#include <linux/swapops.h>
42#include <linux/spinlock.h>
43#include <linux/eventfd.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/seq_file.h>
47#include <linux/vmalloc.h>
48#include <linux/mm_inline.h>
49#include <linux/page_cgroup.h>
50#include <linux/cpu.h>
51#include <linux/oom.h>
52#include "internal.h"
53#include <net/sock.h>
54#include <net/tcp_memcontrol.h>
55
56#include <asm/uaccess.h>
57
58#include <trace/events/vmscan.h>
59
/* Subsystem descriptor object for the memory controller. */
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
/* NOTE(review): a retry bound; its users lie outside the visible part of the file. */
#define MEM_CGROUP_RECLAIM_RETRIES 5
/* The root memcg; mem_cgroup_iter() falls back to it when given a NULL root. */
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

/* Tested by mem_cgroup_margin() to decide whether the memsw counter applies. */
int do_swap_account __read_mostly;


/* Init-time default for swap accounting, selected by the _ENABLED config option. */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
/* Swap accounting is compiled out: the flag becomes the constant 0. */
#define do_swap_account 0
#endif
78
79
80
81
82
/*
 * Indexes into mem_cgroup_stat_cpu.count[].  MEM_CGROUP_STAT_NSTATS is the
 * array length, not a counter of its own.
 */
enum mem_cgroup_stat_index {



	MEM_CGROUP_STAT_CACHE,		/* adjusted for non-anon pages in mem_cgroup_charge_statistics() */
	MEM_CGROUP_STAT_RSS,		/* adjusted for anon pages in mem_cgroup_charge_statistics() */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* same position as "mapped_file" in the names table */
	MEM_CGROUP_STAT_SWAPOUT,	/* adjusted by mem_cgroup_swap_statistics() */
	MEM_CGROUP_STAT_NSTATS,
};
93
/*
 * Printable names, listed in the same order as enum mem_cgroup_stat_index.
 * NOTE(review): presumably indexed by that enum; the users are outside this chunk.
 */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"mapped_file",
	"swap",
};
100
/*
 * Indexes into mem_cgroup_stat_cpu.events[].  MEM_CGROUP_EVENTS_NSTATS is
 * the array length.
 */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* bumped on a positive charge in mem_cgroup_charge_statistics() */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* bumped on a non-positive charge there */
	MEM_CGROUP_EVENTS_PGFAULT,	/* bumped by mem_cgroup_count_vm_event(PGFAULT) */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* bumped by mem_cgroup_count_vm_event(PGMAJFAULT) */
	MEM_CGROUP_EVENTS_NSTATS,
};
108
/*
 * Printable names, listed in the same order as enum mem_cgroup_events_index.
 * NOTE(review): presumably indexed by that enum; the users are outside this chunk.
 */
static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};
115
116
117
118
119
120
121
/*
 * Rate-limited event classes; each value indexes mem_cgroup_stat_cpu.targets[].
 * MEM_CGROUP_NTARGETS is the array length.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
/*
 * Distance, in nr_page_events units, that mem_cgroup_event_ratelimit() adds
 * to the current count to compute the next firing point of each class.
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
131
/* Statistics block; struct mem_cgroup holds one per CPU plus one "nocpu_base" copy. */
struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];		/* indexed by enum mem_cgroup_stat_index */
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];	/* indexed by enum mem_cgroup_events_index */
	unsigned long nr_page_events;			/* running sum of |nr_pages| charged/uncharged */
	unsigned long targets[MEM_CGROUP_NTARGETS];	/* next nr_page_events value at which each target fires */
};
138
/* Shared hierarchy-walk cursor used by mem_cgroup_iter() in reclaim mode. */
struct mem_cgroup_reclaim_iter {
	/* css id stored by the last mem_cgroup_iter() step (0 after a walk wrapped) */
	int position;
	/* incremented by mem_cgroup_iter() each time css_get_next() finds nothing more */
	unsigned int generation;
};
145
146
147
148
/* Per-memcg, per-zone state. */
struct mem_cgroup_per_zone {
	struct lruvec lruvec;
	unsigned long lru_size[NR_LRU_LISTS];	/* maintained by mem_cgroup_update_lru_size() */

	/* one cursor per reclaim priority; indexed by reclaim->priority in mem_cgroup_iter() */
	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node tree_node;		/* link in the soft-limit rb-tree */
	unsigned long long usage_in_excess;	/* sort key used when inserted into that tree */

	bool on_tree;				/* true while tree_node is linked */
	struct mem_cgroup *memcg;		/* owning memcg, read when walking the tree */

};
162
/* Per-node container: one mem_cgroup_per_zone for every possible zone index. */
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
166
/* Table of per-node info pointers, indexed by node id (see mem_cgroup_zoneinfo()). */
struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
170
171
172
173
174
175
/* One soft-limit rb-tree (keyed by usage_in_excess) and the spinlock guarding it. */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};
180
/* Per-node set of soft-limit trees, one per zone index. */
struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};
184
/* Top-level table of per-node soft-limit trees, indexed by node id. */
struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

/* The single global instance looked up by soft_limit_tree_node_zone(). */
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
190
/* One registered threshold: a value paired with an eventfd context. */
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;	/* NOTE(review): presumably signalled when the threshold is crossed */
	u64 threshold;
};
195
196
/* Variable-length array of thresholds; entries[] trails the header. */
struct mem_cgroup_threshold_ary {
	/* NOTE(review): presumably an index into entries[]; its users are outside this chunk */
	int current_threshold;
	/* NOTE(review): presumably the number of elements in entries[] */
	unsigned int size;
	/* zero-length trailing array: storage is allocated together with the header */
	struct mem_cgroup_threshold entries[0];
};
205
/* A pair of threshold arrays. */
struct mem_cgroup_thresholds {
	/* NOTE(review): presumably the array currently in use */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * NOTE(review): presumably a pre-allocated standby array; how it is used
	 * is not visible in this chunk.
	 */
	struct mem_cgroup_threshold_ary *spare;
};
216
217
/* List node carrying an eventfd context (see mem_cgroup.oom_notify list head). */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
222
/* Forward declarations; mem_cgroup_threshold() is called from memcg_check_events(). */
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
225
226
227
228
229
230
231
232
233
234
235
236
/*
 * The memory controller's per-cgroup state.  Field order, the anonymous
 * union and the conditional members determine the object layout.
 */
struct mem_cgroup {
	/* embedded cgroup state; container_of() on it recovers the mem_cgroup */
	struct cgroup_subsys_state css;

	/* resource counter for memory usage */
	struct res_counter res;

	union {
		/* resource counter for memory+swap usage */
		struct res_counter memsw;

		/*
		 * These two share storage with memsw.
		 * NOTE(review): presumably only touched once the group is being
		 * freed and memsw is no longer needed - confirm against the
		 * freeing path, which is outside this chunk.
		 */
		struct rcu_head rcu_freeing;

		struct work_struct work_freeing;
	};

	/* per-node/per-zone information, reached through mem_cgroup_zoneinfo() */
	struct mem_cgroup_lru_info info;
	/* node returned by the previous mem_cgroup_select_victim_node() call */
	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t scan_nodes;		/* rebuilt by mem_cgroup_may_update_nodemask() */
	atomic_t numainfo_events;	/* bumped from memcg_check_events(); zeroed after a rebuild */
	atomic_t numainfo_updating;	/* lets only one caller rebuild scan_nodes at a time */
#endif

	/* when false, tree walks and subtree tests treat this group on its own */
	bool use_hierarchy;

	bool oom_lock;
	atomic_t under_oom;

	atomic_t refcnt;

	/* returned by mem_cgroup_swappiness() for non-root groups */
	int swappiness;

	int oom_kill_disable;

	/* when set, mem_cgroup_reclaim() avoids swap unless asked to shrink */
	bool memsw_is_minimum;

	/* NOTE(review): presumably protects the two threshold sets below */
	struct mutex thresholds_lock;

	/* thresholds for memory usage */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for memory+swap usage */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* list head; NOTE(review): presumably of mem_cgroup_eventfd_list entries */
	struct list_head oom_notify;

	/* bitmask of enum move_type bits, tested by move_anon()/move_file() */
	unsigned long move_charge_at_immigrate;

	/* raised by mem_cgroup_start_move(), dropped by mem_cgroup_end_move() */
	atomic_t moving_account;
	/* taken with interrupts disabled by move_lock_mem_cgroup() */
	spinlock_t move_lock;

	/* per-CPU statistics */
	struct mem_cgroup_stat_cpu __percpu *stat;

	/*
	 * Extra statistics block added to the per-CPU sums by the readers when
	 * CONFIG_HOTPLUG_CPU is set; accessed under pcp_counter_lock.
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

#ifdef CONFIG_INET
	struct tcp_memcontrol tcp_mem;
#endif
};
333
334
335
336
337
338
/* Bit numbers within mem_cgroup.move_charge_at_immigrate. */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* tested by move_anon() */
	MOVE_CHARGE_TYPE_FILE,	/* tested by move_file() */
	NR_MOVE_TYPE,
};
344
345
/* Global state describing the charge migration currently in progress, if any. */
static struct move_charge_struct {
	spinlock_t lock;			/* held while mem_cgroup_under_move() reads from/to */
	struct mem_cgroup *from;		/* source group; NULL means no move is recorded */
	struct mem_cgroup *to;			/* destination group */
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* task performing the move */
	wait_queue_head_t waitq;		/* mem_cgroup_wait_acct_move() sleeps here */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
359
360static bool move_anon(void)
361{
362 return test_bit(MOVE_CHARGE_TYPE_ANON,
363 &mc.to->move_charge_at_immigrate);
364}
365
366static bool move_file(void)
367{
368 return test_bit(MOVE_CHARGE_TYPE_FILE,
369 &mc.to->move_charge_at_immigrate);
370}
371
372
373
374
375
/* Upper bound on iterations of the retry loop in mem_cgroup_reclaim(). */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
/* NOTE(review): presumably the matching bound for soft-limit reclaim; user not fully visible here. */
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
378
/*
 * Kinds of charge/uncharge operations.  The users of these values are
 * outside this chunk; NR_CHARGE_TYPE is the number of kinds.
 */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,
	MEM_CGROUP_CHARGE_TYPE_FORCE,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
	MEM_CGROUP_CHARGE_TYPE_DROP,
	NR_CHARGE_TYPE,
};
388
389
/* Type codes packed into the upper 16 bits of a MEMFILE_PRIVATE() value. */
#define _MEM (0)
#define _MEMSWAP (1)
#define _OOM_TYPE (2)
/* Pack a type (x) into the high half and an attribute (val) into the low half. */
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
/* Extract the 16-bit type / attribute from a packed value. */
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)

/* NOTE(review): presumably the attribute code used together with _OOM_TYPE. */
#define OOM_CONTROL (0)
398
399
400
401
/* Flag bits accepted in the "flags" argument of mem_cgroup_reclaim(). */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)	/* reclaim without swapping */
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)	/* stop once anything was reclaimed */
406
/* Forward declarations of the memcg reference helpers used by the socket code below. */
static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);
409
410
411#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
412#include <net/sock.h>
413#include <net/ip.h>
414
415static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
416void sock_update_memcg(struct sock *sk)
417{
418 if (mem_cgroup_sockets_enabled) {
419 struct mem_cgroup *memcg;
420 struct cg_proto *cg_proto;
421
422 BUG_ON(!sk->sk_prot->proto_cgroup);
423
424
425
426
427
428
429
430
431
432 if (sk->sk_cgrp) {
433 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
434 mem_cgroup_get(sk->sk_cgrp->memcg);
435 return;
436 }
437
438 rcu_read_lock();
439 memcg = mem_cgroup_from_task(current);
440 cg_proto = sk->sk_prot->proto_cgroup(memcg);
441 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
442 mem_cgroup_get(memcg);
443 sk->sk_cgrp = cg_proto;
444 }
445 rcu_read_unlock();
446 }
447}
448EXPORT_SYMBOL(sock_update_memcg);
449
450void sock_release_memcg(struct sock *sk)
451{
452 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
453 struct mem_cgroup *memcg;
454 WARN_ON(!sk->sk_cgrp->memcg);
455 memcg = sk->sk_cgrp->memcg;
456 mem_cgroup_put(memcg);
457 }
458}
459
460#ifdef CONFIG_INET
461struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
462{
463 if (!memcg || mem_cgroup_is_root(memcg))
464 return NULL;
465
466 return &memcg->tcp_mem.cg_proto;
467}
468EXPORT_SYMBOL(tcp_proto_cgroup);
469#endif
470#endif
471
#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
/*
 * Decrement the socket-limit static key if TCP accounting was ever
 * activated for @memcg.
 */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
		static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
/* Socket accounting is not built in: nothing to disarm. */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif
484
485static void drain_all_stock_async(struct mem_cgroup *memcg);
486
487static struct mem_cgroup_per_zone *
488mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
489{
490 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
491}
492
493struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
494{
495 return &memcg->css;
496}
497
/* Look up @memcg's per-zone state for the node and zone that @page belongs to. */
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	return mem_cgroup_zoneinfo(memcg, page_to_nid(page),
				   page_zonenum(page));
}
506
507static struct mem_cgroup_tree_per_zone *
508soft_limit_tree_node_zone(int nid, int zid)
509{
510 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
511}
512
513static struct mem_cgroup_tree_per_zone *
514soft_limit_tree_from_page(struct page *page)
515{
516 int nid = page_to_nid(page);
517 int zid = page_zonenum(page);
518
519 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
520}
521
522static void
523__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
524 struct mem_cgroup_per_zone *mz,
525 struct mem_cgroup_tree_per_zone *mctz,
526 unsigned long long new_usage_in_excess)
527{
528 struct rb_node **p = &mctz->rb_root.rb_node;
529 struct rb_node *parent = NULL;
530 struct mem_cgroup_per_zone *mz_node;
531
532 if (mz->on_tree)
533 return;
534
535 mz->usage_in_excess = new_usage_in_excess;
536 if (!mz->usage_in_excess)
537 return;
538 while (*p) {
539 parent = *p;
540 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
541 tree_node);
542 if (mz->usage_in_excess < mz_node->usage_in_excess)
543 p = &(*p)->rb_left;
544
545
546
547
548 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
549 p = &(*p)->rb_right;
550 }
551 rb_link_node(&mz->tree_node, parent, p);
552 rb_insert_color(&mz->tree_node, &mctz->rb_root);
553 mz->on_tree = true;
554}
555
556static void
557__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
558 struct mem_cgroup_per_zone *mz,
559 struct mem_cgroup_tree_per_zone *mctz)
560{
561 if (!mz->on_tree)
562 return;
563 rb_erase(&mz->tree_node, &mctz->rb_root);
564 mz->on_tree = false;
565}
566
567static void
568mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
569 struct mem_cgroup_per_zone *mz,
570 struct mem_cgroup_tree_per_zone *mctz)
571{
572 spin_lock(&mctz->lock);
573 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
574 spin_unlock(&mctz->lock);
575}
576
577
578static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
579{
580 unsigned long long excess;
581 struct mem_cgroup_per_zone *mz;
582 struct mem_cgroup_tree_per_zone *mctz;
583 int nid = page_to_nid(page);
584 int zid = page_zonenum(page);
585 mctz = soft_limit_tree_from_page(page);
586
587
588
589
590
591 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
592 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
593 excess = res_counter_soft_limit_excess(&memcg->res);
594
595
596
597
598 if (excess || mz->on_tree) {
599 spin_lock(&mctz->lock);
600
601 if (mz->on_tree)
602 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
603
604
605
606
607 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
608 spin_unlock(&mctz->lock);
609 }
610 }
611}
612
613static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
614{
615 int node, zone;
616 struct mem_cgroup_per_zone *mz;
617 struct mem_cgroup_tree_per_zone *mctz;
618
619 for_each_node(node) {
620 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
621 mz = mem_cgroup_zoneinfo(memcg, node, zone);
622 mctz = soft_limit_tree_node_zone(node, zone);
623 mem_cgroup_remove_exceeded(memcg, mz, mctz);
624 }
625 }
626}
627
628static struct mem_cgroup_per_zone *
629__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
630{
631 struct rb_node *rightmost = NULL;
632 struct mem_cgroup_per_zone *mz;
633
634retry:
635 mz = NULL;
636 rightmost = rb_last(&mctz->rb_root);
637 if (!rightmost)
638 goto done;
639
640 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
641
642
643
644
645
646 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
647 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
648 !css_tryget(&mz->memcg->css))
649 goto retry;
650done:
651 return mz;
652}
653
654static struct mem_cgroup_per_zone *
655mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
656{
657 struct mem_cgroup_per_zone *mz;
658
659 spin_lock(&mctz->lock);
660 mz = __mem_cgroup_largest_soft_limit_node(mctz);
661 spin_unlock(&mctz->lock);
662 return mz;
663}
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
685 enum mem_cgroup_stat_index idx)
686{
687 long val = 0;
688 int cpu;
689
690 get_online_cpus();
691 for_each_online_cpu(cpu)
692 val += per_cpu(memcg->stat->count[idx], cpu);
693#ifdef CONFIG_HOTPLUG_CPU
694 spin_lock(&memcg->pcp_counter_lock);
695 val += memcg->nocpu_base.count[idx];
696 spin_unlock(&memcg->pcp_counter_lock);
697#endif
698 put_online_cpus();
699 return val;
700}
701
702static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
703 bool charge)
704{
705 int val = (charge) ? 1 : -1;
706 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
707}
708
709static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
710 enum mem_cgroup_events_index idx)
711{
712 unsigned long val = 0;
713 int cpu;
714
715 for_each_online_cpu(cpu)
716 val += per_cpu(memcg->stat->events[idx], cpu);
717#ifdef CONFIG_HOTPLUG_CPU
718 spin_lock(&memcg->pcp_counter_lock);
719 val += memcg->nocpu_base.events[idx];
720 spin_unlock(&memcg->pcp_counter_lock);
721#endif
722 return val;
723}
724
725static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
726 bool anon, int nr_pages)
727{
728 preempt_disable();
729
730
731
732
733
734 if (anon)
735 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
736 nr_pages);
737 else
738 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
739 nr_pages);
740
741
742 if (nr_pages > 0)
743 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
744 else {
745 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
746 nr_pages = -nr_pages;
747 }
748
749 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
750
751 preempt_enable();
752}
753
754unsigned long
755mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
756{
757 struct mem_cgroup_per_zone *mz;
758
759 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
760 return mz->lru_size[lru];
761}
762
763static unsigned long
764mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
765 unsigned int lru_mask)
766{
767 struct mem_cgroup_per_zone *mz;
768 enum lru_list lru;
769 unsigned long ret = 0;
770
771 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
772
773 for_each_lru(lru) {
774 if (BIT(lru) & lru_mask)
775 ret += mz->lru_size[lru];
776 }
777 return ret;
778}
779
780static unsigned long
781mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
782 int nid, unsigned int lru_mask)
783{
784 u64 total = 0;
785 int zid;
786
787 for (zid = 0; zid < MAX_NR_ZONES; zid++)
788 total += mem_cgroup_zone_nr_lru_pages(memcg,
789 nid, zid, lru_mask);
790
791 return total;
792}
793
794static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
795 unsigned int lru_mask)
796{
797 int nid;
798 u64 total = 0;
799
800 for_each_node_state(nid, N_HIGH_MEMORY)
801 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
802 return total;
803}
804
805static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
806 enum mem_cgroup_events_target target)
807{
808 unsigned long val, next;
809
810 val = __this_cpu_read(memcg->stat->nr_page_events);
811 next = __this_cpu_read(memcg->stat->targets[target]);
812
813 if ((long)next - (long)val < 0) {
814 switch (target) {
815 case MEM_CGROUP_TARGET_THRESH:
816 next = val + THRESHOLDS_EVENTS_TARGET;
817 break;
818 case MEM_CGROUP_TARGET_SOFTLIMIT:
819 next = val + SOFTLIMIT_EVENTS_TARGET;
820 break;
821 case MEM_CGROUP_TARGET_NUMAINFO:
822 next = val + NUMAINFO_EVENTS_TARGET;
823 break;
824 default:
825 break;
826 }
827 __this_cpu_write(memcg->stat->targets[target], next);
828 return true;
829 }
830 return false;
831}
832
833
834
835
836
837static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
838{
839 preempt_disable();
840
841 if (unlikely(mem_cgroup_event_ratelimit(memcg,
842 MEM_CGROUP_TARGET_THRESH))) {
843 bool do_softlimit;
844 bool do_numainfo __maybe_unused;
845
846 do_softlimit = mem_cgroup_event_ratelimit(memcg,
847 MEM_CGROUP_TARGET_SOFTLIMIT);
848#if MAX_NUMNODES > 1
849 do_numainfo = mem_cgroup_event_ratelimit(memcg,
850 MEM_CGROUP_TARGET_NUMAINFO);
851#endif
852 preempt_enable();
853
854 mem_cgroup_threshold(memcg);
855 if (unlikely(do_softlimit))
856 mem_cgroup_update_tree(memcg, page);
857#if MAX_NUMNODES > 1
858 if (unlikely(do_numainfo))
859 atomic_inc(&memcg->numainfo_events);
860#endif
861 } else
862 preempt_enable();
863}
864
865struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
866{
867 return container_of(cgroup_subsys_state(cont,
868 mem_cgroup_subsys_id), struct mem_cgroup,
869 css);
870}
871
872struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
873{
874
875
876
877
878
879 if (unlikely(!p))
880 return NULL;
881
882 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
883 struct mem_cgroup, css);
884}
885
886struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
887{
888 struct mem_cgroup *memcg = NULL;
889
890 if (!mm)
891 return NULL;
892
893
894
895
896
897 rcu_read_lock();
898 do {
899 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
900 if (unlikely(!memcg))
901 break;
902 } while (!css_tryget(&memcg->css));
903 rcu_read_unlock();
904 return memcg;
905}
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
/*
 * mem_cgroup_iter - advance a walk over the memcg hierarchy below @root
 * @root: hierarchy root; NULL selects root_mem_cgroup
 * @prev: value returned by the previous call, or NULL to start a walk
 * @reclaim: when non-NULL, use the shared per-zone, per-priority cursor
 *           identified by the cookie instead of continuing after @prev
 *
 * Returns the next group, or NULL when the walk is over or the controller
 * is disabled.  Every returned group other than @root carries a css
 * reference; that reference is dropped when the group is passed back in as
 * @prev, so a caller abandoning the walk early must release it another way
 * (mem_cgroup_iter_break() does exactly that).
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup *memcg = NULL;
	int id = 0;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	/* Plain walk: resume right after the previously returned group. */
	if (prev && !reclaim)
		id = css_id(&prev->css);

	/* Release the reference handed out with @prev (root never had one). */
	if (prev && prev != root)
		css_put(&prev->css);

	/* Non-hierarchical, non-root group: the walk consists of @root alone. */
	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			return NULL;
		return root;
	}

	while (!memcg) {
		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
		struct cgroup_subsys_state *css;

		if (reclaim) {
			int nid = zone_to_nid(reclaim->zone);
			int zid = zone_idx(reclaim->zone);
			struct mem_cgroup_per_zone *mz;

			mz = mem_cgroup_zoneinfo(root, nid, zid);
			iter = &mz->reclaim_iter[reclaim->priority];
			/*
			 * The shared cursor completed a round since this
			 * caller started: end the caller's walk.
			 */
			if (prev && reclaim->generation != iter->generation)
				return NULL;
			id = iter->position;
		}

		rcu_read_lock();
		/*
		 * NOTE(review): css_get_next() presumably returns the next css
		 * under @root with an id >= id + 1 and stores the id found in
		 * "id"; it is defined outside this file.
		 */
		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
		if (css) {
			/*
			 * If css_tryget() fails, memcg stays NULL and the
			 * loop tries again starting from the updated id.
			 */
			if (css == &root->css || css_tryget(css))
				memcg = container_of(css,
						     struct mem_cgroup, css);
		} else
			id = 0;
		rcu_read_unlock();

		if (reclaim) {
			iter->position = id;
			if (!css)
				iter->generation++;
			else if (!prev && memcg)
				/* first step of a walk: note the current round */
				reclaim->generation = iter->generation;
		}

		/* A continued walk ends when nothing more was found. */
		if (prev && !css)
			return NULL;
	}
	return memcg;
}
988
989
990
991
992
993
994void mem_cgroup_iter_break(struct mem_cgroup *root,
995 struct mem_cgroup *prev)
996{
997 if (!root)
998 root = root_mem_cgroup;
999 if (prev && prev != root)
1000 css_put(&prev->css);
1001}
1002
1003
1004
1005
1006
1007
/*
 * Visit every group in the hierarchy below @root (including @root) via
 * mem_cgroup_iter().  Leaving the loop before it finishes requires
 * mem_cgroup_iter_break() to release the reference held on @iter.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

/* Same walk, starting from root_mem_cgroup (a NULL root selects it). */
#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
1017
1018static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1019{
1020 return (memcg == root_mem_cgroup);
1021}
1022
1023void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1024{
1025 struct mem_cgroup *memcg;
1026
1027 if (!mm)
1028 return;
1029
1030 rcu_read_lock();
1031 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1032 if (unlikely(!memcg))
1033 goto out;
1034
1035 switch (idx) {
1036 case PGFAULT:
1037 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1038 break;
1039 case PGMAJFAULT:
1040 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1041 break;
1042 default:
1043 BUG();
1044 }
1045out:
1046 rcu_read_unlock();
1047}
1048EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1060 struct mem_cgroup *memcg)
1061{
1062 struct mem_cgroup_per_zone *mz;
1063
1064 if (mem_cgroup_disabled())
1065 return &zone->lruvec;
1066
1067 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1068 return &mz->lruvec;
1069}
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1091{
1092 struct mem_cgroup_per_zone *mz;
1093 struct mem_cgroup *memcg;
1094 struct page_cgroup *pc;
1095
1096 if (mem_cgroup_disabled())
1097 return &zone->lruvec;
1098
1099 pc = lookup_page_cgroup(page);
1100 memcg = pc->mem_cgroup;
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1112 pc->mem_cgroup = memcg = root_mem_cgroup;
1113
1114 mz = page_cgroup_zoneinfo(memcg, page);
1115 return &mz->lruvec;
1116}
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1128 int nr_pages)
1129{
1130 struct mem_cgroup_per_zone *mz;
1131 unsigned long *lru_size;
1132
1133 if (mem_cgroup_disabled())
1134 return;
1135
1136 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1137 lru_size = mz->lru_size + lru;
1138 *lru_size += nr_pages;
1139 VM_BUG_ON((long)(*lru_size) < 0);
1140}
1141
1142
1143
1144
1145
1146bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1147 struct mem_cgroup *memcg)
1148{
1149 if (root_memcg == memcg)
1150 return true;
1151 if (!root_memcg->use_hierarchy || !memcg)
1152 return false;
1153 return css_is_ancestor(&memcg->css, &root_memcg->css);
1154}
1155
1156static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1157 struct mem_cgroup *memcg)
1158{
1159 bool ret;
1160
1161 rcu_read_lock();
1162 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1163 rcu_read_unlock();
1164 return ret;
1165}
1166
1167int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1168{
1169 int ret;
1170 struct mem_cgroup *curr = NULL;
1171 struct task_struct *p;
1172
1173 p = find_lock_task_mm(task);
1174 if (p) {
1175 curr = try_get_mem_cgroup_from_mm(p->mm);
1176 task_unlock(p);
1177 } else {
1178
1179
1180
1181
1182
1183 task_lock(task);
1184 curr = mem_cgroup_from_task(task);
1185 if (curr)
1186 css_get(&curr->css);
1187 task_unlock(task);
1188 }
1189 if (!curr)
1190 return 0;
1191
1192
1193
1194
1195
1196
1197 ret = mem_cgroup_same_or_subtree(memcg, curr);
1198 css_put(&curr->css);
1199 return ret;
1200}
1201
1202int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1203{
1204 unsigned long inactive_ratio;
1205 unsigned long inactive;
1206 unsigned long active;
1207 unsigned long gb;
1208
1209 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1210 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1211
1212 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1213 if (gb)
1214 inactive_ratio = int_sqrt(10 * gb);
1215 else
1216 inactive_ratio = 1;
1217
1218 return inactive * inactive_ratio < active;
1219}
1220
1221int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1222{
1223 unsigned long active;
1224 unsigned long inactive;
1225
1226 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1227 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1228
1229 return (active > inactive);
1230}
1231
/* Recover the mem_cgroup that embeds @counter as its field @member (res or memsw). */
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)
1234
1235
1236
1237
1238
1239
1240
1241
1242static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1243{
1244 unsigned long long margin;
1245
1246 margin = res_counter_margin(&memcg->res);
1247 if (do_swap_account)
1248 margin = min(margin, res_counter_margin(&memcg->memsw));
1249 return margin >> PAGE_SHIFT;
1250}
1251
1252int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1253{
1254 struct cgroup *cgrp = memcg->css.cgroup;
1255
1256
1257 if (cgrp->parent == NULL)
1258 return vm_swappiness;
1259
1260 return memcg->swappiness;
1261}
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279atomic_t memcg_moving __read_mostly;
1280
1281static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1282{
1283 atomic_inc(&memcg_moving);
1284 atomic_inc(&memcg->moving_account);
1285 synchronize_rcu();
1286}
1287
1288static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1289{
1290
1291
1292
1293
1294 if (memcg) {
1295 atomic_dec(&memcg_moving);
1296 atomic_dec(&memcg->moving_account);
1297 }
1298}
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1313{
1314 VM_BUG_ON(!rcu_read_lock_held());
1315 return atomic_read(&memcg->moving_account) > 0;
1316}
1317
1318static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1319{
1320 struct mem_cgroup *from;
1321 struct mem_cgroup *to;
1322 bool ret = false;
1323
1324
1325
1326
1327 spin_lock(&mc.lock);
1328 from = mc.from;
1329 to = mc.to;
1330 if (!from)
1331 goto unlock;
1332
1333 ret = mem_cgroup_same_or_subtree(memcg, from)
1334 || mem_cgroup_same_or_subtree(memcg, to);
1335unlock:
1336 spin_unlock(&mc.lock);
1337 return ret;
1338}
1339
1340static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1341{
1342 if (mc.moving_task && current != mc.moving_task) {
1343 if (mem_cgroup_under_move(memcg)) {
1344 DEFINE_WAIT(wait);
1345 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1346
1347 if (mc.moving_task)
1348 schedule();
1349 finish_wait(&mc.waitq, &wait);
1350 return true;
1351 }
1352 }
1353 return false;
1354}
1355
1356
1357
1358
1359
1360
1361
1362static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1363 unsigned long *flags)
1364{
1365 spin_lock_irqsave(&memcg->move_lock, *flags);
1366}
1367
1368static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1369 unsigned long *flags)
1370{
1371 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1372}
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1383{
1384 struct cgroup *task_cgrp;
1385 struct cgroup *mem_cgrp;
1386
1387
1388
1389
1390
1391 static char memcg_name[PATH_MAX];
1392 int ret;
1393
1394 if (!memcg || !p)
1395 return;
1396
1397 rcu_read_lock();
1398
1399 mem_cgrp = memcg->css.cgroup;
1400 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1401
1402 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1403 if (ret < 0) {
1404
1405
1406
1407
1408 rcu_read_unlock();
1409 goto done;
1410 }
1411 rcu_read_unlock();
1412
1413 printk(KERN_INFO "Task in %s killed", memcg_name);
1414
1415 rcu_read_lock();
1416 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1417 if (ret < 0) {
1418 rcu_read_unlock();
1419 goto done;
1420 }
1421 rcu_read_unlock();
1422
1423
1424
1425
1426 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1427done:
1428
1429 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1430 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1431 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1432 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1433 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1434 "failcnt %llu\n",
1435 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1436 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1437 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1438}
1439
1440
1441
1442
1443
1444static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1445{
1446 int num = 0;
1447 struct mem_cgroup *iter;
1448
1449 for_each_mem_cgroup_tree(iter, memcg)
1450 num++;
1451 return num;
1452}
1453
1454
1455
1456
1457u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{
1459 u64 limit;
1460 u64 memsw;
1461
1462 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1463 limit += total_swap_pages << PAGE_SHIFT;
1464
1465 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1466
1467
1468
1469
1470 return min(limit, memsw);
1471}
1472
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask,
1475 unsigned long flags)
1476{
1477 unsigned long total = 0;
1478 bool noswap = false;
1479 int loop;
1480
1481 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1482 noswap = true;
1483 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1484 noswap = true;
1485
1486 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1487 if (loop)
1488 drain_all_stock_async(memcg);
1489 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1490
1491
1492
1493
1494
1495 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1496 break;
1497 if (mem_cgroup_margin(memcg))
1498 break;
1499
1500
1501
1502
1503 if (loop && !total)
1504 break;
1505 }
1506 return total;
1507}
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1520 int nid, bool noswap)
1521{
1522 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1523 return true;
1524 if (noswap || !total_swap_pages)
1525 return false;
1526 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1527 return true;
1528 return false;
1529
1530}
1531#if MAX_NUMNODES > 1
1532
1533
1534
1535
1536
1537
1538
1539static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1540{
1541 int nid;
1542
1543
1544
1545
1546 if (!atomic_read(&memcg->numainfo_events))
1547 return;
1548 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1549 return;
1550
1551
1552 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1553
1554 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1555
1556 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1557 node_clear(nid, memcg->scan_nodes);
1558 }
1559
1560 atomic_set(&memcg->numainfo_events, 0);
1561 atomic_set(&memcg->numainfo_updating, 0);
1562}
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1577{
1578 int node;
1579
1580 mem_cgroup_may_update_nodemask(memcg);
1581 node = memcg->last_scanned_node;
1582
1583 node = next_node(node, memcg->scan_nodes);
1584 if (node == MAX_NUMNODES)
1585 node = first_node(memcg->scan_nodes);
1586
1587
1588
1589
1590
1591
1592 if (unlikely(node == MAX_NUMNODES))
1593 node = numa_node_id();
1594
1595 memcg->last_scanned_node = node;
1596 return node;
1597}
1598
1599
1600
1601
1602
1603
1604
1605static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1606{
1607 int nid;
1608
1609
1610
1611
1612
1613 if (!nodes_empty(memcg->scan_nodes)) {
1614 for (nid = first_node(memcg->scan_nodes);
1615 nid < MAX_NUMNODES;
1616 nid = next_node(nid, memcg->scan_nodes)) {
1617
1618 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1619 return true;
1620 }
1621 }
1622
1623
1624
1625 for_each_node_state(nid, N_HIGH_MEMORY) {
1626 if (node_isset(nid, memcg->scan_nodes))
1627 continue;
1628 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1629 return true;
1630 }
1631 return false;
1632}
1633
1634#else
/* Single-node build (MAX_NUMNODES <= 1): node 0 is the only candidate. */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
1639
1640static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1641{
1642 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1643}
1644#endif
1645
1646static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1647 struct zone *zone,
1648 gfp_t gfp_mask,
1649 unsigned long *total_scanned)
1650{
1651 struct mem_cgroup *victim = NULL;
1652 int total = 0;
1653 int loop = 0;
1654 unsigned long excess;
1655 unsigned long nr_scanned;
1656 struct mem_cgroup_reclaim_cookie reclaim = {
1657 .zone = zone,
1658 .priority = 0,
1659 };
1660
1661 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1662
1663 while (1) {
1664 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1665 if (!victim) {
1666 loop++;
1667 if (loop >= 2) {
1668
1669
1670
1671
1672
1673 if (!total)
1674 break;
1675
1676
1677
1678
1679
1680
1681 if (total >= (excess >> 2) ||
1682 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1683 break;
1684 }
1685 continue;
1686 }
1687 if (!mem_cgroup_reclaimable(victim, false))
1688 continue;
1689 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1690 zone, &nr_scanned);
1691 *total_scanned += nr_scanned;
1692 if (!res_counter_soft_limit_excess(&root_memcg->res))
1693 break;
1694 }
1695 mem_cgroup_iter_break(root_memcg, victim);
1696 return total;
1697}
1698
1699
1700
1701
1702
1703
1704static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1705{
1706 struct mem_cgroup *iter, *failed = NULL;
1707
1708 for_each_mem_cgroup_tree(iter, memcg) {
1709 if (iter->oom_lock) {
1710
1711
1712
1713
1714 failed = iter;
1715 mem_cgroup_iter_break(memcg, iter);
1716 break;
1717 } else
1718 iter->oom_lock = true;
1719 }
1720
1721 if (!failed)
1722 return true;
1723
1724
1725
1726
1727
1728 for_each_mem_cgroup_tree(iter, memcg) {
1729 if (iter == failed) {
1730 mem_cgroup_iter_break(memcg, iter);
1731 break;
1732 }
1733 iter->oom_lock = false;
1734 }
1735 return false;
1736}
1737
1738
1739
1740
1741static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1742{
1743 struct mem_cgroup *iter;
1744
1745 for_each_mem_cgroup_tree(iter, memcg)
1746 iter->oom_lock = false;
1747 return 0;
1748}
1749
1750static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1751{
1752 struct mem_cgroup *iter;
1753
1754 for_each_mem_cgroup_tree(iter, memcg)
1755 atomic_inc(&iter->under_oom);
1756}
1757
1758static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1759{
1760 struct mem_cgroup *iter;
1761
1762
1763
1764
1765
1766
1767 for_each_mem_cgroup_tree(iter, memcg)
1768 atomic_add_unless(&iter->under_oom, -1, 0);
1769}
1770
1771static DEFINE_SPINLOCK(memcg_oom_lock);
1772static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1773
1774struct oom_wait_info {
1775 struct mem_cgroup *memcg;
1776 wait_queue_t wait;
1777};
1778
1779static int memcg_oom_wake_function(wait_queue_t *wait,
1780 unsigned mode, int sync, void *arg)
1781{
1782 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1783 struct mem_cgroup *oom_wait_memcg;
1784 struct oom_wait_info *oom_wait_info;
1785
1786 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1787 oom_wait_memcg = oom_wait_info->memcg;
1788
1789
1790
1791
1792
1793 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1794 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1795 return 0;
1796 return autoremove_wake_function(wait, mode, sync, arg);
1797}
1798
1799static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1800{
1801
1802 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1803}
1804
1805static void memcg_oom_recover(struct mem_cgroup *memcg)
1806{
1807 if (memcg && atomic_read(&memcg->under_oom))
1808 memcg_wakeup_oom(memcg);
1809}
1810
1811
1812
1813
1814static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1815 int order)
1816{
1817 struct oom_wait_info owait;
1818 bool locked, need_to_kill;
1819
1820 owait.memcg = memcg;
1821 owait.wait.flags = 0;
1822 owait.wait.func = memcg_oom_wake_function;
1823 owait.wait.private = current;
1824 INIT_LIST_HEAD(&owait.wait.task_list);
1825 need_to_kill = true;
1826 mem_cgroup_mark_under_oom(memcg);
1827
1828
1829 spin_lock(&memcg_oom_lock);
1830 locked = mem_cgroup_oom_lock(memcg);
1831
1832
1833
1834
1835
1836 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1837 if (!locked || memcg->oom_kill_disable)
1838 need_to_kill = false;
1839 if (locked)
1840 mem_cgroup_oom_notify(memcg);
1841 spin_unlock(&memcg_oom_lock);
1842
1843 if (need_to_kill) {
1844 finish_wait(&memcg_oom_waitq, &owait.wait);
1845 mem_cgroup_out_of_memory(memcg, mask, order);
1846 } else {
1847 schedule();
1848 finish_wait(&memcg_oom_waitq, &owait.wait);
1849 }
1850 spin_lock(&memcg_oom_lock);
1851 if (locked)
1852 mem_cgroup_oom_unlock(memcg);
1853 memcg_wakeup_oom(memcg);
1854 spin_unlock(&memcg_oom_lock);
1855
1856 mem_cgroup_unmark_under_oom(memcg);
1857
1858 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1859 return false;
1860
1861 schedule_timeout_uninterruptible(1);
1862 return true;
1863}
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889void __mem_cgroup_begin_update_page_stat(struct page *page,
1890 bool *locked, unsigned long *flags)
1891{
1892 struct mem_cgroup *memcg;
1893 struct page_cgroup *pc;
1894
1895 pc = lookup_page_cgroup(page);
1896again:
1897 memcg = pc->mem_cgroup;
1898 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1899 return;
1900
1901
1902
1903
1904
1905
1906 if (!mem_cgroup_stolen(memcg))
1907 return;
1908
1909 move_lock_mem_cgroup(memcg, flags);
1910 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1911 move_unlock_mem_cgroup(memcg, flags);
1912 goto again;
1913 }
1914 *locked = true;
1915}
1916
1917void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1918{
1919 struct page_cgroup *pc = lookup_page_cgroup(page);
1920
1921
1922
1923
1924
1925
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927}
1928
1929void mem_cgroup_update_page_stat(struct page *page,
1930 enum mem_cgroup_page_stat_item idx, int val)
1931{
1932 struct mem_cgroup *memcg;
1933 struct page_cgroup *pc = lookup_page_cgroup(page);
1934 unsigned long uninitialized_var(flags);
1935
1936 if (mem_cgroup_disabled())
1937 return;
1938
1939 memcg = pc->mem_cgroup;
1940 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1941 return;
1942
1943 switch (idx) {
1944 case MEMCG_NR_FILE_MAPPED:
1945 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1946 break;
1947 default:
1948 BUG();
1949 }
1950
1951 this_cpu_add(memcg->stat->count[idx], val);
1952}
1953
1954
1955
1956
1957
1958#define CHARGE_BATCH 32U
1959struct memcg_stock_pcp {
1960 struct mem_cgroup *cached;
1961 unsigned int nr_pages;
1962 struct work_struct work;
1963 unsigned long flags;
1964#define FLUSHING_CACHED_CHARGE 0
1965};
1966static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1967static DEFINE_MUTEX(percpu_charge_mutex);
1968
1969
1970
1971
1972
1973
1974
1975static bool consume_stock(struct mem_cgroup *memcg)
1976{
1977 struct memcg_stock_pcp *stock;
1978 bool ret = true;
1979
1980 stock = &get_cpu_var(memcg_stock);
1981 if (memcg == stock->cached && stock->nr_pages)
1982 stock->nr_pages--;
1983 else
1984 ret = false;
1985 put_cpu_var(memcg_stock);
1986 return ret;
1987}
1988
1989
1990
1991
1992static void drain_stock(struct memcg_stock_pcp *stock)
1993{
1994 struct mem_cgroup *old = stock->cached;
1995
1996 if (stock->nr_pages) {
1997 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
1998
1999 res_counter_uncharge(&old->res, bytes);
2000 if (do_swap_account)
2001 res_counter_uncharge(&old->memsw, bytes);
2002 stock->nr_pages = 0;
2003 }
2004 stock->cached = NULL;
2005}
2006
2007
2008
2009
2010
2011static void drain_local_stock(struct work_struct *dummy)
2012{
2013 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2014 drain_stock(stock);
2015 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2016}
2017
2018
2019
2020
2021
2022static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2023{
2024 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2025
2026 if (stock->cached != memcg) {
2027 drain_stock(stock);
2028 stock->cached = memcg;
2029 }
2030 stock->nr_pages += nr_pages;
2031 put_cpu_var(memcg_stock);
2032}
2033
2034
2035
2036
2037
2038
2039static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2040{
2041 int cpu, curcpu;
2042
2043
2044 get_online_cpus();
2045 curcpu = get_cpu();
2046 for_each_online_cpu(cpu) {
2047 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2048 struct mem_cgroup *memcg;
2049
2050 memcg = stock->cached;
2051 if (!memcg || !stock->nr_pages)
2052 continue;
2053 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2054 continue;
2055 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2056 if (cpu == curcpu)
2057 drain_local_stock(&stock->work);
2058 else
2059 schedule_work_on(cpu, &stock->work);
2060 }
2061 }
2062 put_cpu();
2063
2064 if (!sync)
2065 goto out;
2066
2067 for_each_online_cpu(cpu) {
2068 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2069 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2070 flush_work(&stock->work);
2071 }
2072out:
2073 put_online_cpus();
2074}
2075
2076
2077
2078
2079
2080
2081
2082static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2083{
2084
2085
2086
2087 if (!mutex_trylock(&percpu_charge_mutex))
2088 return;
2089 drain_all_stock(root_memcg, false);
2090 mutex_unlock(&percpu_charge_mutex);
2091}
2092
2093
2094static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2095{
2096
2097 mutex_lock(&percpu_charge_mutex);
2098 drain_all_stock(root_memcg, true);
2099 mutex_unlock(&percpu_charge_mutex);
2100}
2101
2102
2103
2104
2105
2106static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2107{
2108 int i;
2109
2110 spin_lock(&memcg->pcp_counter_lock);
2111 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2112 long x = per_cpu(memcg->stat->count[i], cpu);
2113
2114 per_cpu(memcg->stat->count[i], cpu) = 0;
2115 memcg->nocpu_base.count[i] += x;
2116 }
2117 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2118 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2119
2120 per_cpu(memcg->stat->events[i], cpu) = 0;
2121 memcg->nocpu_base.events[i] += x;
2122 }
2123 spin_unlock(&memcg->pcp_counter_lock);
2124}
2125
2126static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2127 unsigned long action,
2128 void *hcpu)
2129{
2130 int cpu = (unsigned long)hcpu;
2131 struct memcg_stock_pcp *stock;
2132 struct mem_cgroup *iter;
2133
2134 if (action == CPU_ONLINE)
2135 return NOTIFY_OK;
2136
2137 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2138 return NOTIFY_OK;
2139
2140 for_each_mem_cgroup(iter)
2141 mem_cgroup_drain_pcp_counter(iter, cpu);
2142
2143 stock = &per_cpu(memcg_stock, cpu);
2144 drain_stock(stock);
2145 return NOTIFY_OK;
2146}
2147
2148
2149
/* Results of mem_cgroup_do_charge(), consumed by __mem_cgroup_try_charge(). */
enum {
	CHARGE_OK,		/* both counters were charged */
	CHARGE_RETRY,		/* not charged; the caller should try again */
	CHARGE_NOMEM,		/* still over limit, OOM handling not requested */
	CHARGE_WOULDBLOCK,	/* over limit and gfp_mask lacks __GFP_WAIT */
	CHARGE_OOM_DIE,		/* mem_cgroup_handle_oom() returned false */
};
2157
2158static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2159 unsigned int nr_pages, bool oom_check)
2160{
2161 unsigned long csize = nr_pages * PAGE_SIZE;
2162 struct mem_cgroup *mem_over_limit;
2163 struct res_counter *fail_res;
2164 unsigned long flags = 0;
2165 int ret;
2166
2167 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2168
2169 if (likely(!ret)) {
2170 if (!do_swap_account)
2171 return CHARGE_OK;
2172 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2173 if (likely(!ret))
2174 return CHARGE_OK;
2175
2176 res_counter_uncharge(&memcg->res, csize);
2177 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2178 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2179 } else
2180 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2181
2182
2183
2184
2185
2186
2187
2188 if (nr_pages == CHARGE_BATCH)
2189 return CHARGE_RETRY;
2190
2191 if (!(gfp_mask & __GFP_WAIT))
2192 return CHARGE_WOULDBLOCK;
2193
2194 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2195 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2196 return CHARGE_RETRY;
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206 if (nr_pages == 1 && ret)
2207 return CHARGE_RETRY;
2208
2209
2210
2211
2212
2213 if (mem_cgroup_wait_acct_move(mem_over_limit))
2214 return CHARGE_RETRY;
2215
2216
2217 if (!oom_check)
2218 return CHARGE_NOMEM;
2219
2220 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2221 return CHARGE_OOM_DIE;
2222
2223 return CHARGE_RETRY;
2224}
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247static int __mem_cgroup_try_charge(struct mm_struct *mm,
2248 gfp_t gfp_mask,
2249 unsigned int nr_pages,
2250 struct mem_cgroup **ptr,
2251 bool oom)
2252{
2253 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2254 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2255 struct mem_cgroup *memcg = NULL;
2256 int ret;
2257
2258
2259
2260
2261
2262
2263 if (unlikely(test_thread_flag(TIF_MEMDIE)
2264 || fatal_signal_pending(current)))
2265 goto bypass;
2266
2267
2268
2269
2270
2271
2272
2273 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup;
2275again:
2276 if (*ptr) {
2277 memcg = *ptr;
2278 VM_BUG_ON(css_is_removed(&memcg->css));
2279 if (mem_cgroup_is_root(memcg))
2280 goto done;
2281 if (nr_pages == 1 && consume_stock(memcg))
2282 goto done;
2283 css_get(&memcg->css);
2284 } else {
2285 struct task_struct *p;
2286
2287 rcu_read_lock();
2288 p = rcu_dereference(mm->owner);
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299 memcg = mem_cgroup_from_task(p);
2300 if (!memcg)
2301 memcg = root_mem_cgroup;
2302 if (mem_cgroup_is_root(memcg)) {
2303 rcu_read_unlock();
2304 goto done;
2305 }
2306 if (nr_pages == 1 && consume_stock(memcg)) {
2307
2308
2309
2310
2311
2312
2313
2314
2315 rcu_read_unlock();
2316 goto done;
2317 }
2318
2319 if (!css_tryget(&memcg->css)) {
2320 rcu_read_unlock();
2321 goto again;
2322 }
2323 rcu_read_unlock();
2324 }
2325
2326 do {
2327 bool oom_check;
2328
2329
2330 if (fatal_signal_pending(current)) {
2331 css_put(&memcg->css);
2332 goto bypass;
2333 }
2334
2335 oom_check = false;
2336 if (oom && !nr_oom_retries) {
2337 oom_check = true;
2338 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2339 }
2340
2341 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2342 switch (ret) {
2343 case CHARGE_OK:
2344 break;
2345 case CHARGE_RETRY:
2346 batch = nr_pages;
2347 css_put(&memcg->css);
2348 memcg = NULL;
2349 goto again;
2350 case CHARGE_WOULDBLOCK:
2351 css_put(&memcg->css);
2352 goto nomem;
2353 case CHARGE_NOMEM:
2354 if (!oom) {
2355 css_put(&memcg->css);
2356 goto nomem;
2357 }
2358
2359 nr_oom_retries--;
2360 break;
2361 case CHARGE_OOM_DIE:
2362 css_put(&memcg->css);
2363 goto bypass;
2364 }
2365 } while (ret != CHARGE_OK);
2366
2367 if (batch > nr_pages)
2368 refill_stock(memcg, batch - nr_pages);
2369 css_put(&memcg->css);
2370done:
2371 *ptr = memcg;
2372 return 0;
2373nomem:
2374 *ptr = NULL;
2375 return -ENOMEM;
2376bypass:
2377 *ptr = root_mem_cgroup;
2378 return -EINTR;
2379}
2380
2381
2382
2383
2384
2385
2386static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2387 unsigned int nr_pages)
2388{
2389 if (!mem_cgroup_is_root(memcg)) {
2390 unsigned long bytes = nr_pages * PAGE_SIZE;
2391
2392 res_counter_uncharge(&memcg->res, bytes);
2393 if (do_swap_account)
2394 res_counter_uncharge(&memcg->memsw, bytes);
2395 }
2396}
2397
2398
2399
2400
2401
2402static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2403 unsigned int nr_pages)
2404{
2405 unsigned long bytes = nr_pages * PAGE_SIZE;
2406
2407 if (mem_cgroup_is_root(memcg))
2408 return;
2409
2410 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2411 if (do_swap_account)
2412 res_counter_uncharge_until(&memcg->memsw,
2413 memcg->memsw.parent, bytes);
2414}
2415
2416
2417
2418
2419
2420
2421
2422static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2423{
2424 struct cgroup_subsys_state *css;
2425
2426
2427 if (!id)
2428 return NULL;
2429 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css)
2431 return NULL;
2432 return container_of(css, struct mem_cgroup, css);
2433}
2434
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2436{
2437 struct mem_cgroup *memcg = NULL;
2438 struct page_cgroup *pc;
2439 unsigned short id;
2440 swp_entry_t ent;
2441
2442 VM_BUG_ON(!PageLocked(page));
2443
2444 pc = lookup_page_cgroup(page);
2445 lock_page_cgroup(pc);
2446 if (PageCgroupUsed(pc)) {
2447 memcg = pc->mem_cgroup;
2448 if (memcg && !css_tryget(&memcg->css))
2449 memcg = NULL;
2450 } else if (PageSwapCache(page)) {
2451 ent.val = page_private(page);
2452 id = lookup_swap_cgroup_id(ent);
2453 rcu_read_lock();
2454 memcg = mem_cgroup_lookup(id);
2455 if (memcg && !css_tryget(&memcg->css))
2456 memcg = NULL;
2457 rcu_read_unlock();
2458 }
2459 unlock_page_cgroup(pc);
2460 return memcg;
2461}
2462
2463static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2464 struct page *page,
2465 unsigned int nr_pages,
2466 enum charge_type ctype,
2467 bool lrucare)
2468{
2469 struct page_cgroup *pc = lookup_page_cgroup(page);
2470 struct zone *uninitialized_var(zone);
2471 struct lruvec *lruvec;
2472 bool was_on_lru = false;
2473 bool anon;
2474
2475 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) {
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490 if (lrucare) {
2491 zone = page_zone(page);
2492 spin_lock_irq(&zone->lru_lock);
2493 if (PageLRU(page)) {
2494 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2495 ClearPageLRU(page);
2496 del_page_from_lru_list(page, lruvec, page_lru(page));
2497 was_on_lru = true;
2498 }
2499 }
2500
2501 pc->mem_cgroup = memcg;
2502
2503
2504
2505
2506
2507
2508
2509 smp_wmb();
2510 SetPageCgroupUsed(pc);
2511
2512 if (lrucare) {
2513 if (was_on_lru) {
2514 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2515 VM_BUG_ON(PageLRU(page));
2516 SetPageLRU(page);
2517 add_page_to_lru_list(page, lruvec, page_lru(page));
2518 }
2519 spin_unlock_irq(&zone->lru_lock);
2520 }
2521
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2523 anon = true;
2524 else
2525 anon = false;
2526
2527 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2528 unlock_page_cgroup(pc);
2529
2530
2531
2532
2533
2534
2535 memcg_check_events(memcg, page);
2536}
2537
2538#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2539
2540#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2541
2542
2543
2544
2545
2546
2547void mem_cgroup_split_huge_fixup(struct page *head)
2548{
2549 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2550 struct page_cgroup *pc;
2551 int i;
2552
2553 if (mem_cgroup_disabled())
2554 return;
2555 for (i = 1; i < HPAGE_PMD_NR; i++) {
2556 pc = head_pc + i;
2557 pc->mem_cgroup = head_pc->mem_cgroup;
2558 smp_wmb();
2559 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2560 }
2561}
2562#endif
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579static int mem_cgroup_move_account(struct page *page,
2580 unsigned int nr_pages,
2581 struct page_cgroup *pc,
2582 struct mem_cgroup *from,
2583 struct mem_cgroup *to)
2584{
2585 unsigned long flags;
2586 int ret;
2587 bool anon = PageAnon(page);
2588
2589 VM_BUG_ON(from == to);
2590 VM_BUG_ON(PageLRU(page));
2591
2592
2593
2594
2595
2596
2597 ret = -EBUSY;
2598 if (nr_pages > 1 && !PageTransHuge(page))
2599 goto out;
2600
2601 lock_page_cgroup(pc);
2602
2603 ret = -EINVAL;
2604 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2605 goto unlock;
2606
2607 move_lock_mem_cgroup(from, &flags);
2608
2609 if (!anon && page_mapped(page)) {
2610
2611 preempt_disable();
2612 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2613 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2614 preempt_enable();
2615 }
2616 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2617
2618
2619 pc->mem_cgroup = to;
2620 mem_cgroup_charge_statistics(to, anon, nr_pages);
2621
2622
2623
2624
2625
2626
2627
2628 move_unlock_mem_cgroup(from, &flags);
2629 ret = 0;
2630unlock:
2631 unlock_page_cgroup(pc);
2632
2633
2634
2635 memcg_check_events(to, page);
2636 memcg_check_events(from, page);
2637out:
2638 return ret;
2639}
2640
2641
2642
2643
2644
2645static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc,
2647 struct mem_cgroup *child,
2648 gfp_t gfp_mask)
2649{
2650 struct mem_cgroup *parent;
2651 unsigned int nr_pages;
2652 unsigned long uninitialized_var(flags);
2653 int ret;
2654
2655
2656 if (mem_cgroup_is_root(child))
2657 return -EINVAL;
2658
2659 ret = -EBUSY;
2660 if (!get_page_unless_zero(page))
2661 goto out;
2662 if (isolate_lru_page(page))
2663 goto put;
2664
2665 nr_pages = hpage_nr_pages(page);
2666
2667 parent = parent_mem_cgroup(child);
2668
2669
2670
2671 if (!parent)
2672 parent = root_mem_cgroup;
2673
2674 if (nr_pages > 1)
2675 flags = compound_lock_irqsave(page);
2676
2677 ret = mem_cgroup_move_account(page, nr_pages,
2678 pc, child, parent);
2679 if (!ret)
2680 __mem_cgroup_cancel_local_charge(child, nr_pages);
2681
2682 if (nr_pages > 1)
2683 compound_unlock_irqrestore(page, flags);
2684 putback_lru_page(page);
2685put:
2686 put_page(page);
2687out:
2688 return ret;
2689}
2690
2691
2692
2693
2694
2695
2696
2697static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2698 gfp_t gfp_mask, enum charge_type ctype)
2699{
2700 struct mem_cgroup *memcg = NULL;
2701 unsigned int nr_pages = 1;
2702 bool oom = true;
2703 int ret;
2704
2705 if (PageTransHuge(page)) {
2706 nr_pages <<= compound_order(page);
2707 VM_BUG_ON(!PageTransHuge(page));
2708
2709
2710
2711
2712 oom = false;
2713 }
2714
2715 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2716 if (ret == -ENOMEM)
2717 return ret;
2718 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
2719 return 0;
2720}
2721
2722int mem_cgroup_newpage_charge(struct page *page,
2723 struct mm_struct *mm, gfp_t gfp_mask)
2724{
2725 if (mem_cgroup_disabled())
2726 return 0;
2727 VM_BUG_ON(page_mapped(page));
2728 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else {
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763}
2764
2765
2766
2767
2768
2769
2770
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp)
2774{
2775 struct mem_cgroup *memcg;
2776 int ret;
2777
2778 *memcgp = NULL;
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785
2786
2787
2788
2789
2790
2791 if (!PageSwapCache(page))
2792 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg)
2795 goto charge_cur_mm;
2796 *memcgp = memcg;
2797 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2798 css_put(&memcg->css);
2799 if (ret == -EINTR)
2800 ret = 0;
2801 return ret;
2802charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR)
2807 ret = 0;
2808 return ret;
2809}
2810
2811static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype)
2814{
2815 if (mem_cgroup_disabled())
2816 return;
2817 if (!memcg)
2818 return;
2819 cgroup_exclude_rmdir(&memcg->css);
2820
2821 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2822
2823
2824
2825
2826
2827
2828
2829 if (do_swap_account && PageSwapCache(page)) {
2830 swp_entry_t ent = {.val = page_private(page)};
2831 mem_cgroup_uncharge_swap(ent);
2832 }
2833
2834
2835
2836
2837
2838 cgroup_release_and_wakeup_rmdir(&memcg->css);
2839}
2840
2841void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg)
2843{
2844 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2846}
2847
/*
 * mem_cgroup_cancel_charge_swapin - drop a swap-in charge that is not committed
 * @memcg: group returned by mem_cgroup_try_charge_swapin(); may be NULL
 *
 * Cancels a single-page charge.  Does nothing when the controller is
 * disabled or @memcg is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled() || !memcg)
		return;

	__mem_cgroup_cancel_charge(memcg, 1);
}
2856
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2858 unsigned int nr_pages,
2859 const enum charge_type ctype)
2860{
2861 struct memcg_batch_info *batch = NULL;
2862 bool uncharge_memsw = true;
2863
2864
2865 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2866 uncharge_memsw = false;
2867
2868 batch = ¤t->memcg_batch;
2869
2870
2871
2872
2873
2874 if (!batch->memcg)
2875 batch->memcg = memcg;
2876
2877
2878
2879
2880
2881
2882
2883
2884 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2885 goto direct_uncharge;
2886
2887 if (nr_pages > 1)
2888 goto direct_uncharge;
2889
2890
2891
2892
2893
2894
2895 if (batch->memcg != memcg)
2896 goto direct_uncharge;
2897
2898 batch->nr_pages++;
2899 if (uncharge_memsw)
2900 batch->memsw_nr_pages++;
2901 return;
2902direct_uncharge:
2903 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
2904 if (uncharge_memsw)
2905 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2906 if (unlikely(batch->memcg != memcg))
2907 memcg_oom_recover(memcg);
2908}
2909
2910
2911
2912
2913static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2915{
2916 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1;
2918 struct page_cgroup *pc;
2919 bool anon;
2920
2921 if (mem_cgroup_disabled())
2922 return NULL;
2923
2924 if (PageSwapCache(page))
2925 return NULL;
2926
2927 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page);
2929 VM_BUG_ON(!PageTransHuge(page));
2930 }
2931
2932
2933
2934 pc = lookup_page_cgroup(page);
2935 if (unlikely(!PageCgroupUsed(pc)))
2936 return NULL;
2937
2938 lock_page_cgroup(pc);
2939
2940 memcg = pc->mem_cgroup;
2941
2942 if (!PageCgroupUsed(pc))
2943 goto unlock_out;
2944
2945 anon = PageAnon(page);
2946
2947 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2949
2950
2951
2952
2953
2954 anon = true;
2955
2956 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957
2958 if (page_mapped(page) || PageCgroupMigration(pc))
2959 goto unlock_out;
2960 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2962 if (!PageAnon(page)) {
2963 if (page->mapping && !page_is_file_cache(page))
2964 goto unlock_out;
2965 } else if (page_mapped(page))
2966 goto unlock_out;
2967 break;
2968 default:
2969 break;
2970 }
2971
2972 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
2973
2974 ClearPageCgroupUsed(pc);
2975
2976
2977
2978
2979
2980
2981
2982 unlock_page_cgroup(pc);
2983
2984
2985
2986
2987 memcg_check_events(memcg, page);
2988 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2989 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg);
2991 }
2992 if (!mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994
2995 return memcg;
2996
2997unlock_out:
2998 unlock_page_cgroup(pc);
2999 return NULL;
3000}
3001
3002void mem_cgroup_uncharge_page(struct page *page)
3003{
3004
3005 if (page_mapped(page))
3006 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3009}
3010
3011void mem_cgroup_uncharge_cache_page(struct page *page)
3012{
3013 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3016}
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026void mem_cgroup_uncharge_start(void)
3027{
3028 current->memcg_batch.do_batch++;
3029
3030 if (current->memcg_batch.do_batch == 1) {
3031 current->memcg_batch.memcg = NULL;
3032 current->memcg_batch.nr_pages = 0;
3033 current->memcg_batch.memsw_nr_pages = 0;
3034 }
3035}
3036
3037void mem_cgroup_uncharge_end(void)
3038{
3039 struct memcg_batch_info *batch = ¤t->memcg_batch;
3040
3041 if (!batch->do_batch)
3042 return;
3043
3044 batch->do_batch--;
3045 if (batch->do_batch)
3046 return;
3047
3048 if (!batch->memcg)
3049 return;
3050
3051
3052
3053
3054 if (batch->nr_pages)
3055 res_counter_uncharge(&batch->memcg->res,
3056 batch->nr_pages * PAGE_SIZE);
3057 if (batch->memsw_nr_pages)
3058 res_counter_uncharge(&batch->memcg->memsw,
3059 batch->memsw_nr_pages * PAGE_SIZE);
3060 memcg_oom_recover(batch->memcg);
3061
3062 batch->memcg = NULL;
3063}
3064
3065#ifdef CONFIG_SWAP
3066
3067
3068
3069
3070void
3071mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3072{
3073 struct mem_cgroup *memcg;
3074 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3075
3076 if (!swapout)
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078
3079 memcg = __mem_cgroup_uncharge_common(page, ctype);
3080
3081
3082
3083
3084
3085 if (do_swap_account && swapout && memcg)
3086 swap_cgroup_record(ent, css_id(&memcg->css));
3087}
3088#endif
3089
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3091
3092
3093
3094
3095void mem_cgroup_uncharge_swap(swp_entry_t ent)
3096{
3097 struct mem_cgroup *memcg;
3098 unsigned short id;
3099
3100 if (!do_swap_account)
3101 return;
3102
3103 id = swap_cgroup_record(ent, 0);
3104 rcu_read_lock();
3105 memcg = mem_cgroup_lookup(id);
3106 if (memcg) {
3107
3108
3109
3110
3111 if (!mem_cgroup_is_root(memcg))
3112 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3113 mem_cgroup_swap_statistics(memcg, false);
3114 mem_cgroup_put(memcg);
3115 }
3116 rcu_read_unlock();
3117}
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133static int mem_cgroup_move_swap_account(swp_entry_t entry,
3134 struct mem_cgroup *from, struct mem_cgroup *to)
3135{
3136 unsigned short old_id, new_id;
3137
3138 old_id = css_id(&from->css);
3139 new_id = css_id(&to->css);
3140
3141 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3142 mem_cgroup_swap_statistics(from, false);
3143 mem_cgroup_swap_statistics(to, true);
3144
3145
3146
3147
3148
3149
3150
3151
3152 mem_cgroup_get(to);
3153 return 0;
3154 }
3155 return -EINVAL;
3156}
3157#else
/*
 * Stub used when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set: there is no
 * swap accounting record to move, so every request fails with -EINVAL.
 */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
3163#endif
3164
3165
3166
3167
3168
/*
 * Charge @newpage on behalf of the memcg that owns @page, ahead of a page
 * migration.
 *
 * @page:     page about to be migrated (must not be a transparent huge page)
 * @newpage:  destination page
 * @memcgp:   out parameter; set to NULL first, then to the memcg that was
 *            charged (the charge helper is also handed this pointer)
 * @gfp_mask: allocation flags passed to the charge helper
 *
 * Returns 0 when memcg is disabled, when @page is not charged, or when the
 * new charge was committed; returns -ENOMEM when the charge attempt failed.
 */
int mem_cgroup_prepare_migration(struct page *page,
	struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
{
	struct mem_cgroup *memcg = NULL;
	struct page_cgroup *pc;
	enum charge_type ctype;
	int ret = 0;

	*memcgp = NULL;

	VM_BUG_ON(PageTransHuge(page));
	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		memcg = pc->mem_cgroup;
		/* Hold the css across the charge attempt below. */
		css_get(&memcg->css);
		/*
		 * Anonymous pages get the Migration flag set on their
		 * page_cgroup while the lock is held; it is cleared again on
		 * the failure path below and in mem_cgroup_end_migration().
		 * NOTE(review): presumably the flag keeps the old page's
		 * charge from being dropped while both pages exist - confirm
		 * against the uncharge path.
		 */
		if (PageAnon(page))
			SetPageCgroupMigration(pc);
	}
	unlock_page_cgroup(pc);
	/*
	 * The old page was not charged to any memcg, so there is nothing to
	 * charge the new page to.
	 */
	if (!memcg)
		return 0;

	*memcgp = memcg;
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
	css_put(&memcg->css);/* drop extra refcnt */
	if (ret) {
		if (PageAnon(page)) {
			lock_page_cgroup(pc);
			ClearPageCgroupMigration(pc);
			unlock_page_cgroup(pc);
			/*
			 * With the Migration flag cleared again, try to
			 * uncharge the old page.
			 */
			mem_cgroup_uncharge_page(page);
		}
		/* Any charge failure is reported to the caller as -ENOMEM. */
		return -ENOMEM;
	}
	/*
	 * Pick the charge type for the new page from the kind of the old
	 * page: anonymous, file cache, or (otherwise) shmem.
	 */
	if (PageAnon(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
	return ret;
}
3259
3260
/*
 * Finish the memcg side of a page migration started by
 * mem_cgroup_prepare_migration().
 *
 * @memcg:        memcg charged in the prepare step; NULL means nothing to do
 * @oldpage:      the source page of the migration
 * @newpage:      the destination page of the migration
 * @migration_ok: true if the migration succeeded
 *
 * The page that ends up unused (the old page on success, the new page on
 * failure) is uncharged.  The whole operation is bracketed by
 * cgroup_exclude_rmdir() / cgroup_release_and_wakeup_rmdir() on @memcg.
 */
void mem_cgroup_end_migration(struct mem_cgroup *memcg,
	struct page *oldpage, struct page *newpage, bool migration_ok)
{
	struct page *used, *unused;
	struct page_cgroup *pc;
	bool anon;

	if (!memcg)
		return;
	/* blocks rmdir() */
	cgroup_exclude_rmdir(&memcg->css);
	if (!migration_ok) {
		used = oldpage;
		unused = newpage;
	} else {
		used = newpage;
		unused = oldpage;
	}
	/*
	 * Clear the Migration flag on the old page's page_cgroup (set for
	 * anonymous pages in the prepare step) before uncharging.
	 */
	pc = lookup_page_cgroup(oldpage);
	lock_page_cgroup(pc);
	ClearPageCgroupMigration(pc);
	unlock_page_cgroup(pc);
	anon = PageAnon(used);
	/* Only MAPPED or CACHE is used as the uncharge type here. */
	__mem_cgroup_uncharge_common(unused,
		anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
		     : MEM_CGROUP_CHARGE_TYPE_CACHE);

	/*
	 * For an anonymous page, also attempt an uncharge of the page that
	 * stays in use.
	 * NOTE(review): presumably this is a no-op while the page is still
	 * mapped and only takes effect if it was fully unmapped during the
	 * migration - confirm in mem_cgroup_uncharge_page().
	 */
	if (anon)
		mem_cgroup_uncharge_page(used);
	/*
	 * Release the rmdir exclusion taken at the top and wake up anyone
	 * waiting on it.
	 */
	cgroup_release_and_wakeup_rmdir(&memcg->css);
}
3311
3312
3313
3314
3315
3316
/*
 * Transfer the memcg charge of @oldpage to @newpage when one page cache
 * page replaces another.
 *
 * @oldpage: page being replaced
 * @newpage: page taking its place
 *
 * Does nothing when memcg is disabled or when @oldpage is not charged.
 */
void mem_cgroup_replace_page_cache(struct page *oldpage,
				  struct page *newpage)
{
	struct mem_cgroup *memcg = NULL;
	struct page_cgroup *pc;
	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(oldpage);
	/* fix accounting on old pages */
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		memcg = pc->mem_cgroup;
		/*
		 * Only the statistics and the Used flag of the old page are
		 * touched here; no res_counter is uncharged in this function.
		 */
		mem_cgroup_charge_statistics(memcg, false, -1);
		ClearPageCgroupUsed(pc);
	}
	unlock_page_cgroup(pc);

	/*
	 * The old page was not charged to a memcg, so there is no charge to
	 * hand over to the new page.
	 */
	if (!memcg)
		return;

	/* Swap-backed pages are committed as SHMEM instead of CACHE. */
	if (PageSwapBacked(oldpage))
		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/*
	 * Commit the charge for the new page to the same memcg.
	 * NOTE(review): the final argument is 'true' here but 'false' in
	 * mem_cgroup_prepare_migration(); its meaning is not visible in this
	 * chunk - confirm against __mem_cgroup_commit_charge().
	 */
	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
}
3354
3355#ifdef CONFIG_DEBUG_VM
3356static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3357{
3358 struct page_cgroup *pc;
3359
3360 pc = lookup_page_cgroup(page);
3361
3362
3363
3364
3365
3366 if (likely(pc) && PageCgroupUsed(pc))
3367 return pc;
3368 return NULL;
3369}
3370
/*
 * Debug check (CONFIG_DEBUG_VM only): report whether @page still has a
 * page_cgroup marked as used.
 *
 * Returns false when memcg is disabled or no used page_cgroup is found,
 * true otherwise.
 */
bool mem_cgroup_bad_page_check(struct page *page)
{
	if (mem_cgroup_disabled())
		return false;

	return lookup_page_cgroup_used(page) != NULL;
}
3378
3379void mem_cgroup_print_bad_page(struct page *page)
3380{
3381 struct page_cgroup *pc;
3382
3383 pc = lookup_page_cgroup_used(page);
3384 if (pc) {
3385 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3386 pc, pc->flags, pc->mem_cgroup);
3387 }
3388}
3389#endif
3390
/*
 * Held by mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit()
 * around each "read both limits, validate, set the new limit" step.
 */
static DEFINE_MUTEX(set_limit_mutex);
3392
3393static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3394 unsigned long long val)
3395{
3396 int retry_count;
3397 u64 memswlimit, memlimit;
3398 int ret = 0;
3399 int children = mem_cgroup_count_children(memcg);
3400 u64 curusage, oldusage;
3401 int enlarge;
3402
3403
3404
3405
3406
3407
3408 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3409
3410 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3411
3412 enlarge = 0;
3413 while (retry_count) {
3414 if (signal_pending(current)) {
3415 ret = -EINTR;
3416 break;
3417 }
3418
3419
3420
3421
3422
3423 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3425 if (memswlimit < val) {
3426 ret = -EINVAL;
3427 mutex_unlock(&set_limit_mutex);
3428 break;
3429 }
3430
3431 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3432 if (memlimit < val)
3433 enlarge = 1;
3434
3435 ret = res_counter_set_limit(&memcg->res, val);
3436 if (!ret) {
3437 if (memswlimit == val)
3438 memcg->memsw_is_minimum = true;
3439 else
3440 memcg->memsw_is_minimum = false;
3441 }
3442 mutex_unlock(&set_limit_mutex);
3443
3444 if (!ret)
3445 break;
3446
3447 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3448 MEM_CGROUP_RECLAIM_SHRINK);
3449 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3450
3451 if (curusage >= oldusage)
3452 retry_count--;
3453 else
3454 oldusage = curusage;
3455 }
3456 if (!ret && enlarge)
3457 memcg_oom_recover(memcg);
3458
3459 return ret;
3460}
3461
3462static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3463 unsigned long long val)
3464{
3465 int retry_count;
3466 u64 memlimit, memswlimit, oldusage, curusage;
3467 int children = mem_cgroup_count_children(memcg);
3468 int ret = -EBUSY;
3469 int enlarge = 0;
3470
3471
3472 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3473 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3474 while (retry_count) {
3475 if (signal_pending(current)) {
3476 ret = -EINTR;
3477 break;
3478 }
3479
3480
3481
3482
3483
3484 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3486 if (memlimit > val) {
3487 ret = -EINVAL;
3488 mutex_unlock(&set_limit_mutex);
3489 break;
3490 }
3491 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3492 if (memswlimit < val)
3493 enlarge = 1;
3494 ret = res_counter_set_limit(&memcg->memsw, val);
3495 if (!ret) {
3496 if (memlimit == val)
3497 memcg->memsw_is_minimum = true;
3498 else
3499 memcg->memsw_is_minimum = false;
3500 }
3501 mutex_unlock(&set_limit_mutex);
3502
3503 if (!ret)
3504 break;
3505
3506 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3507 MEM_CGROUP_RECLAIM_NOSWAP |
3508 MEM_CGROUP_RECLAIM_SHRINK);
3509 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3510
3511 if (curusage >= oldusage)
3512 retry_count--;
3513 else
3514 oldusage = curusage;
3515 }
3516 if (!ret && enlarge)
3517 memcg_oom_recover(memcg);
3518 return ret;
3519}
3520
/*
 * Reclaim pages in @zone from memcgs that exceed their soft limit.
 *
 * @zone:          zone to reclaim from
 * @order:         allocation order; anything above 0 is not handled and
 *                 makes the function return 0 immediately
 * @gfp_mask:      allocation flags passed on to the reclaim helper
 * @total_scanned: incremented by the number of pages scanned
 *
 * Returns the number of pages reclaimed.
 *
 * NOTE(review): the css_put() calls below imply that the
 * *_largest_soft_limit_node() helpers return their node with a css
 * reference held - confirm in those helpers.
 */
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;
	unsigned long long excess;
	unsigned long nr_scanned;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	/*
	 * Keep picking the node returned as "largest" from this zone's soft
	 * limit tree until something was reclaimed, the tree yields no
	 * further candidate, or the loop cap is exceeded.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		nr_scanned = 0;
		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
						    gfp_mask, &nr_scanned);
		nr_reclaimed += reclaimed;
		*total_scanned += nr_scanned;
		spin_lock(&mctz->lock);

		/*
		 * If nothing was reclaimed from this group, look for the
		 * next candidate while the tree lock is held.
		 */
		next_mz = NULL;
		if (!reclaimed) {
			do {
				/*
				 * The lookup can hand back the node we just
				 * processed; in that case drop the reference
				 * it came with and ask again.  Any other
				 * result (including NULL) ends the search.
				 */
				next_mz =
				__mem_cgroup_largest_soft_limit_node(mctz);
				if (next_mz == mz)
					css_put(&next_mz->memcg->css);
				else /* next_mz == NULL or other memcg */
					break;
			} while (1);
		}
		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
		excess = res_counter_soft_limit_excess(&mz->memcg->res);
		/*
		 * Re-insert the node with its freshly computed excess, which
		 * is passed unconditionally.
		 * NOTE(review): presumably the insert helper copes with a
		 * zero excess itself - confirm.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
		spin_unlock(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Give up when nothing has been reclaimed so far and either
		 * there is no further candidate or too many groups have been
		 * tried already.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* Drop the reference on a candidate that was found but never used. */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}
3612
3613
3614
3615
3616
/*
 * Try to empty one LRU list of @memcg by moving the charge of every page on
 * it to the parent via mem_cgroup_move_parent().
 *
 * @memcg: memcg whose list is drained
 * @node:  NUMA node of the list
 * @zid:   zone index within @node
 * @lru:   which LRU list
 *
 * Returns 0 when the list ended up empty, -EBUSY when pages are left after
 * the iteration budget ran out without a hard error, or the error from
 * mem_cgroup_move_parent() (-ENOMEM / -EINTR abort the walk at once).
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
				int node, int zid, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long flags, loop;
	struct list_head *list;
	struct page *busy;
	struct zone *zone;
	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(memcg, node, zid);
	list = &mz->lruvec.lists[lru];

	loop = mz->lru_size[lru];
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		struct page_cgroup *pc;
		struct page *page;

		ret = 0;
		/* The list itself is only inspected under zone->lru_lock. */
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		/* Always work on the entry at the tail of the list. */
		page = list_entry(list->prev, struct page, lru);
		if (busy == page) {
			/*
			 * The tail page failed on the previous iteration:
			 * rotate it to the head so another page is tried.
			 */
			list_move(&page->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		pc = lookup_page_cgroup(page);

		ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
		if (ret == -ENOMEM || ret == -EINTR)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = page;
			cond_resched();
		} else
			busy = NULL;
	}

	/* Budget exhausted without a hard error but pages remain. */
	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}
3672
3673
3674
3675
3676
/*
 * Make the memory usage of @memcg drop to zero.
 *
 * @memcg:    group to empty
 * @free_all: when true, start by reclaiming pages (try_to_free) before
 *            moving the remaining charges; when false, go straight to
 *            moving charges
 *
 * Two phases, connected by gotos:
 *  - move_account: drain per-cpu caches, then walk every node/zone/LRU list
 *    through mem_cgroup_force_empty_list(); repeated while usage is
 *    non-zero or the last pass returned an error.
 *  - try_to_free: reclaim pages with try_to_free_mem_cgroup_pages(), then
 *    fall back into move_account.  This phase runs at most once per call
 *    (guarded by 'shrink').
 *
 * Both phases refuse to run (-EBUSY) while the cgroup has tasks or child
 * cgroups, and bail out with -EINTR on a pending signal.  A css reference
 * is held on @memcg for the whole call.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = memcg->css.cgroup;

	css_get(&memcg->css);

	shrink = 0;
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	do {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		drain_all_stock_sync(memcg);
		ret = 0;
		mem_cgroup_start_move(memcg);
		/* Stop at the first list that reports an error. */
		for_each_node_state(node, N_HIGH_MEMORY) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list lru;
				for_each_lru(lru) {
					ret = mem_cgroup_force_empty_list(memcg,
							node, zid, lru);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		mem_cgroup_end_move(memcg);
		memcg_oom_recover(memcg);
		/* it seems parent cgroup doesn't have enough mem */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	/* "ret" should also be checked to ensure all lists are empty. */
	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
out:
	css_put(&memcg->css);
	return ret;

try_to_free:
	/* returns EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
						false);
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

	}
	lru_add_drain();
	/* try move_account...there may be some *locked* pages. */
	goto move_account;
}
3758
/*
 * Trigger handler: run mem_cgroup_force_empty() on the memcg of @cont with
 * free_all == true and return its result.  @event is unused.
 */
static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
3763
3764
/* Read handler: return the use_hierarchy setting of @cont's memcg. */
static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
	return mem_cgroup_from_cont(cont)->use_hierarchy;
}
3769
3770static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3771 u64 val)
3772{
3773 int retval = 0;
3774 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3775 struct cgroup *parent = cont->parent;
3776 struct mem_cgroup *parent_memcg = NULL;
3777
3778 if (parent)
3779 parent_memcg = mem_cgroup_from_cont(parent);
3780
3781 cgroup_lock();
3782
3783
3784
3785
3786
3787
3788
3789
3790 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3791 (val == 1 || val == 0)) {
3792 if (list_empty(&cont->children))
3793 memcg->use_hierarchy = val;
3794 else
3795 retval = -EBUSY;
3796 } else
3797 retval = -EINVAL;
3798 cgroup_unlock();
3799
3800 return retval;
3801}
3802
3803
3804static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3805 enum mem_cgroup_stat_index idx)
3806{
3807 struct mem_cgroup *iter;
3808 long val = 0;
3809
3810
3811 for_each_mem_cgroup_tree(iter, memcg)
3812 val += mem_cgroup_read_stat(iter, idx);
3813
3814 if (val < 0)
3815 val = 0;
3816 return val;
3817}
3818
3819static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3820{
3821 u64 val;
3822
3823 if (!mem_cgroup_is_root(memcg)) {
3824 if (!swap)
3825 return res_counter_read_u64(&memcg->res, RES_USAGE);
3826 else
3827 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3828 }
3829
3830 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832
3833 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
3835
3836 return val << PAGE_SHIFT;
3837}
3838
3839static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3840 struct file *file, char __user *buf,
3841 size_t nbytes, loff_t *ppos)
3842{
3843 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3844 char str[64];
3845 u64 val;
3846 int type, name, len;
3847
3848 type = MEMFILE_TYPE(cft->private);
3849 name = MEMFILE_ATTR(cft->private);
3850
3851 if (!do_swap_account && type == _MEMSWAP)
3852 return -EOPNOTSUPP;
3853
3854 switch (type) {
3855 case _MEM:
3856 if (name == RES_USAGE)
3857 val = mem_cgroup_usage(memcg, false);
3858 else
3859 val = res_counter_read_u64(&memcg->res, name);
3860 break;
3861 case _MEMSWAP:
3862 if (name == RES_USAGE)
3863 val = mem_cgroup_usage(memcg, true);
3864 else
3865 val = res_counter_read_u64(&memcg->memsw, name);
3866 break;
3867 default:
3868 BUG();
3869 }
3870
3871 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3872 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3873}
3874
3875
3876
3877
3878static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3879 const char *buffer)
3880{
3881 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3882 int type, name;
3883 unsigned long long val;
3884 int ret;
3885
3886 type = MEMFILE_TYPE(cft->private);
3887 name = MEMFILE_ATTR(cft->private);
3888
3889 if (!do_swap_account && type == _MEMSWAP)
3890 return -EOPNOTSUPP;
3891
3892 switch (name) {
3893 case RES_LIMIT:
3894 if (mem_cgroup_is_root(memcg)) {
3895 ret = -EINVAL;
3896 break;
3897 }
3898
3899 ret = res_counter_memparse_write_strategy(buffer, &val);
3900 if (ret)
3901 break;
3902 if (type == _MEM)
3903 ret = mem_cgroup_resize_limit(memcg, val);
3904 else
3905 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3906 break;
3907 case RES_SOFT_LIMIT:
3908 ret = res_counter_memparse_write_strategy(buffer, &val);
3909 if (ret)
3910 break;
3911
3912
3913
3914
3915
3916 if (type == _MEM)
3917 ret = res_counter_set_soft_limit(&memcg->res, val);
3918 else
3919 ret = -EINVAL;
3920 break;
3921 default:
3922 ret = -EINVAL;
3923 break;
3924 }
3925 return ret;
3926}
3927
3928static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3929 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3930{
3931 struct cgroup *cgroup;
3932 unsigned long long min_limit, min_memsw_limit, tmp;
3933
3934 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3935 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3936 cgroup = memcg->css.cgroup;
3937 if (!memcg->use_hierarchy)
3938 goto out;
3939
3940 while (cgroup->parent) {
3941 cgroup = cgroup->parent;
3942 memcg = mem_cgroup_from_cont(cgroup);
3943 if (!memcg->use_hierarchy)
3944 break;
3945 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3946 min_limit = min(min_limit, tmp);
3947 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3948 min_memsw_limit = min(min_memsw_limit, tmp);
3949 }
3950out:
3951 *mem_limit = min_limit;
3952 *memsw_limit = min_memsw_limit;
3953}
3954
3955static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3956{
3957 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3958 int type, name;
3959
3960 type = MEMFILE_TYPE(event);
3961 name = MEMFILE_ATTR(event);
3962
3963 if (!do_swap_account && type == _MEMSWAP)
3964 return -EOPNOTSUPP;
3965
3966 switch (name) {
3967 case RES_MAX_USAGE:
3968 if (type == _MEM)
3969 res_counter_reset_max(&memcg->res);
3970 else
3971 res_counter_reset_max(&memcg->memsw);
3972 break;
3973 case RES_FAILCNT:
3974 if (type == _MEM)
3975 res_counter_reset_failcnt(&memcg->res);
3976 else
3977 res_counter_reset_failcnt(&memcg->memsw);
3978 break;
3979 }
3980
3981 return 0;
3982}
3983
/* Read handler: return the move_charge_at_immigrate value of @cgrp's memcg. */
static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
					struct cftype *cft)
{
	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
}
3989
3990#ifdef CONFIG_MMU
/*
 * Write handler for move_charge_at_immigrate (CONFIG_MMU builds).
 *
 * @val is treated as a bitmask; any value with a bit at or above
 * NR_MOVE_TYPE set is rejected with -EINVAL.  The new value is stored
 * under cgroup_lock().
 *
 * Returns 0 on success.
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	if (val >= (1 << NR_MOVE_TYPE))
		return -EINVAL;
	/*
	 * The store is done with cgroup_lock() held.
	 * NOTE(review): presumably this keeps the value stable while a task
	 * is being attached - confirm against the attach path.
	 */
	cgroup_lock();
	memcg->move_charge_at_immigrate = val;
	cgroup_unlock();

	return 0;
}
4009#else
/*
 * Stub for builds without CONFIG_MMU: changing move_charge_at_immigrate is
 * not supported and always fails with -ENOSYS.
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
4015#endif
4016
4017#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m)
4020{
4021 int nid;
4022 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4023 unsigned long node_nr;
4024 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4025
4026 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4027 seq_printf(m, "total=%lu", total_nr);
4028 for_each_node_state(nid, N_HIGH_MEMORY) {
4029 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4030 seq_printf(m, " N%d=%lu", nid, node_nr);
4031 }
4032 seq_putc(m, '\n');
4033
4034 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4035 seq_printf(m, "file=%lu", file_nr);
4036 for_each_node_state(nid, N_HIGH_MEMORY) {
4037 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4038 LRU_ALL_FILE);
4039 seq_printf(m, " N%d=%lu", nid, node_nr);
4040 }
4041 seq_putc(m, '\n');
4042
4043 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4044 seq_printf(m, "anon=%lu", anon_nr);
4045 for_each_node_state(nid, N_HIGH_MEMORY) {
4046 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4047 LRU_ALL_ANON);
4048 seq_printf(m, " N%d=%lu", nid, node_nr);
4049 }
4050 seq_putc(m, '\n');
4051
4052 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4053 seq_printf(m, "unevictable=%lu", unevictable_nr);
4054 for_each_node_state(nid, N_HIGH_MEMORY) {
4055 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4056 BIT(LRU_UNEVICTABLE));
4057 seq_printf(m, " N%d=%lu", nid, node_nr);
4058 }
4059 seq_putc(m, '\n');
4060 return 0;
4061}
4062#endif
4063
/*
 * Names printed by mem_control_stat_show() for each LRU list, indexed by
 * LRU list number.  The array must have exactly NR_LRU_LISTS entries; this
 * is enforced at build time by mem_cgroup_lru_names_not_uptodate().
 */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};
4071
/*
 * Never meant to do anything at run time: exists only to host a build-time
 * assertion that mem_cgroup_lru_names[] has one entry per LRU list.
 */
static inline void mem_cgroup_lru_names_not_uptodate(void)
{
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
4076
/*
 * seq_file show handler for the "stat" file.
 *
 * Output, in order:
 *  1. per-group statistics (in bytes), event counters and LRU sizes
 *     (in bytes);
 *  2. the hierarchical memory limit (and mem+swap limit when swap
 *     accounting is on);
 *  3. the same three groups again, prefixed with "total_", summed over the
 *     groups visited by for_each_mem_cgroup_tree();
 *  4. with CONFIG_DEBUG_VM, the recent_rotated / recent_scanned reclaim
 *     counters summed over all online nodes and zones.
 *
 * The swap statistic is skipped when swap accounting is off.
 * Always returns 0.
 */
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct seq_file *m)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	struct mem_cgroup *mi;
	unsigned int i;

	/* Local (this group only) statistics, converted to bytes. */
	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
			continue;
		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
	}

	/* Local event counters are plain counts, not scaled. */
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
			   mem_cgroup_read_events(memcg, i));

	/* Local LRU list sizes, converted to bytes. */
	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

	/* Hierarchical information */
	{
		unsigned long long limit, memsw_limit;
		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
		seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
		if (do_swap_account)
			seq_printf(m, "hierarchical_memsw_limit %llu\n",
				   memsw_limit);
	}

	/* Statistics summed over the subtree, in bytes. */
	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long long val = 0;

		if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
			continue;
		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
	}

	/* Event counters summed over the subtree. */
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
		unsigned long long val = 0;

		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_events(mi, i);
		seq_printf(m, "total_%s %llu\n",
			   mem_cgroup_events_names[i], val);
	}

	/* LRU list sizes summed over the subtree, in bytes. */
	for (i = 0; i < NR_LRU_LISTS; i++) {
		unsigned long long val = 0;

		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
	}

#ifdef CONFIG_DEBUG_VM
	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		struct zone_reclaim_stat *rstat;
		/* Index 0 is printed as "anon", index 1 as "file". */
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
				rstat = &mz->lruvec.reclaim_stat;

				recent_rotated[0] += rstat->recent_rotated[0];
				recent_rotated[1] += rstat->recent_rotated[1];
				recent_scanned[0] += rstat->recent_scanned[0];
				recent_scanned[1] += rstat->recent_scanned[1];
			}
		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
	}
#endif

	return 0;
}
4163
/*
 * Read handler: return the value of mem_cgroup_swappiness() for @cgrp's
 * memcg.
 */
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return mem_cgroup_swappiness(memcg);
}
4170
4171static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4172 u64 val)
4173{
4174 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4175 struct mem_cgroup *parent;
4176
4177 if (val > 100)
4178 return -EINVAL;
4179
4180 if (cgrp->parent == NULL)
4181 return -EINVAL;
4182
4183 parent = mem_cgroup_from_cont(cgrp->parent);
4184
4185 cgroup_lock();
4186
4187
4188 if ((parent->use_hierarchy) ||
4189 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4190 cgroup_unlock();
4191 return -EINVAL;
4192 }
4193
4194 memcg->swappiness = val;
4195
4196 cgroup_unlock();
4197
4198 return 0;
4199}
4200
/*
 * Signal the eventfds of every usage threshold of @memcg that has been
 * crossed since the last call, and update the "current threshold" index.
 *
 * @swap: false to work on the memory thresholds, true for the mem+swap
 *        thresholds
 *
 * The threshold array is accessed under rcu_read_lock().  The walk relies
 * on the entries being ordered by ascending threshold (the register path
 * sorts them).
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	u64 usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	/* No thresholds registered: nothing to signal. */
	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold is the index of the last entry whose threshold
	 * was <= usage when it was last updated (-1 if there was none).
	 */
	i = t->current_threshold;

	/*
	 * Walk downwards from the current index: every entry whose threshold
	 * is now above usage has been crossed on the way down, so signal it.
	 * The loop stops at the first entry that is still <= usage.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Walk upwards from the entry after the current one: every entry
	 * whose threshold is now <= usage has been crossed on the way up, so
	 * signal it.  The loop stops at the first entry that is still above
	 * usage.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}
4251
4252static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4253{
4254 while (memcg) {
4255 __mem_cgroup_threshold(memcg, false);
4256 if (do_swap_account)
4257 __mem_cgroup_threshold(memcg, true);
4258
4259 memcg = parent_mem_cgroup(memcg);
4260 }
4261}
4262
4263static int compare_thresholds(const void *a, const void *b)
4264{
4265 const struct mem_cgroup_threshold *_a = a;
4266 const struct mem_cgroup_threshold *_b = b;
4267
4268 return _a->threshold - _b->threshold;
4269}
4270
/*
 * Signal every eventfd on @memcg's oom_notify list.  Always returns 0.
 *
 * NOTE(review): the list is walked without taking a lock here; the
 * register/unregister paths modify it under memcg_oom_lock, so the caller
 * presumably holds that lock - confirm.
 */
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd, 1);
	return 0;
}
4279
/*
 * Send OOM notifications for @memcg and for every group visited by
 * for_each_mem_cgroup_tree() starting at @memcg.
 */
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}
4287
/*
 * Register @eventfd to be signalled when the usage of @cgrp's memcg crosses
 * a threshold.
 *
 * @cft:     file the event is registered on; its private data selects the
 *           memory (_MEM) or mem+swap (_MEMSWAP) threshold set
 * @eventfd: eventfd context to signal
 * @args:    threshold as a string, parsed with
 *           res_counter_memparse_write_strategy()
 *
 * A new, one-entry-larger threshold array is built, sorted and published
 * with RCU; the previous primary array is kept as the spare.  Everything
 * after parsing runs under memcg->thresholds_lock.
 *
 * Returns 0 on success, the parse error for a bad @args, or -ENOMEM.
 */
static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 threshold, usage;
	int i, size, ret;

	ret = res_counter_memparse_write_strategy(args, &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
			GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary) {
		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
				sizeof(struct mem_cgroup_threshold));
	}

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}
4370
4371static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4372 struct cftype *cft, struct eventfd_ctx *eventfd)
4373{
4374 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4375 struct mem_cgroup_thresholds *thresholds;
4376 struct mem_cgroup_threshold_ary *new;
4377 int type = MEMFILE_TYPE(cft->private);
4378 u64 usage;
4379 int i, j, size;
4380
4381 mutex_lock(&memcg->thresholds_lock);
4382 if (type == _MEM)
4383 thresholds = &memcg->thresholds;
4384 else if (type == _MEMSWAP)
4385 thresholds = &memcg->memsw_thresholds;
4386 else
4387 BUG();
4388
4389 if (!thresholds->primary)
4390 goto unlock;
4391
4392 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4393
4394
4395 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4396
4397
4398 size = 0;
4399 for (i = 0; i < thresholds->primary->size; i++) {
4400 if (thresholds->primary->entries[i].eventfd != eventfd)
4401 size++;
4402 }
4403
4404 new = thresholds->spare;
4405
4406
4407 if (!size) {
4408 kfree(new);
4409 new = NULL;
4410 goto swap_buffers;
4411 }
4412
4413 new->size = size;
4414
4415
4416 new->current_threshold = -1;
4417 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4418 if (thresholds->primary->entries[i].eventfd == eventfd)
4419 continue;
4420
4421 new->entries[j] = thresholds->primary->entries[i];
4422 if (new->entries[j].threshold <= usage) {
4423
4424
4425
4426
4427
4428 ++new->current_threshold;
4429 }
4430 j++;
4431 }
4432
4433swap_buffers:
4434
4435 thresholds->spare = thresholds->primary;
4436
4437 if (!new) {
4438 kfree(thresholds->spare);
4439 thresholds->spare = NULL;
4440 }
4441
4442 rcu_assign_pointer(thresholds->primary, new);
4443
4444
4445 synchronize_rcu();
4446unlock:
4447 mutex_unlock(&memcg->thresholds_lock);
4448}
4449
4450static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4451 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4452{
4453 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4454 struct mem_cgroup_eventfd_list *event;
4455 int type = MEMFILE_TYPE(cft->private);
4456
4457 BUG_ON(type != _OOM_TYPE);
4458 event = kmalloc(sizeof(*event), GFP_KERNEL);
4459 if (!event)
4460 return -ENOMEM;
4461
4462 spin_lock(&memcg_oom_lock);
4463
4464 event->eventfd = eventfd;
4465 list_add(&event->list, &memcg->oom_notify);
4466
4467
4468 if (atomic_read(&memcg->under_oom))
4469 eventfd_signal(eventfd, 1);
4470 spin_unlock(&memcg_oom_lock);
4471
4472 return 0;
4473}
4474
/*
 * Remove and free every OOM notification entry of @cgrp's memcg that is
 * bound to @eventfd.
 *
 * @cft must describe an _OOM_TYPE file (anything else is a BUG).  The list
 * is modified under memcg_oom_lock.
 */
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *ev, *tmp;
	int type = MEMFILE_TYPE(cft->private);

	BUG_ON(type != _OOM_TYPE);

	spin_lock(&memcg_oom_lock);

	/* _safe variant: entries are deleted while walking the list. */
	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	spin_unlock(&memcg_oom_lock);
}
4495
4496static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4497 struct cftype *cft, struct cgroup_map_cb *cb)
4498{
4499 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4500
4501 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4502
4503 if (atomic_read(&memcg->under_oom))
4504 cb->fill(cb, "under_oom", 1);
4505 else
4506 cb->fill(cb, "under_oom", 0);
4507 return 0;
4508}
4509
4510static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4511 struct cftype *cft, u64 val)
4512{
4513 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4514 struct mem_cgroup *parent;
4515
4516
4517 if (!cgrp->parent || !((val == 0) || (val == 1)))
4518 return -EINVAL;
4519
4520 parent = mem_cgroup_from_cont(cgrp->parent);
4521
4522 cgroup_lock();
4523
4524 if ((parent->use_hierarchy) ||
4525 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4526 cgroup_unlock();
4527 return -EINVAL;
4528 }
4529 memcg->oom_kill_disable = val;
4530 if (!val)
4531 memcg_oom_recover(memcg);
4532 cgroup_unlock();
4533 return 0;
4534}
4535
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
/*
 * Kernel-memory side of memcg setup: forwards to mem_cgroup_sockets_init()
 * and returns its result.
 */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return mem_cgroup_sockets_init(memcg, ss);
}

/* Counterpart of memcg_init_kmem(): forwards to mem_cgroup_sockets_destroy(). */
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
	mem_cgroup_sockets_destroy(memcg);
}
#else
/* Without CONFIG_CGROUP_MEM_RES_CTLR_KMEM there is nothing to set up. */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return 0;
}

/* Without CONFIG_CGROUP_MEM_RES_CTLR_KMEM there is nothing to tear down. */
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
}
#endif
4556
/*
 * Control files exposed by the memory cgroup subsystem (installed through
 * mem_cgroup_subsys.base_cftypes).  For the resource-counter files,
 * .private packs a counter type (_MEM / _MEMSWAP) together with a RES_*
 * attribute via MEMFILE_PRIVATE().  The array ends with an empty entry.
 */
static struct cftype mem_cgroup_files[] = {
	/* _MEM counter: usage, also a target for eventfd registration */
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	/* _MEM counter: max usage; .trigger is mem_cgroup_reset */
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
	/* _MEM counter: limit, writable as a string */
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	/* _MEM counter: soft limit, writable as a string */
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write_string = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	/* _MEM counter: failure count; .trigger is mem_cgroup_reset */
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
	/* read-only, produced by mem_control_stat_show */
	{
		.name = "stat",
		.read_seq_string = mem_control_stat_show,
	},
	/* trigger-only file handled by mem_cgroup_force_empty_write */
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
	/* u64 read/write knobs */
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	/* OOM control: key/value read, 0/1 write, eventfd notification */
	{
		.name = "oom_control",
		.read_map = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
		.register_event = mem_cgroup_oom_register_event,
		.unregister_event = mem_cgroup_oom_unregister_event,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
#ifdef CONFIG_NUMA
	/* only built with CONFIG_NUMA */
	{
		.name = "numa_stat",
		.read_seq_string = mem_control_numa_stat_show,
	},
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
	/* _MEMSWAP counterparts of the _MEM counter files above */
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
#endif
	{ },	/* terminate */
};
4655
4656static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4657{
4658 struct mem_cgroup_per_node *pn;
4659 struct mem_cgroup_per_zone *mz;
4660 int zone, tmp = node;
4661
4662
4663
4664
4665
4666
4667
4668
4669 if (!node_state(node, N_NORMAL_MEMORY))
4670 tmp = -1;
4671 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4672 if (!pn)
4673 return 1;
4674
4675 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4676 mz = &pn->zoneinfo[zone];
4677 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4678 mz->usage_in_excess = 0;
4679 mz->on_tree = false;
4680 mz->memcg = memcg;
4681 }
4682 memcg->info.nodeinfo[node] = pn;
4683 return 0;
4684}
4685
4686static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4687{
4688 kfree(memcg->info.nodeinfo[node]);
4689}
4690
4691static struct mem_cgroup *mem_cgroup_alloc(void)
4692{
4693 struct mem_cgroup *memcg;
4694 int size = sizeof(struct mem_cgroup);
4695
4696
4697 if (size < PAGE_SIZE)
4698 memcg = kzalloc(size, GFP_KERNEL);
4699 else
4700 memcg = vzalloc(size);
4701
4702 if (!memcg)
4703 return NULL;
4704
4705 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4706 if (!memcg->stat)
4707 goto out_free;
4708 spin_lock_init(&memcg->pcp_counter_lock);
4709 return memcg;
4710
4711out_free:
4712 if (size < PAGE_SIZE)
4713 kfree(memcg);
4714 else
4715 vfree(memcg);
4716 return NULL;
4717}
4718
4719
4720
4721
4722
4723
4724static void free_work(struct work_struct *work)
4725{
4726 struct mem_cgroup *memcg;
4727 int size = sizeof(struct mem_cgroup);
4728
4729 memcg = container_of(work, struct mem_cgroup, work_freeing);
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741 disarm_sock_keys(memcg);
4742 if (size < PAGE_SIZE)
4743 kfree(memcg);
4744 else
4745 vfree(memcg);
4746}
4747
4748static void free_rcu(struct rcu_head *rcu_head)
4749{
4750 struct mem_cgroup *memcg;
4751
4752 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4753 INIT_WORK(&memcg->work_freeing, free_work);
4754 schedule_work(&memcg->work_freeing);
4755}
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
/*
 * Tear down what hangs off @memcg (tree membership, css id, per-node info,
 * per-cpu stats) and queue the structure itself for release.  The struct is
 * not freed here: call_rcu() defers that to free_rcu(), which in turn
 * schedules free_work().
 */
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_remove_from_trees(memcg);
	free_css_id(&mem_cgroup_subsys, &memcg->css);

	for_each_node(node)
		free_mem_cgroup_per_zone_info(memcg, node);

	free_percpu(memcg->stat);
	/* deferred: free_rcu() -> free_work() does the final kfree/vfree */
	call_rcu(&memcg->rcu_freeing, free_rcu);
}
4781
/* Take one reference on @memcg (memcg->refcnt); dropped by mem_cgroup_put(). */
static void mem_cgroup_get(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->refcnt);
}
4786
4787static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4788{
4789 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4790 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4791 __mem_cgroup_free(memcg);
4792 if (parent)
4793 mem_cgroup_put(parent);
4794 }
4795}
4796
/* Drop a single reference on @memcg; see __mem_cgroup_put(). */
static void mem_cgroup_put(struct mem_cgroup *memcg)
{
	__mem_cgroup_put(memcg, 1);
}
4801
4802
4803
4804
4805struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4806{
4807 if (!memcg->res.parent)
4808 return NULL;
4809 return mem_cgroup_from_res_counter(memcg->res.parent, res);
4810}
4811EXPORT_SYMBOL(parent_mem_cgroup);
4812
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4814static void __init enable_swap_cgroup(void)
4815{
4816 if (!mem_cgroup_disabled() && really_do_swap_account)
4817 do_swap_account = 1;
4818}
4819#else
4820static void __init enable_swap_cgroup(void)
4821{
4822}
4823#endif
4824
4825static int mem_cgroup_soft_limit_tree_init(void)
4826{
4827 struct mem_cgroup_tree_per_node *rtpn;
4828 struct mem_cgroup_tree_per_zone *rtpz;
4829 int tmp, node, zone;
4830
4831 for_each_node(node) {
4832 tmp = node;
4833 if (!node_state(node, N_NORMAL_MEMORY))
4834 tmp = -1;
4835 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4836 if (!rtpn)
4837 goto err_cleanup;
4838
4839 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4840
4841 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4842 rtpz = &rtpn->rb_tree_per_zone[zone];
4843 rtpz->rb_root = RB_ROOT;
4844 spin_lock_init(&rtpz->lock);
4845 }
4846 }
4847 return 0;
4848
4849err_cleanup:
4850 for_each_node(node) {
4851 if (!soft_limit_tree.rb_tree_per_node[node])
4852 break;
4853 kfree(soft_limit_tree.rb_tree_per_node[node]);
4854 soft_limit_tree.rb_tree_per_node[node] = NULL;
4855 }
4856 return 1;
4857
4858}
4859
/*
 * ->create callback of the memory cgroup subsystem: allocate and initialise
 * the mem_cgroup backing @cont.
 *
 * Returns the embedded css on success.  On failure returns ERR_PTR() of
 * -ENOMEM (allocation or soft-limit tree setup failed) or of the error
 * reported by memcg_init_kmem().
 */
static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup *cont)
{
	struct mem_cgroup *memcg, *parent;
	long error = -ENOMEM;
	int node;

	memcg = mem_cgroup_alloc();
	if (!memcg)
		return ERR_PTR(error);

	for_each_node(node)
		if (alloc_mem_cgroup_per_zone_info(memcg, node))
			goto free_out;

	/* root cgroup: also perform the one-time global setup */
	if (cont->parent == NULL) {
		int cpu;
		enable_swap_cgroup();
		parent = NULL;
		if (mem_cgroup_soft_limit_tree_init())
			goto free_out;
		root_mem_cgroup = memcg;
		for_each_possible_cpu(cpu) {
			struct memcg_stock_pcp *stock =
						&per_cpu(memcg_stock, cpu);
			INIT_WORK(&stock->work, drain_local_stock);
		}
		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
	} else {
		/* child: inherit hierarchy and oom-kill settings */
		parent = mem_cgroup_from_cont(cont->parent);
		memcg->use_hierarchy = parent->use_hierarchy;
		memcg->oom_kill_disable = parent->oom_kill_disable;
	}

	if (parent && parent->use_hierarchy) {
		res_counter_init(&memcg->res, &parent->res);
		res_counter_init(&memcg->memsw, &parent->memsw);
		/*
		 * The counters now point at the parent's, so pin the parent;
		 * __mem_cgroup_put() drops this reference when the child is
		 * freed.
		 */
		mem_cgroup_get(parent);
	} else {
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
	}
	memcg->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&memcg->oom_notify);

	if (parent)
		memcg->swappiness = mem_cgroup_swappiness(parent);
	atomic_set(&memcg->refcnt, 1);
	memcg->move_charge_at_immigrate = 0;
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);

	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
	if (error) {
		/*
		 * refcnt is already 1 here, so release through
		 * mem_cgroup_put(): when the count hits zero it also drops
		 * the parent reference taken above.
		 */
		mem_cgroup_put(memcg);
		return ERR_PTR(error);
	}
	return &memcg->css;
free_out:
	/* early failure: no refcount or parent reference set up yet */
	__mem_cgroup_free(memcg);
	return ERR_PTR(error);
}
4934
4935static int mem_cgroup_pre_destroy(struct cgroup *cont)
4936{
4937 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4938
4939 return mem_cgroup_force_empty(memcg, false);
4940}
4941
/*
 * ->destroy callback: tear down the kmem side, then drop one reference on
 * the memcg.  The structure is only freed once the count reaches zero
 * (see __mem_cgroup_put()).
 */
static void mem_cgroup_destroy(struct cgroup *cont)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	kmem_cgroup_destroy(memcg);

	mem_cgroup_put(memcg);
}
4950
4951#ifdef CONFIG_MMU
4952
/* one-by-one charging calls cond_resched() once per this many pages */
#define PRECHARGE_COUNT_AT_ONCE 256
/*
 * Reserve @count pages of charge against mc.to and record them in
 * mc.precharge.
 *
 * Returns 0 on success, -EINTR if a signal is pending during one-by-one
 * charging, or the error from __mem_cgroup_try_charge().  On error, pages
 * already added to mc.precharge stay recorded; __mem_cgroup_clear_mc()
 * cancels whatever is left there.
 */
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret = 0;
	int batch_count = PRECHARGE_COUNT_AT_ONCE;
	struct mem_cgroup *memcg = mc.to;

	if (mem_cgroup_is_root(memcg)) {
		mc.precharge += count;
		/* root: only the count is recorded, no counter is charged */
		return ret;
	}
	/* first try to charge the whole amount in one go */
	if (count > 1) {
		struct res_counter *dummy;
		/*
		 * Charge the res (and, with swap accounting, memsw) counters
		 * directly.  If either charge fails, roll back and fall
		 * through to page-by-page charging below.
		 */
		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
			goto one_by_one;
		if (do_swap_account && res_counter_charge(&memcg->memsw,
						PAGE_SIZE * count, &dummy)) {
			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
			goto one_by_one;
		}
		mc.precharge += count;
		return ret;
	}
one_by_one:
	/* fall back to charging one page at a time */
	while (count--) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!batch_count--) {
			batch_count = PRECHARGE_COUNT_AT_ONCE;
			cond_resched();
		}
		ret = __mem_cgroup_try_charge(NULL,
					GFP_KERNEL, 1, &memcg, false);
		if (ret)
			/* earlier precharges are left for the caller to undo */
			return ret;
		mc.precharge++;
	}
	return ret;
}
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
/*
 * Output of get_mctgt_type()/get_mctgt_type_thp(): which member is valid
 * depends on the returned mc_target_type (page for MC_TARGET_PAGE, ent for
 * MC_TARGET_SWAP).
 */
union mc_target {
	struct page	*page;
	swp_entry_t	ent;
};
5027
/*
 * Classification of a pte/pmd for charge moving.  MC_TARGET_NONE is 0 so
 * callers can test the result as a boolean.
 */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* nothing to move */
	MC_TARGET_PAGE,		/* a page charged to mc.from */
	MC_TARGET_SWAP,		/* a swap entry recorded against mc.from */
};
5033
5034static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5035 unsigned long addr, pte_t ptent)
5036{
5037 struct page *page = vm_normal_page(vma, addr, ptent);
5038
5039 if (!page || !page_mapped(page))
5040 return NULL;
5041 if (PageAnon(page)) {
5042
5043 if (!move_anon())
5044 return NULL;
5045 } else if (!move_file())
5046
5047 return NULL;
5048 if (!get_page_unless_zero(page))
5049 return NULL;
5050
5051 return page;
5052}
5053
5054#ifdef CONFIG_SWAP
5055static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5056 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5057{
5058 struct page *page = NULL;
5059 swp_entry_t ent = pte_to_swp_entry(ptent);
5060
5061 if (!move_anon() || non_swap_entry(ent))
5062 return NULL;
5063
5064
5065
5066
5067 page = find_get_page(&swapper_space, ent.val);
5068 if (do_swap_account)
5069 entry->val = ent.val;
5070
5071 return page;
5072}
5073#else
5074static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5075 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5076{
5077 return NULL;
5078}
5079#endif
5080
5081static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5082 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5083{
5084 struct page *page = NULL;
5085 struct address_space *mapping;
5086 pgoff_t pgoff;
5087
5088 if (!vma->vm_file)
5089 return NULL;
5090 if (!move_file())
5091 return NULL;
5092
5093 mapping = vma->vm_file->f_mapping;
5094 if (pte_none(ptent))
5095 pgoff = linear_page_index(vma, addr);
5096 else
5097 pgoff = pte_to_pgoff(ptent);
5098
5099
5100 page = find_get_page(mapping, pgoff);
5101
5102#ifdef CONFIG_SWAP
5103
5104 if (radix_tree_exceptional_entry(page)) {
5105 swp_entry_t swap = radix_to_swp_entry(page);
5106 if (do_swap_account)
5107 *entry = swap;
5108 page = find_get_page(&swapper_space, swap.val);
5109 }
5110#endif
5111 return page;
5112}
5113
/*
 * Decide whether the pte at @addr is a target for charge moving.
 *
 * Returns:
 *   MC_TARGET_NONE - nothing to move;
 *   MC_TARGET_PAGE - the pte resolves to a page charged to mc.from;
 *                    if @target is non-NULL, target->page is set and the
 *                    page reference is handed to the caller;
 *   MC_TARGET_SWAP - there is a swap entry whose recorded cgroup id is
 *                    mc.from's; if @target is non-NULL, target->ent is set.
 *
 * With @target == NULL the function only classifies and keeps no page
 * reference.
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	struct page_cgroup *pc;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent) || pte_file(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return ret;
	if (page) {
		pc = lookup_page_cgroup(page);
		/*
		 * NOTE(review): pc is inspected without any lock taken in
		 * this function - presumably the later move re-checks it
		 * under lock; confirm against mem_cgroup_move_account().
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		/* keep the reference only when the page is handed out */
		if (!ret || !target)
			put_page(page);
	}
	/* a swap entry exists and no page was accepted as target */
	if (ent.val && !ret &&
			css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}
5155
5156#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5157
5158
5159
5160
5161
5162static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5163 unsigned long addr, pmd_t pmd, union mc_target *target)
5164{
5165 struct page *page = NULL;
5166 struct page_cgroup *pc;
5167 enum mc_target_type ret = MC_TARGET_NONE;
5168
5169 page = pmd_page(pmd);
5170 VM_BUG_ON(!page || !PageHead(page));
5171 if (!move_anon())
5172 return ret;
5173 pc = lookup_page_cgroup(page);
5174 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5175 ret = MC_TARGET_PAGE;
5176 if (target) {
5177 get_page(page);
5178 target->page = page;
5179 }
5180 }
5181 return ret;
5182}
5183#else
5184static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5185 unsigned long addr, pmd_t pmd, union mc_target *target)
5186{
5187 return MC_TARGET_NONE;
5188}
5189#endif
5190
/*
 * pmd_entry callback of the counting walk: add to mc.precharge the number
 * of pages in [addr, end) that would be moved.  walk->private carries the
 * vma.  Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	/* huge pmd: counts as HPAGE_PMD_NR pages if it is a page target */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	/* target == NULL: classify only, no reference is kept */
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	/* pte now points one past the last entry visited */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}
5217
5218static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5219{
5220 unsigned long precharge;
5221 struct vm_area_struct *vma;
5222
5223 down_read(&mm->mmap_sem);
5224 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5225 struct mm_walk mem_cgroup_count_precharge_walk = {
5226 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5227 .mm = mm,
5228 .private = vma,
5229 };
5230 if (is_vm_hugetlb_page(vma))
5231 continue;
5232 walk_page_range(vma->vm_start, vma->vm_end,
5233 &mem_cgroup_count_precharge_walk);
5234 }
5235 up_read(&mm->mmap_sem);
5236
5237 precharge = mc.precharge;
5238 mc.precharge = 0;
5239
5240 return precharge;
5241}
5242
5243static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5244{
5245 unsigned long precharge = mem_cgroup_count_precharge(mm);
5246
5247 VM_BUG_ON(mc.moving_task);
5248 mc.moving_task = current;
5249 return mem_cgroup_do_precharge(precharge);
5250}
5251
5252
/*
 * Settle the accounting of the current charge move: cancel leftover
 * precharges, fix up charges for what was moved, then run OOM recovery on
 * both cgroups and wake everyone waiting on mc.waitq.  mc.from / mc.to are
 * left set; mem_cgroup_clear_mc() resets them.
 */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* return the unused part of the reservation taken on mc.to */
	if (mc.precharge) {
		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * For every page counted in moved_charge, cancel the same amount of
	 * charge on mc.from.
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* fix up charges and references for moved swap entries */
	if (mc.moved_swap) {
		/* uncharge the swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
						PAGE_SIZE * mc.moved_swap);
		__mem_cgroup_put(mc.from, mc.moved_swap);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * Only to->res is uncharged here; to->memsw is left
			 * as is.  NOTE(review): presumably the precharge
			 * charged both counters and a swap entry only needs
			 * the memsw part - confirm.
			 */
			res_counter_uncharge(&mc.to->res,
						PAGE_SIZE * mc.moved_swap);
		}
		/* no reference on mc.to is taken or dropped here */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}
5294
/*
 * End the current charge move: settle accounting, reset mc.from / mc.to
 * under mc.lock, and call mem_cgroup_end_move() on the source cgroup.
 */
static void mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;

	/*
	 * moving_task is cleared before __mem_cgroup_clear_mc() wakes the
	 * mc.waitq waiters (presumably so they do not see a stale mover).
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
	/* uses the saved pointer: mc.from is already NULL here */
	mem_cgroup_end_move(from);
}
5311
5312static int mem_cgroup_can_attach(struct cgroup *cgroup,
5313 struct cgroup_taskset *tset)
5314{
5315 struct task_struct *p = cgroup_taskset_first(tset);
5316 int ret = 0;
5317 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5318
5319 if (memcg->move_charge_at_immigrate) {
5320 struct mm_struct *mm;
5321 struct mem_cgroup *from = mem_cgroup_from_task(p);
5322
5323 VM_BUG_ON(from == memcg);
5324
5325 mm = get_task_mm(p);
5326 if (!mm)
5327 return 0;
5328
5329 if (mm->owner == p) {
5330 VM_BUG_ON(mc.from);
5331 VM_BUG_ON(mc.to);
5332 VM_BUG_ON(mc.precharge);
5333 VM_BUG_ON(mc.moved_charge);
5334 VM_BUG_ON(mc.moved_swap);
5335 mem_cgroup_start_move(from);
5336 spin_lock(&mc.lock);
5337 mc.from = from;
5338 mc.to = memcg;
5339 spin_unlock(&mc.lock);
5340
5341
5342 ret = mem_cgroup_precharge_mc(mm);
5343 if (ret)
5344 mem_cgroup_clear_mc();
5345 }
5346 mmput(mm);
5347 }
5348 return ret;
5349}
5350
/* ->cancel_attach callback: drop the move context via mem_cgroup_clear_mc(). */
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}
5356
/*
 * pmd_entry callback of the moving walk: move the charge of every target
 * page / swap entry in [addr, end) from mc.from to mc.to, consuming one
 * unit of mc.precharge per moved page.  walk->private carries the vma.
 *
 * Returns 0, or the error from mem_cgroup_do_precharge() when the
 * reservation ran out and could not be extended.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;
	struct page_cgroup *pc;

	/*
	 * Huge pmd: handled as a single unit of HPAGE_PMD_NR pages while
	 * holding the page table lock acquired by pmd_trans_huge_lock().
	 * NOTE(review): no compound lock is taken here - presumably the
	 * page table lock is enough to exclude a concurrent THP split;
	 * confirm.
	 */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* not enough reservation left for a whole huge page: skip */
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(&vma->vm_mm->page_table_lock);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				pc = lookup_page_cgroup(page);
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							pc, mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			/* drop the reference taken by get_mctgt_type_thp() */
			put_page(page);
		}
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		swp_entry_t ent;

		/* reservation used up: leave the loop with addr != end */
		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(page, 1, pc,
						     mc.from, mc.to)) {
				mc.precharge--;
				/* the old charge is cancelled in __mem_cgroup_clear_mc() */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* drop the reference taken by get_mctgt_type() */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* fixed up in __mem_cgroup_clear_mc() */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * The loop stopped early because mc.precharge hit zero.
		 * Try to reserve one more page and resume at addr; give up
		 * with the error if that charge fails.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}
5459
/*
 * Move the charges of @mm's pages from mc.from to mc.to by walking every
 * non-hugetlb vma with mem_cgroup_move_charge_pte_range().  The walk stops
 * at the first vma for which it returns an error; no error is reported to
 * the caller.
 */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * mmap_sem is contended.  Cancel the outstanding precharges
		 * and wake the mc.waitq waiters (both done by
		 * __mem_cgroup_clear_mc()), then try again - presumably the
		 * lock holder may itself be waiting on mc.waitq.  With the
		 * reservation cancelled, fewer charges than counted may end
		 * up being moved.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * The walk callback only fails when it could not
			 * extend the reservation; stop moving here.
			 */
			break;
	}
	up_read(&mm->mmap_sem);
}
5498
/*
 * ->attach callback: if a move context is active (mc.to set), move the
 * charges of the first task's mm and then clear the move context.
 */
static void mem_cgroup_move_task(struct cgroup *cont,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	struct mm_struct *mm = get_task_mm(p);

	if (mm) {
		if (mc.to)
			mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	/* the context is cleared even when the task has no mm */
	if (mc.to)
		mem_cgroup_clear_mc();
}
5513#else
/* !CONFIG_MMU: attach callbacks are no-op stubs. */
static int mem_cgroup_can_attach(struct cgroup *cgroup,
				 struct cgroup_taskset *tset)
{
	/* always allow the attach */
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(struct cgroup *cont,
				 struct cgroup_taskset *tset)
{
}
5527#endif
5528
/*
 * The "memory" cgroup subsystem descriptor: wires the lifecycle and
 * task-attach callbacks defined in this file and the mem_cgroup_files
 * control-file table into the cgroup core.
 */
struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.attach = mem_cgroup_move_task,
	.base_cftypes = mem_cgroup_files,
	.early_init = 0,
	.use_id = 1,
	.__DEPRECATED_clear_css_refs = true,
};
5543
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5545static int __init enable_swap_account(char *s)
5546{
5547
5548 if (!strcmp(s, "1"))
5549 really_do_swap_account = 1;
5550 else if (!strcmp(s, "0"))
5551 really_do_swap_account = 0;
5552 return 1;
5553}
5554__setup("swapaccount=", enable_swap_account);
5555
5556#endif
5557