#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>
64
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Whether memory+swap accounting is active (config default + boot option). */
int do_swap_account __read_mostly;

/* Remembers the swap-accounting boot default. */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif
85
86
87
88
89
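/*
 * Per-memcg statistics, accumulated in per-cpu counters (see struct
 * mem_cgroup_stat_cpu below) and summed up on demand.
 */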
enum mem_cgroup_stat_index {
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of mapped file pages */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"mapped_file",
	"swap",
};

/* Per-memcg event counters, also kept per cpu. */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};
122
123
124
125
126
127
128
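/*
 * Per-memcg "events" that are only acted upon once a number of page
 * events has accumulated: threshold notification, soft limit tree
 * update and, on NUMA, refresh of the reclaimable-node mask.  Each has
 * its own rate target below.
 */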
129enum mem_cgroup_events_target {
130 MEM_CGROUP_TARGET_THRESH,
131 MEM_CGROUP_TARGET_SOFTLIMIT,
132 MEM_CGROUP_TARGET_NUMAINFO,
133 MEM_CGROUP_NTARGETS,
134};
135#define THRESHOLDS_EVENTS_TARGET 128
136#define SOFTLIMIT_EVENTS_TARGET 1024
137#define NUMAINFO_EVENTS_TARGET 1024
138
139struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS];
144};
145
struct mem_cgroup_reclaim_iter {
	/* css_id of the last scanned hierarchy member */
	int position;
	/* scan generation, incremented on every round-trip */
	unsigned int generation;
};
152
153
154
155
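/*
 * per-zone information in memory controller.
 */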
156struct mem_cgroup_per_zone {
157 struct lruvec lruvec;
158 unsigned long lru_size[NR_LRU_LISTS];
159
160 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
161
162 struct rb_node tree_node;
163 unsigned long long usage_in_excess;
164
165 bool on_tree;
166 struct mem_cgroup *memcg;
167
168};
169
170struct mem_cgroup_per_node {
171 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
172};
173
174struct mem_cgroup_lru_info {
175 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
176};
177
178
179
180
181
182
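/*
 * Memcgs that exceed their soft limit are kept in per-node, per-zone
 * red-black trees (soft_limit_tree), ordered by how far usage exceeds
 * the soft limit.
 */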
183struct mem_cgroup_tree_per_zone {
184 struct rb_root rb_root;
185 spinlock_t lock;
186};
187
188struct mem_cgroup_tree_per_node {
189 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
190};
191
192struct mem_cgroup_tree {
193 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
194};
195
196static struct mem_cgroup_tree soft_limit_tree __read_mostly;
197
198struct mem_cgroup_threshold {
199 struct eventfd_ctx *eventfd;
200 u64 threshold;
201};
202
203
struct mem_cgroup_threshold_ary {
	/* Array index pointing to the threshold just below or equal to usage */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};
212
213struct mem_cgroup_thresholds {
214
215 struct mem_cgroup_threshold_ary *primary;
216
217
218
219
220
221 struct mem_cgroup_threshold_ary *spare;
222};
223
224
225struct mem_cgroup_eventfd_list {
226 struct list_head list;
227 struct eventfd_ctx *eventfd;
228};
229
230static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232
243
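/*
 * The memory controller data structure.  It accounts page cache and
 * anonymous (RSS) pages per cgroup, with optional memory+swap (memsw)
 * and kernel memory (kmem) counters.
 */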
244struct mem_cgroup {
245 struct cgroup_subsys_state css;
246
247
248
249 struct res_counter res;
250
251 union {
252
253
254
255 struct res_counter memsw;
256
257
258
259
260
261
262
263
264
265
266 struct rcu_head rcu_freeing;
267
268
269
270
271 struct work_struct work_freeing;
272 };
273
274
275
276
277 struct res_counter kmem;
278
279
280
281
282 struct mem_cgroup_lru_info info;
283 int last_scanned_node;
284#if MAX_NUMNODES > 1
285 nodemask_t scan_nodes;
286 atomic_t numainfo_events;
287 atomic_t numainfo_updating;
288#endif
289
290
291
292 bool use_hierarchy;
293 unsigned long kmem_account_flags;
294
295 bool oom_lock;
296 atomic_t under_oom;
297
298 atomic_t refcnt;
299
300 int swappiness;
301
302 int oom_kill_disable;
303
304
305 bool memsw_is_minimum;
306
307
308 struct mutex thresholds_lock;
309
310
311 struct mem_cgroup_thresholds thresholds;
312
313
314 struct mem_cgroup_thresholds memsw_thresholds;
315
316
317 struct list_head oom_notify;
318
319
320
321
322
323 unsigned long move_charge_at_immigrate;
324
325
326
327 atomic_t moving_account;
328
329 spinlock_t move_lock;
330
331
332
333 struct mem_cgroup_stat_cpu __percpu *stat;
334
335
336
337
338 struct mem_cgroup_stat_cpu nocpu_base;
339 spinlock_t pcp_counter_lock;
340
341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
342 struct tcp_memcontrol tcp_mem;
343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345
346 struct list_head memcg_slab_caches;
347
348 struct mutex slab_caches_mutex;
349
350 int kmemcg_id;
351#endif
352};
353
354
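/*
 * Internal state bits kept in memcg->kmem_account_flags; ACTIVE and
 * ACTIVATED together form KMEM_ACCOUNTED_MASK below.
 */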
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* activation committed, see memcg_update_cache_sizes() */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
360
361
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
398
399
400
401
402
403
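/*
 * Types of charges that can follow a task when it migrates to another
 * memcg, selected per target memcg through move_charge_at_immigrate
 * (see move_anon()/move_file() below).
 */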
404enum move_type {
405 MOVE_CHARGE_TYPE_ANON,
406 MOVE_CHARGE_TYPE_FILE,
407 NR_MOVE_TYPE,
408};
409
410
411static struct move_charge_struct {
412 spinlock_t lock;
413 struct mem_cgroup *from;
414 struct mem_cgroup *to;
415 unsigned long precharge;
416 unsigned long moved_charge;
417 unsigned long moved_swap;
418 struct task_struct *moving_task;
419 wait_queue_head_t waitq;
420} mc = {
421 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
422 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
423};
424
425static bool move_anon(void)
426{
427 return test_bit(MOVE_CHARGE_TYPE_ANON,
428 &mc.to->move_charge_at_immigrate);
429}
430
431static bool move_file(void)
432{
433 return test_bit(MOVE_CHARGE_TYPE_FILE,
434 &mc.to->move_charge_at_immigrate);
435}
436
437
438
439
440
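/*
 * Upper bounds on the number of reclaim retry loops, for regular
 * charge reclaim and for soft limit reclaim respectively.
 */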
441#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
442#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
443
444enum charge_type {
445 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
446 MEM_CGROUP_CHARGE_TYPE_ANON,
447 MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
448 MEM_CGROUP_CHARGE_TYPE_DROP,
449 NR_CHARGE_TYPE,
450};
451
452
453enum res_type {
454 _MEM,
455 _MEMSWAP,
456 _OOM_TYPE,
457 _KMEM,
458};
459
460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
462#define MEMFILE_ATTR(val) ((val) & 0xffff)
463
464#define OOM_CONTROL (0)
465
466
467
468
469#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
470#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
471#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
472#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
473
474static void mem_cgroup_get(struct mem_cgroup *memcg);
475static void mem_cgroup_put(struct mem_cgroup *memcg);
476
477static inline
478struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
479{
480 return container_of(s, struct mem_cgroup, css);
481}
482
483static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
484{
485 return (memcg == root_mem_cgroup);
486}
487
488
489#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
490
491void sock_update_memcg(struct sock *sk)
492{
493 if (mem_cgroup_sockets_enabled) {
494 struct mem_cgroup *memcg;
495 struct cg_proto *cg_proto;
496
497 BUG_ON(!sk->sk_prot->proto_cgroup);
498
499
500
501
502
503
504
505
506
507 if (sk->sk_cgrp) {
508 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
509 mem_cgroup_get(sk->sk_cgrp->memcg);
510 return;
511 }
512
513 rcu_read_lock();
514 memcg = mem_cgroup_from_task(current);
515 cg_proto = sk->sk_prot->proto_cgroup(memcg);
516 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
517 mem_cgroup_get(memcg);
518 sk->sk_cgrp = cg_proto;
519 }
520 rcu_read_unlock();
521 }
522}
523EXPORT_SYMBOL(sock_update_memcg);
524
525void sock_release_memcg(struct sock *sk)
526{
527 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
528 struct mem_cgroup *memcg;
529 WARN_ON(!sk->sk_cgrp->memcg);
530 memcg = sk->sk_cgrp->memcg;
531 mem_cgroup_put(memcg);
532 }
533}
534
535struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
536{
537 if (!memcg || mem_cgroup_is_root(memcg))
538 return NULL;
539
540 return &memcg->tcp_mem.cg_proto;
541}
542EXPORT_SYMBOL(tcp_proto_cgroup);
543
544static void disarm_sock_keys(struct mem_cgroup *memcg)
545{
546 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
547 return;
548 static_key_slow_dec(&memcg_socket_limit_enabled);
549}
550#else
551static void disarm_sock_keys(struct mem_cgroup *memcg)
552{
553}
554#endif
555
556#ifdef CONFIG_MEMCG_KMEM
557
573
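/*
 * This ida hands out a unique index to each memcg with kmem accounting
 * enabled (see memcg_update_cache_sizes()).  The index is used to look
 * up per-memcg kmem caches, so cache arrays must be able to hold at
 * least memcg_limited_groups_array_size entries.
 */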
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577
578
579
580
581
582
583
584
585
586
587
588
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592
593
594
595
596
597
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607
608
609
610
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
625static void drain_all_stock_async(struct mem_cgroup *memcg);
626
627static struct mem_cgroup_per_zone *
628mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
629{
630 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
631}
632
633struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
634{
635 return &memcg->css;
636}
637
638static struct mem_cgroup_per_zone *
639page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
640{
641 int nid = page_to_nid(page);
642 int zid = page_zonenum(page);
643
644 return mem_cgroup_zoneinfo(memcg, nid, zid);
645}
646
647static struct mem_cgroup_tree_per_zone *
648soft_limit_tree_node_zone(int nid, int zid)
649{
650 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
651}
652
653static struct mem_cgroup_tree_per_zone *
654soft_limit_tree_from_page(struct page *page)
655{
656 int nid = page_to_nid(page);
657 int zid = page_zonenum(page);
658
659 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
660}
661
662static void
663__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
664 struct mem_cgroup_per_zone *mz,
665 struct mem_cgroup_tree_per_zone *mctz,
666 unsigned long long new_usage_in_excess)
667{
668 struct rb_node **p = &mctz->rb_root.rb_node;
669 struct rb_node *parent = NULL;
670 struct mem_cgroup_per_zone *mz_node;
671
672 if (mz->on_tree)
673 return;
674
675 mz->usage_in_excess = new_usage_in_excess;
676 if (!mz->usage_in_excess)
677 return;
678 while (*p) {
679 parent = *p;
680 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
681 tree_node);
682 if (mz->usage_in_excess < mz_node->usage_in_excess)
683 p = &(*p)->rb_left;
684
685
686
687
688 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
689 p = &(*p)->rb_right;
690 }
691 rb_link_node(&mz->tree_node, parent, p);
692 rb_insert_color(&mz->tree_node, &mctz->rb_root);
693 mz->on_tree = true;
694}
695
696static void
697__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
698 struct mem_cgroup_per_zone *mz,
699 struct mem_cgroup_tree_per_zone *mctz)
700{
701 if (!mz->on_tree)
702 return;
703 rb_erase(&mz->tree_node, &mctz->rb_root);
704 mz->on_tree = false;
705}
706
707static void
708mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
709 struct mem_cgroup_per_zone *mz,
710 struct mem_cgroup_tree_per_zone *mctz)
711{
712 spin_lock(&mctz->lock);
713 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
714 spin_unlock(&mctz->lock);
715}
716
717
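/*
 * Propagate a charge event into the soft limit tree: for @memcg and
 * each ancestor, (re)insert the per-zone node keyed by how far usage
 * exceeds the soft limit, or leave it off the tree when there is no
 * excess.
 */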
718static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
719{
720 unsigned long long excess;
721 struct mem_cgroup_per_zone *mz;
722 struct mem_cgroup_tree_per_zone *mctz;
723 int nid = page_to_nid(page);
724 int zid = page_zonenum(page);
725 mctz = soft_limit_tree_from_page(page);
726
727
728
729
730
731 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
732 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
733 excess = res_counter_soft_limit_excess(&memcg->res);
734
735
736
737
738 if (excess || mz->on_tree) {
739 spin_lock(&mctz->lock);
740
741 if (mz->on_tree)
742 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
743
744
745
746
747 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
748 spin_unlock(&mctz->lock);
749 }
750 }
751}
752
753static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
754{
755 int node, zone;
756 struct mem_cgroup_per_zone *mz;
757 struct mem_cgroup_tree_per_zone *mctz;
758
759 for_each_node(node) {
760 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
761 mz = mem_cgroup_zoneinfo(memcg, node, zone);
762 mctz = soft_limit_tree_node_zone(node, zone);
763 mem_cgroup_remove_exceeded(memcg, mz, mctz);
764 }
765 }
766}
767
768static struct mem_cgroup_per_zone *
769__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
770{
771 struct rb_node *rightmost = NULL;
772 struct mem_cgroup_per_zone *mz;
773
774retry:
775 mz = NULL;
776 rightmost = rb_last(&mctz->rb_root);
777 if (!rightmost)
778 goto done;
779
780 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
781
782
783
784
785
786 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
787 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
788 !css_tryget(&mz->memcg->css))
789 goto retry;
790done:
791 return mz;
792}
793
794static struct mem_cgroup_per_zone *
795mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
796{
797 struct mem_cgroup_per_zone *mz;
798
799 spin_lock(&mctz->lock);
800 mz = __mem_cgroup_largest_soft_limit_node(mctz);
801 spin_unlock(&mctz->lock);
802 return mz;
803}
804
823
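/*
 * Per-cpu statistics are summed lazily: the readers below add up the
 * counters of all online cpus plus nocpu_base, which accumulates the
 * counts folded in from cpus that have gone offline (see
 * mem_cgroup_drain_pcp_counter()).  The result may be transiently
 * inaccurate while updates race with the read.
 */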
824static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
825 enum mem_cgroup_stat_index idx)
826{
827 long val = 0;
828 int cpu;
829
830 get_online_cpus();
831 for_each_online_cpu(cpu)
832 val += per_cpu(memcg->stat->count[idx], cpu);
833#ifdef CONFIG_HOTPLUG_CPU
834 spin_lock(&memcg->pcp_counter_lock);
835 val += memcg->nocpu_base.count[idx];
836 spin_unlock(&memcg->pcp_counter_lock);
837#endif
838 put_online_cpus();
839 return val;
840}
841
842static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
843 bool charge)
844{
845 int val = (charge) ? 1 : -1;
846 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
847}
848
849static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
850 enum mem_cgroup_events_index idx)
851{
852 unsigned long val = 0;
853 int cpu;
854
855 for_each_online_cpu(cpu)
856 val += per_cpu(memcg->stat->events[idx], cpu);
857#ifdef CONFIG_HOTPLUG_CPU
858 spin_lock(&memcg->pcp_counter_lock);
859 val += memcg->nocpu_base.events[idx];
860 spin_unlock(&memcg->pcp_counter_lock);
861#endif
862 return val;
863}
864
865static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
866 bool anon, int nr_pages)
867{
868 preempt_disable();
869
870
871
872
873
874 if (anon)
875 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
876 nr_pages);
877 else
878 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
879 nr_pages);
880
881
882 if (nr_pages > 0)
883 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
884 else {
885 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
886 nr_pages = -nr_pages;
887 }
888
889 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
890
891 preempt_enable();
892}
893
894unsigned long
895mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
896{
897 struct mem_cgroup_per_zone *mz;
898
899 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
900 return mz->lru_size[lru];
901}
902
903static unsigned long
904mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
905 unsigned int lru_mask)
906{
907 struct mem_cgroup_per_zone *mz;
908 enum lru_list lru;
909 unsigned long ret = 0;
910
911 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
912
913 for_each_lru(lru) {
914 if (BIT(lru) & lru_mask)
915 ret += mz->lru_size[lru];
916 }
917 return ret;
918}
919
920static unsigned long
921mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
922 int nid, unsigned int lru_mask)
923{
924 u64 total = 0;
925 int zid;
926
927 for (zid = 0; zid < MAX_NR_ZONES; zid++)
928 total += mem_cgroup_zone_nr_lru_pages(memcg,
929 nid, zid, lru_mask);
930
931 return total;
932}
933
934static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
935 unsigned int lru_mask)
936{
937 int nid;
938 u64 total = 0;
939
940 for_each_node_state(nid, N_MEMORY)
941 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
942 return total;
943}
944
945static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
946 enum mem_cgroup_events_target target)
947{
948 unsigned long val, next;
949
950 val = __this_cpu_read(memcg->stat->nr_page_events);
951 next = __this_cpu_read(memcg->stat->targets[target]);
952
953 if ((long)next - (long)val < 0) {
954 switch (target) {
955 case MEM_CGROUP_TARGET_THRESH:
956 next = val + THRESHOLDS_EVENTS_TARGET;
957 break;
958 case MEM_CGROUP_TARGET_SOFTLIMIT:
959 next = val + SOFTLIMIT_EVENTS_TARGET;
960 break;
961 case MEM_CGROUP_TARGET_NUMAINFO:
962 next = val + NUMAINFO_EVENTS_TARGET;
963 break;
964 default:
965 break;
966 }
967 __this_cpu_write(memcg->stat->targets[target], next);
968 return true;
969 }
970 return false;
971}
972
973
974
975
976
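/*
 * Called from the charge/uncharge paths.  Once enough page events have
 * accumulated on this cpu, re-check eventfd thresholds, update the
 * soft limit tree and, on NUMA, request a refresh of the reclaimable
 * node mask.
 */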
977static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
978{
979 preempt_disable();
980
981 if (unlikely(mem_cgroup_event_ratelimit(memcg,
982 MEM_CGROUP_TARGET_THRESH))) {
983 bool do_softlimit;
984 bool do_numainfo __maybe_unused;
985
986 do_softlimit = mem_cgroup_event_ratelimit(memcg,
987 MEM_CGROUP_TARGET_SOFTLIMIT);
988#if MAX_NUMNODES > 1
989 do_numainfo = mem_cgroup_event_ratelimit(memcg,
990 MEM_CGROUP_TARGET_NUMAINFO);
991#endif
992 preempt_enable();
993
994 mem_cgroup_threshold(memcg);
995 if (unlikely(do_softlimit))
996 mem_cgroup_update_tree(memcg, page);
997#if MAX_NUMNODES > 1
998 if (unlikely(do_numainfo))
999 atomic_inc(&memcg->numainfo_events);
1000#endif
1001 } else
1002 preempt_enable();
1003}
1004
1005struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
1006{
1007 return mem_cgroup_from_css(
1008 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
1009}
1010
1011struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1012{
1013
1014
1015
1016
1017
1018 if (unlikely(!p))
1019 return NULL;
1020
1021 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
1022}
1023
1024struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1025{
1026 struct mem_cgroup *memcg = NULL;
1027
1028 if (!mm)
1029 return NULL;
1030
1031
1032
1033
1034
1035 rcu_read_lock();
1036 do {
1037 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1038 if (unlikely(!memcg))
1039 break;
1040 } while (!css_tryget(&memcg->css));
1041 rcu_read_unlock();
1042 return memcg;
1043}
1044
1061
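/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 */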
1062struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1063 struct mem_cgroup *prev,
1064 struct mem_cgroup_reclaim_cookie *reclaim)
1065{
1066 struct mem_cgroup *memcg = NULL;
1067 int id = 0;
1068
1069 if (mem_cgroup_disabled())
1070 return NULL;
1071
1072 if (!root)
1073 root = root_mem_cgroup;
1074
1075 if (prev && !reclaim)
1076 id = css_id(&prev->css);
1077
1078 if (prev && prev != root)
1079 css_put(&prev->css);
1080
1081 if (!root->use_hierarchy && root != root_mem_cgroup) {
1082 if (prev)
1083 return NULL;
1084 return root;
1085 }
1086
1087 while (!memcg) {
1088 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1089 struct cgroup_subsys_state *css;
1090
1091 if (reclaim) {
1092 int nid = zone_to_nid(reclaim->zone);
1093 int zid = zone_idx(reclaim->zone);
1094 struct mem_cgroup_per_zone *mz;
1095
1096 mz = mem_cgroup_zoneinfo(root, nid, zid);
1097 iter = &mz->reclaim_iter[reclaim->priority];
1098 if (prev && reclaim->generation != iter->generation)
1099 return NULL;
1100 id = iter->position;
1101 }
1102
1103 rcu_read_lock();
1104 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
1105 if (css) {
1106 if (css == &root->css || css_tryget(css))
1107 memcg = mem_cgroup_from_css(css);
1108 } else
1109 id = 0;
1110 rcu_read_unlock();
1111
1112 if (reclaim) {
1113 iter->position = id;
1114 if (!css)
1115 iter->generation++;
1116 else if (!prev && memcg)
1117 reclaim->generation = iter->generation;
1118 }
1119
1120 if (prev && !css)
1121 return NULL;
1122 }
1123 return memcg;
1124}
1125
1126
1127
1128
1129
1130
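/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */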
1131void mem_cgroup_iter_break(struct mem_cgroup *root,
1132 struct mem_cgroup *prev)
1133{
1134 if (!root)
1135 root = root_mem_cgroup;
1136 if (prev && prev != root)
1137 css_put(&prev->css);
1138}
1139
1140
1141
1142
1143
1144
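/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for cleanup.
 */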
1145#define for_each_mem_cgroup_tree(iter, root) \
1146 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1147 iter != NULL; \
1148 iter = mem_cgroup_iter(root, iter, NULL))
1149
1150#define for_each_mem_cgroup(iter) \
1151 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1152 iter != NULL; \
1153 iter = mem_cgroup_iter(NULL, iter, NULL))
1154
1155void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1156{
1157 struct mem_cgroup *memcg;
1158
1159 rcu_read_lock();
1160 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1161 if (unlikely(!memcg))
1162 goto out;
1163
1164 switch (idx) {
1165 case PGFAULT:
1166 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1167 break;
1168 case PGMAJFAULT:
1169 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1170 break;
1171 default:
1172 BUG();
1173 }
1174out:
1175 rcu_read_unlock();
1176}
1177EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
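/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec when the memory
 * controller is disabled.
 */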
1188struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1189 struct mem_cgroup *memcg)
1190{
1191 struct mem_cgroup_per_zone *mz;
1192 struct lruvec *lruvec;
1193
1194 if (mem_cgroup_disabled()) {
1195 lruvec = &zone->lruvec;
1196 goto out;
1197 }
1198
1199 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1200 lruvec = &mz->lruvec;
1201out:
1202
1203
1204
1205
1206
1207 if (unlikely(lruvec->zone != zone))
1208 lruvec->zone = zone;
1209 return lruvec;
1210}
1211
1230
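/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 *
 * The page's memcg is looked up through its page_cgroup; pages that
 * are not (or no longer) charged are attributed to the root memcg.
 */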
1231struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1232{
1233 struct mem_cgroup_per_zone *mz;
1234 struct mem_cgroup *memcg;
1235 struct page_cgroup *pc;
1236 struct lruvec *lruvec;
1237
1238 if (mem_cgroup_disabled()) {
1239 lruvec = &zone->lruvec;
1240 goto out;
1241 }
1242
1243 pc = lookup_page_cgroup(page);
1244 memcg = pc->mem_cgroup;
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1256 pc->mem_cgroup = memcg = root_mem_cgroup;
1257
1258 mz = page_cgroup_zoneinfo(memcg, page);
1259 lruvec = &mz->lruvec;
1260out:
1261
1262
1263
1264
1265
1266 if (unlikely(lruvec->zone != zone))
1267 lruvec->zone = zone;
1268 return lruvec;
1269}
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
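/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 */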
1280void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1281 int nr_pages)
1282{
1283 struct mem_cgroup_per_zone *mz;
1284 unsigned long *lru_size;
1285
1286 if (mem_cgroup_disabled())
1287 return;
1288
1289 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1290 lru_size = mz->lru_size + lru;
1291 *lru_size += nr_pages;
1292 VM_BUG_ON((long)(*lru_size) < 0);
1293}
1294
1295
1296
1297
1298
1299bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1300 struct mem_cgroup *memcg)
1301{
1302 if (root_memcg == memcg)
1303 return true;
1304 if (!root_memcg->use_hierarchy || !memcg)
1305 return false;
1306 return css_is_ancestor(&memcg->css, &root_memcg->css);
1307}
1308
1309static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1310 struct mem_cgroup *memcg)
1311{
1312 bool ret;
1313
1314 rcu_read_lock();
1315 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1316 rcu_read_unlock();
1317 return ret;
1318}
1319
1320int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1321{
1322 int ret;
1323 struct mem_cgroup *curr = NULL;
1324 struct task_struct *p;
1325
1326 p = find_lock_task_mm(task);
1327 if (p) {
1328 curr = try_get_mem_cgroup_from_mm(p->mm);
1329 task_unlock(p);
1330 } else {
1331
1332
1333
1334
1335
1336 task_lock(task);
1337 curr = mem_cgroup_from_task(task);
1338 if (curr)
1339 css_get(&curr->css);
1340 task_unlock(task);
1341 }
1342 if (!curr)
1343 return 0;
1344
1345
1346
1347
1348
1349
1350 ret = mem_cgroup_same_or_subtree(memcg, curr);
1351 css_put(&curr->css);
1352 return ret;
1353}
1354
1355int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1356{
1357 unsigned long inactive_ratio;
1358 unsigned long inactive;
1359 unsigned long active;
1360 unsigned long gb;
1361
1362 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1363 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1364
1365 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1366 if (gb)
1367 inactive_ratio = int_sqrt(10 * gb);
1368 else
1369 inactive_ratio = 1;
1370
1371 return inactive * inactive_ratio < active;
1372}
1373
1374int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1375{
1376 unsigned long active;
1377 unsigned long inactive;
1378
1379 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1380 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1381
1382 return (active > inactive);
1383}
1384
1385#define mem_cgroup_from_res_counter(counter, member) \
1386 container_of(counter, struct mem_cgroup, member)
1387
1388
1389
1390
1391
1392
1393
1394
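/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */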
1395static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1396{
1397 unsigned long long margin;
1398
1399 margin = res_counter_margin(&memcg->res);
1400 if (do_swap_account)
1401 margin = min(margin, res_counter_margin(&memcg->memsw));
1402 return margin >> PAGE_SHIFT;
1403}
1404
1405int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1406{
1407 struct cgroup *cgrp = memcg->css.cgroup;
1408
1409
1410 if (cgrp->parent == NULL)
1411 return vm_swappiness;
1412
1413 return memcg->swappiness;
1414}
1415
1431
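/*
 * Charge moving is flagged both globally (memcg_moving) and per memcg
 * (moving_account).  The synchronize_rcu() in mem_cgroup_start_move()
 * guarantees that rcu readers which sampled the old flag values have
 * finished before any pages are actually moved; readers check the flag
 * with mem_cgroup_stolen() below.
 */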
1432atomic_t memcg_moving __read_mostly;
1433
1434static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1435{
1436 atomic_inc(&memcg_moving);
1437 atomic_inc(&memcg->moving_account);
1438 synchronize_rcu();
1439}
1440
1441static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1442{
1443
1444
1445
1446
1447 if (memcg) {
1448 atomic_dec(&memcg_moving);
1449 atomic_dec(&memcg->moving_account);
1450 }
1451}
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1466{
1467 VM_BUG_ON(!rcu_read_lock_held());
1468 return atomic_read(&memcg->moving_account) > 0;
1469}
1470
1471static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1472{
1473 struct mem_cgroup *from;
1474 struct mem_cgroup *to;
1475 bool ret = false;
1476
1477
1478
1479
1480 spin_lock(&mc.lock);
1481 from = mc.from;
1482 to = mc.to;
1483 if (!from)
1484 goto unlock;
1485
1486 ret = mem_cgroup_same_or_subtree(memcg, from)
1487 || mem_cgroup_same_or_subtree(memcg, to);
1488unlock:
1489 spin_unlock(&mc.lock);
1490 return ret;
1491}
1492
1493static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1494{
1495 if (mc.moving_task && current != mc.moving_task) {
1496 if (mem_cgroup_under_move(memcg)) {
1497 DEFINE_WAIT(wait);
1498 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1499
1500 if (mc.moving_task)
1501 schedule();
1502 finish_wait(&mc.waitq, &wait);
1503 return true;
1504 }
1505 }
1506 return false;
1507}
1508
1509
1510
1511
1512
1513
1514
1515static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1516 unsigned long *flags)
1517{
1518 spin_lock_irqsave(&memcg->move_lock, *flags);
1519}
1520
1521static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1522 unsigned long *flags)
1523{
1524 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1525}
1526
1527
1528
1529
1530
1531
1532
1533
1534
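/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled.
 */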
1535void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1536{
1537 struct cgroup *task_cgrp;
1538 struct cgroup *mem_cgrp;
1539
1540
1541
1542
1543
1544 static char memcg_name[PATH_MAX];
1545 int ret;
1546
1547 if (!memcg || !p)
1548 return;
1549
1550 rcu_read_lock();
1551
1552 mem_cgrp = memcg->css.cgroup;
1553 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1554
1555 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1556 if (ret < 0) {
1557
1558
1559
1560
1561 rcu_read_unlock();
1562 goto done;
1563 }
1564 rcu_read_unlock();
1565
1566 printk(KERN_INFO "Task in %s killed", memcg_name);
1567
1568 rcu_read_lock();
1569 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1570 if (ret < 0) {
1571 rcu_read_unlock();
1572 goto done;
1573 }
1574 rcu_read_unlock();
1575
1576
1577
1578
1579 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1580done:
1581
1582 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1583 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1584 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1585 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1586 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1587 "failcnt %llu\n",
1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1595}
1596
1597
1598
1599
1600
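/*
 * Return the number of memcgs in @memcg's hierarchy, including itself.
 */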
1601static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1602{
1603 int num = 0;
1604 struct mem_cgroup *iter;
1605
1606 for_each_mem_cgroup_tree(iter, memcg)
1607 num++;
1608 return num;
1609}
1610
1611
1612
1613
1614static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1615{
1616 u64 limit;
1617
1618 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1619
1620
1621
1622
1623 if (mem_cgroup_swappiness(memcg)) {
1624 u64 memsw;
1625
1626 limit += total_swap_pages << PAGE_SHIFT;
1627 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1628
1629
1630
1631
1632
1633 limit = min(limit, memsw);
1634 }
1635
1636 return limit;
1637}
1638
1639static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1640 int order)
1641{
1642 struct mem_cgroup *iter;
1643 unsigned long chosen_points = 0;
1644 unsigned long totalpages;
1645 unsigned int points = 0;
1646 struct task_struct *chosen = NULL;
1647
1648
1649
1650
1651
1652
1653 if (fatal_signal_pending(current)) {
1654 set_thread_flag(TIF_MEMDIE);
1655 return;
1656 }
1657
1658 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1659 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1660 for_each_mem_cgroup_tree(iter, memcg) {
1661 struct cgroup *cgroup = iter->css.cgroup;
1662 struct cgroup_iter it;
1663 struct task_struct *task;
1664
1665 cgroup_iter_start(cgroup, &it);
1666 while ((task = cgroup_iter_next(cgroup, &it))) {
1667 switch (oom_scan_process_thread(task, totalpages, NULL,
1668 false)) {
1669 case OOM_SCAN_SELECT:
1670 if (chosen)
1671 put_task_struct(chosen);
1672 chosen = task;
1673 chosen_points = ULONG_MAX;
1674 get_task_struct(chosen);
				/* fall through */
1676 case OOM_SCAN_CONTINUE:
1677 continue;
1678 case OOM_SCAN_ABORT:
1679 cgroup_iter_end(cgroup, &it);
1680 mem_cgroup_iter_break(memcg, iter);
1681 if (chosen)
1682 put_task_struct(chosen);
1683 return;
1684 case OOM_SCAN_OK:
1685 break;
			}
1687 points = oom_badness(task, memcg, NULL, totalpages);
1688 if (points > chosen_points) {
1689 if (chosen)
1690 put_task_struct(chosen);
1691 chosen = task;
1692 chosen_points = points;
1693 get_task_struct(chosen);
1694 }
1695 }
1696 cgroup_iter_end(cgroup, &it);
1697 }
1698
1699 if (!chosen)
1700 return;
1701 points = chosen_points * 1000 / totalpages;
1702 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1703 NULL, "Memory cgroup out of memory");
1704}
1705
1706static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1707 gfp_t gfp_mask,
1708 unsigned long flags)
1709{
1710 unsigned long total = 0;
1711 bool noswap = false;
1712 int loop;
1713
1714 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1715 noswap = true;
1716 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1717 noswap = true;
1718
1719 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1720 if (loop)
1721 drain_all_stock_async(memcg);
1722 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1723
1724
1725
1726
1727
1728 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1729 break;
1730 if (mem_cgroup_margin(memcg))
1731 break;
1732
1733
1734
1735
1736 if (loop && !total)
1737 break;
1738 }
1739 return total;
1740}
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1753 int nid, bool noswap)
1754{
1755 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1756 return true;
1757 if (noswap || !total_swap_pages)
1758 return false;
1759 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1760 return true;
1761 return false;
1762
1763}
1764#if MAX_NUMNODES > 1
1765
1766
1767
1768
1769
1770
1771
1772static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1773{
1774 int nid;
1775
1776
1777
1778
1779 if (!atomic_read(&memcg->numainfo_events))
1780 return;
1781 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1782 return;
1783
1784
1785 memcg->scan_nodes = node_states[N_MEMORY];
1786
1787 for_each_node_mask(nid, node_states[N_MEMORY]) {
1788
1789 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1790 node_clear(nid, memcg->scan_nodes);
1791 }
1792
1793 atomic_set(&memcg->numainfo_events, 0);
1794 atomic_set(&memcg->numainfo_updating, 0);
1795}
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1810{
1811 int node;
1812
1813 mem_cgroup_may_update_nodemask(memcg);
1814 node = memcg->last_scanned_node;
1815
1816 node = next_node(node, memcg->scan_nodes);
1817 if (node == MAX_NUMNODES)
1818 node = first_node(memcg->scan_nodes);
1819
1820
1821
1822
1823
1824
1825 if (unlikely(node == MAX_NUMNODES))
1826 node = numa_node_id();
1827
1828 memcg->last_scanned_node = node;
1829 return node;
1830}
1831
1832
1833
1834
1835
1836
1837
1838static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1839{
1840 int nid;
1841
1842
1843
1844
1845
1846 if (!nodes_empty(memcg->scan_nodes)) {
1847 for (nid = first_node(memcg->scan_nodes);
1848 nid < MAX_NUMNODES;
1849 nid = next_node(nid, memcg->scan_nodes)) {
1850
1851 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1852 return true;
1853 }
1854 }
1855
1856
1857
1858 for_each_node_state(nid, N_MEMORY) {
1859 if (node_isset(nid, memcg->scan_nodes))
1860 continue;
1861 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1862 return true;
1863 }
1864 return false;
1865}
1866
1867#else
1868int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1869{
1870 return 0;
1871}
1872
1873static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1874{
1875 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1876}
1877#endif
1878
1879static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1880 struct zone *zone,
1881 gfp_t gfp_mask,
1882 unsigned long *total_scanned)
1883{
1884 struct mem_cgroup *victim = NULL;
1885 int total = 0;
1886 int loop = 0;
1887 unsigned long excess;
1888 unsigned long nr_scanned;
1889 struct mem_cgroup_reclaim_cookie reclaim = {
1890 .zone = zone,
1891 .priority = 0,
1892 };
1893
1894 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1895
1896 while (1) {
1897 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1898 if (!victim) {
1899 loop++;
1900 if (loop >= 2) {
1901
1902
1903
1904
1905
1906 if (!total)
1907 break;
1908
1909
1910
1911
1912
1913
1914 if (total >= (excess >> 2) ||
1915 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1916 break;
1917 }
1918 continue;
1919 }
1920 if (!mem_cgroup_reclaimable(victim, false))
1921 continue;
1922 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1923 zone, &nr_scanned);
1924 *total_scanned += nr_scanned;
1925 if (!res_counter_soft_limit_excess(&root_memcg->res))
1926 break;
1927 }
1928 mem_cgroup_iter_break(root_memcg, victim);
1929 return total;
1930}
1931
1932
1933
1934
1935
1936
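/*
 * Check whether the OOM killer is already running under our hierarchy;
 * if so, back off and return false.  Has to be called with
 * memcg_oom_lock held.
 */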
1937static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1938{
1939 struct mem_cgroup *iter, *failed = NULL;
1940
1941 for_each_mem_cgroup_tree(iter, memcg) {
1942 if (iter->oom_lock) {
1943
1944
1945
1946
1947 failed = iter;
1948 mem_cgroup_iter_break(memcg, iter);
1949 break;
1950 } else
1951 iter->oom_lock = true;
1952 }
1953
1954 if (!failed)
1955 return true;
1956
1957
1958
1959
1960
1961 for_each_mem_cgroup_tree(iter, memcg) {
1962 if (iter == failed) {
1963 mem_cgroup_iter_break(memcg, iter);
1964 break;
1965 }
1966 iter->oom_lock = false;
1967 }
1968 return false;
1969}
1970
1971
1972
1973
1974static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1975{
1976 struct mem_cgroup *iter;
1977
1978 for_each_mem_cgroup_tree(iter, memcg)
1979 iter->oom_lock = false;
1980 return 0;
1981}
1982
1983static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1984{
1985 struct mem_cgroup *iter;
1986
1987 for_each_mem_cgroup_tree(iter, memcg)
1988 atomic_inc(&iter->under_oom);
1989}
1990
1991static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1992{
1993 struct mem_cgroup *iter;
1994
1995
1996
1997
1998
1999
2000 for_each_mem_cgroup_tree(iter, memcg)
2001 atomic_add_unless(&iter->under_oom, -1, 0);
2002}
2003
2004static DEFINE_SPINLOCK(memcg_oom_lock);
2005static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2006
2007struct oom_wait_info {
2008 struct mem_cgroup *memcg;
2009 wait_queue_t wait;
2010};
2011
2012static int memcg_oom_wake_function(wait_queue_t *wait,
2013 unsigned mode, int sync, void *arg)
2014{
2015 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2016 struct mem_cgroup *oom_wait_memcg;
2017 struct oom_wait_info *oom_wait_info;
2018
2019 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2020 oom_wait_memcg = oom_wait_info->memcg;
2021
2022
2023
2024
2025
2026 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2027 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2028 return 0;
2029 return autoremove_wake_function(wait, mode, sync, arg);
2030}
2031
2032static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2033{
2034
2035 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2036}
2037
2038static void memcg_oom_recover(struct mem_cgroup *memcg)
2039{
2040 if (memcg && atomic_read(&memcg->under_oom))
2041 memcg_wakeup_oom(memcg);
2042}
2043
2044
2045
2046
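/*
 * Try to call the OOM killer (or wait for someone else to resolve the
 * OOM).  Returns false if the charge attempt should be aborted because
 * current is being killed.
 */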
2047static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
2048 int order)
2049{
2050 struct oom_wait_info owait;
2051 bool locked, need_to_kill;
2052
2053 owait.memcg = memcg;
2054 owait.wait.flags = 0;
2055 owait.wait.func = memcg_oom_wake_function;
2056 owait.wait.private = current;
2057 INIT_LIST_HEAD(&owait.wait.task_list);
2058 need_to_kill = true;
2059 mem_cgroup_mark_under_oom(memcg);
2060
2061
2062 spin_lock(&memcg_oom_lock);
2063 locked = mem_cgroup_oom_lock(memcg);
2064
2065
2066
2067
2068
2069 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2070 if (!locked || memcg->oom_kill_disable)
2071 need_to_kill = false;
2072 if (locked)
2073 mem_cgroup_oom_notify(memcg);
2074 spin_unlock(&memcg_oom_lock);
2075
2076 if (need_to_kill) {
2077 finish_wait(&memcg_oom_waitq, &owait.wait);
2078 mem_cgroup_out_of_memory(memcg, mask, order);
2079 } else {
2080 schedule();
2081 finish_wait(&memcg_oom_waitq, &owait.wait);
2082 }
2083 spin_lock(&memcg_oom_lock);
2084 if (locked)
2085 mem_cgroup_oom_unlock(memcg);
2086 memcg_wakeup_oom(memcg);
2087 spin_unlock(&memcg_oom_lock);
2088
2089 mem_cgroup_unmark_under_oom(memcg);
2090
2091 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2092 return false;
2093
2094 schedule_timeout_uninterruptible(1);
2095 return true;
2096}
2097
2121
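/*
 * Bracket for updating per-page memcg statistics (currently the
 * FILE_MAPPED counter) that may race with charge moving.  In the
 * common case no lock is taken: if no move is in flight
 * (mem_cgroup_stolen()), rcu protection is enough.  Otherwise the
 * memcg's move_lock is taken and *locked tells the caller to pair this
 * with __mem_cgroup_end_update_page_stat().
 */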
2122void __mem_cgroup_begin_update_page_stat(struct page *page,
2123 bool *locked, unsigned long *flags)
2124{
2125 struct mem_cgroup *memcg;
2126 struct page_cgroup *pc;
2127
2128 pc = lookup_page_cgroup(page);
2129again:
2130 memcg = pc->mem_cgroup;
2131 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2132 return;
2133
2134
2135
2136
2137
2138
2139 if (!mem_cgroup_stolen(memcg))
2140 return;
2141
2142 move_lock_mem_cgroup(memcg, flags);
2143 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2144 move_unlock_mem_cgroup(memcg, flags);
2145 goto again;
2146 }
2147 *locked = true;
2148}
2149
2150void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2151{
2152 struct page_cgroup *pc = lookup_page_cgroup(page);
2153
2154
2155
2156
2157
2158
2159 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2160}
2161
2162void mem_cgroup_update_page_stat(struct page *page,
2163 enum mem_cgroup_page_stat_item idx, int val)
2164{
2165 struct mem_cgroup *memcg;
2166 struct page_cgroup *pc = lookup_page_cgroup(page);
2167 unsigned long uninitialized_var(flags);
2168
2169 if (mem_cgroup_disabled())
2170 return;
2171
2172 memcg = pc->mem_cgroup;
2173 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2174 return;
2175
2176 switch (idx) {
2177 case MEMCG_NR_FILE_MAPPED:
2178 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2179 break;
2180 default:
2181 BUG();
2182 }
2183
2184 this_cpu_add(memcg->stat->count[idx], val);
2185}
2186
2187
2188
2189
2190
2191#define CHARGE_BATCH 32U
2192struct memcg_stock_pcp {
2193 struct mem_cgroup *cached;
2194 unsigned int nr_pages;
2195 struct work_struct work;
2196 unsigned long flags;
2197#define FLUSHING_CACHED_CHARGE 0
2198};
2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2200static DEFINE_MUTEX(percpu_charge_mutex);
2201
2212
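/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges include both the memory and the memory+swap counter.
 * Returns true if the cached charge was large enough, false if the
 * caller must fall back to the res_counter.
 */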
2213static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2214{
2215 struct memcg_stock_pcp *stock;
2216 bool ret = true;
2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2221 stock = &get_cpu_var(memcg_stock);
2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2223 stock->nr_pages -= nr_pages;
2224 else
2225 ret = false;
2226 put_cpu_var(memcg_stock);
2227 return ret;
2228}
2229
2230
2231
2232
2233static void drain_stock(struct memcg_stock_pcp *stock)
2234{
2235 struct mem_cgroup *old = stock->cached;
2236
2237 if (stock->nr_pages) {
2238 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2239
2240 res_counter_uncharge(&old->res, bytes);
2241 if (do_swap_account)
2242 res_counter_uncharge(&old->memsw, bytes);
2243 stock->nr_pages = 0;
2244 }
2245 stock->cached = NULL;
2246}
2247
2248
2249
2250
2251
2252static void drain_local_stock(struct work_struct *dummy)
2253{
2254 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2255 drain_stock(stock);
2256 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2257}
2258
2259
2260
2261
2262
2263static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2264{
2265 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2266
2267 if (stock->cached != memcg) {
2268 drain_stock(stock);
2269 stock->cached = memcg;
2270 }
2271 stock->nr_pages += nr_pages;
2272 put_cpu_var(memcg_stock);
2273}
2274
2275
2276
2277
2278
2279
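/*
 * Drain the per-cpu charge caches of every cpu whose cached memcg
 * belongs to @root_memcg's subtree.  With @sync the caller also waits
 * for queued drain work to finish; callers serialize through
 * percpu_charge_mutex.
 */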
2280static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2281{
2282 int cpu, curcpu;
2283
2284
2285 get_online_cpus();
2286 curcpu = get_cpu();
2287 for_each_online_cpu(cpu) {
2288 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2289 struct mem_cgroup *memcg;
2290
2291 memcg = stock->cached;
2292 if (!memcg || !stock->nr_pages)
2293 continue;
2294 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2295 continue;
2296 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2297 if (cpu == curcpu)
2298 drain_local_stock(&stock->work);
2299 else
2300 schedule_work_on(cpu, &stock->work);
2301 }
2302 }
2303 put_cpu();
2304
2305 if (!sync)
2306 goto out;
2307
2308 for_each_online_cpu(cpu) {
2309 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2310 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2311 flush_work(&stock->work);
2312 }
2313out:
2314 put_online_cpus();
2315}
2316
2317
2318
2319
2320
2321
2322
2323static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2324{
2325
2326
2327
2328 if (!mutex_trylock(&percpu_charge_mutex))
2329 return;
2330 drain_all_stock(root_memcg, false);
2331 mutex_unlock(&percpu_charge_mutex);
2332}
2333
2334
2335static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2336{
2337
2338 mutex_lock(&percpu_charge_mutex);
2339 drain_all_stock(root_memcg, true);
2340 mutex_unlock(&percpu_charge_mutex);
2341}
2342
2343
2344
2345
2346
2347static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2348{
2349 int i;
2350
2351 spin_lock(&memcg->pcp_counter_lock);
2352 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2353 long x = per_cpu(memcg->stat->count[i], cpu);
2354
2355 per_cpu(memcg->stat->count[i], cpu) = 0;
2356 memcg->nocpu_base.count[i] += x;
2357 }
2358 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2359 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2360
2361 per_cpu(memcg->stat->events[i], cpu) = 0;
2362 memcg->nocpu_base.events[i] += x;
2363 }
2364 spin_unlock(&memcg->pcp_counter_lock);
2365}
2366
2367static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2368 unsigned long action,
2369 void *hcpu)
2370{
2371 int cpu = (unsigned long)hcpu;
2372 struct memcg_stock_pcp *stock;
2373 struct mem_cgroup *iter;
2374
2375 if (action == CPU_ONLINE)
2376 return NOTIFY_OK;
2377
2378 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2379 return NOTIFY_OK;
2380
2381 for_each_mem_cgroup(iter)
2382 mem_cgroup_drain_pcp_counter(iter, cpu);
2383
2384 stock = &per_cpu(memcg_stock, cpu);
2385 drain_stock(stock);
2386 return NOTIFY_OK;
2387}
2388
2389
2390
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough res. */
	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
};
2398
2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2402{
2403 unsigned long csize = nr_pages * PAGE_SIZE;
2404 struct mem_cgroup *mem_over_limit;
2405 struct res_counter *fail_res;
2406 unsigned long flags = 0;
2407 int ret;
2408
2409 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2410
2411 if (likely(!ret)) {
2412 if (!do_swap_account)
2413 return CHARGE_OK;
2414 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2415 if (likely(!ret))
2416 return CHARGE_OK;
2417
2418 res_counter_uncharge(&memcg->res, csize);
2419 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2420 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2421 } else
2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2423
2424
2425
2426
2427 if (nr_pages > min_pages)
2428 return CHARGE_RETRY;
2429
2430 if (!(gfp_mask & __GFP_WAIT))
2431 return CHARGE_WOULDBLOCK;
2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2438 return CHARGE_RETRY;
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2449 return CHARGE_RETRY;
2450
2451
2452
2453
2454
2455 if (mem_cgroup_wait_acct_move(mem_over_limit))
2456 return CHARGE_RETRY;
2457
2458
2459 if (!oom_check)
2460 return CHARGE_NOMEM;
2461
2462 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2463 return CHARGE_OOM_DIE;
2464
2465 return CHARGE_RETRY;
2466}
2467
2488
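/*
 * __mem_cgroup_try_charge() does
 * 1. detect the memcg to be charged against from the passed *mm or *ptr,
 * 2. update the res_counter(s),
 * 3. call page reclaim if necessary.
 *
 * Returns 0 on success, -ENOMEM if the charge failed, and -EINTR if the
 * charge was bypassed to root_mem_cgroup (fatal signal or OOM kill), in
 * which case *ptr is set to root_mem_cgroup.
 */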
2489static int __mem_cgroup_try_charge(struct mm_struct *mm,
2490 gfp_t gfp_mask,
2491 unsigned int nr_pages,
2492 struct mem_cgroup **ptr,
2493 bool oom)
2494{
2495 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2496 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2497 struct mem_cgroup *memcg = NULL;
2498 int ret;
2499
2500
2501
2502
2503
2504
2505 if (unlikely(test_thread_flag(TIF_MEMDIE)
2506 || fatal_signal_pending(current)))
2507 goto bypass;
2508
2509
2510
2511
2512
2513
2514
2515 if (!*ptr && !mm)
2516 *ptr = root_mem_cgroup;
2517again:
2518 if (*ptr) {
2519 memcg = *ptr;
2520 if (mem_cgroup_is_root(memcg))
2521 goto done;
2522 if (consume_stock(memcg, nr_pages))
2523 goto done;
2524 css_get(&memcg->css);
2525 } else {
2526 struct task_struct *p;
2527
2528 rcu_read_lock();
2529 p = rcu_dereference(mm->owner);
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540 memcg = mem_cgroup_from_task(p);
2541 if (!memcg)
2542 memcg = root_mem_cgroup;
2543 if (mem_cgroup_is_root(memcg)) {
2544 rcu_read_unlock();
2545 goto done;
2546 }
2547 if (consume_stock(memcg, nr_pages)) {
2548
2549
2550
2551
2552
2553
2554
2555
2556 rcu_read_unlock();
2557 goto done;
2558 }
2559
2560 if (!css_tryget(&memcg->css)) {
2561 rcu_read_unlock();
2562 goto again;
2563 }
2564 rcu_read_unlock();
2565 }
2566
2567 do {
2568 bool oom_check;
2569
2570
2571 if (fatal_signal_pending(current)) {
2572 css_put(&memcg->css);
2573 goto bypass;
2574 }
2575
2576 oom_check = false;
2577 if (oom && !nr_oom_retries) {
2578 oom_check = true;
2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2580 }
2581
2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2584 switch (ret) {
2585 case CHARGE_OK:
2586 break;
2587 case CHARGE_RETRY:
2588 batch = nr_pages;
2589 css_put(&memcg->css);
2590 memcg = NULL;
2591 goto again;
2592 case CHARGE_WOULDBLOCK:
2593 css_put(&memcg->css);
2594 goto nomem;
2595 case CHARGE_NOMEM:
2596 if (!oom) {
2597 css_put(&memcg->css);
2598 goto nomem;
2599 }
2600
2601 nr_oom_retries--;
2602 break;
2603 case CHARGE_OOM_DIE:
2604 css_put(&memcg->css);
2605 goto bypass;
2606 }
2607 } while (ret != CHARGE_OK);
2608
2609 if (batch > nr_pages)
2610 refill_stock(memcg, batch - nr_pages);
2611 css_put(&memcg->css);
2612done:
2613 *ptr = memcg;
2614 return 0;
2615nomem:
2616 *ptr = NULL;
2617 return -ENOMEM;
2618bypass:
2619 *ptr = root_mem_cgroup;
2620 return -EINTR;
2621}
2622
2623
2624
2625
2626
2627
2628static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2629 unsigned int nr_pages)
2630{
2631 if (!mem_cgroup_is_root(memcg)) {
2632 unsigned long bytes = nr_pages * PAGE_SIZE;
2633
2634 res_counter_uncharge(&memcg->res, bytes);
2635 if (do_swap_account)
2636 res_counter_uncharge(&memcg->memsw, bytes);
2637 }
2638}
2639
2640
2641
2642
2643
2644static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2645 unsigned int nr_pages)
2646{
2647 unsigned long bytes = nr_pages * PAGE_SIZE;
2648
2649 if (mem_cgroup_is_root(memcg))
2650 return;
2651
2652 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2653 if (do_swap_account)
2654 res_counter_uncharge_until(&memcg->memsw,
2655 memcg->memsw.parent, bytes);
2656}
2657
2658
2659
2660
2661
2662
2663
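/*
 * A helper function to get a mem_cgroup from a css id.  Must be called
 * under rcu_read_lock(); the caller is responsible for css_tryget() if
 * the memcg is to be used for charging.
 */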
2664static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2665{
2666 struct cgroup_subsys_state *css;
2667
2668
2669 if (!id)
2670 return NULL;
2671 css = css_lookup(&mem_cgroup_subsys, id);
2672 if (!css)
2673 return NULL;
2674 return mem_cgroup_from_css(css);
2675}
2676
2677struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2678{
2679 struct mem_cgroup *memcg = NULL;
2680 struct page_cgroup *pc;
2681 unsigned short id;
2682 swp_entry_t ent;
2683
2684 VM_BUG_ON(!PageLocked(page));
2685
2686 pc = lookup_page_cgroup(page);
2687 lock_page_cgroup(pc);
2688 if (PageCgroupUsed(pc)) {
2689 memcg = pc->mem_cgroup;
2690 if (memcg && !css_tryget(&memcg->css))
2691 memcg = NULL;
2692 } else if (PageSwapCache(page)) {
2693 ent.val = page_private(page);
2694 id = lookup_swap_cgroup_id(ent);
2695 rcu_read_lock();
2696 memcg = mem_cgroup_lookup(id);
2697 if (memcg && !css_tryget(&memcg->css))
2698 memcg = NULL;
2699 rcu_read_unlock();
2700 }
2701 unlock_page_cgroup(pc);
2702 return memcg;
2703}
2704
2705static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2706 struct page *page,
2707 unsigned int nr_pages,
2708 enum charge_type ctype,
2709 bool lrucare)
2710{
2711 struct page_cgroup *pc = lookup_page_cgroup(page);
2712 struct zone *uninitialized_var(zone);
2713 struct lruvec *lruvec;
2714 bool was_on_lru = false;
2715 bool anon;
2716
2717 lock_page_cgroup(pc);
2718 VM_BUG_ON(PageCgroupUsed(pc));
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728 if (lrucare) {
2729 zone = page_zone(page);
2730 spin_lock_irq(&zone->lru_lock);
2731 if (PageLRU(page)) {
2732 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2733 ClearPageLRU(page);
2734 del_page_from_lru_list(page, lruvec, page_lru(page));
2735 was_on_lru = true;
2736 }
2737 }
2738
2739 pc->mem_cgroup = memcg;
2740
2741
2742
2743
2744
2745
2746
2747 smp_wmb();
2748 SetPageCgroupUsed(pc);
2749
2750 if (lrucare) {
2751 if (was_on_lru) {
2752 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2753 VM_BUG_ON(PageLRU(page));
2754 SetPageLRU(page);
2755 add_page_to_lru_list(page, lruvec, page_lru(page));
2756 }
2757 spin_unlock_irq(&zone->lru_lock);
2758 }
2759
2760 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2761 anon = true;
2762 else
2763 anon = false;
2764
2765 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2766 unlock_page_cgroup(pc);
2767
2768
2769
2770
2771
2772
2773 memcg_check_events(memcg, page);
2774}
2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
2779static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2780{
2781 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2782 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2783}
2784
2785
2786
2787
2788
2789static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2790{
2791 struct kmem_cache *cachep;
2792
2793 VM_BUG_ON(p->is_root_cache);
2794 cachep = p->root_cache;
2795 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2796}
2797
2798#ifdef CONFIG_SLABINFO
2799static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2800 struct seq_file *m)
2801{
2802 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2803 struct memcg_cache_params *params;
2804
2805 if (!memcg_can_account_kmem(memcg))
2806 return -EIO;
2807
2808 print_slabinfo_header(m);
2809
2810 mutex_lock(&memcg->slab_caches_mutex);
2811 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2812 cache_show(memcg_params_to_cache(params), m);
2813 mutex_unlock(&memcg->slab_caches_mutex);
2814
2815 return 0;
2816}
2817#endif
2818
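/*
 * Charge @size bytes of kernel memory to @memcg.  The kmem counter is
 * charged first and the same amount is then charged to the regular
 * memory (and memsw) counters, so kernel memory also counts against the
 * memory limit.  If reclaim is interrupted (-EINTR), the user counters
 * are force-charged instead of failing the allocation.
 */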
2819static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2820{
2821 struct res_counter *fail_res;
2822 struct mem_cgroup *_memcg;
2823 int ret = 0;
2824 bool may_oom;
2825
2826 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2827 if (ret)
2828 return ret;
2829
2830
2831
2832
2833
2834 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2835
2836 _memcg = memcg;
2837 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2838 &_memcg, may_oom);
2839
2840 if (ret == -EINTR) {
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2857 if (do_swap_account)
2858 res_counter_charge_nofail(&memcg->memsw, size,
2859 &fail_res);
2860 ret = 0;
2861 } else if (ret)
2862 res_counter_uncharge(&memcg->kmem, size);
2863
2864 return ret;
2865}
2866
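/*
 * Undo memcg_charge_kmem(): return @size bytes to the memory, memsw and
 * kmem counters.  When the kmem usage drops to zero and the memcg has
 * been marked dead, drop the reference that was keeping it alive.
 */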
2867static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2868{
2869 res_counter_uncharge(&memcg->res, size);
2870 if (do_swap_account)
2871 res_counter_uncharge(&memcg->memsw, size);
2872
2873
2874 if (res_counter_uncharge(&memcg->kmem, size))
2875 return;
2876
2877 if (memcg_kmem_test_and_clear_dead(memcg))
2878 mem_cgroup_put(memcg);
2879}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891
2892
2893
2894
2895
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
2901
2902
2903
2904
2905
2906
2907
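/*
 * Called when kmem accounting is first enabled on a memcg: reserve an
 * index in kmem_limited_groups and grow every root cache's memcg_caches
 * array so that this index exists.  On failure the index is released and
 * the activation flag cleared again.
 */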
2908int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2909{
2910 int num, ret;
2911
2912 num = ida_simple_get(&kmem_limited_groups,
2913 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2914 if (num < 0)
2915 return num;
2916
2917
2918
2919
2920
2921
2922
2923 memcg_kmem_set_activated(memcg);
2924
2925 ret = memcg_update_all_caches(num+1);
2926 if (ret) {
2927 ida_simple_remove(&kmem_limited_groups, num);
2928 memcg_kmem_clear_activated(memcg);
2929 return ret;
2930 }
2931
2932 memcg->kmemcg_id = num;
2933 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
2934 mutex_init(&memcg->slab_caches_mutex);
2935 return 0;
2936}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
2952
2953
2954
2955
2956
2957
2958void memcg_update_array_size(int num)
2959{
2960 if (num > memcg_limited_groups_array_size)
2961 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2962}
2963
2964int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2965{
2966 struct memcg_cache_params *cur_params = s->memcg_params;
2967
2968 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
2969
2970 if (num_groups > memcg_limited_groups_array_size) {
2971 int i;
2972 ssize_t size = memcg_caches_array_size(num_groups);
2973
2974 size *= sizeof(void *);
2975 size += sizeof(struct memcg_cache_params);
2976
2977 s->memcg_params = kzalloc(size, GFP_KERNEL);
2978 if (!s->memcg_params) {
2979 s->memcg_params = cur_params;
2980 return -ENOMEM;
2981 }
2982
2983 s->memcg_params->is_root_cache = true;
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2995 if (!cur_params->memcg_caches[i])
2996 continue;
2997 s->memcg_params->memcg_caches[i] =
2998 cur_params->memcg_caches[i];
2999 }
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010 kfree(cur_params);
3011 }
3012 return 0;
3013}
3014
3015int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3016 struct kmem_cache *root_cache)
3017{
3018 size_t size = sizeof(struct memcg_cache_params);
3019
3020 if (!memcg_kmem_enabled())
3021 return 0;
3022
3023 if (!memcg)
3024 size += memcg_limited_groups_array_size * sizeof(void *);
3025
3026 s->memcg_params = kzalloc(size, GFP_KERNEL);
3027 if (!s->memcg_params)
3028 return -ENOMEM;
3029
3030 if (memcg) {
3031 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache;
3033 } else
3034 s->memcg_params->is_root_cache = true;
3035
3036 return 0;
3037}
3038
3039void memcg_release_cache(struct kmem_cache *s)
3040{
3041 struct kmem_cache *root;
3042 struct mem_cgroup *memcg;
3043 int id;
3044
3045
3046
3047
3048
3049 if (!s->memcg_params)
3050 return;
3051
3052 if (s->memcg_params->is_root_cache)
3053 goto out;
3054
3055 memcg = s->memcg_params->memcg;
3056 id = memcg_cache_id(memcg);
3057
3058 root = s->memcg_params->root_cache;
3059 root->memcg_params->memcg_caches[id] = NULL;
3060 mem_cgroup_put(memcg);
3061
3062 mutex_lock(&memcg->slab_caches_mutex);
3063 list_del(&s->memcg_params->list);
3064 mutex_unlock(&memcg->slab_caches_mutex);
3065
3066out:
3067 kfree(s->memcg_params);
3068}
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
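/*
 * While running memcg-internal allocations (such as creating a per-memcg
 * cache copy), kmem allocations must not recurse back into the accounting
 * code.  These helpers bracket such sections by bumping a per-task skip
 * counter that __memcg_kmem_get_cache() checks.
 */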
3089static inline void memcg_stop_kmem_account(void)
3090{
3091 VM_BUG_ON(!current->mm);
3092 current->memcg_kmem_skip_account++;
3093}
3094
3095static inline void memcg_resume_kmem_account(void)
3096{
3097 VM_BUG_ON(!current->mm);
3098 current->memcg_kmem_skip_account--;
3099}
3100
3101static void kmem_cache_destroy_work_func(struct work_struct *w)
3102{
3103 struct kmem_cache *cachep;
3104 struct memcg_cache_params *p;
3105
3106 p = container_of(w, struct memcg_cache_params, destroy);
3107
3108 cachep = memcg_params_to_cache(p);
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3127 kmem_cache_shrink(cachep);
3128 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3129 return;
3130 } else
3131 kmem_cache_destroy(cachep);
3132}
3133
3134void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3135{
3136 if (!cachep->memcg_params->dead)
3137 return;
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157 if (work_pending(&cachep->memcg_params->destroy))
3158 return;
3159
3160
3161
3162
3163 schedule_work(&cachep->memcg_params->destroy);
3164}
3165
3166static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
3167{
3168 char *name;
3169 struct dentry *dentry;
3170
3171 rcu_read_lock();
3172 dentry = rcu_dereference(memcg->css.cgroup->dentry);
3173 rcu_read_unlock();
3174
3175 BUG_ON(dentry == NULL);
3176
3177 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3178 memcg_cache_id(memcg), dentry->d_name.name);
3179
3180 return name;
3181}
3182
3183static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3184 struct kmem_cache *s)
3185{
3186 char *name;
3187 struct kmem_cache *new;
3188
3189 name = memcg_cache_name(memcg, s);
3190 if (!name)
3191 return NULL;
3192
3193 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
3194 (s->flags & ~SLAB_PANIC), s->ctor, s);
3195
3196 if (new)
3197 new->allocflags |= __GFP_KMEMCG;
3198
3199 kfree(name);
3200 return new;
3201}
3202
3203
3204
3205
3206
3207
3208
3209
3210
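/*
 * memcg_cache_mutex serializes the creation of per-memcg cache copies so
 * that only one copy per (memcg, root cache) pair is ever installed.
 * Readers of memcg_caches[] run locklessly and may observe NULL, in which
 * case they simply fall back to the root cache.
 */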
3211static DEFINE_MUTEX(memcg_cache_mutex);
3212static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3213 struct kmem_cache *cachep)
3214{
3215 struct kmem_cache *new_cachep;
3216 int idx;
3217
3218 BUG_ON(!memcg_can_account_kmem(memcg));
3219
3220 idx = memcg_cache_id(memcg);
3221
3222 mutex_lock(&memcg_cache_mutex);
3223 new_cachep = cachep->memcg_params->memcg_caches[idx];
3224 if (new_cachep)
3225 goto out;
3226
3227 new_cachep = kmem_cache_dup(memcg, cachep);
3228 if (new_cachep == NULL) {
3229 new_cachep = cachep;
3230 goto out;
3231 }
3232
3233 mem_cgroup_get(memcg);
3234	atomic_set(&new_cachep->memcg_params->nr_pages, 0);
3235
3236 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3237
3238
3239
3240
3241 wmb();
3242out:
3243 mutex_unlock(&memcg_cache_mutex);
3244 return new_cachep;
3245}
3246
3247void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3248{
3249 struct kmem_cache *c;
3250 int i;
3251
3252 if (!s->memcg_params)
3253 return;
3254 if (!s->memcg_params->is_root_cache)
3255 return;
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266 mutex_lock(&set_limit_mutex);
3267 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3268 c = s->memcg_params->memcg_caches[i];
3269 if (!c)
3270 continue;
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285 c->memcg_params->dead = false;
3286 cancel_work_sync(&c->memcg_params->destroy);
3287 kmem_cache_destroy(c);
3288 }
3289 mutex_unlock(&set_limit_mutex);
3290}
3291
3292struct create_work {
3293 struct mem_cgroup *memcg;
3294 struct kmem_cache *cachep;
3295 struct work_struct work;
3296};
3297
3298static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3299{
3300 struct kmem_cache *cachep;
3301 struct memcg_cache_params *params;
3302
3303 if (!memcg_kmem_is_active(memcg))
3304 return;
3305
3306 mutex_lock(&memcg->slab_caches_mutex);
3307 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3308 cachep = memcg_params_to_cache(params);
3309 cachep->memcg_params->dead = true;
3310 INIT_WORK(&cachep->memcg_params->destroy,
3311 kmem_cache_destroy_work_func);
3312 schedule_work(&cachep->memcg_params->destroy);
3313 }
3314 mutex_unlock(&memcg->slab_caches_mutex);
3315}
3316
3317static void memcg_create_cache_work_func(struct work_struct *w)
3318{
3319 struct create_work *cw;
3320
3321 cw = container_of(w, struct create_work, work);
3322 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3323
3324 css_put(&cw->memcg->css);
3325 kfree(cw);
3326}
3327
3328
3329
3330
3331
3332static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3333 struct kmem_cache *cachep)
3334{
3335 struct create_work *cw;
3336
3337 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3338 if (cw == NULL)
3339 return;
3340
3341
3342 if (!css_tryget(&memcg->css)) {
3343 kfree(cw);
3344 return;
3345 }
3346
3347 cw->memcg = memcg;
3348 cw->cachep = cachep;
3349
3350 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3351 schedule_work(&cw->work);
3352}
3353
3354static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3355 struct kmem_cache *cachep)
3356{
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368 memcg_stop_kmem_account();
3369 __memcg_create_cache_enqueue(memcg, cachep);
3370 memcg_resume_kmem_account();
3371}
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
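/*
 * Return the cache a kmem allocation should be served from: the current
 * task's per-memcg copy of @cachep when kmem accounting applies, otherwise
 * the root cache itself.  If the copy does not exist yet, its creation is
 * queued asynchronously and this allocation falls back to the root cache.
 */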
3385struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3386 gfp_t gfp)
3387{
3388 struct mem_cgroup *memcg;
3389 int idx;
3390
3391 VM_BUG_ON(!cachep->memcg_params);
3392 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3393
3394 if (!current->mm || current->memcg_kmem_skip_account)
3395 return cachep;
3396
3397 rcu_read_lock();
3398 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3399 rcu_read_unlock();
3400
3401 if (!memcg_can_account_kmem(memcg))
3402 return cachep;
3403
3404 idx = memcg_cache_id(memcg);
3405
3406
3407
3408
3409
3410 read_barrier_depends();
3411 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429 memcg_create_cache_enqueue(memcg, cachep);
3430 return cachep;
3431 }
3432
3433 return cachep->memcg_params->memcg_caches[idx];
3434}
3435EXPORT_SYMBOL(__memcg_kmem_get_cache);
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
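/*
 * Charge an allocation of 2^@order pages against the current task's memcg.
 * Returns true when the allocation may proceed; *_memcg is set to the
 * memcg the caller should commit against, or left NULL when no accounting
 * is required.
 */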
3451bool
3452__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3453{
3454 struct mem_cgroup *memcg;
3455 int ret;
3456
3457 *_memcg = NULL;
3458 memcg = try_get_mem_cgroup_from_mm(current->mm);
3459
3460
3461
3462
3463
3464
3465 if (unlikely(!memcg))
3466 return true;
3467
3468 if (!memcg_can_account_kmem(memcg)) {
3469 css_put(&memcg->css);
3470 return true;
3471 }
3472
3473 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3474 if (!ret)
3475 *_memcg = memcg;
3476
3477 css_put(&memcg->css);
3478 return (ret == 0);
3479}
3480
3481void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3482 int order)
3483{
3484 struct page_cgroup *pc;
3485
3486 VM_BUG_ON(mem_cgroup_is_root(memcg));
3487
3488
3489 if (!page) {
3490 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3491 return;
3492 }
3493
3494 pc = lookup_page_cgroup(page);
3495 lock_page_cgroup(pc);
3496 pc->mem_cgroup = memcg;
3497 SetPageCgroupUsed(pc);
3498 unlock_page_cgroup(pc);
3499}
3500
3501void __memcg_kmem_uncharge_pages(struct page *page, int order)
3502{
3503 struct mem_cgroup *memcg = NULL;
3504 struct page_cgroup *pc;
3505
3506
3507 pc = lookup_page_cgroup(page);
3508
3509
3510
3511
3512 if (!PageCgroupUsed(pc))
3513 return;
3514
3515 lock_page_cgroup(pc);
3516 if (PageCgroupUsed(pc)) {
3517 memcg = pc->mem_cgroup;
3518 ClearPageCgroupUsed(pc);
3519 }
3520 unlock_page_cgroup(pc);
3521
3522
3523
3524
3525
3526 if (!memcg)
3527 return;
3528
3529 VM_BUG_ON(mem_cgroup_is_root(memcg));
3530 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3531}
3532#else
3533static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3534{
3535}
3536#endif
3537
3538#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3539
3540#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3541
3542
3543
3544
3545
3546
3547void mem_cgroup_split_huge_fixup(struct page *head)
3548{
3549 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3550 struct page_cgroup *pc;
3551 int i;
3552
3553 if (mem_cgroup_disabled())
3554 return;
3555 for (i = 1; i < HPAGE_PMD_NR; i++) {
3556 pc = head_pc + i;
3557 pc->mem_cgroup = head_pc->mem_cgroup;
3558 smp_wmb();
3559 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3560 }
3561}
3562#endif
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
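/*
 * Move the charge for @page (@nr_pages pages) from @from to @to.  The
 * page_cgroup lock and @from's move_lock are held across the switch so
 * the statistics stay consistent.  Fails with -EINVAL if the page is no
 * longer charged to @from, and with -EBUSY if asked to move multiple
 * pages that do not form a transparent huge page.
 */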
3579static int mem_cgroup_move_account(struct page *page,
3580 unsigned int nr_pages,
3581 struct page_cgroup *pc,
3582 struct mem_cgroup *from,
3583 struct mem_cgroup *to)
3584{
3585 unsigned long flags;
3586 int ret;
3587 bool anon = PageAnon(page);
3588
3589 VM_BUG_ON(from == to);
3590 VM_BUG_ON(PageLRU(page));
3591
3592
3593
3594
3595
3596
3597 ret = -EBUSY;
3598 if (nr_pages > 1 && !PageTransHuge(page))
3599 goto out;
3600
3601 lock_page_cgroup(pc);
3602
3603 ret = -EINVAL;
3604 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3605 goto unlock;
3606
3607 move_lock_mem_cgroup(from, &flags);
3608
3609 if (!anon && page_mapped(page)) {
3610
3611 preempt_disable();
3612 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3613 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3614 preempt_enable();
3615 }
3616 mem_cgroup_charge_statistics(from, anon, -nr_pages);
3617
3618
3619 pc->mem_cgroup = to;
3620 mem_cgroup_charge_statistics(to, anon, nr_pages);
3621 move_unlock_mem_cgroup(from, &flags);
3622 ret = 0;
3623unlock:
3624 unlock_page_cgroup(pc);
3625
3626
3627
3628 memcg_check_events(to, page);
3629 memcg_check_events(from, page);
3630out:
3631 return ret;
3632}
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
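/*
 * Move the charge for @page from @child to its parent (or to the root
 * cgroup when there is no parent).  The page is isolated from the LRU
 * and, for huge pages, the compound lock is held so the whole charge
 * moves atomically.  On success the child's local charge is cancelled
 * without touching its ancestors, which now own the charge.
 */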
3655static int mem_cgroup_move_parent(struct page *page,
3656 struct page_cgroup *pc,
3657 struct mem_cgroup *child)
3658{
3659 struct mem_cgroup *parent;
3660 unsigned int nr_pages;
3661 unsigned long uninitialized_var(flags);
3662 int ret;
3663
3664 VM_BUG_ON(mem_cgroup_is_root(child));
3665
3666 ret = -EBUSY;
3667 if (!get_page_unless_zero(page))
3668 goto out;
3669 if (isolate_lru_page(page))
3670 goto put;
3671
3672 nr_pages = hpage_nr_pages(page);
3673
3674 parent = parent_mem_cgroup(child);
3675
3676
3677
3678 if (!parent)
3679 parent = root_mem_cgroup;
3680
3681 if (nr_pages > 1) {
3682 VM_BUG_ON(!PageTransHuge(page));
3683 flags = compound_lock_irqsave(page);
3684 }
3685
3686 ret = mem_cgroup_move_account(page, nr_pages,
3687 pc, child, parent);
3688 if (!ret)
3689 __mem_cgroup_cancel_local_charge(child, nr_pages);
3690
3691 if (nr_pages > 1)
3692 compound_unlock_irqrestore(page, flags);
3693 putback_lru_page(page);
3694put:
3695 put_page(page);
3696out:
3697 return ret;
3698}
3699
3700
3701
3702
3703
3704
3705
3706static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3707 gfp_t gfp_mask, enum charge_type ctype)
3708{
3709 struct mem_cgroup *memcg = NULL;
3710 unsigned int nr_pages = 1;
3711 bool oom = true;
3712 int ret;
3713
3714 if (PageTransHuge(page)) {
3715 nr_pages <<= compound_order(page);
3716 VM_BUG_ON(!PageTransHuge(page));
3717
3718
3719
3720
3721 oom = false;
3722 }
3723
3724 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3725 if (ret == -ENOMEM)
3726 return ret;
3727 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3728 return 0;
3729}
3730
3731int mem_cgroup_newpage_charge(struct page *page,
3732 struct mm_struct *mm, gfp_t gfp_mask)
3733{
3734 if (mem_cgroup_disabled())
3735 return 0;
3736 VM_BUG_ON(page_mapped(page));
3737 VM_BUG_ON(page->mapping && !PageAnon(page));
3738 VM_BUG_ON(!mm);
3739 return mem_cgroup_charge_common(page, mm, gfp_mask,
3740 MEM_CGROUP_CHARGE_TYPE_ANON);
3741}
3742
3743
3744
3745
3746
3747
3748
3749static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3750 struct page *page,
3751 gfp_t mask,
3752 struct mem_cgroup **memcgp)
3753{
3754 struct mem_cgroup *memcg;
3755 struct page_cgroup *pc;
3756 int ret;
3757
3758 pc = lookup_page_cgroup(page);
3759
3760
3761
3762
3763
3764
3765
3766 if (PageCgroupUsed(pc))
3767 return 0;
3768 if (!do_swap_account)
3769 goto charge_cur_mm;
3770 memcg = try_get_mem_cgroup_from_page(page);
3771 if (!memcg)
3772 goto charge_cur_mm;
3773 *memcgp = memcg;
3774 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
3775 css_put(&memcg->css);
3776 if (ret == -EINTR)
3777 ret = 0;
3778 return ret;
3779charge_cur_mm:
3780 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
3781 if (ret == -EINTR)
3782 ret = 0;
3783 return ret;
3784}
3785
3786int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3787 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3788{
3789 *memcgp = NULL;
3790 if (mem_cgroup_disabled())
3791 return 0;
3792
3793
3794
3795
3796
3797
3798 if (!PageSwapCache(page)) {
3799 int ret;
3800
3801 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
3802 if (ret == -EINTR)
3803 ret = 0;
3804 return ret;
3805 }
3806 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3807}
3808
3809void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3810{
3811 if (mem_cgroup_disabled())
3812 return;
3813 if (!memcg)
3814 return;
3815 __mem_cgroup_cancel_charge(memcg, 1);
3816}
3817
3818static void
3819__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
3820 enum charge_type ctype)
3821{
3822 if (mem_cgroup_disabled())
3823 return;
3824 if (!memcg)
3825 return;
3826
3827 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
3828
3829
3830
3831
3832
3833
3834
3835 if (do_swap_account && PageSwapCache(page)) {
3836 swp_entry_t ent = {.val = page_private(page)};
3837 mem_cgroup_uncharge_swap(ent);
3838 }
3839}
3840
3841void mem_cgroup_commit_charge_swapin(struct page *page,
3842 struct mem_cgroup *memcg)
3843{
3844 __mem_cgroup_commit_charge_swapin(page, memcg,
3845 MEM_CGROUP_CHARGE_TYPE_ANON);
3846}
3847
3848int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
3849 gfp_t gfp_mask)
3850{
3851 struct mem_cgroup *memcg = NULL;
3852 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3853 int ret;
3854
3855 if (mem_cgroup_disabled())
3856 return 0;
3857 if (PageCompound(page))
3858 return 0;
3859
3860 if (!PageSwapCache(page))
3861 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
3862 else {
3863 ret = __mem_cgroup_try_charge_swapin(mm, page,
3864 gfp_mask, &memcg);
3865 if (!ret)
3866 __mem_cgroup_commit_charge_swapin(page, memcg, type);
3867 }
3868 return ret;
3869}
3870
3871static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3872 unsigned int nr_pages,
3873 const enum charge_type ctype)
3874{
3875 struct memcg_batch_info *batch = NULL;
3876 bool uncharge_memsw = true;
3877
3878
3879 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3880 uncharge_memsw = false;
3881
3882	batch = &current->memcg_batch;
3883
3884
3885
3886
3887
3888 if (!batch->memcg)
3889 batch->memcg = memcg;
3890
3891
3892
3893
3894
3895
3896
3897
3898 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
3899 goto direct_uncharge;
3900
3901 if (nr_pages > 1)
3902 goto direct_uncharge;
3903
3904
3905
3906
3907
3908
3909 if (batch->memcg != memcg)
3910 goto direct_uncharge;
3911
3912 batch->nr_pages++;
3913 if (uncharge_memsw)
3914 batch->memsw_nr_pages++;
3915 return;
3916direct_uncharge:
3917 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3918 if (uncharge_memsw)
3919 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3920 if (unlikely(batch->memcg != memcg))
3921 memcg_oom_recover(memcg);
3922}
3923
3924
3925
3926
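/*
 * Common uncharge path: clear the Used bit, update the statistics and,
 * unless this is the tail end of a migration or the memcg is root, give
 * the pages back to the res counters (batched through current->memcg_batch
 * when possible).  Returns the memcg that was charged, or NULL if the
 * page was not charged at all.
 */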
3927static struct mem_cgroup *
3928__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
3929 bool end_migration)
3930{
3931 struct mem_cgroup *memcg = NULL;
3932 unsigned int nr_pages = 1;
3933 struct page_cgroup *pc;
3934 bool anon;
3935
3936 if (mem_cgroup_disabled())
3937 return NULL;
3938
3939 VM_BUG_ON(PageSwapCache(page));
3940
3941 if (PageTransHuge(page)) {
3942 nr_pages <<= compound_order(page);
3943 VM_BUG_ON(!PageTransHuge(page));
3944 }
3945
3946
3947
3948 pc = lookup_page_cgroup(page);
3949 if (unlikely(!PageCgroupUsed(pc)))
3950 return NULL;
3951
3952 lock_page_cgroup(pc);
3953
3954 memcg = pc->mem_cgroup;
3955
3956 if (!PageCgroupUsed(pc))
3957 goto unlock_out;
3958
3959 anon = PageAnon(page);
3960
3961 switch (ctype) {
3962 case MEM_CGROUP_CHARGE_TYPE_ANON:
3963
3964
3965
3966
3967
3968 anon = true;
3969
3970 case MEM_CGROUP_CHARGE_TYPE_DROP:
3971
3972 if (page_mapped(page))
3973 goto unlock_out;
3974
3975
3976
3977
3978
3979
3980
3981 if (!end_migration && PageCgroupMigration(pc))
3982 goto unlock_out;
3983 break;
3984 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3985 if (!PageAnon(page)) {
3986 if (page->mapping && !page_is_file_cache(page))
3987 goto unlock_out;
3988 } else if (page_mapped(page))
3989 goto unlock_out;
3990 break;
3991 default:
3992 break;
3993 }
3994
3995 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
3996
3997 ClearPageCgroupUsed(pc);
3998
3999
4000
4001
4002
4003
4004
4005 unlock_page_cgroup(pc);
4006
4007
4008
4009
4010 memcg_check_events(memcg, page);
4011 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4012 mem_cgroup_swap_statistics(memcg, true);
4013 mem_cgroup_get(memcg);
4014 }
4015
4016
4017
4018
4019
4020 if (!end_migration && !mem_cgroup_is_root(memcg))
4021 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4022
4023 return memcg;
4024
4025unlock_out:
4026 unlock_page_cgroup(pc);
4027 return NULL;
4028}
4029
4030void mem_cgroup_uncharge_page(struct page *page)
4031{
4032
4033 if (page_mapped(page))
4034 return;
4035 VM_BUG_ON(page->mapping && !PageAnon(page));
4036 if (PageSwapCache(page))
4037 return;
4038 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4039}
4040
4041void mem_cgroup_uncharge_cache_page(struct page *page)
4042{
4043 VM_BUG_ON(page_mapped(page));
4044 VM_BUG_ON(page->mapping);
4045 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4046}
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056void mem_cgroup_uncharge_start(void)
4057{
4058 current->memcg_batch.do_batch++;
4059
4060 if (current->memcg_batch.do_batch == 1) {
4061 current->memcg_batch.memcg = NULL;
4062 current->memcg_batch.nr_pages = 0;
4063 current->memcg_batch.memsw_nr_pages = 0;
4064 }
4065}
4066
4067void mem_cgroup_uncharge_end(void)
4068{
4069	struct memcg_batch_info *batch = &current->memcg_batch;
4070
4071 if (!batch->do_batch)
4072 return;
4073
4074 batch->do_batch--;
4075 if (batch->do_batch)
4076 return;
4077
4078 if (!batch->memcg)
4079 return;
4080
4081
4082
4083
4084 if (batch->nr_pages)
4085 res_counter_uncharge(&batch->memcg->res,
4086 batch->nr_pages * PAGE_SIZE);
4087 if (batch->memsw_nr_pages)
4088 res_counter_uncharge(&batch->memcg->memsw,
4089 batch->memsw_nr_pages * PAGE_SIZE);
4090 memcg_oom_recover(batch->memcg);
4091
4092 batch->memcg = NULL;
4093}
4094
4095#ifdef CONFIG_SWAP
4096
4097
4098
4099
4100void
4101mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4102{
4103 struct mem_cgroup *memcg;
4104 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4105
4106 if (!swapout)
4107 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4108
4109 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4110
4111
4112
4113
4114
4115 if (do_swap_account && swapout && memcg)
4116 swap_cgroup_record(ent, css_id(&memcg->css));
4117}
4118#endif
4119
4120#ifdef CONFIG_MEMCG_SWAP
4121
4122
4123
4124
4125void mem_cgroup_uncharge_swap(swp_entry_t ent)
4126{
4127 struct mem_cgroup *memcg;
4128 unsigned short id;
4129
4130 if (!do_swap_account)
4131 return;
4132
4133 id = swap_cgroup_record(ent, 0);
4134 rcu_read_lock();
4135 memcg = mem_cgroup_lookup(id);
4136 if (memcg) {
4137
4138
4139
4140
4141 if (!mem_cgroup_is_root(memcg))
4142 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4143 mem_cgroup_swap_statistics(memcg, false);
4144 mem_cgroup_put(memcg);
4145 }
4146 rcu_read_unlock();
4147}
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
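/*
 * Hand the swap charge recorded for @entry over from @from to @to.  The
 * swap_cgroup record is switched with a cmpxchg; on success the swap
 * statistics follow the record and a reference on @to is taken so it
 * stays pinned while it owns the entry.
 */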
4163static int mem_cgroup_move_swap_account(swp_entry_t entry,
4164 struct mem_cgroup *from, struct mem_cgroup *to)
4165{
4166 unsigned short old_id, new_id;
4167
4168 old_id = css_id(&from->css);
4169 new_id = css_id(&to->css);
4170
4171 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4172 mem_cgroup_swap_statistics(from, false);
4173 mem_cgroup_swap_statistics(to, true);
4174
4175
4176
4177
4178
4179
4180
4181
4182 mem_cgroup_get(to);
4183 return 0;
4184 }
4185 return -EINVAL;
4186}
4187#else
4188static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4189 struct mem_cgroup *from, struct mem_cgroup *to)
4190{
4191 return -EINVAL;
4192}
4193#endif
4194
4195
4196
4197
4198
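/*
 * Prepare for page migration: pin the memcg that currently owns @page,
 * mark anonymous pages as under migration, and pre-commit an equivalent
 * charge to @newpage so the new page is accounted from the start.
 */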
4199void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4200 struct mem_cgroup **memcgp)
4201{
4202 struct mem_cgroup *memcg = NULL;
4203 unsigned int nr_pages = 1;
4204 struct page_cgroup *pc;
4205 enum charge_type ctype;
4206
4207 *memcgp = NULL;
4208
4209 if (mem_cgroup_disabled())
4210 return;
4211
4212 if (PageTransHuge(page))
4213 nr_pages <<= compound_order(page);
4214
4215 pc = lookup_page_cgroup(page);
4216 lock_page_cgroup(pc);
4217 if (PageCgroupUsed(pc)) {
4218 memcg = pc->mem_cgroup;
4219 css_get(&memcg->css);
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249 if (PageAnon(page))
4250 SetPageCgroupMigration(pc);
4251 }
4252 unlock_page_cgroup(pc);
4253
4254
4255
4256
4257 if (!memcg)
4258 return;
4259
4260 *memcgp = memcg;
4261
4262
4263
4264
4265
4266
4267 if (PageAnon(page))
4268 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4269 else
4270 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4271
4272
4273
4274
4275
4276 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4277}
4278
4279
4280void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4281 struct page *oldpage, struct page *newpage, bool migration_ok)
4282{
4283 struct page *used, *unused;
4284 struct page_cgroup *pc;
4285 bool anon;
4286
4287 if (!memcg)
4288 return;
4289
4290 if (!migration_ok) {
4291 used = oldpage;
4292 unused = newpage;
4293 } else {
4294 used = newpage;
4295 unused = oldpage;
4296 }
4297 anon = PageAnon(used);
4298 __mem_cgroup_uncharge_common(unused,
4299 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4300 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4301 true);
4302 css_put(&memcg->css);
4303
4304
4305
4306
4307
4308 pc = lookup_page_cgroup(oldpage);
4309 lock_page_cgroup(pc);
4310 ClearPageCgroupMigration(pc);
4311 unlock_page_cgroup(pc);
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321 if (anon)
4322 mem_cgroup_uncharge_page(used);
4323}
4324
4325
4326
4327
4328
4329
4330void mem_cgroup_replace_page_cache(struct page *oldpage,
4331 struct page *newpage)
4332{
4333 struct mem_cgroup *memcg = NULL;
4334 struct page_cgroup *pc;
4335 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4336
4337 if (mem_cgroup_disabled())
4338 return;
4339
4340 pc = lookup_page_cgroup(oldpage);
4341
4342 lock_page_cgroup(pc);
4343 if (PageCgroupUsed(pc)) {
4344 memcg = pc->mem_cgroup;
4345 mem_cgroup_charge_statistics(memcg, false, -1);
4346 ClearPageCgroupUsed(pc);
4347 }
4348 unlock_page_cgroup(pc);
4349
4350
4351
4352
4353
4354 if (!memcg)
4355 return;
4356
4357
4358
4359
4360
4361 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4362}
4363
4364#ifdef CONFIG_DEBUG_VM
4365static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4366{
4367 struct page_cgroup *pc;
4368
4369 pc = lookup_page_cgroup(page);
4370
4371
4372
4373
4374
4375 if (likely(pc) && PageCgroupUsed(pc))
4376 return pc;
4377 return NULL;
4378}
4379
4380bool mem_cgroup_bad_page_check(struct page *page)
4381{
4382 if (mem_cgroup_disabled())
4383 return false;
4384
4385 return lookup_page_cgroup_used(page) != NULL;
4386}
4387
4388void mem_cgroup_print_bad_page(struct page *page)
4389{
4390 struct page_cgroup *pc;
4391
4392 pc = lookup_page_cgroup_used(page);
4393 if (pc) {
4394 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4395 pc, pc->flags, pc->mem_cgroup);
4396 }
4397}
4398#endif
4399
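/*
 * Resize the memory limit.  The new limit may not exceed the memsw limit.
 * When shrinking, reclaim is retried until usage fits under the new value
 * or the retry budget (scaled by the number of children) runs out.
 */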
4400static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4401 unsigned long long val)
4402{
4403 int retry_count;
4404 u64 memswlimit, memlimit;
4405 int ret = 0;
4406 int children = mem_cgroup_count_children(memcg);
4407 u64 curusage, oldusage;
4408 int enlarge;
4409
4410
4411
4412
4413
4414
4415 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4416
4417 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4418
4419 enlarge = 0;
4420 while (retry_count) {
4421 if (signal_pending(current)) {
4422 ret = -EINTR;
4423 break;
4424 }
4425
4426
4427
4428
4429
4430 mutex_lock(&set_limit_mutex);
4431 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4432 if (memswlimit < val) {
4433 ret = -EINVAL;
4434 mutex_unlock(&set_limit_mutex);
4435 break;
4436 }
4437
4438 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4439 if (memlimit < val)
4440 enlarge = 1;
4441
4442 ret = res_counter_set_limit(&memcg->res, val);
4443 if (!ret) {
4444 if (memswlimit == val)
4445 memcg->memsw_is_minimum = true;
4446 else
4447 memcg->memsw_is_minimum = false;
4448 }
4449 mutex_unlock(&set_limit_mutex);
4450
4451 if (!ret)
4452 break;
4453
4454 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4455 MEM_CGROUP_RECLAIM_SHRINK);
4456 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4457
4458 if (curusage >= oldusage)
4459 retry_count--;
4460 else
4461 oldusage = curusage;
4462 }
4463 if (!ret && enlarge)
4464 memcg_oom_recover(memcg);
4465
4466 return ret;
4467}
4468
4469static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4470 unsigned long long val)
4471{
4472 int retry_count;
4473 u64 memlimit, memswlimit, oldusage, curusage;
4474 int children = mem_cgroup_count_children(memcg);
4475 int ret = -EBUSY;
4476 int enlarge = 0;
4477
4478
4479 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4480 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4481 while (retry_count) {
4482 if (signal_pending(current)) {
4483 ret = -EINTR;
4484 break;
4485 }
4486
4487
4488
4489
4490
4491 mutex_lock(&set_limit_mutex);
4492 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4493 if (memlimit > val) {
4494 ret = -EINVAL;
4495 mutex_unlock(&set_limit_mutex);
4496 break;
4497 }
4498 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4499 if (memswlimit < val)
4500 enlarge = 1;
4501 ret = res_counter_set_limit(&memcg->memsw, val);
4502 if (!ret) {
4503 if (memlimit == val)
4504 memcg->memsw_is_minimum = true;
4505 else
4506 memcg->memsw_is_minimum = false;
4507 }
4508 mutex_unlock(&set_limit_mutex);
4509
4510 if (!ret)
4511 break;
4512
4513 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4514 MEM_CGROUP_RECLAIM_NOSWAP |
4515 MEM_CGROUP_RECLAIM_SHRINK);
4516 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4517
4518 if (curusage >= oldusage)
4519 retry_count--;
4520 else
4521 oldusage = curusage;
4522 }
4523 if (!ret && enlarge)
4524 memcg_oom_recover(memcg);
4525 return ret;
4526}
4527
4528unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4529 gfp_t gfp_mask,
4530 unsigned long *total_scanned)
4531{
4532 unsigned long nr_reclaimed = 0;
4533 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4534 unsigned long reclaimed;
4535 int loop = 0;
4536 struct mem_cgroup_tree_per_zone *mctz;
4537 unsigned long long excess;
4538 unsigned long nr_scanned;
4539
4540 if (order > 0)
4541 return 0;
4542
4543 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4544
4545
4546
4547
4548
4549 do {
4550 if (next_mz)
4551 mz = next_mz;
4552 else
4553 mz = mem_cgroup_largest_soft_limit_node(mctz);
4554 if (!mz)
4555 break;
4556
4557 nr_scanned = 0;
4558 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4559 gfp_mask, &nr_scanned);
4560 nr_reclaimed += reclaimed;
4561 *total_scanned += nr_scanned;
4562 spin_lock(&mctz->lock);
4563
4564
4565
4566
4567
4568 next_mz = NULL;
4569 if (!reclaimed) {
4570 do {
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582 next_mz =
4583 __mem_cgroup_largest_soft_limit_node(mctz);
4584 if (next_mz == mz)
4585 css_put(&next_mz->memcg->css);
4586 else
4587 break;
4588 } while (1);
4589 }
4590 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4591 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4602 spin_unlock(&mctz->lock);
4603 css_put(&mz->memcg->css);
4604 loop++;
4605
4606
4607
4608
4609
4610 if (!nr_reclaimed &&
4611 (next_mz == NULL ||
4612 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4613 break;
4614 } while (!nr_reclaimed);
4615 if (next_mz)
4616 css_put(&next_mz->memcg->css);
4617 return nr_reclaimed;
4618}
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
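/*
 * Scan one LRU list of @memcg in the given node/zone and try to move the
 * charge of every page on it to the parent.  Pages that cannot be moved
 * right now are rotated and retried; the scan ends when the list is empty.
 */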
4631static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4632 int node, int zid, enum lru_list lru)
4633{
4634 struct lruvec *lruvec;
4635 unsigned long flags;
4636 struct list_head *list;
4637 struct page *busy;
4638 struct zone *zone;
4639
4640 zone = &NODE_DATA(node)->node_zones[zid];
4641 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4642 list = &lruvec->lists[lru];
4643
4644 busy = NULL;
4645 do {
4646 struct page_cgroup *pc;
4647 struct page *page;
4648
4649 spin_lock_irqsave(&zone->lru_lock, flags);
4650 if (list_empty(list)) {
4651 spin_unlock_irqrestore(&zone->lru_lock, flags);
4652 break;
4653 }
4654 page = list_entry(list->prev, struct page, lru);
4655 if (busy == page) {
4656 list_move(&page->lru, list);
4657 busy = NULL;
4658 spin_unlock_irqrestore(&zone->lru_lock, flags);
4659 continue;
4660 }
4661 spin_unlock_irqrestore(&zone->lru_lock, flags);
4662
4663 pc = lookup_page_cgroup(page);
4664
4665 if (mem_cgroup_move_parent(page, pc, memcg)) {
4666
4667 busy = page;
4668 cond_resched();
4669 } else
4670 busy = NULL;
4671 } while (!list_empty(list));
4672}
4673
4674
4675
4676
4677
4678
4679
4680
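/*
 * Reparent all charges of @memcg: drain per-cpu stocks and walk every LRU
 * list, moving each page's charge to the parent.  Repeat until only kmem
 * usage remains, since kmem charges are not reparented here.
 */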
4681static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4682{
4683 int node, zid;
4684 u64 usage;
4685
4686 do {
4687
4688 lru_add_drain_all();
4689 drain_all_stock_sync(memcg);
4690 mem_cgroup_start_move(memcg);
4691 for_each_node_state(node, N_MEMORY) {
4692 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4693 enum lru_list lru;
4694 for_each_lru(lru) {
4695 mem_cgroup_force_empty_list(memcg,
4696 node, zid, lru);
4697 }
4698 }
4699 }
4700 mem_cgroup_end_move(memcg);
4701 memcg_oom_recover(memcg);
4702 cond_resched();
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4717 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4718 } while (usage > 0);
4719}
4720
4721
4722
4723
4724
4725
4726
4727static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4728{
4729 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4730 struct cgroup *cgrp = memcg->css.cgroup;
4731
4732
4733 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4734 return -EBUSY;
4735
4736
4737 lru_add_drain_all();
4738
4739 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4740 int progress;
4741
4742 if (signal_pending(current))
4743 return -EINTR;
4744
4745 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4746 false);
4747 if (!progress) {
4748 nr_retries--;
4749
4750 congestion_wait(BLK_RW_ASYNC, HZ/10);
4751 }
4752
4753 }
4754 lru_add_drain();
4755 mem_cgroup_reparent_charges(memcg);
4756
4757 return 0;
4758}
4759
4760static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
4761{
4762 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4763 int ret;
4764
4765 if (mem_cgroup_is_root(memcg))
4766 return -EINVAL;
4767 css_get(&memcg->css);
4768 ret = mem_cgroup_force_empty(memcg);
4769 css_put(&memcg->css);
4770
4771 return ret;
4772}
4773
4774
4775static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
4776{
4777 return mem_cgroup_from_cont(cont)->use_hierarchy;
4778}
4779
4780static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4781 u64 val)
4782{
4783 int retval = 0;
4784 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4785 struct cgroup *parent = cont->parent;
4786 struct mem_cgroup *parent_memcg = NULL;
4787
4788 if (parent)
4789 parent_memcg = mem_cgroup_from_cont(parent);
4790
4791 cgroup_lock();
4792
4793 if (memcg->use_hierarchy == val)
4794 goto out;
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4805 (val == 1 || val == 0)) {
4806 if (list_empty(&cont->children))
4807 memcg->use_hierarchy = val;
4808 else
4809 retval = -EBUSY;
4810 } else
4811 retval = -EINVAL;
4812
4813out:
4814 cgroup_unlock();
4815
4816 return retval;
4817}
4818
4819
4820static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4821 enum mem_cgroup_stat_index idx)
4822{
4823 struct mem_cgroup *iter;
4824 long val = 0;
4825
4826
4827 for_each_mem_cgroup_tree(iter, memcg)
4828 val += mem_cgroup_read_stat(iter, idx);
4829
4830 if (val < 0)
4831 val = 0;
4832 return val;
4833}
4834
4835static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4836{
4837 u64 val;
4838
4839 if (!mem_cgroup_is_root(memcg)) {
4840 if (!swap)
4841 return res_counter_read_u64(&memcg->res, RES_USAGE);
4842 else
4843 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4844 }
4845
4846 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4847 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4848
4849 if (swap)
4850 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4851
4852 return val << PAGE_SHIFT;
4853}
4854
4855static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
4856 struct file *file, char __user *buf,
4857 size_t nbytes, loff_t *ppos)
4858{
4859 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4860 char str[64];
4861 u64 val;
4862 int name, len;
4863 enum res_type type;
4864
4865 type = MEMFILE_TYPE(cft->private);
4866 name = MEMFILE_ATTR(cft->private);
4867
4868 if (!do_swap_account && type == _MEMSWAP)
4869 return -EOPNOTSUPP;
4870
4871 switch (type) {
4872 case _MEM:
4873 if (name == RES_USAGE)
4874 val = mem_cgroup_usage(memcg, false);
4875 else
4876 val = res_counter_read_u64(&memcg->res, name);
4877 break;
4878 case _MEMSWAP:
4879 if (name == RES_USAGE)
4880 val = mem_cgroup_usage(memcg, true);
4881 else
4882 val = res_counter_read_u64(&memcg->memsw, name);
4883 break;
4884 case _KMEM:
4885 val = res_counter_read_u64(&memcg->kmem, name);
4886 break;
4887 default:
4888 BUG();
4889 }
4890
4891 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
4892 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
4893}
4894
4895static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4896{
4897 int ret = -EINVAL;
4898#ifdef CONFIG_MEMCG_KMEM
4899 bool must_inc_static_branch = false;
4900
4901 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920 cgroup_lock();
4921 mutex_lock(&set_limit_mutex);
4922 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4923 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4924 !list_empty(&cont->children))) {
4925 ret = -EBUSY;
4926 goto out;
4927 }
4928 ret = res_counter_set_limit(&memcg->kmem, val);
4929 VM_BUG_ON(ret);
4930
4931 ret = memcg_update_cache_sizes(memcg);
4932 if (ret) {
4933 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4934 goto out;
4935 }
4936 must_inc_static_branch = true;
4937
4938
4939
4940
4941
4942
4943 mem_cgroup_get(memcg);
4944 } else
4945 ret = res_counter_set_limit(&memcg->kmem, val);
4946out:
4947 mutex_unlock(&set_limit_mutex);
4948 cgroup_unlock();
4949
4950
4951
4952