linux/mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * Memory thresholds
  10 * Copyright (C) 2009 Nokia Corporation
  11 * Author: Kirill A. Shutemov
  12 *
  13 * Kernel Memory Controller
  14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15 * Authors: Glauber Costa and Suleiman Souhlal
  16 *
  17 * This program is free software; you can redistribute it and/or modify
  18 * it under the terms of the GNU General Public License as published by
  19 * the Free Software Foundation; either version 2 of the License, or
  20 * (at your option) any later version.
  21 *
  22 * This program is distributed in the hope that it will be useful,
  23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 * GNU General Public License for more details.
  26 */
  27
  28#include <linux/res_counter.h>
  29#include <linux/memcontrol.h>
  30#include <linux/cgroup.h>
  31#include <linux/mm.h>
  32#include <linux/hugetlb.h>
  33#include <linux/pagemap.h>
  34#include <linux/smp.h>
  35#include <linux/page-flags.h>
  36#include <linux/backing-dev.h>
  37#include <linux/bit_spinlock.h>
  38#include <linux/rcupdate.h>
  39#include <linux/limits.h>
  40#include <linux/export.h>
  41#include <linux/mutex.h>
  42#include <linux/rbtree.h>
  43#include <linux/slab.h>
  44#include <linux/swap.h>
  45#include <linux/swapops.h>
  46#include <linux/spinlock.h>
  47#include <linux/eventfd.h>
  48#include <linux/sort.h>
  49#include <linux/fs.h>
  50#include <linux/seq_file.h>
  51#include <linux/vmalloc.h>
  52#include <linux/vmpressure.h>
  53#include <linux/mm_inline.h>
  54#include <linux/page_cgroup.h>
  55#include <linux/cpu.h>
  56#include <linux/oom.h>
  57#include "internal.h"
  58#include <net/sock.h>
  59#include <net/ip.h>
  60#include <net/tcp_memcontrol.h>
  61
  62#include <asm/uaccess.h>
  63
  64#include <trace/events/vmscan.h>
  65
  66struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  67EXPORT_SYMBOL(mem_cgroup_subsys);
  68
  69#define MEM_CGROUP_RECLAIM_RETRIES      5
  70static struct mem_cgroup *root_mem_cgroup __read_mostly;
  71
  72#ifdef CONFIG_MEMCG_SWAP
  73/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
  74int do_swap_account __read_mostly;
  75
   76/* For remembering the boot option */
  77#ifdef CONFIG_MEMCG_SWAP_ENABLED
  78static int really_do_swap_account __initdata = 1;
  79#else
  80static int really_do_swap_account __initdata = 0;
  81#endif
  82
  83#else
  84#define do_swap_account         0
  85#endif
  86
  87
  88/*
  89 * Statistics for memory cgroup.
  90 */
  91enum mem_cgroup_stat_index {
  92        /*
  93         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  94         */
  95        MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
  96        MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
  97        MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
  98        MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
  99        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
 100        MEM_CGROUP_STAT_NSTATS,
 101};
 102
 103static const char * const mem_cgroup_stat_names[] = {
 104        "cache",
 105        "rss",
 106        "rss_huge",
 107        "mapped_file",
 108        "swap",
 109};
 110
 111enum mem_cgroup_events_index {
 112        MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
 113        MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
 114        MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
 115        MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
 116        MEM_CGROUP_EVENTS_NSTATS,
 117};
 118
 119static const char * const mem_cgroup_events_names[] = {
 120        "pgpgin",
 121        "pgpgout",
 122        "pgfault",
 123        "pgmajfault",
 124};
 125
 126static const char * const mem_cgroup_lru_names[] = {
 127        "inactive_anon",
 128        "active_anon",
 129        "inactive_file",
 130        "active_file",
 131        "unevictable",
 132};
 133
 134/*
  135 * The per-memcg event counter is incremented on every pagein/pageout. With
  136 * THP, it is incremented by the number of pages. This counter is used to
  137 * trigger periodic events; it is simpler and cheaper than using jiffies
  138 * etc. to drive periodic memcg events.
 139 */
 140enum mem_cgroup_events_target {
 141        MEM_CGROUP_TARGET_THRESH,
 142        MEM_CGROUP_TARGET_SOFTLIMIT,
 143        MEM_CGROUP_TARGET_NUMAINFO,
 144        MEM_CGROUP_NTARGETS,
 145};
 146#define THRESHOLDS_EVENTS_TARGET 128
 147#define SOFTLIMIT_EVENTS_TARGET 1024
 148#define NUMAINFO_EVENTS_TARGET  1024
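
/*
 * Illustrative example (using the values defined above): with
 * THRESHOLDS_EVENTS_TARGET at 128, mem_cgroup_event_ratelimit() lets a
 * threshold check through roughly once per 128 charged/uncharged pages,
 * while soft-limit tree updates and NUMA-info refreshes fire only about
 * once per 1024 page events, keeping the common charge path cheap.
 */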
 149
 150struct mem_cgroup_stat_cpu {
 151        long count[MEM_CGROUP_STAT_NSTATS];
 152        unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
 153        unsigned long nr_page_events;
 154        unsigned long targets[MEM_CGROUP_NTARGETS];
 155};
 156
 157struct mem_cgroup_reclaim_iter {
 158        /*
 159         * last scanned hierarchy member. Valid only if last_dead_count
 160         * matches memcg->dead_count of the hierarchy root group.
 161         */
 162        struct mem_cgroup *last_visited;
 163        unsigned long last_dead_count;
 164
 165        /* scan generation, increased every round-trip */
 166        unsigned int generation;
 167};
 168
 169/*
 170 * per-zone information in memory controller.
 171 */
 172struct mem_cgroup_per_zone {
 173        struct lruvec           lruvec;
 174        unsigned long           lru_size[NR_LRU_LISTS];
 175
 176        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 177
 178        struct rb_node          tree_node;      /* RB tree node */
 179        unsigned long long      usage_in_excess;/* Set to the value by which */
 180                                                /* the soft limit is exceeded*/
 181        bool                    on_tree;
 182        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
 183                                                /* use container_of        */
 184};
 185
 186struct mem_cgroup_per_node {
 187        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 188};
 189
 190/*
 191 * Cgroups above their limits are maintained in a RB-Tree, independent of
 192 * their hierarchy representation
 193 */
 194
 195struct mem_cgroup_tree_per_zone {
 196        struct rb_root rb_root;
 197        spinlock_t lock;
 198};
 199
 200struct mem_cgroup_tree_per_node {
 201        struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
 202};
 203
 204struct mem_cgroup_tree {
 205        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 206};
 207
 208static struct mem_cgroup_tree soft_limit_tree __read_mostly;
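
/*
 * Illustrative note: each per-zone tree above is keyed by usage_in_excess
 * (see __mem_cgroup_insert_exceeded() below), so rb_last() in
 * __mem_cgroup_largest_soft_limit_node() returns the memcg currently
 * exceeding its soft limit by the largest amount.
 */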
 209
 210struct mem_cgroup_threshold {
 211        struct eventfd_ctx *eventfd;
 212        u64 threshold;
 213};
 214
 215/* For threshold */
 216struct mem_cgroup_threshold_ary {
 217        /* An array index points to threshold just below or equal to usage. */
 218        int current_threshold;
 219        /* Size of entries[] */
 220        unsigned int size;
 221        /* Array of thresholds */
 222        struct mem_cgroup_threshold entries[0];
 223};
 224
 225struct mem_cgroup_thresholds {
 226        /* Primary thresholds array */
 227        struct mem_cgroup_threshold_ary *primary;
 228        /*
 229         * Spare threshold array.
 230         * This is needed to make mem_cgroup_unregister_event() "never fail".
 231         * It must be able to store at least primary->size - 1 entries.
 232         */
 233        struct mem_cgroup_threshold_ary *spare;
 234};
 235
 236/* for OOM */
 237struct mem_cgroup_eventfd_list {
 238        struct list_head list;
 239        struct eventfd_ctx *eventfd;
 240};
 241
 242static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 243static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 244
 245/*
 246 * The memory controller data structure. The memory controller controls both
 247 * page cache and RSS per cgroup. We would eventually like to provide
 248 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 249 * to help the administrator determine what knobs to tune.
 250 *
 251 * TODO: Add a water mark for the memory controller. Reclaim will begin when
  252 * we hit the water mark. Maybe even add a low water mark, such that
  253 * no reclaim occurs from a cgroup at its low water mark; this is
 254 * a feature that will be implemented much later in the future.
 255 */
 256struct mem_cgroup {
 257        struct cgroup_subsys_state css;
 258        /*
 259         * the counter to account for memory usage
 260         */
 261        struct res_counter res;
 262
 263        /* vmpressure notifications */
 264        struct vmpressure vmpressure;
 265
 266        /*
 267         * the counter to account for mem+swap usage.
 268         */
 269        struct res_counter memsw;
 270
 271        /*
 272         * the counter to account for kernel memory usage.
 273         */
 274        struct res_counter kmem;
 275        /*
 276         * Should the accounting and control be hierarchical, per subtree?
 277         */
 278        bool use_hierarchy;
 279        unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 280
 281        bool            oom_lock;
 282        atomic_t        under_oom;
 283
 284        int     swappiness;
 285        /* OOM-Killer disable */
 286        int             oom_kill_disable;
 287
 288        /* set when res.limit == memsw.limit */
 289        bool            memsw_is_minimum;
 290
 291        /* protect arrays of thresholds */
 292        struct mutex thresholds_lock;
 293
 294        /* thresholds for memory usage. RCU-protected */
 295        struct mem_cgroup_thresholds thresholds;
 296
 297        /* thresholds for mem+swap usage. RCU-protected */
 298        struct mem_cgroup_thresholds memsw_thresholds;
 299
 300        /* For oom notifier event fd */
 301        struct list_head oom_notify;
 302
 303        /*
 304         * Should we move charges of a task when a task is moved into this
 305         * mem_cgroup ? And what type of charges should we move ?
 306         */
 307        unsigned long   move_charge_at_immigrate;
 308        /*
 309         * set > 0 if pages under this cgroup are moving to other cgroup.
 310         */
 311        atomic_t        moving_account;
 312        /* taken only while moving_account > 0 */
 313        spinlock_t      move_lock;
 314        /*
 315         * percpu counter.
 316         */
 317        struct mem_cgroup_stat_cpu __percpu *stat;
 318        /*
 319         * used when a cpu is offlined or other synchronizations
 320         * See mem_cgroup_read_stat().
 321         */
 322        struct mem_cgroup_stat_cpu nocpu_base;
 323        spinlock_t pcp_counter_lock;
 324
 325        atomic_t        dead_count;
 326#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 327        struct tcp_memcontrol tcp_mem;
 328#endif
 329#if defined(CONFIG_MEMCG_KMEM)
 330        /* analogous to slab_common's slab_caches list. per-memcg */
 331        struct list_head memcg_slab_caches;
 332        /* Not a spinlock, we can take a lot of time walking the list */
 333        struct mutex slab_caches_mutex;
 334        /* Index in the kmem_cache->memcg_params->memcg_caches array */
 335        int kmemcg_id;
 336#endif
 337
 338        int last_scanned_node;
 339#if MAX_NUMNODES > 1
 340        nodemask_t      scan_nodes;
 341        atomic_t        numainfo_events;
 342        atomic_t        numainfo_updating;
 343#endif
 344
 345        struct mem_cgroup_per_node *nodeinfo[0];
 346        /* WARNING: nodeinfo must be the last member here */
 347};
 348
 349static size_t memcg_size(void)
 350{
 351        return sizeof(struct mem_cgroup) +
 352                nr_node_ids * sizeof(struct mem_cgroup_per_node);
 353}
 354
  355/* internal-only representation of the status of kmem accounting. */
 356enum {
 357        KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
 358        KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
 359        KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
 360};
 361
 362/* We account when limit is on, but only after call sites are patched */
 363#define KMEM_ACCOUNTED_MASK \
 364                ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
 365
 366#ifdef CONFIG_MEMCG_KMEM
 367static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
 368{
 369        set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 370}
 371
 372static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 373{
 374        return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 375}
 376
 377static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 378{
 379        set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 380}
 381
 382static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 383{
 384        clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 385}
 386
 387static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 388{
 389        /*
 390         * Our caller must use css_get() first, because memcg_uncharge_kmem()
 391         * will call css_put() if it sees the memcg is dead.
 392         */
 393        smp_wmb();
 394        if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
 395                set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 396}
 397
 398static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
 399{
 400        return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
 401                                  &memcg->kmem_account_flags);
 402}
 403#endif
 404
 405/* Stuffs for move charges at task migration. */
 406/*
  407 * Types of charges to be moved. "move_charge_at_immigrate" and
 408 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 409 */
 410enum move_type {
 411        MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
 412        MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
 413        NR_MOVE_TYPE,
 414};
 415
 416/* "mc" and its members are protected by cgroup_mutex */
 417static struct move_charge_struct {
 418        spinlock_t        lock; /* for from, to */
 419        struct mem_cgroup *from;
 420        struct mem_cgroup *to;
 421        unsigned long immigrate_flags;
 422        unsigned long precharge;
 423        unsigned long moved_charge;
 424        unsigned long moved_swap;
 425        struct task_struct *moving_task;        /* a task moving charges */
 426        wait_queue_head_t waitq;                /* a waitq for other context */
 427} mc = {
 428        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 429        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 430};
 431
 432static bool move_anon(void)
 433{
 434        return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
 435}
 436
 437static bool move_file(void)
 438{
 439        return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
 440}
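
/*
 * Illustrative sketch of how the move_type bits are expected to reach us
 * from user space (the exact interface lives outside this excerpt):
 *
 *	echo 1 > memory.move_charge_at_immigrate	(move anon charges)
 *	echo 2 > memory.move_charge_at_immigrate	(move file charges)
 *	echo 3 > memory.move_charge_at_immigrate	(move both)
 *
 * After the flags are copied into mc.immigrate_flags, move_anon() and
 * move_file() simply test bits MOVE_CHARGE_TYPE_ANON and
 * MOVE_CHARGE_TYPE_FILE respectively.
 */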
 441
 442/*
 443 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 444 * limit reclaim to prevent infinite loops, if they ever occur.
 445 */
 446#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 447#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 448
 449enum charge_type {
 450        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 451        MEM_CGROUP_CHARGE_TYPE_ANON,
 452        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 453        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 454        NR_CHARGE_TYPE,
 455};
 456
 457/* for encoding cft->private value on file */
 458enum res_type {
 459        _MEM,
 460        _MEMSWAP,
 461        _OOM_TYPE,
 462        _KMEM,
 463};
 464
 465#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 466#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 467#define MEMFILE_ATTR(val)       ((val) & 0xffff)
  468/* Used for OOM notifier */
 469#define OOM_CONTROL             (0)
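
/*
 * Illustrative example of the encoding above: a cftype for the mem+swap
 * limit can set .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT); its
 * handlers then recover MEMFILE_TYPE(cft->private) == _MEMSWAP and
 * MEMFILE_ATTR(cft->private) == RES_LIMIT (RES_LIMIT comes from
 * <linux/res_counter.h>).
 */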
 470
 471/*
 472 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 473 */
 474#define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
 475#define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 476#define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
 477#define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 478
 479/*
 480 * The memcg_create_mutex will be held whenever a new cgroup is created.
 481 * As a consequence, any change that needs to protect against new child cgroups
 482 * appearing has to hold it as well.
 483 */
 484static DEFINE_MUTEX(memcg_create_mutex);
 485
 486static inline
 487struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 488{
 489        return container_of(s, struct mem_cgroup, css);
 490}
 491
 492/* Some nice accessors for the vmpressure. */
 493struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 494{
 495        if (!memcg)
 496                memcg = root_mem_cgroup;
 497        return &memcg->vmpressure;
 498}
 499
 500struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 501{
 502        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 503}
 504
 505struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
 506{
 507        return &mem_cgroup_from_css(css)->vmpressure;
 508}
 509
 510static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 511{
 512        return (memcg == root_mem_cgroup);
 513}
 514
 515/* Writing them here to avoid exposing memcg's inner layout */
 516#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 517
 518void sock_update_memcg(struct sock *sk)
 519{
 520        if (mem_cgroup_sockets_enabled) {
 521                struct mem_cgroup *memcg;
 522                struct cg_proto *cg_proto;
 523
 524                BUG_ON(!sk->sk_prot->proto_cgroup);
 525
 526                /* Socket cloning can throw us here with sk_cgrp already
  527                 * filled. It won't, however, necessarily happen from
 528                 * process context. So the test for root memcg given
 529                 * the current task's memcg won't help us in this case.
 530                 *
 531                 * Respecting the original socket's memcg is a better
 532                 * decision in this case.
 533                 */
 534                if (sk->sk_cgrp) {
 535                        BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
 536                        css_get(&sk->sk_cgrp->memcg->css);
 537                        return;
 538                }
 539
 540                rcu_read_lock();
 541                memcg = mem_cgroup_from_task(current);
 542                cg_proto = sk->sk_prot->proto_cgroup(memcg);
 543                if (!mem_cgroup_is_root(memcg) &&
 544                    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
 545                        sk->sk_cgrp = cg_proto;
 546                }
 547                rcu_read_unlock();
 548        }
 549}
 550EXPORT_SYMBOL(sock_update_memcg);
 551
 552void sock_release_memcg(struct sock *sk)
 553{
 554        if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
 555                struct mem_cgroup *memcg;
 556                WARN_ON(!sk->sk_cgrp->memcg);
 557                memcg = sk->sk_cgrp->memcg;
 558                css_put(&sk->sk_cgrp->memcg->css);
 559        }
 560}
 561
 562struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 563{
 564        if (!memcg || mem_cgroup_is_root(memcg))
 565                return NULL;
 566
 567        return &memcg->tcp_mem.cg_proto;
 568}
 569EXPORT_SYMBOL(tcp_proto_cgroup);
 570
 571static void disarm_sock_keys(struct mem_cgroup *memcg)
 572{
 573        if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
 574                return;
 575        static_key_slow_dec(&memcg_socket_limit_enabled);
 576}
 577#else
 578static void disarm_sock_keys(struct mem_cgroup *memcg)
 579{
 580}
 581#endif
 582
 583#ifdef CONFIG_MEMCG_KMEM
 584/*
 585 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
 586 * There are two main reasons for not using the css_id for this:
 587 *  1) this works better in sparse environments, where we have a lot of memcgs,
  588 *     but only a few kmem-limited. If, for instance, we have 200 memcgs
  589 *     and none but the 200th is kmem-limited, we'd need a 200-entry
  590 *     array just for that one group.
 591 *
 592 *  2) In order not to violate the cgroup API, we would like to do all memory
 593 *     allocation in ->create(). At that point, we haven't yet allocated the
 594 *     css_id. Having a separate index prevents us from messing with the cgroup
  595 *     core for this.
 596 *
 597 * The current size of the caches array is stored in
 598 * memcg_limited_groups_array_size.  It will double each time we have to
 599 * increase it.
 600 */
 601static DEFINE_IDA(kmem_limited_groups);
 602int memcg_limited_groups_array_size;
 603
 604/*
  605 * MIN_SIZE is different from 1 because we would like to avoid going through
 606 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 607 * cgroups is a reasonable guess. In the future, it could be a parameter or
 608 * tunable, but that is strictly not necessary.
 609 *
 610 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
 611 * this constant directly from cgroup, but it is understandable that this is
 612 * better kept as an internal representation in cgroup.c. In any case, the
 613 * css_id space is not getting any smaller, and we don't have to necessarily
 614 * increase ours as well if it increases.
 615 */
 616#define MEMCG_CACHES_MIN_SIZE 4
 617#define MEMCG_CACHES_MAX_SIZE 65535
 618
 619/*
 620 * A lot of the calls to the cache allocation functions are expected to be
 621 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  622 * conditional on this static branch, we have to allow modules that do
  623 * kmem_cache_alloc and the like to see this symbol as well.
 624 */
 625struct static_key memcg_kmem_enabled_key;
 626EXPORT_SYMBOL(memcg_kmem_enabled_key);
 627
 628static void disarm_kmem_keys(struct mem_cgroup *memcg)
 629{
 630        if (memcg_kmem_is_active(memcg)) {
 631                static_key_slow_dec(&memcg_kmem_enabled_key);
 632                ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
 633        }
 634        /*
  635         * This check can't live in the kmem destruction function,
 636         * since the charges will outlive the cgroup
 637         */
 638        WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
 639}
 640#else
 641static void disarm_kmem_keys(struct mem_cgroup *memcg)
 642{
 643}
 644#endif /* CONFIG_MEMCG_KMEM */
 645
 646static void disarm_static_keys(struct mem_cgroup *memcg)
 647{
 648        disarm_sock_keys(memcg);
 649        disarm_kmem_keys(memcg);
 650}
 651
 652static void drain_all_stock_async(struct mem_cgroup *memcg);
 653
 654static struct mem_cgroup_per_zone *
 655mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 656{
 657        VM_BUG_ON((unsigned)nid >= nr_node_ids);
 658        return &memcg->nodeinfo[nid]->zoneinfo[zid];
 659}
 660
 661struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 662{
 663        return &memcg->css;
 664}
 665
 666static struct mem_cgroup_per_zone *
 667page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 668{
 669        int nid = page_to_nid(page);
 670        int zid = page_zonenum(page);
 671
 672        return mem_cgroup_zoneinfo(memcg, nid, zid);
 673}
 674
 675static struct mem_cgroup_tree_per_zone *
 676soft_limit_tree_node_zone(int nid, int zid)
 677{
 678        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 679}
 680
 681static struct mem_cgroup_tree_per_zone *
 682soft_limit_tree_from_page(struct page *page)
 683{
 684        int nid = page_to_nid(page);
 685        int zid = page_zonenum(page);
 686
 687        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 688}
 689
 690static void
 691__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
 692                                struct mem_cgroup_per_zone *mz,
 693                                struct mem_cgroup_tree_per_zone *mctz,
 694                                unsigned long long new_usage_in_excess)
 695{
 696        struct rb_node **p = &mctz->rb_root.rb_node;
 697        struct rb_node *parent = NULL;
 698        struct mem_cgroup_per_zone *mz_node;
 699
 700        if (mz->on_tree)
 701                return;
 702
 703        mz->usage_in_excess = new_usage_in_excess;
 704        if (!mz->usage_in_excess)
 705                return;
 706        while (*p) {
 707                parent = *p;
 708                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
 709                                        tree_node);
 710                if (mz->usage_in_excess < mz_node->usage_in_excess)
 711                        p = &(*p)->rb_left;
 712                /*
 713                 * We can't avoid mem cgroups that are over their soft
 714                 * limit by the same amount
 715                 */
 716                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 717                        p = &(*p)->rb_right;
 718        }
 719        rb_link_node(&mz->tree_node, parent, p);
 720        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 721        mz->on_tree = true;
 722}
 723
 724static void
 725__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 726                                struct mem_cgroup_per_zone *mz,
 727                                struct mem_cgroup_tree_per_zone *mctz)
 728{
 729        if (!mz->on_tree)
 730                return;
 731        rb_erase(&mz->tree_node, &mctz->rb_root);
 732        mz->on_tree = false;
 733}
 734
 735static void
 736mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 737                                struct mem_cgroup_per_zone *mz,
 738                                struct mem_cgroup_tree_per_zone *mctz)
 739{
 740        spin_lock(&mctz->lock);
 741        __mem_cgroup_remove_exceeded(memcg, mz, mctz);
 742        spin_unlock(&mctz->lock);
 743}
 744
 745
 746static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 747{
 748        unsigned long long excess;
 749        struct mem_cgroup_per_zone *mz;
 750        struct mem_cgroup_tree_per_zone *mctz;
 751        int nid = page_to_nid(page);
 752        int zid = page_zonenum(page);
 753        mctz = soft_limit_tree_from_page(page);
 754
 755        /*
  756         * Necessary to update all ancestors when hierarchy is used,
 757         * because their event counter is not touched.
 758         */
 759        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 760                mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 761                excess = res_counter_soft_limit_excess(&memcg->res);
 762                /*
 763                 * We have to update the tree if mz is on RB-tree or
 764                 * mem is over its softlimit.
 765                 */
 766                if (excess || mz->on_tree) {
 767                        spin_lock(&mctz->lock);
 768                        /* if on-tree, remove it */
 769                        if (mz->on_tree)
 770                                __mem_cgroup_remove_exceeded(memcg, mz, mctz);
 771                        /*
 772                         * Insert again. mz->usage_in_excess will be updated.
 773                         * If excess is 0, no tree ops.
 774                         */
 775                        __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
 776                        spin_unlock(&mctz->lock);
 777                }
 778        }
 779}
 780
 781static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 782{
 783        int node, zone;
 784        struct mem_cgroup_per_zone *mz;
 785        struct mem_cgroup_tree_per_zone *mctz;
 786
 787        for_each_node(node) {
 788                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 789                        mz = mem_cgroup_zoneinfo(memcg, node, zone);
 790                        mctz = soft_limit_tree_node_zone(node, zone);
 791                        mem_cgroup_remove_exceeded(memcg, mz, mctz);
 792                }
 793        }
 794}
 795
 796static struct mem_cgroup_per_zone *
 797__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 798{
 799        struct rb_node *rightmost = NULL;
 800        struct mem_cgroup_per_zone *mz;
 801
 802retry:
 803        mz = NULL;
 804        rightmost = rb_last(&mctz->rb_root);
 805        if (!rightmost)
 806                goto done;              /* Nothing to reclaim from */
 807
 808        mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
 809        /*
 810         * Remove the node now but someone else can add it back,
  811         * we will add it back at the end of reclaim to its correct
 812         * position in the tree.
 813         */
 814        __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
 815        if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
 816                !css_tryget(&mz->memcg->css))
 817                goto retry;
 818done:
 819        return mz;
 820}
 821
 822static struct mem_cgroup_per_zone *
 823mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 824{
 825        struct mem_cgroup_per_zone *mz;
 826
 827        spin_lock(&mctz->lock);
 828        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 829        spin_unlock(&mctz->lock);
 830        return mz;
 831}
 832
 833/*
 834 * Implementation Note: reading percpu statistics for memcg.
 835 *
  836 * Both vmstat[] and percpu_counter use a threshold and periodic
  837 * synchronization to implement a "quick" read. There is a trade-off between
  838 * the cost of reading and the precision of the value, so we may eventually
  839 * implement a periodic synchronization of the memcg counters as well.
  840 *
  841 * But this _read() function is currently used for the user interface. Users
  842 * account memory per memory cgroup and _always_ require an exact value. Even
  843 * if we provided a quick-and-fuzzy read, we would still have to visit all
  844 * online cpus and sum their counters. So, for now, the extra
  845 * synchronization is not implemented (except for cpu hotplug).
  846 *
  847 * If kernel-internal users that can tolerate an inexact value appear, and
  848 * reading all per-cpu values becomes a performance bottleneck in some
  849 * common workload, a threshold-and-synchronization scheme like vmstat[]
  850 * should be implemented.
 851 */
 852static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 853                                 enum mem_cgroup_stat_index idx)
 854{
 855        long val = 0;
 856        int cpu;
 857
 858        get_online_cpus();
 859        for_each_online_cpu(cpu)
 860                val += per_cpu(memcg->stat->count[idx], cpu);
 861#ifdef CONFIG_HOTPLUG_CPU
 862        spin_lock(&memcg->pcp_counter_lock);
 863        val += memcg->nocpu_base.count[idx];
 864        spin_unlock(&memcg->pcp_counter_lock);
 865#endif
 866        put_online_cpus();
 867        return val;
 868}
 869
 870static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 871                                         bool charge)
 872{
 873        int val = (charge) ? 1 : -1;
 874        this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
 875}
 876
 877static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 878                                            enum mem_cgroup_events_index idx)
 879{
 880        unsigned long val = 0;
 881        int cpu;
 882
 883        for_each_online_cpu(cpu)
 884                val += per_cpu(memcg->stat->events[idx], cpu);
 885#ifdef CONFIG_HOTPLUG_CPU
 886        spin_lock(&memcg->pcp_counter_lock);
 887        val += memcg->nocpu_base.events[idx];
 888        spin_unlock(&memcg->pcp_counter_lock);
 889#endif
 890        return val;
 891}
 892
 893static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 894                                         struct page *page,
 895                                         bool anon, int nr_pages)
 896{
 897        preempt_disable();
 898
 899        /*
 900         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 901         * counted as CACHE even if it's on ANON LRU.
 902         */
 903        if (anon)
 904                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 905                                nr_pages);
 906        else
 907                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 908                                nr_pages);
 909
 910        if (PageTransHuge(page))
 911                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 912                                nr_pages);
 913
 914        /* pagein of a big page is an event. So, ignore page size */
 915        if (nr_pages > 0)
 916                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 917        else {
 918                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 919                nr_pages = -nr_pages; /* for event */
 920        }
 921
 922        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 923
 924        preempt_enable();
 925}
 926
 927unsigned long
 928mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 929{
 930        struct mem_cgroup_per_zone *mz;
 931
 932        mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 933        return mz->lru_size[lru];
 934}
 935
 936static unsigned long
 937mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 938                        unsigned int lru_mask)
 939{
 940        struct mem_cgroup_per_zone *mz;
 941        enum lru_list lru;
 942        unsigned long ret = 0;
 943
 944        mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 945
 946        for_each_lru(lru) {
 947                if (BIT(lru) & lru_mask)
 948                        ret += mz->lru_size[lru];
 949        }
 950        return ret;
 951}
 952
 953static unsigned long
 954mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 955                        int nid, unsigned int lru_mask)
 956{
 957        u64 total = 0;
 958        int zid;
 959
 960        for (zid = 0; zid < MAX_NR_ZONES; zid++)
 961                total += mem_cgroup_zone_nr_lru_pages(memcg,
 962                                                nid, zid, lru_mask);
 963
 964        return total;
 965}
 966
 967static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 968                        unsigned int lru_mask)
 969{
 970        int nid;
 971        u64 total = 0;
 972
 973        for_each_node_state(nid, N_MEMORY)
 974                total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 975        return total;
 976}
 977
 978static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 979                                       enum mem_cgroup_events_target target)
 980{
 981        unsigned long val, next;
 982
 983        val = __this_cpu_read(memcg->stat->nr_page_events);
 984        next = __this_cpu_read(memcg->stat->targets[target]);
 985        /* from time_after() in jiffies.h */
 986        if ((long)next - (long)val < 0) {
 987                switch (target) {
 988                case MEM_CGROUP_TARGET_THRESH:
 989                        next = val + THRESHOLDS_EVENTS_TARGET;
 990                        break;
 991                case MEM_CGROUP_TARGET_SOFTLIMIT:
 992                        next = val + SOFTLIMIT_EVENTS_TARGET;
 993                        break;
 994                case MEM_CGROUP_TARGET_NUMAINFO:
 995                        next = val + NUMAINFO_EVENTS_TARGET;
 996                        break;
 997                default:
 998                        break;
 999                }
1000                __this_cpu_write(memcg->stat->targets[target], next);
1001                return true;
1002        }
1003        return false;
1004}
1005
1006/*
1007 * Check events in order.
1008 *
1009 */
1010static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1011{
1012        preempt_disable();
1013        /* threshold event is triggered in finer grain than soft limit */
1014        if (unlikely(mem_cgroup_event_ratelimit(memcg,
1015                                                MEM_CGROUP_TARGET_THRESH))) {
1016                bool do_softlimit;
1017                bool do_numainfo __maybe_unused;
1018
1019                do_softlimit = mem_cgroup_event_ratelimit(memcg,
1020                                                MEM_CGROUP_TARGET_SOFTLIMIT);
1021#if MAX_NUMNODES > 1
1022                do_numainfo = mem_cgroup_event_ratelimit(memcg,
1023                                                MEM_CGROUP_TARGET_NUMAINFO);
1024#endif
1025                preempt_enable();
1026
1027                mem_cgroup_threshold(memcg);
1028                if (unlikely(do_softlimit))
1029                        mem_cgroup_update_tree(memcg, page);
1030#if MAX_NUMNODES > 1
1031                if (unlikely(do_numainfo))
1032                        atomic_inc(&memcg->numainfo_events);
1033#endif
1034        } else
1035                preempt_enable();
1036}
1037
1038struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
1039{
1040        return mem_cgroup_from_css(
1041                cgroup_subsys_state(cont, mem_cgroup_subsys_id));
1042}
1043
1044struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1045{
1046        /*
1047         * mm_update_next_owner() may clear mm->owner to NULL
1048         * if it races with swapoff, page migration, etc.
1049         * So this can be called with p == NULL.
1050         */
1051        if (unlikely(!p))
1052                return NULL;
1053
1054        return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
1055}
1056
1057struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1058{
1059        struct mem_cgroup *memcg = NULL;
1060
1061        if (!mm)
1062                return NULL;
1063        /*
 1064         * Because we have no locks, mm->owner may be being moved to another
1065         * cgroup. We use css_tryget() here even if this looks
1066         * pessimistic (rather than adding locks here).
1067         */
1068        rcu_read_lock();
1069        do {
1070                memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1071                if (unlikely(!memcg))
1072                        break;
1073        } while (!css_tryget(&memcg->css));
1074        rcu_read_unlock();
1075        return memcg;
1076}
1077
1078/*
 1079 * Returns the next (in a pre-order walk) alive memcg (with elevated css
1080 * ref. count) or NULL if the whole root's subtree has been visited.
1081 *
1082 * helper function to be used by mem_cgroup_iter
1083 */
1084static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1085                struct mem_cgroup *last_visited)
1086{
1087        struct cgroup *prev_cgroup, *next_cgroup;
1088
1089        /*
1090         * Root is not visited by cgroup iterators so it needs an
1091         * explicit visit.
1092         */
1093        if (!last_visited)
1094                return root;
1095
1096        prev_cgroup = (last_visited == root) ? NULL
1097                : last_visited->css.cgroup;
1098skip_node:
1099        next_cgroup = cgroup_next_descendant_pre(
1100                        prev_cgroup, root->css.cgroup);
1101
1102        /*
1103         * Even if we found a group we have to make sure it is
1104         * alive. css && !memcg means that the groups should be
1105         * skipped and we should continue the tree walk.
1106         * last_visited css is safe to use because it is
1107         * protected by css_get and the tree walk is rcu safe.
1108         */
1109        if (next_cgroup) {
1110                struct mem_cgroup *mem = mem_cgroup_from_cont(
1111                                next_cgroup);
1112                if (css_tryget(&mem->css))
1113                        return mem;
1114                else {
1115                        prev_cgroup = next_cgroup;
1116                        goto skip_node;
1117                }
1118        }
1119
1120        return NULL;
1121}
1122
1123static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1124{
1125        /*
1126         * When a group in the hierarchy below root is destroyed, the
1127         * hierarchy iterator can no longer be trusted since it might
1128         * have pointed to the destroyed group.  Invalidate it.
1129         */
1130        atomic_inc(&root->dead_count);
1131}
1132
1133static struct mem_cgroup *
1134mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1135                     struct mem_cgroup *root,
1136                     int *sequence)
1137{
1138        struct mem_cgroup *position = NULL;
1139        /*
1140         * A cgroup destruction happens in two stages: offlining and
 1141         * release.  They are separated by an RCU grace period.
1142         *
1143         * If the iterator is valid, we may still race with an
1144         * offlining.  The RCU lock ensures the object won't be
1145         * released, tryget will fail if we lost the race.
1146         */
1147        *sequence = atomic_read(&root->dead_count);
1148        if (iter->last_dead_count == *sequence) {
1149                smp_rmb();
1150                position = iter->last_visited;
1151                if (position && !css_tryget(&position->css))
1152                        position = NULL;
1153        }
1154        return position;
1155}
1156
1157static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1158                                   struct mem_cgroup *last_visited,
1159                                   struct mem_cgroup *new_position,
1160                                   int sequence)
1161{
1162        if (last_visited)
1163                css_put(&last_visited->css);
1164        /*
1165         * We store the sequence count from the time @last_visited was
1166         * loaded successfully instead of rereading it here so that we
1167         * don't lose destruction events in between.  We could have
1168         * raced with the destruction of @new_position after all.
1169         */
1170        iter->last_visited = new_position;
1171        smp_wmb();
1172        iter->last_dead_count = sequence;
1173}
1174
1175/**
1176 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1177 * @root: hierarchy root
1178 * @prev: previously returned memcg, NULL on first invocation
1179 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1180 *
1181 * Returns references to children of the hierarchy below @root, or
1182 * @root itself, or %NULL after a full round-trip.
1183 *
1184 * Caller must pass the return value in @prev on subsequent
1185 * invocations for reference counting, or use mem_cgroup_iter_break()
1186 * to cancel a hierarchy walk before the round-trip is complete.
1187 *
1188 * Reclaimers can specify a zone and a priority level in @reclaim to
1189 * divide up the memcgs in the hierarchy among all concurrent
1190 * reclaimers operating on the same zone and priority.
1191 */
1192struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1193                                   struct mem_cgroup *prev,
1194                                   struct mem_cgroup_reclaim_cookie *reclaim)
1195{
1196        struct mem_cgroup *memcg = NULL;
1197        struct mem_cgroup *last_visited = NULL;
1198
1199        if (mem_cgroup_disabled())
1200                return NULL;
1201
1202        if (!root)
1203                root = root_mem_cgroup;
1204
1205        if (prev && !reclaim)
1206                last_visited = prev;
1207
1208        if (!root->use_hierarchy && root != root_mem_cgroup) {
1209                if (prev)
1210                        goto out_css_put;
1211                return root;
1212        }
1213
1214        rcu_read_lock();
1215        while (!memcg) {
1216                struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1217                int uninitialized_var(seq);
1218
1219                if (reclaim) {
1220                        int nid = zone_to_nid(reclaim->zone);
1221                        int zid = zone_idx(reclaim->zone);
1222                        struct mem_cgroup_per_zone *mz;
1223
1224                        mz = mem_cgroup_zoneinfo(root, nid, zid);
1225                        iter = &mz->reclaim_iter[reclaim->priority];
1226                        if (prev && reclaim->generation != iter->generation) {
1227                                iter->last_visited = NULL;
1228                                goto out_unlock;
1229                        }
1230
1231                        last_visited = mem_cgroup_iter_load(iter, root, &seq);
1232                }
1233
1234                memcg = __mem_cgroup_iter_next(root, last_visited);
1235
1236                if (reclaim) {
1237                        mem_cgroup_iter_update(iter, last_visited, memcg, seq);
1238
1239                        if (!memcg)
1240                                iter->generation++;
1241                        else if (!prev && memcg)
1242                                reclaim->generation = iter->generation;
1243                }
1244
1245                if (prev && !memcg)
1246                        goto out_unlock;
1247        }
1248out_unlock:
1249        rcu_read_unlock();
1250out_css_put:
1251        if (prev && prev != root)
1252                css_put(&prev->css);
1253
1254        return memcg;
1255}
1256
1257/**
1258 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1259 * @root: hierarchy root
1260 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1261 */
1262void mem_cgroup_iter_break(struct mem_cgroup *root,
1263                           struct mem_cgroup *prev)
1264{
1265        if (!root)
1266                root = root_mem_cgroup;
1267        if (prev && prev != root)
1268                css_put(&prev->css);
1269}
1270
1271/*
1272 * Iteration constructs for visiting all cgroups (under a tree).  If
1273 * loops are exited prematurely (break), mem_cgroup_iter_break() must
1274 * be used for reference counting.
1275 */
1276#define for_each_mem_cgroup_tree(iter, root)            \
1277        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
1278             iter != NULL;                              \
1279             iter = mem_cgroup_iter(root, iter, NULL))
1280
1281#define for_each_mem_cgroup(iter)                       \
1282        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
1283             iter != NULL;                              \
1284             iter = mem_cgroup_iter(NULL, iter, NULL))
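
/*
 * Illustrative usage sketch (should_stop() is a hypothetical predicate):
 * a walk that exits the loop early must drop its reference through
 * mem_cgroup_iter_break(), per the mem_cgroup_iter() contract above:
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */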
1285
1286void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1287{
1288        struct mem_cgroup *memcg;
1289
1290        rcu_read_lock();
1291        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1292        if (unlikely(!memcg))
1293                goto out;
1294
1295        switch (idx) {
1296        case PGFAULT:
1297                this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1298                break;
1299        case PGMAJFAULT:
1300                this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1301                break;
1302        default:
1303                BUG();
1304        }
1305out:
1306        rcu_read_unlock();
1307}
1308EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1309
1310/**
1311 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1312 * @zone: zone of the wanted lruvec
1313 * @memcg: memcg of the wanted lruvec
1314 *
1315 * Returns the lru list vector holding pages for the given @zone and
 1316 * @memcg.  This can be the global zone lruvec, if the memory controller
1317 * is disabled.
1318 */
1319struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1320                                      struct mem_cgroup *memcg)
1321{
1322        struct mem_cgroup_per_zone *mz;
1323        struct lruvec *lruvec;
1324
1325        if (mem_cgroup_disabled()) {
1326                lruvec = &zone->lruvec;
1327                goto out;
1328        }
1329
1330        mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1331        lruvec = &mz->lruvec;
1332out:
1333        /*
1334         * Since a node can be onlined after the mem_cgroup was created,
1335         * we have to be prepared to initialize lruvec->zone here;
1336         * and if offlined then reonlined, we need to reinitialize it.
1337         */
1338        if (unlikely(lruvec->zone != zone))
1339                lruvec->zone = zone;
1340        return lruvec;
1341}
1342
1343/*
 1344 * The following LRU functions are allowed to be used without PCG_LOCK.
 1345 * Operations are called by the global LRU routines independently of memcg.
 1346 * What we have to take care of here is the validity of pc->mem_cgroup.
 1347 *
 1348 * Changes to pc->mem_cgroup happen when
1349 * 1. charge
1350 * 2. moving account
 1351 * In the typical case, "charge" is done before add-to-lru. The exception is
 1352 * SwapCache, which is added to the LRU before being charged.
1353 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1354 * When moving account, the page is not on LRU. It's isolated.
1355 */
1356
1357/**
1358 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1359 * @page: the page
1360 * @zone: zone of the page
1361 */
1362struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1363{
1364        struct mem_cgroup_per_zone *mz;
1365        struct mem_cgroup *memcg;
1366        struct page_cgroup *pc;
1367        struct lruvec *lruvec;
1368
1369        if (mem_cgroup_disabled()) {
1370                lruvec = &zone->lruvec;
1371                goto out;
1372        }
1373
1374        pc = lookup_page_cgroup(page);
1375        memcg = pc->mem_cgroup;
1376
1377        /*
1378         * Surreptitiously switch any uncharged offlist page to root:
1379         * an uncharged page off lru does nothing to secure
1380         * its former mem_cgroup from sudden removal.
1381         *
1382         * Our caller holds lru_lock, and PageCgroupUsed is updated
1383         * under page_cgroup lock: between them, they make all uses
1384         * of pc->mem_cgroup safe.
1385         */
1386        if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1387                pc->mem_cgroup = memcg = root_mem_cgroup;
1388
1389        mz = page_cgroup_zoneinfo(memcg, page);
1390        lruvec = &mz->lruvec;
1391out:
1392        /*
1393         * Since a node can be onlined after the mem_cgroup was created,
1394         * we have to be prepared to initialize lruvec->zone here;
1395         * and if offlined then reonlined, we need to reinitialize it.
1396         */
1397        if (unlikely(lruvec->zone != zone))
1398                lruvec->zone = zone;
1399        return lruvec;
1400}
1401
1402/**
1403 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1404 * @lruvec: mem_cgroup per zone lru vector
1405 * @lru: index of lru list the page is sitting on
1406 * @nr_pages: positive when adding or negative when removing
1407 *
1408 * This function must be called when a page is added to or removed from an
1409 * lru list.
1410 */
1411void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1412                                int nr_pages)
1413{
1414        struct mem_cgroup_per_zone *mz;
1415        unsigned long *lru_size;
1416
1417        if (mem_cgroup_disabled())
1418                return;
1419
1420        mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1421        lru_size = mz->lru_size + lru;
1422        *lru_size += nr_pages;
1423        VM_BUG_ON((long)(*lru_size) < 0);
1424}
1425
1426/*
 1427 * Checks whether the given memcg is the same as root_memcg or lies within
 1428 * root_memcg's hierarchy subtree.
1429 */
1430bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1431                                  struct mem_cgroup *memcg)
1432{
1433        if (root_memcg == memcg)
1434                return true;
1435        if (!root_memcg->use_hierarchy || !memcg)
1436                return false;
1437        return css_is_ancestor(&memcg->css, &root_memcg->css);
1438}
1439
1440static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1441                                       struct mem_cgroup *memcg)
1442{
1443        bool ret;
1444
1445        rcu_read_lock();
1446        ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1447        rcu_read_unlock();
1448        return ret;
1449}
1450
1451bool task_in_mem_cgroup(struct task_struct *task,
1452                        const struct mem_cgroup *memcg)
1453{
1454        struct mem_cgroup *curr = NULL;
1455        struct task_struct *p;
1456        bool ret;
1457
1458        p = find_lock_task_mm(task);
1459        if (p) {
1460                curr = try_get_mem_cgroup_from_mm(p->mm);
1461                task_unlock(p);
1462        } else {
1463                /*
1464                 * All threads may have already detached their mm's, but the oom
1465                 * killer still needs to detect if they have already been oom
1466                 * killed to prevent needlessly killing additional tasks.
1467                 */
1468                rcu_read_lock();
1469                curr = mem_cgroup_from_task(task);
1470                if (curr)
1471                        css_get(&curr->css);
1472                rcu_read_unlock();
1473        }
1474        if (!curr)
1475                return false;
1476        /*
 1477         * We should check use_hierarchy of "memcg", not "curr", because checking
 1478         * use_hierarchy of "curr" here would make this function return true if
 1479         * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
 1480         * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1481         */
1482        ret = mem_cgroup_same_or_subtree(memcg, curr);
1483        css_put(&curr->css);
1484        return ret;
1485}
1486
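/*
 * Worked example (illustrative) for the ratio computed below: with 4GB of
 * anonymous pages on the LRUs, gb = 4 and inactive_ratio = int_sqrt(40) = 6,
 * so the inactive anon list is reported as low once
 * inactive * 6 < active.
 */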
1487int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1488{
1489        unsigned long inactive_ratio;
1490        unsigned long inactive;
1491        unsigned long active;
1492        unsigned long gb;
1493
1494        inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1495        active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1496
1497        gb = (inactive + active) >> (30 - PAGE_SHIFT);
1498        if (gb)
1499                inactive_ratio = int_sqrt(10 * gb);
1500        else
1501                inactive_ratio = 1;
1502
1503        return inactive * inactive_ratio < active;
1504}
1505
1506#define mem_cgroup_from_res_counter(counter, member)    \
1507        container_of(counter, struct mem_cgroup, member)
1508
1509/**
1510 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1511 * @memcg: the memory cgroup
1512 *
 1513 * Returns the maximum amount of memory @memcg can be charged with, in
1514 * pages.
1515 */
1516static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1517{
1518        unsigned long long margin;
1519
1520        margin = res_counter_margin(&memcg->res);
1521        if (do_swap_account)
1522                margin = min(margin, res_counter_margin(&memcg->memsw));
1523        return margin >> PAGE_SHIFT;
1524}
1525
1526int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1527{
1528        struct cgroup *cgrp = memcg->css.cgroup;
1529
1530        /* root ? */
1531        if (cgrp->parent == NULL)
1532                return vm_swappiness;
1533
1534        return memcg->swappiness;
1535}
1536
1537/*
 1538 * memcg->moving_account is used for checking the possibility that some thread is
1539 * calling move_account(). When a thread on CPU-A starts moving pages under
1540 * a memcg, other threads should check memcg->moving_account under
1541 * rcu_read_lock(), like this:
1542 *
1543 *         CPU-A                                    CPU-B
1544 *                                              rcu_read_lock()
 1545 *         memcg->moving_account+1              if (memcg->moving_account)
1546 *                                                   take heavy locks.
1547 *         synchronize_rcu()                    update something.
1548 *                                              rcu_read_unlock()
1549 *         start move here.
1550 */
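/*
 * Writer-side sketch of the protocol above (illustrative only; in this file
 * the charge-moving path is expected to bracket its updates with the helpers
 * below):
 *
 *         mem_cgroup_start_move(memcg);   -> moving_account++, synchronize_rcu()
 *         ... move pages and their accounting to the target memcg ...
 *         mem_cgroup_end_move(memcg);     -> moving_account--, NULL is tolerated
 */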
1551
1552/* for quick checking without looking up memcg */
1553atomic_t memcg_moving __read_mostly;
1554
1555static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1556{
1557        atomic_inc(&memcg_moving);
1558        atomic_inc(&memcg->moving_account);
1559        synchronize_rcu();
1560}
1561
1562static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1563{
1564        /*
1565         * Now, mem_cgroup_clear_mc() may call this function with NULL.
1566         * We check NULL in callee rather than caller.
1567         */
1568        if (memcg) {
1569                atomic_dec(&memcg_moving);
1570                atomic_dec(&memcg->moving_account);
1571        }
1572}
1573
1574/*
1575 * Two routines for checking whether "mem" is under move_account() or not.
1576 *
1577 * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
1578 *                        is used for avoiding races in accounting.  If true,
1579 *                        pc->mem_cgroup may be overwritten.
1580 *
1581 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
1582 *                        under the hierarchy of moving cgroups. This is for
1583 *                        waiting at high memory pressure caused by "move".
1584 */
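/*
 * Reader-side sketch of mem_cgroup_stolen() usage (illustrative only;
 * __mem_cgroup_begin_update_page_stat() below is the in-file user of this
 * pattern):
 *
 *         bool locked = false;
 *         unsigned long flags;
 *
 *         rcu_read_lock();
 *         if (mem_cgroup_stolen(memcg)) {
 *                 move_lock_mem_cgroup(memcg, &flags);
 *                 locked = true;
 *         }
 *         ... update pc->flags / per-memcg page statistics ...
 *         if (locked)
 *                 move_unlock_mem_cgroup(memcg, &flags);
 *         rcu_read_unlock();
 */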
1585
1586static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1587{
1588        VM_BUG_ON(!rcu_read_lock_held());
1589        return atomic_read(&memcg->moving_account) > 0;
1590}
1591
1592static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1593{
1594        struct mem_cgroup *from;
1595        struct mem_cgroup *to;
1596        bool ret = false;
1597        /*
1598         * Unlike task_move routines, we access mc.to, mc.from not under
1599         * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1600         */
1601        spin_lock(&mc.lock);
1602        from = mc.from;
1603        to = mc.to;
1604        if (!from)
1605                goto unlock;
1606
1607        ret = mem_cgroup_same_or_subtree(memcg, from)
1608                || mem_cgroup_same_or_subtree(memcg, to);
1609unlock:
1610        spin_unlock(&mc.lock);
1611        return ret;
1612}
1613
1614static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1615{
1616        if (mc.moving_task && current != mc.moving_task) {
1617                if (mem_cgroup_under_move(memcg)) {
1618                        DEFINE_WAIT(wait);
1619                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1620                        /* moving charge context might have finished. */
1621                        if (mc.moving_task)
1622                                schedule();
1623                        finish_wait(&mc.waitq, &wait);
1624                        return true;
1625                }
1626        }
1627        return false;
1628}
1629
1630/*
1631 * Take this lock when
1632 * - code tries to modify a page's memcg while it's USED.
1633 * - code tries to modify page state accounting in a memcg.
1634 * See mem_cgroup_stolen(), too.
1635 */
1636static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1637                                  unsigned long *flags)
1638{
1639        spin_lock_irqsave(&memcg->move_lock, *flags);
1640}
1641
1642static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1643                                unsigned long *flags)
1644{
1645        spin_unlock_irqrestore(&memcg->move_lock, *flags);
1646}
1647
1648#define K(x) ((x) << (PAGE_SHIFT-10))
1649/**
1650 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1651 * @memcg: The memory cgroup that went over limit
1652 * @p: Task that is going to be killed
1653 *
1654 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1655 * enabled
1656 */
1657void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1658{
1659        struct cgroup *task_cgrp;
1660        struct cgroup *mem_cgrp;
1661        /*
1662         * Need a buffer in BSS, can't rely on allocations. The code relies
1663         * on the assumption that OOM is serialized for memory controller.
1664         * If this assumption is broken, revisit this code.
1665         */
1666        static char memcg_name[PATH_MAX];
1667        int ret;
1668        struct mem_cgroup *iter;
1669        unsigned int i;
1670
1671        if (!p)
1672                return;
1673
1674        rcu_read_lock();
1675
1676        mem_cgrp = memcg->css.cgroup;
1677        task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1678
1679        ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1680        if (ret < 0) {
1681                /*
1682                 * Unfortunately, we are unable to convert to a useful name,
1683                 * but we'll still print out the usage information.
1684                 */
1685                rcu_read_unlock();
1686                goto done;
1687        }
1688        rcu_read_unlock();
1689
1690        pr_info("Task in %s killed", memcg_name);
1691
1692        rcu_read_lock();
1693        ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1694        if (ret < 0) {
1695                rcu_read_unlock();
1696                goto done;
1697        }
1698        rcu_read_unlock();
1699
1700        /*
1701         * Continues from above, so we don't need a KERN_ level.
1702         */
1703        pr_cont(" as a result of limit of %s\n", memcg_name);
1704done:
1705
1706        pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1707                res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1708                res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1709                res_counter_read_u64(&memcg->res, RES_FAILCNT));
1710        pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1711                res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1712                res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1713                res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1714        pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1715                res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1716                res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1717                res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1718
1719        for_each_mem_cgroup_tree(iter, memcg) {
1720                pr_info("Memory cgroup stats");
1721
1722                rcu_read_lock();
1723                ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1724                if (!ret)
1725                        pr_cont(" for %s", memcg_name);
1726                rcu_read_unlock();
1727                pr_cont(":");
1728
1729                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1730                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1731                                continue;
1732                        pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1733                                K(mem_cgroup_read_stat(iter, i)));
1734                }
1735
1736                for (i = 0; i < NR_LRU_LISTS; i++)
1737                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1738                                K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1739
1740                pr_cont("\n");
1741        }
1742}
1743
1744/*
1745 * This function returns the number of memcgs under the hierarchy tree. Returns
1746 * 1 (self count) if there are no children.
1747 */
1748static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1749{
1750        int num = 0;
1751        struct mem_cgroup *iter;
1752
1753        for_each_mem_cgroup_tree(iter, memcg)
1754                num++;
1755        return num;
1756}
1757
1758/*
1759 * Return the memory (and swap, if configured) limit for a memcg.
1760 */
1761static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1762{
1763        u64 limit;
1764
1765        limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1766
1767        /*
1768         * Do not consider swap space if we cannot swap due to swappiness
1769         */
1770        if (mem_cgroup_swappiness(memcg)) {
1771                u64 memsw;
1772
1773                limit += total_swap_pages << PAGE_SHIFT;
1774                memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1775
1776                /*
1777                 * If memsw is finite and limits the amount of swap space
1778                 * available to this memcg, return that limit.
1779                 */
1780                limit = min(limit, memsw);
1781        }
1782
1783        return limit;
1784}
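/*
 * Worked example for mem_cgroup_get_limit() above (illustrative numbers only):
 * with memory.limit = 1GB, 2GB of total swap and a non-zero swappiness, the
 * candidate limit is 1GB + 2GB = 3GB; if memsw.limit is 2GB, the tighter 2GB
 * is returned.  With swappiness == 0, the plain 1GB memory limit is returned.
 */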
1785
1786static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1787                                     int order)
1788{
1789        struct mem_cgroup *iter;
1790        unsigned long chosen_points = 0;
1791        unsigned long totalpages;
1792        unsigned int points = 0;
1793        struct task_struct *chosen = NULL;
1794
1795        /*
1796         * If current has a pending SIGKILL or is exiting, then automatically
1797         * select it.  The goal is to allow it to allocate so that it may
1798         * quickly exit and free its memory.
1799         */
1800        if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1801                set_thread_flag(TIF_MEMDIE);
1802                return;
1803        }
1804
1805        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1806        totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1807        for_each_mem_cgroup_tree(iter, memcg) {
1808                struct cgroup *cgroup = iter->css.cgroup;
1809                struct cgroup_iter it;
1810                struct task_struct *task;
1811
1812                cgroup_iter_start(cgroup, &it);
1813                while ((task = cgroup_iter_next(cgroup, &it))) {
1814                        switch (oom_scan_process_thread(task, totalpages, NULL,
1815                                                        false)) {
1816                        case OOM_SCAN_SELECT:
1817                                if (chosen)
1818                                        put_task_struct(chosen);
1819                                chosen = task;
1820                                chosen_points = ULONG_MAX;
1821                                get_task_struct(chosen);
1822                                /* fall through */
1823                        case OOM_SCAN_CONTINUE:
1824                                continue;
1825                        case OOM_SCAN_ABORT:
1826                                cgroup_iter_end(cgroup, &it);
1827                                mem_cgroup_iter_break(memcg, iter);
1828                                if (chosen)
1829                                        put_task_struct(chosen);
1830                                return;
1831                        case OOM_SCAN_OK:
1832                                break;
1833                        };
1834                        points = oom_badness(task, memcg, NULL, totalpages);
1835                        if (points > chosen_points) {
1836                                if (chosen)
1837                                        put_task_struct(chosen);
1838                                chosen = task;
1839                                chosen_points = points;
1840                                get_task_struct(chosen);
1841                        }
1842                }
1843                cgroup_iter_end(cgroup, &it);
1844        }
1845
1846        if (!chosen)
1847                return;
1848        points = chosen_points * 1000 / totalpages;
1849        oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1850                         NULL, "Memory cgroup out of memory");
1851}
1852
1853static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1854                                        gfp_t gfp_mask,
1855                                        unsigned long flags)
1856{
1857        unsigned long total = 0;
1858        bool noswap = false;
1859        int loop;
1860
1861        if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1862                noswap = true;
1863        if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1864                noswap = true;
1865
1866        for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1867                if (loop)
1868                        drain_all_stock_async(memcg);
1869                total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1870                /*
1871                 * Allow limit shrinkers, which are triggered directly
1872                 * by userspace, to catch signals and stop reclaim
1873                 * after minimal progress, regardless of the margin.
1874                 */
1875                if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1876                        break;
1877                if (mem_cgroup_margin(memcg))
1878                        break;
1879                /*
1880                 * If nothing was reclaimed after two attempts, there
1881                 * may be no reclaimable pages in this hierarchy.
1882                 */
1883                if (loop && !total)
1884                        break;
1885        }
1886        return total;
1887}
1888
1889/**
1890 * test_mem_cgroup_node_reclaimable
1891 * @memcg: the target memcg
1892 * @nid: the node ID to be checked.
1893 * @noswap: specify true here if the user wants file-only information.
1894 *
1895 * This function returns whether the specified memcg contains any
1896 * reclaimable pages on a node. Returns true if there are any reclaimable
1897 * pages in the node.
1898 */
1899static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1900                int nid, bool noswap)
1901{
1902        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1903                return true;
1904        if (noswap || !total_swap_pages)
1905                return false;
1906        if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1907                return true;
1908        return false;
1909
1910}
1911#if MAX_NUMNODES > 1
1912
1913/*
1914 * Always updating the nodemask is not very good - even if we have an empty
1915 * list or the wrong list here, we can start from some node and traverse all
1916 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1917 *
1918 */
1919static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1920{
1921        int nid;
1922        /*
1923         * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1924         * pagein/pageout changes since the last update.
1925         */
1926        if (!atomic_read(&memcg->numainfo_events))
1927                return;
1928        if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1929                return;
1930
1931        /* make a nodemask where this memcg uses memory from */
1932        memcg->scan_nodes = node_states[N_MEMORY];
1933
1934        for_each_node_mask(nid, node_states[N_MEMORY]) {
1935
1936                if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1937                        node_clear(nid, memcg->scan_nodes);
1938        }
1939
1940        atomic_set(&memcg->numainfo_events, 0);
1941        atomic_set(&memcg->numainfo_updating, 0);
1942}
1943
1944/*
1945 * Selecting a node where we start reclaim from. Because what we need is just
1946 * reducing the usage counter, starting from anywhere is OK. Considering
1947 * memory reclaim from the current node, there are pros and cons.
1948 *
1949 * Freeing memory from the current node means freeing memory from a node which
1950 * we'll use or have used, so it may degrade the LRU. And if several threads
1951 * hit their limits, they will see contention on a node. But freeing from a
1952 * remote node means higher memory-reclaim costs because of memory latency.
1953 *
1954 * Now, we use round-robin. A better algorithm is welcome.
1955 */
1956int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1957{
1958        int node;
1959
1960        mem_cgroup_may_update_nodemask(memcg);
1961        node = memcg->last_scanned_node;
1962
1963        node = next_node(node, memcg->scan_nodes);
1964        if (node == MAX_NUMNODES)
1965                node = first_node(memcg->scan_nodes);
1966        /*
1967         * We call this when we hit the limit, not when pages are added to the LRU.
1968         * No LRU may hold pages because all pages are UNEVICTABLE or the
1969         * memcg is too small and all pages are not on the LRU. In that case,
1970         * we use the current node.
1971         */
1972        if (unlikely(node == MAX_NUMNODES))
1973                node = numa_node_id();
1974
1975        memcg->last_scanned_node = node;
1976        return node;
1977}
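/*
 * Round-robin example for mem_cgroup_select_victim_node() above (illustrative
 * only): with scan_nodes = {0, 2} and last_scanned_node == 0, next_node()
 * yields 2; on the following call next_node(2, ...) returns MAX_NUMNODES, so
 * we wrap around via first_node() back to 0.  If scan_nodes is empty, we fall
 * back to numa_node_id().
 */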
1978
1979/*
1980 * Check all nodes for whether they contain reclaimable pages or not.
1981 * For quick scan, we make use of scan_nodes. This will allow us to skip
1982 * unused nodes. But scan_nodes is lazily updated and may not contain
1983 * enough new information. We need to do a double check.
1984 */
1985static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1986{
1987        int nid;
1988
1989        /*
1990         * quick check...making use of scan_nodes.
1991         * We can skip unused nodes.
1992         */
1993        if (!nodes_empty(memcg->scan_nodes)) {
1994                for (nid = first_node(memcg->scan_nodes);
1995                     nid < MAX_NUMNODES;
1996                     nid = next_node(nid, memcg->scan_nodes)) {
1997
1998                        if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1999                                return true;
2000                }
2001        }
2002        /*
2003         * Check rest of nodes.
2004         */
2005        for_each_node_state(nid, N_MEMORY) {
2006                if (node_isset(nid, memcg->scan_nodes))
2007                        continue;
2008                if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2009                        return true;
2010        }
2011        return false;
2012}
2013
2014#else
2015int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2016{
2017        return 0;
2018}
2019
2020static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2021{
2022        return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2023}
2024#endif
2025
2026static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2027                                   struct zone *zone,
2028                                   gfp_t gfp_mask,
2029                                   unsigned long *total_scanned)
2030{
2031        struct mem_cgroup *victim = NULL;
2032        int total = 0;
2033        int loop = 0;
2034        unsigned long excess;
2035        unsigned long nr_scanned;
2036        struct mem_cgroup_reclaim_cookie reclaim = {
2037                .zone = zone,
2038                .priority = 0,
2039        };
2040
2041        excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2042
2043        while (1) {
2044                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2045                if (!victim) {
2046                        loop++;
2047                        if (loop >= 2) {
2048                                /*
2049                                 * If we have not been able to reclaim
2050                                 * anything, it might be because there are
2051                                 * no reclaimable pages under this hierarchy.
2052                                 */
2053                                if (!total)
2054                                        break;
2055                                /*
2056                                 * We want to do more targeted reclaim.
2057                                 * excess >> 2 is not too excessive, so we don't
2058                                 * reclaim too much, nor too little, so we don't keep
2059                                 * coming back to reclaim from this cgroup.
2060                                 */
2061                                if (total >= (excess >> 2) ||
2062                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2063                                        break;
2064                        }
2065                        continue;
2066                }
2067                if (!mem_cgroup_reclaimable(victim, false))
2068                        continue;
2069                total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2070                                                     zone, &nr_scanned);
2071                *total_scanned += nr_scanned;
2072                if (!res_counter_soft_limit_excess(&root_memcg->res))
2073                        break;
2074        }
2075        mem_cgroup_iter_break(root_memcg, victim);
2076        return total;
2077}
2078
2079/*
2080 * Check whether the OOM killer is already running under our hierarchy.
2081 * If someone is running it, return false.
2082 * Has to be called with memcg_oom_lock held.
2083 */
2084static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
2085{
2086        struct mem_cgroup *iter, *failed = NULL;
2087
2088        for_each_mem_cgroup_tree(iter, memcg) {
2089                if (iter->oom_lock) {
2090                        /*
2091                         * this subtree of our hierarchy is already locked,
2092                         * so we cannot take the lock.
2093                         */
2094                        failed = iter;
2095                        mem_cgroup_iter_break(memcg, iter);
2096                        break;
2097                } else
2098                        iter->oom_lock = true;
2099        }
2100
2101        if (!failed)
2102                return true;
2103
2104        /*
2105         * OK, we failed to lock the whole subtree, so we have to clean up
2106         * what we set up, up to the failing subtree.
2107         */
2108        for_each_mem_cgroup_tree(iter, memcg) {
2109                if (iter == failed) {
2110                        mem_cgroup_iter_break(memcg, iter);
2111                        break;
2112                }
2113                iter->oom_lock = false;
2114        }
2115        return false;
2116}
2117
2118/*
2119 * Has to be called with memcg_oom_lock held.
2120 */
2121static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2122{
2123        struct mem_cgroup *iter;
2124
2125        for_each_mem_cgroup_tree(iter, memcg)
2126                iter->oom_lock = false;
2127        return 0;
2128}
2129
2130static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2131{
2132        struct mem_cgroup *iter;
2133
2134        for_each_mem_cgroup_tree(iter, memcg)
2135                atomic_inc(&iter->under_oom);
2136}
2137
2138static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2139{
2140        struct mem_cgroup *iter;
2141
2142        /*
2143         * When a new child is created while the hierarchy is under oom,
2144         * mem_cgroup_oom_lock() may not be called. We have to use
2145         * atomic_add_unless() here.
2146         */
2147        for_each_mem_cgroup_tree(iter, memcg)
2148                atomic_add_unless(&iter->under_oom, -1, 0);
2149}
2150
2151static DEFINE_SPINLOCK(memcg_oom_lock);
2152static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2153
2154struct oom_wait_info {
2155        struct mem_cgroup *memcg;
2156        wait_queue_t    wait;
2157};
2158
2159static int memcg_oom_wake_function(wait_queue_t *wait,
2160        unsigned mode, int sync, void *arg)
2161{
2162        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2163        struct mem_cgroup *oom_wait_memcg;
2164        struct oom_wait_info *oom_wait_info;
2165
2166        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2167        oom_wait_memcg = oom_wait_info->memcg;
2168
2169        /*
2170         * Both oom_wait_info->memcg and wake_memcg are stable under us,
2171         * so we can use css_is_ancestor() without worrying about RCU.
2172         */
2173        if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2174                && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2175                return 0;
2176        return autoremove_wake_function(wait, mode, sync, arg);
2177}
2178
2179static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2180{
2181        /* for filtering, pass "memcg" as argument. */
2182        __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2183}
2184
2185static void memcg_oom_recover(struct mem_cgroup *memcg)
2186{
2187        if (memcg && atomic_read(&memcg->under_oom))
2188                memcg_wakeup_oom(memcg);
2189}
2190
2191/*
2192 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
2193 */
2194static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
2195                                  int order)
2196{
2197        struct oom_wait_info owait;
2198        bool locked, need_to_kill;
2199
2200        owait.memcg = memcg;
2201        owait.wait.flags = 0;
2202        owait.wait.func = memcg_oom_wake_function;
2203        owait.wait.private = current;
2204        INIT_LIST_HEAD(&owait.wait.task_list);
2205        need_to_kill = true;
2206        mem_cgroup_mark_under_oom(memcg);
2207
2208        /* At first, try to OOM-lock the hierarchy under memcg. */
2209        spin_lock(&memcg_oom_lock);
2210        locked = mem_cgroup_oom_lock(memcg);
2211        /*
2212         * Even if signal_pending(), we can't quit charge() loop without
2213         * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
2214         * under OOM is always welcome, so use TASK_KILLABLE here.
2215         */
2216        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2217        if (!locked || memcg->oom_kill_disable)
2218                need_to_kill = false;
2219        if (locked)
2220                mem_cgroup_oom_notify(memcg);
2221        spin_unlock(&memcg_oom_lock);
2222
2223        if (need_to_kill) {
2224                finish_wait(&memcg_oom_waitq, &owait.wait);
2225                mem_cgroup_out_of_memory(memcg, mask, order);
2226        } else {
2227                schedule();
2228                finish_wait(&memcg_oom_waitq, &owait.wait);
2229        }
2230        spin_lock(&memcg_oom_lock);
2231        if (locked)
2232                mem_cgroup_oom_unlock(memcg);
2233        memcg_wakeup_oom(memcg);
2234        spin_unlock(&memcg_oom_lock);
2235
2236        mem_cgroup_unmark_under_oom(memcg);
2237
2238        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2239                return false;
2240        /* Give chance to dying process */
2241        schedule_timeout_uninterruptible(1);
2242        return true;
2243}
2244
2245/*
2246 * Currently used to update mapped file statistics, but the routine can be
2247 * generalized to update other statistics as well.
2248 *
2249 * Notes: Race condition
2250 *
2251 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
2252 * it tends to be costly. Considering some conditions, though, we don't need
2253 * to do so _always_.
2254 *
2255 * Considering "charge", lock_page_cgroup() is not required because all
2256 * file-stat operations happen after a page is attached to the radix-tree. There
2257 * is no race with "charge".
2258 *
2259 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
2260 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
2261 * if there is a race with "uncharge". The statistics themselves are properly
2262 * handled by flags.
2263 *
2264 * Considering "move", this is the only case where we see a race. To make the
2265 * race window small, we check memcg->moving_account and detect whether there
2266 * is a possibility of a race. If there is, we take a lock.
2267 */
2268
2269void __mem_cgroup_begin_update_page_stat(struct page *page,
2270                                bool *locked, unsigned long *flags)
2271{
2272        struct mem_cgroup *memcg;
2273        struct page_cgroup *pc;
2274
2275        pc = lookup_page_cgroup(page);
2276again:
2277        memcg = pc->mem_cgroup;
2278        if (unlikely(!memcg || !PageCgroupUsed(pc)))
2279                return;
2280        /*
2281         * If this memory cgroup is not under account moving, we don't
2282         * need to take move_lock_mem_cgroup(). Because we already hold
2283         * rcu_read_lock(), any calls to move_account will be delayed until
2284         * rcu_read_unlock() if mem_cgroup_stolen() == true.
2285         */
2286        if (!mem_cgroup_stolen(memcg))
2287                return;
2288
2289        move_lock_mem_cgroup(memcg, flags);
2290        if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2291                move_unlock_mem_cgroup(memcg, flags);
2292                goto again;
2293        }
2294        *locked = true;
2295}
2296
2297void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2298{
2299        struct page_cgroup *pc = lookup_page_cgroup(page);
2300
2301        /*
2302         * It's guaranteed that pc->mem_cgroup never changes while
2303         * the lock is held, because a routine that modifies pc->mem_cgroup
2304         * should take move_lock_mem_cgroup().
2305         */
2306        move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2307}
2308
2309void mem_cgroup_update_page_stat(struct page *page,
2310                                 enum mem_cgroup_page_stat_item idx, int val)
2311{
2312        struct mem_cgroup *memcg;
2313        struct page_cgroup *pc = lookup_page_cgroup(page);
2314        unsigned long uninitialized_var(flags);
2315
2316        if (mem_cgroup_disabled())
2317                return;
2318
2319        memcg = pc->mem_cgroup;
2320        if (unlikely(!memcg || !PageCgroupUsed(pc)))
2321                return;
2322
2323        switch (idx) {
2324        case MEMCG_NR_FILE_MAPPED:
2325                idx = MEM_CGROUP_STAT_FILE_MAPPED;
2326                break;
2327        default:
2328                BUG();
2329        }
2330
2331        this_cpu_add(memcg->stat->count[idx], val);
2332}
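/*
 * Caller-side sketch of the three helpers above (illustrative only; actual
 * callers sit outside this file):
 *
 *         bool locked = false;
 *         unsigned long flags;
 *
 *         __mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *         mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
 *         if (locked)
 *                 __mem_cgroup_end_update_page_stat(page, &flags);
 *
 * The "end" call is only valid when *locked was set, since it unconditionally
 * drops the move_lock taken on the "begin" side.
 */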
2333
2334/*
2335 * size of first charge trial. "32" comes from vmscan.c's magic value.
2336 * TODO: it may be necessary to use bigger numbers on big iron.
2337 */
2338#define CHARGE_BATCH    32U
2339struct memcg_stock_pcp {
2340        struct mem_cgroup *cached; /* this is never the root cgroup */
2341        unsigned int nr_pages;
2342        struct work_struct work;
2343        unsigned long flags;
2344#define FLUSHING_CACHED_CHARGE  0
2345};
2346static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2347static DEFINE_MUTEX(percpu_charge_mutex);
2348
2349/**
2350 * consume_stock: Try to consume stocked charge on this cpu.
2351 * @memcg: memcg to consume from.
2352 * @nr_pages: how many pages to charge.
2353 *
2354 * The charges will only happen if @memcg matches the current cpu's memcg
2355 * stock, and at least @nr_pages are available in that stock.  Failure to
2356 * service an allocation will refill the stock.
2357 *
2358 * returns true if successful, false otherwise.
2359 */
2360static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2361{
2362        struct memcg_stock_pcp *stock;
2363        bool ret = true;
2364
2365        if (nr_pages > CHARGE_BATCH)
2366                return false;
2367
2368        stock = &get_cpu_var(memcg_stock);
2369        if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2370                stock->nr_pages -= nr_pages;
2371        else /* need to call res_counter_charge */
2372                ret = false;
2373        put_cpu_var(memcg_stock);
2374        return ret;
2375}
2376
2377/*
2378 * Returns stocks cached in percpu to the res_counter and resets cached information.
2379 */
2380static void drain_stock(struct memcg_stock_pcp *stock)
2381{
2382        struct mem_cgroup *old = stock->cached;
2383
2384        if (stock->nr_pages) {
2385                unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2386
2387                res_counter_uncharge(&old->res, bytes);
2388                if (do_swap_account)
2389                        res_counter_uncharge(&old->memsw, bytes);
2390                stock->nr_pages = 0;
2391        }
2392        stock->cached = NULL;
2393}
2394
2395/*
2396 * This must be called with preemption disabled or must be called by
2397 * a thread which is pinned to the local cpu.
2398 */
2399static void drain_local_stock(struct work_struct *dummy)
2400{
2401        struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2402        drain_stock(stock);
2403        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2404}
2405
2406static void __init memcg_stock_init(void)
2407{
2408        int cpu;
2409
2410        for_each_possible_cpu(cpu) {
2411                struct memcg_stock_pcp *stock =
2412                                        &per_cpu(memcg_stock, cpu);
2413                INIT_WORK(&stock->work, drain_local_stock);
2414        }
2415}
2416
2417/*
2418 * Cache charges(val), which are from the res_counter, in the local per-cpu area.
2419 * They will be consumed by the consume_stock() function later.
2420 */
2421static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2422{
2423        struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2424
2425        if (stock->cached != memcg) { /* reset if necessary */
2426                drain_stock(stock);
2427                stock->cached = memcg;
2428        }
2429        stock->nr_pages += nr_pages;
2430        put_cpu_var(memcg_stock);
2431}
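/*
 * Interplay sketch (illustrative only): __mem_cgroup_try_charge() below first
 * tries consume_stock(); on a miss it charges a CHARGE_BATCH-sized chunk from
 * the res_counter and finishes with refill_stock(memcg, batch - nr_pages), so
 * subsequent small charges on this cpu can be served from the per-cpu stock.
 */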
2432
2433/*
2434 * Drains all per-CPU charge caches for the given root_memcg and the subtree
2435 * of the hierarchy under it. The sync flag says whether we should block
2436 * until the work is done.
2437 */
2438static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2439{
2440        int cpu, curcpu;
2441
2442        /* Notify other cpus that system-wide "drain" is running */
2443        get_online_cpus();
2444        curcpu = get_cpu();
2445        for_each_online_cpu(cpu) {
2446                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2447                struct mem_cgroup *memcg;
2448
2449                memcg = stock->cached;
2450                if (!memcg || !stock->nr_pages)
2451                        continue;
2452                if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2453                        continue;
2454                if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2455                        if (cpu == curcpu)
2456                                drain_local_stock(&stock->work);
2457                        else
2458                                schedule_work_on(cpu, &stock->work);
2459                }
2460        }
2461        put_cpu();
2462
2463        if (!sync)
2464                goto out;
2465
2466        for_each_online_cpu(cpu) {
2467                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2468                if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2469                        flush_work(&stock->work);
2470        }
2471out:
2472        put_online_cpus();
2473}
2474
2475/*
2476 * Tries to drain stocked charges on other cpus. This function is asynchronous
2477 * and just puts a work item per cpu for draining locally on each cpu. The caller
2478 * can expect some charges to be returned to the res_counter later but cannot
2479 * wait for it.
2480 */
2481static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2482{
2483        /*
2484         * If someone calls draining, avoid adding more kworker runs.
2485         */
2486        if (!mutex_trylock(&percpu_charge_mutex))
2487                return;
2488        drain_all_stock(root_memcg, false);
2489        mutex_unlock(&percpu_charge_mutex);
2490}
2491
2492/* This is a synchronous drain interface. */
2493static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2494{
2495        /* called when force_empty is called */
2496        mutex_lock(&percpu_charge_mutex);
2497        drain_all_stock(root_memcg, true);
2498        mutex_unlock(&percpu_charge_mutex);
2499}
2500
2501/*
2502 * This function drains the percpu counter values from a DEAD cpu and
2503 * moves them to the local cpu. Note that this function can be preempted.
2504 */
2505static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2506{
2507        int i;
2508
2509        spin_lock(&memcg->pcp_counter_lock);
2510        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2511                long x = per_cpu(memcg->stat->count[i], cpu);
2512
2513                per_cpu(memcg->stat->count[i], cpu) = 0;
2514                memcg->nocpu_base.count[i] += x;
2515        }
2516        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2517                unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2518
2519                per_cpu(memcg->stat->events[i], cpu) = 0;
2520                memcg->nocpu_base.events[i] += x;
2521        }
2522        spin_unlock(&memcg->pcp_counter_lock);
2523}
2524
2525static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2526                                        unsigned long action,
2527                                        void *hcpu)
2528{
2529        int cpu = (unsigned long)hcpu;
2530        struct memcg_stock_pcp *stock;
2531        struct mem_cgroup *iter;
2532
2533        if (action == CPU_ONLINE)
2534                return NOTIFY_OK;
2535
2536        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2537                return NOTIFY_OK;
2538
2539        for_each_mem_cgroup(iter)
2540                mem_cgroup_drain_pcp_counter(iter, cpu);
2541
2542        stock = &per_cpu(memcg_stock, cpu);
2543        drain_stock(stock);
2544        return NOTIFY_OK;
2545}
2546
2547
2548/* See __mem_cgroup_try_charge() for details */
2549enum {
2550        CHARGE_OK,              /* success */
2551        CHARGE_RETRY,           /* need to retry but retry is not bad */
2552        CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
2553        CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and not enough res. */
2554        CHARGE_OOM_DIE,         /* current is killed because of OOM */
2555};
2556
2557static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2558                                unsigned int nr_pages, unsigned int min_pages,
2559                                bool oom_check)
2560{
2561        unsigned long csize = nr_pages * PAGE_SIZE;
2562        struct mem_cgroup *mem_over_limit;
2563        struct res_counter *fail_res;
2564        unsigned long flags = 0;
2565        int ret;
2566
2567        ret = res_counter_charge(&memcg->res, csize, &fail_res);
2568
2569        if (likely(!ret)) {
2570                if (!do_swap_account)
2571                        return CHARGE_OK;
2572                ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2573                if (likely(!ret))
2574                        return CHARGE_OK;
2575
2576                res_counter_uncharge(&memcg->res, csize);
2577                mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2578                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2579        } else
2580                mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2581        /*
2582         * Never reclaim on behalf of optional batching, retry with a
2583         * single page instead.
2584         */
2585        if (nr_pages > min_pages)
2586                return CHARGE_RETRY;
2587
2588        if (!(gfp_mask & __GFP_WAIT))
2589                return CHARGE_WOULDBLOCK;
2590
2591        if (gfp_mask & __GFP_NORETRY)
2592                return CHARGE_NOMEM;
2593
2594        ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2595        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2596                return CHARGE_RETRY;
2597        /*
2598         * Even though the limit is exceeded at this point, reclaim
2599         * may have been able to free some pages.  Retry the charge
2600         * before killing the task.
2601         *
2602         * Only for regular pages, though: huge pages are rather
2603         * unlikely to succeed so close to the limit, and we fall back
2604         * to regular pages anyway in case of failure.
2605         */
2606        if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2607                return CHARGE_RETRY;
2608
2609        /*
2610         * At task move, charge accounts can be doubly counted. So, it's
2611         * better to wait until the end of task_move if something is going on.
2612         */
2613        if (mem_cgroup_wait_acct_move(mem_over_limit))
2614                return CHARGE_RETRY;
2615
2616        /* If we don't need to call the oom-killer at all, return immediately */
2617        if (!oom_check)
2618                return CHARGE_NOMEM;
2619        /* check OOM */
2620        if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2621                return CHARGE_OOM_DIE;
2622
2623        return CHARGE_RETRY;
2624}
2625
2626/*
2627 * __mem_cgroup_try_charge() does
2628 * 1. detect memcg to be charged against from passed *mm and *ptr,
2629 * 2. update res_counter
2630 * 3. call memory reclaim if necessary.
2631 *
2632 * In some special cases, if the task is dying (fatal_signal_pending()) or
2633 * has TIF_MEMDIE set, this function returns -EINTR while writing root_mem_cgroup
2634 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
2635 * as possible without any hazards. 2: all pages should have a valid
2636 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2637 * pointer, that is treated as a charge to root_mem_cgroup.
2638 *
2639 * So __mem_cgroup_try_charge() will return
2640 *  0       ...  on success, filling *ptr with a valid memcg pointer.
2641 *  -ENOMEM ...  charge failure because of resource limits.
2642 *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
2643 *
2644 * Unlike the exported interface, an "oom" parameter is added. if oom==true,
2645 * the oom-killer can be invoked.
2646 */
2647static int __mem_cgroup_try_charge(struct mm_struct *mm,
2648                                   gfp_t gfp_mask,
2649                                   unsigned int nr_pages,
2650                                   struct mem_cgroup **ptr,
2651                                   bool oom)
2652{
2653        unsigned int batch = max(CHARGE_BATCH, nr_pages);
2654        int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2655        struct mem_cgroup *memcg = NULL;
2656        int ret;
2657
2658        /*
2659         * Unlike the global VM's OOM kill, we're not in a memory shortage
2660         * at the system level. So, allow dying processes to go ahead, in
2661         * addition to MEMDIE processes.
2662         */
2663        if (unlikely(test_thread_flag(TIF_MEMDIE)
2664                     || fatal_signal_pending(current)))
2665                goto bypass;
2666
2667        /*
2668         * We always charge the cgroup the mm_struct belongs to.
2669         * The mm_struct's mem_cgroup changes on task migration if the
2670         * thread group leader migrates. It's possible that mm is not
2671         * set, if so charge the root memcg (happens for pagecache usage).
2672         */
2673        if (!*ptr && !mm)
2674                *ptr = root_mem_cgroup;
2675again:
2676        if (*ptr) { /* css should be a valid one */
2677                memcg = *ptr;
2678                if (mem_cgroup_is_root(memcg))
2679                        goto done;
2680                if (consume_stock(memcg, nr_pages))
2681                        goto done;
2682                css_get(&memcg->css);
2683        } else {
2684                struct task_struct *p;
2685
2686                rcu_read_lock();
2687                p = rcu_dereference(mm->owner);
2688                /*
2689                 * Because we don't have task_lock(), "p" can exit.
2690                 * In that case, "memcg" can point to root or p can be NULL due to
2691                 * a race with swapoff. Then, we have a small risk of mis-accounting.
2692                 * But that kind of mis-accounting by a race always happens because
2693                 * we don't have cgroup_mutex(). It's overkill and we allow that
2694                 * small race, here.
2695                 * (*) swapoff et al. will charge against the mm_struct, not against
2696                 * the task_struct. So, mm->owner can be NULL.
2697                 */
2698                memcg = mem_cgroup_from_task(p);
2699                if (!memcg)
2700                        memcg = root_mem_cgroup;
2701                if (mem_cgroup_is_root(memcg)) {
2702                        rcu_read_unlock();
2703                        goto done;
2704                }
2705                if (consume_stock(memcg, nr_pages)) {
2706                        /*
2707                         * It seems dangerous to access memcg without css_get().
2708                         * But considering how consume_stock works, it's not
2709                         * necessary. If consume_stock succeeds, some charges
2710                         * from this memcg are cached on this cpu. So, we
2711                         * don't need to call css_get()/css_tryget() before
2712                         * calling consume_stock().
2713                         */
2714                        rcu_read_unlock();
2715                        goto done;
2716                }
2717                /* after here, we may be blocked. we need to get refcnt */
2718                if (!css_tryget(&memcg->css)) {
2719                        rcu_read_unlock();
2720                        goto again;
2721                }
2722                rcu_read_unlock();
2723        }
2724
2725        do {
2726                bool oom_check;
2727
2728                /* If killed, bypass charge */
2729                if (fatal_signal_pending(current)) {
2730                        css_put(&memcg->css);
2731                        goto bypass;
2732                }
2733
2734                oom_check = false;
2735                if (oom && !nr_oom_retries) {
2736                        oom_check = true;
2737                        nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2738                }
2739
2740                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2741                    oom_check);
2742                switch (ret) {
2743                case CHARGE_OK:
2744                        break;
2745                case CHARGE_RETRY: /* not in OOM situation but retry */
2746                        batch = nr_pages;
2747                        css_put(&memcg->css);
2748                        memcg = NULL;
2749                        goto again;
2750                case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2751                        css_put(&memcg->css);
2752                        goto nomem;
2753                case CHARGE_NOMEM: /* OOM routine works */
2754                        if (!oom) {
2755                                css_put(&memcg->css);
2756                                goto nomem;
2757                        }
2758                        /* If oom, we never return -ENOMEM */
2759                        nr_oom_retries--;
2760                        break;
2761                case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2762                        css_put(&memcg->css);
2763                        goto bypass;
2764                }
2765        } while (ret != CHARGE_OK);
2766
2767        if (batch > nr_pages)
2768                refill_stock(memcg, batch - nr_pages);
2769        css_put(&memcg->css);
2770done:
2771        *ptr = memcg;
2772        return 0;
2773nomem:
2774        *ptr = NULL;
2775        return -ENOMEM;
2776bypass:
2777        *ptr = root_mem_cgroup;
2778        return -EINTR;
2779}
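/*
 * Caller-side sketch (illustrative only; the charge-then-commit helpers later
 * in this file follow this pattern):
 *
 *         struct mem_cgroup *memcg = NULL;
 *         int ret;
 *
 *         ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
 *         if (ret == -ENOMEM)
 *                 return ret;     -> hard failure, the limits could not be met
 *         ... ret == 0 or -EINTR: *memcg is valid (root_mem_cgroup on -EINTR),
 *         ... so the charge can be committed to it:
 *         __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
 */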
2780
2781/*
2782 * Sometimes we have to undo a charge we got by try_charge().
2783 * This function is for that: it does the uncharge and puts the css refcnt
2784 * gotten by try_charge().
2785 */
2786static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2787                                       unsigned int nr_pages)
2788{
2789        if (!mem_cgroup_is_root(memcg)) {
2790                unsigned long bytes = nr_pages * PAGE_SIZE;
2791
2792                res_counter_uncharge(&memcg->res, bytes);
2793                if (do_swap_account)
2794                        res_counter_uncharge(&memcg->memsw, bytes);
2795        }
2796}
2797
2798/*
2799 * Cancel charges in this cgroup...doesn't propagate to the parent cgroup.
2800 * This is useful when moving usage to parent cgroup.
2801 */
2802static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2803                                        unsigned int nr_pages)
2804{
2805        unsigned long bytes = nr_pages * PAGE_SIZE;
2806
2807        if (mem_cgroup_is_root(memcg))
2808                return;
2809
2810        res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2811        if (do_swap_account)
2812                res_counter_uncharge_until(&memcg->memsw,
2813                                                memcg->memsw.parent, bytes);
2814}
2815
2816/*
2817 * A helper function to get a mem_cgroup from an ID. Must be called under
2818 * rcu_read_lock().  The caller is responsible for calling css_tryget if
2819 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2820 * called against removed memcg.)
2821 */
2822static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2823{
2824        struct cgroup_subsys_state *css;
2825
2826        /* ID 0 is unused ID */
2827        if (!id)
2828                return NULL;
2829        css = css_lookup(&mem_cgroup_subsys, id);
2830        if (!css)
2831                return NULL;
2832        return mem_cgroup_from_css(css);
2833}
2834
2835struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2836{
2837        struct mem_cgroup *memcg = NULL;
2838        struct page_cgroup *pc;
2839        unsigned short id;
2840        swp_entry_t ent;
2841
2842        VM_BUG_ON(!PageLocked(page));
2843
2844        pc = lookup_page_cgroup(page);
2845        lock_page_cgroup(pc);
2846        if (PageCgroupUsed(pc)) {
2847                memcg = pc->mem_cgroup;
2848                if (memcg && !css_tryget(&memcg->css))
2849                        memcg = NULL;
2850        } else if (PageSwapCache(page)) {
2851                ent.val = page_private(page);
2852                id = lookup_swap_cgroup_id(ent);
2853                rcu_read_lock();
2854                memcg = mem_cgroup_lookup(id);
2855                if (memcg && !css_tryget(&memcg->css))
2856                        memcg = NULL;
2857                rcu_read_unlock();
2858        }
2859        unlock_page_cgroup(pc);
2860        return memcg;
2861}
2862
2863static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2864                                       struct page *page,
2865                                       unsigned int nr_pages,
2866                                       enum charge_type ctype,
2867                                       bool lrucare)
2868{
2869        struct page_cgroup *pc = lookup_page_cgroup(page);
2870        struct zone *uninitialized_var(zone);
2871        struct lruvec *lruvec;
2872        bool was_on_lru = false;
2873        bool anon;
2874
2875        lock_page_cgroup(pc);
2876        VM_BUG_ON(PageCgroupUsed(pc));
2877        /*
2878         * we don't need page_cgroup_lock for tail pages, because they are not
2879         * accessed by any other context at this point.
2880         */
2881
2882        /*
2883         * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2884         * may already be on some other mem_cgroup's LRU.  Take care of it.
2885         */
2886        if (lrucare) {
2887                zone = page_zone(page);
2888                spin_lock_irq(&zone->lru_lock);
2889                if (PageLRU(page)) {
2890                        lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2891                        ClearPageLRU(page);
2892                        del_page_from_lru_list(page, lruvec, page_lru(page));
2893                        was_on_lru = true;
2894                }
2895        }
2896
2897        pc->mem_cgroup = memcg;
2898        /*
2899         * We access a page_cgroup asynchronously without lock_page_cgroup().
2900         * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2901         * is accessed after testing the USED bit. To make pc->mem_cgroup visible
2902         * before the USED bit, we need a memory barrier here.
2903         * See mem_cgroup_add_lru_list(), etc.
2904         */
2905        smp_wmb();
2906        SetPageCgroupUsed(pc);
2907
2908        if (lrucare) {
2909                if (was_on_lru) {
2910                        lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2911                        VM_BUG_ON(PageLRU(page));
2912                        SetPageLRU(page);
2913                        add_page_to_lru_list(page, lruvec, page_lru(page));
2914                }
2915                spin_unlock_irq(&zone->lru_lock);
2916        }
2917
2918        if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2919                anon = true;
2920        else
2921                anon = false;
2922
2923        mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2924        unlock_page_cgroup(pc);
2925
2926        /*
2927         * "charge_statistics" updated the event counter. Then, check it.
2928         * Insert the ancestor (and the ancestor's ancestors) into the softlimit
2929         * RB-tree if they exceed the softlimit.
2930         */
2931        memcg_check_events(memcg, page);
2932}
2933
2934static DEFINE_MUTEX(set_limit_mutex);
2935
2936#ifdef CONFIG_MEMCG_KMEM
2937static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2938{
2939        return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2940                (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2941}
2942
2943/*
2944 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2945 * in the memcg_cache_params struct.
2946 */
2947static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2948{
2949        struct kmem_cache *cachep;
2950
2951        VM_BUG_ON(p->is_root_cache);
2952        cachep = p->root_cache;
2953        return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2954}
2955
2956#ifdef CONFIG_SLABINFO
2957static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
2958                                        struct seq_file *m)
2959{
2960        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2961        struct memcg_cache_params *params;
2962
2963        if (!memcg_can_account_kmem(memcg))
2964                return -EIO;
2965
2966        print_slabinfo_header(m);
2967
2968        mutex_lock(&memcg->slab_caches_mutex);
2969        list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2970                cache_show(memcg_params_to_cache(params), m);
2971        mutex_unlock(&memcg->slab_caches_mutex);
2972
2973        return 0;
2974}
2975#endif
2976
2977static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2978{
2979        struct res_counter *fail_res;
2980        struct mem_cgroup *_memcg;
2981        int ret = 0;
2982        bool may_oom;
2983
2984        ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2985        if (ret)
2986                return ret;
2987
2988        /*
2989         * Conditions under which we can wait for the oom_killer. Those are
2990         * the same conditions tested by the core page allocator
2991         */
2992        may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2993
2994        _memcg = memcg;
2995        ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2996                                      &_memcg, may_oom);
2997
2998        if (ret == -EINTR)  {
2999                /*
3000                 * __mem_cgroup_try_charge() chose to bypass to root due to
3001                 * OOM kill or fatal signal.  Since our only options are to
3002                 * either fail the allocation or charge it to this cgroup, do
3003                 * it as a temporary condition. But we can't fail. From a
3004                 * kmem/slab perspective, the cache has already been selected,
3005                 * by mem_cgroup_kmem_get_cache(), so it is too late to change
3006                 * our minds.
3007                 *
3008                 * This condition will only trigger if the task entered
3009                 * memcg_charge_kmem in a sane state, but was OOM-killed during
3010                 * __mem_cgroup_try_charge() above. Tasks that were already
3011                 * dying when the allocation triggers should have been already
3012                 * directed to the root cgroup in memcontrol.h
3013                 */
3014                res_counter_charge_nofail(&memcg->res, size, &fail_res);
3015                if (do_swap_account)
3016                        res_counter_charge_nofail(&memcg->memsw, size,
3017                                                  &fail_res);
3018                ret = 0;
3019        } else if (ret)
3020                res_counter_uncharge(&memcg->kmem, size);
3021
3022        return ret;
3023}
3024
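/*
 * Uncharge @size bytes of kernel memory from @memcg's memory, memsw and
 * kmem counters. If the kmem counter drops to zero and the memcg has
 * already been marked dead, drop the css reference taken at offline time.
 */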
3025static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3026{
3027        res_counter_uncharge(&memcg->res, size);
3028        if (do_swap_account)
3029                res_counter_uncharge(&memcg->memsw, size);
3030
3031        /* Not down to 0 */
3032        if (res_counter_uncharge(&memcg->kmem, size))
3033                return;
3034
3035        /*
3036         * Release the reference taken in kmem_cgroup_css_offline in case
3037         * this last uncharge is racing with the offlining code or the
3038         * charge is outliving the memcg itself.
3039         *
3040         * The memory barrier imposed by test&clear is paired with the
3041         * explicit one in memcg_kmem_mark_dead().
3042         */
3043        if (memcg_kmem_test_and_clear_dead(memcg))
3044                css_put(&memcg->css);
3045}
3046
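/*
 * Link a newly created per-memcg cache into the owning memcg's list of
 * slab caches, under slab_caches_mutex.
 */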
3047void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3048{
3049        if (!memcg)
3050                return;
3051
3052        mutex_lock(&memcg->slab_caches_mutex);
3053        list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3054        mutex_unlock(&memcg->slab_caches_mutex);
3055}
3056
3057/*
3058 * Helper for accessing a memcg's index. It is used as an index into the
3059 * child cache array in kmem_cache, and also to derive the cache's name. This
3060 * function returns -1 when this is not a kmem-limited memcg.
3061 */
3062int memcg_cache_id(struct mem_cgroup *memcg)
3063{
3064        return memcg ? memcg->kmemcg_id : -1;
3065}
3066
3067/*
3068 * This ends up being protected by the set_limit mutex, during normal
3069 * operation, because that is its main call site.
3070 *
3071 * But when we create a new cache, we can call this as well if its parent
3072 * is kmem-limited. That will have to hold set_limit_mutex as well.
3073 */
3074int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3075{
3076        int num, ret;
3077
3078        num = ida_simple_get(&kmem_limited_groups,
3079                                0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3080        if (num < 0)
3081                return num;
3082        /*
3083         * After this point, kmem_accounted (which we test atomically at
3084         * the beginning of this conditional) is no longer 0. This
3085         * guarantees that only one process will set the following boolean
3086         * to true. We don't need test_and_set because we're protected
3087         * by set_limit_mutex anyway.
3088         */
3089        memcg_kmem_set_activated(memcg);
3090
3091        ret = memcg_update_all_caches(num+1);
3092        if (ret) {
3093                ida_simple_remove(&kmem_limited_groups, num);
3094                memcg_kmem_clear_activated(memcg);
3095                return ret;
3096        }
3097
3098        memcg->kmemcg_id = num;
3099        INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3100        mutex_init(&memcg->slab_caches_mutex);
3101        return 0;
3102}
3103
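/*
 * Size of the memcg_caches array for a root cache: twice the number of
 * kmem-limited groups, clamped to [MEMCG_CACHES_MIN_SIZE,
 * MEMCG_CACHES_MAX_SIZE], so the array does not have to grow on every
 * new group.
 */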
3104static size_t memcg_caches_array_size(int num_groups)
3105{
3106        ssize_t size;
3107        if (num_groups <= 0)
3108                return 0;
3109
3110        size = 2 * num_groups;
3111        if (size < MEMCG_CACHES_MIN_SIZE)
3112                size = MEMCG_CACHES_MIN_SIZE;
3113        else if (size > MEMCG_CACHES_MAX_SIZE)
3114                size = MEMCG_CACHES_MAX_SIZE;
3115
3116        return size;
3117}
3118
3119/*
3120 * We should update the current array size iff all cache updates succeed. This
3121 * can only be done from the slab side. The slab mutex needs to be held when
3122 * calling this.
3123 */
3124void memcg_update_array_size(int num)
3125{
3126        if (num > memcg_limited_groups_array_size)
3127                memcg_limited_groups_array_size = memcg_caches_array_size(num);
3128}
3129
3130static void kmem_cache_destroy_work_func(struct work_struct *w);
3131
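/*
 * Grow the memcg_caches array of the root cache @s so that it can hold at
 * least @num_groups entries, copying the existing child-cache pointers
 * into the new, larger memcg_cache_params.
 */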
3132int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3133{
3134        struct memcg_cache_params *cur_params = s->memcg_params;
3135
3136        VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
3137
3138        if (num_groups > memcg_limited_groups_array_size) {
3139                int i;
3140                ssize_t size = memcg_caches_array_size(num_groups);
3141
3142                size *= sizeof(void *);
3143                size += sizeof(struct memcg_cache_params);
3144
3145                s->memcg_params = kzalloc(size, GFP_KERNEL);
3146                if (!s->memcg_params) {
3147                        s->memcg_params = cur_params;
3148                        return -ENOMEM;
3149                }
3150
3151                s->memcg_params->is_root_cache = true;
3152
3153                /*
3154                 * There is a chance it will be bigger than
3155                 * memcg_limited_groups_array_size if we failed an allocation
3156                 * in a cache, in which case all caches updated before it will
3157                 * have a bigger array.
3158                 *
3159                 * But if that is the case, the data after
3160                 * memcg_limited_groups_array_size is certainly unused.
3161                 */
3162                for (i = 0; i < memcg_limited_groups_array_size; i++) {
3163                        if (!cur_params->memcg_caches[i])
3164                                continue;
3165                        s->memcg_params->memcg_caches[i] =
3166                                                cur_params->memcg_caches[i];
3167                }
3168
3169                /*
3170                 * Ideally, we would wait until all caches succeed, and only
3171                 * then free the old one. But this is not worth the extra
3172                 * pointer per-cache we'd have to have for this.
3173                 *
3174                 * It is not a big deal if some caches are left with a size
3175                 * bigger than the others. And all updates will reset this
3176                 * anyway.
3177                 */
3178                kfree(cur_params);
3179        }
3180        return 0;
3181}
3182
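/*
 * Allocate and initialize the memcg_cache_params of cache @s. For a root
 * cache (@memcg == NULL) this includes the memcg_caches pointer array;
 * for a per-memcg cache it records the owning memcg and its root cache.
 */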
3183int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3184                         struct kmem_cache *root_cache)
3185{
3186        size_t size = sizeof(struct memcg_cache_params);
3187
3188        if (!memcg_kmem_enabled())
3189                return 0;
3190
3191        if (!memcg)
3192                size += memcg_limited_groups_array_size * sizeof(void *);
3193
3194        s->memcg_params = kzalloc(size, GFP_KERNEL);
3195        if (!s->memcg_params)
3196                return -ENOMEM;
3197
3198        if (memcg) {
3199                s->memcg_params->memcg = memcg;
3200                s->memcg_params->root_cache = root_cache;
3201                INIT_WORK(&s->memcg_params->destroy,
3202                                kmem_cache_destroy_work_func);
3203        } else
3204                s->memcg_params->is_root_cache = true;
3205
3206        return 0;
3207}
3208
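/*
 * Undo memcg_register_cache(): for a per-memcg cache, clear its slot in
 * the root cache's array, unlink it from the memcg's list and drop the
 * memcg css reference; then free the memcg_cache_params.
 */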
3209void memcg_release_cache(struct kmem_cache *s)
3210{
3211        struct kmem_cache *root;
3212        struct mem_cgroup *memcg;
3213        int id;
3214
3215        /*
3216         * This happens, for instance, when a root cache goes away before we
3217         * add any memcg.
3218         */
3219        if (!s->memcg_params)
3220                return;
3221
3222        if (s->memcg_params->is_root_cache)
3223                goto out;
3224
3225        memcg = s->memcg_params->memcg;
3226        id  = memcg_cache_id(memcg);
3227
3228        root = s->memcg_params->root_cache;
3229        root->memcg_params->memcg_caches[id] = NULL;
3230
3231        mutex_lock(&memcg->slab_caches_mutex);
3232        list_del(&s->memcg_params->list);
3233        mutex_unlock(&memcg->slab_caches_mutex);
3234
3235        css_put(&memcg->css);
3236out:
3237        kfree(s->memcg_params);
3238}
3239
3240/*
3241 * During the creation of a new cache, we need to disable our accounting
3242 * mechanism altogether. This is true even if we are not creating, but rather
3243 * just enqueuing new caches to be created.
3244 *
3245 * This is because that process will trigger allocations; some visible, like
3246 * explicit kmallocs to auxiliary data structures, name strings and internal
3247 * cache structures; some well concealed, like INIT_WORK() that can allocate
3248 * objects during debug.
3249 *
3250 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3251 * to it. This may not be a bounded recursion: since the first cache creation
3252 * failed to complete (waiting on the allocation), we'll just try to create the
3253 * cache again, failing at the same point.
3254 *
3255 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3256 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3257 * inside the following two functions.
3258 */
3259static inline void memcg_stop_kmem_account(void)
3260{
3261        VM_BUG_ON(!current->mm);
3262        current->memcg_kmem_skip_account++;
3263}
3264
3265static inline void memcg_resume_kmem_account(void)
3266{
3267        VM_BUG_ON(!current->mm);
3268        current->memcg_kmem_skip_account--;
3269}
3270
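/*
 * Deferred destruction of a per-memcg cache, run from its "destroy" work
 * item.
 */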
3271static void kmem_cache_destroy_work_func(struct work_struct *w)
3272{
3273        struct kmem_cache *cachep;
3274        struct memcg_cache_params *p;
3275
3276        p = container_of(w, struct memcg_cache_params, destroy);
3277
3278        cachep = memcg_params_to_cache(p);
3279
3280        /*
3281         * If we get down to 0 after shrink, we could delete right away.
3282         * However, memcg_release_pages() already puts us back in the workqueue
3283         * in that case. If we proceed deleting, we'll get a dangling
3284         * reference, and removing the object from the workqueue in that case
3285         * is an unnecessary complication. We are not a fast path.
3286         *
3287         * Note that this case is fundamentally different from racing with
3288         * shrink_slab(): if mem_cgroup_destroy_cache() is called in
3289         * kmem_cache_shrink, not only would we be reinserting a dead cache
3290         * into the queue, but doing so from inside the worker racing to
3291         * destroy it.
3292         *
3293         * So if we aren't down to zero, we'll just schedule a worker and try
3294         * again.
3295         */
3296        if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3297                kmem_cache_shrink(cachep);
3298                if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3299                        return;
3300        } else
3301                kmem_cache_destroy(cachep);
3302}
3303
3304void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3305{
3306        if (!cachep->memcg_params->dead)
3307                return;
3308
3309        /*
3310         * There are many ways in which we can get here.
3311         *
3312         * We can get to a memory-pressure situation while the delayed work is
3313         * still pending to run. The vmscan shrinkers can then release all
3314         * cache memory and get us to destruction. If this is the case, we'll
3315         * be executed twice, which is a bug (the second time will execute over
3316         * bogus data). In this case, cancelling the work should be fine.
3317         *
3318         * But we can also get here from the worker itself, if
3319         * kmem_cache_shrink is enough to shake all the remaining objects and
3320         * get the page count to 0. In this case, we'll deadlock if we try to
3321         * cancel the work (the worker runs with an internal lock held, which
3322         * is the same lock we would hold for cancel_work_sync()).
3323         *
3324         * Since we can't possibly know who got us here, just refrain from
3325         * running if there is already work pending.
3326         */
3327        if (work_pending(&cachep->memcg_params->destroy))
3328                return;
3329        /*
3330         * We have to defer the actual destroying to a workqueue, because
3331         * we might currently be in a context that cannot sleep.
3332         */
3333        schedule_work(&cachep->memcg_params->destroy);
3334}
3335
3336/*
3337 * This lock protects updaters, not readers. We want readers to be as fast as
3338 * they can, and they will either see NULL or a valid cache value. Our model
3339 * allows them to see NULL, in which case the root memcg will be selected.
3340 *
3341 * We need this lock because multiple allocations to the same, not-yet-created
3342 * per-memcg cache can span more than one worker. Only one of them can create it.
3343 */
3344static DEFINE_MUTEX(memcg_cache_mutex);
3345
3346/*
3347 * Called with memcg_cache_mutex held
3348 */
3349static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3350                                         struct kmem_cache *s)
3351{
3352        struct kmem_cache *new;
3353        static char *tmp_name = NULL;
3354
3355        lockdep_assert_held(&memcg_cache_mutex);
3356
3357        /*
3358         * kmem_cache_create_memcg duplicates the given name, and
3359         * cgroup_name() for this name requires RCU context.
3360         * This static temporary buffer is used to avoid a pointless
3361         * short-lived allocation.
3362         */
3363        if (!tmp_name) {
3364                tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3365                if (!tmp_name)
3366                        return NULL;
3367        }
3368
3369        rcu_read_lock();
3370        snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3371                         memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3372        rcu_read_unlock();
3373
3374        new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3375                                      (s->flags & ~SLAB_PANIC), s->ctor, s);
3376
3377        if (new)
3378                new->allocflags |= __GFP_KMEMCG;
3379
3380        return new;
3381}
3382
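/*
 * Create the per-memcg copy of @cachep for @memcg, or return the existing
 * one, and publish it in the root cache's memcg_caches array. The css
 * reference taken by the caller is dropped here on failure or if the cache
 * already exists; otherwise it is kept until memcg_release_cache().
 */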
3383static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3384                                                  struct kmem_cache *cachep)
3385{
3386        struct kmem_cache *new_cachep;
3387        int idx;
3388
3389        BUG_ON(!memcg_can_account_kmem(memcg));
3390
3391        idx = memcg_cache_id(memcg);
3392
3393        mutex_lock(&memcg_cache_mutex);
3394        new_cachep = cachep->memcg_params->memcg_caches[idx];
3395        if (new_cachep) {
3396                css_put(&memcg->css);
3397                goto out;
3398        }
3399
3400        new_cachep = kmem_cache_dup(memcg, cachep);
3401        if (new_cachep == NULL) {
3402                new_cachep = cachep;
3403                css_put(&memcg->css);
3404                goto out;
3405        }
3406
3407        atomic_set(&new_cachep->memcg_params->nr_pages, 0);
3408
3409        cachep->memcg_params->memcg_caches[idx] = new_cachep;
3410        /*
3411         * The readers won't lock; make sure everybody sees the updated value,
3412         * so they won't put stuff in the queue again for no reason.
3413         */
3414        wmb();
3415out:
3416        mutex_unlock(&memcg_cache_mutex);
3417        return new_cachep;
3418}
3419
3420void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3421{
3422        struct kmem_cache *c;
3423        int i;
3424
3425        if (!s->memcg_params)
3426                return;
3427        if (!s->memcg_params->is_root_cache)
3428                return;
3429
3430        /*
3431         * If the cache is being destroyed, we trust that there is no one else
3432         * requesting objects from it. Even if there are, the sanity checks in
3433         * kmem_cache_destroy should catch this ill case.
3434         *
3435         * Still, we don't want anyone else freeing memcg_caches under our
3436         * noses, which can happen if a new memcg comes to life. As usual,
3437         * we'll take the set_limit_mutex to protect ourselves against this.
3438         */
3439        mutex_lock(&set_limit_mutex);
3440        for (i = 0; i < memcg_limited_groups_array_size; i++) {
3441                c = s->memcg_params->memcg_caches[i];
3442                if (!c)
3443                        continue;
3444
3445                /*
3446                 * We will now manually delete the caches, so to avoid races
3447                 * we need to cancel all pending destruction workers and
3448                 * proceed with destruction ourselves.
3449                 *
3450                 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3451                 * and that could spawn the workers again: it is likely that
3452                 * the cache still has active pages until this very moment.
3453                 * This would lead us back to mem_cgroup_destroy_cache.
3454                 *
3455                 * But that will not execute at all if the "dead" flag is not
3456                 * set, so clear it here to guarantee we are in control.
3457                 */
3458                c->memcg_params->dead = false;
3459                cancel_work_sync(&c->memcg_params->destroy);
3460                kmem_cache_destroy(c);
3461        }
3462        mutex_unlock(&set_limit_mutex);
3463}
3464
3465struct create_work {
3466        struct mem_cgroup *memcg;
3467        struct kmem_cache *cachep;
3468        struct work_struct work;
3469};
3470
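/*
 * Mark every per-memcg cache of @memcg dead and schedule its deferred
 * destruction.
 */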
3471static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3472{
3473        struct kmem_cache *cachep;
3474        struct memcg_cache_params *params;
3475
3476        if (!memcg_kmem_is_active(memcg))
3477                return;
3478
3479        mutex_lock(&memcg->slab_caches_mutex);
3480        list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3481                cachep = memcg_params_to_cache(params);
3482                cachep->memcg_params->dead = true;
3483                schedule_work(&cachep->memcg_params->destroy);
3484        }
3485        mutex_unlock(&memcg->slab_caches_mutex);
3486}
3487
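/* Worker side of memcg_create_cache_enqueue(): actually create the cache. */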
3488static void memcg_create_cache_work_func(struct work_struct *w)
3489{
3490        struct create_work *cw;
3491
3492        cw = container_of(w, struct create_work, work);
3493        memcg_create_kmem_cache(cw->memcg, cw->cachep);
3494        kfree(cw);
3495}
3496
3497/*
3498 * Enqueue the creation of a per-memcg kmem_cache.
3499 */
3500static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3501                                         struct kmem_cache *cachep)
3502{
3503        struct create_work *cw;
3504
3505        cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3506        if (cw == NULL) {
3507                css_put(&memcg->css);
3508                return;
3509        }
3510
3511        cw->memcg = memcg;
3512        cw->cachep = cachep;
3513
3514        INIT_WORK(&cw->work, memcg_create_cache_work_func);
3515        schedule_work(&cw->work);
3516}
3517
3518static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3519                                       struct kmem_cache *cachep)
3520{
3521        /*
3522         * We need to stop accounting when we kmalloc, because if the
3523         * corresponding kmalloc cache is not yet created, the first allocation
3524         * in __memcg_create_cache_enqueue will recurse.
3525         *
3526         * However, it is better to enclose the whole function. Depending on
3527         * the debugging options enabled, INIT_WORK(), for instance, can
3528         * trigger an allocation. This, too, will make us recurse. Because at
3529         * this point we can't allow ourselves back into memcg_kmem_get_cache,
3530         * the safest choice is to do it like this, wrapping the whole function.
3531         */
3532        memcg_stop_kmem_account();
3533        __memcg_create_cache_enqueue(memcg, cachep);
3534        memcg_resume_kmem_account();
3535}
3536/*
3537 * Return the kmem_cache we're supposed to use for a slab allocation.
3538 * We try to use the current memcg's version of the cache.
3539 *
3540 * If the cache does not exist yet and we are the first user of it,
3541 * we either create it immediately, if possible, or create it asynchronously
3542 * in a workqueue.
3543 * In the latter case, we will let the current allocation go through with
3544 * the original cache.
3545 *
3546 * Can't be called in interrupt context or from kernel threads.
3547 * This function needs to be called with rcu_read_lock() held.
3548 */
3549struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3550                                          gfp_t gfp)
3551{
3552        struct mem_cgroup *memcg;
3553        int idx;
3554
3555        VM_BUG_ON(!cachep->memcg_params);
3556        VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3557
3558        if (!current->mm || current->memcg_kmem_skip_account)
3559                return cachep;
3560
3561        rcu_read_lock();
3562        memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3563
3564        if (!memcg_can_account_kmem(memcg))
3565                goto out;
3566
3567        idx = memcg_cache_id(memcg);
3568
3569        /*
3570         * barrier to make sure we're always seeing the up-to-date value.  The
3571         * code updating memcg_caches will issue a write barrier to match this.
3572         */
3573        read_barrier_depends();
3574        if (likely(cachep->memcg_params->memcg_caches[idx])) {
3575                cachep = cachep->memcg_params->memcg_caches[idx];
3576                goto out;
3577        }
3578
3579        /* The corresponding put will be done in the workqueue. */
3580        if (!css_tryget(&memcg->css))
3581                goto out;
3582        rcu_read_unlock();
3583
3584        /*
3585         * If we are in a safe context (can wait, and not in interrupt
3586         * context), we could be predictable and return right away.
3587         * This would guarantee that the allocation being performed
3588         * already belongs in the new cache.
3589         *
3590         * However, there are some clashes that can arise from locking.
3591         * For instance, because we acquire the slab_mutex while doing
3592         * kmem_cache_dup, this means no further allocation could happen
3593         * with the slab_mutex held.
3594         *
3595         * Also, because cache creation issues get_online_cpus(), this
3596         * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3597         * that ends up reversed during cpu hotplug. (cpuset allocates
3598         * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3599         * it is better to defer everything.
3600         */
3601        memcg_create_cache_enqueue(memcg, cachep);
3602        return cachep;
3603out:
3604        rcu_read_unlock();
3605        return cachep;
3606}
3607EXPORT_SYMBOL(__memcg_kmem_get_cache);
3608
3609/*
3610 * We need to verify if the allocation against current->mm->owner's memcg is
3611 * possible for the given order. But the page is not allocated yet, so we'll
3612 * need a further commit step to do the final arrangements.
3613 *
3614 * It is possible for the task to switch cgroups in the meantime, so at
3615 * commit time, we can't rely on task conversion any longer.  We'll then use
3616 * the handle argument to return to the caller which cgroup we should commit
3617 * against. We could also return the memcg directly and avoid the pointer
3618 * passing, but a boolean return value gives better semantics considering
3619 * the compiled-out case as well.
3620 *
3621 * Returning true means the allocation is possible.
3622 */
3623bool
3624__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3625{
3626        struct mem_cgroup *memcg;
3627        int ret;
3628
3629        *_memcg = NULL;
3630
3631        /*
3632         * Disabling accounting is only relevant for some specific memcg
3633         * internal allocations. Therefore we would initially not have such a
3634         * check here, since direct calls to the page allocator that are marked
3635         * with GFP_KMEMCG only happen outside memcg core. We are mostly
3636         * concerned with cache allocations, and by having this test at
3637         * memcg_kmem_get_cache, we are already able to relay the allocation to
3638         * the root cache and bypass the memcg cache altogether.
3639         *
3640         * There is one exception, though: the SLUB allocator does not create
3641         * large order caches, but rather services large kmallocs directly from
3642         * the page allocator. Therefore, the following sequence when backed by
3643         * the SLUB allocator:
3644         *
3645         *      memcg_stop_kmem_account();
3646         *      kmalloc(<large_number>)
3647         *      memcg_resume_kmem_account();
3648         *
3649         * would effectively ignore the fact that we should skip accounting,
3650         * since it will drive us directly to this function without passing
3651         * through the cache selector memcg_kmem_get_cache. Such large
3652         * allocations are extremely rare but can happen, for instance, for the
3653         * cache arrays. We bring this test here.
3654         */
3655        if (!current->mm || current->memcg_kmem_skip_account)
3656                return true;
3657
3658        memcg = try_get_mem_cgroup_from_mm(current->mm);
3659
3660        /*
3661         * Very rare case described in mem_cgroup_from_task. Unfortunately there
3662         * isn't much we can do without complicating this too much, and it would
3663         * be gfp-dependent anyway. Just let it go.
3664         */
3665        if (unlikely(!memcg))
3666                return true;
3667
3668        if (!memcg_can_account_kmem(memcg)) {
3669                css_put(&memcg->css);
3670                return true;
3671        }
3672
3673        ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3674        if (!ret)
3675                *_memcg = memcg;
3676
3677        css_put(&memcg->css);
3678        return (ret == 0);
3679}
3680
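/*
 * Commit a charge prepared by __memcg_kmem_newpage_charge(): bind the
 * freshly allocated page to @memcg, or revert the charge if the page
 * allocation failed.
 */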
3681void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3682                              int order)
3683{
3684        struct page_cgroup *pc;
3685
3686        VM_BUG_ON(mem_cgroup_is_root(memcg));
3687
3688        /* The page allocation failed. Revert */
3689        if (!page) {
3690                memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3691                return;
3692        }
3693
3694        pc = lookup_page_cgroup(page);
3695        lock_page_cgroup(pc);
3696        pc->mem_cgroup = memcg;
3697        SetPageCgroupUsed(pc);
3698        unlock_page_cgroup(pc);
3699}
3700
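/*
 * Uncharge a kmem page on free: look up the owning memcg from the
 * page_cgroup, clear the USED bit and give the charge back.
 */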
3701void __memcg_kmem_uncharge_pages(struct page *page, int order)
3702{
3703        struct mem_cgroup *memcg = NULL;
3704        struct page_cgroup *pc;
3705
3706
3707        pc = lookup_page_cgroup(page);
3708        /*
3709         * Fast unlocked return. The state theoretically might have changed;
3710         * we have to check again after locking.
3711         */
3712        if (!PageCgroupUsed(pc))
3713                return;
3714
3715        lock_page_cgroup(pc);
3716        if (PageCgroupUsed(pc)) {
3717                memcg = pc->mem_cgroup;
3718                ClearPageCgroupUsed(pc);
3719        }
3720        unlock_page_cgroup(pc);
3721
3722        /*
3723         * We trust that the allocation is valid only if there is a memcg
3724         * associated with the page.
3725         */
3726        if (!memcg)
3727                return;
3728
3729        VM_BUG_ON(mem_cgroup_is_root(memcg));
3730        memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3731}
3732#else
3733static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3734{
3735}
3736#endif /* CONFIG_MEMCG_KMEM */
3737
3738#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3739
3740#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3741/*
3742 * Because tail pages are not marked as "used", set them. We're under
3743 * zone->lru_lock, 'splitting on pmd' and compound_lock.
3744 * Charge/uncharge will never happen and move_account() is done under
3745 * compound_lock(), so we don't have to take care of races.
3746 */
3747void mem_cgroup_split_huge_fixup(struct page *head)
3748{
3749        struct page_cgroup *head_pc = lookup_page_cgroup(head);
3750        struct page_cgroup *pc;
3751        struct mem_cgroup *memcg;
3752        int i;
3753
3754        if (mem_cgroup_disabled())
3755                return;
3756
3757        memcg = head_pc->mem_cgroup;
3758        for (i = 1; i < HPAGE_PMD_NR; i++) {
3759                pc = head_pc + i;
3760                pc->mem_cgroup = memcg;
3761                smp_wmb();/* see __commit_charge() */
3762                pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3763        }
3764        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3765                       HPAGE_PMD_NR);
3766}
3767#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3768
3769/**
3770 * mem_cgroup_move_account - move account of the page
3771 * @page: the page
3772 * @nr_pages: number of regular pages (>1 for huge pages)
3773 * @pc: page_cgroup of the page.
3774 * @from: mem_cgroup which the page is moved from.
3775 * @to: mem_cgroup which the page is moved to. @from != @to.
3776 *
3777 * The caller must confirm following.
3778 * - page is not on LRU (isolate_page() is useful.)
3779 * - compound_lock is held when nr_pages > 1
3780 *
3781 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
3782 * from old cgroup.
3783 */
3784static int mem_cgroup_move_account(struct page *page,
3785                                   unsigned int nr_pages,
3786                                   struct page_cgroup *pc,
3787                                   struct mem_cgroup *from,
3788                                   struct mem_cgroup *to)
3789{
3790        unsigned long flags;
3791        int ret;
3792        bool anon = PageAnon(page);
3793
3794        VM_BUG_ON(from == to);
3795        VM_BUG_ON(PageLRU(page));
3796        /*
3797         * The page is isolated from the LRU. So the collapse function
3798         * will not handle this page. But page splitting can happen.
3799         * Do this check under the compound page lock; the caller should
3800         * hold it.
3801         */
3802        ret = -EBUSY;
3803        if (nr_pages > 1 && !PageTransHuge(page))
3804                goto out;
3805
3806        lock_page_cgroup(pc);
3807
3808        ret = -EINVAL;
3809        if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3810                goto unlock;
3811
3812        move_lock_mem_cgroup(from, &flags);
3813
3814        if (!anon && page_mapped(page)) {
3815                /* Update mapped_file data for mem_cgroup */
3816                preempt_disable();
3817                __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3818                __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
3819                preempt_enable();
3820        }
3821        mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3822
3823        /* caller should have done css_get */
3824        pc->mem_cgroup = to;
3825        mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3826        move_unlock_mem_cgroup(from, &flags);
3827        ret = 0;
3828unlock:
3829        unlock_page_cgroup(pc);
3830        /*
3831         * check events
3832         */
3833        memcg_check_events(to, page);
3834        memcg_check_events(from, page);
3835out:
3836        return ret;
3837}
3838
3839/**
3840 * mem_cgroup_move_parent - moves page to the parent group
3841 * @page: the page to move
3842 * @pc: page_cgroup of the page
3843 * @child: page's cgroup
3844 *
3845 * move charges to its parent or the root cgroup if the group has no
3846 * parent (aka use_hierarchy==0).
3847 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3848 * mem_cgroup_move_account fails) the failure is always temporary and
3849 * it signals a race with a page removal/uncharge or migration. In the
3850 * first case the page is on the way out and it will vanish from the LRU
3851 * on the next attempt and the call should be retried later.
3852 * Isolation from the LRU fails only if page has been isolated from
3853 * the LRU since we looked at it and that usually means either global
3854 * reclaim or migration going on. The page will either get back to the
3855 * LRU or vanish.
3856 * Finally, mem_cgroup_move_account fails only if the page got uncharged
3857 * (!PageCgroupUsed) or moved to a different group. The page will
3858 * disappear in the next attempt.
3859 */
3860static int mem_cgroup_move_parent(struct page *page,
3861                                  struct page_cgroup *pc,
3862                                  struct mem_cgroup *child)
3863{
3864        struct mem_cgroup *parent;
3865        unsigned int nr_pages;
3866        unsigned long uninitialized_var(flags);
3867        int ret;
3868
3869        VM_BUG_ON(mem_cgroup_is_root(child));
3870
3871        ret = -EBUSY;
3872        if (!get_page_unless_zero(page))
3873                goto out;
3874        if (isolate_lru_page(page))
3875                goto put;
3876
3877        nr_pages = hpage_nr_pages(page);
3878
3879        parent = parent_mem_cgroup(child);
3880        /*
3881         * If no parent, move charges to root cgroup.
3882         */
3883        if (!parent)
3884                parent = root_mem_cgroup;
3885
3886        if (nr_pages > 1) {
3887                VM_BUG_ON(!PageTransHuge(page));
3888                flags = compound_lock_irqsave(page);
3889        }
3890
3891        ret = mem_cgroup_move_account(page, nr_pages,
3892                                pc, child, parent);
3893        if (!ret)
3894                __mem_cgroup_cancel_local_charge(child, nr_pages);
3895
3896        if (nr_pages > 1)
3897                compound_unlock_irqrestore(page, flags);
3898        putback_lru_page(page);
3899put:
3900        put_page(page);
3901out:
3902        return ret;
3903}
3904
3905/*
3906 * Charge the memory controller for page usage.
3907 * Return:
3908 * 0 if the charge was successful
3909 * < 0 if the cgroup is over its limit
3910 */
3911static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3912                                gfp_t gfp_mask, enum charge_type ctype)
3913{
3914        struct mem_cgroup *memcg = NULL;
3915        unsigned int nr_pages = 1;
3916        bool oom = true;
3917        int ret;
3918
3919        if (PageTransHuge(page)) {
3920                nr_pages <<= compound_order(page);
3921                VM_BUG_ON(!PageTransHuge(page));
3922                /*
3923                 * Never OOM-kill a process for a huge page.  The
3924                 * fault handler will fall back to regular pages.
3925                 */
3926                oom = false;
3927        }
3928
3929        ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3930        if (ret == -ENOMEM)
3931                return ret;
3932        __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3933        return 0;
3934}
3935
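/* Charge a newly allocated anonymous page before it is mapped. */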
3936int mem_cgroup_newpage_charge(struct page *page,
3937                              struct mm_struct *mm, gfp_t gfp_mask)
3938{
3939        if (mem_cgroup_disabled())
3940                return 0;
3941        VM_BUG_ON(page_mapped(page));
3942        VM_BUG_ON(page->mapping && !PageAnon(page));
3943        VM_BUG_ON(!mm);
3944        return mem_cgroup_charge_common(page, mm, gfp_mask,
3945                                        MEM_CGROUP_CHARGE_TYPE_ANON);
3946}
3947
3948/*
3949 * During swap-in (try_charge -> commit or cancel), the page is locked.
3950 * And when try_charge() successfully returns, one refcount on the memcg,
3951 * not tied to any struct page_cgroup, is acquired. This refcount will be
3952 * consumed by "commit()" or released by "cancel()".
3953 */
3954static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3955                                          struct page *page,
3956                                          gfp_t mask,
3957                                          struct mem_cgroup **memcgp)
3958{
3959        struct mem_cgroup *memcg;
3960        struct page_cgroup *pc;
3961        int ret;
3962
3963        pc = lookup_page_cgroup(page);
3964        /*
3965         * Every swap fault against a single page tries to charge the
3966         * page, bail as early as possible.  shmem_unuse() encounters
3967         * already charged pages, too.  The USED bit is protected by
3968         * the page lock, which serializes swap cache removal, which
3969         * in turn serializes uncharging.
3970         */
3971        if (PageCgroupUsed(pc))
3972                return 0;
3973        if (!do_swap_account)
3974                goto charge_cur_mm;
3975        memcg = try_get_mem_cgroup_from_page(page);
3976        if (!memcg)
3977                goto charge_cur_mm;
3978        *memcgp = memcg;
3979        ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
3980        css_put(&memcg->css);
3981        if (ret == -EINTR)
3982                ret = 0;
3983        return ret;
3984charge_cur_mm:
3985        ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
3986        if (ret == -EINTR)
3987                ret = 0;
3988        return ret;
3989}
3990
3991int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3992                                 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3993{
3994        *memcgp = NULL;
3995        if (mem_cgroup_disabled())
3996                return 0;
3997        /*
3998         * A racing thread's fault, or swapoff, may have already
3999         * updated the pte, and even removed the page from swap cache: in
4000         * those cases unuse_pte()'s pte_same() test will fail; but
4001         * there's also a KSM case which does need to charge the page.
4002         */
4003        if (!PageSwapCache(page)) {
4004                int ret;
4005
4006                ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
4007                if (ret == -EINTR)
4008                        ret = 0;
4009                return ret;
4010        }
4011        return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
4012}
4013
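/* Cancel a swap-in charge that was prepared but will not be committed. */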
4014void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
4015{
4016        if (mem_cgroup_disabled())
4017                return;
4018        if (!memcg)
4019                return;
4020        __mem_cgroup_cancel_charge(memcg, 1);
4021}
4022
4023static void
4024__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
4025                                        enum charge_type ctype)
4026{
4027        if (mem_cgroup_disabled())
4028                return;
4029        if (!memcg)
4030                return;
4031
4032        __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
4033        /*
4034         * Now the swapped-in page is in memory. This means the page may be
4035         * counted both as mem and swap: a double count.
4036         * Fix it by uncharging from memsw. Basically, this swap cache page is
4037         * stable under lock_page(). But in do_swap_page() (memory.c),
4038         * reuse_swap_page() may call delete_from_swap_cache() before we reach here.
4039         */
4040        if (do_swap_account && PageSwapCache(page)) {
4041                swp_entry_t ent = {.val = page_private(page)};
4042                mem_cgroup_uncharge_swap(ent);
4043        }
4044}
4045
4046void mem_cgroup_commit_charge_swapin(struct page *page,
4047                                     struct mem_cgroup *memcg)
4048{
4049        __mem_cgroup_commit_charge_swapin(page, memcg,
4050                                          MEM_CGROUP_CHARGE_TYPE_ANON);
4051}
4052
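/*
 * Charge a page cache page. Swap cache (shmem) pages go through the
 * swap-in charge path so that a possible swap charge is resolved as well.
 */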
4053int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4054                                gfp_t gfp_mask)
4055{
4056        struct mem_cgroup *memcg = NULL;
4057        enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4058        int ret;
4059
4060        if (mem_cgroup_disabled())
4061                return 0;
4062        if (PageCompound(page))
4063                return 0;
4064
4065        if (!PageSwapCache(page))
4066                ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4067        else { /* page is swapcache/shmem */
4068                ret = __mem_cgroup_try_charge_swapin(mm, page,
4069                                                     gfp_mask, &memcg);
4070                if (!ret)
4071                        __mem_cgroup_commit_charge_swapin(page, memcg, type);
4072        }
4073        return ret;
4074}
4075
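/*
 * Give @nr_pages worth of charge back to @memcg, batching the res_counter
 * uncharges through current->memcg_batch when possible.
 */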
4076static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4077                                   unsigned int nr_pages,
4078                                   const enum charge_type ctype)
4079{
4080        struct memcg_batch_info *batch = NULL;
4081        bool uncharge_memsw = true;
4082
4083        /* If swapout, usage of swap doesn't decrease */
4084        if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4085                uncharge_memsw = false;
4086
4087        batch = &current->memcg_batch;
4088        /*
4089         * Usually, we do css_get() when we remember a memcg pointer.
4090         * But in this case, we keep res->usage until the end of a series of
4091         * uncharges, so it's OK to ignore the memcg's refcount.
4092         */
4093        if (!batch->memcg)
4094                batch->memcg = memcg;
4095        /*
4096         * do_batch > 0 when unmapping pages or inode invalidate/truncate.
4097         * In those cases, all pages freed continuously can be expected to be in
4098         * the same cgroup and we have a chance to coalesce uncharges.
4099         * But we uncharge one by one if the task is being OOM-killed (TIF_MEMDIE)
4100         * because we want to do the uncharge as soon as possible.
4101         */
4102
4103        if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4104                goto direct_uncharge;
4105
4106        if (nr_pages > 1)
4107                goto direct_uncharge;
4108
4109        /*
4110         * In the typical case, batch->memcg == memcg. This means we can
4111         * merge a series of uncharges into one res_counter uncharge.
4112         * If not, we uncharge the res_counter one by one.
4113         */
4114        if (batch->memcg != memcg)
4115                goto direct_uncharge;
4116        /* remember freed charge and uncharge it later */
4117        batch->nr_pages++;
4118        if (uncharge_memsw)
4119                batch->memsw_nr_pages++;
4120        return;
4121direct_uncharge:
4122        res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4123        if (uncharge_memsw)
4124                res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4125        if (unlikely(batch->memcg != memcg))
4126                memcg_oom_recover(memcg);
4127}
4128
4129/*
4130 * uncharge if !page_mapped(page)
4131 */
4132static struct mem_cgroup *
4133__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4134                             bool end_migration)
4135{
4136        struct mem_cgroup *memcg = NULL;
4137        unsigned int nr_pages = 1;
4138        struct page_cgroup *pc;
4139        bool anon;
4140
4141        if (mem_cgroup_disabled())
4142                return NULL;
4143
4144        if (PageTransHuge(page)) {
4145                nr_pages <<= compound_order(page);
4146                VM_BUG_ON(!PageTransHuge(page));
4147        }
4148        /*
4149         * Check if our page_cgroup is valid
4150         */
4151        pc = lookup_page_cgroup(page);
4152        if (unlikely(!PageCgroupUsed(pc)))
4153                return NULL;
4154
4155        lock_page_cgroup(pc);
4156
4157        memcg = pc->mem_cgroup;
4158
4159        if (!PageCgroupUsed(pc))
4160                goto unlock_out;
4161
4162        anon = PageAnon(page);
4163
4164        switch (ctype) {
4165        case MEM_CGROUP_CHARGE_TYPE_ANON:
4166                /*
4167                 * Generally PageAnon tells if it's the anon statistics to be
4168                 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
4169                 * used before page reached the stage of being marked PageAnon.
4170                 */
4171                anon = true;
4172                /* fallthrough */
4173        case MEM_CGROUP_CHARGE_TYPE_DROP:
4174                /* See mem_cgroup_prepare_migration() */
4175                if (page_mapped(page))
4176                        goto unlock_out;
4177                /*
4178                 * Pages under migration may not be uncharged.  But
4179                 * end_migration() /must/ be the one uncharging the
4180                 * unused post-migration page and so it has to call
4181                 * here with the migration bit still set.  See the
4182                 * res_counter handling below.
4183                 */
4184                if (!end_migration && PageCgroupMigration(pc))
4185                        goto unlock_out;
4186                break;
4187        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4188                if (!PageAnon(page)) {  /* Shared memory */
4189                        if (page->mapping && !page_is_file_cache(page))
4190                                goto unlock_out;
4191                } else if (page_mapped(page)) /* Anon */
4192                                goto unlock_out;
4193                break;
4194        default:
4195                break;
4196        }
4197
4198        mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4199
4200        ClearPageCgroupUsed(pc);
4201        /*
4202         * pc->mem_cgroup is not cleared here. It will be accessed when it's
4203         * freed from the LRU. This is safe because an uncharged page is expected
4204         * not to be reused (it is freed soon). The exception is swap cache, which
4205         * is handled by special functions.
4206         */
4207
4208        unlock_page_cgroup(pc);
4209        /*
4210         * Even after unlock, we have memcg->res.usage here and this memcg
4211         * will never be freed, so it's safe to call css_get().
4212         */
4213        memcg_check_events(memcg, page);
4214        if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4215                mem_cgroup_swap_statistics(memcg, true);
4216                css_get(&memcg->css);
4217        }
4218        /*
4219         * Migration does not charge the res_counter for the
4220         * replacement page, so leave it alone when phasing out the
4221         * page that is unused after the migration.
4222         */
4223        if (!end_migration && !mem_cgroup_is_root(memcg))
4224                mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4225
4226        return memcg;
4227
4228unlock_out:
4229        unlock_page_cgroup(pc);
4230        return NULL;
4231}
4232
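/* Uncharge an anonymous page when its last mapping goes away. */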
4233void mem_cgroup_uncharge_page(struct page *page)
4234{
4235        /* early check. */
4236        if (page_mapped(page))
4237                return;
4238        VM_BUG_ON(page->mapping && !PageAnon(page));
4239        /*
4240         * If the page is in swap cache, uncharge should be deferred
4241         * to the swap path, which also properly accounts swap usage
4242         * and handles memcg lifetime.
4243         *
4244         * Note that this check is not stable and reclaim may add the
4245         * page to swap cache at any time after this.  However, if the
4246         * page is not in swap cache by the time page->mapcount hits
4247         * 0, there won't be any page table references to the swap
4248         * slot, and reclaim will free it and not actually write the
4249         * page to disk.
4250         */
4251        if (PageSwapCache(page))
4252                return;
4253        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4254}
4255
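/* Uncharge a page that is being removed from the page cache. */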
4256void mem_cgroup_uncharge_cache_page(struct page *page)
4257{
4258        VM_BUG_ON(page_mapped(page));
4259        VM_BUG_ON(page->mapping);
4260        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4261}
4262
4263/*
4264 * Batch_start/batch_end is called from unmap_page_range/invalidate/truncate.
4265 * In those cases, pages are freed continuously and we can expect them to
4266 * be in the same memcg. Each of these callers itself limits the number of
4267 * pages freed at once, and calls uncharge_start/end() properly.
4268 * This may be called several (nested) times in one context.
4269 */
4270
4271void mem_cgroup_uncharge_start(void)
4272{
4273        current->memcg_batch.do_batch++;
4274        /* We can do nest. */
4275        if (current->memcg_batch.do_batch == 1) {
4276                current->memcg_batch.memcg = NULL;
4277                current->memcg_batch.nr_pages = 0;
4278                current->memcg_batch.memsw_nr_pages = 0;
4279        }
4280}
4281
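/* Flush the uncharge batch accumulated since mem_cgroup_uncharge_start(). */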
4282void mem_cgroup_uncharge_end(void)
4283{
4284        struct memcg_batch_info *batch = &current->memcg_batch;
4285
4286        if (!batch->do_batch)
4287                return;
4288
4289        batch->do_batch--;
4290        if (batch->do_batch) /* If stacked, do nothing. */
4291                return;
4292
4293        if (!batch->memcg)
4294                return;
4295        /*
4296         * This "batch->memcg" is valid without any css_get/put etc...
4297         * because we hide charges behind us.
4298         */
4299        if (batch->nr_pages)
4300                res_counter_uncharge(&batch->memcg->res,
4301                                     batch->nr_pages * PAGE_SIZE);
4302        if (batch->memsw_nr_pages)
4303                res_counter_uncharge(&batch->memcg->memsw,
4304                                     batch->memsw_nr_pages * PAGE_SIZE);
4305        memcg_oom_recover(batch->memcg);
4306        /* forget this pointer (for sanity check) */
4307        batch->memcg = NULL;
4308}
4309
4310#ifdef CONFIG_SWAP
4311/*
4312 * Called after __delete_from_swap_cache() to drop the "page" account.
4313 * The memcg information is recorded in the swap_cgroup of "ent".
4314 */
4315void
4316mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4317{
4318        struct mem_cgroup *memcg;
4319        int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4320
4321        if (!swapout) /* this was a swap cache but the swap is unused ! */
4322                ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4323
4324        memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4325
4326        /*
4327         * Record memcg information. If swapout && memcg != NULL,
4328         * css_get() was called in uncharge().
4329         */
4330        if (do_swap_account && swapout && memcg)
4331                swap_cgroup_record(ent, css_id(&memcg->css));
4332}
4333#endif
4334
4335#ifdef CONFIG_MEMCG_SWAP
4336/*
4337 * Called from swap_entry_free(). Remove the record in swap_cgroup and
4338 * uncharge the "memsw" account.
4339 */
4340void mem_cgroup_uncharge_swap(swp_entry_t ent)
4341{
4342        struct mem_cgroup *memcg;
4343        unsigned short id;
4344
4345        if (!do_swap_account)
4346                return;
4347
4348        id = swap_cgroup_record(ent, 0);
4349        rcu_read_lock();
4350        memcg = mem_cgroup_lookup(id);
4351        if (memcg) {
4352                /*
4353                 * We uncharge this because the swap entry is freed.
4354                 * This memcg can be an obsolete one. We avoid calling css_tryget().
4355                 */
4356                if (!mem_cgroup_is_root(memcg))
4357                        res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4358                mem_cgroup_swap_statistics(memcg, false);
4359                css_put(&memcg->css);
4360        }
4361        rcu_read_unlock();
4362}
4363
4364/**
4365 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
4366 * @entry: swap entry to be moved
4367 * @from:  mem_cgroup which the entry is moved from
4368 * @to:  mem_cgroup which the entry is moved to
4369 *
4370 * It succeeds only when the swap_cgroup's record for this entry is the same
4371 * as the mem_cgroup's id of @from.
4372 *
4373 * Returns 0 on success, -EINVAL on failure.
4374 *
4375 * The caller must have charged to @to, IOW, called res_counter_charge() about
4376 * both res and memsw, and called css_get().
4377 */
4378static int mem_cgroup_move_swap_account(swp_entry_t entry,
4379                                struct mem_cgroup *from, struct mem_cgroup *to)
4380{
4381        unsigned short old_id, new_id;
4382
4383        old_id = css_id(&from->css);
4384        new_id = css_id(&to->css);
4385
4386        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4387                mem_cgroup_swap_statistics(from, false);
4388                mem_cgroup_swap_statistics(to, true);
4389                /*
4390                 * This function is only called from task migration context now.
4391                 * It postpones res_counter and refcount handling till the end
4392                 * of task migration(mem_cgroup_clear_mc()) for performance
4393                 * improvement. But we cannot postpone css_get(to)  because if
4394                 * the process that has been moved to @to does swap-in, the
4395                 * refcount of @to might be decreased to 0.
4396                 *
4397                 * We are in attach() phase, so the cgroup is guaranteed to be
4398                 * alive, so we can just call css_get().
4399                 */
4400                css_get(&to->css);
4401                return 0;
4402        }
4403        return -EINVAL;
4404}
4405#else
4406static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4407                                struct mem_cgroup *from, struct mem_cgroup *to)
4408{
4409        return -EINVAL;
4410}
4411#endif
4412
4413/*
4414 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
4415 * page belongs to.
4416 */
4417void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4418                                  struct mem_cgroup **memcgp)
4419{
4420        struct mem_cgroup *memcg = NULL;
4421        unsigned int nr_pages = 1;
4422        struct page_cgroup *pc;
4423        enum charge_type ctype;
4424
4425        *memcgp = NULL;
4426
4427        if (mem_cgroup_disabled())
4428                return;
4429
4430        if (PageTransHuge(page))
4431                nr_pages <<= compound_order(page);
4432
4433        pc = lookup_page_cgroup(page);
4434        lock_page_cgroup(pc);
4435        if (PageCgroupUsed(pc)) {
4436                memcg = pc->mem_cgroup;
4437                css_get(&memcg->css);
4438                /*
4439                 * At migrating an anonymous page, its mapcount goes down
4440                 * to 0 and uncharge() will be called. But, even if it's fully
4441                 * unmapped, migration may fail and this page has to be
4442                 * charged again. We set MIGRATION flag here and delay uncharge
4443                 * until end_migration() is called
4444                 *
4445                 * Corner Case Thinking
4446                 * A)
4447                 * When the old page was mapped as Anon and it's unmap-and-freed
4448                 * while migration was ongoing.
4449                 * If unmap finds the old page, uncharge() of it will be delayed
4450                 * until end_migration(). If unmap finds a new page, it's
4451                 * uncharged when its mapcount goes from 1 to 0. If the unmap code
4452                 * finds a swap migration entry, the new page will not be mapped
4453                 * and end_migration() will find it (mapcount == 0).
4454                 *
4455                 * B)
4456                 * When the old page was mapped but migration fails, the kernel
4457                 * remaps it. A charge for it is kept by the MIGRATION flag even
4458                 * if the mapcount goes down to 0, so the remap succeeds
4459                 * without charging it again.
4460                 *
4461                 * C)
4462                 * The "old" page is under lock_page() until the end of
4463                 * migration, so, the old page itself will not be swapped-out.
4464                 * If the new page is swapped out before end_migraton, our
4465                 * hook to usual swap-out path will catch the event.
4466                 */
4467                if (PageAnon(page))
4468                        SetPageCgroupMigration(pc);
4469        }
4470        unlock_page_cgroup(pc);
4471        /*
4472         * If the page is not charged at this point,
4473         * we return here.
4474         */
4475        if (!memcg)
4476                return;
4477
4478        *memcgp = memcg;
4479        /*
4480         * We charge the new page before it's used/mapped. So, even if
4481         * unlock_page() is called before end_migration(), we can catch all
4482         * events on this new page. If the new page is migrated but not
4483         * remapped, its mapcount will end at 0 and we uncharge it in end_migration().
4484         */
4485        if (PageAnon(page))
4486                ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4487        else
4488                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4489        /*
4490         * The page is committed to the memcg, but it's not actually
4491         * charged to the res_counter since we plan on replacing the
4492         * old one and only one page is going to be left afterwards.
4493         */
4494        __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4495}
4496
4497/* remove redundant charge if migration failed */
4498void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4499        struct page *oldpage, struct page *newpage, bool migration_ok)
4500{
4501        struct page *used, *unused;
4502        struct page_cgroup *pc;
4503        bool anon;
4504
4505        if (!memcg)
4506                return;
4507
4508        if (!migration_ok) {
4509                used = oldpage;
4510                unused = newpage;
4511        } else {
4512                used = newpage;
4513                unused = oldpage;
4514        }
4515        anon = PageAnon(used);
4516        __mem_cgroup_uncharge_common(unused,
4517                                     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4518                                     : MEM_CGROUP_CHARGE_TYPE_CACHE,
4519                                     true);
4520        css_put(&memcg->css);
4521        /*
4522         * We disallowed uncharge of pages under migration because mapcount
4523         * of the page goes down to zero temporarily.
4524         * Clear the flag and check whether the page should be charged.
4525         */
4526        pc = lookup_page_cgroup(oldpage);
4527        lock_page_cgroup(pc);
4528        ClearPageCgroupMigration(pc);
4529        unlock_page_cgroup(pc);
4530
4531        /*
4532         * If the page is file cache, the radix-tree replacement is atomic
4533         * and we can skip this check. When it was an Anon page, its mapcount
4534         * went down to 0, but because we added the MIGRATION flag it's not
4535         * uncharged yet. There are several cases, but the page->mapcount
4536         * check and the USED bit check in mem_cgroup_uncharge_page() are
4537         * enough of a check. (see prepare_charge() also)
4538         */
4539        if (anon)
4540                mem_cgroup_uncharge_page(used);
4541}
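/*
 * Illustrative caller pattern for the two hooks above: a simplified,
 * hedged sketch of how a migration path is expected to pair them, not a
 * verbatim copy of mm/migrate.c (move_to_new_page_somehow() is a
 * hypothetical placeholder for the actual page-moving step):
 *
 *	struct mem_cgroup *memcg;
 *	int rc;
 *
 *	mem_cgroup_prepare_migration(page, newpage, &memcg);
 *	rc = move_to_new_page_somehow(newpage, page);
 *	mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
 */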
4542
4543/*
4544 * When replacing page cache, the newpage is not under any memcg but it is
4545 * on the LRU. So this function doesn't touch the res_counter but handles
4546 * the LRU correctly. Both pages are locked so we cannot race with uncharge.
4547 */
4548void mem_cgroup_replace_page_cache(struct page *oldpage,
4549                                  struct page *newpage)
4550{
4551        struct mem_cgroup *memcg = NULL;
4552        struct page_cgroup *pc;
4553        enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4554
4555        if (mem_cgroup_disabled())
4556                return;
4557
4558        pc = lookup_page_cgroup(oldpage);
4559        /* fix accounting on old pages */
4560        lock_page_cgroup(pc);
4561        if (PageCgroupUsed(pc)) {
4562                memcg = pc->mem_cgroup;
4563                mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4564                ClearPageCgroupUsed(pc);
4565        }
4566        unlock_page_cgroup(pc);
4567
4568        /*
4569         * When called from shmem_replace_page(), in some cases the
4570         * oldpage has already been charged, and in some cases not.
4571         */
4572        if (!memcg)
4573                return;
4574        /*
4575         * Even if newpage->mapping was NULL before starting replacement,
4576         * the newpage may already be on the LRU (or a pagevec for the LRU).
4577         * We lock the LRU while we overwrite pc->mem_cgroup.
4578         */
4579        __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4580}
4581
4582#ifdef CONFIG_DEBUG_VM
4583static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4584{
4585        struct page_cgroup *pc;
4586
4587        pc = lookup_page_cgroup(page);
4588        /*
4589         * Can be NULL while feeding pages into the page allocator for
4590         * the first time, i.e. during boot or memory hotplug;
4591         * or when mem_cgroup_disabled().
4592         */
4593        if (likely(pc) && PageCgroupUsed(pc))
4594                return pc;
4595        return NULL;
4596}
4597
4598bool mem_cgroup_bad_page_check(struct page *page)
4599{
4600        if (mem_cgroup_disabled())
4601                return false;
4602
4603        return lookup_page_cgroup_used(page) != NULL;
4604}
4605
4606void mem_cgroup_print_bad_page(struct page *page)
4607{
4608        struct page_cgroup *pc;
4609
4610        pc = lookup_page_cgroup_used(page);
4611        if (pc) {
4612                pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4613                         pc, pc->flags, pc->mem_cgroup);
4614        }
4615}
4616#endif
4617
4618static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4619                                unsigned long long val)
4620{
4621        int retry_count;
4622        u64 memswlimit, memlimit;
4623        int ret = 0;
4624        int children = mem_cgroup_count_children(memcg);
4625        u64 curusage, oldusage;
4626        int enlarge;
4627
4628        /*
4629         * To keep hierarchical reclaim simple, how long we should retry
4630         * depends on the caller. We set our retry count to be a function
4631         * of the number of children we should visit in this loop.
4632         */
4633        retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4634
4635        oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4636
4637        enlarge = 0;
4638        while (retry_count) {
4639                if (signal_pending(current)) {
4640                        ret = -EINTR;
4641                        break;
4642                }
4643                /*
4644                 * Rather than hiding this in some helper function, do it
4645                 * open-coded so it is clear what really happens:
4646                 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4647                 */
4648                mutex_lock(&set_limit_mutex);
4649                memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4650                if (memswlimit < val) {
4651                        ret = -EINVAL;
4652                        mutex_unlock(&set_limit_mutex);
4653                        break;
4654                }
4655
4656                memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4657                if (memlimit < val)
4658                        enlarge = 1;
4659
4660                ret = res_counter_set_limit(&memcg->res, val);
4661                if (!ret) {
4662                        if (memswlimit == val)
4663                                memcg->memsw_is_minimum = true;
4664                        else
4665                                memcg->memsw_is_minimum = false;
4666                }
4667                mutex_unlock(&set_limit_mutex);
4668
4669                if (!ret)
4670                        break;
4671
4672                mem_cgroup_reclaim(memcg, GFP_KERNEL,
4673                                   MEM_CGROUP_RECLAIM_SHRINK);
4674                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4675                /* Has usage been reduced? */
4676                if (curusage >= oldusage)
4677                        retry_count--;
4678                else
4679                        oldusage = curusage;
4680        }
4681        if (!ret && enlarge)
4682                memcg_oom_recover(memcg);
4683
4684        return ret;
4685}
4686
4687static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4688                                        unsigned long long val)
4689{
4690        int retry_count;
4691        u64 memlimit, memswlimit, oldusage, curusage;
4692        int children = mem_cgroup_count_children(memcg);
4693        int ret = -EBUSY;
4694        int enlarge = 0;
4695
4696        /* see mem_cgroup_resize_limit() */
4697        retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4698        oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4699        while (retry_count) {
4700                if (signal_pending(current)) {
4701                        ret = -EINTR;
4702                        break;
4703                }
4704                /*
4705                 * Rather than hiding this in some helper function, do it
4706                 * open-coded so it is clear what really happens:
4707                 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4708                 */
4709                mutex_lock(&set_limit_mutex);
4710                memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4711                if (memlimit > val) {
4712                        ret = -EINVAL;
4713                        mutex_unlock(&set_limit_mutex);
4714                        break;
4715                }
4716                memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4717                if (memswlimit < val)
4718                        enlarge = 1;
4719                ret = res_counter_set_limit(&memcg->memsw, val);
4720                if (!ret) {
4721                        if (memlimit == val)
4722                                memcg->memsw_is_minimum = true;
4723                        else
4724                                memcg->memsw_is_minimum = false;
4725                }
4726                mutex_unlock(&set_limit_mutex);
4727
4728                if (!ret)
4729                        break;
4730
4731                mem_cgroup_reclaim(memcg, GFP_KERNEL,
4732                                   MEM_CGROUP_RECLAIM_NOSWAP |
4733                                   MEM_CGROUP_RECLAIM_SHRINK);
4734                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4735                /* Has usage been reduced? */
4736                if (curusage >= oldusage)
4737                        retry_count--;
4738                else
4739                        oldusage = curusage;
4740        }
4741        if (!ret && enlarge)
4742                memcg_oom_recover(memcg);
4743        return ret;
4744}
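/*
 * Userspace view of the ordering invariant enforced by the two resize
 * helpers above. A hedged illustration, assuming a cgroup v1 memcg
 * hierarchy mounted at /sys/fs/cgroup/memory, swap accounting enabled,
 * and a hypothetical group "A":
 *
 *	echo 256M > /sys/fs/cgroup/memory/A/memory.limit_in_bytes        # ok
 *	echo 512M > /sys/fs/cgroup/memory/A/memory.memsw.limit_in_bytes  # ok, memsw >= mem
 *	echo 1G   > /sys/fs/cgroup/memory/A/memory.limit_in_bytes        # -EINVAL:
 *		# memory.limit_in_bytes may not exceed memory.memsw.limit_in_bytes
 */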
4745
4746unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4747                                            gfp_t gfp_mask,
4748                                            unsigned long *total_scanned)
4749{
4750        unsigned long nr_reclaimed = 0;
4751        struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4752        unsigned long reclaimed;
4753        int loop = 0;
4754        struct mem_cgroup_tree_per_zone *mctz;
4755        unsigned long long excess;
4756        unsigned long nr_scanned;
4757
4758        if (order > 0)
4759                return 0;
4760
4761        mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4762        /*
4763         * This loop can run for a while, especially if mem_cgroups
4764         * continuously keep exceeding their soft limit and putting the
4765         * system under pressure.
4766         */
4767        do {
4768                if (next_mz)
4769                        mz = next_mz;
4770                else
4771                        mz = mem_cgroup_largest_soft_limit_node(mctz);
4772                if (!mz)
4773                        break;
4774
4775                nr_scanned = 0;
4776                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4777                                                    gfp_mask, &nr_scanned);
4778                nr_reclaimed += reclaimed;
4779                *total_scanned += nr_scanned;
4780                spin_lock(&mctz->lock);
4781
4782                /*
4783                 * If we failed to reclaim anything from this memory cgroup
4784                 * it is time to move on to the next cgroup
4785                 */
4786                next_mz = NULL;
4787                if (!reclaimed) {
4788                        do {
4789                                /*
4790                                 * Loop until we find yet another one.
4791                                 *
4792                                 * By the time we get the soft_limit lock
4793                                 * again, someone might have aded the
4794                                 * again, someone might have added the
4795                                 * group back on the RB tree. Iterate to
4796                                 * make sure we get a different memcg.
4797                                 * mem_cgroup_largest_soft_limit_node returns
4798                                 * NULL if no other cgroup is present on
4799                                 * the tree.
4800                                next_mz =
4801                                __mem_cgroup_largest_soft_limit_node(mctz);
4802                                if (next_mz == mz)
4803                                        css_put(&next_mz->memcg->css);
4804                                else /* next_mz == NULL or other memcg */
4805                                        break;
4806                        } while (1);
4807                }
4808                __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4809                excess = res_counter_soft_limit_excess(&mz->memcg->res);
4810                /*
4811                 * One school of thought says that we should not add
4812                 * back the node to the tree if reclaim returns 0.
4813                 * But our reclaim could return 0 simply because, due
4814                 * to priority, we are exposing a smaller subset of
4815                 * memory to reclaim from. Consider this as a longer
4816                 * term TODO.
4817                 */
4818                /* If excess == 0, no tree ops */
4819                __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4820                spin_unlock(&mctz->lock);
4821                css_put(&mz->memcg->css);
4822                loop++;
4823                /*
4824                 * Could not reclaim anything and there are no more
4825                 * mem cgroups to try or we seem to be looping without
4826                 * reclaiming anything.
4827                 */
4828                if (!nr_reclaimed &&
4829                        (next_mz == NULL ||
4830                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4831                        break;
4832        } while (!nr_reclaimed);
4833        if (next_mz)
4834                css_put(&next_mz->memcg->css);
4835        return nr_reclaimed;
4836}
4837
4838/**
4839 * mem_cgroup_force_empty_list - clears LRU of a group
4840 * @memcg: group to clear
4841 * @node: NUMA node
4842 * @zid: zone id
4843 * @lru: lru to clear
4844 *
4845 * Traverse the specified page_cgroup list and try to drop all its pages.  This
4846 * doesn't reclaim the pages themselves - they are moved to the parent (or root)
4847 * group.
4848 */
4849static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4850                                int node, int zid, enum lru_list lru)
4851{
4852        struct lruvec *lruvec;
4853        unsigned long flags;
4854        struct list_head *list;
4855        struct page *busy;
4856        struct zone *zone;
4857
4858        zone = &NODE_DATA(node)->node_zones[zid];
4859        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4860        list = &lruvec->lists[lru];
4861
4862        busy = NULL;
4863        do {
4864                struct page_cgroup *pc;
4865                struct page *page;
4866
4867                spin_lock_irqsave(&zone->lru_lock, flags);
4868                if (list_empty(list)) {
4869                        spin_unlock_irqrestore(&zone->lru_lock, flags);
4870                        break;
4871                }
4872                page = list_entry(list->prev, struct page, lru);
4873                if (busy == page) {
4874                        list_move(&page->lru, list);
4875                        busy = NULL;
4876                        spin_unlock_irqrestore(&zone->lru_lock, flags);
4877                        continue;
4878                }
4879                spin_unlock_irqrestore(&zone->lru_lock, flags);
4880
4881                pc = lookup_page_cgroup(page);
4882
4883                if (mem_cgroup_move_parent(page, pc, memcg)) {
4884                        /* found lock contention or "pc" is obsolete. */
4885                        busy = page;
4886                        cond_resched();
4887                } else
4888                        busy = NULL;
4889        } while (!list_empty(list));
4890}
4891
4892/*
4893 * Make the mem_cgroup's charge 0 when there are no tasks, by moving
4894 * all the charges and pages to the parent.
4895 * This makes it possible to delete this mem_cgroup.
4896 *
4897 * Caller is responsible for holding css reference on the memcg.
4898 */
4899static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4900{
4901        int node, zid;
4902        u64 usage;
4903
4904        do {
4905                /* This is for making sure all *used* pages are on an LRU. */
4906                lru_add_drain_all();
4907                drain_all_stock_sync(memcg);
4908                mem_cgroup_start_move(memcg);
4909                for_each_node_state(node, N_MEMORY) {
4910                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4911                                enum lru_list lru;
4912                                for_each_lru(lru) {
4913                                        mem_cgroup_force_empty_list(memcg,
4914                                                        node, zid, lru);
4915                                }
4916                        }
4917                }
4918                mem_cgroup_end_move(memcg);
4919                memcg_oom_recover(memcg);
4920                cond_resched();
4921
4922                /*
4923                 * Kernel memory may not necessarily be attributable to a
4924                 * specific process, so such pages are not migrated and we can't
4925                 * expect their value to drop to 0 here.
4926                 * Having res filled up with kmem only is enough.
4927                 *
4928                 * This is a safety check because mem_cgroup_force_empty_list
4929                 * could have raced with mem_cgroup_replace_page_cache callers
4930                 * so the lru seemed empty but the page could have been added
4931                 * right after the check. RES_USAGE should be safe as we always
4932                 * charge before adding to the LRU.
4933                 */
4934                usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4935                        res_counter_read_u64(&memcg->kmem, RES_USAGE);
4936        } while (usage > 0);
4937}
4938
4939/*
4940 * This mainly exists for checks done while setting use_hierarchy.
4941 * Since that is the very setting we are changing, the current hierarchy value
4942 * is meaningless.
4943 */
4944static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4945{
4946        struct cgroup *pos;
4947
4948        /* bounce at first found */
4949        cgroup_for_each_child(pos, memcg->css.cgroup)
4950                return true;
4951        return false;
4952}
4953
4954/*
4955 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4956 * to be already dead (as in mem_cgroup_force_empty, for instance).  This differs
4957 * from mem_cgroup_count_children(), in the sense that we don't really care how
4958 * many children we have; we only need to know if we have any.  It also counts
4959 * any memcg without hierarchy as infertile.
4960 */
4961static inline bool memcg_has_children(struct mem_cgroup *memcg)
4962{
4963        return memcg->use_hierarchy && __memcg_has_children(memcg);
4964}
4965
4966/*
4967 * Reclaims as many pages from the given memcg as possible and moves
4968 * the rest to the parent.
4969 *
4970 * Caller is responsible for holding css reference for memcg.
4971 */
4972static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4973{
4974        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4975        struct cgroup *cgrp = memcg->css.cgroup;
4976
4977        /* returns EBUSY if there is a task or if we come here twice. */
4978        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4979                return -EBUSY;
4980
4981        /* we call try-to-free pages to make this cgroup empty */
4982        lru_add_drain_all();
4983        /* try to free all pages in this cgroup */
4984        while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4985                int progress;
4986
4987                if (signal_pending(current))
4988                        return -EINTR;
4989
4990                progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4991                                                false);
4992                if (!progress) {
4993                        nr_retries--;
4994                        /* maybe some writeback is necessary */
4995                        congestion_wait(BLK_RW_ASYNC, HZ/10);
4996                }
4997
4998        }
4999        lru_add_drain();
5000        mem_cgroup_reparent_charges(memcg);
5001
5002        return 0;
5003}
5004
5005static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
5006{
5007        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5008        int ret;
5009
5010        if (mem_cgroup_is_root(memcg))
5011                return -EINVAL;
5012        css_get(&memcg->css);
5013        ret = mem_cgroup_force_empty(memcg);
5014        css_put(&memcg->css);
5015
5016        return ret;
5017}
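/*
 * This handler backs the memory.force_empty control file. A hedged usage
 * sketch (the group name "A" is hypothetical):
 *
 *	echo 0 > /sys/fs/cgroup/memory/A/memory.force_empty
 *
 * which reclaims what it can and reparents the remaining charges; it
 * fails with -EBUSY while tasks are still attached or children exist,
 * and with -EINVAL on the root cgroup, as the code above shows.
 */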
5018
5019
5020static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
5021{
5022        return mem_cgroup_from_cont(cont)->use_hierarchy;
5023}
5024
5025static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
5026                                        u64 val)
5027{
5028        int retval = 0;
5029        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5030        struct cgroup *parent = cont->parent;
5031        struct mem_cgroup *parent_memcg = NULL;
5032
5033        if (parent)
5034                parent_memcg = mem_cgroup_from_cont(parent);
5035
5036        mutex_lock(&memcg_create_mutex);
5037
5038        if (memcg->use_hierarchy == val)
5039                goto out;
5040
5041        /*
5042         * If parent's use_hierarchy is set, we can't make any modifications
5043         * in the child subtrees. If it is unset, then the change can
5044         * occur, provided the current cgroup has no children.
5045         *
5046         * For the root cgroup, parent_memcg is NULL; we allow the value to be
5047         * set if there are no children.
5048         */
5049        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
5050                                (val == 1 || val == 0)) {
5051                if (!__memcg_has_children(memcg))
5052                        memcg->use_hierarchy = val;
5053                else
5054                        retval = -EBUSY;
5055        } else
5056                retval = -EINVAL;
5057
5058out:
5059        mutex_unlock(&memcg_create_mutex);
5060
5061        return retval;
5062}
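/*
 * Hedged usage sketch for memory.use_hierarchy (hypothetical group "A"):
 *
 *	echo 1 > /sys/fs/cgroup/memory/A/memory.use_hierarchy
 *
 * The write only succeeds while "A" has no children and its parent does
 * not already enforce hierarchy; otherwise -EBUSY or -EINVAL is returned,
 * matching the checks in mem_cgroup_hierarchy_write() above.
 */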
5063
5064
5065static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
5066                                               enum mem_cgroup_stat_index idx)
5067{
5068        struct mem_cgroup *iter;
5069        long val = 0;
5070
5071        /* Per-cpu values can be negative, use a signed accumulator */
5072        for_each_mem_cgroup_tree(iter, memcg)
5073                val += mem_cgroup_read_stat(iter, idx);
5074
5075        if (val < 0) /* race ? */
5076                val = 0;
5077        return val;
5078}
5079
5080static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5081{
5082        u64 val;
5083
5084        if (!mem_cgroup_is_root(memcg)) {
5085                if (!swap)
5086                        return res_counter_read_u64(&memcg->res, RES_USAGE);
5087                else
5088                        return res_counter_read_u64(&memcg->memsw, RES_USAGE);
5089        }
5090
5091        /*
5092         * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
5093         * as well as in MEM_CGROUP_STAT_RSS_HUGE.
5094         */
5095        val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
5096        val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
5097
5098        if (swap)
5099                val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
5100
5101        return val << PAGE_SHIFT;
5102}
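/*
 * Worked example for the root-memcg path above (illustrative numbers
 * only): with a recursive cache count of 300 pages and an rss count of
 * 700 pages, usage is reported as (300 + 700) << PAGE_SHIFT bytes,
 * i.e. 4096000 bytes with 4K pages; the swap statistic is added only
 * when swap-inclusive usage is requested.
 */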
5103
5104static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
5105                               struct file *file, char __user *buf,
5106                               size_t nbytes, loff_t *ppos)
5107{
5108        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5109        char str[64];
5110        u64 val;
5111        int name, len;
5112        enum res_type type;
5113
5114        type = MEMFILE_TYPE(cft->private);
5115        name = MEMFILE_ATTR(cft->private);
5116
5117        switch (type) {
5118        case _MEM:
5119                if (name == RES_USAGE)
5120                        val = mem_cgroup_usage(memcg, false);
5121                else
5122                        val = res_counter_read_u64(&memcg->res, name);
5123                break;
5124        case _MEMSWAP:
5125                if (name == RES_USAGE)
5126                        val = mem_cgroup_usage(memcg, true);
5127                else
5128                        val = res_counter_read_u64(&memcg->memsw, name);
5129                break;
5130        case _KMEM:
5131                val = res_counter_read_u64(&memcg->kmem, name);
5132                break;
5133        default:
5134                BUG();
5135        }
5136
5137        len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
5138        return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5139}
5140
5141static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
5142{
5143        int ret = -EINVAL;
5144#ifdef CONFIG_MEMCG_KMEM
5145        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5146        /*
5147         * For simplicity, we won't allow this to be disabled.  It also can't
5148         * be changed if the cgroup has children already, or if tasks had
5149         * already joined.
5150         *
5151         * If tasks join before we set the limit, a person looking at
5152         * kmem.usage_in_bytes will have no way to determine when it took
5153         * place, which makes the value quite meaningless.
5154         *
5155         * After it first became limited, changes in the value of the limit are
5156         * of course permitted.
5157         */
5158        mutex_lock(&memcg_create_mutex);
5159        mutex_lock(&set_limit_mutex);
5160        if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
5161                if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
5162                        ret = -EBUSY;
5163                        goto out;
5164                }
5165                ret = res_counter_set_limit(&memcg->kmem, val);
5166                VM_BUG_ON(ret);
5167
5168                ret = memcg_update_cache_sizes(memcg);
5169                if (ret) {
5170                        res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
5171                        goto out;
5172                }
5173                static_key_slow_inc(&memcg_kmem_enabled_key);
5174                /*
5175                 * setting the active bit after the inc will guarantee no one
5176                 * starts accounting before all call sites are patched
5177                 */
5178                memcg_kmem_set_active(memcg);
5179        } else
5180                ret = res_counter_set_limit(&memcg->kmem, val);
5181out:
5182        mutex_unlock(&set_limit_mutex);
5183        mutex_unlock(&memcg_create_mutex);
5184#endif
5185        return ret;
5186}
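/*
 * Hedged usage sketch for the kmem limit (hypothetical group "A"):
 *
 *	echo 500M > /sys/fs/cgroup/memory/A/memory.kmem.limit_in_bytes
 *
 * Per the comment above, the first (activating) write must happen before
 * any task joins "A" and before it has children, otherwise -EBUSY is
 * returned; later writes merely adjust the existing limit.
 */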
5187
5188#ifdef CONFIG_MEMCG_KMEM
5189static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5190{
5191        int ret = 0;
5192        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5193        if (!parent)
5194                goto out;
5195
5196        memcg->kmem_account_flags = parent->kmem_account_flags;
5197        /*
5198         * When that happens, we need to disable the static branch only on
5199         * those memcgs that enabled it. To achieve this, we would be forced
5200         * to complicate the code by keeping track of which memcgs were the
5201         * ones that actually enabled limits, and which ones got it from
5202         * their parents.
5203         *
5204         * It is a lot simpler just to do static_key_slow_inc() on every child
5205         * that is accounted.
5206         */
5207        if (!memcg_kmem_is_active(memcg))
5208                goto out;
5209
5210        /*
5211         * __mem_cgroup_free() will issue static_key_slow_dec() because this
5212         * memcg is active already. If the later initialization fails then the
5213         * cgroup core triggers the cleanup so we do not have to do it here.
5214         */
5215        static_key_slow_inc(&memcg_kmem_enabled_key);
5216
5217        mutex_lock(&set_limit_mutex);
5218        memcg_stop_kmem_account();
5219        ret = memcg_update_cache_sizes(memcg);
5220        memcg_resume_kmem_account();
5221        mutex_unlock(&set_limit_mutex);
5222out:
5223        return ret;
5224}
5225#endif /* CONFIG_MEMCG_KMEM */
5226
5227/*
5228 * The user of this function is...
5229 * RES_LIMIT.
5230 */
5231static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5232                            const char *buffer)
5233{
5234        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5235        enum res_type type;
5236        int name;
5237        unsigned long long val;
5238        int ret;
5239
5240        type = MEMFILE_TYPE(cft->private);
5241        name = MEMFILE_ATTR(cft->private);
5242
5243        switch (name) {
5244        case RES_LIMIT:
5245                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
5246                        ret = -EINVAL;
5247                        break;
5248                }
5249                /* This function does all the necessary parsing; reuse it. */
5250                ret = res_counter_memparse_write_strategy(buffer, &val);
5251                if (ret)
5252                        break;
5253                if (type == _MEM)
5254                        ret = mem_cgroup_resize_limit(memcg, val);
5255                else if (type == _MEMSWAP)
5256                        ret = mem_cgroup_resize_memsw_limit(memcg, val);
5257                else if (type == _KMEM)
5258                        ret = memcg_update_kmem_limit(cont, val);
5259                else
5260                        return -EINVAL;
5261                break;
5262        case RES_SOFT_LIMIT:
5263                ret = res_counter_memparse_write_strategy(buffer, &val);
5264                if (ret)
5265                        break;
5266                /*
5267                 * For memsw, soft limits are hard to implement in terms
5268                 * of semantics; for now, we support soft limits only for
5269                 * memory control without swap.
5270                 */
5271                if (type == _MEM)
5272                        ret = res_counter_set_soft_limit(&memcg->res, val);
5273                else
5274                        ret = -EINVAL;
5275                break;
5276        default:
5277                ret = -EINVAL; /* should be BUG() ? */
5278                break;
5279        }
5280        return ret;
5281}
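/*
 * Hedged usage sketch for the soft-limit branch above (hypothetical
 * group "A"):
 *
 *	echo 256M > /sys/fs/cgroup/memory/A/memory.soft_limit_in_bytes
 *
 * Soft limits are only supported for the plain memory counter (type
 * _MEM); any other type is rejected with -EINVAL, as the switch above
 * shows.
 */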
5282
5283static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5284                unsigned long long *mem_limit, unsigned long long *memsw_limit)
5285{
5286        struct cgroup *cgroup;
5287        unsigned long long min_limit, min_memsw_limit, tmp;
5288
5289        min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5290        min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5291        cgroup = memcg->css.cgroup;
5292        if (!memcg->use_hierarchy)
5293                goto out;
5294
5295        while (cgroup->parent) {
5296                cgroup = cgroup->parent;
5297                memcg = mem_cgroup_from_cont(cgroup);
5298                if (!memcg->use_hierarchy)
5299                        break;
5300                tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5301                min_limit = min(min_limit, tmp);
5302                tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5303                min_memsw_limit = min(min_memsw_limit, tmp);
5304        }
5305out:
5306        *mem_limit = min_limit;
5307        *memsw_limit = min_memsw_limit;
5308}
5309
5310static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5311{
5312        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5313        int name;
5314        enum res_type type;
5315
5316        type = MEMFILE_TYPE(event);
5317        name = MEMFILE_ATTR(event);
5318
5319        switch (name) {
5320        case RES_MAX_USAGE:
5321                if (type == _MEM)
5322                        res_counter_reset_max(&memcg->res);
5323                else if (type == _MEMSWAP)
5324                        res_counter_reset_max(&memcg->memsw);
5325                else if (type == _KMEM)
5326                        res_counter_reset_max(&memcg->kmem);
5327                else
5328                        return -EINVAL;
5329                break;
5330        case RES_FAILCNT:
5331                if (type == _MEM)
5332                        res_counter_reset_failcnt(&memcg->res);
5333                else if (type == _MEMSWAP)
5334                        res_counter_reset_failcnt(&memcg->memsw);
5335                else if (type == _KMEM)
5336                        res_counter_reset_failcnt(&memcg->kmem);
5337                else
5338                        return -EINVAL;
5339                break;
5340        }
5341
5342        return 0;
5343}
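/*
 * Hedged usage sketch for the reset handler above (hypothetical group
 * "A"): writing to the watermark/failcnt files clears them, e.g.
 *
 *	echo 0 > /sys/fs/cgroup/memory/A/memory.max_usage_in_bytes
 *	echo 0 > /sys/fs/cgroup/memory/A/memory.failcnt
 */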
5344
5345static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
5346                                        struct cftype *cft)
5347{
5348        return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
5349}
5350
5351#ifdef CONFIG_MMU
5352static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5353                                        struct cftype *cft, u64 val)
5354{
5355        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5356
5357        if (val >= (1 << NR_MOVE_TYPE))
5358                return -EINVAL;
5359
5360        /*
5361         * No kind of locking is needed in here, because ->can_attach() will
5362         * check this value once at the beginning of the process, and then carry
5363         * on with stale data. This means that changes to this value will only
5364         * affect task migrations starting after the change.
5365         */
5366        memcg->move_charge_at_immigrate = val;
5367        return 0;
5368}
5369#else
5370static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5371                                        struct cftype *cft, u64 val)
5372{
5373        return -ENOSYS;
5374}
5375#endif
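/*
 * Hedged usage sketch for memory.move_charge_at_immigrate (hypothetical
 * group "A"): the value is a bitmask of NR_MOVE_TYPE bits where, per the
 * memcg documentation, bit 0 selects anonymous pages and bit 1 selects
 * file pages, so
 *
 *	echo 3 > /sys/fs/cgroup/memory/A/memory.move_charge_at_immigrate
 *
 * moves both kinds of charge along with a migrating task.
 */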
5376
5377#ifdef CONFIG_NUMA
5378static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5379                                      struct seq_file *m)
5380{
5381        int nid;
5382        unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
5383        unsigned long node_nr;
5384        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5385
5386        total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
5387        seq_printf(m, "total=%lu", total_nr);
5388        for_each_node_state(nid, N_MEMORY) {
5389                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
5390                seq_printf(m, " N%d=%lu", nid, node_nr);
5391        }
5392        seq_putc(m, '\n');
5393
5394        file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
5395        seq_printf(m, "file=%lu", file_nr);
5396        for_each_node_state(nid, N_MEMORY) {
5397                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5398                                LRU_ALL_FILE);
5399                seq_printf(m, " N%d=%lu", nid, node_nr);
5400        }
5401        seq_putc(m, '\n');
5402
5403        anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
5404        seq_printf(m, "anon=%lu", anon_nr);
5405        for_each_node_state(nid, N_MEMORY) {
5406                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5407                                LRU_ALL_ANON);
5408                seq_printf(m, " N%d=%lu", nid, node_nr);
5409        }
5410        seq_putc(m, '\n');
5411
5412        unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
5413        seq_printf(m, "unevictable=%lu", unevictable_nr);
5414        for_each_node_state(nid, N_MEMORY) {
5415                node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5416                                BIT(LRU_UNEVICTABLE));
5417                seq_printf(m, " N%d=%lu", nid, node_nr);
5418        }
5419        seq_putc(m, '\n');
5420        return 0;
5421}
5422#endif /* CONFIG_NUMA */
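/*
 * The per-node output produced by memcg_numa_stat_show() above looks
 * roughly like the following (illustrative values, two-node machine):
 *
 *	total=65536 N0=40000 N1=25536
 *	file=30000 N0=20000 N1=10000
 *	anon=35000 N0=19800 N1=15200
 *	unevictable=536 N0=200 N1=336
 */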
5423
5424static inline void mem_cgroup_lru_names_not_uptodate(void)
5425{
5426        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5427}
5428
5429static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
5430                                 struct seq_file *m)
5431{
5432        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5433        struct mem_cgroup *mi;
5434        unsigned int i;
5435
5436        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5437                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5438                        continue;
5439                seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5440                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5441        }
5442
5443        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5444                seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5445                           mem_cgroup_read_events(memcg, i));
5446
5447        for (i = 0; i < NR_LRU_LISTS; i++)
5448                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5449                           mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5450
5451        /* Hierarchical information */
5452        {
5453                unsigned long long limit, memsw_limit;
5454                memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5455                seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5456                if (do_swap_account)
5457                        seq_printf(m, "hierarchical_memsw_limit %llu\n",
5458                                   memsw_limit);
5459        }
5460
5461        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5462                long long val = 0;
5463
5464                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5465                        continue;
5466                for_each_mem_cgroup_tree(mi, memcg)
5467                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5468                seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5469        }
5470
5471        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5472                unsigned long long val = 0;
5473
5474                for_each_mem_cgroup_tree(mi, memcg)
5475                        val += mem_cgroup_read_events(mi, i);
5476                seq_printf(m, "total_%s %llu\n",
5477                           mem_cgroup_events_names[i], val);
5478        }
5479
5480        for (i = 0; i < NR_LRU_LISTS; i++) {
5481                unsigned long long val = 0;
5482
5483                for_each_mem_cgroup_tree(mi, memcg)
5484                        val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5485                seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5486        }
5487
5488#ifdef CONFIG_DEBUG_VM
5489        {
5490                int nid, zid;
5491                struct mem_cgroup_per_zone *mz;
5492                struct zone_reclaim_stat *rstat;
5493                unsigned long recent_rotated[2] = {0, 0};
5494                unsigned long recent_scanned[2] = {0, 0};
5495
5496                for_each_online_node(nid)
5497                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5498                                mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5499                                rstat = &mz->lruvec.reclaim_stat;
5500
5501                                recent_rotated[0] += rstat->recent_rotated[0];
5502                                recent_rotated[1] += rstat->recent_rotated[1];
5503                                recent_scanned[0] += rstat->recent_scanned[0];
5504                                recent_scanned[1] += rstat->recent_scanned[1];
5505                        }
5506                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5507                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5508                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5509                seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5510        }
5511#endif
5512
5513        return 0;
5514}
5515
5516static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
5517{
5518        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5519
5520        return mem_cgroup_swappiness(memcg);
5521}
5522
5523static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5524                                       u64 val)
5525{
5526        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5527        struct mem_cgroup *parent;
5528
5529        if (val > 100)
5530                return -EINVAL;
5531
5532        if (cgrp->parent == NULL)
5533                return -EINVAL;
5534
5535        parent = mem_cgroup_from_cont(cgrp->parent);
5536
5537        mutex_lock(&memcg_create_mutex);
5538
5539        /* If under hierarchy, only empty-root can set this value */
5540        if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5541                mutex_unlock(&memcg_create_mutex);
5542                return -EINVAL;
5543        }
5544
5545        memcg->swappiness = val;
5546
5547        mutex_unlock(&memcg_create_mutex);
5548
5549        return 0;
5550}
5551
5552static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5553{
5554        struct mem_cgroup_threshold_ary *t;
5555        u64 usage;
5556        int i;
5557
5558        rcu_read_lock();
5559        if (!swap)
5560                t = rcu_dereference(memcg->thresholds.primary);
5561        else
5562                t = rcu_dereference(memcg->memsw_thresholds.primary);
5563
5564        if (!t)
5565                goto unlock;
5566
5567        usage = mem_cgroup_usage(memcg, swap);
5568
5569        /*
5570         * current_threshold points to the threshold just below or equal to
5571         * usage. If that's not the case, a threshold was crossed after the last
5572         * call of __mem_cgroup_threshold().
5573         */
5574        i = t->current_threshold;
5575
5576        /*
5577         * Iterate backward over array of thresholds starting from
5578         * current_threshold and check if a threshold is crossed.
5579         * If none of thresholds below usage is crossed, we read
5580         * only one element of the array here.
5581         * If none of the thresholds below usage is crossed, we read
5582        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5583                eventfd_signal(t->entries[i].eventfd, 1);
5584
5585        /* i = current_threshold + 1 */
5586        i++;
5587
5588        /*
5589         * Iterate forward over array of thresholds starting from
5590         * current_threshold+1 and check if a threshold is crossed.
5591         * If none of thresholds above usage is crossed, we read
5592         * only one element of the array here.
5593         * If none of the thresholds above usage is crossed, we read
5594        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5595                eventfd_signal(t->entries[i].eventfd, 1);
5596
5597        /* Update current_threshold */
5598        t->current_threshold = i - 1;
5599unlock:
5600        rcu_read_unlock();
5601}
5602
5603static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5604{
5605        while (memcg) {
5606                __mem_cgroup_threshold(memcg, false);
5607                if (do_swap_account)
5608                        __mem_cgroup_threshold(memcg, true);
5609
5610                memcg = parent_mem_cgroup(memcg);
5611        }
5612}
5613
5614static int compare_thresholds(const void *a, const void *b)
5615{
5616        const struct mem_cgroup_threshold *_a = a;
5617        const struct mem_cgroup_threshold *_b = b;
5618
5619        return _a->threshold - _b->threshold;
5620}
5621
5622static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5623{
5624        struct mem_cgroup_eventfd_list *ev;
5625
5626        list_for_each_entry(ev, &memcg->oom_notify, list)
5627                eventfd_signal(ev->eventfd, 1);
5628        return 0;
5629}
5630
5631static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5632{
5633        struct mem_cgroup *iter;
5634
5635        for_each_mem_cgroup_tree(iter, memcg)
5636                mem_cgroup_oom_notify_cb(iter);
5637}
5638
5639static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
5640        struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5641{
5642        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5643        struct mem_cgroup_thresholds *thresholds;
5644        struct mem_cgroup_threshold_ary *new;
5645        enum res_type type = MEMFILE_TYPE(cft->private);
5646        u64 threshold, usage;
5647        int i, size, ret;
5648
5649        ret = res_counter_memparse_write_strategy(args, &threshold);
5650        if (ret)
5651                return ret;
5652
5653        mutex_lock(&memcg->thresholds_lock);
5654
5655        if (type == _MEM)
5656                thresholds = &memcg->thresholds;
5657        else if (type == _MEMSWAP)
5658                thresholds = &memcg->memsw_thresholds;
5659        else
5660                BUG();
5661
5662        usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5663
5664        /* Check if a threshold crossed before adding a new one */
5665        if (thresholds->primary)
5666                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5667
5668        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5669
5670        /* Allocate memory for new array of thresholds */
5671        new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5672                        GFP_KERNEL);
5673        if (!new) {
5674                ret = -ENOMEM;
5675                goto unlock;
5676        }
5677        new->size = size;
5678
5679        /* Copy thresholds (if any) to new array */
5680        if (thresholds->primary) {
5681                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5682                                sizeof(struct mem_cgroup_threshold));
5683        }
5684
5685        /* Add new threshold */
5686        new->entries[size - 1].eventfd = eventfd;
5687        new->entries[size - 1].threshold = threshold;
5688
5689        /* Sort thresholds. Registering a new threshold isn't time-critical */
5690        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5691                        compare_thresholds, NULL);
5692
5693        /* Find current threshold */
5694        new->current_threshold = -1;
5695        for (i = 0; i < size; i++) {
5696                if (new->entries[i].threshold <= usage) {
5697                        /*
5698                         * new->current_threshold will not be used until
5699                         * rcu_assign_pointer(), so it's safe to increment
5700                         * it here.
5701                         */
5702                        ++new->current_threshold;
5703                } else
5704                        break;
5705        }
5706
5707        /* Free old spare buffer and save old primary buffer as spare */
5708        kfree(thresholds->spare);
5709        thresholds->spare = thresholds->primary;
5710
5711        rcu_assign_pointer(thresholds->primary, new);
5712
5713        /* To be sure that nobody uses thresholds */
5714        synchronize_rcu();
5715
5716unlock:
5717        mutex_unlock(&memcg->thresholds_lock);
5718
5719        return ret;
5720}
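/*
 * Hedged userspace sketch of how such a threshold is registered (the
 * documented cgroup v1 flow: create an eventfd, open the usage file, then
 * write "<event_fd> <usage_fd> <threshold>" to cgroup.event_control; the
 * paths and the 50M threshold below are illustrative, error handling and
 * #includes omitted):
 *
 *	int efd = eventfd(0, 0);
 *	int ufd = open("/sys/fs/cgroup/memory/A/memory.usage_in_bytes", O_RDONLY);
 *	int cfd = open("/sys/fs/cgroup/memory/A/cgroup.event_control", O_WRONLY);
 *	char buf[64];
 *	int len = snprintf(buf, sizeof(buf), "%d %d %llu",
 *			   efd, ufd, 50ULL << 20);
 *	write(cfd, buf, len);
 *	// read(efd, ...) now blocks until the threshold is crossed
 */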
5721
5722static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
5723        struct cftype *cft, struct eventfd_ctx *eventfd)
5724{
5725        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
5726        struct mem_cgroup_thresholds *thresholds;
5727        struct mem_cgroup_threshold_ary *new;
5728        enum res_type type = MEMFILE_TYPE(cft->private);
5729        u64 usage;
5730        int i, j, size;
5731
5732        mutex_lock(&memcg->thresholds_lock);
5733        if (type == _MEM)
5734                thresholds = &memcg->thresholds;
5735        else if (type == _MEMSWAP)
5736                thresholds = &memcg->memsw_thresholds;
5737        else
5738                BUG();
5739
5740        if (!thresholds->primary)
5741                goto unlock;
5742
5743        usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5744
5745        /* Check if a threshold crossed before removing */
5746        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5747
5748        /* Calculate the new number of thresholds */
5749        size = 0;
5750        for (i = 0; i < thresholds->primary->size; i++) {
5751                if (thresholds->primary->entries[i].eventfd != eventfd)
5752                        size++;
5753        }
5754
5755        new = thresholds->spare;
5756
5757        /* Set thresholds array to NULL if we don't have thresholds */
5758        if (!size) {
5759                kfree(new);
5760                new = NULL;
5761                goto swap_buffers;
5762        }
5763
5764        new->size = size;
5765
5766        /* Copy thresholds and find current threshold */
5767        new->current_threshold = -1;
5768        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5769                if (thresholds->primary->entries[i].eventfd == eventfd)
5770                        continue;
5771
5772                new->entries[j] = thresholds->primary->entries[i];
5773                if (new->entries[j].threshold <= usage) {
5774                        /*
5775                         * new->current_threshold will not be used
5776                         * until rcu_assign_pointer(), so it's safe to increment
5777                         * it here.
5778                         */
5779                        ++new->current_threshold;
5780                }
5781                j++;
5782        }