linux/mm/memcontrol.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* memcontrol.c - Memory Controller
   3 *
   4 * Copyright IBM Corporation, 2007
   5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   6 *
   7 * Copyright 2007 OpenVZ SWsoft Inc
   8 * Author: Pavel Emelianov <xemul@openvz.org>
   9 *
  10 * Memory thresholds
  11 * Copyright (C) 2009 Nokia Corporation
  12 * Author: Kirill A. Shutemov
  13 *
  14 * Kernel Memory Controller
  15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  16 * Authors: Glauber Costa and Suleiman Souhlal
  17 *
  18 * Native page reclaim
  19 * Charge lifetime sanitation
  20 * Lockless page tracking & accounting
  21 * Unified hierarchy configuration model
  22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  23 *
  24 * Per memcg lru locking
  25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  26 */
  27
  28#include <linux/page_counter.h>
  29#include <linux/memcontrol.h>
  30#include <linux/cgroup.h>
  31#include <linux/pagewalk.h>
  32#include <linux/sched/mm.h>
  33#include <linux/shmem_fs.h>
  34#include <linux/hugetlb.h>
  35#include <linux/pagemap.h>
  36#include <linux/vm_event_item.h>
  37#include <linux/smp.h>
  38#include <linux/page-flags.h>
  39#include <linux/backing-dev.h>
  40#include <linux/bit_spinlock.h>
  41#include <linux/rcupdate.h>
  42#include <linux/limits.h>
  43#include <linux/export.h>
  44#include <linux/mutex.h>
  45#include <linux/rbtree.h>
  46#include <linux/slab.h>
  47#include <linux/swap.h>
  48#include <linux/swapops.h>
  49#include <linux/spinlock.h>
  50#include <linux/eventfd.h>
  51#include <linux/poll.h>
  52#include <linux/sort.h>
  53#include <linux/fs.h>
  54#include <linux/seq_file.h>
  55#include <linux/vmpressure.h>
  56#include <linux/mm_inline.h>
  57#include <linux/swap_cgroup.h>
  58#include <linux/cpu.h>
  59#include <linux/oom.h>
  60#include <linux/lockdep.h>
  61#include <linux/file.h>
  62#include <linux/tracehook.h>
  63#include <linux/psi.h>
  64#include <linux/seq_buf.h>
  65#include "internal.h"
  66#include <net/sock.h>
  67#include <net/ip.h>
  68#include "slab.h"
  69
  70#include <linux/uaccess.h>
  71
  72#include <trace/events/vmscan.h>
  73
  74struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  75EXPORT_SYMBOL(memory_cgrp_subsys);
  76
  77struct mem_cgroup *root_mem_cgroup __read_mostly;
  78
  79/* Active memory cgroup to use from an interrupt context */
  80DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
  81
  82/* Socket memory accounting disabled? */
  83static bool cgroup_memory_nosocket;
  84
  85/* Kernel memory accounting disabled? */
  86static bool cgroup_memory_nokmem;
  87
  88/* Whether the swap controller is active */
  89#ifdef CONFIG_MEMCG_SWAP
  90bool cgroup_memory_noswap __read_mostly;
  91#else
  92#define cgroup_memory_noswap            1
  93#endif
  94
  95#ifdef CONFIG_CGROUP_WRITEBACK
  96static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
  97#endif
  98
  99/* Whether legacy memory+swap accounting is active */
 100static bool do_memsw_account(void)
 101{
 102        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 103}
 104
 105#define THRESHOLDS_EVENTS_TARGET 128
 106#define SOFTLIMIT_EVENTS_TARGET 1024
 107
 108/*
  109 * Cgroups above their limits are maintained in an RB-tree, independent of
 110 * their hierarchy representation
 111 */
 112
 113struct mem_cgroup_tree_per_node {
 114        struct rb_root rb_root;
 115        struct rb_node *rb_rightmost;
 116        spinlock_t lock;
 117};
 118
 119struct mem_cgroup_tree {
 120        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 121};
 122
 123static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 124
 125/* for OOM */
 126struct mem_cgroup_eventfd_list {
 127        struct list_head list;
 128        struct eventfd_ctx *eventfd;
 129};
 130
 131/*
  132 * cgroup_event represents events which userspace wants to receive.
 133 */
 134struct mem_cgroup_event {
 135        /*
 136         * memcg which the event belongs to.
 137         */
 138        struct mem_cgroup *memcg;
 139        /*
 140         * eventfd to signal userspace about the event.
 141         */
 142        struct eventfd_ctx *eventfd;
 143        /*
  144         * Each of these is stored in a list by the cgroup.
 145         */
 146        struct list_head list;
 147        /*
  148         * register_event() callback will be used to add a new userspace
 149         * waiter for changes related to this event.  Use eventfd_signal()
 150         * on eventfd to send notification to userspace.
 151         */
 152        int (*register_event)(struct mem_cgroup *memcg,
 153                              struct eventfd_ctx *eventfd, const char *args);
 154        /*
 155         * unregister_event() callback will be called when userspace closes
  156         * the eventfd or when the cgroup is removed.  This callback must be
  157         * set if you want to provide notification functionality.
 158         */
 159        void (*unregister_event)(struct mem_cgroup *memcg,
 160                                 struct eventfd_ctx *eventfd);
 161        /*
 162         * All fields below needed to unregister event when
 163         * userspace closes eventfd.
 164         */
 165        poll_table pt;
 166        wait_queue_head_t *wqh;
 167        wait_queue_entry_t wait;
 168        struct work_struct remove;
 169};
 170
 171static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 172static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 173
  174/* Stuff for moving charges at task migration. */
 175/*
 176 * Types of charges to be moved.
 177 */
 178#define MOVE_ANON       0x1U
 179#define MOVE_FILE       0x2U
 180#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 181
 182/* "mc" and its members are protected by cgroup_mutex */
 183static struct move_charge_struct {
 184        spinlock_t        lock; /* for from, to */
 185        struct mm_struct  *mm;
 186        struct mem_cgroup *from;
 187        struct mem_cgroup *to;
 188        unsigned long flags;
 189        unsigned long precharge;
 190        unsigned long moved_charge;
 191        unsigned long moved_swap;
 192        struct task_struct *moving_task;        /* a task moving charges */
 193        wait_queue_head_t waitq;                /* a waitq for other context */
 194} mc = {
 195        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 196        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 197};
 198
 199/*
  200 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 201 * limit reclaim to prevent infinite loops, if they ever occur.
 202 */
 203#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 204#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 205
 206/* for encoding cft->private value on file */
 207enum res_type {
 208        _MEM,
 209        _MEMSWAP,
 210        _OOM_TYPE,
 211        _KMEM,
 212        _TCP,
 213};
 214
 215#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 216#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 217#define MEMFILE_ATTR(val)       ((val) & 0xffff)
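
/*
 * Packing sketch (illustrative): a cftype .private value built with
 * MEMFILE_PRIVATE(_MEMSWAP, attr) keeps the res_type in the upper 16 bits
 * and the attribute in the lower 16 bits, so the read/write handlers can
 * recover both halves via MEMFILE_TYPE() and MEMFILE_ATTR().
 */
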
 218/* Used for OOM notifier */
 219#define OOM_CONTROL             (0)
 220
 221/*
 222 * Iteration constructs for visiting all cgroups (under a tree).  If
 223 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 224 * be used for reference counting.
 225 */
 226#define for_each_mem_cgroup_tree(iter, root)            \
 227        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 228             iter != NULL;                              \
 229             iter = mem_cgroup_iter(root, iter, NULL))
 230
 231#define for_each_mem_cgroup(iter)                       \
 232        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 233             iter != NULL;                              \
 234             iter = mem_cgroup_iter(NULL, iter, NULL))
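
/*
 * Illustrative usage sketch: a walk that exits early must pass the current
 * position to mem_cgroup_iter_break() so its css reference is dropped
 * (done_with() below is just a placeholder condition):
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (done_with(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */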
 235
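/*
 * Charges are forced (allowed to exceed the limit) when the allocating task
 * is already on its way out: an OOM victim, fatally signalled, or exiting.
 */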
 236static inline bool should_force_charge(void)
 237{
 238        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 239                (current->flags & PF_EXITING);
 240}
 241
 242/* Some nice accessors for the vmpressure. */
 243struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 244{
 245        if (!memcg)
 246                memcg = root_mem_cgroup;
 247        return &memcg->vmpressure;
 248}
 249
 250struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 251{
 252        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 253}
 254
 255#ifdef CONFIG_MEMCG_KMEM
 256extern spinlock_t css_set_lock;
 257
 258static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
 259                                      unsigned int nr_pages);
 260
 261static void obj_cgroup_release(struct percpu_ref *ref)
 262{
 263        struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
 264        struct mem_cgroup *memcg;
 265        unsigned int nr_bytes;
 266        unsigned int nr_pages;
 267        unsigned long flags;
 268
 269        /*
 270         * At this point all allocated objects are freed, and
 271         * objcg->nr_charged_bytes can't have an arbitrary byte value.
 272         * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
 273         *
 274         * The following sequence can lead to it:
 275         * 1) CPU0: objcg == stock->cached_objcg
 276         * 2) CPU1: we do a small allocation (e.g. 92 bytes),
 277         *          PAGE_SIZE bytes are charged
 278         * 3) CPU1: a process from another memcg is allocating something,
  279         *          the stock is flushed,
  280         *          objcg->nr_charged_bytes = PAGE_SIZE - 92
  281         * 4) CPU0: we release this object,
  282         *          92 bytes are added to stock->nr_bytes
  283         * 5) CPU0: the stock is flushed,
  284         *          92 bytes are added to objcg->nr_charged_bytes
  285         *
  286         * As a result, nr_charged_bytes == PAGE_SIZE.
 287         * This page will be uncharged in obj_cgroup_release().
 288         */
 289        nr_bytes = atomic_read(&objcg->nr_charged_bytes);
 290        WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
 291        nr_pages = nr_bytes >> PAGE_SHIFT;
 292
 293        spin_lock_irqsave(&css_set_lock, flags);
 294        memcg = obj_cgroup_memcg(objcg);
 295        if (nr_pages)
 296                obj_cgroup_uncharge_pages(objcg, nr_pages);
 297        list_del(&objcg->list);
 298        mem_cgroup_put(memcg);
 299        spin_unlock_irqrestore(&css_set_lock, flags);
 300
 301        percpu_ref_exit(ref);
 302        kfree_rcu(objcg, rcu);
 303}
 304
 305static struct obj_cgroup *obj_cgroup_alloc(void)
 306{
 307        struct obj_cgroup *objcg;
 308        int ret;
 309
 310        objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
 311        if (!objcg)
 312                return NULL;
 313
 314        ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
 315                              GFP_KERNEL);
 316        if (ret) {
 317                kfree(objcg);
 318                return NULL;
 319        }
 320        INIT_LIST_HEAD(&objcg->list);
 321        return objcg;
 322}
 323
 324static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
 325                                  struct mem_cgroup *parent)
 326{
 327        struct obj_cgroup *objcg, *iter;
 328
 329        objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
 330
 331        spin_lock_irq(&css_set_lock);
 332
 333        /* Move active objcg to the parent's list */
 334        xchg(&objcg->memcg, parent);
 335        css_get(&parent->css);
 336        list_add(&objcg->list, &parent->objcg_list);
 337
 338        /* Move already reparented objcgs to the parent's list */
 339        list_for_each_entry(iter, &memcg->objcg_list, list) {
 340                css_get(&parent->css);
 341                xchg(&iter->memcg, parent);
 342                css_put(&memcg->css);
 343        }
 344        list_splice(&memcg->objcg_list, &parent->objcg_list);
 345
 346        spin_unlock_irq(&css_set_lock);
 347
 348        percpu_ref_kill(&objcg->refcnt);
 349}
 350
 351/*
 352 * This will be used as a shrinker list's index.
  353 * The main reason for not using the cgroup id for this is that a separate
  354 *  index works better in sparse environments, where we have a lot of memcgs
  355 *  but only a few of them kmem-limited. For instance, if we had 200 memcgs
  356 *  and none but the 200th were kmem-limited, a cgroup-id-based index would
  357 *  need a 200-entry array.
 358 *
 359 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 360 * will double each time we have to increase it.
 361 */
 362static DEFINE_IDA(memcg_cache_ida);
 363int memcg_nr_cache_ids;
 364
 365/* Protects memcg_nr_cache_ids */
 366static DECLARE_RWSEM(memcg_cache_ids_sem);
 367
 368void memcg_get_cache_ids(void)
 369{
 370        down_read(&memcg_cache_ids_sem);
 371}
 372
 373void memcg_put_cache_ids(void)
 374{
 375        up_read(&memcg_cache_ids_sem);
 376}
 377
 378/*
  379 * MIN_SIZE is larger than 1 because we would like to avoid going through
  380 * the alloc/free process all the time. On a small machine, 4 kmem-limited
  381 * cgroups is a reasonable guess. In the future, it could be a parameter or
  382 * tunable, but that is not strictly necessary.
 383 *
 384 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 385 * this constant directly from cgroup, but it is understandable that this is
 386 * better kept as an internal representation in cgroup.c. In any case, the
 387 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 388 * increase ours as well if it increases.
 389 */
 390#define MEMCG_CACHES_MIN_SIZE 4
 391#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 392
 393/*
 394 * A lot of the calls to the cache allocation functions are expected to be
 395 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  396 * conditional on this static branch, we have to allow modules that do
  397 * kmem_cache_alloc and the like to see this symbol as well.
 398 */
 399DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 400EXPORT_SYMBOL(memcg_kmem_enabled_key);
 401#endif
 402
 403/**
 404 * mem_cgroup_css_from_page - css of the memcg associated with a page
 405 * @page: page of interest
 406 *
 407 * If memcg is bound to the default hierarchy, css of the memcg associated
 408 * with @page is returned.  The returned css remains associated with @page
 409 * until it is released.
 410 *
 411 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 412 * is returned.
 413 */
 414struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 415{
 416        struct mem_cgroup *memcg;
 417
 418        memcg = page_memcg(page);
 419
 420        if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 421                memcg = root_mem_cgroup;
 422
 423        return &memcg->css;
 424}
 425
 426/**
 427 * page_cgroup_ino - return inode number of the memcg a page is charged to
 428 * @page: the page
 429 *
 430 * Look up the closest online ancestor of the memory cgroup @page is charged to
 431 * and return its inode number or 0 if @page is not charged to any cgroup. It
 432 * is safe to call this function without holding a reference to @page.
 433 *
 434 * Note, this function is inherently racy, because there is nothing to prevent
 435 * the cgroup inode from getting torn down and potentially reallocated a moment
 436 * after page_cgroup_ino() returns, so it only should be used by callers that
 437 * do not care (such as procfs interfaces).
 438 */
 439ino_t page_cgroup_ino(struct page *page)
 440{
 441        struct mem_cgroup *memcg;
 442        unsigned long ino = 0;
 443
 444        rcu_read_lock();
 445        memcg = page_memcg_check(page);
 446
 447        while (memcg && !(memcg->css.flags & CSS_ONLINE))
 448                memcg = parent_mem_cgroup(memcg);
 449        if (memcg)
 450                ino = cgroup_ino(memcg->css.cgroup);
 451        rcu_read_unlock();
 452        return ino;
 453}
 454
 455static struct mem_cgroup_per_node *
 456mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 457{
 458        int nid = page_to_nid(page);
 459
 460        return memcg->nodeinfo[nid];
 461}
 462
 463static struct mem_cgroup_tree_per_node *
 464soft_limit_tree_node(int nid)
 465{
 466        return soft_limit_tree.rb_tree_per_node[nid];
 467}
 468
 469static struct mem_cgroup_tree_per_node *
 470soft_limit_tree_from_page(struct page *page)
 471{
 472        int nid = page_to_nid(page);
 473
 474        return soft_limit_tree.rb_tree_per_node[nid];
 475}
 476
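/*
 * Insert @mz into @mctz's rb-tree, keyed by how far the memcg exceeds its
 * soft limit. The rightmost (most exceeding) node is cached so soft limit
 * reclaim can pick its next victim in O(1).
 */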
 477static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 478                                         struct mem_cgroup_tree_per_node *mctz,
 479                                         unsigned long new_usage_in_excess)
 480{
 481        struct rb_node **p = &mctz->rb_root.rb_node;
 482        struct rb_node *parent = NULL;
 483        struct mem_cgroup_per_node *mz_node;
 484        bool rightmost = true;
 485
 486        if (mz->on_tree)
 487                return;
 488
 489        mz->usage_in_excess = new_usage_in_excess;
 490        if (!mz->usage_in_excess)
 491                return;
 492        while (*p) {
 493                parent = *p;
 494                mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 495                                        tree_node);
 496                if (mz->usage_in_excess < mz_node->usage_in_excess) {
 497                        p = &(*p)->rb_left;
 498                        rightmost = false;
 499                } else {
 500                        p = &(*p)->rb_right;
 501                }
 502        }
 503
 504        if (rightmost)
 505                mctz->rb_rightmost = &mz->tree_node;
 506
 507        rb_link_node(&mz->tree_node, parent, p);
 508        rb_insert_color(&mz->tree_node, &mctz->rb_root);
 509        mz->on_tree = true;
 510}
 511
 512static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 513                                         struct mem_cgroup_tree_per_node *mctz)
 514{
 515        if (!mz->on_tree)
 516                return;
 517
 518        if (&mz->tree_node == mctz->rb_rightmost)
 519                mctz->rb_rightmost = rb_prev(&mz->tree_node);
 520
 521        rb_erase(&mz->tree_node, &mctz->rb_root);
 522        mz->on_tree = false;
 523}
 524
 525static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 526                                       struct mem_cgroup_tree_per_node *mctz)
 527{
 528        unsigned long flags;
 529
 530        spin_lock_irqsave(&mctz->lock, flags);
 531        __mem_cgroup_remove_exceeded(mz, mctz);
 532        spin_unlock_irqrestore(&mctz->lock, flags);
 533}
 534
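/* Number of pages by which usage exceeds the soft limit (0 if not exceeded). */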
 535static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 536{
 537        unsigned long nr_pages = page_counter_read(&memcg->memory);
 538        unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 539        unsigned long excess = 0;
 540
 541        if (nr_pages > soft_limit)
 542                excess = nr_pages - soft_limit;
 543
 544        return excess;
 545}
 546
 547static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 548{
 549        unsigned long excess;
 550        struct mem_cgroup_per_node *mz;
 551        struct mem_cgroup_tree_per_node *mctz;
 552
 553        mctz = soft_limit_tree_from_page(page);
 554        if (!mctz)
 555                return;
 556        /*
  557         * Necessary to update all ancestors when the hierarchy is used,
  558         * because their event counters are not touched.
 559         */
 560        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 561                mz = mem_cgroup_page_nodeinfo(memcg, page);
 562                excess = soft_limit_excess(memcg);
 563                /*
 564                 * We have to update the tree if mz is on RB-tree or
 565                 * mem is over its softlimit.
 566                 */
 567                if (excess || mz->on_tree) {
 568                        unsigned long flags;
 569
 570                        spin_lock_irqsave(&mctz->lock, flags);
 571                        /* if on-tree, remove it */
 572                        if (mz->on_tree)
 573                                __mem_cgroup_remove_exceeded(mz, mctz);
 574                        /*
 575                         * Insert again. mz->usage_in_excess will be updated.
 576                         * If excess is 0, no tree ops.
 577                         */
 578                        __mem_cgroup_insert_exceeded(mz, mctz, excess);
 579                        spin_unlock_irqrestore(&mctz->lock, flags);
 580                }
 581        }
 582}
 583
 584static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 585{
 586        struct mem_cgroup_tree_per_node *mctz;
 587        struct mem_cgroup_per_node *mz;
 588        int nid;
 589
 590        for_each_node(nid) {
 591                mz = memcg->nodeinfo[nid];
 592                mctz = soft_limit_tree_node(nid);
 593                if (mctz)
 594                        mem_cgroup_remove_exceeded(mz, mctz);
 595        }
 596}
 597
 598static struct mem_cgroup_per_node *
 599__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 600{
 601        struct mem_cgroup_per_node *mz;
 602
 603retry:
 604        mz = NULL;
 605        if (!mctz->rb_rightmost)
 606                goto done;              /* Nothing to reclaim from */
 607
 608        mz = rb_entry(mctz->rb_rightmost,
 609                      struct mem_cgroup_per_node, tree_node);
 610        /*
  611         * Remove the node now; someone else can add it back. We will
  612         * add it back at the end of reclaim, at its correct position
  613         * in the tree.
 614         */
 615        __mem_cgroup_remove_exceeded(mz, mctz);
 616        if (!soft_limit_excess(mz->memcg) ||
 617            !css_tryget(&mz->memcg->css))
 618                goto retry;
 619done:
 620        return mz;
 621}
 622
 623static struct mem_cgroup_per_node *
 624mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 625{
 626        struct mem_cgroup_per_node *mz;
 627
 628        spin_lock_irq(&mctz->lock);
 629        mz = __mem_cgroup_largest_soft_limit_node(mctz);
 630        spin_unlock_irq(&mctz->lock);
 631        return mz;
 632}
 633
 634/**
 635 * __mod_memcg_state - update cgroup memory statistics
 636 * @memcg: the memory cgroup
 637 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 638 * @val: delta to add to the counter, can be negative
 639 */
 640void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 641{
 642        if (mem_cgroup_disabled())
 643                return;
 644
 645        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 646        cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 647}
 648
 649/* idx can be of type enum memcg_stat_item or node_stat_item. */
 650static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 651{
 652        long x = READ_ONCE(memcg->vmstats.state[idx]);
 653#ifdef CONFIG_SMP
 654        if (x < 0)
 655                x = 0;
 656#endif
 657        return x;
 658}
 659
 660/* idx can be of type enum memcg_stat_item or node_stat_item. */
 661static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 662{
 663        long x = 0;
 664        int cpu;
 665
 666        for_each_possible_cpu(cpu)
 667                x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 668#ifdef CONFIG_SMP
 669        if (x < 0)
 670                x = 0;
 671#endif
 672        return x;
 673}
 674
 675static struct mem_cgroup_per_node *
 676parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
 677{
 678        struct mem_cgroup *parent;
 679
 680        parent = parent_mem_cgroup(pn->memcg);
 681        if (!parent)
 682                return NULL;
 683        return parent->nodeinfo[nid];
 684}
 685
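/*
 * Update the memcg and lruvec (per-node-per-memcg) counters for @idx. Per-CPU
 * lruvec deltas are folded into the atomic counters of this node and all of
 * its ancestors once they exceed MEMCG_CHARGE_BATCH (scaled to bytes for
 * byte-sized items).
 */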
 686void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 687                              int val)
 688{
 689        struct mem_cgroup_per_node *pn;
 690        struct mem_cgroup *memcg;
 691        long x, threshold = MEMCG_CHARGE_BATCH;
 692
 693        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 694        memcg = pn->memcg;
 695
 696        /* Update memcg */
 697        __mod_memcg_state(memcg, idx, val);
 698
 699        /* Update lruvec */
 700        __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 701
 702        if (vmstat_item_in_bytes(idx))
 703                threshold <<= PAGE_SHIFT;
 704
 705        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 706        if (unlikely(abs(x) > threshold)) {
 707                pg_data_t *pgdat = lruvec_pgdat(lruvec);
 708                struct mem_cgroup_per_node *pi;
 709
 710                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 711                        atomic_long_add(x, &pi->lruvec_stat[idx]);
 712                x = 0;
 713        }
 714        __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 715}
 716
 717/**
 718 * __mod_lruvec_state - update lruvec memory statistics
 719 * @lruvec: the lruvec
 720 * @idx: the stat item
 721 * @val: delta to add to the counter, can be negative
 722 *
 723 * The lruvec is the intersection of the NUMA node and a cgroup. This
  724 * function updates all three counters that are affected by a
 725 * change of state at this level: per-node, per-cgroup, per-lruvec.
 726 */
 727void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 728                        int val)
 729{
 730        /* Update node */
 731        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 732
 733        /* Update memcg and lruvec */
 734        if (!mem_cgroup_disabled())
 735                __mod_memcg_lruvec_state(lruvec, idx, val);
 736}
 737
 738void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 739                             int val)
 740{
 741        struct page *head = compound_head(page); /* rmap on tail pages */
 742        struct mem_cgroup *memcg;
 743        pg_data_t *pgdat = page_pgdat(page);
 744        struct lruvec *lruvec;
 745
 746        rcu_read_lock();
 747        memcg = page_memcg(head);
 748        /* Untracked pages have no memcg, no lruvec. Update only the node */
 749        if (!memcg) {
 750                rcu_read_unlock();
 751                __mod_node_page_state(pgdat, idx, val);
 752                return;
 753        }
 754
 755        lruvec = mem_cgroup_lruvec(memcg, pgdat);
 756        __mod_lruvec_state(lruvec, idx, val);
 757        rcu_read_unlock();
 758}
 759EXPORT_SYMBOL(__mod_lruvec_page_state);
 760
 761void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 762{
 763        pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 764        struct mem_cgroup *memcg;
 765        struct lruvec *lruvec;
 766
 767        rcu_read_lock();
 768        memcg = mem_cgroup_from_obj(p);
 769
 770        /*
 771         * Untracked pages have no memcg, no lruvec. Update only the
  772         * node. If the slab objects have been reparented to the root memcg,
  773         * we still need to update the per-memcg vmstats when freeing the
  774         * slab object, to keep them correct for the root memcg.
 775         */
 776        if (!memcg) {
 777                __mod_node_page_state(pgdat, idx, val);
 778        } else {
 779                lruvec = mem_cgroup_lruvec(memcg, pgdat);
 780                __mod_lruvec_state(lruvec, idx, val);
 781        }
 782        rcu_read_unlock();
 783}
 784
 785/**
 786 * __count_memcg_events - account VM events in a cgroup
 787 * @memcg: the memory cgroup
 788 * @idx: the event item
 789 * @count: the number of events that occurred
 790 */
 791void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 792                          unsigned long count)
 793{
 794        if (mem_cgroup_disabled())
 795                return;
 796
 797        __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
 798        cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 799}
 800
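/* Hierarchically aggregated VM event count, as maintained by the rstat flush. */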
 801static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 802{
 803        return READ_ONCE(memcg->vmstats.events[event]);
 804}
 805
 806static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 807{
 808        long x = 0;
 809        int cpu;
 810
 811        for_each_possible_cpu(cpu)
 812                x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
 813        return x;
 814}
 815
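/*
 * Account a (un)charge as a pagein/pageout event and accumulate the page
 * delta that drives the periodic event checks below.
 */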
 816static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 817                                         struct page *page,
 818                                         int nr_pages)
 819{
  820        /* A pagein of a large (compound) page is a single event, so ignore the page size */
 821        if (nr_pages > 0)
 822                __count_memcg_events(memcg, PGPGIN, 1);
 823        else {
 824                __count_memcg_events(memcg, PGPGOUT, 1);
 825                nr_pages = -nr_pages; /* for event */
 826        }
 827
 828        __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 829}
 830
 831static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 832                                       enum mem_cgroup_events_target target)
 833{
 834        unsigned long val, next;
 835
 836        val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 837        next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 838        /* from time_after() in jiffies.h */
 839        if ((long)(next - val) < 0) {
 840                switch (target) {
 841                case MEM_CGROUP_TARGET_THRESH:
 842                        next = val + THRESHOLDS_EVENTS_TARGET;
 843                        break;
 844                case MEM_CGROUP_TARGET_SOFTLIMIT:
 845                        next = val + SOFTLIMIT_EVENTS_TARGET;
 846                        break;
 847                default:
 848                        break;
 849                }
 850                __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 851                return true;
 852        }
 853        return false;
 854}
 855
 856/*
  857 * Check events in order: the threshold check runs at a finer grain than
  858 * the soft limit tree update.
 859 */
 860static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 861{
 862        /* threshold event is triggered in finer grain than soft limit */
 863        if (unlikely(mem_cgroup_event_ratelimit(memcg,
 864                                                MEM_CGROUP_TARGET_THRESH))) {
 865                bool do_softlimit;
 866
 867                do_softlimit = mem_cgroup_event_ratelimit(memcg,
 868                                                MEM_CGROUP_TARGET_SOFTLIMIT);
 869                mem_cgroup_threshold(memcg);
 870                if (unlikely(do_softlimit))
 871                        mem_cgroup_update_tree(memcg, page);
 872        }
 873}
 874
 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 876{
 877        /*
 878         * mm_update_next_owner() may clear mm->owner to NULL
 879         * if it races with swapoff, page migration, etc.
 880         * So this can be called with p == NULL.
 881         */
 882        if (unlikely(!p))
 883                return NULL;
 884
 885        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 886}
 887EXPORT_SYMBOL(mem_cgroup_from_task);
 888
 889/**
 890 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 891 * @mm: mm from which memcg should be extracted. It can be NULL.
 892 *
  893 * Obtain a reference on the memcg associated with @mm and return it if
  894 * successful. Otherwise root_mem_cgroup is returned. However, if the memory
  895 * controller is disabled, NULL is returned.
 896 */
 897struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 898{
 899        struct mem_cgroup *memcg;
 900
 901        if (mem_cgroup_disabled())
 902                return NULL;
 903
 904        rcu_read_lock();
 905        do {
 906                /*
 907                 * Page cache insertions can happen without an
 908                 * actual mm context, e.g. during disk probing
 909                 * on boot, loopback IO, acct() writes etc.
 910                 */
 911                if (unlikely(!mm))
 912                        memcg = root_mem_cgroup;
 913                else {
 914                        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 915                        if (unlikely(!memcg))
 916                                memcg = root_mem_cgroup;
 917                }
 918        } while (!css_tryget(&memcg->css));
 919        rcu_read_unlock();
 920        return memcg;
 921}
 922EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 923
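/*
 * The memcg explicitly selected with set_active_memcg(), if any: it lives in
 * a per-CPU slot for interrupt context and on the task otherwise.
 */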
 924static __always_inline struct mem_cgroup *active_memcg(void)
 925{
 926        if (in_interrupt())
 927                return this_cpu_read(int_active_memcg);
 928        else
 929                return current->active_memcg;
 930}
 931
 932static __always_inline bool memcg_kmem_bypass(void)
 933{
 934        /* Allow remote memcg charging from any context. */
 935        if (unlikely(active_memcg()))
 936                return false;
 937
 938        /* Memcg to charge can't be determined. */
 939        if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
 940                return true;
 941
 942        return false;
 943}
 944
 945/**
 946 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 947 * @root: hierarchy root
 948 * @prev: previously returned memcg, NULL on first invocation
 949 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 950 *
 951 * Returns references to children of the hierarchy below @root, or
 952 * @root itself, or %NULL after a full round-trip.
 953 *
 954 * Caller must pass the return value in @prev on subsequent
 955 * invocations for reference counting, or use mem_cgroup_iter_break()
 956 * to cancel a hierarchy walk before the round-trip is complete.
 957 *
 958 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 959 * in the hierarchy among all concurrent reclaimers operating on the
 960 * same node.
 961 */
 962struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 963                                   struct mem_cgroup *prev,
 964                                   struct mem_cgroup_reclaim_cookie *reclaim)
 965{
 966        struct mem_cgroup_reclaim_iter *iter;
 967        struct cgroup_subsys_state *css = NULL;
 968        struct mem_cgroup *memcg = NULL;
 969        struct mem_cgroup *pos = NULL;
 970
 971        if (mem_cgroup_disabled())
 972                return NULL;
 973
 974        if (!root)
 975                root = root_mem_cgroup;
 976
 977        if (prev && !reclaim)
 978                pos = prev;
 979
 980        rcu_read_lock();
 981
 982        if (reclaim) {
 983                struct mem_cgroup_per_node *mz;
 984
 985                mz = root->nodeinfo[reclaim->pgdat->node_id];
 986                iter = &mz->iter;
 987
 988                if (prev && reclaim->generation != iter->generation)
 989                        goto out_unlock;
 990
 991                while (1) {
 992                        pos = READ_ONCE(iter->position);
 993                        if (!pos || css_tryget(&pos->css))
 994                                break;
 995                        /*
 996                         * css reference reached zero, so iter->position will
 997                         * be cleared by ->css_released. However, we should not
 998                         * rely on this happening soon, because ->css_released
 999                         * is called from a work queue, and by busy-waiting we
1000                         * might block it. So we clear iter->position right
1001                         * away.
1002                         */
1003                        (void)cmpxchg(&iter->position, pos, NULL);
1004                }
1005        }
1006
1007        if (pos)
1008                css = &pos->css;
1009
1010        for (;;) {
1011                css = css_next_descendant_pre(css, &root->css);
1012                if (!css) {
1013                        /*
1014                         * Reclaimers share the hierarchy walk, and a
1015                         * new one might jump in right at the end of
1016                         * the hierarchy - make sure they see at least
1017                         * one group and restart from the beginning.
1018                         */
1019                        if (!prev)
1020                                continue;
1021                        break;
1022                }
1023
1024                /*
1025                 * Verify the css and acquire a reference.  The root
1026                 * is provided by the caller, so we know it's alive
1027                 * and kicking, and don't take an extra reference.
1028                 */
1029                memcg = mem_cgroup_from_css(css);
1030
1031                if (css == &root->css)
1032                        break;
1033
1034                if (css_tryget(css))
1035                        break;
1036
1037                memcg = NULL;
1038        }
1039
1040        if (reclaim) {
1041                /*
1042                 * The position could have already been updated by a competing
1043                 * thread, so check that the value hasn't changed since we read
1044                 * it to avoid reclaiming from the same cgroup twice.
1045                 */
1046                (void)cmpxchg(&iter->position, pos, memcg);
1047
1048                if (pos)
1049                        css_put(&pos->css);
1050
1051                if (!memcg)
1052                        iter->generation++;
1053                else if (!prev)
1054                        reclaim->generation = iter->generation;
1055        }
1056
1057out_unlock:
1058        rcu_read_unlock();
1059        if (prev && prev != root)
1060                css_put(&prev->css);
1061
1062        return memcg;
1063}
1064
1065/**
1066 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1067 * @root: hierarchy root
1068 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1069 */
1070void mem_cgroup_iter_break(struct mem_cgroup *root,
1071                           struct mem_cgroup *prev)
1072{
1073        if (!root)
1074                root = root_mem_cgroup;
1075        if (prev && prev != root)
1076                css_put(&prev->css);
1077}
1078
1079static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1080                                        struct mem_cgroup *dead_memcg)
1081{
1082        struct mem_cgroup_reclaim_iter *iter;
1083        struct mem_cgroup_per_node *mz;
1084        int nid;
1085
1086        for_each_node(nid) {
1087                mz = from->nodeinfo[nid];
1088                iter = &mz->iter;
1089                cmpxchg(&iter->position, dead_memcg, NULL);
1090        }
1091}
1092
1093static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1094{
1095        struct mem_cgroup *memcg = dead_memcg;
1096        struct mem_cgroup *last;
1097
1098        do {
1099                __invalidate_reclaim_iterators(memcg, dead_memcg);
1100                last = memcg;
1101        } while ((memcg = parent_mem_cgroup(memcg)));
1102
1103        /*
 1104         * When cgroup1 non-hierarchy mode is used,
1105         * parent_mem_cgroup() does not walk all the way up to the
1106         * cgroup root (root_mem_cgroup). So we have to handle
1107         * dead_memcg from cgroup root separately.
1108         */
1109        if (last != root_mem_cgroup)
1110                __invalidate_reclaim_iterators(root_mem_cgroup,
1111                                                dead_memcg);
1112}
1113
1114/**
1115 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1116 * @memcg: hierarchy root
1117 * @fn: function to call for each task
1118 * @arg: argument passed to @fn
1119 *
1120 * This function iterates over tasks attached to @memcg or to any of its
1121 * descendants and calls @fn for each task. If @fn returns a non-zero
1122 * value, the function breaks the iteration loop and returns the value.
1123 * Otherwise, it will iterate over all tasks and return 0.
1124 *
1125 * This function must not be called for the root memory cgroup.
1126 */
1127int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1128                          int (*fn)(struct task_struct *, void *), void *arg)
1129{
1130        struct mem_cgroup *iter;
1131        int ret = 0;
1132
1133        BUG_ON(memcg == root_mem_cgroup);
1134
1135        for_each_mem_cgroup_tree(iter, memcg) {
1136                struct css_task_iter it;
1137                struct task_struct *task;
1138
1139                css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1140                while (!ret && (task = css_task_iter_next(&it)))
1141                        ret = fn(task, arg);
1142                css_task_iter_end(&it);
1143                if (ret) {
1144                        mem_cgroup_iter_break(memcg, iter);
1145                        break;
1146                }
1147        }
1148        return ret;
1149}
1150
1151#ifdef CONFIG_DEBUG_VM
1152void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
1153{
1154        struct mem_cgroup *memcg;
1155
1156        if (mem_cgroup_disabled())
1157                return;
1158
1159        memcg = page_memcg(page);
1160
1161        if (!memcg)
1162                VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
1163        else
1164                VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
1165}
1166#endif
1167
1168/**
1169 * lock_page_lruvec - lock and return lruvec for a given page.
1170 * @page: the page
1171 *
1172 * These functions are safe to use under any of the following conditions:
1173 * - page locked
1174 * - PageLRU cleared
1175 * - lock_page_memcg()
1176 * - page->_refcount is zero
1177 */
1178struct lruvec *lock_page_lruvec(struct page *page)
1179{
1180        struct lruvec *lruvec;
1181        struct pglist_data *pgdat = page_pgdat(page);
1182
1183        lruvec = mem_cgroup_page_lruvec(page, pgdat);
1184        spin_lock(&lruvec->lru_lock);
1185
1186        lruvec_memcg_debug(lruvec, page);
1187
1188        return lruvec;
1189}
1190
1191struct lruvec *lock_page_lruvec_irq(struct page *page)
1192{
1193        struct lruvec *lruvec;
1194        struct pglist_data *pgdat = page_pgdat(page);
1195
1196        lruvec = mem_cgroup_page_lruvec(page, pgdat);
1197        spin_lock_irq(&lruvec->lru_lock);
1198
1199        lruvec_memcg_debug(lruvec, page);
1200
1201        return lruvec;
1202}
1203
1204struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
1205{
1206        struct lruvec *lruvec;
1207        struct pglist_data *pgdat = page_pgdat(page);
1208
1209        lruvec = mem_cgroup_page_lruvec(page, pgdat);
1210        spin_lock_irqsave(&lruvec->lru_lock, *flags);
1211
1212        lruvec_memcg_debug(lruvec, page);
1213
1214        return lruvec;
1215}
1216
1217/**
1218 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1219 * @lruvec: mem_cgroup per zone lru vector
1220 * @lru: index of lru list the page is sitting on
1221 * @zid: zone id of the accounted pages
1222 * @nr_pages: positive when adding or negative when removing
1223 *
1224 * This function must be called under lru_lock, just before a page is added
1225 * to or just after a page is removed from an lru list (that ordering being
1226 * so as to allow it to check that lru_size 0 is consistent with list_empty).
1227 */
1228void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1229                                int zid, int nr_pages)
1230{
1231        struct mem_cgroup_per_node *mz;
1232        unsigned long *lru_size;
1233        long size;
1234
1235        if (mem_cgroup_disabled())
1236                return;
1237
1238        mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1239        lru_size = &mz->lru_zone_size[zid][lru];
1240
1241        if (nr_pages < 0)
1242                *lru_size += nr_pages;
1243
1244        size = *lru_size;
1245        if (WARN_ONCE(size < 0,
1246                "%s(%p, %d, %d): lru_size %ld\n",
1247                __func__, lruvec, lru, nr_pages, size)) {
1248                VM_BUG_ON(1);
1249                *lru_size = 0;
1250        }
1251
1252        if (nr_pages > 0)
1253                *lru_size += nr_pages;
1254}
1255
1256/**
1257 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1258 * @memcg: the memory cgroup
1259 *
 1260 * Returns the maximum amount of memory @memcg can be charged with, in
1261 * pages.
1262 */
1263static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1264{
1265        unsigned long margin = 0;
1266        unsigned long count;
1267        unsigned long limit;
1268
1269        count = page_counter_read(&memcg->memory);
1270        limit = READ_ONCE(memcg->memory.max);
1271        if (count < limit)
1272                margin = limit - count;
1273
1274        if (do_memsw_account()) {
1275                count = page_counter_read(&memcg->memsw);
1276                limit = READ_ONCE(memcg->memsw.max);
1277                if (count < limit)
1278                        margin = min(margin, limit - count);
1279                else
1280                        margin = 0;
1281        }
1282
1283        return margin;
1284}
1285
1286/*
 1287 * A routine for checking whether "mem" is under move_account() or not.
 1288 *
 1289 * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the
 1290 * moving cgroups. This is used for waiting out the high memory pressure
 1291 * caused by a "move".
1292 */
1293static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1294{
1295        struct mem_cgroup *from;
1296        struct mem_cgroup *to;
1297        bool ret = false;
1298        /*
 1299         * Unlike the task-move routines, we access mc.to and mc.from without
 1300         * the mutual exclusion of cgroup_mutex. Here, we take the spinlock instead.
1301         */
1302        spin_lock(&mc.lock);
1303        from = mc.from;
1304        to = mc.to;
1305        if (!from)
1306                goto unlock;
1307
1308        ret = mem_cgroup_is_descendant(from, memcg) ||
1309                mem_cgroup_is_descendant(to, memcg);
1310unlock:
1311        spin_unlock(&mc.lock);
1312        return ret;
1313}
1314
1315static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1316{
1317        if (mc.moving_task && current != mc.moving_task) {
1318                if (mem_cgroup_under_move(memcg)) {
1319                        DEFINE_WAIT(wait);
1320                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1321                        /* moving charge context might have finished. */
1322                        if (mc.moving_task)
1323                                schedule();
1324                        finish_wait(&mc.waitq, &wait);
1325                        return true;
1326                }
1327        }
1328        return false;
1329}
1330
1331struct memory_stat {
1332        const char *name;
1333        unsigned int idx;
1334};
1335
1336static const struct memory_stat memory_stats[] = {
1337        { "anon",                       NR_ANON_MAPPED                  },
1338        { "file",                       NR_FILE_PAGES                   },
1339        { "kernel_stack",               NR_KERNEL_STACK_KB              },
1340        { "pagetables",                 NR_PAGETABLE                    },
1341        { "percpu",                     MEMCG_PERCPU_B                  },
1342        { "sock",                       MEMCG_SOCK                      },
1343        { "shmem",                      NR_SHMEM                        },
1344        { "file_mapped",                NR_FILE_MAPPED                  },
1345        { "file_dirty",                 NR_FILE_DIRTY                   },
1346        { "file_writeback",             NR_WRITEBACK                    },
1347#ifdef CONFIG_SWAP
1348        { "swapcached",                 NR_SWAPCACHE                    },
1349#endif
1350#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1351        { "anon_thp",                   NR_ANON_THPS                    },
1352        { "file_thp",                   NR_FILE_THPS                    },
1353        { "shmem_thp",                  NR_SHMEM_THPS                   },
1354#endif
1355        { "inactive_anon",              NR_INACTIVE_ANON                },
1356        { "active_anon",                NR_ACTIVE_ANON                  },
1357        { "inactive_file",              NR_INACTIVE_FILE                },
1358        { "active_file",                NR_ACTIVE_FILE                  },
1359        { "unevictable",                NR_UNEVICTABLE                  },
1360        { "slab_reclaimable",           NR_SLAB_RECLAIMABLE_B           },
1361        { "slab_unreclaimable",         NR_SLAB_UNRECLAIMABLE_B         },
1362
1363        /* The memory events */
1364        { "workingset_refault_anon",    WORKINGSET_REFAULT_ANON         },
1365        { "workingset_refault_file",    WORKINGSET_REFAULT_FILE         },
1366        { "workingset_activate_anon",   WORKINGSET_ACTIVATE_ANON        },
1367        { "workingset_activate_file",   WORKINGSET_ACTIVATE_FILE        },
1368        { "workingset_restore_anon",    WORKINGSET_RESTORE_ANON         },
1369        { "workingset_restore_file",    WORKINGSET_RESTORE_FILE         },
1370        { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
1371};
1372
1373/* Translate stat items to the correct unit for memory.stat output */
1374static int memcg_page_state_unit(int item)
1375{
1376        switch (item) {
1377        case MEMCG_PERCPU_B:
1378        case NR_SLAB_RECLAIMABLE_B:
1379        case NR_SLAB_UNRECLAIMABLE_B:
1380        case WORKINGSET_REFAULT_ANON:
1381        case WORKINGSET_REFAULT_FILE:
1382        case WORKINGSET_ACTIVATE_ANON:
1383        case WORKINGSET_ACTIVATE_FILE:
1384        case WORKINGSET_RESTORE_ANON:
1385        case WORKINGSET_RESTORE_FILE:
1386        case WORKINGSET_NODERECLAIM:
1387                return 1;
1388        case NR_KERNEL_STACK_KB:
1389                return SZ_1K;
1390        default:
1391                return PAGE_SIZE;
1392        }
1393}
1394
1395static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
1396                                                    int item)
1397{
1398        return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
1399}
1400
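/*
 * Render memory.stat style output into a freshly kmalloc'd buffer. Returns
 * NULL if the allocation fails; the caller is responsible for kfree().
 */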
1401static char *memory_stat_format(struct mem_cgroup *memcg)
1402{
1403        struct seq_buf s;
1404        int i;
1405
1406        seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1407        if (!s.buffer)
1408                return NULL;
1409
1410        /*
1411         * Provide statistics on the state of the memory subsystem as
1412         * well as cumulative event counters that show past behavior.
1413         *
1414         * This list is ordered following a combination of these gradients:
1415         * 1) generic big picture -> specifics and details
1416         * 2) reflecting userspace activity -> reflecting kernel heuristics
1417         *
1418         * Current memory state:
1419         */
1420        cgroup_rstat_flush(memcg->css.cgroup);
1421
1422        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1423                u64 size;
1424
1425                size = memcg_page_state_output(memcg, memory_stats[i].idx);
1426                seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1427
1428                if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1429                        size += memcg_page_state_output(memcg,
1430                                                        NR_SLAB_RECLAIMABLE_B);
1431                        seq_buf_printf(&s, "slab %llu\n", size);
1432                }
1433        }
1434
1435        /* Accumulated memory events */
1436
1437        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1438                       memcg_events(memcg, PGFAULT));
1439        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1440                       memcg_events(memcg, PGMAJFAULT));
1441        seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
1442                       memcg_events(memcg, PGREFILL));
1443        seq_buf_printf(&s, "pgscan %lu\n",
1444                       memcg_events(memcg, PGSCAN_KSWAPD) +
1445                       memcg_events(memcg, PGSCAN_DIRECT));
1446        seq_buf_printf(&s, "pgsteal %lu\n",
1447                       memcg_events(memcg, PGSTEAL_KSWAPD) +
1448                       memcg_events(memcg, PGSTEAL_DIRECT));
1449        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1450                       memcg_events(memcg, PGACTIVATE));
1451        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1452                       memcg_events(memcg, PGDEACTIVATE));
1453        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1454                       memcg_events(memcg, PGLAZYFREE));
1455        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1456                       memcg_events(memcg, PGLAZYFREED));
1457
1458#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1459        seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1460                       memcg_events(memcg, THP_FAULT_ALLOC));
1461        seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1462                       memcg_events(memcg, THP_COLLAPSE_ALLOC));
1463#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1464
1465        /* The above should easily fit into one page */
1466        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1467
1468        return s.buffer;
1469}
1470
1471#define K(x) ((x) << (PAGE_SHIFT-10))
1472/**
1473 * mem_cgroup_print_oom_context: Print OOM information relevant to
1474 * memory controller.
1475 * @memcg: The memory cgroup that went over limit
1476 * @p: Task that is going to be killed
1477 *
1478 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1479 * enabled
1480 */
1481void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1482{
1483        rcu_read_lock();
1484
1485        if (memcg) {
1486                pr_cont(",oom_memcg=");
1487                pr_cont_cgroup_path(memcg->css.cgroup);
1488        } else
1489                pr_cont(",global_oom");
1490        if (p) {
1491                pr_cont(",task_memcg=");
1492                pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1493        }
1494        rcu_read_unlock();
1495}
1496
1497/**
1498 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1499 * memory controller.
1500 * @memcg: The memory cgroup that went over limit
1501 */
1502void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1503{
1504        char *buf;
1505
1506        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1507                K((u64)page_counter_read(&memcg->memory)),
1508                K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1509        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1510                pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1511                        K((u64)page_counter_read(&memcg->swap)),
1512                        K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1513        else {
1514                pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1515                        K((u64)page_counter_read(&memcg->memsw)),
1516                        K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1517                pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1518                        K((u64)page_counter_read(&memcg->kmem)),
1519                        K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1520        }
1521
1522        pr_info("Memory cgroup stats for ");
1523        pr_cont_cgroup_path(memcg->css.cgroup);
1524        pr_cont(":");
1525        buf = memory_stat_format(memcg);
1526        if (!buf)
1527                return;
1528        pr_info("%s", buf);
1529        kfree(buf);
1530}
1531
1532/*
1533 * Return the memory (and swap, if configured) limit for a memcg.
1534 */
1535unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1536{
1537        unsigned long max = READ_ONCE(memcg->memory.max);
1538
1539        if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1540                if (mem_cgroup_swappiness(memcg))
1541                        max += min(READ_ONCE(memcg->swap.max),
1542                                   (unsigned long)total_swap_pages);
1543        } else { /* v1 */
1544                if (mem_cgroup_swappiness(memcg)) {
1545                        /* Calculate swap excess capacity from memsw limit */
1546                        unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1547
1548                        max += min(swap, (unsigned long)total_swap_pages);
1549                }
1550        }
1551        return max;
1552}
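
/*
 * Worked example for the above (editor's illustration, figures are
 * arbitrary): on cgroup v2 with memory.max = 262144 pages (1G),
 * memory.swap.max = 131072 pages (512M) and 2G of configured swap,
 *
 *	max = 262144 + min(131072, total_swap_pages) = 393216 pages
 *
 * On cgroup v1 with a 1G memory limit and a 1.5G memsw limit, the swap
 * headroom is memsw.max - memory.max = 131072 pages, so the result is the
 * same sum. With swappiness == 0, no swap is added in either case.
 */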
1553
1554unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1555{
1556        return page_counter_read(&memcg->memory);
1557}
1558
1559static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1560                                     int order)
1561{
1562        struct oom_control oc = {
1563                .zonelist = NULL,
1564                .nodemask = NULL,
1565                .memcg = memcg,
1566                .gfp_mask = gfp_mask,
1567                .order = order,
1568        };
1569        bool ret = true;
1570
1571        if (mutex_lock_killable(&oom_lock))
1572                return true;
1573
1574        if (mem_cgroup_margin(memcg) >= (1 << order))
1575                goto unlock;
1576
1577        /*
1578         * A few threads which were not waiting at mutex_lock_killable() can
1579         * fail to bail out. Therefore, check again after holding oom_lock.
1580         */
1581        ret = should_force_charge() || out_of_memory(&oc);
1582
1583unlock:
1584        mutex_unlock(&oom_lock);
1585        return ret;
1586}
1587
1588static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1589                                   pg_data_t *pgdat,
1590                                   gfp_t gfp_mask,
1591                                   unsigned long *total_scanned)
1592{
1593        struct mem_cgroup *victim = NULL;
1594        int total = 0;
1595        int loop = 0;
1596        unsigned long excess;
1597        unsigned long nr_scanned;
1598        struct mem_cgroup_reclaim_cookie reclaim = {
1599                .pgdat = pgdat,
1600        };
1601
1602        excess = soft_limit_excess(root_memcg);
1603
1604        while (1) {
1605                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1606                if (!victim) {
1607                        loop++;
1608                        if (loop >= 2) {
1609                                /*
1610                                 * If we have not been able to reclaim
1611                                 * anything, it might be because there are
1612                                 * no reclaimable pages under this hierarchy
1613                                 */
1614                                if (!total)
1615                                        break;
1616                                /*
1617                                 * We want to do more targeted reclaim.
1618                                 * excess >> 2 is not so large that we reclaim
1619                                 * too much, nor so small that we keep coming
1620                                 * back to reclaim from this cgroup
1621                                 */
1622                                if (total >= (excess >> 2) ||
1623                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1624                                        break;
1625                        }
1626                        continue;
1627                }
1628                total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1629                                        pgdat, &nr_scanned);
1630                *total_scanned += nr_scanned;
1631                if (!soft_limit_excess(root_memcg))
1632                        break;
1633        }
1634        mem_cgroup_iter_break(root_memcg, victim);
1635        return total;
1636}
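
/*
 * Worked example for the "excess >> 2" cut-off above (editor's illustration,
 * figures are arbitrary): if the soft-limit root is 400 pages over its soft
 * limit, the subtree is iterated until at least excess >> 2 = 100 pages have
 * been reclaimed or the loop count exceeds MEM_CGROUP_MAX_RECLAIM_LOOPS,
 * bounding both under- and over-reclaim per soft-limit pass.
 */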
1637
1638#ifdef CONFIG_LOCKDEP
1639static struct lockdep_map memcg_oom_lock_dep_map = {
1640        .name = "memcg_oom_lock",
1641};
1642#endif
1643
1644static DEFINE_SPINLOCK(memcg_oom_lock);
1645
1646/*
1647 * Check whether the OOM killer is already running under our hierarchy.
1648 * If it is, return false.
1649 */
1650static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1651{
1652        struct mem_cgroup *iter, *failed = NULL;
1653
1654        spin_lock(&memcg_oom_lock);
1655
1656        for_each_mem_cgroup_tree(iter, memcg) {
1657                if (iter->oom_lock) {
1658                        /*
1659                         * This subtree of our hierarchy is already locked,
1660                         * so we cannot take the lock.
1661                         */
1662                        failed = iter;
1663                        mem_cgroup_iter_break(memcg, iter);
1664                        break;
1665                } else
1666                        iter->oom_lock = true;
1667        }
1668
1669        if (failed) {
1670                /*
1671                 * OK, we failed to lock the whole subtree, so we have to
1672                 * undo the locks we set up, up to the failing subtree.
1673                 */
1674                for_each_mem_cgroup_tree(iter, memcg) {
1675                        if (iter == failed) {
1676                                mem_cgroup_iter_break(memcg, iter);
1677                                break;
1678                        }
1679                        iter->oom_lock = false;
1680                }
1681        } else
1682                mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1683
1684        spin_unlock(&memcg_oom_lock);
1685
1686        return !failed;
1687}
1688
1689static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1690{
1691        struct mem_cgroup *iter;
1692
1693        spin_lock(&memcg_oom_lock);
1694        mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1695        for_each_mem_cgroup_tree(iter, memcg)
1696                iter->oom_lock = false;
1697        spin_unlock(&memcg_oom_lock);
1698}
1699
1700static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1701{
1702        struct mem_cgroup *iter;
1703
1704        spin_lock(&memcg_oom_lock);
1705        for_each_mem_cgroup_tree(iter, memcg)
1706                iter->under_oom++;
1707        spin_unlock(&memcg_oom_lock);
1708}
1709
1710static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1711{
1712        struct mem_cgroup *iter;
1713
1714        /*
1715         * Be careful about under_oom underflows because a child memcg
1716         * could have been added after mem_cgroup_mark_under_oom.
1717         */
1718        spin_lock(&memcg_oom_lock);
1719        for_each_mem_cgroup_tree(iter, memcg)
1720                if (iter->under_oom > 0)
1721                        iter->under_oom--;
1722        spin_unlock(&memcg_oom_lock);
1723}
1724
1725static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1726
1727struct oom_wait_info {
1728        struct mem_cgroup *memcg;
1729        wait_queue_entry_t      wait;
1730};
1731
1732static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1733        unsigned mode, int sync, void *arg)
1734{
1735        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1736        struct mem_cgroup *oom_wait_memcg;
1737        struct oom_wait_info *oom_wait_info;
1738
1739        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1740        oom_wait_memcg = oom_wait_info->memcg;
1741
1742        if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1743            !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1744                return 0;
1745        return autoremove_wake_function(wait, mode, sync, arg);
1746}
1747
1748static void memcg_oom_recover(struct mem_cgroup *memcg)
1749{
1750        /*
1751         * For the following lockless ->under_oom test, the only required
1752         * guarantee is that it must see the state asserted by an OOM when
1753         * this function is called as a result of userland actions
1754         * triggered by the notification of the OOM.  This is trivially
1755         * achieved by invoking mem_cgroup_mark_under_oom() before
1756         * triggering notification.
1757         */
1758        if (memcg && memcg->under_oom)
1759                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1760}
1761
1762enum oom_status {
1763        OOM_SUCCESS,
1764        OOM_FAILED,
1765        OOM_ASYNC,
1766        OOM_SKIPPED
1767};
1768
1769static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1770{
1771        enum oom_status ret;
1772        bool locked;
1773
1774        if (order > PAGE_ALLOC_COSTLY_ORDER)
1775                return OOM_SKIPPED;
1776
1777        memcg_memory_event(memcg, MEMCG_OOM);
1778
1779        /*
1780         * We are in the middle of the charge context here, so we
1781         * don't want to block when potentially sitting on a callstack
1782         * that holds all kinds of filesystem and mm locks.
1783         *
1784         * cgroup1 allows disabling the OOM killer and waiting for outside
1785         * handling until the charge can succeed; remember the context and put
1786         * the task to sleep at the end of the page fault when all locks are
1787         * released.
1788         *
1789         * On the other hand, in-kernel OOM killer allows for an async victim
1790         * memory reclaim (oom_reaper) and that means that we are not solely
1791         * relying on the oom victim to make a forward progress and we can
1792         * invoke the oom killer here.
1793         *
1794         * Please note that mem_cgroup_out_of_memory might fail to find a
1795         * victim and then we have to bail out from the charge path.
1796         */
1797        if (memcg->oom_kill_disable) {
1798                if (!current->in_user_fault)
1799                        return OOM_SKIPPED;
1800                css_get(&memcg->css);
1801                current->memcg_in_oom = memcg;
1802                current->memcg_oom_gfp_mask = mask;
1803                current->memcg_oom_order = order;
1804
1805                return OOM_ASYNC;
1806        }
1807
1808        mem_cgroup_mark_under_oom(memcg);
1809
1810        locked = mem_cgroup_oom_trylock(memcg);
1811
1812        if (locked)
1813                mem_cgroup_oom_notify(memcg);
1814
1815        mem_cgroup_unmark_under_oom(memcg);
1816        if (mem_cgroup_out_of_memory(memcg, mask, order))
1817                ret = OOM_SUCCESS;
1818        else
1819                ret = OOM_FAILED;
1820
1821        if (locked)
1822                mem_cgroup_oom_unlock(memcg);
1823
1824        return ret;
1825}
1826
1827/**
1828 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1829 * @handle: actually kill/wait or just clean up the OOM state
1830 *
1831 * This has to be called at the end of a page fault if the memcg OOM
1832 * handler was enabled.
1833 *
1834 * Memcg supports userspace OOM handling where failed allocations must
1835 * sleep on a waitqueue until the userspace task resolves the
1836 * situation.  Sleeping directly in the charge context with all kinds
1837 * of locks held is not a good idea, instead we remember an OOM state
1838 * in the task and mem_cgroup_oom_synchronize() has to be called at
1839 * the end of the page fault to complete the OOM handling.
1840 *
1841 * Returns %true if an ongoing memcg OOM situation was detected and
1842 * completed, %false otherwise.
1843 */
1844bool mem_cgroup_oom_synchronize(bool handle)
1845{
1846        struct mem_cgroup *memcg = current->memcg_in_oom;
1847        struct oom_wait_info owait;
1848        bool locked;
1849
1850        /* OOM is global, do not handle */
1851        if (!memcg)
1852                return false;
1853
1854        if (!handle)
1855                goto cleanup;
1856
1857        owait.memcg = memcg;
1858        owait.wait.flags = 0;
1859        owait.wait.func = memcg_oom_wake_function;
1860        owait.wait.private = current;
1861        INIT_LIST_HEAD(&owait.wait.entry);
1862
1863        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1864        mem_cgroup_mark_under_oom(memcg);
1865
1866        locked = mem_cgroup_oom_trylock(memcg);
1867
1868        if (locked)
1869                mem_cgroup_oom_notify(memcg);
1870
1871        if (locked && !memcg->oom_kill_disable) {
1872                mem_cgroup_unmark_under_oom(memcg);
1873                finish_wait(&memcg_oom_waitq, &owait.wait);
1874                mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1875                                         current->memcg_oom_order);
1876        } else {
1877                schedule();
1878                mem_cgroup_unmark_under_oom(memcg);
1879                finish_wait(&memcg_oom_waitq, &owait.wait);
1880        }
1881
1882        if (locked) {
1883                mem_cgroup_oom_unlock(memcg);
1884                /*
1885                 * There is no guarantee that an OOM-lock contender
1886                 * sees the wakeups triggered by the OOM kill
1887                 * uncharges.  Wake any sleepers explicitly.
1888                 */
1889                memcg_oom_recover(memcg);
1890        }
1891cleanup:
1892        current->memcg_in_oom = NULL;
1893        css_put(&memcg->css);
1894        return true;
1895}
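
/*
 * Caller sketch (editor's addition, hedged): the fault-exit path is expected
 * to call this once every lock has been dropped, roughly as the OOM code does
 * in pagefault_out_of_memory():
 *
 *	void pagefault_out_of_memory(void)
 *	{
 *		if (mem_cgroup_oom_synchronize(true))
 *			return;
 *		...
 *	}
 *
 * i.e. a memcg OOM recorded by mem_cgroup_oom() is completed here and the
 * global OOM path is skipped.
 */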
1896
1897/**
1898 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1899 * @victim: task to be killed by the OOM killer
1900 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1901 *
1902 * Returns a pointer to a memory cgroup, which has to be cleaned up
1903 * by killing all of its OOM-killable tasks.
1904 *
1905 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1906 */
1907struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1908                                            struct mem_cgroup *oom_domain)
1909{
1910        struct mem_cgroup *oom_group = NULL;
1911        struct mem_cgroup *memcg;
1912
1913        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1914                return NULL;
1915
1916        if (!oom_domain)
1917                oom_domain = root_mem_cgroup;
1918
1919        rcu_read_lock();
1920
1921        memcg = mem_cgroup_from_task(victim);
1922        if (memcg == root_mem_cgroup)
1923                goto out;
1924
1925        /*
1926         * If the victim task has been asynchronously moved to a different
1927         * memory cgroup, we might end up killing tasks outside oom_domain.
1928         * In this case it's better to ignore memory.group.oom.
1929         */
1930        if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1931                goto out;
1932
1933        /*
1934         * Traverse the memory cgroup hierarchy from the victim task's
1935         * cgroup up to the OOMing cgroup (or root) to find the
1936         * highest-level memory cgroup with oom.group set.
1937         */
1938        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1939                if (memcg->oom_group)
1940                        oom_group = memcg;
1941
1942                if (memcg == oom_domain)
1943                        break;
1944        }
1945
1946        if (oom_group)
1947                css_get(&oom_group->css);
1948out:
1949        rcu_read_unlock();
1950
1951        return oom_group;
1952}
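
/*
 * Caller sketch (editor's addition, simplified from the OOM killer's use of
 * this helper; oom_kill_memcg_member() and message are the names used there):
 *
 *	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
 *	...
 *	if (oom_group) {
 *		mem_cgroup_print_oom_group(oom_group);
 *		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
 *				      (void *)message);
 *		mem_cgroup_put(oom_group);
 *	}
 */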
1953
1954void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1955{
1956        pr_info("Tasks in ");
1957        pr_cont_cgroup_path(memcg->css.cgroup);
1958        pr_cont(" are going to be killed due to memory.oom.group set\n");
1959}
1960
1961/**
1962 * lock_page_memcg - lock a page and memcg binding
1963 * @page: the page
1964 *
1965 * This function protects unlocked LRU pages from being moved to
1966 * another cgroup.
1967 *
1968 * It ensures the lifetime of the locked memcg. The caller is responsible
1969 * for the lifetime of the page.
1970 */
1971void lock_page_memcg(struct page *page)
1972{
1973        struct page *head = compound_head(page); /* rmap on tail pages */
1974        struct mem_cgroup *memcg;
1975        unsigned long flags;
1976
1977        /*
1978         * The RCU lock is held throughout the transaction.  The fast
1979         * path can get away without acquiring the memcg->move_lock
1980         * because page moving starts with an RCU grace period.
1981         */
1982        rcu_read_lock();
1983
1984        if (mem_cgroup_disabled())
1985                return;
1986again:
1987        memcg = page_memcg(head);
1988        if (unlikely(!memcg))
1989                return;
1990
1991#ifdef CONFIG_PROVE_LOCKING
1992        local_irq_save(flags);
1993        might_lock(&memcg->move_lock);
1994        local_irq_restore(flags);
1995#endif
1996
1997        if (atomic_read(&memcg->moving_account) <= 0)
1998                return;
1999
2000        spin_lock_irqsave(&memcg->move_lock, flags);
2001        if (memcg != page_memcg(head)) {
2002                spin_unlock_irqrestore(&memcg->move_lock, flags);
2003                goto again;
2004        }
2005
2006        /*
2007         * When charge migration first begins, we can have multiple
2008         * critical sections holding the fast-path RCU lock and one
2009         * holding the slowpath move_lock. Track the task who has the
2010         * holding the slowpath move_lock. Track the task that holds
2011         * the move_lock for unlock_page_memcg().
2012        memcg->move_lock_task = current;
2013        memcg->move_lock_flags = flags;
2014}
2015EXPORT_SYMBOL(lock_page_memcg);
2016
2017static void __unlock_page_memcg(struct mem_cgroup *memcg)
2018{
2019        if (memcg && memcg->move_lock_task == current) {
2020                unsigned long flags = memcg->move_lock_flags;
2021
2022                memcg->move_lock_task = NULL;
2023                memcg->move_lock_flags = 0;
2024
2025                spin_unlock_irqrestore(&memcg->move_lock, flags);
2026        }
2027
2028        rcu_read_unlock();
2029}
2030
2031/**
2032 * unlock_page_memcg - unlock a page and memcg binding
2033 * @page: the page
2034 */
2035void unlock_page_memcg(struct page *page)
2036{
2037        struct page *head = compound_head(page);
2038
2039        __unlock_page_memcg(page_memcg(head));
2040}
2041EXPORT_SYMBOL(unlock_page_memcg);
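
/*
 * Usage sketch (editor's addition, modeled loosely on the dirty-page
 * accounting in mm/page-writeback.c): a typical caller pins the page's memcg
 * binding around an unserialized, per-memcg-accounted state change, e.g.
 *
 *	lock_page_memcg(page);
 *	if (TestClearPageDirty(page))
 *		dec_lruvec_page_state(page, NR_FILE_DIRTY);
 *	unlock_page_memcg(page);
 *
 * so the stat update cannot race with the page being moved to another cgroup
 * between the flag change and the accounting.
 */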
2042
2043struct memcg_stock_pcp {
2044        struct mem_cgroup *cached; /* this is never the root cgroup */
2045        unsigned int nr_pages;
2046
2047#ifdef CONFIG_MEMCG_KMEM
2048        struct obj_cgroup *cached_objcg;
2049        unsigned int nr_bytes;
2050#endif
2051
2052        struct work_struct work;
2053        unsigned long flags;
2054#define FLUSHING_CACHED_CHARGE  0
2055};
2056static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2057static DEFINE_MUTEX(percpu_charge_mutex);
2058
2059#ifdef CONFIG_MEMCG_KMEM
2060static void drain_obj_stock(struct memcg_stock_pcp *stock);
2061static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2062                                     struct mem_cgroup *root_memcg);
2063
2064#else
2065static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2066{
2067}
2068static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2069                                     struct mem_cgroup *root_memcg)
2070{
2071        return false;
2072}
2073#endif
2074
2075/**
2076 * consume_stock: Try to consume stocked charge on this cpu.
2077 * @memcg: memcg to consume from.
2078 * @nr_pages: how many pages to charge.
2079 *
2080 * The charges will only happen if @memcg matches the current cpu's memcg
2081 * stock, and at least @nr_pages are available in that stock.  Failure to
2082 * service an allocation will refill the stock.
2083 *
2084 * returns true if successful, false otherwise.
2085 */
2086static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2087{
2088        struct memcg_stock_pcp *stock;
2089        unsigned long flags;
2090        bool ret = false;
2091
2092        if (nr_pages > MEMCG_CHARGE_BATCH)
2093                return ret;
2094
2095        local_irq_save(flags);
2096
2097        stock = this_cpu_ptr(&memcg_stock);
2098        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2099                stock->nr_pages -= nr_pages;
2100                ret = true;
2101        }
2102
2103        local_irq_restore(flags);
2104
2105        return ret;
2106}
2107
2108/*
2109 * Returns the cached charges to the page counters and resets the cached info.
2110 */
2111static void drain_stock(struct memcg_stock_pcp *stock)
2112{
2113        struct mem_cgroup *old = stock->cached;
2114
2115        if (!old)
2116                return;
2117
2118        if (stock->nr_pages) {
2119                page_counter_uncharge(&old->memory, stock->nr_pages);
2120                if (do_memsw_account())
2121                        page_counter_uncharge(&old->memsw, stock->nr_pages);
2122                stock->nr_pages = 0;
2123        }
2124
2125        css_put(&old->css);
2126        stock->cached = NULL;
2127}
2128
2129static void drain_local_stock(struct work_struct *dummy)
2130{
2131        struct memcg_stock_pcp *stock;
2132        unsigned long flags;
2133
2134        /*
2135         * The only protection from memory hotplug vs. drain_stock races is
2136         * that we always operate on local CPU stock here with IRQ disabled
2137         */
2138        local_irq_save(flags);
2139
2140        stock = this_cpu_ptr(&memcg_stock);
2141        drain_obj_stock(stock);
2142        drain_stock(stock);
2143        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2144
2145        local_irq_restore(flags);
2146}
2147
2148/*
2149 * Cache charges (nr_pages) in the local per-CPU area.
2150 * They will be consumed by consume_stock() later.
2151 */
2152static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2153{
2154        struct memcg_stock_pcp *stock;
2155        unsigned long flags;
2156
2157        local_irq_save(flags);
2158
2159        stock = this_cpu_ptr(&memcg_stock);
2160        if (stock->cached != memcg) { /* reset if necessary */
2161                drain_stock(stock);
2162                css_get(&memcg->css);
2163                stock->cached = memcg;
2164        }
2165        stock->nr_pages += nr_pages;
2166
2167        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2168                drain_stock(stock);
2169
2170        local_irq_restore(flags);
2171}
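
/*
 * Editor's summary of how the stock is used by the charge path below: when a
 * single page is charged and the limit has room, try_charge() charges a full
 * MEMCG_CHARGE_BATCH worth of pages to the page counters and hands the
 * surplus to refill_stock(); subsequent small charges on this CPU are then
 * satisfied from consume_stock() without touching the shared page counters.
 */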
2172
2173/*
2174 * Drains all per-CPU charge caches for the given root_memcg and the
2175 * subtree of the hierarchy under it.
2176 */
2177static void drain_all_stock(struct mem_cgroup *root_memcg)
2178{
2179        int cpu, curcpu;
2180
2181        /* If someone's already draining, avoid running more workers. */
2182        if (!mutex_trylock(&percpu_charge_mutex))
2183                return;
2184        /*
2185         * Notify other cpus that system-wide "drain" is running
2186         * We do not care about races with the cpu hotplug because cpu down
2187         * as well as workers from this path always operate on the local
2188         * per-cpu data. CPU up doesn't touch memcg_stock at all.
2189         */
2190        curcpu = get_cpu();
2191        for_each_online_cpu(cpu) {
2192                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2193                struct mem_cgroup *memcg;
2194                bool flush = false;
2195
2196                rcu_read_lock();
2197                memcg = stock->cached;
2198                if (memcg && stock->nr_pages &&
2199                    mem_cgroup_is_descendant(memcg, root_memcg))
2200                        flush = true;
2201                if (obj_stock_flush_required(stock, root_memcg))
2202                        flush = true;
2203                rcu_read_unlock();
2204
2205                if (flush &&
2206                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2207                        if (cpu == curcpu)
2208                                drain_local_stock(&stock->work);
2209                        else
2210                                schedule_work_on(cpu, &stock->work);
2211                }
2212        }
2213        put_cpu();
2214        mutex_unlock(&percpu_charge_mutex);
2215}
2216
2217static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
2218{
2219        int nid;
2220
2221        for_each_node(nid) {
2222                struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
2223                unsigned long stat[NR_VM_NODE_STAT_ITEMS];
2224                struct batched_lruvec_stat *lstatc;
2225                int i;
2226
2227                lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
2228                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
2229                        stat[i] = lstatc->count[i];
2230                        lstatc->count[i] = 0;
2231                }
2232
2233                do {
2234                        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
2235                                atomic_long_add(stat[i], &pn->lruvec_stat[i]);
2236                } while ((pn = parent_nodeinfo(pn, nid)));
2237        }
2238}
2239
2240static int memcg_hotplug_cpu_dead(unsigned int cpu)
2241{
2242        struct memcg_stock_pcp *stock;
2243        struct mem_cgroup *memcg;
2244
2245        stock = &per_cpu(memcg_stock, cpu);
2246        drain_stock(stock);
2247
2248        for_each_mem_cgroup(memcg)
2249                memcg_flush_lruvec_page_state(memcg, cpu);
2250
2251        return 0;
2252}
2253
2254static unsigned long reclaim_high(struct mem_cgroup *memcg,
2255                                  unsigned int nr_pages,
2256                                  gfp_t gfp_mask)
2257{
2258        unsigned long nr_reclaimed = 0;
2259
2260        do {
2261                unsigned long pflags;
2262
2263                if (page_counter_read(&memcg->memory) <=
2264                    READ_ONCE(memcg->memory.high))
2265                        continue;
2266
2267                memcg_memory_event(memcg, MEMCG_HIGH);
2268
2269                psi_memstall_enter(&pflags);
2270                nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2271                                                             gfp_mask, true);
2272                psi_memstall_leave(&pflags);
2273        } while ((memcg = parent_mem_cgroup(memcg)) &&
2274                 !mem_cgroup_is_root(memcg));
2275
2276        return nr_reclaimed;
2277}
2278
2279static void high_work_func(struct work_struct *work)
2280{
2281        struct mem_cgroup *memcg;
2282
2283        memcg = container_of(work, struct mem_cgroup, high_work);
2284        reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2285}
2286
2287/*
2288 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2289 * enough to still cause a significant slowdown in most cases, while still
2290 * allowing diagnostics and tracing to proceed without becoming stuck.
2291 */
2292#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2293
2294/*
2295 * When calculating the delay, we use these on either side of the exponentiation
2296 * to maintain precision and to scale to a reasonable number of jiffies (see the
2297 * table below).
2298 *
2299 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2300 *   overage ratio to a delay.
2301 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2302 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
2303 *   to produce a reasonable delay curve.
2304 *
2305 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2306 * reasonable delay curve compared to precision-adjusted overage, not
2307 * penalising heavily at first, but still making sure that growth beyond the
2308 * limit penalises misbehaving cgroups by slowing them down exponentially. For
2309 * example, with a high of 100 megabytes:
2310 *
2311 *  +-------+------------------------+
2312 *  | usage | time to allocate in ms |
2313 *  +-------+------------------------+
2314 *  | 100M  |                      0 |
2315 *  | 101M  |                      6 |
2316 *  | 102M  |                     25 |
2317 *  | 103M  |                     57 |
2318 *  | 104M  |                    102 |
2319 *  | 105M  |                    159 |
2320 *  | 106M  |                    230 |
2321 *  | 107M  |                    313 |
2322 *  | 108M  |                    409 |
2323 *  | 109M  |                    518 |
2324 *  | 110M  |                    639 |
2325 *  | 111M  |                    774 |
2326 *  | 112M  |                    921 |
2327 *  | 113M  |                   1081 |
2328 *  | 114M  |                   1254 |
2329 *  | 115M  |                   1439 |
2330 *  | 116M  |                   1638 |
2331 *  | 117M  |                   1849 |
2332 *  | 118M  |                   2000 |
2333 *  | 119M  |                   2000 |
2334 *  | 120M  |                   2000 |
2335 *  +-------+------------------------+
2336 */
2337#define MEMCG_DELAY_PRECISION_SHIFT 20
2338#define MEMCG_DELAY_SCALING_SHIFT 14
2339
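/*
 * Worked example for one row of the table above (editor's illustration):
 * with memory.high = 100M and usage = 110M,
 *
 *	overage = ((110M - 100M) << MEMCG_DELAY_PRECISION_SHIFT) / 100M
 *	        ~= 0.1 * 2^20
 *
 * and, before the per-batch scaling at the end of calculate_high_delay(),
 *
 *	penalty_jiffies = overage^2 * HZ >> 20 >> 14 ~= 0.01 * 2^6 * HZ
 *
 * i.e. about 0.64s of throttling per batch, matching the 639ms table entry.
 */
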
2340static u64 calculate_overage(unsigned long usage, unsigned long high)
2341{
2342        u64 overage;
2343
2344        if (usage <= high)
2345                return 0;
2346
2347        /*
2348         * Prevent division by 0 in overage calculation by acting as if
2349         * it was a threshold of 1 page
2350         */
2351        high = max(high, 1UL);
2352
2353        overage = usage - high;
2354        overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2355        return div64_u64(overage, high);
2356}
2357
2358static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2359{
2360        u64 overage, max_overage = 0;
2361
2362        do {
2363                overage = calculate_overage(page_counter_read(&memcg->memory),
2364                                            READ_ONCE(memcg->memory.high));
2365                max_overage = max(overage, max_overage);
2366        } while ((memcg = parent_mem_cgroup(memcg)) &&
2367                 !mem_cgroup_is_root(memcg));
2368
2369        return max_overage;
2370}
2371
2372static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2373{
2374        u64 overage, max_overage = 0;
2375
2376        do {
2377                overage = calculate_overage(page_counter_read(&memcg->swap),
2378                                            READ_ONCE(memcg->swap.high));
2379                if (overage)
2380                        memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2381                max_overage = max(overage, max_overage);
2382        } while ((memcg = parent_mem_cgroup(memcg)) &&
2383                 !mem_cgroup_is_root(memcg));
2384
2385        return max_overage;
2386}
2387
2388/*
2389 * Get the number of jiffies for which to penalise a mischievous cgroup that
2390 * is exceeding its memory.high, by checking both it and its ancestors.
2391 */
2392static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2393                                          unsigned int nr_pages,
2394                                          u64 max_overage)
2395{
2396        unsigned long penalty_jiffies;
2397
2398        if (!max_overage)
2399                return 0;
2400
2401        /*
2402         * We use overage compared to memory.high to calculate the number of
2403         * jiffies to sleep (penalty_jiffies). Ideally this value should be
2404         * fairly lenient on small overages, and increasingly harsh when the
2405         * memcg in question makes it clear that it has no intention of stopping
2406         * its crazy behaviour, so we exponentially increase the delay based on
2407         * overage amount.
2408         */
2409        penalty_jiffies = max_overage * max_overage * HZ;
2410        penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2411        penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2412
2413        /*
2414         * Factor in the task's own contribution to the overage, such that four
2415         * N-sized allocations are throttled approximately the same as one
2416         * 4N-sized allocation.
2417         *
2418         * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2419         * larger the current charge batch is than that.
2420         */
2421        return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2422}
2423
2424/*
2425 * Scheduled by try_charge() to be executed from the userland return path
2426 * and reclaims memory over the high limit.
2427 */
2428void mem_cgroup_handle_over_high(void)
2429{
2430        unsigned long penalty_jiffies;
2431        unsigned long pflags;
2432        unsigned long nr_reclaimed;
2433        unsigned int nr_pages = current->memcg_nr_pages_over_high;
2434        int nr_retries = MAX_RECLAIM_RETRIES;
2435        struct mem_cgroup *memcg;
2436        bool in_retry = false;
2437
2438        if (likely(!nr_pages))
2439                return;
2440
2441        memcg = get_mem_cgroup_from_mm(current->mm);
2442        current->memcg_nr_pages_over_high = 0;
2443
2444retry_reclaim:
2445        /*
2446         * The allocating task should reclaim at least the batch size, but for
2447         * subsequent retries we only want to do what's necessary to prevent oom
2448         * or breaching resource isolation.
2449         *
2450         * This is distinct from memory.max or page allocator behaviour because
2451         * memory.high is currently batched, whereas memory.max and the page
2452         * allocator run every time an allocation is made.
2453         */
2454        nr_reclaimed = reclaim_high(memcg,
2455                                    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2456                                    GFP_KERNEL);
2457
2458        /*
2459         * memory.high is breached and reclaim is unable to keep up. Throttle
2460         * allocators proactively to slow down excessive growth.
2461         */
2462        penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2463                                               mem_find_max_overage(memcg));
2464
2465        penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2466                                                swap_find_max_overage(memcg));
2467
2468        /*
2469         * Clamp the max delay per usermode return so as to still keep the
2470         * application moving forwards and also permit diagnostics, albeit
2471         * extremely slowly.
2472         */
2473        penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2474
2475        /*
2476         * Don't sleep if the amount of jiffies this memcg owes us is so low
2477         * that it's not even worth doing, in an attempt to be nice to those who
2478         * go only a small amount over their memory.high value and maybe haven't
2479         * been aggressively reclaimed enough yet.
2480         */
2481        if (penalty_jiffies <= HZ / 100)
2482                goto out;
2483
2484        /*
2485         * If reclaim is making forward progress but we're still over
2486         * memory.high, we want to encourage that rather than doing allocator
2487         * throttling.
2488         */
2489        if (nr_reclaimed || nr_retries--) {
2490                in_retry = true;
2491                goto retry_reclaim;
2492        }
2493
2494        /*
2495         * If we exit early, we're guaranteed to die (since
2496         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2497         * need to account for any ill-begotten jiffies to pay them off later.
2498         */
2499        psi_memstall_enter(&pflags);
2500        schedule_timeout_killable(penalty_jiffies);
2501        psi_memstall_leave(&pflags);
2502
2503out:
2504        css_put(&memcg->css);
2505}
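
/*
 * Editor's note on the hook-up (hedged, based on kernels of this vintage):
 * the debt is recorded and paid roughly as
 *
 *	try_charge()
 *		current->memcg_nr_pages_over_high += batch;
 *		set_notify_resume(current);
 *	...
 *	return to userspace -> tracehook_notify_resume()
 *		mem_cgroup_handle_over_high();
 *
 * so the sleep happens with no locks held and is charged to the task that
 * actually over-allocated.
 */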
2506
2507static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2508                      unsigned int nr_pages)
2509{
2510        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2511        int nr_retries = MAX_RECLAIM_RETRIES;
2512        struct mem_cgroup *mem_over_limit;
2513        struct page_counter *counter;
2514        enum oom_status oom_status;
2515        unsigned long nr_reclaimed;
2516        bool may_swap = true;
2517        bool drained = false;
2518        unsigned long pflags;
2519
2520        if (mem_cgroup_is_root(memcg))
2521                return 0;
2522retry:
2523        if (consume_stock(memcg, nr_pages))
2524                return 0;
2525
2526        if (!do_memsw_account() ||
2527            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2528                if (page_counter_try_charge(&memcg->memory, batch, &counter))
2529                        goto done_restock;
2530                if (do_memsw_account())
2531                        page_counter_uncharge(&memcg->memsw, batch);
2532                mem_over_limit = mem_cgroup_from_counter(counter, memory);
2533        } else {
2534                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2535                may_swap = false;
2536        }
2537
2538        if (batch > nr_pages) {
2539                batch = nr_pages;
2540                goto retry;
2541        }
2542
2543        /*
2544         * Memcg doesn't have a dedicated reserve for atomic
2545         * allocations. But like the global atomic pool, we need to
2546         * put the burden of reclaim on regular allocation requests
2547         * and let these go through as privileged allocations.
2548         */
2549        if (gfp_mask & __GFP_ATOMIC)
2550                goto force;
2551
2552        /*
2553         * Unlike in global OOM situations, memcg is not in a physical
2554         * memory shortage.  Allow dying and OOM-killed tasks to
2555         * bypass the last charges so that they can exit quickly and
2556         * free their memory.
2557         */
2558        if (unlikely(should_force_charge()))
2559                goto force;
2560
2561        /*
2562         * Prevent unbounded recursion when reclaim operations need to
2563         * allocate memory. This might exceed the limits temporarily,
2564         * but we prefer facilitating memory reclaim and getting back
2565         * under the limit over triggering OOM kills in these cases.
2566         */
2567        if (unlikely(current->flags & PF_MEMALLOC))
2568                goto force;
2569
2570        if (unlikely(task_in_memcg_oom(current)))
2571                goto nomem;
2572
2573        if (!gfpflags_allow_blocking(gfp_mask))
2574                goto nomem;
2575
2576        memcg_memory_event(mem_over_limit, MEMCG_MAX);
2577
2578        psi_memstall_enter(&pflags);
2579        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2580                                                    gfp_mask, may_swap);
2581        psi_memstall_leave(&pflags);
2582
2583        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2584                goto retry;
2585
2586        if (!drained) {
2587                drain_all_stock(mem_over_limit);
2588                drained = true;
2589                goto retry;
2590        }
2591
2592        if (gfp_mask & __GFP_NORETRY)
2593                goto nomem;
2594        /*
2595         * Even though the limit is exceeded at this point, reclaim
2596         * may have been able to free some pages.  Retry the charge
2597         * before killing the task.
2598         *
2599         * Only for regular pages, though: huge pages are rather
2600         * unlikely to succeed so close to the limit, and we fall back
2601         * to regular pages anyway in case of failure.
2602         */
2603        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2604                goto retry;
2605        /*
2606         * During task move, charges can be counted twice. So it's better
2607         * to wait until the move has finished if one is in progress.
2608         */
2609        if (mem_cgroup_wait_acct_move(mem_over_limit))
2610                goto retry;
2611
2612        if (nr_retries--)
2613                goto retry;
2614
2615        if (gfp_mask & __GFP_RETRY_MAYFAIL)
2616                goto nomem;
2617
2618        if (fatal_signal_pending(current))
2619                goto force;
2620
2621        /*
2622         * Keep retrying as long as the memcg OOM killer is able to make
2623         * forward progress, or bypass the charge if the OOM killer
2624         * couldn't make any progress.
2625         */
2626        oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2627                       get_order(nr_pages * PAGE_SIZE));
2628        switch (oom_status) {
2629        case OOM_SUCCESS:
2630                nr_retries = MAX_RECLAIM_RETRIES;
2631                goto retry;
2632        case OOM_FAILED:
2633                goto force;
2634        default:
2635                goto nomem;
2636        }
2637nomem:
2638        if (!(gfp_mask & __GFP_NOFAIL))
2639                return -ENOMEM;
2640force:
2641        /*
2642         * The allocation either can't fail or will lead to more memory
2643         * being freed very soon.  Allow memory usage go over the limit
2644         * temporarily by force charging it.
2645         */
2646        page_counter_charge(&memcg->memory, nr_pages);
2647        if (do_memsw_account())
2648                page_counter_charge(&memcg->memsw, nr_pages);
2649
2650        return 0;
2651
2652done_restock:
2653        if (batch > nr_pages)
2654                refill_stock(memcg, batch - nr_pages);
2655
2656        /*
2657         * If the hierarchy is above the normal consumption range, schedule
2658         * reclaim on returning to userland.  We can perform reclaim here
2659         * if __GFP_RECLAIM but let's always punt for simplicity and so that
2660         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2661         * not recorded as it most likely matches current's and won't
2662         * change in the meantime.  As high limit is checked again before
2663         * reclaim, the cost of mismatch is negligible.
2664         */
2665        do {
2666                bool mem_high, swap_high;
2667
2668                mem_high = page_counter_read(&memcg->memory) >
2669                        READ_ONCE(memcg->memory.high);
2670                swap_high = page_counter_read(&memcg->swap) >
2671                        READ_ONCE(memcg->swap.high);
2672
2673                /* Don't bother a random interrupted task */
2674                if (in_interrupt()) {
2675                        if (mem_high) {
2676                                schedule_work(&memcg->high_work);
2677                                break;
2678                        }
2679                        continue;
2680                }
2681
2682                if (mem_high || swap_high) {
2683                        /*
2684                         * The allocating tasks in this cgroup will need to do
2685                         * reclaim or be throttled to prevent further growth
2686                         * of the memory or swap footprints.
2687                         *
2688                         * Target some best-effort fairness between the tasks,
2689                         * and distribute reclaim work and delay penalties
2690                         * based on how much each task is actually allocating.
2691                         */
2692                        current->memcg_nr_pages_over_high += batch;
2693                        set_notify_resume(current);
2694                        break;
2695                }
2696        } while ((memcg = parent_mem_cgroup(memcg)));
2697
2698        return 0;
2699}
2700
2701#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2702static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2703{
2704        if (mem_cgroup_is_root(memcg))
2705                return;
2706
2707        page_counter_uncharge(&memcg->memory, nr_pages);
2708        if (do_memsw_account())
2709                page_counter_uncharge(&memcg->memsw, nr_pages);
2710}
2711#endif
2712
2713static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2714{
2715        VM_BUG_ON_PAGE(page_memcg(page), page);
2716        /*
2717         * Any of the following ensures page's memcg stability:
2718         *
2719         * - the page lock
2720         * - LRU isolation
2721         * - lock_page_memcg()
2722         * - exclusive reference
2723         */
2724        page->memcg_data = (unsigned long)memcg;
2725}
2726
2727static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
2728{
2729        struct mem_cgroup *memcg;
2730
2731        rcu_read_lock();
2732retry:
2733        memcg = obj_cgroup_memcg(objcg);
2734        if (unlikely(!css_tryget(&memcg->css)))
2735                goto retry;
2736        rcu_read_unlock();
2737
2738        return memcg;
2739}
2740
2741#ifdef CONFIG_MEMCG_KMEM
2742/*
2743 * The allocated objcg pointers array is not accounted directly.
2744 * Moreover, it should not come from a DMA buffer and is not readily
2745 * reclaimable. So those GFP bits should be masked off.
2746 */
2747#define OBJCGS_CLEAR_MASK       (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2748
2749int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2750                                 gfp_t gfp, bool new_page)
2751{
2752        unsigned int objects = objs_per_slab_page(s, page);
2753        unsigned long memcg_data;
2754        void *vec;
2755
2756        gfp &= ~OBJCGS_CLEAR_MASK;
2757        vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2758                           page_to_nid(page));
2759        if (!vec)
2760                return -ENOMEM;
2761
2762        memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2763        if (new_page) {
2764                /*
2765                 * If the slab page is brand new and nobody can yet access
2766                 * its memcg_data, no synchronization is required and
2767                 * memcg_data can be simply assigned.
2768                 */
2769                page->memcg_data = memcg_data;
2770        } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
2771                /*
2772                 * If the slab page is already in use, somebody can allocate
2773                 * and assign obj_cgroups in parallel. In this case the existing
2774                 * objcg vector should be reused.
2775                 */
2776                kfree(vec);
2777                return 0;
2778        }
2779
2780        kmemleak_not_leak(vec);
2781        return 0;
2782}
2783
2784/*
2785 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2786 *
2787 * A passed kernel object can be a slab object or a generic kernel page, so
2788 * different mechanisms for getting the memory cgroup pointer should be used.
2789 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
2790 * cannot know for sure how the kernel object is implemented.
2791 * mem_cgroup_from_obj() can be safely used in such cases.
2792 *
2793 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2794 * cgroup_mutex, etc.
2795 */
2796struct mem_cgroup *mem_cgroup_from_obj(void *p)
2797{
2798        struct page *page;
2799
2800        if (mem_cgroup_disabled())
2801                return NULL;
2802
2803        page = virt_to_head_page(p);
2804
2805        /*
2806         * Slab objects are accounted individually, not per-page.
2807         * Memcg membership data for each individual object is saved in
2808         * the page->obj_cgroups.
2809         */
2810        if (page_objcgs_check(page)) {
2811                struct obj_cgroup *objcg;
2812                unsigned int off;
2813
2814                off = obj_to_index(page->slab_cache, page, p);
2815                objcg = page_objcgs(page)[off];
2816                if (objcg)
2817                        return obj_cgroup_memcg(objcg);
2818
2819                return NULL;
2820        }
2821
2822        /*
2823         * page_memcg_check() is used here, because the page_objcgs_check()
2824         * call above could fail because the object cgroups vector wasn't set
2825         * at that moment, but it can be set concurrently.
2826         * page_memcg_check(page) will guarantee that a proper memory
2827         * cgroup pointer or NULL will be returned.
2828         */
2829        return page_memcg_check(page);
2830}
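
/*
 * Usage sketch (editor's addition, @ptr is a hypothetical charged object):
 * callers typically look the memcg up under RCU, since nothing else pins it:
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_obj(ptr);
 *	if (memcg)
 *		idx = memcg_cache_id(memcg);
 *	rcu_read_unlock();
 */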
2831
2832__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2833{
2834        struct obj_cgroup *objcg = NULL;
2835        struct mem_cgroup *memcg;
2836
2837        if (memcg_kmem_bypass())
2838                return NULL;
2839
2840        rcu_read_lock();
2841        if (unlikely(active_memcg()))
2842                memcg = active_memcg();
2843        else
2844                memcg = mem_cgroup_from_task(current);
2845
2846        for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
2847                objcg = rcu_dereference(memcg->objcg);
2848                if (objcg && obj_cgroup_tryget(objcg))
2849                        break;
2850                objcg = NULL;
2851        }
2852        rcu_read_unlock();
2853
2854        return objcg;
2855}
2856
2857static int memcg_alloc_cache_id(void)
2858{
2859        int id, size;
2860        int err;
2861
2862        id = ida_simple_get(&memcg_cache_ida,
2863                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2864        if (id < 0)
2865                return id;
2866
2867        if (id < memcg_nr_cache_ids)
2868                return id;
2869
2870        /*
2871         * There's no space for the new id in memcg_caches arrays,
2872         * so we have to grow them.
2873         */
2874        down_write(&memcg_cache_ids_sem);
2875
2876        size = 2 * (id + 1);
2877        if (size < MEMCG_CACHES_MIN_SIZE)
2878                size = MEMCG_CACHES_MIN_SIZE;
2879        else if (size > MEMCG_CACHES_MAX_SIZE)
2880                size = MEMCG_CACHES_MAX_SIZE;
2881
2882        err = memcg_update_all_list_lrus(size);
2883        if (!err)
2884                memcg_nr_cache_ids = size;
2885
2886        up_write(&memcg_cache_ids_sem);
2887
2888        if (err) {
2889                ida_simple_remove(&memcg_cache_ida, id);
2890                return err;
2891        }
2892        return id;
2893}
2894
2895static void memcg_free_cache_id(int id)
2896{
2897        ida_simple_remove(&memcg_cache_ida, id);
2898}
2899
2900/*
2901 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
2902 * @objcg: object cgroup to uncharge
2903 * @nr_pages: number of pages to uncharge
2904 */
2905static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2906                                      unsigned int nr_pages)
2907{
2908        struct mem_cgroup *memcg;
2909
2910        memcg = get_mem_cgroup_from_objcg(objcg);
2911
2912        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2913                page_counter_uncharge(&memcg->kmem, nr_pages);
2914        refill_stock(memcg, nr_pages);
2915
2916        css_put(&memcg->css);
2917}
2918
2919/*
2920 * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
2921 * @objcg: object cgroup to charge
2922 * @gfp: reclaim mode
2923 * @nr_pages: number of pages to charge
2924 *
2925 * Returns 0 on success, an error code on failure.
2926 */
2927static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2928                                   unsigned int nr_pages)
2929{
2930        struct page_counter *counter;
2931        struct mem_cgroup *memcg;
2932        int ret;
2933
2934        memcg = get_mem_cgroup_from_objcg(objcg);
2935
2936        ret = try_charge(memcg, gfp, nr_pages);
2937        if (ret)
2938                goto out;
2939
2940        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2941            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2942
2943                /*
2944                 * Enforce __GFP_NOFAIL allocation because callers are not
2945                 * prepared to see failures and likely do not have any failure
2946                 * handling code.
2947                 */
2948                if (gfp & __GFP_NOFAIL) {
2949                        page_counter_charge(&memcg->kmem, nr_pages);
2950                        goto out;
2951                }
2952                cancel_charge(memcg, nr_pages);
2953                ret = -ENOMEM;
2954        }
2955out:
2956        css_put(&memcg->css);
2957
2958        return ret;
2959}
2960
2961/**
2962 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2963 * @page: page to charge
2964 * @gfp: reclaim mode
2965 * @order: allocation order
2966 *
2967 * Returns 0 on success, an error code on failure.
2968 */
2969int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2970{
2971        struct obj_cgroup *objcg;
2972        int ret = 0;
2973
2974        objcg = get_obj_cgroup_from_current();
2975        if (objcg) {
2976                ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
2977                if (!ret) {
2978                        page->memcg_data = (unsigned long)objcg |
2979                                MEMCG_DATA_KMEM;
2980                        return 0;
2981                }
2982                obj_cgroup_put(objcg);
2983        }
2984        return ret;
2985}
2986
2987/**
2988 * __memcg_kmem_uncharge_page: uncharge a kmem page
2989 * @page: page to uncharge
2990 * @order: allocation order
2991 */
2992void __memcg_kmem_uncharge_page(struct page *page, int order)
2993{
2994        struct obj_cgroup *objcg;
2995        unsigned int nr_pages = 1 << order;
2996
2997        if (!PageMemcgKmem(page))
2998                return;
2999
3000        objcg = __page_objcg(page);
3001        obj_cgroup_uncharge_pages(objcg, nr_pages);
3002        page->memcg_data = 0;
3003        obj_cgroup_put(objcg);
3004}
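
/*
 * Usage sketch (editor's addition): these two entry points back __GFP_ACCOUNT
 * page allocations, so a hypothetical buffer charged to the allocating task's
 * cgroup needs nothing more than
 *
 *	page = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 *	...
 *	__free_pages(page, order);
 *
 * with the page allocator calling __memcg_kmem_charge_page() and
 * __memcg_kmem_uncharge_page() on its behalf.
 */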
3005
3006static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3007{
3008        struct memcg_stock_pcp *stock;
3009        unsigned long flags;
3010        bool ret = false;
3011
3012        local_irq_save(flags);
3013
3014        stock = this_cpu_ptr(&memcg_stock);
3015        if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3016                stock->nr_bytes -= nr_bytes;
3017                ret = true;
3018        }
3019
3020        local_irq_restore(flags);
3021
3022        return ret;
3023}
3024
3025static void drain_obj_stock(struct memcg_stock_pcp *stock)
3026{
3027        struct obj_cgroup *old = stock->cached_objcg;
3028
3029        if (!old)
3030                return;
3031
3032        if (stock->nr_bytes) {
3033                unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3034                unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3035
3036                if (nr_pages)
3037                        obj_cgroup_uncharge_pages(old, nr_pages);
3038
3039                /*
3040                 * The leftover is flushed to the centralized per-memcg value.
3041                 * On the next attempt to refill obj stock it will be moved
3042                 * to a per-cpu stock (probably, on another CPU), see
3043                 * refill_obj_stock().
3044                 *
3045                 * How often it's flushed is a trade-off between the memory
3046                 * limit enforcement accuracy and potential CPU contention,
3047                 * so it might be changed in the future.
3048                 */
3049                atomic_add(nr_bytes, &old->nr_charged_bytes);
3050                stock->nr_bytes = 0;
3051        }
3052
3053        obj_cgroup_put(old);
3054        stock->cached_objcg = NULL;
3055}
3056
3057static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3058                                     struct mem_cgroup *root_memcg)
3059{
3060        struct mem_cgroup *memcg;
3061
3062        if (stock->cached_objcg) {
3063                memcg = obj_cgroup_memcg(stock->cached_objcg);
3064                if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3065                        return true;
3066        }
3067
3068        return false;
3069}
3070
3071static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3072{
3073        struct memcg_stock_pcp *stock;
3074        unsigned long flags;
3075
3076        local_irq_save(flags);
3077
3078        stock = this_cpu_ptr(&memcg_stock);
3079        if (stock->cached_objcg != objcg) { /* reset if necessary */
3080                drain_obj_stock(stock);
3081                obj_cgroup_get(objcg);
3082                stock->cached_objcg = objcg;
3083                stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3084        }
3085        stock->nr_bytes += nr_bytes;
3086
3087        if (stock->nr_bytes > PAGE_SIZE)
3088                drain_obj_stock(stock);
3089
3090        local_irq_restore(flags);
3091}
3092
3093int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3094{
3095        unsigned int nr_pages, nr_bytes;
3096        int ret;
3097
3098        if (consume_obj_stock(objcg, size))
3099                return 0;
3100
3101        /*
3102         * In theory, objcg->nr_charged_bytes can have enough
3103         * pre-charged bytes to satisfy the allocation. However,
3104         * flushing objcg->nr_charged_bytes requires two atomic
3105         * operations, and objcg->nr_charged_bytes can't be big,
3106         * so it's better to ignore it and try to grab some new pages.
3107         * objcg->nr_charged_bytes will be flushed in
3108         * refill_obj_stock(), called from this function or
3109         * independently later.
3110         */
3111        nr_pages = size >> PAGE_SHIFT;
3112        nr_bytes = size & (PAGE_SIZE - 1);
3113
3114        if (nr_bytes)
3115                nr_pages += 1;
3116
3117        ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3118        if (!ret && nr_bytes)
3119                refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3120
3121        return ret;
3122}
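
/*
 * Example flow, assuming PAGE_SIZE == 4096 and an empty per-cpu stock:
 * obj_cgroup_charge(objcg, GFP_KERNEL, 700) misses consume_obj_stock(),
 * charges one full page (nr_pages = 1) and then calls
 * refill_obj_stock(objcg, 4096 - 700), leaving 3396 pre-charged bytes in
 * the stock for subsequent sub-page allocations from the same objcg.
 */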
3123
3124void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3125{
3126        refill_obj_stock(objcg, size);
3127}
3128
3129#endif /* CONFIG_MEMCG_KMEM */
3130
3131/*
3132 * Because page_memcg() is not set on tail pages, set it from the head now.
3133 */
3134void split_page_memcg(struct page *head, unsigned int nr)
3135{
3136        struct mem_cgroup *memcg = page_memcg(head);
3137        int i;
3138
3139        if (mem_cgroup_disabled() || !memcg)
3140                return;
3141
3142        for (i = 1; i < nr; i++)
3143                head[i].memcg_data = head->memcg_data;
3144
3145        if (PageMemcgKmem(head))
3146                obj_cgroup_get_many(__page_objcg(head), nr - 1);
3147        else
3148                css_get_many(&memcg->css, nr - 1);
3149}
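
/*
 * For example, splitting a compound page with nr == 4 copies
 * head->memcg_data into head[1..3].memcg_data, so page_memcg() (or
 * __page_objcg() for kmem pages) returns the same owner for every
 * subpage afterwards; the nr - 1 extra objcg/css references taken above
 * back those new owners.
 */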
3150
3151#ifdef CONFIG_MEMCG_SWAP
3152/**
3153 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3154 * @entry: swap entry to be moved
3155 * @from:  mem_cgroup which the entry is moved from
3156 * @to:  mem_cgroup which the entry is moved to
3157 *
3158 * It succeeds only when the swap_cgroup's record for this entry is the same
3159 * as the mem_cgroup's id of @from.
3160 *
3161 * Returns 0 on success, -EINVAL on failure.
3162 *
3163 * The caller must have charged to @to, IOW, called page_counter_charge() for
3164 * both res and memsw, and called css_get().
3165 */
3166static int mem_cgroup_move_swap_account(swp_entry_t entry,
3167                                struct mem_cgroup *from, struct mem_cgroup *to)
3168{
3169        unsigned short old_id, new_id;
3170
3171        old_id = mem_cgroup_id(from);
3172        new_id = mem_cgroup_id(to);
3173
3174        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3175                mod_memcg_state(from, MEMCG_SWAP, -1);
3176                mod_memcg_state(to, MEMCG_SWAP, 1);
3177                return 0;
3178        }
3179        return -EINVAL;
3180}
3181#else
3182static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3183                                struct mem_cgroup *from, struct mem_cgroup *to)
3184{
3185        return -EINVAL;
3186}
3187#endif
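
/*
 * Example for mem_cgroup_move_swap_account(): moving entry E from @from
 * (id 5) to @to (id 9).  If the swap_cgroup record for E still holds 5,
 * the cmpxchg stores 9, returns 5, and the MEMCG_SWAP counters are
 * shifted from @from to @to.  If the entry was freed or moved in the
 * meantime, the record no longer reads 5, the cmpxchg leaves it alone
 * and the move is refused with -EINVAL.
 */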
3188
3189static DEFINE_MUTEX(memcg_max_mutex);
3190
3191static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3192                                 unsigned long max, bool memsw)
3193{
3194        bool enlarge = false;
3195        bool drained = false;
3196        int ret;
3197        bool limits_invariant;
3198        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3199
3200        do {
3201                if (signal_pending(current)) {
3202                        ret = -EINTR;
3203                        break;
3204                }
3205
3206                mutex_lock(&memcg_max_mutex);
3207                /*
3208                 * Make sure that the new limit (memsw or memory limit) doesn't
3209                 * break our basic invariant rule memory.max <= memsw.max.
3210                 */
3211                limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3212                                           max <= memcg->memsw.max;
3213                if (!limits_invariant) {
3214                        mutex_unlock(&memcg_max_mutex);
3215                        ret = -EINVAL;
3216                        break;
3217                }
3218                if (max > counter->max)
3219                        enlarge = true;
3220                ret = page_counter_set_max(counter, max);
3221                mutex_unlock(&memcg_max_mutex);
3222
3223                if (!ret)
3224                        break;
3225
3226                if (!drained) {
3227                        drain_all_stock(memcg);
3228                        drained = true;
3229                        continue;
3230                }
3231
3232                if (!try_to_free_mem_cgroup_pages(memcg, 1,
3233                                        GFP_KERNEL, !memsw)) {
3234                        ret = -EBUSY;
3235                        break;
3236                }
3237        } while (true);
3238
3239        if (!ret && enlarge)
3240                memcg_oom_recover(memcg);
3241
3242        return ret;
3243}
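
/*
 * Example of the invariant check in mem_cgroup_resize_max(): with
 * memory.max == 512M and memsw.max == 1G, lowering memsw.max to 256M
 * fails with -EINVAL (256M < memory.max), and raising memory.max to 2G
 * fails as well (2G > memsw.max).  The two limits have to be adjusted in
 * an order that keeps memory.max <= memsw.max at every step.
 */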
3244
3245unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3246                                            gfp_t gfp_mask,
3247                                            unsigned long *total_scanned)
3248{
3249        unsigned long nr_reclaimed = 0;
3250        struct mem_cgroup_per_node *mz, *next_mz = NULL;
3251        unsigned long reclaimed;
3252        int loop = 0;
3253        struct mem_cgroup_tree_per_node *mctz;
3254        unsigned long excess;
3255        unsigned long nr_scanned;
3256
3257        if (order > 0)
3258                return 0;
3259
3260        mctz = soft_limit_tree_node(pgdat->node_id);
3261
3262        /*
3263         * Do not even bother to check the largest node if the root
3264         * is empty. Do it lockless to prevent lock bouncing. Races
3265         * are acceptable as soft limit is best effort anyway.
3266         */
3267        if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3268                return 0;
3269
3270        /*
3271         * This loop can run for a while, especially if mem_cgroups continuously
3272         * keep exceeding their soft limit and putting the system under
3273         * pressure.
3274         */
3275        do {
3276                if (next_mz)
3277                        mz = next_mz;
3278                else
3279                        mz = mem_cgroup_largest_soft_limit_node(mctz);
3280                if (!mz)
3281                        break;
3282
3283                nr_scanned = 0;
3284                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3285                                                    gfp_mask, &nr_scanned);
3286                nr_reclaimed += reclaimed;
3287                *total_scanned += nr_scanned;
3288                spin_lock_irq(&mctz->lock);
3289                __mem_cgroup_remove_exceeded(mz, mctz);
3290
3291                /*
3292                 * If we failed to reclaim anything from this memory cgroup
3293                 * it is time to move on to the next cgroup
3294                 */
3295                next_mz = NULL;
3296                if (!reclaimed)
3297                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3298
3299                excess = soft_limit_excess(mz->memcg);
3300                /*
3301                 * One school of thought says that we should not add
3302                 * back the node to the tree if reclaim returns 0.
3303                 * But our reclaim could return 0 simply because, due
3304                 * to priority, we are exposing a smaller subset of
3305                 * memory to reclaim from. Consider this a longer-term
3306                 * TODO.
3307                 */
3308                /* If excess == 0, no tree ops */
3309                __mem_cgroup_insert_exceeded(mz, mctz, excess);
3310                spin_unlock_irq(&mctz->lock);
3311                css_put(&mz->memcg->css);
3312                loop++;
3313                /*
3314                 * Could not reclaim anything and there are no more
3315                 * mem cgroups to try or we seem to be looping without
3316                 * reclaiming anything.
3317                 */
3318                if (!nr_reclaimed &&
3319                        (next_mz == NULL ||
3320                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3321                        break;
3322        } while (!nr_reclaimed);
3323        if (next_mz)
3324                css_put(&next_mz->memcg->css);
3325        return nr_reclaimed;
3326}
3327
3328/*
3329 * Reclaims as many pages from the given memcg as possible.
3330 *
3331 * Caller is responsible for holding css reference for memcg.
3332 */
3333static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3334{
3335        int nr_retries = MAX_RECLAIM_RETRIES;
3336
3337        /* we call try-to-free pages to make this cgroup empty */
3338        lru_add_drain_all();
3339
3340        drain_all_stock(memcg);
3341
3342        /* try to free all pages in this cgroup */
3343        while (nr_retries && page_counter_read(&memcg->memory)) {
3344                int progress;
3345
3346                if (signal_pending(current))
3347                        return -EINTR;
3348
3349                progress = try_to_free_mem_cgroup_pages(memcg, 1,
3350                                                        GFP_KERNEL, true);
3351                if (!progress) {
3352                        nr_retries--;
3353                        /* maybe some writeback is necessary */
3354                        congestion_wait(BLK_RW_ASYNC, HZ/10);
3355                }
3356
3357        }
3358
3359        return 0;
3360}
3361
3362static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3363                                            char *buf, size_t nbytes,
3364                                            loff_t off)
3365{
3366        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3367
3368        if (mem_cgroup_is_root(memcg))
3369                return -EINVAL;
3370        return mem_cgroup_force_empty(memcg) ?: nbytes;
3371}
3372
3373static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3374                                     struct cftype *cft)
3375{
3376        return 1;
3377}
3378
3379static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3380                                      struct cftype *cft, u64 val)
3381{
3382        if (val == 1)
3383                return 0;
3384
3385        pr_warn_once("Non-hierarchical mode is deprecated. "
3386                     "Please report your usecase to linux-mm@kvack.org if you "
3387                     "depend on this functionality.\n");
3388
3389        return -EINVAL;
3390}
3391
3392static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3393{
3394        unsigned long val;
3395
3396        if (mem_cgroup_is_root(memcg)) {
3397                /* mem_cgroup_threshold() calls this from irq-safe context */
3398                cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
3399                val = memcg_page_state(memcg, NR_FILE_PAGES) +
3400                        memcg_page_state(memcg, NR_ANON_MAPPED);
3401                if (swap)
3402                        val += memcg_page_state(memcg, MEMCG_SWAP);
3403        } else {
3404                if (!swap)
3405                        val = page_counter_read(&memcg->memory);
3406                else
3407                        val = page_counter_read(&memcg->memsw);
3408        }
3409        return val;
3410}
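
/*
 * For example, with 100 file pages, 50 anon pages and 10 swap entries
 * charged, the root memcg reports a memsw usage of 160 pages computed
 * from the vmstats above, whereas any other memcg simply reports
 * page_counter_read(&memcg->memsw).
 */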
3411
3412enum {
3413        RES_USAGE,
3414        RES_LIMIT,
3415        RES_MAX_USAGE,
3416        RES_FAILCNT,
3417        RES_SOFT_LIMIT,
3418};
3419
3420static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3421                               struct cftype *cft)
3422{
3423        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3424        struct page_counter *counter;
3425
3426        switch (MEMFILE_TYPE(cft->private)) {
3427        case _MEM:
3428                counter = &memcg->memory;
3429                break;
3430        case _MEMSWAP:
3431                counter = &memcg->memsw;
3432                break;
3433        case _KMEM:
3434                counter = &memcg->kmem;
3435                break;
3436        case _TCP:
3437                counter = &memcg->tcpmem;
3438                break;
3439        default:
3440                BUG();
3441        }
3442
3443        switch (MEMFILE_ATTR(cft->private)) {
3444        case RES_USAGE:
3445                if (counter == &memcg->memory)
3446                        return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3447                if (counter == &memcg->memsw)
3448                        return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3449                return (u64)page_counter_read(counter) * PAGE_SIZE;
3450        case RES_LIMIT:
3451                return (u64)counter->max * PAGE_SIZE;
3452        case RES_MAX_USAGE:
3453                return (u64)counter->watermark * PAGE_SIZE;
3454        case RES_FAILCNT:
3455                return counter->failcnt;
3456        case RES_SOFT_LIMIT:
3457                return (u64)memcg->soft_limit * PAGE_SIZE;
3458        default:
3459                BUG();
3460        }
3461}
3462
3463#ifdef CONFIG_MEMCG_KMEM
3464static int memcg_online_kmem(struct mem_cgroup *memcg)
3465{
3466        struct obj_cgroup *objcg;
3467        int memcg_id;
3468
3469        if (cgroup_memory_nokmem)
3470                return 0;
3471
3472        BUG_ON(memcg->kmemcg_id >= 0);
3473        BUG_ON(memcg->kmem_state);
3474
3475        memcg_id = memcg_alloc_cache_id();
3476        if (memcg_id < 0)
3477                return memcg_id;
3478
3479        objcg = obj_cgroup_alloc();
3480        if (!objcg) {
3481                memcg_free_cache_id(memcg_id);
3482                return -ENOMEM;
3483        }
3484        objcg->memcg = memcg;
3485        rcu_assign_pointer(memcg->objcg, objcg);
3486
3487        static_branch_enable(&memcg_kmem_enabled_key);
3488
3489        memcg->kmemcg_id = memcg_id;
3490        memcg->kmem_state = KMEM_ONLINE;
3491
3492        return 0;
3493}
3494
3495static void memcg_offline_kmem(struct mem_cgroup *memcg)
3496{
3497        struct cgroup_subsys_state *css;
3498        struct mem_cgroup *parent, *child;
3499        int kmemcg_id;
3500
3501        if (memcg->kmem_state != KMEM_ONLINE)
3502                return;
3503
3504        memcg->kmem_state = KMEM_ALLOCATED;
3505
3506        parent = parent_mem_cgroup(memcg);
3507        if (!parent)
3508                parent = root_mem_cgroup;
3509
3510        memcg_reparent_objcgs(memcg, parent);
3511
3512        kmemcg_id = memcg->kmemcg_id;
3513        BUG_ON(kmemcg_id < 0);
3514
3515        /*
3516         * Change kmemcg_id of this cgroup and all its descendants to the
3517         * parent's id, and then move all entries from this cgroup's list_lrus
3518         * to those of the parent. After we have finished, all list_lrus
3519         * corresponding to this cgroup are guaranteed to remain empty. The
3520         * ordering is imposed by list_lru_node->lock taken by
3521         * memcg_drain_all_list_lrus().
3522         */
3523        rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3524        css_for_each_descendant_pre(css, &memcg->css) {
3525                child = mem_cgroup_from_css(css);
3526                BUG_ON(child->kmemcg_id != kmemcg_id);
3527                child->kmemcg_id = parent->kmemcg_id;
3528        }
3529        rcu_read_unlock();
3530
3531        memcg_drain_all_list_lrus(kmemcg_id, parent);
3532
3533        memcg_free_cache_id(kmemcg_id);
3534}
3535
3536static void memcg_free_kmem(struct mem_cgroup *memcg)
3537{
3538        /* css_alloc() failed, offlining didn't happen */
3539        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3540                memcg_offline_kmem(memcg);
3541}
3542#else
3543static int memcg_online_kmem(struct mem_cgroup *memcg)
3544{
3545        return 0;
3546}
3547static void memcg_offline_kmem(struct mem_cgroup *memcg)
3548{
3549}
3550static void memcg_free_kmem(struct mem_cgroup *memcg)
3551{
3552}
3553#endif /* CONFIG_MEMCG_KMEM */
3554
3555static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3556                                 unsigned long max)
3557{
3558        int ret;
3559
3560        mutex_lock(&memcg_max_mutex);
3561        ret = page_counter_set_max(&memcg->kmem, max);
3562        mutex_unlock(&memcg_max_mutex);
3563        return ret;
3564}
3565
3566static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3567{
3568        int ret;
3569
3570        mutex_lock(&memcg_max_mutex);
3571
3572        ret = page_counter_set_max(&memcg->tcpmem, max);
3573        if (ret)
3574                goto out;
3575
3576        if (!memcg->tcpmem_active) {
3577                /*
3578                 * The active flag needs to be written after the static_key
3579                 * update. This is what guarantees that the socket activation
3580                 * function is the last one to run. See mem_cgroup_sk_alloc()
3581                 * for details, and note that we don't mark any socket as
3582                 * belonging to this memcg until that flag is up.
3583                 *
3584                 * We need to do this, because static_keys will span multiple
3585                 * sites, but we can't control their order. If we mark a socket
3586                 * as accounted, but the accounting functions are not patched in
3587                 * yet, we'll lose accounting.
3588                 *
3589                 * We never race with the readers in mem_cgroup_sk_alloc(),
3590                 * because when this value changes, the code to process it is not
3591                 * patched in yet.
3592                 */
3593                static_branch_inc(&memcg_sockets_enabled_key);
3594                memcg->tcpmem_active = true;
3595        }
3596out:
3597        mutex_unlock(&memcg_max_mutex);
3598        return ret;
3599}
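
/*
 * In outline, the publication pattern the comment above relies on is:
 *
 *	writer:	static_branch_inc(&memcg_sockets_enabled_key);
 *		memcg->tcpmem_active = true;
 *
 *	reader:	if (static key enabled && memcg->tcpmem_active)
 *			account the socket;
 *
 * Because the flag is set only after the key, a reader that observes
 * tcpmem_active == true is guaranteed to be running the patched-in
 * accounting code, so no socket is marked as accounted and then missed.
 */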
3600
3601/*
3602 * Handles writes to the cgroup1 limit files (RES_LIMIT for memory, memsw,
3603 * kmem and tcp) and to the soft limit file (RES_SOFT_LIMIT).
3604 */
3605static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3606                                char *buf, size_t nbytes, loff_t off)
3607{
3608        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3609        unsigned long nr_pages;
3610        int ret;
3611
3612        buf = strstrip(buf);
3613        ret = page_counter_memparse(buf, "-1", &nr_pages);
3614        if (ret)
3615                return ret;
3616
3617        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3618        case RES_LIMIT:
3619                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3620                        ret = -EINVAL;
3621                        break;
3622                }
3623                switch (MEMFILE_TYPE(of_cft(of)->private)) {
3624                case _MEM:
3625                        ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3626                        break;
3627                case _MEMSWAP:
3628                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3629                        break;
3630                case _KMEM:
3631                        pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3632                                     "Please report your usecase to linux-mm@kvack.org if you "
3633                                     "depend on this functionality.\n");
3634                        ret = memcg_update_kmem_max(memcg, nr_pages);
3635                        break;
3636                case _TCP:
3637                        ret = memcg_update_tcp_max(memcg, nr_pages);
3638                        break;
3639                }
3640                break;
3641        case RES_SOFT_LIMIT:
3642                memcg->soft_limit = nr_pages;
3643                ret = 0;
3644                break;
3645        }
3646        return ret ?: nbytes;
3647}
3648
3649static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3650                                size_t nbytes, loff_t off)
3651{
3652        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3653        struct page_counter *counter;
3654
3655        switch (MEMFILE_TYPE(of_cft(of)->private)) {
3656        case _MEM:
3657                counter = &memcg->memory;
3658                break;
3659        case _MEMSWAP:
3660                counter = &memcg->memsw;
3661                break;
3662        case _KMEM:
3663                counter = &memcg->kmem;
3664                break;
3665        case _TCP:
3666                counter = &memcg->tcpmem;
3667                break;
3668        default:
3669                BUG();
3670        }
3671
3672        switch (MEMFILE_ATTR(of_cft(of)->private)) {
3673        case RES_MAX_USAGE:
3674                page_counter_reset_watermark(counter);
3675                break;
3676        case RES_FAILCNT:
3677                counter->failcnt = 0;
3678                break;
3679        default:
3680                BUG();
3681        }
3682
3683        return nbytes;
3684}
3685
3686static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3687                                        struct cftype *cft)
3688{
3689        return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3690}
3691
3692#ifdef CONFIG_MMU
3693static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3694                                        struct cftype *cft, u64 val)
3695{
3696        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3697
3698        if (val & ~MOVE_MASK)
3699                return -EINVAL;
3700
3701        /*
3702         * No kind of locking is needed in here, because ->can_attach() will
3703         * check this value once in the beginning of the process, and then carry
3704         * on with stale data. This means that changes to this value will only
3705         * affect task migrations starting after the change.
3706         */
3707        memcg->move_charge_at_immigrate = val;
3708        return 0;
3709}
3710#else
3711static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3712                                        struct cftype *cft, u64 val)
3713{
3714        return -ENOSYS;
3715}
3716#endif
3717
3718#ifdef CONFIG_NUMA
3719
3720#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3721#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3722#define LRU_ALL      ((1 << NR_LRU_LISTS) - 1)
3723
3724static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3725                                int nid, unsigned int lru_mask, bool tree)
3726{
3727        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3728        unsigned long nr = 0;
3729        enum lru_list lru;
3730
3731        VM_BUG_ON((unsigned)nid >= nr_node_ids);
3732
3733        for_each_lru(lru) {
3734                if (!(BIT(lru) & lru_mask))
3735                        continue;
3736                if (tree)
3737                        nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3738                else
3739                        nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3740        }
3741        return nr;
3742}
3743
3744static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3745                                             unsigned int lru_mask,
3746                                             bool tree)
3747{
3748        unsigned long nr = 0;
3749        enum lru_list lru;
3750
3751        for_each_lru(lru) {
3752                if (!(BIT(lru) & lru_mask))
3753                        continue;
3754                if (tree)
3755                        nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3756                else
3757                        nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3758        }
3759        return nr;
3760}
3761
3762static int memcg_numa_stat_show(struct seq_file *m, void *v)
3763{
3764        struct numa_stat {
3765                const char *name;
3766                unsigned int lru_mask;
3767        };
3768
3769        static const struct numa_stat stats[] = {
3770                { "total", LRU_ALL },
3771                { "file", LRU_ALL_FILE },
3772                { "anon", LRU_ALL_ANON },
3773                { "unevictable", BIT(LRU_UNEVICTABLE) },
3774        };
3775        const struct numa_stat *stat;
3776        int nid;
3777        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3778
3779        cgroup_rstat_flush(memcg->css.cgroup);
3780
3781        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3782                seq_printf(m, "%s=%lu", stat->name,
3783                           mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3784                                                   false));
3785                for_each_node_state(nid, N_MEMORY)
3786                        seq_printf(m, " N%d=%lu", nid,
3787                                   mem_cgroup_node_nr_lru_pages(memcg, nid,
3788                                                        stat->lru_mask, false));
3789                seq_putc(m, '\n');
3790        }
3791
3792        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3793
3794                seq_printf(m, "hierarchical_%s=%lu", stat->name,
3795                           mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3796                                                   true));
3797                for_each_node_state(nid, N_MEMORY)
3798                        seq_printf(m, " N%d=%lu", nid,
3799                                   mem_cgroup_node_nr_lru_pages(memcg, nid,
3800                                                        stat->lru_mask, true));
3801                seq_putc(m, '\n');
3802        }
3803
3804        return 0;
3805}
3806#endif /* CONFIG_NUMA */
3807
3808static const unsigned int memcg1_stats[] = {
3809        NR_FILE_PAGES,
3810        NR_ANON_MAPPED,
3811#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3812        NR_ANON_THPS,
3813#endif
3814        NR_SHMEM,
3815        NR_FILE_MAPPED,
3816        NR_FILE_DIRTY,
3817        NR_WRITEBACK,
3818        MEMCG_SWAP,
3819};
3820
3821static const char *const memcg1_stat_names[] = {
3822        "cache",
3823        "rss",
3824#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3825        "rss_huge",
3826#endif
3827        "shmem",
3828        "mapped_file",
3829        "dirty",
3830        "writeback",
3831        "swap",
3832};
3833
3834/* Universal VM events cgroup1 shows, original sort order */
3835static const unsigned int memcg1_events[] = {
3836        PGPGIN,
3837        PGPGOUT,
3838        PGFAULT,
3839        PGMAJFAULT,
3840};
3841
3842static int memcg_stat_show(struct seq_file *m, void *v)
3843{
3844        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3845        unsigned long memory, memsw;
3846        struct mem_cgroup *mi;
3847        unsigned int i;
3848
3849        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3850
3851        cgroup_rstat_flush(memcg->css.cgroup);
3852
3853        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3854                unsigned long nr;
3855
3856                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3857                        continue;
3858                nr = memcg_page_state_local(memcg, memcg1_stats[i]);
3859                seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
3860        }
3861
3862        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3863                seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
3864                           memcg_events_local(memcg, memcg1_events[i]));
3865
3866        for (i = 0; i < NR_LRU_LISTS; i++)
3867                seq_printf(m, "%s %lu\n", lru_list_name(i),
3868                           memcg_page_state_local(memcg, NR_LRU_BASE + i) *
3869                           PAGE_SIZE);
3870
3871        /* Hierarchical information */
3872        memory = memsw = PAGE_COUNTER_MAX;
3873        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3874                memory = min(memory, READ_ONCE(mi->memory.max));
3875                memsw = min(memsw, READ_ONCE(mi->memsw.max));
3876        }
3877        seq_printf(m, "hierarchical_memory_limit %llu\n",
3878                   (u64)memory * PAGE_SIZE);
3879        if (do_memsw_account())
3880                seq_printf(m, "hierarchical_memsw_limit %llu\n",
3881                           (u64)memsw * PAGE_SIZE);
3882
3883        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3884                unsigned long nr;
3885
3886                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3887                        continue;
3888                nr = memcg_page_state(memcg, memcg1_stats[i]);
3889                seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3890                                                (u64)nr * PAGE_SIZE);
3891        }
3892
3893        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3894                seq_printf(m, "total_%s %llu\n",
3895                           vm_event_name(memcg1_events[i]),
3896                           (u64)memcg_events(memcg, memcg1_events[i]));
3897
3898        for (i = 0; i < NR_LRU_LISTS; i++)
3899                seq_printf(m, "total_%s %llu\n", lru_list_name(i),
3900                           (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
3901                           PAGE_SIZE);
3902
3903#ifdef CONFIG_DEBUG_VM
3904        {
3905                pg_data_t *pgdat;
3906                struct mem_cgroup_per_node *mz;
3907                unsigned long anon_cost = 0;
3908                unsigned long file_cost = 0;
3909
3910                for_each_online_pgdat(pgdat) {
3911                        mz = memcg->nodeinfo[pgdat->node_id];
3912
3913                        anon_cost += mz->lruvec.anon_cost;
3914                        file_cost += mz->lruvec.file_cost;
3915                }
3916                seq_printf(m, "anon_cost %lu\n", anon_cost);
3917                seq_printf(m, "file_cost %lu\n", file_cost);
3918        }
3919#endif
3920
3921        return 0;
3922}
3923
3924static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3925                                      struct cftype *cft)
3926{
3927        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3928
3929        return mem_cgroup_swappiness(memcg);
3930}
3931
3932static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3933                                       struct cftype *cft, u64 val)
3934{
3935        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3936
3937        if (val > 100)
3938                return -EINVAL;
3939
3940        if (!mem_cgroup_is_root(memcg))
3941                memcg->swappiness = val;
3942        else
3943                vm_swappiness = val;
3944
3945        return 0;
3946}
3947
3948static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3949{
3950        struct mem_cgroup_threshold_ary *t;
3951        unsigned long usage;
3952        int i;
3953
3954        rcu_read_lock();
3955        if (!swap)
3956                t = rcu_dereference(memcg->thresholds.primary);
3957        else
3958                t = rcu_dereference(memcg->memsw_thresholds.primary);
3959
3960        if (!t)
3961                goto unlock;
3962
3963        usage = mem_cgroup_usage(memcg, swap);
3964
3965        /*
3966         * current_threshold points to the threshold just below or equal to usage.
3967         * If that is no longer true, a threshold was crossed after the last
3968         * call of __mem_cgroup_threshold().
3969         */
3970        i = t->current_threshold;
3971
3972        /*
3973         * Iterate backward over array of thresholds starting from
3974         * current_threshold and check if a threshold is crossed.
3975         * If none of thresholds below usage is crossed, we read
3976         * only one element of the array here.
3977         */
3978        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3979                eventfd_signal(t->entries[i].eventfd, 1);
3980
3981        /* i = current_threshold + 1 */
3982        i++;
3983
3984        /*
3985         * Iterate forward over array of thresholds starting from
3986         * current_threshold+1 and check if a threshold is crossed.
3987         * If none of thresholds above usage is crossed, we read
3988         * only one element of the array here.
3989         */
3990        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3991                eventfd_signal(t->entries[i].eventfd, 1);
3992
3993        /* Update current_threshold */
3994        t->current_threshold = i - 1;
3995unlock:
3996        rcu_read_unlock();
3997}
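
/*
 * A worked example for __mem_cgroup_threshold(): with thresholds of
 * {100, 200, 400, 800} pages, current_threshold at index 1 (200) and a
 * new usage of 500 pages, the backward scan stops immediately
 * (200 <= 500), the forward scan signals the 400-page eventfd and stops
 * at 800, and current_threshold ends up at index 2, the largest
 * threshold that is still <= usage.
 */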
3998
3999static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4000{
4001        while (memcg) {
4002                __mem_cgroup_threshold(memcg, false);
4003                if (do_memsw_account())
4004                        __mem_cgroup_threshold(memcg, true);
4005
4006                memcg = parent_mem_cgroup(memcg);
4007        }
4008}
4009
4010static int compare_thresholds(const void *a, const void *b)
4011{
4012        const struct mem_cgroup_threshold *_a = a;
4013        const struct mem_cgroup_threshold *_b = b;
4014
4015        if (_a->threshold > _b->threshold)
4016                return 1;
4017
4018        if (_a->threshold < _b->threshold)
4019                return -1;
4020
4021        return 0;
4022}
4023
4024static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4025{
4026        struct mem_cgroup_eventfd_list *ev;
4027
4028        spin_lock(&memcg_oom_lock);
4029
4030        list_for_each_entry(ev, &memcg->oom_notify, list)
4031                eventfd_signal(ev->eventfd, 1);
4032
4033        spin_unlock(&memcg_oom_lock);
4034        return 0;
4035}
4036
4037static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4038{
4039        struct mem_cgroup *iter;
4040
4041        for_each_mem_cgroup_tree(iter, memcg)
4042                mem_cgroup_oom_notify_cb(iter);
4043}
4044
4045static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4046        struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4047{
4048        struct mem_cgroup_thresholds *thresholds;
4049        struct mem_cgroup_threshold_ary *new;
4050        unsigned long threshold;
4051        unsigned long usage;
4052        int i, size, ret;
4053
4054        ret = page_counter_memparse(args, "-1", &threshold);
4055        if (ret)
4056                return ret;
4057
4058        mutex_lock(&memcg->thresholds_lock);
4059
4060        if (type == _MEM) {
4061                thresholds = &memcg->thresholds;
4062                usage = mem_cgroup_usage(memcg, false);
4063        } else if (type == _MEMSWAP) {
4064                thresholds = &memcg->memsw_thresholds;
4065                usage = mem_cgroup_usage(memcg, true);
4066        } else
4067                BUG();
4068
4069        /* Check if a threshold was crossed before adding a new one */
4070        if (thresholds->primary)
4071                __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4072
4073        size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4074
4075        /* Allocate memory for new array of thresholds */
4076        new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4077        if (!new) {
4078                ret = -ENOMEM;
4079                goto unlock;
4080        }
4081        new->size = size;
4082
4083        /* Copy thresholds (if any) to new array */
4084        if (thresholds->primary)
4085                memcpy(new->entries, thresholds->primary->entries,
4086                       flex_array_size(new, entries, size - 1));
4087
4088        /* Add new threshold */
4089        new->entries[size - 1].eventfd = eventfd;
4090        new->entries[size - 1].threshold = threshold;
4091
4092        /* Sort thresholds. Registering a new threshold isn't time-critical */
4093        sort(new->entries, size, sizeof(*new->entries),
4094                        compare_thresholds, NULL);
4095
4096        /* Find current threshold */
4097        new->current_threshold = -1;
4098        for (i = 0; i < size; i++) {
4099                if (new->entries[i].threshold <= usage) {
4100                        /*
4101                         * new->current_threshold will not be used until
4102                         * rcu_assign_pointer(), so it's safe to increment
4103                         * it here.
4104                         */
4105                        ++new->current_threshold;
4106                } else
4107                        break;
4108        }
4109
4110        /* Free old spare buffer and save old primary buffer as spare */
4111        kfree(thresholds->spare);
4112        thresholds->spare = thresholds->primary;
4113
4114        rcu_assign_pointer(thresholds->primary, new);
4115
4116        /* Make sure that nobody still uses the old thresholds array */
4117        synchronize_rcu();
4118
4119unlock:
4120        mutex_unlock(&memcg->thresholds_lock);
4121
4122        return ret;
4123}
4124
4125static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4126        struct eventfd_ctx *eventfd, const char *args)
4127{
4128        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4129}
4130
4131static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4132        struct eventfd_ctx *eventfd, const char *args)
4133{
4134        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4135}
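
/*
 * These handlers are reached from userspace by writing
 * "<event_fd> <control_fd> <threshold>" to cgroup.event_control, see
 * memcg_write_event_control() below.  A minimal sketch (the cgroup path
 * is just an example):
 *
 *	#include <sys/eventfd.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ecfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *			O_WRONLY);
 *	uint64_t ticks;
 *
 *	dprintf(ecfd, "%d %d %llu", efd, cfd, 100ULL << 20); // 100M threshold
 *	read(efd, &ticks, sizeof(ticks)); // blocks until the threshold is crossed
 *
 * Using memory.memsw.usage_in_bytes as the control file registers a
 * memsw threshold instead.
 */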
4136
4137static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4138        struct eventfd_ctx *eventfd, enum res_type type)
4139{
4140        struct mem_cgroup_thresholds *thresholds;
4141        struct mem_cgroup_threshold_ary *new;
4142        unsigned long usage;
4143        int i, j, size, entries;
4144
4145        mutex_lock(&memcg->thresholds_lock);
4146
4147        if (type == _MEM) {
4148                thresholds = &memcg->thresholds;
4149                usage = mem_cgroup_usage(memcg, false);
4150        } else if (type == _MEMSWAP) {
4151                thresholds = &memcg->memsw_thresholds;
4152                usage = mem_cgroup_usage(memcg, true);
4153        } else
4154                BUG();
4155
4156        if (!thresholds->primary)
4157                goto unlock;
4158
4159        /* Check if a threshold was crossed before removing */
4160        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4161
4162        /* Calculate the new number of thresholds */
4163        size = entries = 0;
4164        for (i = 0; i < thresholds->primary->size; i++) {
4165                if (thresholds->primary->entries[i].eventfd != eventfd)
4166                        size++;
4167                else
4168                        entries++;
4169        }
4170
4171        new = thresholds->spare;
4172
4173        /* If no items related to eventfd have been cleared, nothing to do */
4174        if (!entries)
4175                goto unlock;
4176
4177        /* Set thresholds array to NULL if we don't have thresholds */
4178        if (!size) {
4179                kfree(new);
4180                new = NULL;
4181                goto swap_buffers;
4182        }
4183
4184        new->size = size;
4185
4186        /* Copy thresholds and find current threshold */
4187        new->current_threshold = -1;
4188        for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4189                if (thresholds->primary->entries[i].eventfd == eventfd)
4190                        continue;
4191
4192                new->entries[j] = thresholds->primary->entries[i];
4193                if (new->entries[j].threshold <= usage) {
4194                        /*
4195                         * new->current_threshold will not be used
4196                         * until rcu_assign_pointer(), so it's safe to increment
4197                         * it here.
4198                         */
4199                        ++new->current_threshold;
4200                }
4201                j++;
4202        }
4203
4204swap_buffers:
4205        /* Swap primary and spare array */
4206        thresholds->spare = thresholds->primary;
4207
4208        rcu_assign_pointer(thresholds->primary, new);
4209
4210        /* Make sure that nobody still uses the old thresholds array */
4211        synchronize_rcu();
4212
4213        /* If all events are unregistered, free the spare array */
4214        if (!new) {
4215                kfree(thresholds->spare);
4216                thresholds->spare = NULL;
4217        }
4218unlock:
4219        mutex_unlock(&memcg->thresholds_lock);
4220}
4221
4222static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4223        struct eventfd_ctx *eventfd)
4224{
4225        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4226}
4227
4228static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4229        struct eventfd_ctx *eventfd)
4230{
4231        return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4232}
4233
4234static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4235        struct eventfd_ctx *eventfd, const char *args)
4236{
4237        struct mem_cgroup_eventfd_list *event;
4238
4239        event = kmalloc(sizeof(*event), GFP_KERNEL);
4240        if (!event)
4241                return -ENOMEM;
4242
4243        spin_lock(&memcg_oom_lock);
4244
4245        event->eventfd = eventfd;
4246        list_add(&event->list, &memcg->oom_notify);
4247
4248        /* already in OOM ? */
4249        if (memcg->under_oom)
4250                eventfd_signal(eventfd, 1);
4251        spin_unlock(&memcg_oom_lock);
4252
4253        return 0;
4254}
4255
4256static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4257        struct eventfd_ctx *eventfd)
4258{
4259        struct mem_cgroup_eventfd_list *ev, *tmp;
4260
4261        spin_lock(&memcg_oom_lock);
4262
4263        list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4264                if (ev->eventfd == eventfd) {
4265                        list_del(&ev->list);
4266                        kfree(ev);
4267                }
4268        }
4269
4270        spin_unlock(&memcg_oom_lock);
4271}
4272
4273static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4274{
4275        struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4276
4277        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4278        seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4279        seq_printf(sf, "oom_kill %lu\n",
4280                   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4281        return 0;
4282}
4283
4284static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4285        struct cftype *cft, u64 val)
4286{
4287        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4288
4289        /* cannot be set on the root cgroup, and only 0 and 1 are allowed */
4290        if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4291                return -EINVAL;
4292
4293        memcg->oom_kill_disable = val;
4294        if (!val)
4295                memcg_oom_recover(memcg);
4296
4297        return 0;
4298}
4299
4300#ifdef CONFIG_CGROUP_WRITEBACK
4301
4302#include <trace/events/writeback.h>
4303
4304static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4305{
4306        return wb_domain_init(&memcg->cgwb_domain, gfp);
4307}
4308
4309static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4310{
4311        wb_domain_exit(&memcg->cgwb_domain);
4312}
4313
4314static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4315{
4316        wb_domain_size_changed(&memcg->cgwb_domain);
4317}
4318
4319struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4320{
4321        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4322
4323        if (!memcg->css.parent)
4324                return NULL;
4325
4326        return &memcg->cgwb_domain;
4327}
4328
4329/**
4330 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4331 * @wb: bdi_writeback in question
4332 * @pfilepages: out parameter for number of file pages
4333 * @pheadroom: out parameter for number of allocatable pages according to memcg
4334 * @pdirty: out parameter for number of dirty pages
4335 * @pwriteback: out parameter for number of pages under writeback
4336 *
4337 * Determine the numbers of file, headroom, dirty, and writeback pages in
4338 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4339 * is a bit more involved.
4340 *
4341 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4342 * headroom is calculated as the lowest headroom of itself and the
4343 * ancestors.  Note that this doesn't consider the actual amount of
4344 * available memory in the system.  The caller should further cap
4345 * *@pheadroom accordingly.
4346 */
4347void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4348                         unsigned long *pheadroom, unsigned long *pdirty,
4349                         unsigned long *pwriteback)
4350{
4351        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4352        struct mem_cgroup *parent;
4353
4354        cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
4355
4356        *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4357        *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4358        *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4359                        memcg_page_state(memcg, NR_ACTIVE_FILE);
4360
4361        *pheadroom = PAGE_COUNTER_MAX;
4362        while ((parent = parent_mem_cgroup(memcg))) {
4363                unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4364                                            READ_ONCE(memcg->memory.high));
4365                unsigned long used = page_counter_read(&memcg->memory);
4366
4367                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4368                memcg = parent;
4369        }
4370}
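
/*
 * Headroom example (values shown as sizes for readability): a memcg with
 * max == 1G, high == 512M and 300M used contributes
 * min(1G, 512M) - 300M = 212M; if an ancestor's own headroom works out
 * to 100M, *pheadroom ends up as 100M, the tightest value on the path to
 * the root.
 */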
4371
4372/*
4373 * Foreign dirty flushing
4374 *
4375 * There's an inherent mismatch between memcg and writeback.  The former
4376 * tracks ownership per-page while the latter per-inode.  This was a
4377 * deliberate design decision because honoring per-page ownership in the
4378 * writeback path is complicated, may lead to higher CPU and IO overheads
4379 * and deemed unnecessary given that write-sharing an inode across
4380 * different cgroups isn't a common use-case.
4381 *
4382 * Combined with inode majority-writer ownership switching, this works well
4383 * enough in most cases but there are some pathological cases.  For
4384 * example, let's say there are two cgroups A and B which keep writing to
4385 * different but confined parts of the same inode.  B owns the inode and
4386 * A's memory is limited far below B's.  A's dirty ratio can rise enough to
4387 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4388 * triggering background writeback.  A will be slowed down without a way to
4389 * make writeback of the dirty pages happen.
4390 *
4391 * Conditions like the above can lead to a cgroup getting repeatedly and
4392 * severely throttled after making some progress after each
4393 * dirty_expire_interval while the underlying IO device is almost
4394 * completely idle.
4395 *
4396 * Solving this problem completely requires matching the ownership tracking
4397 * granularities between memcg and writeback in either direction.  However,
4398 * the more egregious behaviors can be avoided by simply remembering the
4399 * most recent foreign dirtying events and initiating remote flushes on
4400 * them when local writeback isn't enough to keep the memory clean enough.
4401 *
4402 * The following two functions implement such mechanism.  When a foreign
4403 * page - a page whose memcg and writeback ownerships don't match - is
4404 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4405 * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
4406 * decides that the memcg needs to sleep due to high dirty ratio, it calls
4407 * mem_cgroup_flush_foreign() which queues writeback on the recorded
4408 * foreign bdi_writebacks which haven't expired.  Both the numbers of
4409 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4410 * limited to MEMCG_CGWB_FRN_CNT.
4411 *
4412 * The mechanism only remembers IDs and doesn't hold any object references.
4413 * As being wrong occasionally doesn't matter, updates and accesses to the
4414 * records are lockless and racy.
4415 */
4416void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4417                                             struct bdi_writeback *wb)
4418{
4419        struct mem_cgroup *memcg = page_memcg(page);
4420        struct memcg_cgwb_frn *frn;
4421        u64 now = get_jiffies_64();
4422        u64 oldest_at = now;
4423        int oldest = -1;
4424        int i;
4425
4426        trace_track_foreign_dirty(page, wb);
4427
4428        /*
4429         * Pick the slot to use.  If there is already a slot for @wb, keep
4430         * using it.  If not, replace the oldest one that isn't being
4431         * written out.
4432         */
4433        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4434                frn = &memcg->cgwb_frn[i];
4435                if (frn->bdi_id == wb->bdi->id &&
4436                    frn->memcg_id == wb->memcg_css->id)
4437                        break;
4438                if (time_before64(frn->at, oldest_at) &&
4439                    atomic_read(&frn->done.cnt) == 1) {
4440                        oldest = i;
4441                        oldest_at = frn->at;
4442                }
4443        }
4444
4445        if (i < MEMCG_CGWB_FRN_CNT) {
4446                /*
4447                 * Re-using an existing one.  Update timestamp lazily to
4448                 * avoid making the cacheline hot.  We want them to be
4449                 * reasonably up-to-date and significantly shorter than
4450                 * dirty_expire_interval as that's what expires the record.
4451                 * Use the shorter of 1s and dirty_expire_interval / 8.
4452                 */
4453                unsigned long update_intv =
4454                        min_t(unsigned long, HZ,
4455                              msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4456
4457                if (time_before64(frn->at, now - update_intv))
4458                        frn->at = now;
4459        } else if (oldest >= 0) {
4460                /* replace the oldest free one */
4461                frn = &memcg->cgwb_frn[oldest];
4462                frn->bdi_id = wb->bdi->id;
4463                frn->memcg_id = wb->memcg_css->id;
4464                frn->at = now;
4465        }
4466}
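
/*
 * update_intv example: dirty_expire_interval is in centiseconds, so with
 * the default of 3000 (30s) the candidate is msecs_to_jiffies(30000) / 8,
 * i.e. 3.75s, and the min_t() clamps it to HZ (1s).  Only when
 * dirty_expire_interval drops below 800 centiseconds does the update
 * interval become shorter than one second.
 */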
4467
4468/* issue foreign writeback flushes for recorded foreign dirtying events */
4469void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4470{
4471        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4472        unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4473        u64 now = jiffies_64;
4474        int i;
4475
4476        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4477                struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4478
4479                /*
4480                 * If the record is older than dirty_expire_interval,
4481                 * writeback on it has already started.  No need to kick it
4482                 * off again.  Also, don't start a new one if there's
4483                 * already one in flight.
4484                 */
4485                if (time_after64(frn->at, now - intv) &&
4486                    atomic_read(&frn->done.cnt) == 1) {
4487                        frn->at = 0;
4488                        trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4489                        cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4490                                               WB_REASON_FOREIGN_FLUSH,
4491                                               &frn->done);
4492                }
4493        }
4494}
4495
4496#else   /* CONFIG_CGROUP_WRITEBACK */
4497
4498static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4499{
4500        return 0;
4501}
4502
4503static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4504{
4505}
4506
4507static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4508{
4509}
4510
4511#endif  /* CONFIG_CGROUP_WRITEBACK */
4512
4513/*
4514 * DO NOT USE IN NEW FILES.
4515 *
4516 * "cgroup.event_control" implementation.
4517 *
4518 * This is way over-engineered.  It tries to support fully configurable
4519 * events for each user.  Such level of flexibility is completely
4520 * unnecessary especially in the light of the planned unified hierarchy.
4521 *
4522 * Please deprecate this and replace with something simpler if at all
4523 * possible.
4524 */
4525
4526/*
4527 * Unregister event and free resources.
4528 *
4529 * Gets called from workqueue.
4530 */
4531static void memcg_event_remove(struct work_struct *work)
4532{
4533        struct mem_cgroup_event *event =
4534                container_of(work, struct mem_cgroup_event, remove);
4535        struct mem_cgroup *memcg = event->memcg;
4536
4537        remove_wait_queue(event->wqh, &event->wait);
4538
4539        event->unregister_event(memcg, event->eventfd);
4540
4541        /* Notify userspace the event is going away. */
4542        eventfd_signal(event->eventfd, 1);
4543
4544        eventfd_ctx_put(event->eventfd);
4545        kfree(event);
4546        css_put(&memcg->css);
4547}
4548
4549/*
4550 * Gets called on EPOLLHUP on eventfd when user closes it.
4551 *
4552 * Called with wqh->lock held and interrupts disabled.
4553 */
4554static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4555                            int sync, void *key)
4556{
4557        struct mem_cgroup_event *event =
4558                container_of(wait, struct mem_cgroup_event, wait);
4559        struct mem_cgroup *memcg = event->memcg;
4560        __poll_t flags = key_to_poll(key);
4561
4562        if (flags & EPOLLHUP) {
4563                /*
4564                 * If the event has been detached at cgroup removal, we
4565                 * can simply return knowing the other side will cleanup
4566                 * for us.
4567                 *
4568                 * We can't race against event freeing since the other
4569                 * side will require wqh->lock via remove_wait_queue(),
4570                 * which we hold.
4571                 */
4572                spin_lock(&memcg->event_list_lock);
4573                if (!list_empty(&event->list)) {
4574                        list_del_init(&event->list);
4575                        /*
4576                         * We are in atomic context, but memcg_event_remove()
4577                         * may sleep, so we have to run it from a workqueue.
4578                         */
4579                        schedule_work(&event->remove);
4580                }
4581                spin_unlock(&memcg->event_list_lock);
4582        }
4583
4584        return 0;
4585}
4586
4587static void memcg_event_ptable_queue_proc(struct file *file,
4588                wait_queue_head_t *wqh, poll_table *pt)
4589{
4590        struct mem_cgroup_event *event =
4591                container_of(pt, struct mem_cgroup_event, pt);
4592
4593        event->wqh = wqh;
4594        add_wait_queue(wqh, &event->wait);
4595}
4596
4597/*
4598 * DO NOT USE IN NEW FILES.
4599 *
4600 * Parse input and register new cgroup event handler.
4601 *
4602 * Input must be in format '<event_fd> <control_fd> <args>'.
4603 * Interpretation of args is defined by control file implementation.
4604 */
4605static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4606                                         char *buf, size_t nbytes, loff_t off)
4607{
4608        struct cgroup_subsys_state *css = of_css(of);
4609        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4610        struct mem_cgroup_event *event;
4611        struct cgroup_subsys_state *cfile_css;
4612        unsigned int efd, cfd;
4613        struct fd efile;
4614        struct fd cfile;
4615        const char *name;
4616        char *endp;
4617        int ret;
4618
4619        buf = strstrip(buf);
4620
4621        efd = simple_strtoul(buf, &endp, 10);
4622        if (*endp != ' ')
4623                return -EINVAL;
4624        buf = endp + 1;
4625
4626        cfd = simple_strtoul(buf, &endp, 10);
4627        if ((*endp != ' ') && (*endp != '\0'))
4628                return -EINVAL;
4629        buf = endp + 1;
4630
4631        event = kzalloc(sizeof(*event), GFP_KERNEL);
4632        if (!event)
4633                return -ENOMEM;
4634
4635        event->memcg = memcg;
4636        INIT_LIST_HEAD(&event->list);
4637        init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4638        init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4639        INIT_WORK(&event->remove, memcg_event_remove);
4640
4641        efile = fdget(efd);
4642        if (!efile.file) {
4643                ret = -EBADF;
4644                goto out_kfree;
4645        }
4646
4647        event->eventfd = eventfd_ctx_fileget(efile.file);
4648        if (IS_ERR(event->eventfd)) {
4649                ret = PTR_ERR(event->eventfd);
4650                goto out_put_efile;
4651        }
4652
4653        cfile = fdget(cfd);
4654        if (!cfile.file) {
4655                ret = -EBADF;
4656                goto out_put_eventfd;
4657        }
4658
4659        /* the process needs read permission on the control file */
4660        /* AV: shouldn't we check that it's been opened for read instead? */
4661        ret = file_permission(cfile.file, MAY_READ);
4662        if (ret < 0)
4663                goto out_put_cfile;
4664
4665        /*
4666         * Determine the event callbacks and set them in @event.  This used
4667         * to be done via struct cftype but cgroup core no longer knows
4668         * about these events.  The following is crude but the whole thing
4669         * is for compatibility anyway.
4670         *
4671         * DO NOT ADD NEW FILES.
4672         */
4673        name = cfile.file->f_path.dentry->d_name.name;
4674
4675        if (!strcmp(name, "memory.usage_in_bytes")) {
4676                event->register_event = mem_cgroup_usage_register_event;
4677                event->unregister_event = mem_cgroup_usage_unregister_event;
4678        } else if (!strcmp(name, "memory.oom_control")) {
4679                event->register_event = mem_cgroup_oom_register_event;
4680                event->unregister_event = mem_cgroup_oom_unregister_event;
4681        } else if (!strcmp(name, "memory.pressure_level")) {
4682                event->register_event = vmpressure_register_event;
4683                event->unregister_event = vmpressure_unregister_event;
4684        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4685                event->register_event = memsw_cgroup_usage_register_event;
4686                event->unregister_event = memsw_cgroup_usage_unregister_event;
4687        } else {
4688                ret = -EINVAL;
4689                goto out_put_cfile;
4690        }
4691
4692        /*
4693         * Verify that @cfile belongs to @css.  Also, remaining events are
4694         * automatically removed on cgroup destruction but the removal is
4695         * asynchronous, so take an extra ref on @css.
4696         */
4697        cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4698                                               &memory_cgrp_subsys);
4699        ret = -EINVAL;
4700        if (IS_ERR(cfile_css))
4701                goto out_put_cfile;
4702        if (cfile_css != css) {
4703                css_put(cfile_css);
4704                goto out_put_cfile;
4705        }
4706
4707        ret = event->register_event(memcg, event->eventfd, buf);
4708        if (ret)
4709                goto out_put_css;
4710
4711        vfs_poll(efile.file, &event->pt);
4712
4713        spin_lock(&memcg->event_list_lock);
4714        list_add(&event->list, &memcg->event_list);
4715        spin_unlock(&memcg->event_list_lock);
4716
4717        fdput(cfile);
4718        fdput(efile);
4719
4720        return nbytes;
4721
4722out_put_css:
4723        css_put(css);
4724out_put_cfile:
4725        fdput(cfile);
4726out_put_eventfd:
4727        eventfd_ctx_put(event->eventfd);
4728out_put_efile:
4729        fdput(efile);
4730out_kfree:
4731        kfree(event);
4732
4733        return ret;
4734}
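/*
 * Only <args> differs between the supported control files: the usage
 * threshold sketch above passes a byte count, memory.oom_control
 * registrations take no <args>, and memory.pressure_level expects a level
 * string such as "low", "medium" or "critical" (see the cgroup v1 memory
 * documentation).  For example, replacing the snprintf() line in the sketch
 * with
 *
 *	snprintf(line, sizeof(line), "%d %d low", efd, cfd);
 *
 * and opening cfd on memory.pressure_level instead would register a memory
 * pressure notification rather than a usage threshold.
 */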
4735
4736static struct cftype mem_cgroup_legacy_files[] = {
4737        {
4738                .name = "usage_in_bytes",
4739                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4740                .read_u64 = mem_cgroup_read_u64,
4741        },
4742        {
4743                .name = "max_usage_in_bytes",
4744                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4745                .write = mem_cgroup_reset,
4746                .read_u64 = mem_cgroup_read_u64,
4747        },
4748        {
4749                .name = "limit_in_bytes",
4750                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4751                .write = mem_cgroup_write,
4752                .read_u64 = mem_cgroup_read_u64,
4753        },
4754        {
4755                .name = "soft_limit_in_bytes",
4756                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4757                .write = mem_cgroup_write,
4758                .read_u64 = mem_cgroup_read_u64,
4759        },
4760        {
4761                .name = "failcnt",
4762                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4763                .write = mem_cgroup_reset,
4764                .read_u64 = mem_cgroup_read_u64,
4765        },
4766        {
4767                .name = "stat",
4768                .seq_show = memcg_stat_show,
4769        },
4770        {
4771                .name = "force_empty",
4772                .write = mem_cgroup_force_empty_write,
4773        },
4774        {
4775                .name = "use_hierarchy",
4776                .write_u64 = mem_cgroup_hierarchy_write,
4777                .read_u64 = mem_cgroup_hierarchy_read,
4778        },
4779        {
4780                .name = "cgroup.event_control",         /* XXX: for compat */
4781                .write = memcg_write_event_control,
4782                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4783        },
4784        {
4785                .name = "swappiness",
4786                .read_u64 = mem_cgroup_swappiness_read,
4787                .write_u64 = mem_cgroup_swappiness_write,
4788        },
4789        {
4790                .name = "move_charge_at_immigrate",
4791                .read_u64 = mem_cgroup_move_charge_read,
4792                .write_u64 = mem_cgroup_move_charge_write,
4793        },
4794        {
4795                .name = "oom_control",
4796                .seq_show = mem_cgroup_oom_control_read,
4797                .write_u64 = mem_cgroup_oom_control_write,
4798                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4799        },
4800        {
4801                .name = "pressure_level",
4802        },
4803#ifdef CONFIG_NUMA
4804        {
4805                .name = "numa_stat",
4806                .seq_show = memcg_numa_stat_show,
4807        },
4808#endif
4809        {
4810                .name = "kmem.limit_in_bytes",
4811                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4812                .write = mem_cgroup_write,