linux/mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License as published by
  11 * the Free Software Foundation; either version 2 of the License, or
  12 * (at your option) any later version.
  13 *
  14 * This program is distributed in the hope that it will be useful,
  15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 * GNU General Public License for more details.
  18 */
  19
  20#include <linux/res_counter.h>
  21#include <linux/memcontrol.h>
  22#include <linux/cgroup.h>
  23#include <linux/mm.h>
  24#include <linux/pagemap.h>
  25#include <linux/smp.h>
  26#include <linux/page-flags.h>
  27#include <linux/backing-dev.h>
  28#include <linux/bit_spinlock.h>
  29#include <linux/rcupdate.h>
  30#include <linux/limits.h>
  31#include <linux/mutex.h>
  32#include <linux/slab.h>
  33#include <linux/swap.h>
  34#include <linux/spinlock.h>
  35#include <linux/fs.h>
  36#include <linux/seq_file.h>
  37#include <linux/vmalloc.h>
  38#include <linux/mm_inline.h>
  39#include <linux/page_cgroup.h>
  40#include "internal.h"
  41
  42#include <asm/uaccess.h>
  43
  44struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  45#define MEM_CGROUP_RECLAIM_RETRIES      5
  46
  47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
   48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
   49int do_swap_account __read_mostly;
   50static int really_do_swap_account __initdata = 1; /* for remembering boot option */
  51#else
  52#define do_swap_account         (0)
  53#endif
  54
   55static DEFINE_MUTEX(memcg_tasklist);    /* can be held under cgroup_mutex */
  56
  57/*
  58 * Statistics for memory cgroup.
  59 */
  60enum mem_cgroup_stat_index {
  61        /*
  62         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  63         */
  64        MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
  65        MEM_CGROUP_STAT_RSS,       /* # of pages charged as rss */
  66        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
  67        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
  68
  69        MEM_CGROUP_STAT_NSTATS,
  70};
  71
  72struct mem_cgroup_stat_cpu {
  73        s64 count[MEM_CGROUP_STAT_NSTATS];
  74} ____cacheline_aligned_in_smp;
  75
  76struct mem_cgroup_stat {
  77        struct mem_cgroup_stat_cpu cpustat[0];
  78};
  79
  80/*
   81 * For accounting under irq disable, no need to increment the preempt count.
  82 */
  83static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
  84                enum mem_cgroup_stat_index idx, int val)
  85{
  86        stat->count[idx] += val;
  87}
  88
  89static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
  90                enum mem_cgroup_stat_index idx)
  91{
  92        int cpu;
  93        s64 ret = 0;
  94        for_each_possible_cpu(cpu)
  95                ret += stat->cpustat[cpu].count[idx];
  96        return ret;
  97}
  98
  99static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
 100{
 101        s64 ret;
 102
 103        ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
 104        ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
 105        return ret;
 106}
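
/*
 * Illustrative example: "local usage" is cache + rss charged to this memcg
 * itself (not its children), summed over all possible CPUs by
 * mem_cgroup_read_stat(). A typical use is to skip a group with nothing
 * charged locally, e.g.
 *
 *        if (!mem_cgroup_local_usage(&victim->stat))
 *                continue;
 *
 * which is how mem_cgroup_hierarchical_reclaim() below skips empty victims.
 */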
 107
 108/*
 109 * per-zone information in memory controller.
 110 */
 111struct mem_cgroup_per_zone {
 112        /*
  113         * per-cgroup LRU lists; protected by the owning zone's lru_lock
 114         */
 115        struct list_head        lists[NR_LRU_LISTS];
 116        unsigned long           count[NR_LRU_LISTS];
 117
 118        struct zone_reclaim_stat reclaim_stat;
 119};
 120/* Macro for accessing counter */
 121#define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
 122
 123struct mem_cgroup_per_node {
 124        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 125};
 126
 127struct mem_cgroup_lru_info {
 128        struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 129};
 130
 131/*
 132 * The memory controller data structure. The memory controller controls both
 133 * page cache and RSS per cgroup. We would eventually like to provide
 134 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 135 * to help the administrator determine what knobs to tune.
 136 *
 137 * TODO: Add a water mark for the memory controller. Reclaim will begin when
  138 * we hit the water mark. Maybe even add a low water mark, such that
  139 * no reclaim occurs from a cgroup at its low water mark; this is
  140 * a feature that will be implemented much later in the future.
 141 */
 142struct mem_cgroup {
 143        struct cgroup_subsys_state css;
 144        /*
 145         * the counter to account for memory usage
 146         */
 147        struct res_counter res;
 148        /*
 149         * the counter to account for mem+swap usage.
 150         */
 151        struct res_counter memsw;
 152        /*
 153         * Per cgroup active and inactive list, similar to the
 154         * per zone LRU lists.
 155         */
 156        struct mem_cgroup_lru_info info;
 157
  158        /*
  159         * protects the reclaim-related members below.
  160         */
 161        spinlock_t reclaim_param_lock;
 162
 163        int     prev_priority;  /* for recording reclaim priority */
 164
 165        /*
  166         * While reclaiming in a hierarchy, we cache the last child we
 167         * reclaimed from.
 168         */
 169        int last_scanned_child;
 170        /*
 171         * Should the accounting and control be hierarchical, per subtree?
 172         */
 173        bool use_hierarchy;
 174        unsigned long   last_oom_jiffies;
 175        atomic_t        refcnt;
 176
 177        unsigned int    swappiness;
 178
 179        /*
  180         * statistics. This must be placed at the end of memcg (the cpustat[] array grows past it).
 181         */
 182        struct mem_cgroup_stat stat;
 183};
 184
 185enum charge_type {
 186        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 187        MEM_CGROUP_CHARGE_TYPE_MAPPED,
 188        MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
 189        MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
 190        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 191        NR_CHARGE_TYPE,
 192};
 193
  194/* only used here (for easy reading) */
 195#define PCGF_CACHE      (1UL << PCG_CACHE)
 196#define PCGF_USED       (1UL << PCG_USED)
 197#define PCGF_LOCK       (1UL << PCG_LOCK)
 198static const unsigned long
 199pcg_default_flags[NR_CHARGE_TYPE] = {
 200        PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
 201        PCGF_USED | PCGF_LOCK, /* Anon */
 202        PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
 203        0, /* FORCE */
 204};
 205
 206/* for encoding cft->private value on file */
 207#define _MEM                    (0)
 208#define _MEMSWAP                (1)
 209#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
 210#define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
 211#define MEMFILE_ATTR(val)       ((val) & 0xffff)
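
/*
 * Illustrative example of the encoding above: a control file for the
 * mem+swap limit would set .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
 * i.e. (1 << 16) | RES_LIMIT, and its read/write handler would recover the
 * two halves with MEMFILE_TYPE(cft->private) == _MEMSWAP and
 * MEMFILE_ATTR(cft->private) == RES_LIMIT. The cftype definitions later in
 * the file use this pattern.
 */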
 212
 213static void mem_cgroup_get(struct mem_cgroup *mem);
 214static void mem_cgroup_put(struct mem_cgroup *mem);
 215static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 216
 217static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 218                                         struct page_cgroup *pc,
 219                                         bool charge)
 220{
 221        int val = (charge)? 1 : -1;
 222        struct mem_cgroup_stat *stat = &mem->stat;
 223        struct mem_cgroup_stat_cpu *cpustat;
 224        int cpu = get_cpu();
 225
 226        cpustat = &stat->cpustat[cpu];
 227        if (PageCgroupCache(pc))
 228                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 229        else
 230                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 231
 232        if (charge)
 233                __mem_cgroup_stat_add_safe(cpustat,
 234                                MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 235        else
 236                __mem_cgroup_stat_add_safe(cpustat,
 237                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 238        put_cpu();
 239}
 240
 241static struct mem_cgroup_per_zone *
 242mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 243{
 244        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 245}
 246
 247static struct mem_cgroup_per_zone *
 248page_cgroup_zoneinfo(struct page_cgroup *pc)
 249{
 250        struct mem_cgroup *mem = pc->mem_cgroup;
 251        int nid = page_cgroup_nid(pc);
 252        int zid = page_cgroup_zid(pc);
 253
 254        if (!mem)
 255                return NULL;
 256
 257        return mem_cgroup_zoneinfo(mem, nid, zid);
 258}
 259
 260static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 261                                        enum lru_list idx)
 262{
 263        int nid, zid;
 264        struct mem_cgroup_per_zone *mz;
 265        u64 total = 0;
 266
 267        for_each_online_node(nid)
 268                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 269                        mz = mem_cgroup_zoneinfo(mem, nid, zid);
 270                        total += MEM_CGROUP_ZSTAT(mz, idx);
 271                }
 272        return total;
 273}
 274
 275static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 276{
 277        return container_of(cgroup_subsys_state(cont,
 278                                mem_cgroup_subsys_id), struct mem_cgroup,
 279                                css);
 280}
 281
 282struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 283{
 284        /*
 285         * mm_update_next_owner() may clear mm->owner to NULL
 286         * if it races with swapoff, page migration, etc.
 287         * So this can be called with p == NULL.
 288         */
 289        if (unlikely(!p))
 290                return NULL;
 291
 292        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 293                                struct mem_cgroup, css);
 294}
 295
 296static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 297{
 298        struct mem_cgroup *mem = NULL;
 299
 300        if (!mm)
 301                return NULL;
 302        /*
  303         * Because we take no locks, mm->owner may be moved to another
  304         * cgroup under us. We use css_tryget() here even if this looks
  305         * pessimistic (rather than adding locks here).
 306         */
 307        rcu_read_lock();
 308        do {
 309                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 310                if (unlikely(!mem))
 311                        break;
 312        } while (!css_tryget(&mem->css));
 313        rcu_read_unlock();
 314        return mem;
 315}
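
/*
 * Illustrative usage sketch: the memcg returned above holds a css reference
 * taken with css_tryget(), so a caller must drop it when done, e.g.
 *
 *        mem = try_get_mem_cgroup_from_mm(mm);
 *        if (mem) {
 *                ... use mem ...
 *                css_put(&mem->css);
 *        }
 *
 * task_in_mem_cgroup() below follows exactly this pattern.
 */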
 316
 317/*
  318 * Call the callback function against all cgroups under the hierarchy tree.
 319 */
 320static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 321                          int (*func)(struct mem_cgroup *, void *))
 322{
 323        int found, ret, nextid;
 324        struct cgroup_subsys_state *css;
 325        struct mem_cgroup *mem;
 326
 327        if (!root->use_hierarchy)
 328                return (*func)(root, data);
 329
 330        nextid = 1;
 331        do {
 332                ret = 0;
 333                mem = NULL;
 334
 335                rcu_read_lock();
 336                css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
 337                                   &found);
 338                if (css && css_tryget(css))
 339                        mem = container_of(css, struct mem_cgroup, css);
 340                rcu_read_unlock();
 341
 342                if (mem) {
 343                        ret = (*func)(mem, data);
 344                        css_put(&mem->css);
 345                }
 346                nextid = found + 1;
 347        } while (!ret && css);
 348
 349        return ret;
 350}
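
/*
 * Illustrative usage sketch: the walk visits the root itself plus every
 * descendant (by css id) when use_hierarchy is set, and stops early as soon
 * as the callback returns non-zero. A minimal counting callback looks like
 *
 *        static int count_cb(struct mem_cgroup *mem, void *data)
 *        {
 *                (*(int *)data)++;
 *                return 0;
 *        }
 *
 * which is essentially what mem_cgroup_count_children_cb() and
 * mem_cgroup_count_children() below do to size the reclaim retry budget.
 */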
 351
 352/*
 353 * Following LRU functions are allowed to be used without PCG_LOCK.
  354 * Operations are called by the global LRU routines independently of memcg.
  355 * What we have to take care of here is the validity of pc->mem_cgroup.
  356 *
  357 * Changes to pc->mem_cgroup happen on
  358 * 1. charge
  359 * 2. moving account
  360 * In the typical case, "charge" is done before add-to-lru. The exception is
  361 * SwapCache, which is added to the LRU before it is charged.
  362 * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
  363 * When moving account, the page is not on the LRU; it is isolated.
 364 */
 365
 366void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 367{
 368        struct page_cgroup *pc;
 369        struct mem_cgroup *mem;
 370        struct mem_cgroup_per_zone *mz;
 371
 372        if (mem_cgroup_disabled())
 373                return;
 374        pc = lookup_page_cgroup(page);
 375        /* can happen while we handle swapcache. */
 376        if (list_empty(&pc->lru) || !pc->mem_cgroup)
 377                return;
 378        /*
 379         * We don't check PCG_USED bit. It's cleared when the "page" is finally
 380         * removed from global LRU.
 381         */
 382        mz = page_cgroup_zoneinfo(pc);
 383        mem = pc->mem_cgroup;
 384        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 385        list_del_init(&pc->lru);
 386        return;
 387}
 388
 389void mem_cgroup_del_lru(struct page *page)
 390{
 391        mem_cgroup_del_lru_list(page, page_lru(page));
 392}
 393
 394void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 395{
 396        struct mem_cgroup_per_zone *mz;
 397        struct page_cgroup *pc;
 398
 399        if (mem_cgroup_disabled())
 400                return;
 401
 402        pc = lookup_page_cgroup(page);
 403        /*
  404         * The Used bit is set without atomic ops, but after smp_wmb().
  405         * To make pc->mem_cgroup visible, insert an smp_rmb() here.
 406         */
 407        smp_rmb();
 408        /* unused page is not rotated. */
 409        if (!PageCgroupUsed(pc))
 410                return;
 411        mz = page_cgroup_zoneinfo(pc);
 412        list_move(&pc->lru, &mz->lists[lru]);
 413}
 414
 415void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 416{
 417        struct page_cgroup *pc;
 418        struct mem_cgroup_per_zone *mz;
 419
 420        if (mem_cgroup_disabled())
 421                return;
 422        pc = lookup_page_cgroup(page);
 423        /*
  424         * The Used bit is set without atomic ops, but after smp_wmb().
  425         * To make pc->mem_cgroup visible, insert an smp_rmb() here.
 426         */
 427        smp_rmb();
 428        if (!PageCgroupUsed(pc))
 429                return;
 430
 431        mz = page_cgroup_zoneinfo(pc);
 432        MEM_CGROUP_ZSTAT(mz, lru) += 1;
 433        list_add(&pc->lru, &mz->lists[lru]);
 434}
 435
 436/*
  437 * When handling SwapCache, pc->mem_cgroup may change while the page is linked
  438 * to the LRU, because the page may be reused after it is fully uncharged
  439 * (SwapCache behavior). To handle that, unlink the page_cgroup from the LRU
  440 * before charging it again. This function is only used to charge SwapCache.
  441 * It runs under lock_page(); zone->lru_lock must not be held by the caller.
 442 */
 443static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
 444{
 445        unsigned long flags;
 446        struct zone *zone = page_zone(page);
 447        struct page_cgroup *pc = lookup_page_cgroup(page);
 448
 449        spin_lock_irqsave(&zone->lru_lock, flags);
 450        /*
 451         * Forget old LRU when this page_cgroup is *not* used. This Used bit
 452         * is guarded by lock_page() because the page is SwapCache.
 453         */
 454        if (!PageCgroupUsed(pc))
 455                mem_cgroup_del_lru_list(page, page_lru(page));
 456        spin_unlock_irqrestore(&zone->lru_lock, flags);
 457}
 458
 459static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
 460{
 461        unsigned long flags;
 462        struct zone *zone = page_zone(page);
 463        struct page_cgroup *pc = lookup_page_cgroup(page);
 464
 465        spin_lock_irqsave(&zone->lru_lock, flags);
 466        /* link when the page is linked to LRU but page_cgroup isn't */
 467        if (PageLRU(page) && list_empty(&pc->lru))
 468                mem_cgroup_add_lru_list(page, page_lru(page));
 469        spin_unlock_irqrestore(&zone->lru_lock, flags);
 470}
 471
 472
 473void mem_cgroup_move_lists(struct page *page,
 474                           enum lru_list from, enum lru_list to)
 475{
 476        if (mem_cgroup_disabled())
 477                return;
 478        mem_cgroup_del_lru_list(page, from);
 479        mem_cgroup_add_lru_list(page, to);
 480}
 481
 482int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 483{
 484        int ret;
 485        struct mem_cgroup *curr = NULL;
 486
 487        task_lock(task);
 488        rcu_read_lock();
 489        curr = try_get_mem_cgroup_from_mm(task->mm);
 490        rcu_read_unlock();
 491        task_unlock(task);
 492        if (!curr)
 493                return 0;
 494        if (curr->use_hierarchy)
 495                ret = css_is_ancestor(&curr->css, &mem->css);
 496        else
 497                ret = (curr == mem);
 498        css_put(&curr->css);
 499        return ret;
 500}
 501
 502/*
  503 * prev_priority control: this will be used in the memory reclaim path.
 504 */
 505int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
 506{
 507        int prev_priority;
 508
 509        spin_lock(&mem->reclaim_param_lock);
 510        prev_priority = mem->prev_priority;
 511        spin_unlock(&mem->reclaim_param_lock);
 512
 513        return prev_priority;
 514}
 515
 516void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
 517{
 518        spin_lock(&mem->reclaim_param_lock);
 519        if (priority < mem->prev_priority)
 520                mem->prev_priority = priority;
 521        spin_unlock(&mem->reclaim_param_lock);
 522}
 523
 524void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
 525{
 526        spin_lock(&mem->reclaim_param_lock);
 527        mem->prev_priority = priority;
 528        spin_unlock(&mem->reclaim_param_lock);
 529}
 530
 531static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
 532{
 533        unsigned long active;
 534        unsigned long inactive;
 535        unsigned long gb;
 536        unsigned long inactive_ratio;
 537
 538        inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
 539        active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
 540
 541        gb = (inactive + active) >> (30 - PAGE_SHIFT);
 542        if (gb)
 543                inactive_ratio = int_sqrt(10 * gb);
 544        else
 545                inactive_ratio = 1;
 546
 547        if (present_pages) {
 548                present_pages[0] = inactive;
 549                present_pages[1] = active;
 550        }
 551
 552        return inactive_ratio;
 553}
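
/*
 * Worked example (illustrative): with 4GB of anon pages in the group,
 * gb = 4 and inactive_ratio = int_sqrt(10 * 4) = 6, so
 * mem_cgroup_inactive_anon_is_low() below reports "low" once
 * inactive * 6 < active, i.e. when less than roughly 1/7 of the anon pages
 * sit on the inactive list. Under 1GB the ratio falls back to 1, meaning
 * the inactive list should be at least as large as the active one.
 */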
 554
 555int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
 556{
 557        unsigned long active;
 558        unsigned long inactive;
 559        unsigned long present_pages[2];
 560        unsigned long inactive_ratio;
 561
 562        inactive_ratio = calc_inactive_ratio(memcg, present_pages);
 563
 564        inactive = present_pages[0];
 565        active = present_pages[1];
 566
 567        if (inactive * inactive_ratio < active)
 568                return 1;
 569
 570        return 0;
 571}
 572
 573unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 574                                       struct zone *zone,
 575                                       enum lru_list lru)
 576{
 577        int nid = zone->zone_pgdat->node_id;
 578        int zid = zone_idx(zone);
 579        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 580
 581        return MEM_CGROUP_ZSTAT(mz, lru);
 582}
 583
 584struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 585                                                      struct zone *zone)
 586{
 587        int nid = zone->zone_pgdat->node_id;
 588        int zid = zone_idx(zone);
 589        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 590
 591        return &mz->reclaim_stat;
 592}
 593
 594struct zone_reclaim_stat *
 595mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 596{
 597        struct page_cgroup *pc;
 598        struct mem_cgroup_per_zone *mz;
 599
 600        if (mem_cgroup_disabled())
 601                return NULL;
 602
 603        pc = lookup_page_cgroup(page);
 604        /*
  605         * The Used bit is set without atomic ops, but after smp_wmb().
  606         * To make pc->mem_cgroup visible, insert an smp_rmb() here.
 607         */
 608        smp_rmb();
 609        if (!PageCgroupUsed(pc))
 610                return NULL;
 611
 612        mz = page_cgroup_zoneinfo(pc);
 613        if (!mz)
 614                return NULL;
 615
 616        return &mz->reclaim_stat;
 617}
 618
 619unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 620                                        struct list_head *dst,
 621                                        unsigned long *scanned, int order,
 622                                        int mode, struct zone *z,
 623                                        struct mem_cgroup *mem_cont,
 624                                        int active, int file)
 625{
 626        unsigned long nr_taken = 0;
 627        struct page *page;
 628        unsigned long scan;
 629        LIST_HEAD(pc_list);
 630        struct list_head *src;
 631        struct page_cgroup *pc, *tmp;
 632        int nid = z->zone_pgdat->node_id;
 633        int zid = zone_idx(z);
 634        struct mem_cgroup_per_zone *mz;
 635        int lru = LRU_FILE * !!file + !!active;
 636
 637        BUG_ON(!mem_cont);
 638        mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 639        src = &mz->lists[lru];
 640
 641        scan = 0;
 642        list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 643                if (scan >= nr_to_scan)
 644                        break;
 645
 646                page = pc->page;
 647                if (unlikely(!PageCgroupUsed(pc)))
 648                        continue;
 649                if (unlikely(!PageLRU(page)))
 650                        continue;
 651
 652                scan++;
 653                if (__isolate_lru_page(page, mode, file) == 0) {
 654                        list_move(&page->lru, dst);
 655                        nr_taken++;
 656                }
 657        }
 658
 659        *scanned = scan;
 660        return nr_taken;
 661}
 662
 663#define mem_cgroup_from_res_counter(counter, member)    \
 664        container_of(counter, struct mem_cgroup, member)
 665
 666static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 667{
 668        if (do_swap_account) {
 669                if (res_counter_check_under_limit(&mem->res) &&
 670                        res_counter_check_under_limit(&mem->memsw))
 671                        return true;
 672        } else
 673                if (res_counter_check_under_limit(&mem->res))
 674                        return true;
 675        return false;
 676}
 677
 678static unsigned int get_swappiness(struct mem_cgroup *memcg)
 679{
 680        struct cgroup *cgrp = memcg->css.cgroup;
 681        unsigned int swappiness;
 682
 683        /* root ? */
 684        if (cgrp->parent == NULL)
 685                return vm_swappiness;
 686
 687        spin_lock(&memcg->reclaim_param_lock);
 688        swappiness = memcg->swappiness;
 689        spin_unlock(&memcg->reclaim_param_lock);
 690
 691        return swappiness;
 692}
 693
 694static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 695{
 696        int *val = data;
 697        (*val)++;
 698        return 0;
 699}
 700
 701/**
  702 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 703 * @memcg: The memory cgroup that went over limit
 704 * @p: Task that is going to be killed
 705 *
 706 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 707 * enabled
 708 */
 709void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 710{
 711        struct cgroup *task_cgrp;
 712        struct cgroup *mem_cgrp;
 713        /*
  714         * Need a buffer in BSS; we can't rely on allocations. The code relies
  715         * on the assumption that OOM is serialized for the memory controller.
 716         * If this assumption is broken, revisit this code.
 717         */
 718        static char memcg_name[PATH_MAX];
 719        int ret;
 720
 721        if (!memcg)
 722                return;
 723
 724
 725        rcu_read_lock();
 726
 727        mem_cgrp = memcg->css.cgroup;
 728        task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
 729
 730        ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
 731        if (ret < 0) {
 732                /*
  733                 * Unfortunately, we are unable to convert to a useful name,
  734                 * but we'll still print out the usage information.
 735                 */
 736                rcu_read_unlock();
 737                goto done;
 738        }
 739        rcu_read_unlock();
 740
 741        printk(KERN_INFO "Task in %s killed", memcg_name);
 742
 743        rcu_read_lock();
 744        ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
 745        if (ret < 0) {
 746                rcu_read_unlock();
 747                goto done;
 748        }
 749        rcu_read_unlock();
 750
 751        /*
  752         * Continues from above, so we don't need a KERN_ level
 753         */
 754        printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
 755done:
 756
 757        printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
 758                res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
 759                res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
 760                res_counter_read_u64(&memcg->res, RES_FAILCNT));
 761        printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
 762                "failcnt %llu\n",
 763                res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 764                res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 765                res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 766}
 767
 768/*
  769 * This function returns the number of memcgs under the hierarchy tree.
  770 * Returns 1 (the self count) if there are no children.
 771 */
 772static int mem_cgroup_count_children(struct mem_cgroup *mem)
 773{
 774        int num = 0;
 775        mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
 776        return num;
 777}
 778
 779/*
 780 * Visit the first child (need not be the first child as per the ordering
 781 * of the cgroup list, since we track last_scanned_child) of @mem and use
 782 * that to reclaim free pages from.
 783 */
 784static struct mem_cgroup *
 785mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 786{
 787        struct mem_cgroup *ret = NULL;
 788        struct cgroup_subsys_state *css;
 789        int nextid, found;
 790
 791        if (!root_mem->use_hierarchy) {
 792                css_get(&root_mem->css);
 793                ret = root_mem;
 794        }
 795
 796        while (!ret) {
 797                rcu_read_lock();
 798                nextid = root_mem->last_scanned_child + 1;
 799                css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
 800                                   &found);
 801                if (css && css_tryget(css))
 802                        ret = container_of(css, struct mem_cgroup, css);
 803
 804                rcu_read_unlock();
 805                /* Updates scanning parameter */
 806                spin_lock(&root_mem->reclaim_param_lock);
 807                if (!css) {
 808                        /* this means start scan from ID:1 */
 809                        root_mem->last_scanned_child = 0;
 810                } else
 811                        root_mem->last_scanned_child = found;
 812                spin_unlock(&root_mem->reclaim_param_lock);
 813        }
 814
 815        return ret;
 816}
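
/*
 * Illustrative sketch of the selection order: with last_scanned_child kept
 * under reclaim_param_lock, successive calls walk the hierarchy round-robin
 * by css id, e.g. root -> child A -> child B -> root -> ..., wrapping back
 * once css_get_next() runs out of ids. mem_cgroup_hierarchical_reclaim()
 * below uses "victim == root_mem" as the sign that one full round is done.
 */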
 817
 818/*
 819 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 820 * we reclaimed from, so that we don't end up penalizing one child extensively
 821 * based on its position in the children list.
 822 *
  823 * root_mem is the original ancestor that we've been reclaiming from.
 824 *
 825 * We give up and return to the caller when we visit root_mem twice.
 826 * (other groups can be removed while we're walking....)
 827 *
  828 * If shrink==true, this returns immediately, to avoid freeing too much.
 829 */
 830static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 831                                   gfp_t gfp_mask, bool noswap, bool shrink)
 832{
 833        struct mem_cgroup *victim;
 834        int ret, total = 0;
 835        int loop = 0;
 836
 837        while (loop < 2) {
 838                victim = mem_cgroup_select_victim(root_mem);
 839                if (victim == root_mem)
 840                        loop++;
 841                if (!mem_cgroup_local_usage(&victim->stat)) {
 842                        /* this cgroup's local usage == 0 */
 843                        css_put(&victim->css);
 844                        continue;
 845                }
 846                /* we use swappiness of local cgroup */
 847                ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
 848                                                   get_swappiness(victim));
 849                css_put(&victim->css);
 850                /*
  851                 * When shrinking usage, we can't tell here whether we should
  852                 * stop or reclaim more; that depends on the caller.
  853                 * last_scanned_child is enough to keep fairness under the tree.
 854                 */
 855                if (shrink)
 856                        return ret;
 857                total += ret;
 858                if (mem_cgroup_check_under_limit(root_mem))
 859                        return 1 + total;
 860        }
 861        return total;
 862}
 863
 864bool mem_cgroup_oom_called(struct task_struct *task)
 865{
 866        bool ret = false;
 867        struct mem_cgroup *mem;
 868        struct mm_struct *mm;
 869
 870        rcu_read_lock();
 871        mm = task->mm;
 872        if (!mm)
 873                mm = &init_mm;
 874        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 875        if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
 876                ret = true;
 877        rcu_read_unlock();
 878        return ret;
 879}
 880
 881static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
 882{
 883        mem->last_oom_jiffies = jiffies;
 884        return 0;
 885}
 886
 887static void record_last_oom(struct mem_cgroup *mem)
 888{
 889        mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
 890}
 891
 892
 893/*
  894 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
  895 * the oom-killer can be invoked.
 896 */
 897static int __mem_cgroup_try_charge(struct mm_struct *mm,
 898                        gfp_t gfp_mask, struct mem_cgroup **memcg,
 899                        bool oom)
 900{
 901        struct mem_cgroup *mem, *mem_over_limit;
 902        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 903        struct res_counter *fail_res;
 904
 905        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 906                /* Don't account this! */
 907                *memcg = NULL;
 908                return 0;
 909        }
 910
 911        /*
 912         * We always charge the cgroup the mm_struct belongs to.
 913         * The mm_struct's mem_cgroup changes on task migration if the
 914         * thread group leader migrates. It's possible that mm is not
 915         * set, if so charge the init_mm (happens for pagecache usage).
 916         */
 917        mem = *memcg;
 918        if (likely(!mem)) {
 919                mem = try_get_mem_cgroup_from_mm(mm);
 920                *memcg = mem;
 921        } else {
 922                css_get(&mem->css);
 923        }
 924        if (unlikely(!mem))
 925                return 0;
 926
 927        VM_BUG_ON(css_is_removed(&mem->css));
 928
 929        while (1) {
 930                int ret;
 931                bool noswap = false;
 932
 933                ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
 934                if (likely(!ret)) {
 935                        if (!do_swap_account)
 936                                break;
 937                        ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
 938                                                        &fail_res);
 939                        if (likely(!ret))
 940                                break;
 941                        /* mem+swap counter fails */
 942                        res_counter_uncharge(&mem->res, PAGE_SIZE);
 943                        noswap = true;
 944                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 945                                                                        memsw);
 946                } else
 947                        /* mem counter fails */
 948                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 949                                                                        res);
 950
 951                if (!(gfp_mask & __GFP_WAIT))
 952                        goto nomem;
 953
 954                ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
 955                                                        noswap, false);
 956                if (ret)
 957                        continue;
 958
 959                /*
 960                 * try_to_free_mem_cgroup_pages() might not give us a full
 961                 * picture of reclaim. Some pages are reclaimed and might be
 962                 * moved to swap cache or just unmapped from the cgroup.
 963                 * Check the limit again to see if the reclaim reduced the
 964                 * current usage of the cgroup before giving up
 965                 *
 966                 */
 967                if (mem_cgroup_check_under_limit(mem_over_limit))
 968                        continue;
 969
 970                if (!nr_retries--) {
 971                        if (oom) {
 972                                mutex_lock(&memcg_tasklist);
 973                                mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
 974                                mutex_unlock(&memcg_tasklist);
 975                                record_last_oom(mem_over_limit);
 976                        }
 977                        goto nomem;
 978                }
 979        }
 980        return 0;
 981nomem:
 982        css_put(&mem->css);
 983        return -ENOMEM;
 984}
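
/*
 * Illustrative sketch of the two-step charge protocol used in this file
 * (a hypothetical caller; the real ones are mem_cgroup_charge_common() and
 * the swap-in helpers below):
 *
 *        struct mem_cgroup *mem = NULL;
 *
 *        if (__mem_cgroup_try_charge(mm, gfp_mask, &mem, true))
 *                return -ENOMEM;                       reserve res (+ memsw)
 *        if (!mem)
 *                return 0;                             TIF_MEMDIE etc., nothing charged
 *        __mem_cgroup_commit_charge(mem, pc, ctype);   mark pc USED, update stats
 *
 * commit drops the css reference and the reservation itself if the
 * page_cgroup turned out to be already in use.
 */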
 985
 986
 987/*
  988 * A helper function to get a mem_cgroup from an ID. Must be called under
  989 * rcu_read_lock(). The caller must check css_is_removed() or similar if that
  990 * is a concern. (Dropping the refcnt from swap can be called against a
  991 * removed memcg.)
 992 */
 993static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 994{
 995        struct cgroup_subsys_state *css;
 996
 997        /* ID 0 is unused ID */
 998        if (!id)
 999                return NULL;
1000        css = css_lookup(&mem_cgroup_subsys, id);
1001        if (!css)
1002                return NULL;
1003        return container_of(css, struct mem_cgroup, css);
1004}
1005
1006static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
1007{
1008        struct mem_cgroup *mem;
1009        struct page_cgroup *pc;
1010        unsigned short id;
1011        swp_entry_t ent;
1012
1013        VM_BUG_ON(!PageLocked(page));
1014
1015        if (!PageSwapCache(page))
1016                return NULL;
1017
1018        pc = lookup_page_cgroup(page);
1019        lock_page_cgroup(pc);
1020        if (PageCgroupUsed(pc)) {
1021                mem = pc->mem_cgroup;
1022                if (mem && !css_tryget(&mem->css))
1023                        mem = NULL;
1024        } else {
1025                ent.val = page_private(page);
1026                id = lookup_swap_cgroup(ent);
1027                rcu_read_lock();
1028                mem = mem_cgroup_lookup(id);
1029                if (mem && !css_tryget(&mem->css))
1030                        mem = NULL;
1031                rcu_read_unlock();
1032        }
1033        unlock_page_cgroup(pc);
1034        return mem;
1035}
1036
1037/*
 1038 * Commit a charge obtained by __mem_cgroup_try_charge() and move the
 1039 * page_cgroup to the USED state. If it is already USED, uncharge and return.
1040 */
1041
1042static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1043                                     struct page_cgroup *pc,
1044                                     enum charge_type ctype)
1045{
 1046        /* try_charge() can return NULL in *memcg; handle that case here. */
1047        if (!mem)
1048                return;
1049
1050        lock_page_cgroup(pc);
1051        if (unlikely(PageCgroupUsed(pc))) {
1052                unlock_page_cgroup(pc);
1053                res_counter_uncharge(&mem->res, PAGE_SIZE);
1054                if (do_swap_account)
1055                        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1056                css_put(&mem->css);
1057                return;
1058        }
1059        pc->mem_cgroup = mem;
1060        smp_wmb();
1061        pc->flags = pcg_default_flags[ctype];
1062
1063        mem_cgroup_charge_statistics(mem, pc, true);
1064
1065        unlock_page_cgroup(pc);
1066}
1067
1068/**
1069 * mem_cgroup_move_account - move account of the page
1070 * @pc: page_cgroup of the page.
1071 * @from: mem_cgroup which the page is moved from.
1072 * @to: mem_cgroup which the page is moved to. @from != @to.
1073 *
 1074 * The caller must confirm the following:
 1075 * - page is not on the LRU (isolate_lru_page() is useful.)
 1076 *
 1077 * Returns 0 on success,
 1078 * returns -EBUSY when the lock is busy or "pc" is unstable.
1079 *
1080 * This function does "uncharge" from old cgroup but doesn't do "charge" to
1081 * new cgroup. It should be done by a caller.
1082 */
1083
1084static int mem_cgroup_move_account(struct page_cgroup *pc,
1085        struct mem_cgroup *from, struct mem_cgroup *to)
1086{
1087        struct mem_cgroup_per_zone *from_mz, *to_mz;
1088        int nid, zid;
1089        int ret = -EBUSY;
1090
1091        VM_BUG_ON(from == to);
1092        VM_BUG_ON(PageLRU(pc->page));
1093
1094        nid = page_cgroup_nid(pc);
1095        zid = page_cgroup_zid(pc);
1096        from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
1097        to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
1098
1099        if (!trylock_page_cgroup(pc))
1100                return ret;
1101
1102        if (!PageCgroupUsed(pc))
1103                goto out;
1104
1105        if (pc->mem_cgroup != from)
1106                goto out;
1107
1108        res_counter_uncharge(&from->res, PAGE_SIZE);
1109        mem_cgroup_charge_statistics(from, pc, false);
1110        if (do_swap_account)
1111                res_counter_uncharge(&from->memsw, PAGE_SIZE);
1112        css_put(&from->css);
1113
1114        css_get(&to->css);
1115        pc->mem_cgroup = to;
1116        mem_cgroup_charge_statistics(to, pc, true);
1117        ret = 0;
1118out:
1119        unlock_page_cgroup(pc);
1120        return ret;
1121}
1122
1123/*
 1124 * Move charges to the parent cgroup.
1125 */
1126
1127static int mem_cgroup_move_parent(struct page_cgroup *pc,
1128                                  struct mem_cgroup *child,
1129                                  gfp_t gfp_mask)
1130{
1131        struct page *page = pc->page;
1132        struct cgroup *cg = child->css.cgroup;
1133        struct cgroup *pcg = cg->parent;
1134        struct mem_cgroup *parent;
1135        int ret;
1136
1137        /* Is ROOT ? */
1138        if (!pcg)
1139                return -EINVAL;
1140
1141
1142        parent = mem_cgroup_from_cont(pcg);
1143
1144
1145        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1146        if (ret || !parent)
1147                return ret;
1148
1149        if (!get_page_unless_zero(page)) {
1150                ret = -EBUSY;
1151                goto uncharge;
1152        }
1153
1154        ret = isolate_lru_page(page);
1155
1156        if (ret)
1157                goto cancel;
1158
1159        ret = mem_cgroup_move_account(pc, child, parent);
1160
1161        putback_lru_page(page);
1162        if (!ret) {
1163                put_page(page);
1164                /* drop extra refcnt by try_charge() */
1165                css_put(&parent->css);
1166                return 0;
1167        }
1168
1169cancel:
1170        put_page(page);
1171uncharge:
1172        /* drop extra refcnt by try_charge() */
1173        css_put(&parent->css);
1174        /* uncharge if move fails */
1175        res_counter_uncharge(&parent->res, PAGE_SIZE);
1176        if (do_swap_account)
1177                res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1178        return ret;
1179}
1180
1181/*
1182 * Charge the memory controller for page usage.
1183 * Return
1184 * 0 if the charge was successful
1185 * < 0 if the cgroup is over its limit
1186 */
1187static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1188                                gfp_t gfp_mask, enum charge_type ctype,
1189                                struct mem_cgroup *memcg)
1190{
1191        struct mem_cgroup *mem;
1192        struct page_cgroup *pc;
1193        int ret;
1194
1195        pc = lookup_page_cgroup(page);
1196        /* can happen at boot */
1197        if (unlikely(!pc))
1198                return 0;
1199        prefetchw(pc);
1200
1201        mem = memcg;
1202        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1203        if (ret || !mem)
1204                return ret;
1205
1206        __mem_cgroup_commit_charge(mem, pc, ctype);
1207        return 0;
1208}
1209
1210int mem_cgroup_newpage_charge(struct page *page,
1211                              struct mm_struct *mm, gfp_t gfp_mask)
1212{
1213        if (mem_cgroup_disabled())
1214                return 0;
1215        if (PageCompound(page))
1216                return 0;
1217        /*
1218         * If already mapped, we don't have to account.
1219         * If page cache, page->mapping has address_space.
 1220         * But page->mapping may hold a stale anon_vma pointer;
 1221         * detect that with the PageAnon() check. A newly-mapped anonymous
 1222         * page's ->mapping is NULL.
1223         */
1224        if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1225                return 0;
1226        if (unlikely(!mm))
1227                mm = &init_mm;
1228        return mem_cgroup_charge_common(page, mm, gfp_mask,
1229                                MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1230}
1231
1232static void
1233__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1234                                        enum charge_type ctype);
1235
1236int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1237                                gfp_t gfp_mask)
1238{
1239        struct mem_cgroup *mem = NULL;
1240        int ret;
1241
1242        if (mem_cgroup_disabled())
1243                return 0;
1244        if (PageCompound(page))
1245                return 0;
1246        /*
 1247         * Corner case handling. This is usually called from
 1248         * add_to_page_cache(). But some filesystems (shmem) precharge the page
 1249         * before calling it, and call add_to_page_cache() with GFP_NOWAIT.
 1250         *
 1251         * In the GFP_NOWAIT case, the page may be pre-charged before
 1252         * add_to_page_cache() (see shmem.c); check for that here and avoid
 1253         * charging twice. (It works, but costs a bit more.)
 1254         * And when the page is SwapCache, swap information should be taken
 1255         * into account. This runs under lock_page() now.
1256         */
1257        if (!(gfp_mask & __GFP_WAIT)) {
1258                struct page_cgroup *pc;
1259
1260
1261                pc = lookup_page_cgroup(page);
1262                if (!pc)
1263                        return 0;
1264                lock_page_cgroup(pc);
1265                if (PageCgroupUsed(pc)) {
1266                        unlock_page_cgroup(pc);
1267                        return 0;
1268                }
1269                unlock_page_cgroup(pc);
1270        }
1271
1272        if (unlikely(!mm && !mem))
1273                mm = &init_mm;
1274
1275        if (page_is_file_cache(page))
1276                return mem_cgroup_charge_common(page, mm, gfp_mask,
1277                                MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1278
1279        /* shmem */
1280        if (PageSwapCache(page)) {
1281                ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1282                if (!ret)
1283                        __mem_cgroup_commit_charge_swapin(page, mem,
1284                                        MEM_CGROUP_CHARGE_TYPE_SHMEM);
1285        } else
1286                ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1287                                        MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1288
1289        return ret;
1290}
1291
1292/*
 1293 * During swap-in (try_charge -> commit or cancel), the page is locked.
 1294 * When try_charge() returns successfully, one refcnt on the memcg (not tied to
 1295 * a struct page_cgroup) is acquired. This refcnt will be consumed by
 1296 * "commit()" or released by "cancel()".
1297 */
1298int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1299                                 struct page *page,
1300                                 gfp_t mask, struct mem_cgroup **ptr)
1301{
1302        struct mem_cgroup *mem;
1303        int ret;
1304
1305        if (mem_cgroup_disabled())
1306                return 0;
1307
1308        if (!do_swap_account)
1309                goto charge_cur_mm;
1310        /*
1311         * A racing thread's fault, or swapoff, may have already updated
1312         * the pte, and even removed page from swap cache: return success
1313         * to go on to do_swap_page()'s pte_same() test, which should fail.
1314         */
1315        if (!PageSwapCache(page))
1316                return 0;
1317        mem = try_get_mem_cgroup_from_swapcache(page);
1318        if (!mem)
1319                goto charge_cur_mm;
1320        *ptr = mem;
1321        ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1322        /* drop extra refcnt from tryget */
1323        css_put(&mem->css);
1324        return ret;
1325charge_cur_mm:
1326        if (unlikely(!mm))
1327                mm = &init_mm;
1328        return __mem_cgroup_try_charge(mm, mask, ptr, true);
1329}
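
/*
 * Illustrative sketch of how the swap-in helpers pair up (roughly what
 * do_swap_page() in mm/memory.c does; error handling trimmed):
 *
 *        struct mem_cgroup *ptr = NULL;
 *
 *        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *                goto oom;
 *        ... map the page ...
 *        mem_cgroup_commit_charge_swapin(page, ptr);
 *
 * and, if mapping fails after a successful try_charge:
 *
 *        mem_cgroup_cancel_charge_swapin(ptr);
 *
 * The commit step also resolves the mem+swap double count; see
 * __mem_cgroup_commit_charge_swapin() below.
 */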
1330
1331static void
1332__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1333                                        enum charge_type ctype)
1334{
1335        struct page_cgroup *pc;
1336
1337        if (mem_cgroup_disabled())
1338                return;
1339        if (!ptr)
1340                return;
1341        pc = lookup_page_cgroup(page);
1342        mem_cgroup_lru_del_before_commit_swapcache(page);
1343        __mem_cgroup_commit_charge(ptr, pc, ctype);
1344        mem_cgroup_lru_add_after_commit_swapcache(page);
1345        /*
 1346         * Now the swapped page is in memory. This means the page may be
 1347         * counted both as mem and as swap (a double count).
 1348         * Fix it by uncharging from memsw. This SwapCache is normally stable
 1349         * under lock_page(), but reuse_swap_page() in do_swap_page() (memory.c)
 1350         * may call delete_from_swap_cache() before we reach here.
1351         */
1352        if (do_swap_account && PageSwapCache(page)) {
1353                swp_entry_t ent = {.val = page_private(page)};
1354                unsigned short id;
1355                struct mem_cgroup *memcg;
1356
1357                id = swap_cgroup_record(ent, 0);
1358                rcu_read_lock();
1359                memcg = mem_cgroup_lookup(id);
1360                if (memcg) {
1361                        /*
 1362                         * This recorded memcg may be an obsolete one, so
 1363                         * avoid calling css_tryget().
1364                         */
1365                        res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1366                        mem_cgroup_put(memcg);
1367                }
1368                rcu_read_unlock();
1369        }
1370        /* add this page(page_cgroup) to the LRU we want. */
1371
1372}
1373
1374void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1375{
1376        __mem_cgroup_commit_charge_swapin(page, ptr,
1377                                        MEM_CGROUP_CHARGE_TYPE_MAPPED);
1378}
1379
1380void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1381{
1382        if (mem_cgroup_disabled())
1383                return;
1384        if (!mem)
1385                return;
1386        res_counter_uncharge(&mem->res, PAGE_SIZE);
1387        if (do_swap_account)
1388                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1389        css_put(&mem->css);
1390}
1391
1392
1393/*
1394 * uncharge if !page_mapped(page)
1395 */
1396static struct mem_cgroup *
1397__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1398{
1399        struct page_cgroup *pc;
1400        struct mem_cgroup *mem = NULL;
1401        struct mem_cgroup_per_zone *mz;
1402
1403        if (mem_cgroup_disabled())
1404                return NULL;
1405
1406        if (PageSwapCache(page))
1407                return NULL;
1408
1409        /*
1410         * Check if our page_cgroup is valid
1411         */
1412        pc = lookup_page_cgroup(page);
1413        if (unlikely(!pc || !PageCgroupUsed(pc)))
1414                return NULL;
1415
1416        lock_page_cgroup(pc);
1417
1418        mem = pc->mem_cgroup;
1419
1420        if (!PageCgroupUsed(pc))
1421                goto unlock_out;
1422
1423        switch (ctype) {
1424        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1425                if (page_mapped(page))
1426                        goto unlock_out;
1427                break;
1428        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1429                if (!PageAnon(page)) {  /* Shared memory */
1430                        if (page->mapping && !page_is_file_cache(page))
1431                                goto unlock_out;
1432                } else if (page_mapped(page)) /* Anon */
1433                                goto unlock_out;
1434                break;
1435        default:
1436                break;
1437        }
1438
1439        res_counter_uncharge(&mem->res, PAGE_SIZE);
1440        if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1441                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1442        mem_cgroup_charge_statistics(mem, pc, false);
1443
1444        ClearPageCgroupUsed(pc);
1445        /*
1446         * pc->mem_cgroup is not cleared here. It will be accessed when it's
 1447         * pc->mem_cgroup is not cleared here. It will be accessed when the
 1448         * page is freed from the LRU. This is safe because an uncharged page
 1449         * is expected not to be reused (it is freed soon). The exception is
 1450         * SwapCache, which is handled by special functions.
1451
1452        mz = page_cgroup_zoneinfo(pc);
1453        unlock_page_cgroup(pc);
1454
1455        /* at swapout, this memcg will be accessed to record to swap */
1456        if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1457                css_put(&mem->css);
1458
1459        return mem;
1460
1461unlock_out:
1462        unlock_page_cgroup(pc);
1463        return NULL;
1464}
1465
1466void mem_cgroup_uncharge_page(struct page *page)
1467{
1468        /* early check. */
1469        if (page_mapped(page))
1470                return;
1471        if (page->mapping && !PageAnon(page))
1472                return;
1473        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1474}
1475
1476void mem_cgroup_uncharge_cache_page(struct page *page)
1477{
1478        VM_BUG_ON(page_mapped(page));
1479        VM_BUG_ON(page->mapping);
1480        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1481}
1482
1483#ifdef CONFIG_SWAP
1484/*
 1485 * Called after __delete_from_swap_cache(); drops the "page" account.
 1486 * The memcg information is recorded in the swap_cgroup of "ent".
1487 */
1488void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1489{
1490        struct mem_cgroup *memcg;
1491
1492        memcg = __mem_cgroup_uncharge_common(page,
1493                                        MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1494        /* record memcg information */
1495        if (do_swap_account && memcg) {
1496                swap_cgroup_record(ent, css_id(&memcg->css));
1497                mem_cgroup_get(memcg);
1498        }
1499        if (memcg)
1500                css_put(&memcg->css);
1501}
1502#endif
1503
1504#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1505/*
 1506 * Called from swap_entry_free(). Removes the record in swap_cgroup and
 1507 * uncharges the "memsw" account.
1508 */
1509void mem_cgroup_uncharge_swap(swp_entry_t ent)
1510{
1511        struct mem_cgroup *memcg;
1512        unsigned short id;
1513
1514        if (!do_swap_account)
1515                return;
1516
1517        id = swap_cgroup_record(ent, 0);
1518        rcu_read_lock();
1519        memcg = mem_cgroup_lookup(id);
1520        if (memcg) {
1521                /*
 1522                 * We uncharge this because the swap entry is freed.
 1523                 * This memcg may be an obsolete one; we avoid calling css_tryget().
1524                 */
1525                res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1526                mem_cgroup_put(memcg);
1527        }
1528        rcu_read_unlock();
1529}
1530#endif
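
/*
 * Illustrative summary of the swap accounting hand-off above: at swap-out,
 * mem_cgroup_uncharge_swapcache() keeps the memsw charge, records
 * css_id(&memcg->css) in the swap_cgroup for the entry and takes
 * mem_cgroup_get(); when the swap entry is finally freed,
 * mem_cgroup_uncharge_swap() looks that id up again, uncharges memsw and
 * drops the reference with mem_cgroup_put(). The plain "memory" counter was
 * already uncharged at swap-out time.
 */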
1531
1532/*
1533 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1534 * page belongs to.
1535 */
1536int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1537{
1538        struct page_cgroup *pc;
1539        struct mem_cgroup *mem = NULL;
1540        int ret = 0;
1541
1542        if (mem_cgroup_disabled())
1543                return 0;
1544
1545        pc = lookup_page_cgroup(page);
1546        lock_page_cgroup(pc);
1547        if (PageCgroupUsed(pc)) {
1548                mem = pc->mem_cgroup;
1549                css_get(&mem->css);
1550        }
1551        unlock_page_cgroup(pc);
1552
1553        if (mem) {
1554                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
1555                css_put(&mem->css);
1556        }
1557        *ptr = mem;
1558        return ret;
1559}
1560
 1561/* remove the redundant charge if migration failed */
1562void mem_cgroup_end_migration(struct mem_cgroup *mem,
1563                struct page *oldpage, struct page *newpage)
1564{
1565        struct page *target, *unused;
1566        struct page_cgroup *pc;
1567        enum charge_type ctype;
1568
1569        if (!mem)
1570                return;
1571
1572        /* at migration success, oldpage->mapping is NULL. */
1573        if (oldpage->mapping) {
1574                target = oldpage;
1575                unused = NULL;
1576        } else {
1577                target = newpage;
1578                unused = oldpage;
1579        }
1580
1581        if (PageAnon(target))
1582                ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1583        else if (page_is_file_cache(target))
1584                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1585        else
1586                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1587
1588        /* unused page is not on radix-tree now. */
1589        if (unused)
1590                __mem_cgroup_uncharge_common(unused, ctype);
1591
1592        pc = lookup_page_cgroup(target);
1593        /*
1594         * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
1595         * So, double-counting is effectively avoided.
1596         */
1597        __mem_cgroup_commit_charge(mem, pc, ctype);
1598
1599        /*
 1600         * Both oldpage and newpage are still under lock_page(),
 1601         * so we don't have to care about races in the radix-tree.
 1602         * But we do have to care whether this page is mapped or not.
 1603         *
 1604         * There is a !page_mapped() case: at the start of migration,
 1605         * oldpage was mapped, but by now it may have been zapped.
 1606         * We know the *target* page is not freed/reused under us, and
 1607         * mem_cgroup_uncharge_page() does all the necessary checks.
1608         */
1609        if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1610                mem_cgroup_uncharge_page(target);
1611}
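
/*
 * Illustrative only -- the call sequence expected from the page-migration
 * core looks roughly like this (a sketch, not a verbatim copy of
 * mm/migrate.c):
 *
 *      mem_cgroup_prepare_migration(oldpage, &mem);    (charge held on mem)
 *      ... copy and replace the page ...
 *      mem_cgroup_end_migration(mem, oldpage, newpage);
 *
 * end_migration() commits the held charge to whichever page survived and
 * uncharges the one that is no longer on the radix-tree.
 */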
1612
1613/*
1614 * A call to try to shrink memory usage on a charge failure at shmem's swapin.
1615 * Calling hierarchical_reclaim is not enough because we should update
1616 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
1617 * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
1618 * not from the memcg which this page would be charged to.
1619 * try_charge_swapin does all of this work properly.
1620 */
1621int mem_cgroup_shmem_charge_fallback(struct page *page,
1622                            struct mm_struct *mm,
1623                            gfp_t gfp_mask)
1624{
1625        struct mem_cgroup *mem = NULL;
1626        int ret;
1627
1628        if (mem_cgroup_disabled())
1629                return 0;
1630
1631        ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1632        if (!ret)
1633                mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
1634
1635        return ret;
1636}
1637
1638static DEFINE_MUTEX(set_limit_mutex);
1639
1640static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1641                                unsigned long long val)
1642{
1643        int retry_count;
1644        int progress;
1645        u64 memswlimit;
1646        int ret = 0;
1647        int children = mem_cgroup_count_children(memcg);
1648        u64 curusage, oldusage;
1649
1650        /*
1651         * To keep hierarchical_reclaim simple, how long we should retry
1652         * depends on the caller. We set our retry count to be a function
1653         * of the number of children we should visit in this loop.
1654         */
1655        retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
1656
1657        oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1658
1659        while (retry_count) {
1660                if (signal_pending(current)) {
1661                        ret = -EINTR;
1662                        break;
1663                }
1664                /*
1665                 * Rather than hiding all of this in some function, do it
1666                 * open-coded so it is clear what really happens.
1667                 * We have to guarantee that mem->res.limit does not exceed
1668                 * mem->memsw.limit.
1669                 */
1669                mutex_lock(&set_limit_mutex);
1670                memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1671                if (memswlimit < val) {
1672                        ret = -EINVAL;
1673                        mutex_unlock(&set_limit_mutex);
1674                        break;
1675                }
1676                ret = res_counter_set_limit(&memcg->res, val);
1677                mutex_unlock(&set_limit_mutex);
1678
1679                if (!ret)
1680                        break;
1681
1682                progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1683                                                   false, true);
1684                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1685                /* Usage is reduced ? */
1686                if (curusage >= oldusage)
1687                        retry_count--;
1688                else
1689                        oldusage = curusage;
1690        }
1691
1692        return ret;
1693}
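
/*
 * Illustrative only: mem_cgroup_resize_limit() is what ultimately services a
 * write to memory.limit_in_bytes. A minimal userspace sketch, assuming the
 * memory controller is mounted at /cgroups/memory (the mount point and the
 * group name "A" are just examples):
 *
 *      # echo 64M > /cgroups/memory/A/memory.limit_in_bytes
 *
 * The value is parsed by res_counter_memparse_write_strategy() (so K/M/G
 * suffixes are accepted), and reclaim is retried until the new limit can be
 * set or the retry budget (MEM_CGROUP_RECLAIM_RETRIES times the number of
 * children) runs out.
 */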
1694
1695int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1696                                unsigned long long val)
1697{
1698        int retry_count;
1699        u64 memlimit, oldusage, curusage;
1700        int children = mem_cgroup_count_children(memcg);
1701        int ret = -EBUSY;
1702
1703        if (!do_swap_account)
1704                return -EINVAL;
1705        /* see mem_cgroup_resize_limit() */
1706        retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1707        oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1708        while (retry_count) {
1709                if (signal_pending(current)) {
1710                        ret = -EINTR;
1711                        break;
1712                }
1713                /*
1714                 * Rather than hiding all of this in some function, do it
1715                 * open-coded so it is clear what really happens.
1716                 * We have to guarantee that mem->res.limit does not exceed
1717                 * mem->memsw.limit.
1718                 */
1718                mutex_lock(&set_limit_mutex);
1719                memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1720                if (memlimit > val) {
1721                        ret = -EINVAL;
1722                        mutex_unlock(&set_limit_mutex);
1723                        break;
1724                }
1725                ret = res_counter_set_limit(&memcg->memsw, val);
1726                mutex_unlock(&set_limit_mutex);
1727
1728                if (!ret)
1729                        break;
1730
1731                mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
1732                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1733                /* Usage is reduced ? */
1734                if (curusage >= oldusage)
1735                        retry_count--;
1736                else
1737                        oldusage = curusage;
1738        }
1739        return ret;
1740}
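
/*
 * Illustrative only: the mem+swap limit requires swap accounting to be
 * enabled and must not be set below the plain memory limit (see the
 * memlimit > val check above). Using the same example mount point:
 *
 *      # echo 64M  > /cgroups/memory/A/memory.limit_in_bytes
 *      # echo 128M > /cgroups/memory/A/memory.memsw.limit_in_bytes
 *
 * Writing a memsw limit smaller than memory.limit_in_bytes fails with
 * -EINVAL.
 */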
1741
1742/*
1743 * This routine traverses the page_cgroups on the given list and drops them all.
1744 * *And* it doesn't reclaim the pages themselves, it just removes the page_cgroups.
1745 */
1746static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1747                                int node, int zid, enum lru_list lru)
1748{
1749        struct zone *zone;
1750        struct mem_cgroup_per_zone *mz;
1751        struct page_cgroup *pc, *busy;
1752        unsigned long flags, loop;
1753        struct list_head *list;
1754        int ret = 0;
1755
1756        zone = &NODE_DATA(node)->node_zones[zid];
1757        mz = mem_cgroup_zoneinfo(mem, node, zid);
1758        list = &mz->lists[lru];
1759
1760        loop = MEM_CGROUP_ZSTAT(mz, lru);
1761        /* give some margin against -EBUSY etc. */
1762        loop += 256;
1763        busy = NULL;
1764        while (loop--) {
1765                ret = 0;
1766                spin_lock_irqsave(&zone->lru_lock, flags);
1767                if (list_empty(list)) {
1768                        spin_unlock_irqrestore(&zone->lru_lock, flags);
1769                        break;
1770                }
1771                pc = list_entry(list->prev, struct page_cgroup, lru);
1772                if (busy == pc) {
1773                        list_move(&pc->lru, list);
1774                        busy = NULL;
1775                        spin_unlock_irqrestore(&zone->lru_lock, flags);
1776                        continue;
1777                }
1778                spin_unlock_irqrestore(&zone->lru_lock, flags);
1779
1780                ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1781                if (ret == -ENOMEM)
1782                        break;
1783
1784                if (ret == -EBUSY || ret == -EINVAL) {
1785                        /* found lock contention or "pc" is obsolete. */
1786                        busy = pc;
1787                        cond_resched();
1788                } else
1789                        busy = NULL;
1790        }
1791
1792        if (!ret && !list_empty(list))
1793                return -EBUSY;
1794        return ret;
1795}
1796
1797/*
1798 * Make the mem_cgroup's charge 0 if there are no tasks in it.
1799 * This makes it possible to delete this mem_cgroup.
1800 */
1801static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1802{
1803        int ret;
1804        int node, zid, shrink;
1805        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1806        struct cgroup *cgrp = mem->css.cgroup;
1807
1808        css_get(&mem->css);
1809
1810        shrink = 0;
1811        /* should free all ? */
1812        if (free_all)
1813                goto try_to_free;
1814move_account:
1815        while (mem->res.usage > 0) {
1816                ret = -EBUSY;
1817                if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1818                        goto out;
1819                ret = -EINTR;
1820                if (signal_pending(current))
1821                        goto out;
1822                /* This is for making sure all *used* pages are on an LRU. */
1823                lru_add_drain_all();
1824                ret = 0;
1825                for_each_node_state(node, N_HIGH_MEMORY) {
1826                        for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1827                                enum lru_list l;
1828                                for_each_lru(l) {
1829                                        ret = mem_cgroup_force_empty_list(mem,
1830                                                        node, zid, l);
1831                                        if (ret)
1832                                                break;
1833                                }
1834                        }
1835                        if (ret)
1836                                break;
1837                }
1838                /* the parent cgroup doesn't seem to have enough memory */
1839                if (ret == -ENOMEM)
1840                        goto try_to_free;
1841                cond_resched();
1842        }
1843        ret = 0;
1844out:
1845        css_put(&mem->css);
1846        return ret;
1847
1848try_to_free:
1849        /* returns -EBUSY if there is a task or if we come here twice. */
1850        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1851                ret = -EBUSY;
1852                goto out;
1853        }
1854        /* we call try-to-free pages to make this cgroup empty */
1855        lru_add_drain_all();
1856        /* try to free all pages in this cgroup */
1857        shrink = 1;
1858        while (nr_retries && mem->res.usage > 0) {
1859                int progress;
1860
1861                if (signal_pending(current)) {
1862                        ret = -EINTR;
1863                        goto out;
1864                }
1865                progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1866                                                false, get_swappiness(mem));
1867                if (!progress) {
1868                        nr_retries--;
1869                        /* maybe some writeback is necessary */
1870                        congestion_wait(WRITE, HZ/10);
1871                }
1872
1873        }
1874        lru_add_drain();
1875        /* try move_account...there may be some *locked* pages. */
1876        if (mem->res.usage)
1877                goto move_account;
1878        ret = 0;
1879        goto out;
1880}
1881
1882int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1883{
1884        return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1885}
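
/*
 * Illustrative only: force_empty is typically used right before removing a
 * group (same example mount point as above):
 *
 *      # echo 0 > /cgroups/memory/A/memory.force_empty
 *      # rmdir /cgroups/memory/A
 *
 * The written value is not interpreted; the write itself is the trigger.
 * It fails with -EBUSY while tasks or child groups are still attached.
 */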
1886
1887
1888static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1889{
1890        return mem_cgroup_from_cont(cont)->use_hierarchy;
1891}
1892
1893static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1894                                        u64 val)
1895{
1896        int retval = 0;
1897        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1898        struct cgroup *parent = cont->parent;
1899        struct mem_cgroup *parent_mem = NULL;
1900
1901        if (parent)
1902                parent_mem = mem_cgroup_from_cont(parent);
1903
1904        cgroup_lock();
1905        /*
1906         * If the parent's use_hierarchy is set, we can't make any modifications
1907         * in the child subtrees. If it is unset, then the change can
1908         * occur, provided the current cgroup has no children.
1909         *
1910         * For the root cgroup, parent_mem is NULL; we allow the value to be
1911         * set if there are no children.
1912         */
1913        if ((!parent_mem || !parent_mem->use_hierarchy) &&
1914                                (val == 1 || val == 0)) {
1915                if (list_empty(&cont->children))
1916                        mem->use_hierarchy = val;
1917                else
1918                        retval = -EBUSY;
1919        } else
1920                retval = -EINVAL;
1921        cgroup_unlock();
1922
1923        return retval;
1924}
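
/*
 * Illustrative only: use_hierarchy can only be flipped while the group has
 * no children and the parent does not already enforce hierarchy. Example:
 *
 *      # echo 1 > /cgroups/memory/A/memory.use_hierarchy
 *      # mkdir /cgroups/memory/A/B
 *
 * B inherits use_hierarchy=1 (see mem_cgroup_create()); repeating the echo
 * once B exists returns -EBUSY.
 */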
1925
1926static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1927{
1928        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1929        u64 val = 0;
1930        int type, name;
1931
1932        type = MEMFILE_TYPE(cft->private);
1933        name = MEMFILE_ATTR(cft->private);
1934        switch (type) {
1935        case _MEM:
1936                val = res_counter_read_u64(&mem->res, name);
1937                break;
1938        case _MEMSWAP:
1939                if (do_swap_account)
1940                        val = res_counter_read_u64(&mem->memsw, name);
1941                break;
1942        default:
1943                BUG();
1944                break;
1945        }
1946        return val;
1947}
1948/*
1949 * The user of this function is the write handler for
1950 * RES_LIMIT.
1951 */
1952static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1953                            const char *buffer)
1954{
1955        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1956        int type, name;
1957        unsigned long long val;
1958        int ret;
1959
1960        type = MEMFILE_TYPE(cft->private);
1961        name = MEMFILE_ATTR(cft->private);
1962        switch (name) {
1963        case RES_LIMIT:
1964                /* This function does all the necessary parsing; reuse it. */
1965                ret = res_counter_memparse_write_strategy(buffer, &val);
1966                if (ret)
1967                        break;
1968                if (type == _MEM)
1969                        ret = mem_cgroup_resize_limit(memcg, val);
1970                else
1971                        ret = mem_cgroup_resize_memsw_limit(memcg, val);
1972                break;
1973        default:
1974                ret = -EINVAL; /* should be BUG() ? */
1975                break;
1976        }
1977        return ret;
1978}
1979
1980static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
1981                unsigned long long *mem_limit, unsigned long long *memsw_limit)
1982{
1983        struct cgroup *cgroup;
1984        unsigned long long min_limit, min_memsw_limit, tmp;
1985
1986        min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1987        min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1988        cgroup = memcg->css.cgroup;
1989        if (!memcg->use_hierarchy)
1990                goto out;
1991
1992        while (cgroup->parent) {
1993                cgroup = cgroup->parent;
1994                memcg = mem_cgroup_from_cont(cgroup);
1995                if (!memcg->use_hierarchy)
1996                        break;
1997                tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
1998                min_limit = min(min_limit, tmp);
1999                tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2000                min_memsw_limit = min(min_memsw_limit, tmp);
2001        }
2002out:
2003        *mem_limit = min_limit;
2004        *memsw_limit = min_memsw_limit;
2005        return;
2006}
2007
2008static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2009{
2010        struct mem_cgroup *mem;
2011        int type, name;
2012
2013        mem = mem_cgroup_from_cont(cont);
2014        type = MEMFILE_TYPE(event);
2015        name = MEMFILE_ATTR(event);
2016        switch (name) {
2017        case RES_MAX_USAGE:
2018                if (type == _MEM)
2019                        res_counter_reset_max(&mem->res);
2020                else
2021                        res_counter_reset_max(&mem->memsw);
2022                break;
2023        case RES_FAILCNT:
2024                if (type == _MEM)
2025                        res_counter_reset_failcnt(&mem->res);
2026                else
2027                        res_counter_reset_failcnt(&mem->memsw);
2028                break;
2029        }
2030        return 0;
2031}
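
/*
 * Illustrative only: the max_usage and failcnt counters are reset by writing
 * to the corresponding files, e.g.
 *
 *      # echo 0 > /cgroups/memory/A/memory.max_usage_in_bytes
 *      # echo 0 > /cgroups/memory/A/memory.failcnt
 *
 * As with force_empty, the written value is not interpreted.
 */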
2032
2033
2034/* For reading statistics */
2035enum {
2036        MCS_CACHE,
2037        MCS_RSS,
2038        MCS_PGPGIN,
2039        MCS_PGPGOUT,
2040        MCS_INACTIVE_ANON,
2041        MCS_ACTIVE_ANON,
2042        MCS_INACTIVE_FILE,
2043        MCS_ACTIVE_FILE,
2044        MCS_UNEVICTABLE,
2045        NR_MCS_STAT,
2046};
2047
2048struct mcs_total_stat {
2049        s64 stat[NR_MCS_STAT];
2050};
2051
2052struct {
2053        char *local_name;
2054        char *total_name;
2055} memcg_stat_strings[NR_MCS_STAT] = {
2056        {"cache", "total_cache"},
2057        {"rss", "total_rss"},
2058        {"pgpgin", "total_pgpgin"},
2059        {"pgpgout", "total_pgpgout"},
2060        {"inactive_anon", "total_inactive_anon"},
2061        {"active_anon", "total_active_anon"},
2062        {"inactive_file", "total_inactive_file"},
2063        {"active_file", "total_active_file"},
2064        {"unevictable", "total_unevictable"}
2065};
2066
2067
2068static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2069{
2070        struct mcs_total_stat *s = data;
2071        s64 val;
2072
2073        /* per cpu stat */
2074        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2075        s->stat[MCS_CACHE] += val * PAGE_SIZE;
2076        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2077        s->stat[MCS_RSS] += val * PAGE_SIZE;
2078        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2079        s->stat[MCS_PGPGIN] += val;
2080        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2081        s->stat[MCS_PGPGOUT] += val;
2082
2083        /* per zone stat */
2084        val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2085        s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2086        val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2087        s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2088        val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2089        s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2090        val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2091        s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2092        val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2093        s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2094        return 0;
2095}
2096
2097static void
2098mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2099{
2100        mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2101}
2102
2103static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2104                                 struct cgroup_map_cb *cb)
2105{
2106        struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
2107        struct mcs_total_stat mystat;
2108        int i;
2109
2110        memset(&mystat, 0, sizeof(mystat));
2111        mem_cgroup_get_local_stat(mem_cont, &mystat);
2112
2113        for (i = 0; i < NR_MCS_STAT; i++)
2114                cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2115
2116        /* Hierarchical information */
2117        {
2118                unsigned long long limit, memsw_limit;
2119                memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
2120                cb->fill(cb, "hierarchical_memory_limit", limit);
2121                if (do_swap_account)
2122                        cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
2123        }
2124
2125        memset(&mystat, 0, sizeof(mystat));
2126        mem_cgroup_get_total_stat(mem_cont, &mystat);
2127        for (i = 0; i < NR_MCS_STAT; i++)
2128                cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2129
2130
2131#ifdef CONFIG_DEBUG_VM
2132        cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
2133
2134        {
2135                int nid, zid;
2136                struct mem_cgroup_per_zone *mz;
2137                unsigned long recent_rotated[2] = {0, 0};
2138                unsigned long recent_scanned[2] = {0, 0};
2139
2140                for_each_online_node(nid)
2141                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2142                                mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
2143
2144                                recent_rotated[0] +=
2145                                        mz->reclaim_stat.recent_rotated[0];
2146                                recent_rotated[1] +=
2147                                        mz->reclaim_stat.recent_rotated[1];
2148                                recent_scanned[0] +=
2149                                        mz->reclaim_stat.recent_scanned[0];
2150                                recent_scanned[1] +=
2151                                        mz->reclaim_stat.recent_scanned[1];
2152                        }
2153                cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
2154                cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
2155                cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
2156                cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
2157        }
2158#endif
2159
2160        return 0;
2161}
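
/*
 * Illustrative only: memory.stat reports the local counters first, then the
 * hierarchical limits, then the "total_*" counters gathered via
 * mem_cgroup_walk_tree(). An abridged read might look like this (the values
 * are made up and workload dependent):
 *
 *      # cat /cgroups/memory/A/memory.stat
 *      cache 1048576
 *      rss 2097152
 *      pgpgin 1024
 *      ...
 *      hierarchical_memory_limit 67108864
 *      total_cache 1048576
 *      ...
 */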
2162
2163static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
2164{
2165        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2166
2167        return get_swappiness(memcg);
2168}
2169
2170static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
2171                                       u64 val)
2172{
2173        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2174        struct mem_cgroup *parent;
2175
2176        if (val > 100)
2177                return -EINVAL;
2178
2179        if (cgrp->parent == NULL)
2180                return -EINVAL;
2181
2182        parent = mem_cgroup_from_cont(cgrp->parent);
2183
2184        cgroup_lock();
2185
2186        /* If under a hierarchy, only an empty root can set this value */
2187        if ((parent->use_hierarchy) ||
2188            (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2189                cgroup_unlock();
2190                return -EINVAL;
2191        }
2192
2193        spin_lock(&memcg->reclaim_param_lock);
2194        memcg->swappiness = val;
2195        spin_unlock(&memcg->reclaim_param_lock);
2196
2197        cgroup_unlock();
2198
2199        return 0;
2200}
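
/*
 * Illustrative only: memory.swappiness sets the swappiness used when
 * reclaiming from this group (see get_swappiness() and
 * try_to_free_mem_cgroup_pages()):
 *
 *      # echo 10 > /cgroups/memory/A/memory.swappiness
 *
 * Values above 100 are rejected, and the root group, as well as groups
 * constrained by the hierarchy checks above, return -EINVAL.
 */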
2201
2202
2203static struct cftype mem_cgroup_files[] = {
2204        {
2205                .name = "usage_in_bytes",
2206                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2207                .read_u64 = mem_cgroup_read,
2208        },
2209        {
2210                .name = "max_usage_in_bytes",
2211                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2212                .trigger = mem_cgroup_reset,
2213                .read_u64 = mem_cgroup_read,
2214        },
2215        {
2216                .name = "limit_in_bytes",
2217                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2218                .write_string = mem_cgroup_write,
2219                .read_u64 = mem_cgroup_read,
2220        },
2221        {
2222                .name = "failcnt",
2223                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2224                .trigger = mem_cgroup_reset,
2225                .read_u64 = mem_cgroup_read,
2226        },
2227        {
2228                .name = "stat",
2229                .read_map = mem_control_stat_show,
2230        },
2231        {
2232                .name = "force_empty",
2233                .trigger = mem_cgroup_force_empty_write,
2234        },
2235        {
2236                .name = "use_hierarchy",
2237                .write_u64 = mem_cgroup_hierarchy_write,
2238                .read_u64 = mem_cgroup_hierarchy_read,
2239        },
2240        {
2241                .name = "swappiness",
2242                .read_u64 = mem_cgroup_swappiness_read,
2243                .write_u64 = mem_cgroup_swappiness_write,
2244        },
2245};
2246
2247#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2248static struct cftype memsw_cgroup_files[] = {
2249        {
2250                .name = "memsw.usage_in_bytes",
2251                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2252                .read_u64 = mem_cgroup_read,
2253        },
2254        {
2255                .name = "memsw.max_usage_in_bytes",
2256                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2257                .trigger = mem_cgroup_reset,
2258                .read_u64 = mem_cgroup_read,
2259        },
2260        {
2261                .name = "memsw.limit_in_bytes",
2262                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2263                .write_string = mem_cgroup_write,
2264                .read_u64 = mem_cgroup_read,
2265        },
2266        {
2267                .name = "memsw.failcnt",
2268                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2269                .trigger = mem_cgroup_reset,
2270                .read_u64 = mem_cgroup_read,
2271        },
2272};
2273
2274static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2275{
2276        if (!do_swap_account)
2277                return 0;
2278        return cgroup_add_files(cont, ss, memsw_cgroup_files,
2279                                ARRAY_SIZE(memsw_cgroup_files));
2280};
2281#else
2282static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2283{
2284        return 0;
2285}
2286#endif
2287
2288static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2289{
2290        struct mem_cgroup_per_node *pn;
2291        struct mem_cgroup_per_zone *mz;
2292        enum lru_list l;
2293        int zone, tmp = node;
2294        /*
2295         * This routine is called against possible nodes.
2296         * But it's a BUG to call kmalloc() against an offline node.
2297         *
2298         * TODO: this routine can waste a lot of memory for nodes which will
2299         *       never be onlined. It would be better to use a memory hotplug
2300         *       callback function.
2301         */
2302        if (!node_state(node, N_NORMAL_MEMORY))
2303                tmp = -1;
2304        pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
2305        if (!pn)
2306                return 1;
2307
2308        mem->info.nodeinfo[node] = pn;
2309        memset(pn, 0, sizeof(*pn));
2310
2311        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
2312                mz = &pn->zoneinfo[zone];
2313                for_each_lru(l)
2314                        INIT_LIST_HEAD(&mz->lists[l]);
2315        }
2316        return 0;
2317}
2318
2319static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2320{
2321        kfree(mem->info.nodeinfo[node]);
2322}
2323
2324static int mem_cgroup_size(void)
2325{
2326        int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2327        return sizeof(struct mem_cgroup) + cpustat_size;
2328}
2329
2330static struct mem_cgroup *mem_cgroup_alloc(void)
2331{
2332        struct mem_cgroup *mem;
2333        int size = mem_cgroup_size();
2334
2335        if (size < PAGE_SIZE)
2336                mem = kmalloc(size, GFP_KERNEL);
2337        else
2338                mem = vmalloc(size);
2339
2340        if (mem)
2341                memset(mem, 0, size);
2342        return mem;
2343}
2344
2345/*
2346 * When destroying a mem_cgroup, references from swap_cgroup can still remain.
2347 * (Scanning them all at force_empty is too costly...)
2348 *
2349 * Instead of clearing all references at force_empty, we remember
2350 * the number of references from swap_cgroup and free the mem_cgroup when
2351 * that count goes down to 0.
2352 *
2353 * Removal of the cgroup itself succeeds regardless of refs from swap.
2354 */
2355
2356static void __mem_cgroup_free(struct mem_cgroup *mem)
2357{
2358        int node;
2359
2360        free_css_id(&mem_cgroup_subsys, &mem->css);
2361
2362        for_each_node_state(node, N_POSSIBLE)
2363                free_mem_cgroup_per_zone_info(mem, node);
2364
2365        if (mem_cgroup_size() < PAGE_SIZE)
2366                kfree(mem);
2367        else
2368                vfree(mem);
2369}
2370
2371static void mem_cgroup_get(struct mem_cgroup *mem)
2372{
2373        atomic_inc(&mem->refcnt);
2374}
2375
2376static void mem_cgroup_put(struct mem_cgroup *mem)
2377{
2378        if (atomic_dec_and_test(&mem->refcnt)) {
2379                struct mem_cgroup *parent = parent_mem_cgroup(mem);
2380                __mem_cgroup_free(mem);
2381                if (parent)
2382                        mem_cgroup_put(parent);
2383        }
2384}
2385
2386/*
2387 * Returns the parent mem_cgroup in the memory cgroup hierarchy, if hierarchy is enabled.
2388 */
2389static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
2390{
2391        if (!mem->res.parent)
2392                return NULL;
2393        return mem_cgroup_from_res_counter(mem->res.parent, res);
2394}
2395
2396#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2397static void __init enable_swap_cgroup(void)
2398{
2399        if (!mem_cgroup_disabled() && really_do_swap_account)
2400                do_swap_account = 1;
2401}
2402#else
2403static void __init enable_swap_cgroup(void)
2404{
2405}
2406#endif
2407
2408static struct cgroup_subsys_state * __ref
2409mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2410{
2411        struct mem_cgroup *mem, *parent;
2412        long error = -ENOMEM;
2413        int node;
2414
2415        mem = mem_cgroup_alloc();
2416        if (!mem)
2417                return ERR_PTR(error);
2418
2419        for_each_node_state(node, N_POSSIBLE)
2420                if (alloc_mem_cgroup_per_zone_info(mem, node))
2421                        goto free_out;
2422        /* root ? */
2423        if (cont->parent == NULL) {
2424                enable_swap_cgroup();
2425                parent = NULL;
2426        } else {
2427                parent = mem_cgroup_from_cont(cont->parent);
2428                mem->use_hierarchy = parent->use_hierarchy;
2429        }
2430
2431        if (parent && parent->use_hierarchy) {
2432                res_counter_init(&mem->res, &parent->res);
2433                res_counter_init(&mem->memsw, &parent->memsw);
2434                /*
2435                 * We increment the refcnt of the parent to ensure that we can
2436                 * safely access it on res_counter_charge/uncharge.
2437                 * This refcnt will be decremented when freeing this
2438                 * mem_cgroup (see mem_cgroup_put()).
2439                 */
2440                mem_cgroup_get(parent);
2441        } else {
2442                res_counter_init(&mem->res, NULL);
2443                res_counter_init(&mem->memsw, NULL);
2444        }
2445        mem->last_scanned_child = 0;
2446        spin_lock_init(&mem->reclaim_param_lock);
2447
2448        if (parent)
2449                mem->swappiness = get_swappiness(parent);
2450        atomic_set(&mem->refcnt, 1);
2451        return &mem->css;
2452free_out:
2453        __mem_cgroup_free(mem);
2454        return ERR_PTR(error);
2455}
2456
2457static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2458                                        struct cgroup *cont)
2459{
2460        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2461
2462        return mem_cgroup_force_empty(mem, false);
2463}
2464
2465static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2466                                struct cgroup *cont)
2467{
2468        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2469
2470        mem_cgroup_put(mem);
2471}
2472
2473static int mem_cgroup_populate(struct cgroup_subsys *ss,
2474                                struct cgroup *cont)
2475{
2476        int ret;
2477
2478        ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2479                                ARRAY_SIZE(mem_cgroup_files));
2480
2481        if (!ret)
2482                ret = register_memsw_files(cont, ss);
2483        return ret;
2484}
2485
2486static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2487                                struct cgroup *cont,
2488                                struct cgroup *old_cont,
2489                                struct task_struct *p)
2490{
2491        mutex_lock(&memcg_tasklist);
2492        /*
2493         * FIXME: It would be better to move this process's charges from the
2494         * old memcg to the new one, but that is still on the TODO list.
2495         */
2496        mutex_unlock(&memcg_tasklist);
2497}
2498
2499struct cgroup_subsys mem_cgroup_subsys = {
2500        .name = "memory",
2501        .subsys_id = mem_cgroup_subsys_id,
2502        .create = mem_cgroup_create,
2503        .pre_destroy = mem_cgroup_pre_destroy,
2504        .destroy = mem_cgroup_destroy,
2505        .populate = mem_cgroup_populate,
2506        .attach = mem_cgroup_move_task,
2507        .early_init = 0,
2508        .use_id = 1,
2509};
2510
2511#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2512
2513static int __init disable_swap_account(char *s)
2514{
2515        really_do_swap_account = 0;
2516        return 1;
2517}
2518__setup("noswapaccount", disable_swap_account);
2519#endif
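
/*
 * Illustrative only: with CONFIG_CGROUP_MEM_RES_CTLR_SWAP enabled, swap
 * accounting can still be switched off at boot time by appending
 * "noswapaccount" to the kernel command line. In that case do_swap_account
 * stays 0 and the memory.memsw.* files are never created (see
 * register_memsw_files()).
 */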
2520