linux/mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License as published by
  11 * the Free Software Foundation; either version 2 of the License, or
  12 * (at your option) any later version.
  13 *
  14 * This program is distributed in the hope that it will be useful,
  15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 * GNU General Public License for more details.
  18 */
  19
  20#include <linux/res_counter.h>
  21#include <linux/memcontrol.h>
  22#include <linux/cgroup.h>
  23#include <linux/mm.h>
  24#include <linux/smp.h>
  25#include <linux/page-flags.h>
  26#include <linux/backing-dev.h>
  27#include <linux/bit_spinlock.h>
  28#include <linux/rcupdate.h>
  29#include <linux/slab.h>
  30#include <linux/swap.h>
  31#include <linux/spinlock.h>
  32#include <linux/fs.h>
  33#include <linux/seq_file.h>
  34#include <linux/vmalloc.h>
  35#include <linux/mm_inline.h>
  36#include <linux/page_cgroup.h>
  37
  38#include <asm/uaccess.h>
  39
  40struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  41#define MEM_CGROUP_RECLAIM_RETRIES      5
  42
  43/*
  44 * Statistics for memory cgroup.
  45 */
  46enum mem_cgroup_stat_index {
  47        /*
   48         * Usage of the cgroup is accounted as pagecache + rss.
  49         */
  50        MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
  51        MEM_CGROUP_STAT_RSS,       /* # of pages charged as rss */
  52        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
  53        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
  54
  55        MEM_CGROUP_STAT_NSTATS,
  56};
  57
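     /*
      * Per-CPU statistics block. Counters are kept per CPU (and cacheline
      * aligned) so that updates on the hot charge/uncharge paths do not
      * bounce a shared cacheline; readers sum over all possible CPUs.
      */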
  58struct mem_cgroup_stat_cpu {
  59        s64 count[MEM_CGROUP_STAT_NSTATS];
  60} ____cacheline_aligned_in_smp;
  61
  62struct mem_cgroup_stat {
  63        struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
  64};
  65
  66/*
   67 * For accounting with interrupts already disabled; no need to increment the preempt count.
  68 */
  69static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
  70                enum mem_cgroup_stat_index idx, int val)
  71{
  72        stat->count[idx] += val;
  73}
  74
  75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
  76                enum mem_cgroup_stat_index idx)
  77{
  78        int cpu;
  79        s64 ret = 0;
  80        for_each_possible_cpu(cpu)
  81                ret += stat->cpustat[cpu].count[idx];
  82        return ret;
  83}
  84
  85/*
   86 * Per-zone information in the memory controller.
  87 */
  88struct mem_cgroup_per_zone {
  89        /*
  90         * spin_lock to protect the per cgroup LRU
  91         */
  92        spinlock_t              lru_lock;
  93        struct list_head        lists[NR_LRU_LISTS];
  94        unsigned long           count[NR_LRU_LISTS];
  95};
  96/* Macro for accessing counter */
  97#define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
  98
  99struct mem_cgroup_per_node {
 100        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 101};
 102
 103struct mem_cgroup_lru_info {
 104        struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 105};
 106
 107/*
 108 * The memory controller data structure. The memory controller controls both
  109 * page cache and RSS per cgroup. We would eventually like to provide
  110 * statistics along the lines of those developed by Rik van Riel for
  111 * clock-pro, to help the administrator determine what knobs to tune.
  112 *
  113 * TODO: Add a water mark for the memory controller. Reclaim will begin when
  114 * we hit the water mark. Maybe even add a low water mark such that no
  115 * reclaim occurs from a cgroup below its low water mark; this is a feature
  116 * to be implemented much later.
 117 */
 118struct mem_cgroup {
 119        struct cgroup_subsys_state css;
 120        /*
 121         * the counter to account for memory usage
 122         */
 123        struct res_counter res;
 124        /*
 125         * Per cgroup active and inactive list, similar to the
 126         * per zone LRU lists.
 127         */
 128        struct mem_cgroup_lru_info info;
 129
 130        int     prev_priority;  /* for recording reclaim priority */
 131        /*
 132         * statistics.
 133         */
 134        struct mem_cgroup_stat stat;
 135};
 136static struct mem_cgroup init_mem_cgroup;
 137
 138enum charge_type {
 139        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 140        MEM_CGROUP_CHARGE_TYPE_MAPPED,
 141        MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
 142        MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
 143        NR_CHARGE_TYPE,
 144};
 145
  146/* Shorthand for page_cgroup flag masks, local to this file. */
 147#define PCGF_CACHE      (1UL << PCG_CACHE)
 148#define PCGF_USED       (1UL << PCG_USED)
 149#define PCGF_ACTIVE     (1UL << PCG_ACTIVE)
 150#define PCGF_LOCK       (1UL << PCG_LOCK)
 151#define PCGF_FILE       (1UL << PCG_FILE)
 152static const unsigned long
 153pcg_default_flags[NR_CHARGE_TYPE] = {
 154        PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
 155        PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
 156        PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
 157        0, /* FORCE */
 158};
 159
 160/*
  161 * Always modified under the lru lock, so there is no need for preempt_disable().
 162 */
 163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 164                                         struct page_cgroup *pc,
 165                                         bool charge)
 166{
 167        int val = (charge)? 1 : -1;
 168        struct mem_cgroup_stat *stat = &mem->stat;
 169        struct mem_cgroup_stat_cpu *cpustat;
 170
 171        VM_BUG_ON(!irqs_disabled());
 172
 173        cpustat = &stat->cpustat[smp_processor_id()];
 174        if (PageCgroupCache(pc))
 175                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 176        else
 177                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 178
 179        if (charge)
 180                __mem_cgroup_stat_add_safe(cpustat,
 181                                MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 182        else
 183                __mem_cgroup_stat_add_safe(cpustat,
 184                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 185}
 186
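     /*
      * Look up the per-zone LRU bookkeeping of a cgroup, either by
      * node/zone id or from a page_cgroup.
      */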
 187static struct mem_cgroup_per_zone *
 188mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 189{
 190        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 191}
 192
 193static struct mem_cgroup_per_zone *
 194page_cgroup_zoneinfo(struct page_cgroup *pc)
 195{
 196        struct mem_cgroup *mem = pc->mem_cgroup;
 197        int nid = page_cgroup_nid(pc);
 198        int zid = page_cgroup_zid(pc);
 199
 200        return mem_cgroup_zoneinfo(mem, nid, zid);
 201}
 202
 203static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
 204                                        enum lru_list idx)
 205{
 206        int nid, zid;
 207        struct mem_cgroup_per_zone *mz;
 208        u64 total = 0;
 209
 210        for_each_online_node(nid)
 211                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 212                        mz = mem_cgroup_zoneinfo(mem, nid, zid);
 213                        total += MEM_CGROUP_ZSTAT(mz, idx);
 214                }
 215        return total;
 216}
 217
 218static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 219{
 220        return container_of(cgroup_subsys_state(cont,
 221                                mem_cgroup_subsys_id), struct mem_cgroup,
 222                                css);
 223}
 224
 225struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 226{
 227        /*
 228         * mm_update_next_owner() may clear mm->owner to NULL
 229         * if it races with swapoff, page migration, etc.
 230         * So this can be called with p == NULL.
 231         */
 232        if (unlikely(!p))
 233                return NULL;
 234
 235        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 236                                struct mem_cgroup, css);
 237}
 238
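     /*
      * LRU list helpers. Callers must hold mz->lru_lock with interrupts
      * disabled; mem_cgroup_charge_statistics() checks irqs_disabled().
      */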
 239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 240                        struct page_cgroup *pc)
 241{
 242        int lru = LRU_BASE;
 243
 244        if (PageCgroupUnevictable(pc))
 245                lru = LRU_UNEVICTABLE;
 246        else {
 247                if (PageCgroupActive(pc))
 248                        lru += LRU_ACTIVE;
 249                if (PageCgroupFile(pc))
 250                        lru += LRU_FILE;
 251        }
 252
 253        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 254
 255        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
 256        list_del(&pc->lru);
 257}
 258
 259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 260                                struct page_cgroup *pc)
 261{
 262        int lru = LRU_BASE;
 263
 264        if (PageCgroupUnevictable(pc))
 265                lru = LRU_UNEVICTABLE;
 266        else {
 267                if (PageCgroupActive(pc))
 268                        lru += LRU_ACTIVE;
 269                if (PageCgroupFile(pc))
 270                        lru += LRU_FILE;
 271        }
 272
 273        MEM_CGROUP_ZSTAT(mz, lru) += 1;
 274        list_add(&pc->lru, &mz->lists[lru]);
 275
 276        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 277}
 278
 279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 280{
 281        struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
 282        int active    = PageCgroupActive(pc);
 283        int file      = PageCgroupFile(pc);
 284        int unevictable = PageCgroupUnevictable(pc);
 285        enum lru_list from = unevictable ? LRU_UNEVICTABLE :
 286                                (LRU_FILE * !!file + !!active);
 287
 288        if (lru == from)
 289                return;
 290
 291        MEM_CGROUP_ZSTAT(mz, from) -= 1;
 292        /*
  293         * Although this is done under mz->lru_lock, other flags, which
  294         * are not related to the LRU, may be modified outside the lock,
  295         * so we have to use the atomic set/clear flag operations.
 296         */
 297        if (is_unevictable_lru(lru)) {
 298                ClearPageCgroupActive(pc);
 299                SetPageCgroupUnevictable(pc);
 300        } else {
 301                if (is_active_lru(lru))
 302                        SetPageCgroupActive(pc);
 303                else
 304                        ClearPageCgroupActive(pc);
 305                ClearPageCgroupUnevictable(pc);
 306        }
 307
 308        MEM_CGROUP_ZSTAT(mz, lru) += 1;
 309        list_move(&pc->lru, &mz->lists[lru]);
 310}
 311
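     /*
      * Return non-zero if the task's mm is charged to the given cgroup.
      * Used e.g. by the OOM killer path to restrict its scan to the
      * cgroup that hit its limit.
      */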
 312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 313{
 314        int ret;
 315
 316        task_lock(task);
 317        ret = task->mm && mm_match_cgroup(task->mm, mem);
 318        task_unlock(task);
 319        return ret;
 320}
 321
 322/*
 323 * This routine assumes that the appropriate zone's lru lock is already held
 324 */
 325void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 326{
 327        struct page_cgroup *pc;
 328        struct mem_cgroup_per_zone *mz;
 329        unsigned long flags;
 330
 331        if (mem_cgroup_subsys.disabled)
 332                return;
 333
 334        /*
 335         * We cannot lock_page_cgroup while holding zone's lru_lock,
 336         * because other holders of lock_page_cgroup can be interrupted
 337         * with an attempt to rotate_reclaimable_page.  But we cannot
 338         * safely get to page_cgroup without it, so just try_lock it:
  339         * mem_cgroup_isolate_pages allows for pages left on the wrong list.
 340         */
 341        pc = lookup_page_cgroup(page);
 342        if (!trylock_page_cgroup(pc))
 343                return;
 344        if (pc && PageCgroupUsed(pc)) {
 345                mz = page_cgroup_zoneinfo(pc);
 346                spin_lock_irqsave(&mz->lru_lock, flags);
 347                __mem_cgroup_move_lists(pc, lru);
 348                spin_unlock_irqrestore(&mz->lru_lock, flags);
 349        }
 350        unlock_page_cgroup(pc);
 351}
 352
 353/*
  354 * Calculate mapped_ratio under the memory controller. This is used in
  355 * vmscan.c to determine whether we have to reclaim mapped pages.
 356 */
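     /*
      * For example (assuming 4KB pages): with usage at 400MB (102400 pages)
      * and 51200 of those pages charged as RSS, this reports 49, i.e.
      * roughly half of the charged memory is mapped.
      */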
 357int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 358{
 359        long total, rss;
 360
 361        /*
 362         * usage is recorded in bytes. But, here, we assume the number of
 363         * physical pages can be represented by "long" on any arch.
 364         */
 365        total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
 366        rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 367        return (int)((rss * 100L) / total);
 368}
 369
 370/*
  371 * prev_priority control: this is used by the memory reclaim path.
 372 */
 373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
 374{
 375        return mem->prev_priority;
 376}
 377
 378void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
 379{
 380        if (priority < mem->prev_priority)
 381                mem->prev_priority = priority;
 382}
 383
 384void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
 385{
 386        mem->prev_priority = priority;
 387}
 388
 389/*
 390 * Calculate # of pages to be scanned in this priority/zone.
 391 * See also vmscan.c
 392 *
  393 * priority starts at "DEF_PRIORITY" and is decremented on each pass.
 394 * (see include/linux/mmzone.h)
 395 */
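     /*
      * For example, with 4096 pages on the given LRU in this zone and
      * priority 2, vmscan is told to scan 4096 >> 2 = 1024 pages.
      */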
 396
 397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 398                                        int priority, enum lru_list lru)
 399{
 400        long nr_pages;
 401        int nid = zone->zone_pgdat->node_id;
 402        int zid = zone_idx(zone);
 403        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 404
 405        nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 406
 407        return (nr_pages >> priority);
 408}
 409
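     /*
      * Scan up to nr_to_scan pages on this cgroup's per-zone LRU list and
      * isolate suitable pages onto @dst, much like isolate_lru_pages() in
      * vmscan.c but walking the cgroup's private lists.
      */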
 410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 411                                        struct list_head *dst,
 412                                        unsigned long *scanned, int order,
 413                                        int mode, struct zone *z,
 414                                        struct mem_cgroup *mem_cont,
 415                                        int active, int file)
 416{
 417        unsigned long nr_taken = 0;
 418        struct page *page;
 419        unsigned long scan;
 420        LIST_HEAD(pc_list);
 421        struct list_head *src;
 422        struct page_cgroup *pc, *tmp;
 423        int nid = z->zone_pgdat->node_id;
 424        int zid = zone_idx(z);
 425        struct mem_cgroup_per_zone *mz;
 426        int lru = LRU_FILE * !!file + !!active;
 427
 428        BUG_ON(!mem_cont);
 429        mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 430        src = &mz->lists[lru];
 431
 432        spin_lock(&mz->lru_lock);
 433        scan = 0;
 434        list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 435                if (scan >= nr_to_scan)
 436                        break;
 437                if (unlikely(!PageCgroupUsed(pc)))
 438                        continue;
 439                page = pc->page;
 440
 441                if (unlikely(!PageLRU(page)))
 442                        continue;
 443
 444                /*
 445                 * TODO: play better with lumpy reclaim, grabbing anything.
 446                 */
 447                if (PageUnevictable(page) ||
 448                    (PageActive(page) && !active) ||
 449                    (!PageActive(page) && active)) {
 450                        __mem_cgroup_move_lists(pc, page_lru(page));
 451                        continue;
 452                }
 453
 454                scan++;
 455                list_move(&pc->lru, &pc_list);
 456
 457                if (__isolate_lru_page(page, mode, file) == 0) {
 458                        list_move(&page->lru, dst);
 459                        nr_taken++;
 460                }
 461        }
 462
 463        list_splice(&pc_list, src);
 464        spin_unlock(&mz->lru_lock);
 465
 466        *scanned = scan;
 467        return nr_taken;
 468}
 469
 470/*
 471 * Charge the memory controller for page usage.
 472 * Return
 473 * 0 if the charge was successful
 474 * < 0 if the cgroup is over its limit
 475 */
 476static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 477                                gfp_t gfp_mask, enum charge_type ctype,
 478                                struct mem_cgroup *memcg)
 479{
 480        struct mem_cgroup *mem;
 481        struct page_cgroup *pc;
 482        unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 483        struct mem_cgroup_per_zone *mz;
 484        unsigned long flags;
 485
 486        pc = lookup_page_cgroup(page);
 487        /* can happen at boot */
 488        if (unlikely(!pc))
 489                return 0;
 490        prefetchw(pc);
 491        /*
 492         * We always charge the cgroup the mm_struct belongs to.
 493         * The mm_struct's mem_cgroup changes on task migration if the
 494         * thread group leader migrates. It's possible that mm is not
 495         * set, if so charge the init_mm (happens for pagecache usage).
 496         */
 497
 498        if (likely(!memcg)) {
 499                rcu_read_lock();
 500                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 501                if (unlikely(!mem)) {
 502                        rcu_read_unlock();
 503                        return 0;
 504                }
 505                /*
 506                 * For every charge from the cgroup, increment reference count
 507                 */
 508                css_get(&mem->css);
 509                rcu_read_unlock();
 510        } else {
 511                mem = memcg;
 512                css_get(&memcg->css);
 513        }
 514
 515        while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
 516                if (!(gfp_mask & __GFP_WAIT))
 517                        goto out;
 518
 519                if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
 520                        continue;
 521
 522                /*
 523                 * try_to_free_mem_cgroup_pages() might not give us a full
 524                 * picture of reclaim. Some pages are reclaimed and might be
 525                 * moved to swap cache or just unmapped from the cgroup.
 526                 * Check the limit again to see if the reclaim reduced the
 527                 * current usage of the cgroup before giving up
 528                 */
 529                if (res_counter_check_under_limit(&mem->res))
 530                        continue;
 531
 532                if (!nr_retries--) {
 533                        mem_cgroup_out_of_memory(mem, gfp_mask);
 534                        goto out;
 535                }
 536        }
 537
 538
 539        lock_page_cgroup(pc);
 540        if (unlikely(PageCgroupUsed(pc))) {
 541                unlock_page_cgroup(pc);
 542                res_counter_uncharge(&mem->res, PAGE_SIZE);
 543                css_put(&mem->css);
 544
 545                goto done;
 546        }
 547        pc->mem_cgroup = mem;
 548        /*
  549         * If a page is accounted as page cache, it is inserted on the
  550         * inactive list; if anon, on the active list.
 551         */
 552        pc->flags = pcg_default_flags[ctype];
 553
 554        mz = page_cgroup_zoneinfo(pc);
 555
 556        spin_lock_irqsave(&mz->lru_lock, flags);
 557        __mem_cgroup_add_list(mz, pc);
 558        spin_unlock_irqrestore(&mz->lru_lock, flags);
 559        unlock_page_cgroup(pc);
 560
 561done:
 562        return 0;
 563out:
 564        css_put(&mem->css);
 565        return -ENOMEM;
 566}
 567
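     /*
      * Charge a newly mapped anonymous page to the cgroup that owns @mm.
      * Page cache pages are charged via mem_cgroup_cache_charge() instead.
      */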
 568int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 569{
 570        if (mem_cgroup_subsys.disabled)
 571                return 0;
 572        if (PageCompound(page))
 573                return 0;
 574        /*
  575         * If already mapped, we don't have to account.
  576         * If page cache, page->mapping points to an address_space.
  577         * But page->mapping may hold a stale anon_vma pointer; detect
  578         * that with a PageAnon() check. A newly mapped anon page's
  579         * page->mapping is NULL.
 580         */
 581        if (page_mapped(page) || (page->mapping && !PageAnon(page)))
 582                return 0;
 583        if (unlikely(!mm))
 584                mm = &init_mm;
 585        return mem_cgroup_charge_common(page, mm, gfp_mask,
 586                                MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
 587}
 588
 589int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 590                                gfp_t gfp_mask)
 591{
 592        if (mem_cgroup_subsys.disabled)
 593                return 0;
 594        if (PageCompound(page))
 595                return 0;
 596        /*
  597         * Corner case handling. This is usually called from
  598         * add_to_page_cache(), but some filesystems (shmem) precharge the
  599         * page first and then call add_to_page_cache() with GFP_NOWAIT.
  600         *
  601         * In the GFP_NOWAIT case the page may thus already be charged
  602         * (see shmem.c); check for that here and avoid charging it twice.
  603         * (This works, but at a slightly higher cost.)
 604         */
 605        if (!(gfp_mask & __GFP_WAIT)) {
 606                struct page_cgroup *pc;
 607
 608
 609                pc = lookup_page_cgroup(page);
 610                if (!pc)
 611                        return 0;
 612                lock_page_cgroup(pc);
 613                if (PageCgroupUsed(pc)) {
 614                        unlock_page_cgroup(pc);
 615                        return 0;
 616                }
 617                unlock_page_cgroup(pc);
 618        }
 619
 620        if (unlikely(!mm))
 621                mm = &init_mm;
 622
 623        if (page_is_file_cache(page))
 624                return mem_cgroup_charge_common(page, mm, gfp_mask,
 625                                MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
 626        else
 627                return mem_cgroup_charge_common(page, mm, gfp_mask,
 628                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 629}
 630
 631/*
 632 * uncharge if !page_mapped(page)
 633 */
 634static void
 635__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 636{
 637        struct page_cgroup *pc;
 638        struct mem_cgroup *mem;
 639        struct mem_cgroup_per_zone *mz;
 640        unsigned long flags;
 641
 642        if (mem_cgroup_subsys.disabled)
 643                return;
 644
 645        /*
 646         * Check if our page_cgroup is valid
 647         */
 648        pc = lookup_page_cgroup(page);
 649        if (unlikely(!pc || !PageCgroupUsed(pc)))
 650                return;
 651
 652        lock_page_cgroup(pc);
 653        if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
 654             || !PageCgroupUsed(pc)) {
  655                /* This happens when racing with zap_pte_range() or do_swap_page() */
 656                unlock_page_cgroup(pc);
 657                return;
 658        }
 659        ClearPageCgroupUsed(pc);
 660        mem = pc->mem_cgroup;
 661
 662        mz = page_cgroup_zoneinfo(pc);
 663        spin_lock_irqsave(&mz->lru_lock, flags);
 664        __mem_cgroup_remove_list(mz, pc);
 665        spin_unlock_irqrestore(&mz->lru_lock, flags);
 666        unlock_page_cgroup(pc);
 667
 668        res_counter_uncharge(&mem->res, PAGE_SIZE);
 669        css_put(&mem->css);
 670
 671        return;
 672}
 673
 674void mem_cgroup_uncharge_page(struct page *page)
 675{
 676        /* early check. */
 677        if (page_mapped(page))
 678                return;
 679        if (page->mapping && !PageAnon(page))
 680                return;
 681        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 682}
 683
 684void mem_cgroup_uncharge_cache_page(struct page *page)
 685{
 686        VM_BUG_ON(page_mapped(page));
 687        VM_BUG_ON(page->mapping);
 688        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 689}
 690
 691/*
 692 * Before starting migration, account against new page.
 693 */
 694int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 695{
 696        struct page_cgroup *pc;
 697        struct mem_cgroup *mem = NULL;
 698        enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
 699        int ret = 0;
 700
 701        if (mem_cgroup_subsys.disabled)
 702                return 0;
 703
 704        pc = lookup_page_cgroup(page);
 705        lock_page_cgroup(pc);
 706        if (PageCgroupUsed(pc)) {
 707                mem = pc->mem_cgroup;
 708                css_get(&mem->css);
 709                if (PageCgroupCache(pc)) {
 710                        if (page_is_file_cache(page))
 711                                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 712                        else
 713                                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 714                }
 715        }
 716        unlock_page_cgroup(pc);
 717        if (mem) {
 718                ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 719                        ctype, mem);
 720                css_put(&mem->css);
 721        }
 722        return ret;
 723}
 724
  725/* Remove the redundant charge if migration failed */
 726void mem_cgroup_end_migration(struct page *newpage)
 727{
 728        /*
  729         * On success, page->mapping is not NULL.
  730         * Special rollback care is necessary when
  731         * 1. migration fails (newpage->mapping is cleared in this case), or
  732         * 2. the newpage was moved but not remapped again because the task
  733         *    exited and the newpage is obsolete. In this case the new page
  734         *    may be swap cache, so we always call mem_cgroup_uncharge_page()
  735         *    to avoid a mess; the page_cgroup will be removed if it is
  736         *    unnecessary. File cache pages are still on the radix tree and
  737         *    need no special care.
 738         */
 739        if (!newpage->mapping)
 740                __mem_cgroup_uncharge_common(newpage,
 741                                MEM_CGROUP_CHARGE_TYPE_FORCE);
 742        else if (PageAnon(newpage))
 743                mem_cgroup_uncharge_page(newpage);
 744}
 745
 746/*
  747 * Try to shrink memory usage under the specified memory controller.
  748 * This is typically used to reclaim shmem pages, to reduce the side
  749 * effects of page allocation from shmem inside a mem_cgroup.
 750 */
 751int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 752{
 753        struct mem_cgroup *mem;
 754        int progress = 0;
 755        int retry = MEM_CGROUP_RECLAIM_RETRIES;
 756
 757        if (mem_cgroup_subsys.disabled)
 758                return 0;
 759        if (!mm)
 760                return 0;
 761
 762        rcu_read_lock();
 763        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 764        if (unlikely(!mem)) {
 765                rcu_read_unlock();
 766                return 0;
 767        }
 768        css_get(&mem->css);
 769        rcu_read_unlock();
 770
 771        do {
 772                progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
 773                progress += res_counter_check_under_limit(&mem->res);
 774        } while (!progress && --retry);
 775
 776        css_put(&mem->css);
 777        if (!retry)
 778                return -ENOMEM;
 779        return 0;
 780}
 781
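     /*
      * Set a new limit, reclaiming pages from the cgroup as needed until
      * the limit can be set or the retries are exhausted.
      */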
 782int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 783{
 784
 785        int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
 786        int progress;
 787        int ret = 0;
 788
 789        while (res_counter_set_limit(&memcg->res, val)) {
 790                if (signal_pending(current)) {
 791                        ret = -EINTR;
 792                        break;
 793                }
 794                if (!retry_count) {
 795                        ret = -EBUSY;
 796                        break;
 797                }
 798                progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
 799                if (!progress)
 800                        retry_count--;
 801        }
 802        return ret;
 803}
 804
 805
 806/*
  807 * This routine traverses the page_cgroups on the given list and drops them all.
  808 * It does not reclaim the pages themselves; it only removes the page_cgroups.
 809 */
 810#define FORCE_UNCHARGE_BATCH    (128)
 811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 812                            struct mem_cgroup_per_zone *mz,
 813                            enum lru_list lru)
 814{
 815        struct page_cgroup *pc;
 816        struct page *page;
 817        int count = FORCE_UNCHARGE_BATCH;
 818        unsigned long flags;
 819        struct list_head *list;
 820
 821        list = &mz->lists[lru];
 822
 823        spin_lock_irqsave(&mz->lru_lock, flags);
 824        while (!list_empty(list)) {
 825                pc = list_entry(list->prev, struct page_cgroup, lru);
 826                page = pc->page;
 827                if (!PageCgroupUsed(pc))
 828                        break;
 829                get_page(page);
 830                spin_unlock_irqrestore(&mz->lru_lock, flags);
 831                /*
  832                 * Check if this page is on the LRU. A !PageLRU page can
  833                 * be found if it is under page migration.
 834                 */
 835                if (PageLRU(page)) {
 836                        __mem_cgroup_uncharge_common(page,
 837                                        MEM_CGROUP_CHARGE_TYPE_FORCE);
 838                        put_page(page);
 839                        if (--count <= 0) {
 840                                count = FORCE_UNCHARGE_BATCH;
 841                                cond_resched();
 842                        }
 843                } else {
 844                        spin_lock_irqsave(&mz->lru_lock, flags);
 845                        break;
 846                }
 847                spin_lock_irqsave(&mz->lru_lock, flags);
 848        }
 849        spin_unlock_irqrestore(&mz->lru_lock, flags);
 850}
 851
 852/*
  853 * Force the mem_cgroup's charge down to 0 when it has no tasks.
  854 * This makes it possible to delete the mem_cgroup.
 855 */
 856static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 857{
 858        int ret = -EBUSY;
 859        int node, zid;
 860
 861        css_get(&mem->css);
 862        /*
  863         * The page reclaim code (kswapd etc.) may move pages between the
  864         * active and inactive lists while we are not holding a lock,
  865         * so we have to loop here until all lists are empty.
 866         */
 867        while (mem->res.usage > 0) {
 868                if (atomic_read(&mem->css.cgroup->count) > 0)
 869                        goto out;
  870                /* This is to make sure all *used* pages are on an LRU. */
 871                lru_add_drain_all();
 872                for_each_node_state(node, N_POSSIBLE)
 873                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 874                                struct mem_cgroup_per_zone *mz;
 875                                enum lru_list l;
 876                                mz = mem_cgroup_zoneinfo(mem, node, zid);
 877                                for_each_lru(l)
 878                                        mem_cgroup_force_empty_list(mem, mz, l);
 879                        }
 880                cond_resched();
 881        }
 882        ret = 0;
 883out:
 884        css_put(&mem->css);
 885        return ret;
 886}
 887
 888static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 889{
 890        return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
 891                                    cft->private);
 892}
 893/*
  894 * The only user of this write handler is RES_LIMIT
  895 * (the limit_in_bytes file).
 896 */
 897static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 898                            const char *buffer)
 899{
 900        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 901        unsigned long long val;
 902        int ret;
 903
 904        switch (cft->private) {
 905        case RES_LIMIT:
  906                /* This helper does all the necessary parsing, so reuse it */
 907                ret = res_counter_memparse_write_strategy(buffer, &val);
 908                if (!ret)
 909                        ret = mem_cgroup_resize_limit(memcg, val);
 910                break;
 911        default:
 912                ret = -EINVAL; /* should be BUG() ? */
 913                break;
 914        }
 915        return ret;
 916}
 917
 918static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 919{
 920        struct mem_cgroup *mem;
 921
 922        mem = mem_cgroup_from_cont(cont);
 923        switch (event) {
 924        case RES_MAX_USAGE:
 925                res_counter_reset_max(&mem->res);
 926                break;
 927        case RES_FAILCNT:
 928                res_counter_reset_failcnt(&mem->res);
 929                break;
 930        }
 931        return 0;
 932}
 933
 934static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
 935{
 936        return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
 937}
 938
 939static const struct mem_cgroup_stat_desc {
 940        const char *msg;
 941        u64 unit;
 942} mem_cgroup_stat_desc[] = {
 943        [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
 944        [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
 945        [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
 946        [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
 947};
 948
 949static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 950                                 struct cgroup_map_cb *cb)
 951{
 952        struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
 953        struct mem_cgroup_stat *stat = &mem_cont->stat;
 954        int i;
 955
 956        for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
 957                s64 val;
 958
 959                val = mem_cgroup_read_stat(stat, i);
 960                val *= mem_cgroup_stat_desc[i].unit;
 961                cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
 962        }
  963        /* show the per-LRU page counts, reported in bytes */
 964        {
 965                unsigned long active_anon, inactive_anon;
 966                unsigned long active_file, inactive_file;
 967                unsigned long unevictable;
 968
 969                inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
 970                                                LRU_INACTIVE_ANON);
 971                active_anon = mem_cgroup_get_all_zonestat(mem_cont,
 972                                                LRU_ACTIVE_ANON);
 973                inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
 974                                                LRU_INACTIVE_FILE);
 975                active_file = mem_cgroup_get_all_zonestat(mem_cont,
 976                                                LRU_ACTIVE_FILE);
 977                unevictable = mem_cgroup_get_all_zonestat(mem_cont,
 978                                                        LRU_UNEVICTABLE);
 979
 980                cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
 981                cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
 982                cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
 983                cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
 984                cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
 985
 986        }
 987        return 0;
 988}
 989
 990static struct cftype mem_cgroup_files[] = {
 991        {
 992                .name = "usage_in_bytes",
 993                .private = RES_USAGE,
 994                .read_u64 = mem_cgroup_read,
 995        },
 996        {
 997                .name = "max_usage_in_bytes",
 998                .private = RES_MAX_USAGE,
 999                .trigger = mem_cgroup_reset,
1000                .read_u64 = mem_cgroup_read,
1001        },
1002        {
1003                .name = "limit_in_bytes",
1004                .private = RES_LIMIT,
1005                .write_string = mem_cgroup_write,
1006                .read_u64 = mem_cgroup_read,
1007        },
1008        {
1009                .name = "failcnt",
1010                .private = RES_FAILCNT,
1011                .trigger = mem_cgroup_reset,
1012                .read_u64 = mem_cgroup_read,
1013        },
1014        {
1015                .name = "force_empty",
1016                .trigger = mem_force_empty_write,
1017        },
1018        {
1019                .name = "stat",
1020                .read_map = mem_control_stat_show,
1021        },
1022};
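     /*
      * These control files show up in every memory cgroup directory with a
      * "memory." prefix added by the cgroup core, e.g. (assuming the
      * controller is mounted at /cgroup):
      *
      *   echo 64M > /cgroup/grp0/memory.limit_in_bytes
      *   cat /cgroup/grp0/memory.usage_in_bytes
      *
      * "force_empty" and "stat" are trigger/map files handled by
      * mem_force_empty_write() and mem_control_stat_show() above.
      */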
1023
1024static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1025{
1026        struct mem_cgroup_per_node *pn;
1027        struct mem_cgroup_per_zone *mz;
1028        enum lru_list l;
1029        int zone, tmp = node;
1030        /*
 1031         * This routine is called for each possible node, but it is a
 1032         * BUG to call kmalloc() against an offline node.
 1033         *
 1034         * TODO: this routine can waste a lot of memory on nodes that will
 1035         *       never be onlined. It would be better to use a memory
 1036         *       hotplug callback instead.
1037         */
1038        if (!node_state(node, N_NORMAL_MEMORY))
1039                tmp = -1;
1040        pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
1041        if (!pn)
1042                return 1;
1043
1044        mem->info.nodeinfo[node] = pn;
1045        memset(pn, 0, sizeof(*pn));
1046
1047        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1048                mz = &pn->zoneinfo[zone];
1049                spin_lock_init(&mz->lru_lock);
1050                for_each_lru(l)
1051                        INIT_LIST_HEAD(&mz->lists[l]);
1052        }
1053        return 0;
1054}
1055
1056static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1057{
1058        kfree(mem->info.nodeinfo[node]);
1059}
1060
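     /*
      * struct mem_cgroup embeds per-CPU statistics for NR_CPUS, so it can be
      * larger than a page; fall back to vmalloc() in that case.
      */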
1061static struct mem_cgroup *mem_cgroup_alloc(void)
1062{
1063        struct mem_cgroup *mem;
1064
1065        if (sizeof(*mem) < PAGE_SIZE)
1066                mem = kmalloc(sizeof(*mem), GFP_KERNEL);
1067        else
1068                mem = vmalloc(sizeof(*mem));
1069
1070        if (mem)
1071                memset(mem, 0, sizeof(*mem));
1072        return mem;
1073}
1074
1075static void mem_cgroup_free(struct mem_cgroup *mem)
1076{
1077        if (sizeof(*mem) < PAGE_SIZE)
1078                kfree(mem);
1079        else
1080                vfree(mem);
1081}
1082
1083
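     /*
      * cgroup_subsys callbacks. The root cgroup uses the statically
      * allocated init_mem_cgroup; child cgroups are allocated on demand.
      */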
1084static struct cgroup_subsys_state *
1085mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1086{
1087        struct mem_cgroup *mem;
1088        int node;
1089
1090        if (unlikely((cont->parent) == NULL)) {
1091                mem = &init_mem_cgroup;
1092        } else {
1093                mem = mem_cgroup_alloc();
1094                if (!mem)
1095                        return ERR_PTR(-ENOMEM);
1096        }
1097
1098        res_counter_init(&mem->res);
1099
1100        for_each_node_state(node, N_POSSIBLE)
1101                if (alloc_mem_cgroup_per_zone_info(mem, node))
1102                        goto free_out;
1103
1104        return &mem->css;
1105free_out:
1106        for_each_node_state(node, N_POSSIBLE)
1107                free_mem_cgroup_per_zone_info(mem, node);
1108        if (cont->parent != NULL)
1109                mem_cgroup_free(mem);
1110        return ERR_PTR(-ENOMEM);
1111}
1112
1113static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1114                                        struct cgroup *cont)
1115{
1116        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1117        mem_cgroup_force_empty(mem);
1118}
1119
1120static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1121                                struct cgroup *cont)
1122{
1123        int node;
1124        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1125
1126        for_each_node_state(node, N_POSSIBLE)
1127                free_mem_cgroup_per_zone_info(mem, node);
1128
1129        mem_cgroup_free(mem_cgroup_from_cont(cont));
1130}
1131
1132static int mem_cgroup_populate(struct cgroup_subsys *ss,
1133                                struct cgroup *cont)
1134{
1135        return cgroup_add_files(cont, ss, mem_cgroup_files,
1136                                        ARRAY_SIZE(mem_cgroup_files));
1137}
1138
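     /*
      * Called when a task is moved to another cgroup. Charges are not
      * migrated here; pages stay accounted to the old cgroup until they
      * are uncharged.
      */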
1139static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1140                                struct cgroup *cont,
1141                                struct cgroup *old_cont,
1142                                struct task_struct *p)
1143{
1144        struct mm_struct *mm;
1145        struct mem_cgroup *mem, *old_mem;
1146
1147        mm = get_task_mm(p);
1148        if (mm == NULL)
1149                return;
1150
1151        mem = mem_cgroup_from_cont(cont);
1152        old_mem = mem_cgroup_from_cont(old_cont);
1153
1154        /*
 1155         * Only thread group leaders are allowed to migrate; the mm_struct
 1156         * is in effect owned by the leader.
1157         */
1158        if (!thread_group_leader(p))
1159                goto out;
1160
1161out:
1162        mmput(mm);
1163}
1164
1165struct cgroup_subsys mem_cgroup_subsys = {
1166        .name = "memory",
1167        .subsys_id = mem_cgroup_subsys_id,
1168        .create = mem_cgroup_create,
1169        .pre_destroy = mem_cgroup_pre_destroy,
1170        .destroy = mem_cgroup_destroy,
1171        .populate = mem_cgroup_populate,
1172        .attach = mem_cgroup_move_task,
1173        .early_init = 0,
1174};
1175