linux/mm/page_cgroup.c
<<
>>
Prefs
   1#include <linux/mm.h>
   2#include <linux/mmzone.h>
   3#include <linux/bootmem.h>
   4#include <linux/bit_spinlock.h>
   5#include <linux/page_cgroup.h>
   6#include <linux/hash.h>
   7#include <linux/slab.h>
   8#include <linux/memory.h>
   9#include <linux/vmalloc.h>
  10#include <linux/cgroup.h>
  11#include <linux/swapops.h>
  12#include <linux/kmemleak.h>
  13
  14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
  15{
  16        pc->flags = 0;
  17        set_page_cgroup_array_id(pc, id);
  18        pc->mem_cgroup = NULL;
  19        INIT_LIST_HEAD(&pc->lru);
  20}
  21static unsigned long total_usage;
  22
  23#if !defined(CONFIG_SPARSEMEM)
  24
  25
  26void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
  27{
  28        pgdat->node_page_cgroup = NULL;
  29}
  30
  31struct page_cgroup *lookup_page_cgroup(struct page *page)
  32{
  33        unsigned long pfn = page_to_pfn(page);
  34        unsigned long offset;
  35        struct page_cgroup *base;
  36
  37        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
  38        if (unlikely(!base))
  39                return NULL;
  40
  41        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
  42        return base + offset;
  43}
  44
  45struct page *lookup_cgroup_page(struct page_cgroup *pc)
  46{
  47        unsigned long pfn;
  48        struct page *page;
  49        pg_data_t *pgdat;
  50
  51        pgdat = NODE_DATA(page_cgroup_array_id(pc));
  52        pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
  53        page = pfn_to_page(pfn);
  54        VM_BUG_ON(pc != lookup_page_cgroup(page));
  55        return page;
  56}
  57
  58static int __init alloc_node_page_cgroup(int nid)
  59{
  60        struct page_cgroup *base, *pc;
  61        unsigned long table_size;
  62        unsigned long start_pfn, nr_pages, index;
  63
  64        start_pfn = NODE_DATA(nid)->node_start_pfn;
  65        nr_pages = NODE_DATA(nid)->node_spanned_pages;
  66
  67        if (!nr_pages)
  68                return 0;
  69
  70        table_size = sizeof(struct page_cgroup) * nr_pages;
  71
  72        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
  73                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
  74        if (!base)
  75                return -ENOMEM;
  76        for (index = 0; index < nr_pages; index++) {
  77                pc = base + index;
  78                init_page_cgroup(pc, nid);
  79        }
  80        NODE_DATA(nid)->node_page_cgroup = base;
  81        total_usage += table_size;
  82        return 0;
  83}
  84
  85void __init page_cgroup_init_flatmem(void)
  86{
  87
  88        int nid, fail;
  89
  90        if (mem_cgroup_disabled())
  91                return;
  92
  93        for_each_online_node(nid)  {
  94                fail = alloc_node_page_cgroup(nid);
  95                if (fail)
  96                        goto fail;
  97        }
  98        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
  99        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
 100        " don't want memory cgroups\n");
 101        return;
 102fail:
 103        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
 104        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
 105        panic("Out of memory");
 106}
 107
 108#else /* CONFIG_FLAT_NODE_MEM_MAP */
 109
 110struct page_cgroup *lookup_page_cgroup(struct page *page)
 111{
 112        unsigned long pfn = page_to_pfn(page);
 113        struct mem_section *section = __pfn_to_section(pfn);
 114
 115        if (!section->page_cgroup)
 116                return NULL;
 117        return section->page_cgroup + pfn;
 118}
 119
 120struct page *lookup_cgroup_page(struct page_cgroup *pc)
 121{
 122        struct mem_section *section;
 123        struct page *page;
 124        unsigned long nr;
 125
 126        nr = page_cgroup_array_id(pc);
 127        section = __nr_to_section(nr);
 128        page = pfn_to_page(pc - section->page_cgroup);
 129        VM_BUG_ON(pc != lookup_page_cgroup(page));
 130        return page;
 131}
 132
 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
 134{
 135        void *addr = NULL;
 136        gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
 137
 138        addr = alloc_pages_exact_nid(nid, size, flags);
 139        if (addr) {
 140                kmemleak_alloc(addr, size, 1, flags);
 141                return addr;
 142        }
 143
 144        if (node_state(nid, N_HIGH_MEMORY))
 145                addr = vmalloc_node(size, nid);
 146        else
 147                addr = vmalloc(size);
 148
 149        return addr;
 150}
 151
 152#ifdef CONFIG_MEMORY_HOTPLUG
 153static void free_page_cgroup(void *addr)
 154{
 155        if (is_vmalloc_addr(addr)) {
 156                vfree(addr);
 157        } else {
 158                struct page *page = virt_to_page(addr);
 159                size_t table_size =
 160                        sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 161
 162                BUG_ON(PageReserved(page));
 163                free_pages_exact(addr, table_size);
 164        }
 165}
 166#endif
 167
 168static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 169{
 170        struct page_cgroup *base, *pc;
 171        struct mem_section *section;
 172        unsigned long table_size;
 173        unsigned long nr;
 174        int index;
 175
 176        nr = pfn_to_section_nr(pfn);
 177        section = __nr_to_section(nr);
 178
 179        if (section->page_cgroup)
 180                return 0;
 181
 182        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 183        base = alloc_page_cgroup(table_size, nid);
 184
 185        /*
 186         * The value stored in section->page_cgroup is (base - pfn)
 187         * and it does not point to the memory block allocated above,
 188         * causing kmemleak false positives.
 189         */
 190        kmemleak_not_leak(base);
 191
 192        if (!base) {
 193                printk(KERN_ERR "page cgroup allocation failure\n");
 194                return -ENOMEM;
 195        }
 196
 197        for (index = 0; index < PAGES_PER_SECTION; index++) {
 198                pc = base + index;
 199                init_page_cgroup(pc, nr);
 200        }
 201        /*
 202         * The passed "pfn" may not be aligned to SECTION.  For the calculation
 203         * we need to apply a mask.
 204         */
 205        pfn &= PAGE_SECTION_MASK;
 206        section->page_cgroup = base - pfn;
 207        total_usage += table_size;
 208        return 0;
 209}
 210#ifdef CONFIG_MEMORY_HOTPLUG
 211void __free_page_cgroup(unsigned long pfn)
 212{
 213        struct mem_section *ms;
 214        struct page_cgroup *base;
 215
 216        ms = __pfn_to_section(pfn);
 217        if (!ms || !ms->page_cgroup)
 218                return;
 219        base = ms->page_cgroup + pfn;
 220        free_page_cgroup(base);
 221        ms->page_cgroup = NULL;
 222}
 223
 224int __meminit online_page_cgroup(unsigned long start_pfn,
 225                        unsigned long nr_pages,
 226                        int nid)
 227{
 228        unsigned long start, end, pfn;
 229        int fail = 0;
 230
 231        start = SECTION_ALIGN_DOWN(start_pfn);
 232        end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 233
 234        if (nid == -1) {
 235                /*
 236                 * In this case, "nid" already exists and contains valid memory.
 237                 * "start_pfn" passed to us is a pfn which is an arg for
 238                 * online__pages(), and start_pfn should exist.
 239                 */
 240                nid = pfn_to_nid(start_pfn);
 241                VM_BUG_ON(!node_state(nid, N_ONLINE));
 242        }
 243
 244        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 245                if (!pfn_present(pfn))
 246                        continue;
 247                fail = init_section_page_cgroup(pfn, nid);
 248        }
 249        if (!fail)
 250                return 0;
 251
 252        /* rollback */
 253        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 254                __free_page_cgroup(pfn);
 255
 256        return -ENOMEM;
 257}
 258
 259int __meminit offline_page_cgroup(unsigned long start_pfn,
 260                unsigned long nr_pages, int nid)
 261{
 262        unsigned long start, end, pfn;
 263
 264        start = SECTION_ALIGN_DOWN(start_pfn);
 265        end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 266
 267        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 268                __free_page_cgroup(pfn);
 269        return 0;
 270
 271}
 272
 273static int __meminit page_cgroup_callback(struct notifier_block *self,
 274                               unsigned long action, void *arg)
 275{
 276        struct memory_notify *mn = arg;
 277        int ret = 0;
 278        switch (action) {
 279        case MEM_GOING_ONLINE:
 280                ret = online_page_cgroup(mn->start_pfn,
 281                                   mn->nr_pages, mn->status_change_nid);
 282                break;
 283        case MEM_OFFLINE:
 284                offline_page_cgroup(mn->start_pfn,
 285                                mn->nr_pages, mn->status_change_nid);
 286                break;
 287        case MEM_CANCEL_ONLINE:
 288        case MEM_GOING_OFFLINE:
 289                break;
 290        case MEM_ONLINE:
 291        case MEM_CANCEL_OFFLINE:
 292                break;
 293        }
 294
 295        return notifier_from_errno(ret);
 296}
 297
 298#endif
 299
 300void __init page_cgroup_init(void)
 301{
 302        unsigned long pfn;
 303        int nid;
 304
 305        if (mem_cgroup_disabled())
 306                return;
 307
 308        for_each_node_state(nid, N_HIGH_MEMORY) {
 309                unsigned long start_pfn, end_pfn;
 310
 311                start_pfn = node_start_pfn(nid);
 312                end_pfn = node_end_pfn(nid);
 313                /*
 314                 * start_pfn and end_pfn may not be aligned to SECTION and the
 315                 * page->flags of out of node pages are not initialized.  So we
 316                 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
 317                 */
 318                for (pfn = start_pfn;
 319                     pfn < end_pfn;
 320                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
 321
 322                        if (!pfn_valid(pfn))
 323                                continue;
 324                        /*
 325                         * Nodes's pfns can be overlapping.
 326                         * We know some arch can have a nodes layout such as
 327                         * -------------pfn-------------->
 328                         * N0 | N1 | N2 | N0 | N1 | N2|....
 329                         */
 330                        if (pfn_to_nid(pfn) != nid)
 331                                continue;
 332                        if (init_section_page_cgroup(pfn, nid))
 333                                goto oom;
 334                }
 335        }
 336        hotplug_memory_notifier(page_cgroup_callback, 0);
 337        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 338        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
 339                         "don't want memory cgroups\n");
 340        return;
 341oom:
 342        printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
 343        panic("Out of memory");
 344}
 345
 346void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 347{
 348        return;
 349}
 350
 351#endif
 352
 353
 354#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 355
 356static DEFINE_MUTEX(swap_cgroup_mutex);
 357struct swap_cgroup_ctrl {
 358        struct page **map;
 359        unsigned long length;
 360        spinlock_t      lock;
 361};
 362
 363static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 364
 365struct swap_cgroup {
 366        unsigned short          id;
 367};
 368#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
 369#define SC_POS_MASK     (SC_PER_PAGE - 1)
 370
 371/*
 372 * SwapCgroup implements "lookup" and "exchange" operations.
 373 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 374 * against SwapCache. At swap_free(), this is accessed directly from swap.
 375 *
 376 * This means,
 377 *  - we have no race in "exchange" when we're accessed via SwapCache because
 378 *    SwapCache(and its swp_entry) is under lock.
 379 *  - When called via swap_free(), there is no user of this entry and no race.
 380 * Then, we don't need lock around "exchange".
 381 *
 382 * TODO: we can push these buffers out to HIGHMEM.
 383 */
 384
 385/*
 386 * allocate buffer for swap_cgroup.
 387 */
 388static int swap_cgroup_prepare(int type)
 389{
 390        struct page *page;
 391        struct swap_cgroup_ctrl *ctrl;
 392        unsigned long idx, max;
 393
 394        ctrl = &swap_cgroup_ctrl[type];
 395
 396        for (idx = 0; idx < ctrl->length; idx++) {
 397                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 398                if (!page)
 399                        goto not_enough_page;
 400                ctrl->map[idx] = page;
 401        }
 402        return 0;
 403not_enough_page:
 404        max = idx;
 405        for (idx = 0; idx < max; idx++)
 406                __free_page(ctrl->map[idx]);
 407
 408        return -ENOMEM;
 409}
 410
 411/**
 412 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 413 * @end: swap entry to be cmpxchged
 414 * @old: old id
 415 * @new: new id
 416 *
 417 * Returns old id at success, 0 at failure.
 418 * (There is no mem_cgroup using 0 as its id)
 419 */
 420unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 421                                        unsigned short old, unsigned short new)
 422{
 423        int type = swp_type(ent);
 424        unsigned long offset = swp_offset(ent);
 425        unsigned long idx = offset / SC_PER_PAGE;
 426        unsigned long pos = offset & SC_POS_MASK;
 427        struct swap_cgroup_ctrl *ctrl;
 428        struct page *mappage;
 429        struct swap_cgroup *sc;
 430        unsigned long flags;
 431        unsigned short retval;
 432
 433        ctrl = &swap_cgroup_ctrl[type];
 434
 435        mappage = ctrl->map[idx];
 436        sc = page_address(mappage);
 437        sc += pos;
 438        spin_lock_irqsave(&ctrl->lock, flags);
 439        retval = sc->id;
 440        if (retval == old)
 441                sc->id = new;
 442        else
 443                retval = 0;
 444        spin_unlock_irqrestore(&ctrl->lock, flags);
 445        return retval;
 446}
 447
 448/**
 449 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 450 * @ent: swap entry to be recorded into
 451 * @mem: mem_cgroup to be recorded
 452 *
 453 * Returns old value at success, 0 at failure.
 454 * (Of course, old value can be 0.)
 455 */
 456unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 457{
 458        int type = swp_type(ent);
 459        unsigned long offset = swp_offset(ent);
 460        unsigned long idx = offset / SC_PER_PAGE;
 461        unsigned long pos = offset & SC_POS_MASK;
 462        struct swap_cgroup_ctrl *ctrl;
 463        struct page *mappage;
 464        struct swap_cgroup *sc;
 465        unsigned short old;
 466        unsigned long flags;
 467
 468        ctrl = &swap_cgroup_ctrl[type];
 469
 470        mappage = ctrl->map[idx];
 471        sc = page_address(mappage);
 472        sc += pos;
 473        spin_lock_irqsave(&ctrl->lock, flags);
 474        old = sc->id;
 475        sc->id = id;
 476        spin_unlock_irqrestore(&ctrl->lock, flags);
 477
 478        return old;
 479}
 480
 481/**
 482 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 483 * @ent: swap entry to be looked up.
 484 *
 485 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 486 */
 487unsigned short lookup_swap_cgroup(swp_entry_t ent)
 488{
 489        int type = swp_type(ent);
 490        unsigned long offset = swp_offset(ent);
 491        unsigned long idx = offset / SC_PER_PAGE;
 492        unsigned long pos = offset & SC_POS_MASK;
 493        struct swap_cgroup_ctrl *ctrl;
 494        struct page *mappage;
 495        struct swap_cgroup *sc;
 496        unsigned short ret;
 497
 498        ctrl = &swap_cgroup_ctrl[type];
 499        mappage = ctrl->map[idx];
 500        sc = page_address(mappage);
 501        sc += pos;
 502        ret = sc->id;
 503        return ret;
 504}
 505
 506int swap_cgroup_swapon(int type, unsigned long max_pages)
 507{
 508        void *array;
 509        unsigned long array_size;
 510        unsigned long length;
 511        struct swap_cgroup_ctrl *ctrl;
 512
 513        if (!do_swap_account)
 514                return 0;
 515
 516        length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
 517        array_size = length * sizeof(void *);
 518
 519        array = vzalloc(array_size);
 520        if (!array)
 521                goto nomem;
 522
 523        ctrl = &swap_cgroup_ctrl[type];
 524        mutex_lock(&swap_cgroup_mutex);
 525        ctrl->length = length;
 526        ctrl->map = array;
 527        spin_lock_init(&ctrl->lock);
 528        if (swap_cgroup_prepare(type)) {
 529                /* memory shortage */
 530                ctrl->map = NULL;
 531                ctrl->length = 0;
 532                mutex_unlock(&swap_cgroup_mutex);
 533                vfree(array);
 534                goto nomem;
 535        }
 536        mutex_unlock(&swap_cgroup_mutex);
 537
 538        return 0;
 539nomem:
 540        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
 541        printk(KERN_INFO
 542                "swap_cgroup can be disabled by swapaccount=0 boot option\n");
 543        return -ENOMEM;
 544}
 545
 546void swap_cgroup_swapoff(int type)
 547{
 548        struct page **map;
 549        unsigned long i, length;
 550        struct swap_cgroup_ctrl *ctrl;
 551
 552        if (!do_swap_account)
 553                return;
 554
 555        mutex_lock(&swap_cgroup_mutex);
 556        ctrl = &swap_cgroup_ctrl[type];
 557        map = ctrl->map;
 558        length = ctrl->length;
 559        ctrl->map = NULL;
 560        ctrl->length = 0;
 561        mutex_unlock(&swap_cgroup_mutex);
 562
 563        if (map) {
 564                for (i = 0; i < length; i++) {
 565                        struct page *page = map[i];
 566                        if (page)
 567                                __free_page(page);
 568                }
 569                vfree(map);
 570        }
 571}
 572
 573#endif
 574
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.