linux/mm/page_cgroup.c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>

static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
        pc->flags = 0;
        pc->mem_cgroup = NULL;
        pc->page = pfn_to_page(pfn);
        INIT_LIST_HEAD(&pc->lru);
}

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
        if (unlikely(!base))
                return NULL;

        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}
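
/*
 * Illustrative sketch (not part of the original file): how a caller such
 * as the memcg charge path might use lookup_page_cgroup().  It assumes the
 * lock_page_cgroup()/unlock_page_cgroup() and PageCgroupUsed() helpers
 * from <linux/page_cgroup.h>; the "example_" name is hypothetical.
 */
static bool __maybe_unused example_page_is_charged(struct page *page)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);
        bool used;

        if (!pc)                        /* no table for this node */
                return false;

        lock_page_cgroup(pc);           /* bit spinlock kept in pc->flags */
        used = PageCgroupUsed(pc);      /* set while the page is charged */
        unlock_page_cgroup(pc);

        return used;
}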

static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        unsigned long start_pfn, nr_pages, index;

        start_pfn = NODE_DATA(nid)->node_start_pfn;
        nr_pages = NODE_DATA(nid)->node_spanned_pages;

        if (!nr_pages)
                return 0;

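        /*
         * Rough cost, as an aside: on a 64-bit build struct page_cgroup is
         * roughly 40 bytes (flags, two pointers and a list_head), so this
         * table costs on the order of 10 MiB per 1 GiB of RAM with 4 KiB
         * pages.  Numbers are illustrative, not taken from the source.
         */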
        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
        for (index = 0; index < nr_pages; index++) {
                pc = base + index;
                __init_page_cgroup(pc, start_pfn + index);
        }
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}

void __init page_cgroup_init(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
               "don't want memory cgroups\n");
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);

        return section->page_cgroup + pfn;
}

/* __alloc_bootmem...() is protected by !slab_available() */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
        struct mem_section *section = __pfn_to_section(pfn);
        struct page_cgroup *base, *pc;
        unsigned long table_size;
        int nid, index;

        if (!section->page_cgroup) {
                nid = page_to_nid(pfn_to_page(pfn));
                table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
                if (slab_is_available()) {
                        base = kmalloc_node(table_size,
                                        GFP_KERNEL | __GFP_NOWARN, nid);
                        if (!base)
                                base = vmalloc_node(table_size, nid);
                } else {
                        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                                table_size,
                                PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
                }
        } else {
                /*
                 * We do not need to allocate page_cgroup again, but the
                 * address of the memmap may have changed, so the table
                 * must be initialized again.
                 */
                base = section->page_cgroup + pfn;
                table_size = 0;
                /* check whether the memmap address has changed */
                if (base->page == pfn_to_page(pfn))
                        return 0;
        }

        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        for (index = 0; index < PAGES_PER_SECTION; index++) {
                pc = base + index;
                __init_page_cgroup(pc, pfn + index);
        }

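        /*
         * Store the table biased by this section's first pfn, so that
         * lookup_page_cgroup() can index it directly with an absolute pfn
         * (section->page_cgroup + pfn) without subtracting the base again.
         */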
        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        if (is_vmalloc_addr(base)) {
                vfree(base);
                ms->page_cgroup = NULL;
        } else {
                struct page *page = virt_to_page(base);
                if (!PageReserved(page)) { /* Is bootmem ? */
                        kfree(base);
                        ms->page_cgroup = NULL;
                }
        }
}

int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

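        /*
         * page_cgroup tables are managed per mem_section, so round the
         * hot-added range down (start) and up (end) to section boundaries.
         */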
        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}

int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_CANCEL_ONLINE:
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        if (ret)
                ret = notifier_from_errno(ret);
        else
                ret = NOTIFY_OK;

        return ret;
}

#endif

void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int fail = 0;

        if (mem_cgroup_disabled())
                return;

        for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn);
        }
        if (fail) {
                printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
                panic("Out of memory");
        } else {
                hotplug_memory_notifier(page_cgroup_callback, 0);
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
               "don't want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK     (SC_PER_PAGE - 1)
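
/*
 * Illustrative sizing, assuming 4 KiB pages and a 2-byte struct swap_cgroup:
 * SC_PER_PAGE = 4096 / 2 = 2048 entries per map page, i.e. about 512 KiB of
 * swap_cgroup data per 1 GiB of swap space.
 */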

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical use, swap_cgroup is accessed via memcg's charge/uncharge
 * operations against SwapCache.  At swap_free(), it is accessed directly
 * from the swap code.
 *
 * This means:
 *  - there is no race in "exchange" when we are accessed via SwapCache,
 *    because the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no other user of the entry,
 *    so again there is no race.
 * Hence no lock is needed around "exchange".
 *
 * TODO: these buffers could be pushed out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        if (!do_swap_account)
                return 0;
        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: css id of the mem_cgroup to be recorded
 *
 * Returns the old value on success, 0 on failure.
 * (Of course, the old value can itself be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short old;

        if (!do_swap_account)
                return 0;

        ctrl = &swap_cgroup_ctrl[type];

        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        old = sc->id;
        sc->id = id;

        return old;
}

/**
 * lookup_swap_cgroup - look up the mem_cgroup tied to a swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
        int type = swp_type(ent);
        unsigned long offset = swp_offset(ent);
        unsigned long idx = offset / SC_PER_PAGE;
        unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short ret;

        if (!do_swap_account)
                return 0;

        ctrl = &swap_cgroup_ctrl[type];
        mappage = ctrl->map[idx];
        sc = page_address(mappage);
        sc += pos;
        ret = sc->id;
        return ret;
}
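
/*
 * Illustrative sketch (not part of the original file): how the memcg
 * swap-accounting paths could use the two helpers above.  At swap-out the
 * css id of the owning mem_cgroup is recorded for the entry; later it is
 * read back and cleared again by recording 0.  The "example_" function
 * below is hypothetical.
 */
static unsigned short __maybe_unused example_swap_accounting(swp_entry_t ent,
                                                unsigned short css_id)
{
        unsigned short old;

        /* charge side: remember which cgroup owns the swapped-out page */
        swap_cgroup_record(ent, css_id);

        /* uncharge side: read the owner back, e.g. around swap_free() ... */
        old = lookup_swap_cgroup(ent);

        /* ... and clear the record once the entry is released */
        swap_cgroup_record(ent, 0);

        return old;             /* css id that was charged, or 0 */
}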

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = ((max_pages/SC_PER_PAGE) + 1);
        array_size = length * sizeof(void *);

        array = vmalloc(array_size);
        if (!array)
                goto nomem;

        memset(array, 0, array_size);
        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                vfree(array);
                mutex_unlock(&swap_cgroup_mutex);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by noswapaccount boot option\n");
        return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
        int i;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        if (ctrl->map) {
                for (i = 0; i < ctrl->length; i++) {
                        struct page *page = ctrl->map[i];
                        if (page)
                                __free_page(page);
                }
                vfree(ctrl->map);
                ctrl->map = NULL;
                ctrl->length = 0;
        }
        mutex_unlock(&swap_cgroup_mutex);
}

#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */