linux/mm/page_cgroup.c
<<
>>
Prefs
   1#include <linux/mm.h>
   2#include <linux/mmzone.h>
   3#include <linux/bootmem.h>
   4#include <linux/bit_spinlock.h>
   5#include <linux/page_cgroup.h>
   6#include <linux/hash.h>
   7#include <linux/slab.h>
   8#include <linux/memory.h>
   9#include <linux/vmalloc.h>
  10#include <linux/cgroup.h>
  11#include <linux/swapops.h>
  12
  13static void __meminit
  14__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
  15{
  16        pc->flags = 0;
  17        pc->mem_cgroup = NULL;
  18        pc->page = pfn_to_page(pfn);
  19        INIT_LIST_HEAD(&pc->lru);
  20}
  21static unsigned long total_usage;
  22
  23#if !defined(CONFIG_SPARSEMEM)
  24
  25
  26void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
  27{
  28        pgdat->node_page_cgroup = NULL;
  29}
  30
  31struct page_cgroup *lookup_page_cgroup(struct page *page)
  32{
  33        unsigned long pfn = page_to_pfn(page);
  34        unsigned long offset;
  35        struct page_cgroup *base;
  36
  37        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
  38        if (unlikely(!base))
  39                return NULL;
  40
  41        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
  42        return base + offset;
  43}
  44
  45static int __init alloc_node_page_cgroup(int nid)
  46{
  47        struct page_cgroup *base, *pc;
  48        unsigned long table_size;
  49        unsigned long start_pfn, nr_pages, index;
  50
  51        start_pfn = NODE_DATA(nid)->node_start_pfn;
  52        nr_pages = NODE_DATA(nid)->node_spanned_pages;
  53
  54        if (!nr_pages)
  55                return 0;
  56
  57        table_size = sizeof(struct page_cgroup) * nr_pages;
  58
  59        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
  60                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
  61        if (!base)
  62                return -ENOMEM;
  63        for (index = 0; index < nr_pages; index++) {
  64                pc = base + index;
  65                __init_page_cgroup(pc, start_pfn + index);
  66        }
  67        NODE_DATA(nid)->node_page_cgroup = base;
  68        total_usage += table_size;
  69        return 0;
  70}
  71
  72void __init page_cgroup_init(void)
  73{
  74
  75        int nid, fail;
  76
  77        if (mem_cgroup_disabled())
  78                return;
  79
  80        for_each_online_node(nid)  {
  81                fail = alloc_node_page_cgroup(nid);
  82                if (fail)
  83                        goto fail;
  84        }
  85        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
  86        printk(KERN_INFO "please try cgroup_disable=memory option if you"
  87        " don't want\n");
  88        return;
  89fail:
  90        printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
  91        printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
  92        panic("Out of memory");
  93}
  94
#else /* CONFIG_SPARSEMEM: page_cgroup tables are kept per memory section */
  96
  97struct page_cgroup *lookup_page_cgroup(struct page *page)
  98{
  99        unsigned long pfn = page_to_pfn(page);
 100        struct mem_section *section = __pfn_to_section(pfn);
 101
 102        return section->page_cgroup + pfn;
 103}
 104
 105/* __alloc_bootmem...() is protected by !slab_available() */
 106static int __init_refok init_section_page_cgroup(unsigned long pfn)
 107{
 108        struct mem_section *section = __pfn_to_section(pfn);
 109        struct page_cgroup *base, *pc;
 110        unsigned long table_size;
 111        int nid, index;
 112
 113        if (!section->page_cgroup) {
 114                nid = page_to_nid(pfn_to_page(pfn));
 115                table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 116                if (slab_is_available()) {
 117                        base = kmalloc_node(table_size,
 118                                        GFP_KERNEL | __GFP_NOWARN, nid);
 119                        if (!base)
 120                                base = vmalloc_node(table_size, nid);
 121                } else {
 122                        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
 123                                table_size,
 124                                PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 125                }
 126        } else {
 127                /*
 128                 * We don't have to allocate page_cgroup again, but
 129                 * address of memmap may be changed. So, we have to initialize
 130                 * again.
 131                 */
 132                base = section->page_cgroup + pfn;
 133                table_size = 0;
 134                /* check address of memmap is changed or not. */
 135                if (base->page == pfn_to_page(pfn))
 136                        return 0;
 137        }
 138
 139        if (!base) {
 140                printk(KERN_ERR "page cgroup allocation failure\n");
 141                return -ENOMEM;
 142        }
 143
 144        for (index = 0; index < PAGES_PER_SECTION; index++) {
 145                pc = base + index;
 146                __init_page_cgroup(pc, pfn + index);
 147        }
 148
 149        section->page_cgroup = base - pfn;
 150        total_usage += table_size;
 151        return 0;
 152}
 153#ifdef CONFIG_MEMORY_HOTPLUG
 154void __free_page_cgroup(unsigned long pfn)
 155{
 156        struct mem_section *ms;
 157        struct page_cgroup *base;
 158
 159        ms = __pfn_to_section(pfn);
 160        if (!ms || !ms->page_cgroup)
 161                return;
 162        base = ms->page_cgroup + pfn;
 163        if (is_vmalloc_addr(base)) {
 164                vfree(base);
 165                ms->page_cgroup = NULL;
 166        } else {
 167                struct page *page = virt_to_page(base);
 168                if (!PageReserved(page)) { /* Is bootmem ? */
 169                        kfree(base);
 170                        ms->page_cgroup = NULL;
 171                }
 172        }
 173}
 174
 175int __meminit online_page_cgroup(unsigned long start_pfn,
 176                        unsigned long nr_pages,
 177                        int nid)
 178{
 179        unsigned long start, end, pfn;
 180        int fail = 0;
 181
 182        start = start_pfn & ~(PAGES_PER_SECTION - 1);
 183        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 184
 185        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 186                if (!pfn_present(pfn))
 187                        continue;
 188                fail = init_section_page_cgroup(pfn);
 189        }
 190        if (!fail)
 191                return 0;
 192
 193        /* rollback */
 194        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 195                __free_page_cgroup(pfn);
 196
 197        return -ENOMEM;
 198}
 199
 200int __meminit offline_page_cgroup(unsigned long start_pfn,
 201                unsigned long nr_pages, int nid)
 202{
 203        unsigned long start, end, pfn;
 204
 205        start = start_pfn & ~(PAGES_PER_SECTION - 1);
 206        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 207
 208        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 209                __free_page_cgroup(pfn);
 210        return 0;
 211
 212}
 213
 214static int __meminit page_cgroup_callback(struct notifier_block *self,
 215                               unsigned long action, void *arg)
 216{
 217        struct memory_notify *mn = arg;
 218        int ret = 0;
 219        switch (action) {
 220        case MEM_GOING_ONLINE:
 221                ret = online_page_cgroup(mn->start_pfn,
 222                                   mn->nr_pages, mn->status_change_nid);
 223                break;
 224        case MEM_OFFLINE:
 225                offline_page_cgroup(mn->start_pfn,
 226                                mn->nr_pages, mn->status_change_nid);
 227                break;
 228        case MEM_CANCEL_ONLINE:
 229        case MEM_GOING_OFFLINE:
 230                break;
 231        case MEM_ONLINE:
 232        case MEM_CANCEL_OFFLINE:
 233                break;
 234        }
 235
 236        if (ret)
 237                ret = notifier_from_errno(ret);
 238        else
 239                ret = NOTIFY_OK;
 240
 241        return ret;
 242}
 243
 244#endif
 245
 246void __init page_cgroup_init(void)
 247{
 248        unsigned long pfn;
 249        int fail = 0;
 250
 251        if (mem_cgroup_disabled())
 252                return;
 253
 254        for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
 255                if (!pfn_present(pfn))
 256                        continue;
 257                fail = init_section_page_cgroup(pfn);
 258        }
 259        if (fail) {
 260                printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
 261                panic("Out of memory");
 262        } else {
 263                hotplug_memory_notifier(page_cgroup_callback, 0);
 264        }
 265        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 266        printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
 267        " want\n");
 268}
 269
 270void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 271{
 272        return;
 273}
 274
 275#endif
 276
 277
 278#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 279
 280static DEFINE_MUTEX(swap_cgroup_mutex);
 281struct swap_cgroup_ctrl {
 282        struct page **map;
 283        unsigned long length;
 284};
 285
 286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 287
 288/*
 289 * This 8bytes seems big..maybe we can reduce this when we can use "id" for
 290 * cgroup rather than pointer.
 291 */
 292struct swap_cgroup {
 293        struct mem_cgroup       *val;
 294};
 295#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
 296#define SC_POS_MASK     (SC_PER_PAGE - 1)
 297
/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), it is accessed directly from swap.
 *
 * This means:
 *  - there is no race in "exchange" when we are called via SwapCache,
 *    because SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), the entry has no other user, so there is
 *    no race either.
 * Therefore no lock is needed around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
 311
 312/*
 313 * allocate buffer for swap_cgroup.
 314 */
 315static int swap_cgroup_prepare(int type)
 316{
 317        struct page *page;
 318        struct swap_cgroup_ctrl *ctrl;
 319        unsigned long idx, max;
 320
 321        if (!do_swap_account)
 322                return 0;
 323        ctrl = &swap_cgroup_ctrl[type];
 324
 325        for (idx = 0; idx < ctrl->length; idx++) {
 326                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 327                if (!page)
 328                        goto not_enough_page;
 329                ctrl->map[idx] = page;
 330        }
 331        return 0;
 332not_enough_page:
 333        max = idx;
 334        for (idx = 0; idx < max; idx++)
 335                __free_page(ctrl->map[idx]);
 336
 337        return -ENOMEM;
 338}
 339
 340/**
 341 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 342 * @ent: swap entry to be recorded into
 343 * @mem: mem_cgroup to be recorded
 344 *
 345 * Returns old value at success, NULL at failure.
 346 * (Of course, old value can be NULL.)
 347 */
 348struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
 349{
 350        int type = swp_type(ent);
 351        unsigned long offset = swp_offset(ent);
 352        unsigned long idx = offset / SC_PER_PAGE;
 353        unsigned long pos = offset & SC_POS_MASK;
 354        struct swap_cgroup_ctrl *ctrl;
 355        struct page *mappage;
 356        struct swap_cgroup *sc;
 357        struct mem_cgroup *old;
 358
 359        if (!do_swap_account)
 360                return NULL;
 361
 362        ctrl = &swap_cgroup_ctrl[type];
 363
 364        mappage = ctrl->map[idx];
 365        sc = page_address(mappage);
 366        sc += pos;
 367        old = sc->val;
 368        sc->val = mem;
 369
 370        return old;
 371}
 372
 373/**
 374 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 375 * @ent: swap entry to be looked up.
 376 *
 377 * Returns pointer to mem_cgroup at success. NULL at failure.
 378 */
 379struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
 380{
 381        int type = swp_type(ent);
 382        unsigned long offset = swp_offset(ent);
 383        unsigned long idx = offset / SC_PER_PAGE;
 384        unsigned long pos = offset & SC_POS_MASK;
 385        struct swap_cgroup_ctrl *ctrl;
 386        struct page *mappage;
 387        struct swap_cgroup *sc;
 388        struct mem_cgroup *ret;
 389
 390        if (!do_swap_account)
 391                return NULL;
 392
 393        ctrl = &swap_cgroup_ctrl[type];
 394        mappage = ctrl->map[idx];
 395        sc = page_address(mappage);
 396        sc += pos;
 397        ret = sc->val;
 398        return ret;
 399}
 400
 401int swap_cgroup_swapon(int type, unsigned long max_pages)
 402{
 403        void *array;
 404        unsigned long array_size;
 405        unsigned long length;
 406        struct swap_cgroup_ctrl *ctrl;
 407
 408        if (!do_swap_account)
 409                return 0;
 410
 411        length = ((max_pages/SC_PER_PAGE) + 1);
 412        array_size = length * sizeof(void *);
 413
 414        array = vmalloc(array_size);
 415        if (!array)
 416                goto nomem;
 417
 418        memset(array, 0, array_size);
 419        ctrl = &swap_cgroup_ctrl[type];
 420        mutex_lock(&swap_cgroup_mutex);
 421        ctrl->length = length;
 422        ctrl->map = array;
 423        if (swap_cgroup_prepare(type)) {
 424                /* memory shortage */
 425                ctrl->map = NULL;
 426                ctrl->length = 0;
 427                vfree(array);
 428                mutex_unlock(&swap_cgroup_mutex);
 429                goto nomem;
 430        }
 431        mutex_unlock(&swap_cgroup_mutex);
 432
 433        printk(KERN_INFO
 434                "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
 435                " and %ld bytes to hold mem_cgroup pointers on swap\n",
 436                array_size, length * PAGE_SIZE);
 437        printk(KERN_INFO
 438        "swap_cgroup can be disabled by noswapaccount boot option.\n");
 439
 440        return 0;
 441nomem:
 442        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
 443        printk(KERN_INFO
 444                "swap_cgroup can be disabled by noswapaccount boot option\n");
 445        return -ENOMEM;
 446}
 447
 448void swap_cgroup_swapoff(int type)
 449{
 450        int i;
 451        struct swap_cgroup_ctrl *ctrl;
 452
 453        if (!do_swap_account)
 454                return;
 455
 456        mutex_lock(&swap_cgroup_mutex);
 457        ctrl = &swap_cgroup_ctrl[type];
 458        if (ctrl->map) {
 459                for (i = 0; i < ctrl->length; i++) {
 460                        struct page *page = ctrl->map[i];
 461                        if (page)
 462                                __free_page(page);
 463                }
 464                vfree(ctrl->map);
 465                ctrl->map = NULL;
 466                ctrl->length = 0;
 467        }
 468        mutex_unlock(&swap_cgroup_mutex);
 469}
 470
 471#endif
 472