linux/mm/slub.c
   1/*
   2 * SLUB: A slab allocator that limits cache line use instead of queuing
   3 * objects in per cpu and per node lists.
   4 *
   5 * The allocator synchronizes using per slab locks or atomic operations
   6 * and only uses a centralized lock to manage a pool of partial slabs.
   7 *
   8 * (C) 2007 SGI, Christoph Lameter
   9 * (C) 2011 Linux Foundation, Christoph Lameter
  10 */
  11
  12#include <linux/mm.h>
  13#include <linux/swap.h> /* struct reclaim_state */
  14#include <linux/module.h>
  15#include <linux/bit_spinlock.h>
  16#include <linux/interrupt.h>
  17#include <linux/bitops.h>
  18#include <linux/slab.h>
  19#include "slab.h"
  20#include <linux/proc_fs.h>
  21#include <linux/notifier.h>
  22#include <linux/seq_file.h>
  23#include <linux/kmemcheck.h>
  24#include <linux/cpu.h>
  25#include <linux/cpuset.h>
  26#include <linux/mempolicy.h>
  27#include <linux/ctype.h>
  28#include <linux/debugobjects.h>
  29#include <linux/kallsyms.h>
  30#include <linux/memory.h>
  31#include <linux/math64.h>
  32#include <linux/fault-inject.h>
  33#include <linux/stacktrace.h>
  34#include <linux/prefetch.h>
  35#include <linux/memcontrol.h>
  36
  37#include <trace/events/kmem.h>
  38
  39#include "internal.h"
  40
  41/*
  42 * Lock order:
  43 *   1. slab_mutex (Global Mutex)
  44 *   2. node->list_lock
  45 *   3. slab_lock(page) (Only on some arches and for debugging)
  46 *
  47 *   slab_mutex
  48 *
  49 *   The role of the slab_mutex is to protect the list of all the slabs
  50 *   and to synchronize major metadata changes to slab cache structures.
  51 *
  52 *   The slab_lock is only used for debugging and on arches that do not
  53 *   have the ability to do a cmpxchg_double. It only protects the second
  54 * double word in the page struct. Meaning:
  55 *      A. page->freelist       -> List of free objects in a page
  56 *      B. page->counters       -> Counters of objects
  57 *      C. page->frozen         -> frozen state
  58 *
  59 *   If a slab is frozen then it is exempt from list management. It is not
  60 *   on any list. The processor that froze the slab is the one who can
  61 *   perform list operations on the page. Other processors may put objects
  62 *   onto the freelist but the processor that froze the slab is the only
  63 *   one that can retrieve the objects from the page's freelist.
  64 *
  65 *   The list_lock protects the partial and full list on each node and
  66 *   the partial slab counter. If taken then no slabs may be added to or
  67 *   removed from the lists, nor may the number of partial slabs be modified.
  68 *   (Note that the total number of slabs is an atomic value that may be
  69 *   modified without taking the list lock).
  70 *
  71 *   The list_lock is a centralized lock and thus we avoid taking it as
  72 *   much as possible. As long as SLUB does not have to handle partial
  73 *   slabs, operations can continue without any centralized lock. F.e.
  74 *   allocating a long series of objects that fill up slabs does not require
  75 *   the list lock.
  76 *   Interrupts are disabled during allocation and deallocation in order to
  77 *   make the slab allocator safe to use in the context of an irq. In addition
  78 *   interrupts are disabled to ensure that the processor does not change
  79 *   while handling per_cpu slabs, due to kernel preemption.
  80 *
  81 * SLUB assigns one slab for allocation to each processor.
  82 * Allocations only occur from these slabs called cpu slabs.
  83 *
  84 * Slabs with free elements are kept on a partial list and during regular
  85 * operations no list for full slabs is used. If an object in a full slab is
  86 * freed then the slab will show up again on the partial lists.
  87 * We track full slabs for debugging purposes though because otherwise we
  88 * cannot scan all objects.
  89 *
  90 * Slabs are freed when they become empty. Teardown and setup is
  91 * minimal so we rely on the page allocators per cpu caches for
  92 * fast frees and allocs.
  93 *
  94 * Overloading of page flags that are otherwise used for LRU management.
  95 *
  96 * PageActive           The slab is frozen and exempt from list processing.
  97 *                      This means that the slab is dedicated to a purpose
  98 *                      such as satisfying allocations for a specific
  99 *                      processor. Objects may be freed in the slab while
 100 *                      it is frozen but slab_free will then skip the usual
 101 *                      list operations. It is up to the processor holding
 102 *                      the slab to integrate the slab into the slab lists
 103 *                      when the slab is no longer needed.
 104 *
 105 *                      One use of this flag is to mark slabs that are
 106 *                      used for allocations. Then such a slab becomes a cpu
 107 *                      slab. The cpu slab may be equipped with an additional
 108 *                      freelist that allows lockless access to
 109 *                      free objects in addition to the regular freelist
 110 *                      that requires the slab lock.
 111 *
 112 * PageError            Slab requires special handling due to debug
 113 *                      options set. This moves slab handling out of
 114 *                      the fast path and disables lockless freelists.
 115 */
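
/*
 * Illustrative sketch (not part of the original file): how the lock order
 * described above is typically honored when walking a node's partial list
 * and inspecting each slab page. The helper name and body are hypothetical;
 * the debug/validation code later in this file follows the same
 * list_lock -> slab_lock nesting.
 */
static void __maybe_unused example_walk_partial(struct kmem_cache_node *n)
{
	struct page *page;
	unsigned long flags;

	/* 2. node->list_lock guards the partial list itself */
	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(page, &n->partial, lru) {
		/* 3. the per-slab lock guards freelist/counters/frozen */
		bit_spin_lock(PG_locked, &page->flags);
		/* ... examine page->freelist, page->inuse, page->frozen ... */
		__bit_spin_unlock(PG_locked, &page->flags);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
}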
 116
 117static inline int kmem_cache_debug(struct kmem_cache *s)
 118{
 119#ifdef CONFIG_SLUB_DEBUG
 120        return unlikely(s->flags & SLAB_DEBUG_FLAGS);
 121#else
 122        return 0;
 123#endif
 124}
 125
 126static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 127{
 128#ifdef CONFIG_SLUB_CPU_PARTIAL
 129        return !kmem_cache_debug(s);
 130#else
 131        return false;
 132#endif
 133}
 134
 135/*
 136 * Issues still to be resolved:
 137 *
 138 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 139 *
 140 * - Variable sizing of the per node arrays
 141 */
 142
 143/* Enable to test recovery from slab corruption on boot */
 144#undef SLUB_RESILIENCY_TEST
 145
 146/* Enable to log cmpxchg failures */
 147#undef SLUB_DEBUG_CMPXCHG
 148
 149/*
 150 * Minimum number of partial slabs. These will be left on the partial
 151 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 152 */
 153#define MIN_PARTIAL 5
 154
 155/*
 156 * Maximum number of desirable partial slabs.
 157 * The existence of more partial slabs makes kmem_cache_shrink
 158 * sort the partial list by the number of objects in them.
 159 */
 160#define MAX_PARTIAL 10
 161
 162#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 163                                SLAB_POISON | SLAB_STORE_USER)
 164
 165/*
 166 * Debugging flags that require metadata to be stored in the slab.  These get
 167 * disabled when slub_debug=O is used and a cache's min order increases with
 168 * metadata.
 169 */
 170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 171
 172/*
 173 * Set of flags that will prevent slab merging
 174 */
 175#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 176                SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
 177                SLAB_FAILSLAB)
 178
 179#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 180                SLAB_CACHE_DMA | SLAB_NOTRACK)
 181
 182#define OO_SHIFT        16
 183#define OO_MASK         ((1 << OO_SHIFT) - 1)
 184#define MAX_OBJS_PER_PAGE       32767 /* since page.objects is u15 */
 185
 186/* Internal SLUB flags */
 187#define __OBJECT_POISON         0x80000000UL /* Poison object */
 188#define __CMPXCHG_DOUBLE        0x40000000UL /* Use cmpxchg_double */
 189
 190#ifdef CONFIG_SMP
 191static struct notifier_block slab_notifier;
 192#endif
 193
 194/*
 195 * Tracking user of a slab.
 196 */
 197#define TRACK_ADDRS_COUNT 16
 198struct track {
 199        unsigned long addr;     /* Called from address */
 200#ifdef CONFIG_STACKTRACE
  201        unsigned long addrs[TRACK_ADDRS_COUNT]; /* Call trace addresses */
 202#endif
 203        int cpu;                /* Was running on cpu */
 204        int pid;                /* Pid context */
 205        unsigned long when;     /* When did the operation occur */
 206};
 207
 208enum track_item { TRACK_ALLOC, TRACK_FREE };
 209
 210#ifdef CONFIG_SYSFS
 211static int sysfs_slab_add(struct kmem_cache *);
 212static int sysfs_slab_alias(struct kmem_cache *, const char *);
 213static void sysfs_slab_remove(struct kmem_cache *);
 214static void memcg_propagate_slab_attrs(struct kmem_cache *s);
 215#else
 216static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 217static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 218                                                        { return 0; }
 219static inline void sysfs_slab_remove(struct kmem_cache *s) { }
 220
 221static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
 222#endif
 223
 224static inline void stat(const struct kmem_cache *s, enum stat_item si)
 225{
 226#ifdef CONFIG_SLUB_STATS
 227        __this_cpu_inc(s->cpu_slab->stat[si]);
 228#endif
 229}
 230
 231/********************************************************************
 232 *                      Core slab cache functions
 233 *******************************************************************/
 234
 235static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 236{
 237        return s->node[node];
 238}
 239
 240/* Verify that a pointer has an address that is valid within a slab page */
 241static inline int check_valid_pointer(struct kmem_cache *s,
 242                                struct page *page, const void *object)
 243{
 244        void *base;
 245
 246        if (!object)
 247                return 1;
 248
 249        base = page_address(page);
 250        if (object < base || object >= base + page->objects * s->size ||
 251                (object - base) % s->size) {
 252                return 0;
 253        }
 254
 255        return 1;
 256}
 257
 258static inline void *get_freepointer(struct kmem_cache *s, void *object)
 259{
 260        return *(void **)(object + s->offset);
 261}
 262
 263static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 264{
 265        prefetch(object + s->offset);
 266}
 267
 268static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 269{
 270        void *p;
 271
 272#ifdef CONFIG_DEBUG_PAGEALLOC
 273        probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
 274#else
 275        p = get_freepointer(s, object);
 276#endif
 277        return p;
 278}
 279
 280static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 281{
 282        *(void **)(object + s->offset) = fp;
 283}
 284
 285/* Loop over all objects in a slab */
 286#define for_each_object(__p, __s, __addr, __objects) \
 287        for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
 288                        __p += (__s)->size)
 289
 290/* Determine object index from a given position */
 291static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 292{
 293        return (p - addr) / s->size;
 294}
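
/*
 * Illustrative sketch (not part of the original file): for_each_object()
 * and slab_index() are inverses of one another. Walking a slab with the
 * macro and mapping each object pointer back yields the indices 0, 1, 2...
 * The helper below is hypothetical and only demonstrates that relationship.
 */
static void __maybe_unused example_index_roundtrip(struct kmem_cache *s,
						struct page *page)
{
	void *addr = page_address(page);
	void *p;
	int expected = 0;

	for_each_object(p, s, addr, page->objects)
		WARN_ON(slab_index(p, s, addr) != expected++);
}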
 295
 296static inline size_t slab_ksize(const struct kmem_cache *s)
 297{
 298#ifdef CONFIG_SLUB_DEBUG
 299        /*
 300         * Debugging requires use of the padding between object
 301         * and whatever may come after it.
 302         */
 303        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
 304                return s->object_size;
 305
 306#endif
 307        /*
 308         * If we have the need to store the freelist pointer
 309         * back there or track user information then we can
 310         * only use the space before that information.
 311         */
 312        if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
 313                return s->inuse;
 314        /*
 315         * Else we can use all the padding etc for the allocation
 316         */
 317        return s->size;
 318}
 319
 320static inline int order_objects(int order, unsigned long size, int reserved)
 321{
 322        return ((PAGE_SIZE << order) - reserved) / size;
 323}
 324
 325static inline struct kmem_cache_order_objects oo_make(int order,
 326                unsigned long size, int reserved)
 327{
 328        struct kmem_cache_order_objects x = {
 329                (order << OO_SHIFT) + order_objects(order, size, reserved)
 330        };
 331
 332        return x;
 333}
 334
 335static inline int oo_order(struct kmem_cache_order_objects x)
 336{
 337        return x.x >> OO_SHIFT;
 338}
 339
 340static inline int oo_objects(struct kmem_cache_order_objects x)
 341{
 342        return x.x & OO_MASK;
 343}
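
/*
 * Illustrative sketch (not part of the original file): the order/objects
 * pair is packed into one word, page order above OO_SHIFT and object count
 * in the low OO_MASK bits. With 4KiB pages, an order-1 slab of 64-byte
 * objects and no reserved bytes round-trips as shown below. The helper is
 * hypothetical.
 */
static void __maybe_unused example_oo_encoding(void)
{
	/* (PAGE_SIZE << 1) / 64 objects fit in an order-1 slab */
	struct kmem_cache_order_objects oo = oo_make(1, 64, 0);

	WARN_ON(oo_order(oo) != 1);
	WARN_ON(oo_objects(oo) != (PAGE_SIZE << 1) / 64);
}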
 344
 345/*
 346 * Per slab locking using the pagelock
 347 */
 348static __always_inline void slab_lock(struct page *page)
 349{
 350        bit_spin_lock(PG_locked, &page->flags);
 351}
 352
 353static __always_inline void slab_unlock(struct page *page)
 354{
 355        __bit_spin_unlock(PG_locked, &page->flags);
 356}
 357
 358/* Interrupts must be disabled (for the fallback code to work right) */
 359static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 360                void *freelist_old, unsigned long counters_old,
 361                void *freelist_new, unsigned long counters_new,
 362                const char *n)
 363{
 364        VM_BUG_ON(!irqs_disabled());
 365#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 366    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 367        if (s->flags & __CMPXCHG_DOUBLE) {
 368                if (cmpxchg_double(&page->freelist, &page->counters,
 369                        freelist_old, counters_old,
 370                        freelist_new, counters_new))
  371                        return 1;
 372        } else
 373#endif
 374        {
 375                slab_lock(page);
 376                if (page->freelist == freelist_old &&
 377                                        page->counters == counters_old) {
 378                        page->freelist = freelist_new;
 379                        page->counters = counters_new;
 380                        slab_unlock(page);
 381                        return 1;
 382                }
 383                slab_unlock(page);
 384        }
 385
 386        cpu_relax();
 387        stat(s, CMPXCHG_DOUBLE_FAIL);
 388
 389#ifdef SLUB_DEBUG_CMPXCHG
 390        printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
 391#endif
 392
 393        return 0;
 394}
 395
 396static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 397                void *freelist_old, unsigned long counters_old,
 398                void *freelist_new, unsigned long counters_new,
 399                const char *n)
 400{
 401#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 402    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 403        if (s->flags & __CMPXCHG_DOUBLE) {
 404                if (cmpxchg_double(&page->freelist, &page->counters,
 405                        freelist_old, counters_old,
 406                        freelist_new, counters_new))
  407                        return 1;
 408        } else
 409#endif
 410        {
 411                unsigned long flags;
 412
 413                local_irq_save(flags);
 414                slab_lock(page);
 415                if (page->freelist == freelist_old &&
 416                                        page->counters == counters_old) {
 417                        page->freelist = freelist_new;
 418                        page->counters = counters_new;
 419                        slab_unlock(page);
 420                        local_irq_restore(flags);
 421                        return 1;
 422                }
 423                slab_unlock(page);
 424                local_irq_restore(flags);
 425        }
 426
 427        cpu_relax();
 428        stat(s, CMPXCHG_DOUBLE_FAIL);
 429
 430#ifdef SLUB_DEBUG_CMPXCHG
 431        printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
 432#endif
 433
 434        return 0;
 435}
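
/*
 * Illustrative sketch (not part of the original file): callers of
 * cmpxchg_double_slab() typically read freelist and counters into a private
 * struct page, modify that copy and retry until no other CPU has changed
 * the slab underneath them. The helper below is hypothetical; acquire_slab()
 * and the real free/deactivate paths later in this file use the same shape.
 */
static void __maybe_unused example_unfreeze(struct kmem_cache *s,
						struct page *page)
{
	struct page new;
	void *freelist;
	unsigned long counters;

	do {
		freelist = page->freelist;
		counters = page->counters;
		new.counters = counters;	/* copy inuse/objects/frozen */
		new.frozen = 0;			/* the change we want to make */
	} while (!cmpxchg_double_slab(s, page,
			freelist, counters,
			freelist, new.counters,
			"example_unfreeze"));
}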
 436
 437#ifdef CONFIG_SLUB_DEBUG
 438/*
  439 * Determine a map of objects in use on a page: a set bit means the
  440 * object is on the freelist, a clear bit means it is in use.
  441 * The node's list_lock must be held to guarantee that the page does
 442 * not vanish from under us.
 443 */
 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
 445{
 446        void *p;
 447        void *addr = page_address(page);
 448
 449        for (p = page->freelist; p; p = get_freepointer(s, p))
 450                set_bit(slab_index(p, s, addr), map);
 451}
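
/*
 * Illustrative sketch (not part of the original file): typical use of
 * get_map() by validation code. The caller supplies a bitmap with one bit
 * per object; after get_map() a clear bit identifies an allocated object.
 * The helper name is hypothetical.
 */
static int __maybe_unused example_count_allocated(struct kmem_cache *s,
				struct page *page, unsigned long *map)
{
	void *addr = page_address(page);
	void *p;
	int in_use = 0;

	bitmap_zero(map, page->objects);
	get_map(s, page, map);
	for_each_object(p, s, addr, page->objects)
		if (!test_bit(slab_index(p, s, addr), map))
			in_use++;	/* not on the freelist, so in use */
	return in_use;
}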
 452
 453/*
 454 * Debug settings:
 455 */
 456#ifdef CONFIG_SLUB_DEBUG_ON
 457static int slub_debug = DEBUG_DEFAULT_FLAGS;
 458#else
 459static int slub_debug;
 460#endif
 461
 462static char *slub_debug_slabs;
 463static int disable_higher_order_debug;
 464
 465/*
 466 * Object debugging
 467 */
 468static void print_section(char *text, u8 *addr, unsigned int length)
 469{
 470        print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
 471                        length, 1);
 472}
 473
 474static struct track *get_track(struct kmem_cache *s, void *object,
 475        enum track_item alloc)
 476{
 477        struct track *p;
 478
 479        if (s->offset)
 480                p = object + s->offset + sizeof(void *);
 481        else
 482                p = object + s->inuse;
 483
 484        return p + alloc;
 485}
 486
 487static void set_track(struct kmem_cache *s, void *object,
 488                        enum track_item alloc, unsigned long addr)
 489{
 490        struct track *p = get_track(s, object, alloc);
 491
 492        if (addr) {
 493#ifdef CONFIG_STACKTRACE
 494                struct stack_trace trace;
 495                int i;
 496
 497                trace.nr_entries = 0;
 498                trace.max_entries = TRACK_ADDRS_COUNT;
 499                trace.entries = p->addrs;
 500                trace.skip = 3;
 501                save_stack_trace(&trace);
 502
 503                /* See rant in lockdep.c */
 504                if (trace.nr_entries != 0 &&
 505                    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
 506                        trace.nr_entries--;
 507
 508                for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
 509                        p->addrs[i] = 0;
 510#endif
 511                p->addr = addr;
 512                p->cpu = smp_processor_id();
 513                p->pid = current->pid;
 514                p->when = jiffies;
 515        } else
 516                memset(p, 0, sizeof(struct track));
 517}
 518
 519static void init_tracking(struct kmem_cache *s, void *object)
 520{
 521        if (!(s->flags & SLAB_STORE_USER))
 522                return;
 523
 524        set_track(s, object, TRACK_FREE, 0UL);
 525        set_track(s, object, TRACK_ALLOC, 0UL);
 526}
 527
 528static void print_track(const char *s, struct track *t)
 529{
 530        if (!t->addr)
 531                return;
 532
 533        printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 534                s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
 535#ifdef CONFIG_STACKTRACE
 536        {
 537                int i;
 538                for (i = 0; i < TRACK_ADDRS_COUNT; i++)
 539                        if (t->addrs[i])
 540                                printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
 541                        else
 542                                break;
 543        }
 544#endif
 545}
 546
 547static void print_tracking(struct kmem_cache *s, void *object)
 548{
 549        if (!(s->flags & SLAB_STORE_USER))
 550                return;
 551
 552        print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 553        print_track("Freed", get_track(s, object, TRACK_FREE));
 554}
 555
 556static void print_page_info(struct page *page)
 557{
 558        printk(KERN_ERR
 559               "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
 560               page, page->objects, page->inuse, page->freelist, page->flags);
 561
 562}
 563
 564static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 565{
 566        va_list args;
 567        char buf[100];
 568
 569        va_start(args, fmt);
 570        vsnprintf(buf, sizeof(buf), fmt, args);
 571        va_end(args);
 572        printk(KERN_ERR "========================================"
 573                        "=====================================\n");
 574        printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
 575        printk(KERN_ERR "----------------------------------------"
 576                        "-------------------------------------\n\n");
 577
 578        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 579}
 580
 581static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 582{
 583        va_list args;
 584        char buf[100];
 585
 586        va_start(args, fmt);
 587        vsnprintf(buf, sizeof(buf), fmt, args);
 588        va_end(args);
 589        printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 590}
 591
 592static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 593{
 594        unsigned int off;       /* Offset of last byte */
 595        u8 *addr = page_address(page);
 596
 597        print_tracking(s, p);
 598
 599        print_page_info(page);
 600
 601        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 602                        p, p - addr, get_freepointer(s, p));
 603
 604        if (p > addr + 16)
 605                print_section("Bytes b4 ", p - 16, 16);
 606
 607        print_section("Object ", p, min_t(unsigned long, s->object_size,
 608                                PAGE_SIZE));
 609        if (s->flags & SLAB_RED_ZONE)
 610                print_section("Redzone ", p + s->object_size,
 611                        s->inuse - s->object_size);
 612
 613        if (s->offset)
 614                off = s->offset + sizeof(void *);
 615        else
 616                off = s->inuse;
 617
 618        if (s->flags & SLAB_STORE_USER)
 619                off += 2 * sizeof(struct track);
 620
 621        if (off != s->size)
 622                /* Beginning of the filler is the free pointer */
 623                print_section("Padding ", p + off, s->size - off);
 624
 625        dump_stack();
 626}
 627
 628static void object_err(struct kmem_cache *s, struct page *page,
 629                        u8 *object, char *reason)
 630{
 631        slab_bug(s, "%s", reason);
 632        print_trailer(s, page, object);
 633}
 634
 635static void slab_err(struct kmem_cache *s, struct page *page,
 636                        const char *fmt, ...)
 637{
 638        va_list args;
 639        char buf[100];
 640
 641        va_start(args, fmt);
 642        vsnprintf(buf, sizeof(buf), fmt, args);
 643        va_end(args);
 644        slab_bug(s, "%s", buf);
 645        print_page_info(page);
 646        dump_stack();
 647}
 648
 649static void init_object(struct kmem_cache *s, void *object, u8 val)
 650{
 651        u8 *p = object;
 652
 653        if (s->flags & __OBJECT_POISON) {
 654                memset(p, POISON_FREE, s->object_size - 1);
 655                p[s->object_size - 1] = POISON_END;
 656        }
 657
 658        if (s->flags & SLAB_RED_ZONE)
 659                memset(p + s->object_size, val, s->inuse - s->object_size);
 660}
 661
 662static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 663                                                void *from, void *to)
 664{
 665        slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 666        memset(from, data, to - from);
 667}
 668
 669static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 670                        u8 *object, char *what,
 671                        u8 *start, unsigned int value, unsigned int bytes)
 672{
 673        u8 *fault;
 674        u8 *end;
 675
 676        fault = memchr_inv(start, value, bytes);
 677        if (!fault)
 678                return 1;
 679
 680        end = start + bytes;
 681        while (end > fault && end[-1] == value)
 682                end--;
 683
 684        slab_bug(s, "%s overwritten", what);
 685        printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 686                                        fault, end - 1, fault[0], value);
 687        print_trailer(s, page, object);
 688
 689        restore_bytes(s, what, value, fault, end);
 690        return 0;
 691}
 692
 693/*
 694 * Object layout:
 695 *
 696 * object address
 697 *      Bytes of the object to be managed.
 698 *      If the freepointer may overlay the object then the free
 699 *      pointer is the first word of the object.
 700 *
 701 *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 702 *      0xa5 (POISON_END)
 703 *
 704 * object + s->object_size
 705 *      Padding to reach word boundary. This is also used for Redzoning.
 706 *      Padding is extended by another word if Redzoning is enabled and
 707 *      object_size == inuse.
 708 *
 709 *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 710 *      0xcc (RED_ACTIVE) for objects in use.
 711 *
 712 * object + s->inuse
 713 *      Meta data starts here.
 714 *
 715 *      A. Free pointer (if we cannot overwrite object on free)
 716 *      B. Tracking data for SLAB_STORE_USER
  717 *      C. Padding to reach required alignment boundary or at minimum
 718 *              one word if debugging is on to be able to detect writes
 719 *              before the word boundary.
 720 *
 721 *      Padding is done using 0x5a (POISON_INUSE)
 722 *
 723 * object + s->size
 724 *      Nothing is used beyond s->size.
 725 *
 726 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 727 * ignored. And therefore no slab options that rely on these boundaries
 728 * may be used with merged slabcaches.
 729 */
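
/*
 * Illustrative sketch (not part of the original file): where the regions
 * described above start for a given cache. The arithmetic mirrors what
 * get_track() and check_pad_bytes() compute; the helper itself is
 * hypothetical.
 */
static void __maybe_unused example_layout(struct kmem_cache *s)
{
	unsigned long off;

	/* [0, s->object_size)        the object (POISON_FREE / POISON_END) */
	/* [s->object_size, s->inuse) redzone or alignment padding          */
	off = s->inuse;
	if (s->offset)			/* free pointer kept after the object */
		off += sizeof(void *);
	if (s->flags & SLAB_STORE_USER)	/* alloc + free struct track records */
		off += 2 * sizeof(struct track);
	/* [off, s->size)             POISON_INUSE padding                   */
	WARN_ON(off > s->size);
}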
 730
 731static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 732{
 733        unsigned long off = s->inuse;   /* The end of info */
 734
 735        if (s->offset)
 736                /* Freepointer is placed after the object. */
 737                off += sizeof(void *);
 738
 739        if (s->flags & SLAB_STORE_USER)
 740                /* We also have user information there */
 741                off += 2 * sizeof(struct track);
 742
 743        if (s->size == off)
 744                return 1;
 745
 746        return check_bytes_and_report(s, page, p, "Object padding",
 747                                p + off, POISON_INUSE, s->size - off);
 748}
 749
 750/* Check the pad bytes at the end of a slab page */
 751static int slab_pad_check(struct kmem_cache *s, struct page *page)
 752{
 753        u8 *start;
 754        u8 *fault;
 755        u8 *end;
 756        int length;
 757        int remainder;
 758
 759        if (!(s->flags & SLAB_POISON))
 760                return 1;
 761
 762        start = page_address(page);
 763        length = (PAGE_SIZE << compound_order(page)) - s->reserved;
 764        end = start + length;
 765        remainder = length % s->size;
 766        if (!remainder)
 767                return 1;
 768
 769        fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
 770        if (!fault)
 771                return 1;
 772        while (end > fault && end[-1] == POISON_INUSE)
 773                end--;
 774
 775        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 776        print_section("Padding ", end - remainder, remainder);
 777
 778        restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
 779        return 0;
 780}
 781
 782static int check_object(struct kmem_cache *s, struct page *page,
 783                                        void *object, u8 val)
 784{
 785        u8 *p = object;
 786        u8 *endobject = object + s->object_size;
 787
 788        if (s->flags & SLAB_RED_ZONE) {
 789                if (!check_bytes_and_report(s, page, object, "Redzone",
 790                        endobject, val, s->inuse - s->object_size))
 791                        return 0;
 792        } else {
 793                if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
 794                        check_bytes_and_report(s, page, p, "Alignment padding",
 795                                endobject, POISON_INUSE,
 796                                s->inuse - s->object_size);
 797                }
 798        }
 799
 800        if (s->flags & SLAB_POISON) {
 801                if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
 802                        (!check_bytes_and_report(s, page, p, "Poison", p,
 803                                        POISON_FREE, s->object_size - 1) ||
 804                         !check_bytes_and_report(s, page, p, "Poison",
 805                                p + s->object_size - 1, POISON_END, 1)))
 806                        return 0;
 807                /*
 808                 * check_pad_bytes cleans up on its own.
 809                 */
 810                check_pad_bytes(s, page, p);
 811        }
 812
 813        if (!s->offset && val == SLUB_RED_ACTIVE)
 814                /*
 815                 * Object and freepointer overlap. Cannot check
 816                 * freepointer while object is allocated.
 817                 */
 818                return 1;
 819
 820        /* Check free pointer validity */
 821        if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 822                object_err(s, page, p, "Freepointer corrupt");
 823                /*
 824                 * No choice but to zap it and thus lose the remainder
 825                 * of the free objects in this slab. May cause
 826                 * another error because the object count is now wrong.
 827                 */
 828                set_freepointer(s, p, NULL);
 829                return 0;
 830        }
 831        return 1;
 832}
 833
 834static int check_slab(struct kmem_cache *s, struct page *page)
 835{
 836        int maxobj;
 837
 838        VM_BUG_ON(!irqs_disabled());
 839
 840        if (!PageSlab(page)) {
 841                slab_err(s, page, "Not a valid slab page");
 842                return 0;
 843        }
 844
 845        maxobj = order_objects(compound_order(page), s->size, s->reserved);
 846        if (page->objects > maxobj) {
  847                slab_err(s, page, "objects %u > max %u",
  848                        page->objects, maxobj);
 849                return 0;
 850        }
 851        if (page->inuse > page->objects) {
  852                slab_err(s, page, "inuse %u > max %u",
  853                        page->inuse, page->objects);
 854                return 0;
 855        }
 856        /* Slab_pad_check fixes things up after itself */
 857        slab_pad_check(s, page);
 858        return 1;
 859}
 860
 861/*
 862 * Determine if a certain object on a page is on the freelist. Must hold the
 863 * slab lock to guarantee that the chains are in a consistent state.
 864 */
 865static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 866{
 867        int nr = 0;
 868        void *fp;
 869        void *object = NULL;
 870        unsigned long max_objects;
 871
 872        fp = page->freelist;
 873        while (fp && nr <= page->objects) {
 874                if (fp == search)
 875                        return 1;
 876                if (!check_valid_pointer(s, page, fp)) {
 877                        if (object) {
 878                                object_err(s, page, object,
 879                                        "Freechain corrupt");
 880                                set_freepointer(s, object, NULL);
 881                        } else {
 882                                slab_err(s, page, "Freepointer corrupt");
 883                                page->freelist = NULL;
 884                                page->inuse = page->objects;
 885                                slab_fix(s, "Freelist cleared");
 886                                return 0;
 887                        }
 888                        break;
 889                }
 890                object = fp;
 891                fp = get_freepointer(s, object);
 892                nr++;
 893        }
 894
 895        max_objects = order_objects(compound_order(page), s->size, s->reserved);
 896        if (max_objects > MAX_OBJS_PER_PAGE)
 897                max_objects = MAX_OBJS_PER_PAGE;
 898
 899        if (page->objects != max_objects) {
 900                slab_err(s, page, "Wrong number of objects. Found %d but "
 901                        "should be %d", page->objects, max_objects);
 902                page->objects = max_objects;
 903                slab_fix(s, "Number of objects adjusted.");
 904        }
 905        if (page->inuse != page->objects - nr) {
 906                slab_err(s, page, "Wrong object count. Counter is %d but "
 907                        "counted were %d", page->inuse, page->objects - nr);
 908                page->inuse = page->objects - nr;
 909                slab_fix(s, "Object count adjusted.");
 910        }
 911        return search == NULL;
 912}
 913
 914static void trace(struct kmem_cache *s, struct page *page, void *object,
 915                                                                int alloc)
 916{
 917        if (s->flags & SLAB_TRACE) {
 918                printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 919                        s->name,
 920                        alloc ? "alloc" : "free",
 921                        object, page->inuse,
 922                        page->freelist);
 923
 924                if (!alloc)
 925                        print_section("Object ", (void *)object,
 926                                        s->object_size);
 927
 928                dump_stack();
 929        }
 930}
 931
 932/*
 933 * Hooks for other subsystems that check memory allocations. In a typical
  934 * production configuration these hooks should all produce no code at all.
 935 */
 936static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 937{
 938        flags &= gfp_allowed_mask;
 939        lockdep_trace_alloc(flags);
 940        might_sleep_if(flags & __GFP_WAIT);
 941
 942        return should_failslab(s->object_size, flags, s->flags);
 943}
 944
 945static inline void slab_post_alloc_hook(struct kmem_cache *s,
 946                                        gfp_t flags, void *object)
 947{
 948        flags &= gfp_allowed_mask;
 949        kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 950        kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
 951}
 952
 953static inline void slab_free_hook(struct kmem_cache *s, void *x)
 954{
 955        kmemleak_free_recursive(x, s->flags);
 956
 957        /*
  958         * The trouble is that we may no longer disable interrupts in the
  959         * fast path, so in order to make the debug calls that expect irqs
  960         * to be disabled we need to disable interrupts temporarily.
 961         */
 962#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
 963        {
 964                unsigned long flags;
 965
 966                local_irq_save(flags);
 967                kmemcheck_slab_free(s, x, s->object_size);
 968                debug_check_no_locks_freed(x, s->object_size);
 969                local_irq_restore(flags);
 970        }
 971#endif
 972        if (!(s->flags & SLAB_DEBUG_OBJECTS))
 973                debug_check_no_obj_freed(x, s->object_size);
 974}
 975
 976/*
 977 * Tracking of fully allocated slabs for debugging purposes.
 978 *
 979 * list_lock must be held.
 980 */
 981static void add_full(struct kmem_cache *s,
 982        struct kmem_cache_node *n, struct page *page)
 983{
 984        if (!(s->flags & SLAB_STORE_USER))
 985                return;
 986
 987        list_add(&page->lru, &n->full);
 988}
 989
 990/*
 991 * list_lock must be held.
 992 */
 993static void remove_full(struct kmem_cache *s, struct page *page)
 994{
 995        if (!(s->flags & SLAB_STORE_USER))
 996                return;
 997
 998        list_del(&page->lru);
 999}
1000
1001/* Tracking of the number of slabs for debugging purposes */
1002static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1003{
1004        struct kmem_cache_node *n = get_node(s, node);
1005
1006        return atomic_long_read(&n->nr_slabs);
1007}
1008
1009static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1010{
1011        return atomic_long_read(&n->nr_slabs);
1012}
1013
1014static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1015{
1016        struct kmem_cache_node *n = get_node(s, node);
1017
1018        /*
1019         * May be called early in order to allocate a slab for the
1020         * kmem_cache_node structure. Solve the chicken-egg
1021         * dilemma by deferring the increment of the count during
1022         * bootstrap (see early_kmem_cache_node_alloc).
1023         */
1024        if (likely(n)) {
1025                atomic_long_inc(&n->nr_slabs);
1026                atomic_long_add(objects, &n->total_objects);
1027        }
1028}
1029static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1030{
1031        struct kmem_cache_node *n = get_node(s, node);
1032
1033        atomic_long_dec(&n->nr_slabs);
1034        atomic_long_sub(objects, &n->total_objects);
1035}
1036
1037/* Object debug checks for alloc/free paths */
1038static void setup_object_debug(struct kmem_cache *s, struct page *page,
1039                                                                void *object)
1040{
1041        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1042                return;
1043
1044        init_object(s, object, SLUB_RED_INACTIVE);
1045        init_tracking(s, object);
1046}
1047
1048static noinline int alloc_debug_processing(struct kmem_cache *s,
1049                                        struct page *page,
1050                                        void *object, unsigned long addr)
1051{
1052        if (!check_slab(s, page))
1053                goto bad;
1054
1055        if (!check_valid_pointer(s, page, object)) {
1056                object_err(s, page, object, "Freelist Pointer check fails");
1057                goto bad;
1058        }
1059
1060        if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1061                goto bad;
1062
 1063        /* Success. Perform special debug activities for allocs */
1064        if (s->flags & SLAB_STORE_USER)
1065                set_track(s, object, TRACK_ALLOC, addr);
1066        trace(s, page, object, 1);
1067        init_object(s, object, SLUB_RED_ACTIVE);
1068        return 1;
1069
1070bad:
1071        if (PageSlab(page)) {
1072                /*
1073                 * If this is a slab page then lets do the best we can
1074                 * to avoid issues in the future. Marking all objects
1075                 * as used avoids touching the remaining objects.
1076                 */
1077                slab_fix(s, "Marking all objects used");
1078                page->inuse = page->objects;
1079                page->freelist = NULL;
1080        }
1081        return 0;
1082}
1083
1084static noinline struct kmem_cache_node *free_debug_processing(
1085        struct kmem_cache *s, struct page *page, void *object,
1086        unsigned long addr, unsigned long *flags)
1087{
1088        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1089
1090        spin_lock_irqsave(&n->list_lock, *flags);
1091        slab_lock(page);
1092
1093        if (!check_slab(s, page))
1094                goto fail;
1095
1096        if (!check_valid_pointer(s, page, object)) {
1097                slab_err(s, page, "Invalid object pointer 0x%p", object);
1098                goto fail;
1099        }
1100
1101        if (on_freelist(s, page, object)) {
1102                object_err(s, page, object, "Object already free");
1103                goto fail;
1104        }
1105
1106        if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1107                goto out;
1108
1109        if (unlikely(s != page->slab_cache)) {
1110                if (!PageSlab(page)) {
1111                        slab_err(s, page, "Attempt to free object(0x%p) "
1112                                "outside of slab", object);
1113                } else if (!page->slab_cache) {
1114                        printk(KERN_ERR
1115                                "SLUB <none>: no slab for object 0x%p.\n",
1116                                                object);
1117                        dump_stack();
1118                } else
1119                        object_err(s, page, object,
1120                                        "page slab pointer corrupt.");
1121                goto fail;
1122        }
1123
1124        if (s->flags & SLAB_STORE_USER)
1125                set_track(s, object, TRACK_FREE, addr);
1126        trace(s, page, object, 0);
1127        init_object(s, object, SLUB_RED_INACTIVE);
1128out:
1129        slab_unlock(page);
1130        /*
 1131         * Keep the node's list_lock held to preserve integrity
1132         * until the object is actually freed
1133         */
1134        return n;
1135
1136fail:
1137        slab_unlock(page);
1138        spin_unlock_irqrestore(&n->list_lock, *flags);
1139        slab_fix(s, "Object at 0x%p not freed", object);
1140        return NULL;
1141}
1142
1143static int __init setup_slub_debug(char *str)
1144{
1145        slub_debug = DEBUG_DEFAULT_FLAGS;
1146        if (*str++ != '=' || !*str)
1147                /*
1148                 * No options specified. Switch on full debugging.
1149                 */
1150                goto out;
1151
1152        if (*str == ',')
1153                /*
1154                 * No options but restriction on slabs. This means full
1155                 * debugging for slabs matching a pattern.
1156                 */
1157                goto check_slabs;
1158
1159        if (tolower(*str) == 'o') {
1160                /*
 1161                 * Avoid enabling debugging on caches whose minimum order
1162                 * would increase as a result.
1163                 */
1164                disable_higher_order_debug = 1;
1165                goto out;
1166        }
1167
1168        slub_debug = 0;
1169        if (*str == '-')
1170                /*
1171                 * Switch off all debugging measures.
1172                 */
1173                goto out;
1174
1175        /*
1176         * Determine which debug features should be switched on
1177         */
1178        for (; *str && *str != ','; str++) {
1179                switch (tolower(*str)) {
1180                case 'f':
1181                        slub_debug |= SLAB_DEBUG_FREE;
1182                        break;
1183                case 'z':
1184                        slub_debug |= SLAB_RED_ZONE;
1185                        break;
1186                case 'p':
1187                        slub_debug |= SLAB_POISON;
1188                        break;
1189                case 'u':
1190                        slub_debug |= SLAB_STORE_USER;
1191                        break;
1192                case 't':
1193                        slub_debug |= SLAB_TRACE;
1194                        break;
1195                case 'a':
1196                        slub_debug |= SLAB_FAILSLAB;
1197                        break;
1198                default:
1199                        printk(KERN_ERR "slub_debug option '%c' "
1200                                "unknown. skipped\n", *str);
1201                }
1202        }
1203
1204check_slabs:
1205        if (*str == ',')
1206                slub_debug_slabs = str + 1;
1207out:
1208        return 1;
1209}
1210
1211__setup("slub_debug", setup_slub_debug);
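
/*
 * Illustrative examples (not part of the original file) of command lines
 * accepted by the parser above; see Documentation/vm/slub.txt for the
 * authoritative description:
 *
 *	slub_debug		enable all debug options for all caches
 *	slub_debug=FZP		sanity checks, red zoning and poisoning
 *	slub_debug=,dentry	full debugging, but only for the dentry cache
 *	slub_debug=O		debug on, except for caches whose minimum
 *				order would have to grow to hold the metadata
 *	slub_debug=-		switch all debugging off
 */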
1212
1213static unsigned long kmem_cache_flags(unsigned long object_size,
1214        unsigned long flags, const char *name,
1215        void (*ctor)(void *))
1216{
1217        /*
1218         * Enable debugging if selected on the kernel commandline.
1219         */
1220        if (slub_debug && (!slub_debug_slabs ||
1221                !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1222                flags |= slub_debug;
1223
1224        return flags;
1225}
1226#else
1227static inline void setup_object_debug(struct kmem_cache *s,
1228                        struct page *page, void *object) {}
1229
1230static inline int alloc_debug_processing(struct kmem_cache *s,
1231        struct page *page, void *object, unsigned long addr) { return 0; }
1232
1233static inline struct kmem_cache_node *free_debug_processing(
1234        struct kmem_cache *s, struct page *page, void *object,
1235        unsigned long addr, unsigned long *flags) { return NULL; }
1236
1237static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1238                        { return 1; }
1239static inline int check_object(struct kmem_cache *s, struct page *page,
1240                        void *object, u8 val) { return 1; }
1241static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1242                                        struct page *page) {}
1243static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1244static inline unsigned long kmem_cache_flags(unsigned long object_size,
1245        unsigned long flags, const char *name,
1246        void (*ctor)(void *))
1247{
1248        return flags;
1249}
1250#define slub_debug 0
1251
1252#define disable_higher_order_debug 0
1253
1254static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1255                                                        { return 0; }
1256static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1257                                                        { return 0; }
1258static inline void inc_slabs_node(struct kmem_cache *s, int node,
1259                                                        int objects) {}
1260static inline void dec_slabs_node(struct kmem_cache *s, int node,
1261                                                        int objects) {}
1262
1263static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1264                                                        { return 0; }
1265
1266static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1267                void *object) {}
1268
1269static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1270
1271#endif /* CONFIG_SLUB_DEBUG */
1272
1273/*
1274 * Slab allocation and freeing
1275 */
1276static inline struct page *alloc_slab_page(gfp_t flags, int node,
1277                                        struct kmem_cache_order_objects oo)
1278{
1279        int order = oo_order(oo);
1280
1281        flags |= __GFP_NOTRACK;
1282
1283        if (node == NUMA_NO_NODE)
1284                return alloc_pages(flags, order);
1285        else
1286                return alloc_pages_exact_node(node, flags, order);
1287}
1288
1289static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1290{
1291        struct page *page;
1292        struct kmem_cache_order_objects oo = s->oo;
1293        gfp_t alloc_gfp;
1294
1295        flags &= gfp_allowed_mask;
1296
1297        if (flags & __GFP_WAIT)
1298                local_irq_enable();
1299
1300        flags |= s->allocflags;
1301
1302        /*
1303         * Let the initial higher-order allocation fail under memory pressure
1304         * so we fall-back to the minimum order allocation.
1305         */
1306        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1307
1308        page = alloc_slab_page(alloc_gfp, node, oo);
1309        if (unlikely(!page)) {
1310                oo = s->min;
1311                /*
1312                 * Allocation may have failed due to fragmentation.
1313                 * Try a lower order alloc if possible
1314                 */
1315                page = alloc_slab_page(flags, node, oo);
1316
1317                if (page)
1318                        stat(s, ORDER_FALLBACK);
1319        }
1320
1321        if (kmemcheck_enabled && page
1322                && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1323                int pages = 1 << oo_order(oo);
1324
1325                kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1326
1327                /*
1328                 * Objects from caches that have a constructor don't get
1329                 * cleared when they're allocated, so we need to do it here.
1330                 */
1331                if (s->ctor)
1332                        kmemcheck_mark_uninitialized_pages(page, pages);
1333                else
1334                        kmemcheck_mark_unallocated_pages(page, pages);
1335        }
1336
1337        if (flags & __GFP_WAIT)
1338                local_irq_disable();
1339        if (!page)
1340                return NULL;
1341
1342        page->objects = oo_objects(oo);
1343        mod_zone_page_state(page_zone(page),
1344                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1345                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1346                1 << oo_order(oo));
1347
1348        return page;
1349}
1350
1351static void setup_object(struct kmem_cache *s, struct page *page,
1352                                void *object)
1353{
1354        setup_object_debug(s, page, object);
1355        if (unlikely(s->ctor))
1356                s->ctor(object);
1357}
1358
1359static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1360{
1361        struct page *page;
1362        void *start;
1363        void *last;
1364        void *p;
1365        int order;
1366
1367        BUG_ON(flags & GFP_SLAB_BUG_MASK);
1368
1369        page = allocate_slab(s,
1370                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1371        if (!page)
1372                goto out;
1373
1374        order = compound_order(page);
1375        inc_slabs_node(s, page_to_nid(page), page->objects);
1376        memcg_bind_pages(s, order);
1377        page->slab_cache = s;
1378        __SetPageSlab(page);
1379        if (page->pfmemalloc)
1380                SetPageSlabPfmemalloc(page);
1381
1382        start = page_address(page);
1383
1384        if (unlikely(s->flags & SLAB_POISON))
1385                memset(start, POISON_INUSE, PAGE_SIZE << order);
1386
1387        last = start;
1388        for_each_object(p, s, start, page->objects) {
1389                setup_object(s, page, last);
1390                set_freepointer(s, last, p);
1391                last = p;
1392        }
1393        setup_object(s, page, last);
1394        set_freepointer(s, last, NULL);
1395
1396        page->freelist = start;
1397        page->inuse = page->objects;
1398        page->frozen = 1;
1399out:
1400        return page;
1401}
1402
1403static void __free_slab(struct kmem_cache *s, struct page *page)
1404{
1405        int order = compound_order(page);
1406        int pages = 1 << order;
1407
1408        if (kmem_cache_debug(s)) {
1409                void *p;
1410
1411                slab_pad_check(s, page);
1412                for_each_object(p, s, page_address(page),
1413                                                page->objects)
1414                        check_object(s, page, p, SLUB_RED_INACTIVE);
1415        }
1416
1417        kmemcheck_free_shadow(page, compound_order(page));
1418
1419        mod_zone_page_state(page_zone(page),
1420                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1421                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1422                -pages);
1423
1424        __ClearPageSlabPfmemalloc(page);
1425        __ClearPageSlab(page);
1426
1427        memcg_release_pages(s, order);
1428        page_mapcount_reset(page);
1429        if (current->reclaim_state)
1430                current->reclaim_state->reclaimed_slab += pages;
1431        __free_memcg_kmem_pages(page, order);
1432}
1433
1434#define need_reserve_slab_rcu                                           \
1435        (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1436
1437static void rcu_free_slab(struct rcu_head *h)
1438{
1439        struct page *page;
1440
1441        if (need_reserve_slab_rcu)
1442                page = virt_to_head_page(h);
1443        else
1444                page = container_of((struct list_head *)h, struct page, lru);
1445
1446        __free_slab(page->slab_cache, page);
1447}
1448
1449static void free_slab(struct kmem_cache *s, struct page *page)
1450{
1451        if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1452                struct rcu_head *head;
1453
1454                if (need_reserve_slab_rcu) {
1455                        int order = compound_order(page);
1456                        int offset = (PAGE_SIZE << order) - s->reserved;
1457
1458                        VM_BUG_ON(s->reserved != sizeof(*head));
1459                        head = page_address(page) + offset;
1460                } else {
1461                        /*
1462                         * RCU free overloads the RCU head over the LRU
1463                         */
1464                        head = (void *)&page->lru;
1465                }
1466
1467                call_rcu(head, rcu_free_slab);
1468        } else
1469                __free_slab(s, page);
1470}
1471
1472static void discard_slab(struct kmem_cache *s, struct page *page)
1473{
1474        dec_slabs_node(s, page_to_nid(page), page->objects);
1475        free_slab(s, page);
1476}
1477
1478/*
1479 * Management of partially allocated slabs.
1480 *
1481 * list_lock must be held.
1482 */
1483static inline void add_partial(struct kmem_cache_node *n,
1484                                struct page *page, int tail)
1485{
1486        n->nr_partial++;
1487        if (tail == DEACTIVATE_TO_TAIL)
1488                list_add_tail(&page->lru, &n->partial);
1489        else
1490                list_add(&page->lru, &n->partial);
1491}
1492
1493/*
1494 * list_lock must be held.
1495 */
1496static inline void remove_partial(struct kmem_cache_node *n,
1497                                        struct page *page)
1498{
1499        list_del(&page->lru);
1500        n->nr_partial--;
1501}
1502
1503/*
1504 * Remove slab from the partial list, freeze it and
1505 * return the pointer to the freelist.
1506 *
1507 * Returns a list of objects or NULL if it fails.
1508 *
1509 * Must hold list_lock since we modify the partial list.
1510 */
1511static inline void *acquire_slab(struct kmem_cache *s,
1512                struct kmem_cache_node *n, struct page *page,
1513                int mode, int *objects)
1514{
1515        void *freelist;
1516        unsigned long counters;
1517        struct page new;
1518
1519        /*
1520         * Zap the freelist and set the frozen bit.
1521         * The old freelist is the list of objects for the
1522         * per cpu allocation list.
1523         */
1524        freelist = page->freelist;
1525        counters = page->counters;
1526        new.counters = counters;
1527        *objects = new.objects - new.inuse;
1528        if (mode) {
1529                new.inuse = page->objects;
1530                new.freelist = NULL;
1531        } else {
1532                new.freelist = freelist;
1533        }
1534
1535        VM_BUG_ON(new.frozen);
1536        new.frozen = 1;
1537
1538        if (!__cmpxchg_double_slab(s, page,
1539                        freelist, counters,
1540                        new.freelist, new.counters,
1541                        "acquire_slab"))
1542                return NULL;
1543
1544        remove_partial(n, page);
1545        WARN_ON(!freelist);
1546        return freelist;
1547}
1548
1549static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1550static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
1551
1552/*
1553 * Try to allocate a partial slab from a specific node.
1554 */
1555static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1556                                struct kmem_cache_cpu *c, gfp_t flags)
1557{
1558        struct page *page, *page2;
1559        void *object = NULL;
1560        int available = 0;
1561        int objects;
1562
1563        /*
1564         * Racy check. If we mistakenly see no partial slabs then we
1565         * just allocate an empty slab. If we mistakenly try to get a
 1566         * partial slab and there is none available then get_partial_node()
1567         * will return NULL.
1568         */
1569        if (!n || !n->nr_partial)
1570                return NULL;
1571
1572        spin_lock(&n->list_lock);
1573        list_for_each_entry_safe(page, page2, &n->partial, lru) {
1574                void *t;
1575
1576                if (!pfmemalloc_match(page, flags))
1577                        continue;
1578
1579                t = acquire_slab(s, n, page, object == NULL, &objects);
1580                if (!t)
1581                        break;
1582
1583                available += objects;
1584                if (!object) {
1585                        c->page = page;
1586                        stat(s, ALLOC_FROM_PARTIAL);
1587                        object = t;
1588                } else {
1589                        put_cpu_partial(s, page, 0);
1590                        stat(s, CPU_PARTIAL_NODE);
1591                }
1592                if (!kmem_cache_has_cpu_partial(s)
1593                        || available > s->cpu_partial / 2)
1594                        break;
1595
1596        }
1597        spin_unlock(&n->list_lock);
1598        return object;
1599}
1600
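/*
 * Illustration of the fill heuristic above, assuming the default
 * s->cpu_partial of 30 used for small objects: the first partial slab
 * acquired becomes the new cpu slab, further partial slabs are parked on
 * the per cpu partial list, and the scan stops once more than 15 free
 * objects (cpu_partial / 2) have been collected, leaving the remaining
 * partial slabs on the node for other cpus and for frees.
 */
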
1601/*
1602 * Get a page from somewhere. Search in increasing NUMA distances.
1603 */
1604static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1605                struct kmem_cache_cpu *c)
1606{
1607#ifdef CONFIG_NUMA
1608        struct zonelist *zonelist;
1609        struct zoneref *z;
1610        struct zone *zone;
1611        enum zone_type high_zoneidx = gfp_zone(flags);
1612        void *object;
1613        unsigned int cpuset_mems_cookie;
1614
1615        /*
1616         * The defrag ratio allows a configuration of the tradeoffs between
1617         * inter node defragmentation and node local allocations. A lower
1618         * defrag_ratio increases the tendency to do local allocations
1619         * instead of attempting to obtain partial slabs from other nodes.
1620         *
1621         * If the defrag_ratio is set to 0 then kmalloc() always
1622         * returns node local objects. If the ratio is higher then kmalloc()
1623         * may return off node objects because partial slabs are obtained
1624         * from other nodes and filled up.
1625         *
1626         * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1627         * defrag_ratio = 1000) then every (well almost) allocation will
1628         * first attempt to defrag slab caches on other nodes. This means
1629         * scanning over all nodes to look for partial slabs which may be
1630         * expensive if we do it every time we are trying to find a slab
1631         * with available objects.
1632         */
1633        if (!s->remote_node_defrag_ratio ||
1634                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
1635                return NULL;
1636
1637        do {
1638                cpuset_mems_cookie = get_mems_allowed();
1639                zonelist = node_zonelist(slab_node(), flags);
1640                for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1641                        struct kmem_cache_node *n;
1642
1643                        n = get_node(s, zone_to_nid(zone));
1644
1645                        if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1646                                        n->nr_partial > s->min_partial) {
1647                                object = get_partial_node(s, n, c, flags);
1648                                if (object) {
1649                                        /*
1650                                         * Return the object even if
1651                                         * put_mems_allowed indicated that
1652                                         * the cpuset mems_allowed was
1653                                         * updated in parallel. It's a
1654                                         * harmless race between the alloc
1655                                         * and the cpuset update.
1656                                         */
1657                                        put_mems_allowed(cpuset_mems_cookie);
1658                                        return object;
1659                                }
1660                        }
1661                }
1662        } while (!put_mems_allowed(cpuset_mems_cookie));
1663#endif
1664        return NULL;
1665}
1666
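/*
 * Rough numbers for the remote_node_defrag_ratio throttle above, assuming
 * the default ratio of 1000 (sysfs value 100): get_cycles() % 1024 only
 * rarely exceeds 1000, so nearly every call goes on to scan remote nodes.
 * With a ratio of 100 (sysfs value 10) only about one call in ten gets
 * past the check; the rest return NULL and fall back to a new local slab.
 */
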
1667/*
1668 * Get a partial page, lock it and return it.
1669 */
1670static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1671                struct kmem_cache_cpu *c)
1672{
1673        void *object;
1674        int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1675
1676        object = get_partial_node(s, get_node(s, searchnode), c, flags);
1677        if (object || node != NUMA_NO_NODE)
1678                return object;
1679
1680        return get_any_partial(s, flags, c);
1681}
1682
1683#ifdef CONFIG_PREEMPT
1684/*
1685 * Calculate the next globally unique transaction for disambiguation
1686 * during cmpxchg. The transactions start with the cpu number and are then
1687 * incremented by TID_STEP (CONFIG_NR_CPUS rounded up to a power of two).
1688 */
1689#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
1690#else
1691/*
1692 * No preemption supported therefore also no need to check for
1693 * different cpus.
1694 */
1695#define TID_STEP 1
1696#endif
1697
1698static inline unsigned long next_tid(unsigned long tid)
1699{
1700        return tid + TID_STEP;
1701}
1702
1703static inline unsigned int tid_to_cpu(unsigned long tid)
1704{
1705        return tid % TID_STEP;
1706}
1707
1708static inline unsigned long tid_to_event(unsigned long tid)
1709{
1710        return tid / TID_STEP;
1711}
1712
1713static inline unsigned int init_tid(int cpu)
1714{
1715        return cpu;
1716}
1717
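/*
 * Worked example of the tid encoding, assuming CONFIG_PREEMPT and
 * CONFIG_NR_CPUS = 4 so that TID_STEP = 4: cpu 2 starts out with tid 2
 * (init_tid); the first successful operation advances it to 6 (next_tid);
 * tid_to_cpu(6) recovers cpu 2 and tid_to_event(6) yields 1, i.e. one
 * completed operation on that cpu's freelist.
 */
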
1718static inline void note_cmpxchg_failure(const char *n,
1719                const struct kmem_cache *s, unsigned long tid)
1720{
1721#ifdef SLUB_DEBUG_CMPXCHG
1722        unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1723
1724        printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1725
1726#ifdef CONFIG_PREEMPT
1727        if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1728                printk("due to cpu change %d -> %d\n",
1729                        tid_to_cpu(tid), tid_to_cpu(actual_tid));
1730        else
1731#endif
1732        if (tid_to_event(tid) != tid_to_event(actual_tid))
1733                printk("due to cpu running other code. Event %ld->%ld\n",
1734                        tid_to_event(tid), tid_to_event(actual_tid));
1735        else
1736                printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1737                        actual_tid, tid, next_tid(tid));
1738#endif
1739        stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1740}
1741
1742static void init_kmem_cache_cpus(struct kmem_cache *s)
1743{
1744        int cpu;
1745
1746        for_each_possible_cpu(cpu)
1747                per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1748}
1749
1750/*
1751 * Remove the cpu slab
1752 */
1753static void deactivate_slab(struct kmem_cache *s, struct page *page,
1754                                void *freelist)
1755{
1756        enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1757        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1758        int lock = 0;
1759        enum slab_modes l = M_NONE, m = M_NONE;
1760        void *nextfree;
1761        int tail = DEACTIVATE_TO_HEAD;
1762        struct page new;
1763        struct page old;
1764
1765        if (page->freelist) {
1766                stat(s, DEACTIVATE_REMOTE_FREES);
1767                tail = DEACTIVATE_TO_TAIL;
1768        }
1769
1770        /*
1771         * Stage one: Free all available per cpu objects back
1772         * to the page freelist while it is still frozen. Leave the
1773         * last one.
1774         *
1775         * There is no need to take the list_lock because the page
1776         * is still frozen.
1777         */
1778        while (freelist && (nextfree = get_freepointer(s, freelist))) {
1779                void *prior;
1780                unsigned long counters;
1781
1782                do {
1783                        prior = page->freelist;
1784                        counters = page->counters;
1785                        set_freepointer(s, freelist, prior);
1786                        new.counters = counters;
1787                        new.inuse--;
1788                        VM_BUG_ON(!new.frozen);
1789
1790                } while (!__cmpxchg_double_slab(s, page,
1791                        prior, counters,
1792                        freelist, new.counters,
1793                        "drain percpu freelist"));
1794
1795                freelist = nextfree;
1796        }
1797
1798        /*
1799         * Stage two: Ensure that the page is unfrozen while the
1800         * list presence reflects the actual number of objects
1801         * during unfreeze.
1802         *
1803         * We setup the list membership and then perform a cmpxchg
1804         * with the count. If there is a mismatch then the page
1805         * is not unfrozen but the page is on the wrong list.
1806         *
1807         * Then we restart the process which may have to remove
1808         * the page from the list that we just put it on again
1809         * because the number of objects in the slab may have
1810         * changed.
1811         */
1812redo:
1813
1814        old.freelist = page->freelist;
1815        old.counters = page->counters;
1816        VM_BUG_ON(!old.frozen);
1817
1818        /* Determine target state of the slab */
1819        new.counters = old.counters;
1820        if (freelist) {
1821                new.inuse--;
1822                set_freepointer(s, freelist, old.freelist);
1823                new.freelist = freelist;
1824        } else
1825                new.freelist = old.freelist;
1826
1827        new.frozen = 0;
1828
1829        if (!new.inuse && n->nr_partial > s->min_partial)
1830                m = M_FREE;
1831        else if (new.freelist) {
1832                m = M_PARTIAL;
1833                if (!lock) {
1834                        lock = 1;
1835                        /*
1836                         * Taking the spinlock removes the possibility
1837                         * that acquire_slab() will see a slab page that
1838                         * is frozen
1839                         */
1840                        spin_lock(&n->list_lock);
1841                }
1842        } else {
1843                m = M_FULL;
1844                if (kmem_cache_debug(s) && !lock) {
1845                        lock = 1;
1846                        /*
1847                         * This also ensures that the scanning of full
1848                         * slabs from diagnostic functions will not see
1849                         * any frozen slabs.
1850                         */
1851                        spin_lock(&n->list_lock);
1852                }
1853        }
1854
1855        if (l != m) {
1856
1857                if (l == M_PARTIAL)
1858
1859                        remove_partial(n, page);
1860
1861                else if (l == M_FULL)
1862
1863                        remove_full(s, page);
1864
1865                if (m == M_PARTIAL) {
1866
1867                        add_partial(n, page, tail);
1868                        stat(s, tail);
1869
1870                } else if (m == M_FULL) {
1871
1872                        stat(s, DEACTIVATE_FULL);
1873                        add_full(s, n, page);
1874
1875                }
1876        }
1877
1878        l = m;
1879        if (!__cmpxchg_double_slab(s, page,
1880                                old.freelist, old.counters,
1881                                new.freelist, new.counters,
1882                                "unfreezing slab"))
1883                goto redo;
1884
1885        if (lock)
1886                spin_unlock(&n->list_lock);
1887
1888        if (m == M_FREE) {
1889                stat(s, DEACTIVATE_EMPTY);
1890                discard_slab(s, page);
1891                stat(s, FREE_SLAB);
1892        }
1893}
1894
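/*
 * Possible outcomes of the unfreeze above, summarised: a slab that still
 * has free objects ends up M_PARTIAL on the node partial list (queued at
 * the tail if remote cpus freed into it, at the head otherwise); a slab
 * that became completely free is discarded (M_FREE) when the node already
 * holds more than min_partial partial slabs; a fully allocated slab goes
 * to M_FULL and is only tracked on the full list when debugging is on.
 */
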
1895/*
1896 * Unfreeze all the cpu partial slabs.
1897 *
1898 * This function must be called with interrupts disabled
1899 * for the cpu using c (or some other mechanism must be in place
1900 * to guarantee that there are no concurrent accesses).
1901 */
1902static void unfreeze_partials(struct kmem_cache *s,
1903                struct kmem_cache_cpu *c)
1904{
1905#ifdef CONFIG_SLUB_CPU_PARTIAL
1906        struct kmem_cache_node *n = NULL, *n2 = NULL;
1907        struct page *page, *discard_page = NULL;
1908
1909        while ((page = c->partial)) {
1910                struct page new;
1911                struct page old;
1912
1913                c->partial = page->next;
1914
1915                n2 = get_node(s, page_to_nid(page));
1916                if (n != n2) {
1917                        if (n)
1918                                spin_unlock(&n->list_lock);
1919
1920                        n = n2;
1921                        spin_lock(&n->list_lock);
1922                }
1923
1924                do {
1925
1926                        old.freelist = page->freelist;
1927                        old.counters = page->counters;
1928                        VM_BUG_ON(!old.frozen);
1929
1930                        new.counters = old.counters;
1931                        new.freelist = old.freelist;
1932
1933                        new.frozen = 0;
1934
1935                } while (!__cmpxchg_double_slab(s, page,
1936                                old.freelist, old.counters,
1937                                new.freelist, new.counters,
1938                                "unfreezing slab"));
1939
1940                if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1941                        page->next = discard_page;
1942                        discard_page = page;
1943                } else {
1944                        add_partial(n, page, DEACTIVATE_TO_TAIL);
1945                        stat(s, FREE_ADD_PARTIAL);
1946                }
1947        }
1948
1949        if (n)
1950                spin_unlock(&n->list_lock);
1951
1952        while (discard_page) {
1953                page = discard_page;
1954                discard_page = discard_page->next;
1955
1956                stat(s, DEACTIVATE_EMPTY);
1957                discard_slab(s, page);
1958                stat(s, FREE_SLAB);
1959        }
1960#endif
1961}
1962
1963/*
1964 * Put a page that was just frozen (in __slab_free) into a partial page
1965 * slot if available. This is done without disabling interrupts or
1966 * preemption. The cmpxchg is racy and may put the partial page
1967 * onto a random cpu's partial slot.
1968 *
1969 * If we did not find a slot then simply move all the partials to the
1970 * per node partial list.
1971 */
1972static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1973{
1974#ifdef CONFIG_SLUB_CPU_PARTIAL
1975        struct page *oldpage;
1976        int pages;
1977        int pobjects;
1978
1979        do {
1980                pages = 0;
1981                pobjects = 0;
1982                oldpage = this_cpu_read(s->cpu_slab->partial);
1983
1984                if (oldpage) {
1985                        pobjects = oldpage->pobjects;
1986                        pages = oldpage->pages;
1987                        if (drain && pobjects > s->cpu_partial) {
1988                                unsigned long flags;
1989                                /*
1990                                 * partial array is full. Move the existing
1991                                 * set to the per node partial list.
1992                                 */
1993                                local_irq_save(flags);
1994                                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
1995                                local_irq_restore(flags);
1996                                oldpage = NULL;
1997                                pobjects = 0;
1998                                pages = 0;
1999                                stat(s, CPU_PARTIAL_DRAIN);
2000                        }
2001                }
2002
2003                pages++;
2004                pobjects += page->objects - page->inuse;
2005
2006                page->pages = pages;
2007                page->pobjects = pobjects;
2008                page->next = oldpage;
2009
2010        } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2011                                                                != oldpage);
2012#endif
2013}
2014
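/*
 * The per cpu partial list built above is a singly linked chain through
 * page->next; the page at the head also caches the running totals in
 * page->pages and page->pobjects so the drain check does not need to walk
 * the chain. For instance, parking two slabs with 7 and then 5 free
 * objects leaves a head page reporting pages = 2 and pobjects = 12.
 */
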
2015static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2016{
2017        stat(s, CPUSLAB_FLUSH);
2018        deactivate_slab(s, c->page, c->freelist);
2019
2020        c->tid = next_tid(c->tid);
2021        c->page = NULL;
2022        c->freelist = NULL;
2023}
2024
2025/*
2026 * Flush cpu slab.
2027 *
2028 * Called from IPI handler with interrupts disabled.
2029 */
2030static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2031{
2032        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2033
2034        if (likely(c)) {
2035                if (c->page)
2036                        flush_slab(s, c);
2037
2038                unfreeze_partials(s, c);
2039        }
2040}
2041
2042static void flush_cpu_slab(void *d)
2043{
2044        struct kmem_cache *s = d;
2045
2046        __flush_cpu_slab(s, smp_processor_id());
2047}
2048
2049static bool has_cpu_slab(int cpu, void *info)
2050{
2051        struct kmem_cache *s = info;
2052        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2053
2054        return c->page || c->partial;
2055}
2056
2057static void flush_all(struct kmem_cache *s)
2058{
2059        on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2060}
2061
2062/*
2063 * Check if the objects in a per cpu structure fit numa
2064 * locality expectations.
2065 */
2066static inline int node_match(struct page *page, int node)
2067{
2068#ifdef CONFIG_NUMA
2069        if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
2070                return 0;
2071#endif
2072        return 1;
2073}
2074
2075static int count_free(struct page *page)
2076{
2077        return page->objects - page->inuse;
2078}
2079
2080static unsigned long count_partial(struct kmem_cache_node *n,
2081                                        int (*get_count)(struct page *))
2082{
2083        unsigned long flags;
2084        unsigned long x = 0;
2085        struct page *page;
2086
2087        spin_lock_irqsave(&n->list_lock, flags);
2088        list_for_each_entry(page, &n->partial, lru)
2089                x += get_count(page);
2090        spin_unlock_irqrestore(&n->list_lock, flags);
2091        return x;
2092}
2093
2094static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2095{
2096#ifdef CONFIG_SLUB_DEBUG
2097        return atomic_long_read(&n->total_objects);
2098#else
2099        return 0;
2100#endif
2101}
2102
2103static noinline void
2104slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2105{
2106        int node;
2107
2108        printk(KERN_WARNING
2109                "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2110                nid, gfpflags);
2111        printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, "
2112                "default order: %d, min order: %d\n", s->name, s->object_size,
2113                s->size, oo_order(s->oo), oo_order(s->min));
2114
2115        if (oo_order(s->min) > get_order(s->object_size))
2116                printk(KERN_WARNING "  %s debugging increased min order, use "
2117                       "slub_debug=O to disable.\n", s->name);
2118
2119        for_each_online_node(node) {
2120                struct kmem_cache_node *n = get_node(s, node);
2121                unsigned long nr_slabs;
2122                unsigned long nr_objs;
2123                unsigned long nr_free;
2124
2125                if (!n)
2126                        continue;
2127
2128                nr_free  = count_partial(n, count_free);
2129                nr_slabs = node_nr_slabs(n);
2130                nr_objs  = node_nr_objs(n);
2131
2132                printk(KERN_WARNING
2133                        "  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2134                        node, nr_slabs, nr_objs, nr_free);
2135        }
2136}
2137
2138static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2139                        int node, struct kmem_cache_cpu **pc)
2140{
2141        void *freelist;
2142        struct kmem_cache_cpu *c = *pc;
2143        struct page *page;
2144
2145        freelist = get_partial(s, flags, node, c);
2146
2147        if (freelist)
2148                return freelist;
2149
2150        page = new_slab(s, flags, node);
2151        if (page) {
2152                c = __this_cpu_ptr(s->cpu_slab);
2153                if (c->page)
2154                        flush_slab(s, c);
2155
2156                /*
2157                 * No other reference to the page yet so we can
2158                 * muck around with it freely without cmpxchg
2159                 */
2160                freelist = page->freelist;
2161                page->freelist = NULL;
2162
2163                stat(s, ALLOC_SLAB);
2164                c->page = page;
2165                *pc = c;
2166        } else
2167                freelist = NULL;
2168
2169        return freelist;
2170}
2171
2172static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2173{
2174        if (unlikely(PageSlabPfmemalloc(page)))
2175                return gfp_pfmemalloc_allowed(gfpflags);
2176
2177        return true;
2178}
2179
2180/*
2181 * Check the page->freelist of a page and either transfer the freelist to the
2182 * per cpu freelist or deactivate the page.
2183 *
2184 * The page is still frozen if the return value is not NULL.
2185 *
2186 * If this function returns NULL then the page has been unfrozen.
2187 *
2188 * This function must be called with interrupts disabled.
2189 */
2190static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2191{
2192        struct page new;
2193        unsigned long counters;
2194        void *freelist;
2195
2196        do {
2197                freelist = page->freelist;
2198                counters = page->counters;
2199
2200                new.counters = counters;
2201                VM_BUG_ON(!new.frozen);
2202
2203                new.inuse = page->objects;
2204                new.frozen = freelist != NULL;
2205
2206        } while (!__cmpxchg_double_slab(s, page,
2207                freelist, counters,
2208                NULL, new.counters,
2209                "get_freelist"));
2210
2211        return freelist;
2212}
2213
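/*
 * Put differently: the cmpxchg above atomically hands the entire remaining
 * page->freelist over to the per cpu allocator and marks every object in
 * use; the frozen bit survives only if there was something to take. A
 * frozen slab with, say, five free objects stays frozen and yields all
 * five, while one with an empty freelist is unfrozen and the caller has
 * to look for a new slab.
 */
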
2214/*
2215 * Slow path. The lockless freelist is empty or we need to perform
2216 * debugging duties.
2217 *
2218 * Processing is still very fast if new objects have been freed to the
2219 * regular freelist. In that case we simply take over the regular freelist
2220 * as the lockless freelist and zap the regular freelist.
2221 *
2222 * If that is not working then we fall back to the partial lists. We take the
2223 * first element of the freelist as the object to allocate now and move the
2224 * rest of the freelist to the lockless freelist.
2225 *
2226 * And if we were unable to get a new slab from the partial slab lists then
2227 * we need to allocate a new slab. This is the slowest path since it involves
2228 * a call to the page allocator and the setup of a new slab.
2229 */
2230static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2231                          unsigned long addr, struct kmem_cache_cpu *c)
2232{
2233        void *freelist;
2234        struct page *page;
2235        unsigned long flags;
2236
2237        local_irq_save(flags);
2238#ifdef CONFIG_PREEMPT
2239        /*
2240         * We may have been preempted and rescheduled on a different
2241         * cpu before disabling interrupts. Need to reload cpu area
2242         * pointer.
2243         */
2244        c = this_cpu_ptr(s->cpu_slab);
2245#endif
2246
2247        page = c->page;
2248        if (!page)
2249                goto new_slab;
2250redo:
2251
2252        if (unlikely(!node_match(page, node))) {
2253                stat(s, ALLOC_NODE_MISMATCH);
2254                deactivate_slab(s, page, c->freelist);
2255                c->page = NULL;
2256                c->freelist = NULL;
2257                goto new_slab;
2258        }
2259
2260        /*
2261         * By rights, we should be searching for a slab page that was
2262         * PFMEMALLOC but right now, we are losing the pfmemalloc
2263         * information when the page leaves the per-cpu allocator
2264         */
2265        if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2266                deactivate_slab(s, page, c->freelist);
2267                c->page = NULL;
2268                c->freelist = NULL;
2269                goto new_slab;
2270        }
2271
2272        /* must check again c->freelist in case of cpu migration or IRQ */
2273        freelist = c->freelist;
2274        if (freelist)
2275                goto load_freelist;
2276
2277        stat(s, ALLOC_SLOWPATH);
2278
2279        freelist = get_freelist(s, page);
2280
2281        if (!freelist) {
2282                c->page = NULL;
2283                stat(s, DEACTIVATE_BYPASS);
2284                goto new_slab;
2285        }
2286
2287        stat(s, ALLOC_REFILL);
2288
2289load_freelist:
2290        /*
2291         * freelist is pointing to the list of objects to be used.
2292         * page is pointing to the page from which the objects are obtained.
2293         * That page must be frozen for per cpu allocations to work.
2294         */
2295        VM_BUG_ON(!c->page->frozen);
2296        c->freelist = get_freepointer(s, freelist);
2297        c->tid = next_tid(c->tid);
2298        local_irq_restore(flags);
2299        return freelist;
2300
2301new_slab:
2302
2303        if (c->partial) {
2304                page = c->page = c->partial;
2305                c->partial = page->next;
2306                stat(s, CPU_PARTIAL_ALLOC);
2307                c->freelist = NULL;
2308                goto redo;
2309        }
2310
2311        freelist = new_slab_objects(s, gfpflags, node, &c);
2312
2313        if (unlikely(!freelist)) {
2314                if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2315                        slab_out_of_memory(s, gfpflags, node);
2316
2317                local_irq_restore(flags);
2318                return NULL;
2319        }
2320
2321        page = c->page;
2322        if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2323                goto load_freelist;
2324
2325        /* Only entered in the debug case */
2326        if (kmem_cache_debug(s) &&
2327                        !alloc_debug_processing(s, page, freelist, addr))
2328                goto new_slab;  /* Slab failed checks. Next slab needed */
2329
2330        deactivate_slab(s, page, get_freepointer(s, freelist));
2331        c->page = NULL;
2332        c->freelist = NULL;
2333        local_irq_restore(flags);
2334        return freelist;
2335}
2336
2337/*
2338 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
2339 * have the fastpath folded into their functions. So no function call
2340 * overhead for requests that can be satisfied on the fastpath.
2341 *
2342 * The fastpath works by first checking if the lockless freelist can be used.
2343 * If not then __slab_alloc is called for slow processing.
2344 *
2345 * Otherwise we can simply pick the next object from the lockless free list.
2346 */
2347static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2348                gfp_t gfpflags, int node, unsigned long addr)
2349{
2350        void **object;
2351        struct kmem_cache_cpu *c;
2352        struct page *page;
2353        unsigned long tid;
2354
2355        if (slab_pre_alloc_hook(s, gfpflags))
2356                return NULL;
2357
2358        s = memcg_kmem_get_cache(s, gfpflags);
2359redo:
2360        /*
2361         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2362         * enabled. We may switch back and forth between cpus while
2363         * reading from one cpu area. That does not matter as long
2364         * as we end up on the original cpu again when doing the cmpxchg.
2365         *
2366         * Preemption is disabled for the retrieval of the tid because that
2367         * must occur from the current processor. We cannot allow rescheduling
2368         * on a different processor between the determination of the pointer
2369         * and the retrieval of the tid.
2370         */
2371        preempt_disable();
2372        c = __this_cpu_ptr(s->cpu_slab);
2373
2374        /*
2375         * The transaction ids are globally unique per cpu and per operation on
2376         * a per cpu queue. Thus we can guarantee that the cmpxchg_double
2377         * occurs on the right processor and that there was no operation on the
2378         * linked list in between.
2379         */
2380        tid = c->tid;
2381        preempt_enable();
2382
2383        object = c->freelist;
2384        page = c->page;
2385        if (unlikely(!object || !node_match(page, node)))
2386                object = __slab_alloc(s, gfpflags, node, addr, c);
2387
2388        else {
2389                void *next_object = get_freepointer_safe(s, object);
2390
2391                /*
2392                 * The cmpxchg will only match if there was no additional
2393                 * operation and if we are on the right processor.
2394                 *
2395                 * The cmpxchg does the following atomically (without lock
2396                 * semantics!)
2397                 * 1. Relocate first pointer to the current per cpu area.
2398                 * 2. Verify that tid and freelist have not been changed
2399                 * 3. If they were not changed replace tid and freelist
2400                 *
2401                 * Since this is without lock semantics the protection is only
2402                 * against code executing on this cpu *not* from access by
2403                 * other cpus.
2404                 */
2405                if (unlikely(!this_cpu_cmpxchg_double(
2406                                s->cpu_slab->freelist, s->cpu_slab->tid,
2407                                object, tid,
2408                                next_object, next_tid(tid)))) {
2409
2410                        note_cmpxchg_failure("slab_alloc", s, tid);
2411                        goto redo;
2412                }
2413                prefetch_freepointer(s, next_object);
2414                stat(s, ALLOC_FASTPATH);
2415        }
2416
2417        if (unlikely(gfpflags & __GFP_ZERO) && object)
2418                memset(object, 0, s->object_size);
2419
2420        slab_post_alloc_hook(s, gfpflags, object);
2421
2422        return object;
2423}
2424
2425static __always_inline void *slab_alloc(struct kmem_cache *s,
2426                gfp_t gfpflags, unsigned long addr)
2427{
2428        return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2429}
2430
2431void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2432{
2433        void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2434
2435        trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
2436                                s->size, gfpflags);
2437
2438        return ret;
2439}
2440EXPORT_SYMBOL(kmem_cache_alloc);
2441
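/*
 * Illustrative use from a caller's perspective (the cache and structure
 * below are made up): a cache is set up once and objects then flow
 * through the fastpath above.
 *
 *	struct foo { int a; struct list_head list; };
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL);
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 */
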
2442#ifdef CONFIG_TRACING
2443void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2444{
2445        void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2446        trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2447        return ret;
2448}
2449EXPORT_SYMBOL(kmem_cache_alloc_trace);
2450#endif
2451
2452#ifdef CONFIG_NUMA
2453void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2454{
2455        void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2456
2457        trace_kmem_cache_alloc_node(_RET_IP_, ret,
2458                                    s->object_size, s->size, gfpflags, node);
2459
2460        return ret;
2461}
2462EXPORT_SYMBOL(kmem_cache_alloc_node);
2463
2464#ifdef CONFIG_TRACING
2465void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2466                                    gfp_t gfpflags,
2467                                    int node, size_t size)
2468{
2469        void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2470
2471        trace_kmalloc_node(_RET_IP_, ret,
2472                           size, s->size, gfpflags, node);
2473        return ret;
2474}
2475EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2476#endif
2477#endif
2478
2479/*
2480 * Slow path handling. This may still be called frequently since objects
2481 * have a longer lifetime than the cpu slabs in most processing loads.
2482 *
2483 * So we still attempt to reduce cache line usage. Just take the slab
2484 * lock and free the item. If there is no additional partial page
2485 * handling required then we can return immediately.
2486 */
2487static void __slab_free(struct kmem_cache *s, struct page *page,
2488                        void *x, unsigned long addr)
2489{
2490        void *prior;
2491        void **object = (void *)x;
2492        int was_frozen;
2493        struct page new;
2494        unsigned long counters;
2495        struct kmem_cache_node *n = NULL;
2496        unsigned long uninitialized_var(flags);
2497
2498        stat(s, FREE_SLOWPATH);
2499
2500        if (kmem_cache_debug(s) &&
2501                !(n = free_debug_processing(s, page, x, addr, &flags)))
2502                return;
2503
2504        do {
2505                if (unlikely(n)) {
2506                        spin_unlock_irqrestore(&n->list_lock, flags);
2507                        n = NULL;
2508                }
2509                prior = page->freelist;
2510                counters = page->counters;
2511                set_freepointer(s, object, prior);
2512                new.counters = counters;
2513                was_frozen = new.frozen;
2514                new.inuse--;
2515                if ((!new.inuse || !prior) && !was_frozen) {
2516
2517                        if (kmem_cache_has_cpu_partial(s) && !prior)
2518
2519                                /*
2520                                 * Slab was on no list before and will be
2521                                 * partially empty.
2522                                 * We can defer the list move and instead
2523                                 * freeze it.
2524                                 */
2525                                new.frozen = 1;
2526
2527                        else { /* Needs to be taken off a list */
2528
2529                                n = get_node(s, page_to_nid(page));
2530                                /*
2531                                 * Speculatively acquire the list_lock.
2532                                 * If the cmpxchg does not succeed then we may
2533                                 * drop the list_lock without any processing.
2534                                 *
2535                                 * Otherwise the list_lock will synchronize with
2536                                 * other processors updating the list of slabs.
2537                                 */
2538                                spin_lock_irqsave(&n->list_lock, flags);
2539
2540                        }
2541                }
2542
2543        } while (!cmpxchg_double_slab(s, page,
2544                prior, counters,
2545                object, new.counters,
2546                "__slab_free"));
2547
2548        if (likely(!n)) {
2549
2550                /*
2551                 * If we just froze the page then put it onto the
2552                 * per cpu partial list.
2553                 */
2554                if (new.frozen && !was_frozen) {
2555                        put_cpu_partial(s, page, 1);
2556                        stat(s, CPU_PARTIAL_FREE);
2557                }
2558                /*
2559                 * The list lock was not taken therefore no list
2560                 * activity can be necessary.
2561                 */
2562                if (was_frozen)
2563                        stat(s, FREE_FROZEN);
2564                return;
2565        }
2566
2567        if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2568                goto slab_empty;
2569
2570        /*
2571         * Objects left in the slab. If it was not on the partial list before
2572         * then add it.
2573         */
2574        if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2575                if (kmem_cache_debug(s))
2576                        remove_full(s, page);
2577                add_partial(n, page, DEACTIVATE_TO_TAIL);
2578                stat(s, FREE_ADD_PARTIAL);
2579        }
2580        spin_unlock_irqrestore(&n->list_lock, flags);
2581        return;
2582
2583slab_empty:
2584        if (prior) {
2585                /*
2586                 * Slab on the partial list.
2587                 */
2588                remove_partial(n, page);
2589                stat(s, FREE_REMOVE_PARTIAL);
2590        } else
2591                /* Slab must be on the full list */
2592                remove_full(s, page);
2593
2594        spin_unlock_irqrestore(&n->list_lock, flags);
2595        stat(s, FREE_SLAB);
2596        discard_slab(s, page);
2597}
2598
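/*
 * Typical outcomes of the slowpath above: freeing into a previously full
 * slab freezes it and parks it on a per cpu partial list when cpu partials
 * are enabled (otherwise it is added to the node partial list); freeing
 * the last object of an unfrozen slab discards the slab once the node
 * already holds more than min_partial partial slabs; every other free
 * simply links the object back into page->freelist via the cmpxchg.
 */
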
2599/*
2600 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2601 * can perform fastpath freeing without additional function calls.
2602 *
2603 * The fastpath is only possible if we are freeing to the current cpu slab
2604 * of this processor. This is typically the case if we have just allocated
2605 * the item before.
2606 *
2607 * If fastpath is not possible then fall back to __slab_free where we deal
2608 * with all sorts of special processing.
2609 */
2610static __always_inline void slab_free(struct kmem_cache *s,
2611                        struct page *page, void *x, unsigned long addr)
2612{
2613        void **object = (void *)x;
2614        struct kmem_cache_cpu *c;
2615        unsigned long tid;
2616
2617        slab_free_hook(s, x);
2618
2619redo:
2620        /*
2621         * Determine the current cpu's per cpu slab.
2622         * The cpu may change afterward. However that does not matter since
2623         * data is retrieved via this pointer. If we are on the same cpu
2624         * during the cmpxchg then the free will succeed.
2625         */
2626        preempt_disable();
2627        c = __this_cpu_ptr(s->cpu_slab);
2628
2629        tid = c->tid;
2630        preempt_enable();
2631
2632        if (likely(page == c->page)) {
2633                set_freepointer(s, object, c->freelist);
2634
2635                if (unlikely(!this_cpu_cmpxchg_double(
2636                                s->cpu_slab->freelist, s->cpu_slab->tid,
2637                                c->freelist, tid,
2638                                object, next_tid(tid)))) {
2639
2640                        note_cmpxchg_failure("slab_free", s, tid);
2641                        goto redo;
2642                }
2643                stat(s, FREE_FASTPATH);
2644        } else
2645                __slab_free(s, page, x, addr);
2646
2647}
2648
2649void kmem_cache_free(struct kmem_cache *s, void *x)
2650{
2651        s = cache_from_obj(s, x);
2652        if (!s)
2653                return;
2654        slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2655        trace_kmem_cache_free(_RET_IP_, x);
2656}
2657EXPORT_SYMBOL(kmem_cache_free);
2658
2659/*
2660 * Object placement in a slab is made very easy because we always start at
2661 * offset 0. If we tune the size of the object to the alignment then we can
2662 * get the required alignment by putting one properly sized object after
2663 * another.
2664 *
2665 * Notice that the allocation order determines the sizes of the per cpu
2666 * caches. Each processor has always one slab available for allocations.
2667 * Increasing the allocation order reduces the number of times that slabs
2668 * must be moved on and off the partial lists and is therefore a factor in
2669 * locking overhead.
2670 */
2671
2672/*
2673 * Minimum / Maximum order of slab pages. This influences locking overhead
2674 * and slab fragmentation. A higher order reduces the number of partial slabs
2675 * and increases the number of allocations possible without having to
2676 * take the list_lock.
2677 */
2678static int slub_min_order;
2679static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2680static int slub_min_objects;
2681
2682/*
2683 * Merge control. If this is set then no merging of slab caches will occur.
2684 * (Could be removed. This was introduced to pacify the merge skeptics.)
2685 */
2686static int slub_nomerge;
2687
2688/*
2689 * Calculate the order of allocation given an slab object size.
2690 *
2691 * The order of allocation has significant impact on performance and other
2692 * system components. Generally order 0 allocations should be preferred since
2693 * order 0 does not cause fragmentation in the page allocator. Larger objects
2694 * can be problematic to put into order 0 slabs because there may be too much
2695 * unused space left. We go to a higher order if more than 1/16th of the slab
2696 * would be wasted.
2697 *
2698 * In order to reach satisfactory performance we must ensure that a minimum
2699 * number of objects is in one slab. Otherwise we may generate too much
2700 * activity on the partial lists which requires taking the list_lock. This is
2701 * less a concern for large slabs though which are rarely used.
2702 *
2703 * slub_max_order specifies the order where we begin to stop considering the
2704 * number of objects in a slab as critical. If we reach slub_max_order then
2705 * we try to keep the page order as low as possible. So we accept more waste
2706 * of space in favor of a small page order.
2707 *
2708 * Higher order allocations also allow the placement of more objects in a
2709 * slab and thereby reduce object handling overhead. If the user has
2710 * requested a higher minimum order then we start with that one instead of
2711 * the smallest order which will fit the object.
2712 */
2713static inline int slab_order(int size, int min_objects,
2714                                int max_order, int fract_leftover, int reserved)
2715{
2716        int order;
2717        int rem;
2718        int min_order = slub_min_order;
2719
2720        if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2721                return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2722
2723        for (order = max(min_order,
2724                                fls(min_objects * size - 1) - PAGE_SHIFT);
2725                        order <= max_order; order++) {
2726
2727                unsigned long slab_size = PAGE_SIZE << order;
2728
2729                if (slab_size < min_objects * size + reserved)
2730                        continue;
2731
2732                rem = (slab_size - reserved) % size;
2733
2734                if (rem <= slab_size / fract_leftover)
2735                        break;
2736
2737        }
2738
2739        return order;
2740}
2741
2742static inline int calculate_order(int size, int reserved)
2743{
2744        int order;
2745        int min_objects;
2746        int fraction;
2747        int max_objects;
2748
2749        /*
2750         * Attempt to find best configuration for a slab. This
2751         * works by first attempting to generate a layout with
2752         * the best configuration and backing off gradually.
2753         *
2754         * First we reduce the acceptable waste in a slab. Then
2755         * we reduce the minimum objects required in a slab.
2756         */
2757        min_objects = slub_min_objects;
2758        if (!min_objects)
2759                min_objects = 4 * (fls(nr_cpu_ids) + 1);
2760        max_objects = order_objects(slub_max_order, size, reserved);
2761        min_objects = min(min_objects, max_objects);
2762
2763        while (min_objects > 1) {
2764                fraction = 16;
2765                while (fraction >= 4) {
2766                        order = slab_order(size, min_objects,
2767                                        slub_max_order, fraction, reserved);
2768                        if (order <= slub_max_order)
2769                                return order;
2770                        fraction /= 2;
2771                }
2772                min_objects--;
2773        }
2774
2775        /*
2776         * We were unable to place multiple objects in a slab. Now
2777         * lets see if we can place a single object there.
2778         */
2779        order = slab_order(size, 1, slub_max_order, 1, reserved);
2780        if (order <= slub_max_order)
2781                return order;
2782
2783        /*
2784         * Doh this slab cannot be placed using slub_max_order.
2785         */
2786        order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2787        if (order < MAX_ORDER)
2788                return order;
2789        return -ENOSYS;
2790}
2791
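/*
 * Worked example of the search above, assuming 4 possible cpus, 4K pages,
 * a 700 byte object, no reserved space and the default min/max orders:
 * min_objects starts at 4 * (fls(4) + 1) = 16, so slab_order() begins at
 * order 2 because 16 * 700 bytes does not fit into an 8K slab. A 16K slab
 * holds 23 objects and wastes only 284 bytes, well under 1/16th of the
 * slab, so order 2 is accepted on the first pass.
 */
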
2792static void
2793init_kmem_cache_node(struct kmem_cache_node *n)
2794{
2795        n->nr_partial = 0;
2796        spin_lock_init(&n->list_lock);
2797        INIT_LIST_HEAD(&n->partial);
2798#ifdef CONFIG_SLUB_DEBUG
2799        atomic_long_set(&n->nr_slabs, 0);
2800        atomic_long_set(&n->total_objects, 0);
2801        INIT_LIST_HEAD(&n->full);
2802#endif
2803}
2804
2805static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2806{
2807        BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2808                        KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
2809
2810        /*
2811         * Must align to double word boundary for the double cmpxchg
2812         * instructions to work; see __pcpu_double_call_return_bool().
2813         */
2814        s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2815                                     2 * sizeof(void *));
2816
2817        if (!s->cpu_slab)
2818                return 0;
2819
2820        init_kmem_cache_cpus(s);
2821
2822        return 1;
2823}
2824
2825static struct kmem_cache *kmem_cache_node;
2826
2827/*
2828 * No kmalloc_node yet so do it by hand. We know that this is the first
2829 * slab on the node for this slabcache. There are no concurrent accesses
2830 * possible.
2831 *
2832 * Note that this function only works on the kmem_cache_node cache
2833 * when allocating for the kmem_cache_node cache. This is used for bootstrapping
2834 * memory on a fresh node that has no slab structures yet.
2835 */
2836static void early_kmem_cache_node_alloc(int node)
2837{
2838        struct page *page;
2839        struct kmem_cache_node *n;
2840
2841        BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2842
2843        page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2844
2845        BUG_ON(!page);
2846        if (page_to_nid(page) != node) {
2847                printk(KERN_ERR "SLUB: Unable to allocate memory from "
2848                                "node %d\n", node);
2849                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2850                                "in order to be able to continue\n");
2851        }
2852
2853        n = page->freelist;
2854        BUG_ON(!n);
2855        page->freelist = get_freepointer(kmem_cache_node, n);
2856        page->inuse = 1;
2857        page->frozen = 0;
2858        kmem_cache_node->node[node] = n;
2859#ifdef CONFIG_SLUB_DEBUG
2860        init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2861        init_tracking(kmem_cache_node, n);
2862#endif
2863        init_kmem_cache_node(n);
2864        inc_slabs_node(kmem_cache_node, node, page->objects);
2865
2866        add_partial(n, page, DEACTIVATE_TO_HEAD);
2867}
2868
2869static void free_kmem_cache_nodes(struct kmem_cache *s)
2870{
2871        int node;
2872
2873        for_each_node_state(node, N_NORMAL_MEMORY) {
2874                struct kmem_cache_node *n = s->node[node];
2875
2876                if (n)
2877                        kmem_cache_free(kmem_cache_node, n);
2878
2879                s->node[node] = NULL;
2880        }
2881}
2882
2883static int init_kmem_cache_nodes(struct kmem_cache *s)
2884{
2885        int node;
2886
2887        for_each_node_state(node, N_NORMAL_MEMORY) {
2888                struct kmem_cache_node *n;
2889
2890                if (slab_state == DOWN) {
2891                        early_kmem_cache_node_alloc(node);
2892                        continue;
2893                }
2894                n = kmem_cache_alloc_node(kmem_cache_node,
2895                                                GFP_KERNEL, node);
2896
2897                if (!n) {
2898                        free_kmem_cache_nodes(s);
2899                        return 0;
2900                }
2901
2902                s->node[node] = n;
2903                init_kmem_cache_node(n);
2904        }
2905        return 1;
2906}
2907
2908static void set_min_partial(struct kmem_cache *s, unsigned long min)
2909{
2910        if (min < MIN_PARTIAL)
2911                min = MIN_PARTIAL;
2912        else if (min > MAX_PARTIAL)
2913                min = MAX_PARTIAL;
2914        s->min_partial = min;
2915}
2916
2917/*
2918 * calculate_sizes() determines the order and the distribution of data within
2919 * a slab object.
2920 */
2921static int calculate_sizes(struct kmem_cache *s, int forced_order)
2922{
2923        unsigned long flags = s->flags;
2924        unsigned long size = s->object_size;
2925        int order;
2926
2927        /*
2928         * Round up object size to the next word boundary. We can only
2929         * place the free pointer at word boundaries and this determines
2930         * the possible location of the free pointer.
2931         */
2932        size = ALIGN(size, sizeof(void *));
2933
2934#ifdef CONFIG_SLUB_DEBUG
2935        /*
2936         * Determine if we can poison the object itself. If the user of
2937         * the slab may touch the object after free or before allocation
2938         * then we should never poison the object itself.
2939         */
2940        if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2941                        !s->ctor)
2942                s->flags |= __OBJECT_POISON;
2943        else
2944                s->flags &= ~__OBJECT_POISON;
2945
2946
2947        /*
2948         * If we are Redzoning then check if there is some space between the
2949         * end of the object and the free pointer. If not then add an
2950         * additional word to have some bytes to store Redzone information.
2951         */
2952        if ((flags & SLAB_RED_ZONE) && size == s->object_size)
2953                size += sizeof(void *);
2954#endif
2955
2956        /*
2957         * With that we have determined the number of bytes in actual use
2958         * by the object. This is the potential offset to the free pointer.
2959         */
2960        s->inuse = size;
2961
2962        if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2963                s->ctor)) {
2964                /*
2965                 * Relocate free pointer after the object if it is not
2966                 * permitted to overwrite the first word of the object on
2967                 * kmem_cache_free.
2968                 *
2969                 * This is the case if we do RCU, have a constructor or
2970                 * destructor or are poisoning the objects.
2971                 */
2972                s->offset = size;
2973                size += sizeof(void *);
2974        }
2975
2976#ifdef CONFIG_SLUB_DEBUG
2977        if (flags & SLAB_STORE_USER)
2978                /*
2979                 * Need to store information about allocs and frees after
2980                 * the object.
2981                 */
2982                size += 2 * sizeof(struct track);
2983
2984        if (flags & SLAB_RED_ZONE)
2985                /*
2986                 * Add some empty padding so that we can catch
2987                 * overwrites from earlier objects rather than let
2988                 * tracking information or the free pointer be
2989                 * corrupted if a user writes before the start
2990                 * of the object.
2991                 */
2992                size += sizeof(void *);
2993#endif
2994
2995        /*
2996         * SLUB stores one object immediately after another beginning from
2997         * offset 0. In order to align the objects we have to simply size
2998         * each object to conform to the alignment.
2999         */
3000        size = ALIGN(size, s->align);
3001        s->size = size;
3002        if (forced_order >= 0)
3003                order = forced_order;
3004        else
3005                order = calculate_order(size, s->reserved);
3006
3007        if (order < 0)
3008                return 0;
3009
3010        s->allocflags = 0;
3011        if (order)
3012                s->allocflags |= __GFP_COMP;
3013
3014        if (s->flags & SLAB_CACHE_DMA)
3015                s->allocflags |= GFP_DMA;
3016
3017        if (s->flags & SLAB_RECLAIM_ACCOUNT)
3018                s->allocflags |= __GFP_RECLAIMABLE;
3019
3020        /*
3021         * Determine the number of objects per slab
3022         */
3023        s->oo = oo_make(order, size, s->reserved);
3024        s->min = oo_make(get_order(size), size, s->reserved);
3025        if (oo_objects(s->oo) > oo_objects(s->max))
3026                s->max = s->oo;
3027
3028        return !!oo_objects(s->oo);
3029}
3030
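/*
 * Sketch of the resulting layout for a debug configuration, assuming
 * poisoning, red zoning and SLAB_STORE_USER are all enabled:
 *
 *	[ object (object_size, word aligned) ]
 *	[ red zone word                      ]  <- s->inuse ends here
 *	[ free pointer at s->offset          ]
 *	[ alloc track ][ free track          ]
 *	[ red zone padding word              ]
 *
 * The total, rounded up to s->align, becomes s->size, from which
 * calculate_order() derives the page order.
 */
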
3031static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3032{
3033        s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3034        s->reserved = 0;
3035
3036        if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
3037                s->reserved = sizeof(struct rcu_head);
3038
3039        if (!calculate_sizes(s, -1))
3040                goto error;
3041        if (disable_higher_order_debug) {
3042                /*
3043                 * Disable debugging flags that store metadata if the min slab
3044                 * order increased.
3045                 */
3046                if (get_order(s->size) > get_order(s->object_size)) {
3047                        s->flags &= ~DEBUG_METADATA_FLAGS;
3048                        s->offset = 0;
3049                        if (!calculate_sizes(s, -1))
3050                                goto error;
3051                }
3052        }
3053
3054#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3055    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3056        if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
3057                /* Enable fast mode */
3058                s->flags |= __CMPXCHG_DOUBLE;
3059#endif
3060
3061        /*
3062         * The larger the object size is, the more pages we want on the partial
3063         * list to avoid pounding the page allocator excessively.
3064         */
3065        set_min_partial(s, ilog2(s->size) / 2);
3066
3067        /*
3068         * cpu_partial determines the maximum number of objects kept in the
3069         * per cpu partial lists of a processor.
3070         *
3071         * Per cpu partial lists mainly contain slabs that just have one
3072         * object freed. If they are used for allocation then they can be
3073         * filled up again with minimal effort. The slab will never hit the
3074         * per node partial lists and therefore no locking will be required.
3075         *
3076         * This setting also determines
3077         *
3078         * A) The number of objects from per cpu partial slabs dumped to the
3079         *    per node list when we reach the limit.
3080         * B) The number of objects in cpu partial slabs to extract from the
3081         *    per node list when we run out of per cpu objects. We only fetch
3082         *    50% to keep some capacity around for frees.
3083         */
3084        if (!kmem_cache_has_cpu_partial(s))
3085                s->cpu_partial = 0;
3086        else if (s->size >= PAGE_SIZE)
3087                s->cpu_partial = 2;
3088        else if (s->size >= 1024)
3089                s->cpu_partial = 6;
3090        else if (s->size >= 256)
3091                s->cpu_partial = 13;
3092        else
3093                s->cpu_partial = 30;
3094
3095#ifdef CONFIG_NUMA
3096        s->remote_node_defrag_ratio = 1000;
3097#endif
3098        if (!init_kmem_cache_nodes(s))
3099                goto error;
3100
3101        if (alloc_kmem_cache_cpus(s))
3102                return 0;
3103
3104        free_kmem_cache_nodes(s);
3105error:
3106        if (flags & SLAB_PANIC)
3107                panic("Cannot create slab %s size=%lu realsize=%u "
3108                        "order=%u offset=%u flags=%lx\n",
3109                        s->name, (unsigned long)s->size, s->size,
3110                        oo_order(s->oo), s->offset, flags);
3111        return -EINVAL;
3112}
3113
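/*
 * Report any objects that are still allocated in a slab while the cache is
 * being torn down.  With CONFIG_SLUB_DEBUG disabled the body compiles away
 * and this is a no-op.
 */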
3114static void list_slab_objects(struct kmem_cache *s, struct page *page,
3115                                                        const char *text)
3116{
3117#ifdef CONFIG_SLUB_DEBUG
3118        void *addr = page_address(page);
3119        void *p;
3120        unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
3121                                     sizeof(long), GFP_ATOMIC);
3122        if (!map)
3123                return;
3124        slab_err(s, page, text, s->name);
3125        slab_lock(page);
3126
3127        get_map(s, page, map);
3128        for_each_object(p, s, addr, page->objects) {
3129
3130                if (!test_bit(slab_index(p, s, addr), map)) {
3131                        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
3132                                                        p, p - addr);
3133                        print_tracking(s, p);
3134                }
3135        }
3136        slab_unlock(page);
3137        kfree(map);
3138#endif
3139}
3140
3141/*
3142 * Attempt to free all partial slabs on a node.
3143 * This is called from kmem_cache_close(). We must be the last thread
3144 * using the cache and therefore we do not need to lock anymore.
3145 */
3146static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3147{
3148        struct page *page, *h;
3149
3150        list_for_each_entry_safe(page, h, &n->partial, lru) {
3151                if (!page->inuse) {
3152                        remove_partial(n, page);
3153                        discard_slab(s, page);
3154                } else {
3155                        list_slab_objects(s, page,
3156                        "Objects remaining in %s on kmem_cache_close()");
3157                }
3158        }
3159}
3160
3161/*
3162 * Release all resources used by a slab cache.
3163 */
3164static inline int kmem_cache_close(struct kmem_cache *s)
3165{
3166        int node;
3167
3168        flush_all(s);
3169        /* Attempt to free all objects */
3170        for_each_node_state(node, N_NORMAL_MEMORY) {
3171                struct kmem_cache_node *n = get_node(s, node);
3172
3173                free_partial(s, n);
3174                if (n->nr_partial || slabs_node(s, node))
3175                        return 1;
3176        }
3177        free_percpu(s->cpu_slab);
3178        free_kmem_cache_nodes(s);
3179        return 0;
3180}
3181
3182int __kmem_cache_shutdown(struct kmem_cache *s)
3183{
3184        int rc = kmem_cache_close(s);
3185
3186        if (!rc) {
3187                /*
3188                 * We use the same locking strategy around sysfs_slab_add, see
3189                 * __kmem_cache_create. Because this is pretty much the last
3190                 * operation we do and the lock will be released shortly after
3191                 * that in slab_common.c, we could just move sysfs_slab_remove
3192                 * to a later point in common code. We should do that when we
3193                 * have a common sysfs framework for all allocators.
3194                 */
3195                mutex_unlock(&slab_mutex);
3196                sysfs_slab_remove(s);
3197                mutex_lock(&slab_mutex);
3198        }
3199
3200        return rc;
3201}
3202
3203/********************************************************************
3204 *              Kmalloc subsystem
3205 *******************************************************************/
3206
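/*
 * Boot-time tuning parameters.  These __setup() handlers parse kernel
 * command line options such as "slub_min_order=1 slub_max_order=3
 * slub_min_objects=16 slub_nomerge" (the values here are only examples)
 * and adjust slab sizing and cache merging policy before any caches are
 * created.
 */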
3207static int __init setup_slub_min_order(char *str)
3208{
3209        get_option(&str, &slub_min_order);
3210
3211        return 1;
3212}
3213
3214__setup("slub_min_order=", setup_slub_min_order);
3215
3216static int __init setup_slub_max_order(char *str)
3217{
3218        get_option(&str, &slub_max_order);
3219        slub_max_order = min(slub_max_order, MAX_ORDER - 1);
3220
3221        return 1;
3222}
3223
3224__setup("slub_max_order=", setup_slub_max_order);
3225
3226static int __init setup_slub_min_objects(char *str)
3227{
3228        get_option(&str, &slub_min_objects);
3229
3230        return 1;
3231}
3232
3233__setup("slub_min_objects=", setup_slub_min_objects);
3234
3235static int __init setup_slub_nomerge(char *str)
3236{
3237        slub_nomerge = 1;
3238        return 1;
3239}
3240
3241__setup("slub_nomerge", setup_slub_nomerge);
3242
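/*
 * Generic kmalloc entry point.  Requests larger than KMALLOC_MAX_CACHE_SIZE
 * bypass the slab caches and are handed to the page allocator via
 * kmalloc_large(); everything else is served from the matching kmalloc
 * slab cache.
 */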
3243void *__kmalloc(size_t size, gfp_t flags)
3244{
3245        struct kmem_cache *s;
3246        void *ret;
3247
3248        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3249                return kmalloc_large(size, flags);
3250
3251        s = kmalloc_slab(size, flags);
3252
3253        if (unlikely(ZERO_OR_NULL_PTR(s)))
3254                return s;
3255
3256        ret = slab_alloc(s, flags, _RET_IP_);
3257
3258        trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3259
3260        return ret;
3261}
3262EXPORT_SYMBOL(__kmalloc);
3263
3264#ifdef CONFIG_NUMA
3265static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3266{
3267        struct page *page;
3268        void *ptr = NULL;
3269
3270        flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3271        page = alloc_pages_node(node, flags, get_order(size));
3272        if (page)
3273                ptr = page_address(page);
3274
3275        kmemleak_alloc(ptr, size, 1, flags);
3276        return ptr;
3277}
3278
3279void *__kmalloc_node(size_t size, gfp_t flags, int node)
3280{
3281        struct kmem_cache *s;
3282        void *ret;
3283
3284        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3285                ret = kmalloc_large_node(size, flags, node);
3286
3287                trace_kmalloc_node(_RET_IP_, ret,
3288                                   size, PAGE_SIZE << get_order(size),
3289                                   flags, node);
3290
3291                return ret;
3292        }
3293
3294        s = kmalloc_slab(size, flags);
3295
3296        if (unlikely(ZERO_OR_NULL_PTR(s)))
3297                return s;
3298
3299        ret = slab_alloc_node(s, flags, node, _RET_IP_);
3300
3301        trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3302
3303        return ret;
3304}
3305EXPORT_SYMBOL(__kmalloc_node);
3306#endif
3307
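/*
 * ksize() returns the amount of memory actually backing an object, which
 * may exceed the originally requested size because of cache size rounding
 * and debugging metadata.  For large allocations this is the size of the
 * underlying compound page.
 */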
3308size_t ksize(const void *object)
3309{
3310        struct page *page;
3311
3312        if (unlikely(object == ZERO_SIZE_PTR))
3313                return 0;
3314
3315        page = virt_to_head_page(object);
3316
3317        if (unlikely(!PageSlab(page))) {
3318                WARN_ON(!PageCompound(page));
3319                return PAGE_SIZE << compound_order(page);
3320        }
3321
3322        return slab_ksize(page->slab_cache);
3323}
3324EXPORT_SYMBOL(ksize);
3325
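/*
 * kfree() accepts both slab objects and large allocations that came
 * straight from the page allocator: compound pages that are not slab pages
 * are returned to the page allocator, everything else is released through
 * slab_free().
 */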
3326void kfree(const void *x)
3327{
3328        struct page *page;
3329        void *object = (void *)x;
3330
3331        trace_kfree(_RET_IP_, x);
3332
3333        if (unlikely(ZERO_OR_NULL_PTR(x)))
3334                return;
3335
3336        page = virt_to_head_page(x);
3337        if (unlikely(!PageSlab(page))) {
3338                BUG_ON(!PageCompound(page));
3339                kmemleak_free(x);
3340                __free_memcg_kmem_pages(page, compound_order(page));
3341                return;
3342        }
3343        slab_free(page->slab_cache, page, object, _RET_IP_);
3344}
3345EXPORT_SYMBOL(kfree);
3346
3347/*
3348 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3349 * the remaining slabs by the number of items in use. The slabs with the
3350 * most items in use come first. New allocations will then fill those up
3351 * and thus they can be removed from the partial lists.
3352 *
3353 * The slabs with the least items are placed last. This results in them
3354 * being allocated from last, increasing the chance that the last objects
3355 * are freed in them.
3356 */
3357int kmem_cache_shrink(struct kmem_cache *s)
3358{
3359        int node;
3360        int i;
3361        struct kmem_cache_node *n;
3362        struct page *page;
3363        struct page *t;
3364        int objects = oo_objects(s->max);
3365        struct list_head *slabs_by_inuse =
3366                kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3367        unsigned long flags;
3368
3369        if (!slabs_by_inuse)
3370                return -ENOMEM;
3371
3372        flush_all(s);
3373        for_each_node_state(node, N_NORMAL_MEMORY) {
3374                n = get_node(s, node);
3375
3376                if (!n->nr_partial)
3377                        continue;
3378
3379                for (i = 0; i < objects; i++)
3380                        INIT_LIST_HEAD(slabs_by_inuse + i);
3381
3382                spin_lock_irqsave(&n->list_lock, flags);
3383
3384                /*
3385                 * Build lists indexed by the items in use in each slab.
3386                 *
3387                 * Note that concurrent frees may occur while we hold the
3388                 * list_lock. page->inuse here is the upper limit.
3389                 */
3390                list_for_each_entry_safe(page, t, &n->partial, lru) {
3391                        list_move(&page->lru, slabs_by_inuse + page->inuse);
3392                        if (!page->inuse)
3393                                n->nr_partial--;
3394                }
3395
3396                /*
3397                 * Rebuild the partial list with the slabs filled up most
3398                 * first and the least used slabs at the end.
3399                 */
3400                for (i = objects - 1; i > 0; i--)
3401                        list_splice(slabs_by_inuse + i, n->partial.prev);
3402
3403                spin_unlock_irqrestore(&n->list_lock, flags);
3404
3405                /* Release empty slabs */
3406                list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3407                        discard_slab(s, page);
3408        }
3409
3410        kfree(slabs_by_inuse);
3411        return 0;
3412}
3413EXPORT_SYMBOL(kmem_cache_shrink);
3414
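/*
 * Memory hotplug support.  Before a node is taken offline every cache is
 * shrunk so that empty partial slabs are released; once the node is gone
 * its kmem_cache_node structures are freed.  When a node comes online a
 * fresh kmem_cache_node is allocated for every existing cache.
 */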
3415static int slab_mem_going_offline_callback(void *arg)
3416{
3417        struct kmem_cache *s;
3418
3419        mutex_lock(&slab_mutex);
3420        list_for_each_entry(s, &slab_caches, list)
3421                kmem_cache_shrink(s);
3422        mutex_unlock(&slab_mutex);
3423
3424        return 0;
3425}
3426
3427static void slab_mem_offline_callback(void *arg)
3428{
3429        struct kmem_cache_node *n;
3430        struct kmem_cache *s;
3431        struct memory_notify *marg = arg;
3432        int offline_node;
3433
3434        offline_node = marg->status_change_nid_normal;
3435
3436        /*
3437         * If the node still has memory available, we still need its
3438         * kmem_cache_node structure, so there is nothing to do here.
3439         */
3440        if (offline_node < 0)
3441                return;
3442
3443        mutex_lock(&slab_mutex);
3444        list_for_each_entry(s, &slab_caches, list) {
3445                n = get_node(s, offline_node);
3446                if (n) {
3447                        /*
3448                         * if n->nr_slabs > 0, slabs still exist on the node
3449                         * that is going down. We were unable to free them,
3450                         * and offline_pages() function shouldn't call this
3451                         * callback. So, we must fail.
3452                         */
3453                        BUG_ON(slabs_node(s, offline_node));
3454
3455                        s->node[offline_node] = NULL;
3456                        kmem_cache_free(kmem_cache_node, n);
3457                }
3458        }
3459        mutex_unlock(&slab_mutex);
3460}
3461
3462static int slab_mem_going_online_callback(void *arg)
3463{
3464        struct kmem_cache_node *n;
3465        struct kmem_cache *s;
3466        struct memory_notify *marg = arg;
3467        int nid = marg->status_change_nid_normal;
3468        int ret = 0;
3469
3470        /*
3471         * If the node's memory is already available, then kmem_cache_node is
3472         * already created. Nothing to do.
3473         */
3474        if (nid < 0)
3475                return 0;
3476
3477        /*
3478         * We are bringing a node online. No memory is available yet. We must
3479         * allocate a kmem_cache_node structure in order to bring the node
3480         * online.
3481         */
3482        mutex_lock(&slab_mutex);
3483        list_for_each_entry(s, &slab_caches, list) {
3484                /*
3485                 * XXX: kmem_cache_alloc_node will fall back to other nodes
3486                 *      since memory is not yet available from the node that
3487                 *      is brought up.
3488                 */
3489                n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
3490                if (!n) {
3491                        ret = -ENOMEM;
3492                        goto out;
3493                }
3494                init_kmem_cache_node(n);
3495                s->node[nid] = n;
3496        }
3497out:
3498        mutex_unlock(&slab_mutex);
3499        return ret;
3500}
3501
3502static int slab_memory_callback(struct notifier_block *self,
3503                                unsigned long action, void *arg)
3504{
3505        int ret = 0;
3506
3507        switch (action) {
3508        case MEM_GOING_ONLINE:
3509                ret = slab_mem_going_online_callback(arg);
3510                break;
3511        case MEM_GOING_OFFLINE:
3512                ret = slab_mem_going_offline_callback(arg);
3513                break;
3514        case MEM_OFFLINE:
3515        case MEM_CANCEL_ONLINE:
3516                slab_mem_offline_callback(arg);
3517                break;
3518        case MEM_ONLINE:
3519        case MEM_CANCEL_OFFLINE:
3520                break;
3521        }
3522        if (ret)
3523                ret = notifier_from_errno(ret);
3524        else
3525                ret = NOTIFY_OK;
3526        return ret;
3527}
3528
3529static struct notifier_block slab_memory_callback_nb = {
3530        .notifier_call = slab_memory_callback,
3531        .priority = SLAB_CALLBACK_PRI,
3532};
3533
3534/********************************************************************
3535 *                      Basic setup of slabs
3536 *******************************************************************/
3537
3538/*
3539 * Used for the early kmem_cache structures that were statically allocated
3540 * at boot. Allocate them properly, then fix up any pointers that may
3541 * still refer to the bootstrap structures.
3542 */
3543
3544static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3545{
3546        int node;
3547        struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3548
3549        memcpy(s, static_cache, kmem_cache->object_size);
3550
3551        /*
3552         * This runs very early, and only the boot processor is supposed to be
3553         * up.  Even if that were not the case, IRQs are not yet enabled, so
3554         * we could not send IPIs anyway.
3555         */
3556        __flush_cpu_slab(s, smp_processor_id());
3557        for_each_node_state(node, N_NORMAL_MEMORY) {
3558                struct kmem_cache_node *n = get_node(s, node);
3559                struct page *p;
3560
3561                if (n) {
3562                        list_for_each_entry(p, &n->partial, lru)
3563                                p->slab_cache = s;
3564
3565#ifdef CONFIG_SLUB_DEBUG
3566                        list_for_each_entry(p, &n->full, lru)
3567                                p->slab_cache = s;
3568#endif
3569                }
3570        }
3571        list_add(&s->list, &slab_caches);
3572        return s;
3573}
3574
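/*
 * Bring the allocator up at boot: create kmem_cache_node and kmem_cache
 * from statically allocated bootstrap structures, re-allocate both from
 * the now-working allocator via bootstrap(), and finally create the
 * kmalloc caches.
 */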
3575void __init kmem_cache_init(void)
3576{
3577        static __initdata struct kmem_cache boot_kmem_cache,
3578                boot_kmem_cache_node;
3579
3580        if (debug_guardpage_minorder())
3581                slub_max_order = 0;
3582
3583        kmem_cache_node = &boot_kmem_cache_node;
3584        kmem_cache = &boot_kmem_cache;
3585
3586        create_boot_cache(kmem_cache_node, "kmem_cache_node",
3587                sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3588
3589        register_hotmemory_notifier(&slab_memory_callback_nb);
3590
3591        /* Able to allocate the per node structures */
3592        slab_state = PARTIAL;
3593
3594        create_boot_cache(kmem_cache, "kmem_cache",
3595                        offsetof(struct kmem_cache, node) +
3596                                nr_node_ids * sizeof(struct kmem_cache_node *),
3597                       SLAB_HWCACHE_ALIGN);
3598
3599        kmem_cache = bootstrap(&boot_kmem_cache);
3600
3601        /*
3602         * Allocate kmem_cache_node properly from the kmem_cache slab.
3603         * kmem_cache_node is separately allocated so no need to
3604         * update any list pointers.
3605         */
3606        kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3607
3608        /* Now we can use the kmem_cache to allocate kmalloc slabs */
3609        create_kmalloc_caches(0);
3610
3611#ifdef CONFIG_SMP
3612        register_cpu_notifier(&slab_notifier);
3613#endif
3614
3615        printk(KERN_INFO
3616                "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
3617                " CPUs=%d, Nodes=%d\n",
3618                cache_line_size(),
3619                slub_min_order, slub_max_order, slub_min_objects,
3620                nr_cpu_ids, nr_node_ids);
3621}
3622
3623void __init kmem_cache_init_late(void)
3624{
3625}
3626
3627/*
3628 * Find a mergeable slab cache
3629 */
3630static int slab_unmergeable(struct kmem_cache *s)
3631{
3632        if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3633                return 1;
3634
3635        if (s->ctor)
3636                return 1;
3637
3638        /*
3639         * We may have set a slab to be unmergeable during bootstrap.
3640         */
3641        if (s->refcount < 0)
3642                return 1;
3643
3644        return 0;
3645}
3646
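/*
 * Scan the existing caches for one whose size, alignment, flags and memcg
 * are compatible with the requested parameters, so that the new cache can
 * be aliased to it instead of being created from scratch.
 */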
3647static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3648                size_t align, unsigned long flags, const char *name,
3649                void (*ctor)(void *))
3650{
3651        struct kmem_cache *s;
3652
3653        if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3654                return NULL;
3655
3656        if (ctor)
3657                return NULL;
3658
3659        size = ALIGN(size, sizeof(void *));
3660        align = calculate_alignment(flags, align, size);
3661        size = ALIGN(size, align);
3662        flags = kmem_cache_flags(size, flags, name, NULL);
3663
3664        list_for_each_entry(s, &slab_caches, list) {
3665                if (slab_unmergeable(s))
3666                        continue;
3667
3668                if (size > s->size)
3669                        continue;
3670
3671                if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3672                        continue;
3673                /*
3674                 * Check if alignment is compatible.
3675                 * Courtesy of Adrian Drzewiecki
3676                 */
3677                if ((s->size & ~(align - 1)) != s->size)
3678                        continue;
3679
3680                if (s->size - size >= sizeof(void *))
3681                        continue;
3682
3683                if (!cache_match_memcg(s, memcg))
3684                        continue;
3685
3686                return s;
3687        }
3688        return NULL;
3689}
3690
3691struct kmem_cache *
3692__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3693                   size_t align, unsigned long flags, void (*ctor)(void *))
3694{
3695        struct kmem_cache *s;
3696
3697        s = find_mergeable(memcg, size, align, flags, name, ctor);
3698        if (s) {
3699                s->refcount++;
3700                /*
3701                 * Adjust the object sizes so that we clear
3702                 * the complete object on kzalloc.
3703                 */
3704                s->object_size = max(s->object_size, (int)size);
3705                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3706
3707                if (sysfs_slab_alias(s, name)) {
3708                        s->refcount--;
3709                        s = NULL;
3710                }
3711        }
3712
3713        return s;
3714}
3715
3716int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3717{
3718        int err;
3719
3720        err = kmem_cache_open(s, flags);
3721        if (err)
3722                return err;
3723
3724        /* Mutex is not taken during early boot */
3725        if (slab_state <= UP)
3726                return 0;
3727
3728        memcg_propagate_slab_attrs(s);
3729        mutex_unlock(&slab_mutex);
3730        err = sysfs_slab_add(s);
3731        mutex_lock(&slab_mutex);
3732
3733        if (err)
3734                kmem_cache_close(s);
3735
3736        return err;
3737}
3738
3739#ifdef CONFIG_SMP
3740/*
3741 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3742 * necessary.
3743 */
3744static int slab_cpuup_callback(struct notifier_block *nfb,
3745                unsigned long action, void *hcpu)
3746{
3747        long cpu = (long)hcpu;
3748        struct kmem_cache *s;
3749        unsigned long flags;
3750
3751        switch (action) {
3752        case CPU_UP_CANCELED:
3753        case CPU_UP_CANCELED_FROZEN:
3754        case CPU_DEAD:
3755        case CPU_DEAD_FROZEN:
3756                mutex_lock(&slab_mutex);
3757                list_for_each_entry(s, &slab_caches, list) {
3758                        local_irq_save(flags);
3759                        __flush_cpu_slab(s, cpu);
3760                        local_irq_restore(flags);
3761                }
3762                mutex_unlock(&slab_mutex);
3763                break;
3764        default:
3765                break;
3766        }
3767        return NOTIFY_OK;
3768}
3769
3770static struct notifier_block slab_notifier = {
3771        .notifier_call = slab_cpuup_callback
3772};
3773
3774#endif
3775
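/*
 * Variant of __kmalloc() that records the supplied call site in the
 * tracing output instead of the immediate caller.
 */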
3776void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3777{
3778        struct kmem_cache *s;
3779        void *ret;
3780
3781        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
3782                return kmalloc_large(size, gfpflags);
3783
3784        s = kmalloc_slab(size, gfpflags);
3785
3786        if (unlikely(ZERO_OR_NULL_PTR(s)))
3787                return s;
3788
3789        ret = slab_alloc(s, gfpflags, caller);
3790
3791        /* Honor the call site pointer we received. */
3792        trace_kmalloc(caller, ret, size, s->size, gfpflags);
3793
3794        return ret;
3795}
3796
3797#ifdef CONFIG_NUMA
3798void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3799                                        int node, unsigned long caller)
3800{
3801        struct kmem_cache *s;
3802        void *ret;
3803
3804        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
3805                ret = kmalloc_large_node(size, gfpflags, node);
3806
3807                trace_kmalloc_node(caller, ret,
3808                                   size, PAGE_SIZE << get_order(size),
3809                                   gfpflags, node);
3810
3811                return ret;
3812        }
3813
3814        s = kmalloc_slab(size, gfpflags);
3815
3816        if (unlikely(ZERO_OR_NULL_PTR(s)))
3817                return s;
3818
3819        ret = slab_alloc_node(s, gfpflags, node, caller);
3820
3821        /* Honor the call site pointer we received. */
3822        trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3823
3824        return ret;
3825}
3826#endif
3827
3828#ifdef CONFIG_SYSFS
3829static int count_inuse(struct page *page)
3830{
3831        return page->inuse;
3832}
3833
3834static int count_total(struct page *page)
3835{
3836        return page->objects;
3837}
3838#endif
3839
3840#ifdef CONFIG_SLUB_DEBUG
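/*
 * Consistency checking for a single slab: verify the freelist and confirm
 * that every object carries the red zone pattern that matches its
 * allocation state.
 */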
3841static int validate_slab(struct kmem_cache *s, struct page *page,
3842                                                unsigned long *map)
3843{
3844        void *p;
3845        void *addr = page_address(page);
3846
3847        if (!check_slab(s, page) ||
3848                        !on_freelist(s, page, NULL))
3849                return 0;
3850
3851        /* Now we know that a valid freelist exists */
3852        bitmap_zero(map, page->objects);
3853
3854        get_map(s, page, map);
3855        for_each_object(p, s, addr, page->objects) {
3856                if (test_bit(slab_index(p, s, addr), map))
3857                        if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3858                                return 0;
3859        }
3860
3861        for_each_object(p, s, addr, page->objects)
3862                if (!test_bit(slab_index(p, s, addr), map))
3863                        if (!check_object(s, page, p, SLUB_RED_ACTIVE))
3864                                return 0;
3865        return 1;
3866}
3867
3868static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3869                                                unsigned long *map)
3870{
3871        slab_lock(page);
3872        validate_slab(s, page, map);
3873        slab_unlock(page);
3874}
3875
3876static int validate_slab_node(struct kmem_cache *s,
3877                struct kmem_cache_node *n, unsigned long *map)
3878{
3879        unsigned long count = 0;
3880        struct page *page;
3881        unsigned long flags;
3882
3883        spin_lock_irqsave(&n->list_lock, flags);
3884
3885        list_for_each_entry(page, &n->partial, lru) {
3886                validate_slab_slab(s, page, map);
3887                count++;
3888        }
3889        if (count != n->nr_partial)
3890                printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3891                        "counter=%ld\n", s->name, count, n->nr_partial);
3892
3893        if (!(s->flags & SLAB_STORE_USER))
3894                goto out;
3895
3896        list_for_each_entry(page, &n->full, lru) {
3897                validate_slab_slab(s, page, map);
3898                count++;
3899        }
3900        if (count != atomic_long_read(&n->nr_slabs))
3901                printk(KERN_ERR "SLUB: %s %ld slabs counted but "
3902                        "counter=%ld\n", s->name, count,
3903                        atomic_long_read(&n->nr_slabs));
3904
3905out:
3906        spin_unlock_irqrestore(&n->list_lock, flags);
3907        return count;
3908}
3909
3910static long validate_slab_cache(struct kmem_cache *s)
3911{
3912        int node;
3913        unsigned long count = 0;
3914        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3915                                sizeof(unsigned long), GFP_KERNEL);
3916
3917        if (!map)
3918                return -ENOMEM;
3919
3920        flush_all(s);
3921        for_each_node_state(node, N_NORMAL_MEMORY) {
3922                struct kmem_cache_node *n = get_node(s, node);
3923
3924                count += validate_slab_node(s, n, map);
3925        }
3926        kfree(map);
3927        return count;
3928}
3929/*
3930 * Generate lists of code addresses where slab cache objects are allocated
3931 * and freed.
3932 */
3933
3934struct location {
3935        unsigned long count;
3936        unsigned long addr;
3937        long long sum_time;
3938        long min_time;
3939        long max_time;
3940        long min_pid;
3941        long max_pid;
3942        DECLARE_BITMAP(cpus, NR_CPUS);
3943        nodemask_t nodes;
3944};
3945
3946struct loc_track {
3947        unsigned long max;
3948        unsigned long count;
3949        struct location *loc;
3950};
3951
3952static void free_loc_track(struct loc_track *t)
3953{
3954        if (t->max)
3955                free_pages((unsigned long)t->loc,
3956                        get_order(sizeof(struct location) * t->max));
3957}
3958
3959static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3960{
3961        struct location *l;
3962        int order;
3963
3964        order = get_order(sizeof(struct location) * max);
3965
3966        l = (void *)__get_free_pages(flags, order);
3967        if (!l)
3968                return 0;
3969
3970        if (t->count) {
3971                memcpy(l, t->loc, sizeof(struct location) * t->count);
3972                free_loc_track(t);
3973        }
3974        t->max = max;
3975        t->loc = l;
3976        return 1;
3977}
3978
3979static int add_location(struct loc_track *t, struct kmem_cache *s,
3980                                const struct track *track)
3981{
3982        long start, end, pos;
3983        struct location *l;
3984        unsigned long caddr;
3985        unsigned long age = jiffies - track->when;
3986
3987        start = -1;
3988        end = t->count;
3989
3990        for ( ; ; ) {
3991                pos = start + (end - start + 1) / 2;
3992
3993                /*
3994                 * There is nothing at "end". If we end up there
3995                 * we need to insert the new element before "end".
3996                 */
3997                if (pos == end)
3998                        break;
3999
4000                caddr = t->loc[pos].addr;
4001                if (track->addr == caddr) {
4002
4003                        l = &t->loc[pos];
4004                        l->count++;
4005                        if (track->when) {
4006                                l->sum_time += age;
4007                                if (age < l->min_time)
4008                                        l->min_time = age;
4009                                if (age > l->max_time)
4010                                        l->max_time = age;
4011
4012                                if (track->pid < l->min_pid)
4013                                        l->min_pid = track->pid;
4014                                if (track->pid > l->max_pid)
4015                                        l->max_pid = track->pid;
4016
4017                                cpumask_set_cpu(track->cpu,
4018                                                to_cpumask(l->cpus));
4019                        }
4020                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
4021                        return 1;
4022                }
4023
4024                if (track->addr < caddr)
4025                        end = pos;
4026                else
4027                        start = pos;
4028        }
4029
4030        /*
4031         * Not found. Insert new tracking element.
4032         */
4033        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
4034                return 0;
4035
4036        l = t->loc + pos;
4037        if (pos < t->count)
4038                memmove(l + 1, l,
4039                        (t->count - pos) * sizeof(struct location));
4040        t->count++;
4041        l->count = 1;
4042        l->addr = track->addr;
4043        l->sum_time = age;
4044        l->min_time = age;
4045        l->max_time = age;
4046        l->min_pid = track->pid;
4047        l->max_pid = track->pid;
4048        cpumask_clear(to_cpumask(l->cpus));
4049        cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
4050        nodes_clear(l->nodes);
4051        node_set(page_to_nid(virt_to_page(track)), l->nodes);
4052        return 1;
4053}
4054
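/*
 * Walk every allocated object in a slab and merge its allocation or free
 * track record into the location table.
 */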
4055static void process_slab(struct loc_track *t, struct kmem_cache *s,
4056                struct page *page, enum track_item alloc,
4057                unsigned long *map)
4058{
4059        void *addr = page_address(page);
4060        void *p;
4061
4062        bitmap_zero(map, page->objects);
4063        get_map(s, page, map);
4064
4065        for_each_object(p, s, addr, page->objects)
4066                if (!test_bit(slab_index(p, s, addr), map))
4067                        add_location(t, s, get_track(s, p, alloc));
4068}
4069
4070static int list_locations(struct kmem_cache *s, char *buf,
4071                                        enum track_item alloc)
4072{
4073        int len = 0;
4074        unsigned long i;
4075        struct loc_track t = { 0, 0, NULL };
4076        int node;
4077        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4078                                     sizeof(unsigned long), GFP_KERNEL);
4079
4080        if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4081                                     GFP_TEMPORARY)) {
4082                kfree(map);
4083                return sprintf(buf, "Out of memory\n");
4084        }
4085        /* Push back cpu slabs */
4086        flush_all(s);
4087
4088        for_each_node_state(node, N_NORMAL_MEMORY) {
4089                struct kmem_cache_node *n = get_node(s, node);
4090                unsigned long flags;
4091                struct page *page;
4092
4093                if (!atomic_long_read(&n->nr_slabs))
4094                        continue;
4095
4096                spin_lock_irqsave(&n->list_lock, flags);
4097                list_for_each_entry(page, &n->partial, lru)
4098                        process_slab(&t, s, page, alloc, map);
4099                list_for_each_entry(page, &n->full, lru)
4100                        process_slab(&t, s, page, alloc, map);
4101                spin_unlock_irqrestore(&n->list_lock, flags);
4102        }
4103
4104        for (i = 0; i < t.count; i++) {
4105                struct location *l = &t.loc[i];
4106
4107                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
4108                        break;
4109                len += sprintf(buf + len, "%7ld ", l->count);
4110
4111                if (l->addr)
4112                        len += sprintf(buf + len, "%pS", (void *)l->addr);
4113                else
4114                        len += sprintf(buf + len, "<not-available>");
4115
4116                if (l->sum_time != l->min_time) {
4117                        len += sprintf(buf + len, " age=%ld/%ld/%ld",
4118                                l->min_time,
4119                                (long)div_u64(l->sum_time, l->count),
4120                                l->max_time);
4121                } else
4122                        len += sprintf(buf + len, " age=%ld",
4123                                l->min_time);
4124
4125                if (l->min_pid != l->max_pid)
4126                        len += sprintf(buf + len, " pid=%ld-%ld",
4127                                l->min_pid, l->max_pid);
4128                else
4129                        len += sprintf(buf + len, " pid=%ld",
4130                                l->min_pid);
4131
4132                if (num_online_cpus() > 1 &&
4133                                !cpumask_empty(to_cpumask(l->cpus)) &&
4134                                len < PAGE_SIZE - 60) {
4135                        len += sprintf(buf + len, " cpus=");
4136                        len += cpulist_scnprintf(buf + len,
4137                                                 PAGE_SIZE - len - 50,
4138                                                 to_cpumask(l->cpus));
4139                }
4140
4141                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4142                                len < PAGE_SIZE - 60) {
4143                        len += sprintf(buf + len, " nodes=");
4144                        len += nodelist_scnprintf(buf + len,
4145                                                  PAGE_SIZE - len - 50,
4146                                                  l->nodes);
4147                }
4148
4149                len += sprintf(buf + len, "\n");
4150        }
4151
4152        free_loc_track(&t);
4153        kfree(map);
4154        if (!t.count)
4155                len += sprintf(buf, "No data\n");
4156        return len;
4157}
4158#endif
4159
4160#ifdef SLUB_RESILIENCY_TEST
4161static void resiliency_test(void)
4162{
4163        u8 *p;
4164
4165        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
4166
4167        printk(KERN_ERR "SLUB resiliency testing\n");
4168        printk(KERN_ERR "-----------------------\n");
4169        printk(KERN_ERR "A. Corruption after allocation\n");
4170
4171        p = kzalloc(16, GFP_KERNEL);
4172        p[16] = 0x12;
4173        printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
4174                        " 0x12->0x%p\n\n", p + 16);
4175
4176        validate_slab_cache(kmalloc_caches[4]);
4177
4178        /* Hmmm... The next two are dangerous */
4179        p = kzalloc(32, GFP_KERNEL);
4180        p[32 + sizeof(void *)] = 0x34;
4181        printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
4182                        " 0x34 -> 0x%p\n", p);
4183        printk(KERN_ERR
4184                "If allocated object is overwritten then not detectable\n\n");
4185
4186        validate_slab_cache(kmalloc_caches[5]);
4187        p = kzalloc(64, GFP_KERNEL);
4188        p += 64 + (get_cycles() & 0xff) * sizeof(void *);
4189        *p = 0x56;
4190        printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
4191                                                                        p);
4192        printk(KERN_ERR
4193                "If allocated object is overwritten then not detectable\n\n");
4194        validate_slab_cache(kmalloc_caches[6]);
4195
4196        printk(KERN_ERR "\nB. Corruption after free\n");
4197        p = kzalloc(128, GFP_KERNEL);
4198        kfree(p);
4199        *p = 0x78;
4200        printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
4201        validate_slab_cache(kmalloc_caches[7]);
4202
4203        p = kzalloc(256, GFP_KERNEL);
4204        kfree(p);
4205        p[50] = 0x9a;
4206        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
4207                        p);
4208        validate_slab_cache(kmalloc_caches[8]);
4209
4210        p = kzalloc(512, GFP_KERNEL);
4211        kfree(p);
4212        p[512] = 0xab;
4213        printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
4214        validate_slab_cache(kmalloc_caches[9]);
4215}
4216#else
4217#ifdef CONFIG_SYSFS
4218static void resiliency_test(void) {}
4219#endif
4220#endif
4221
4222#ifdef CONFIG_SYSFS
4223enum slab_stat_type {
4224        SL_ALL,                 /* All slabs */
4225        SL_PARTIAL,             /* Only partially allocated slabs */
4226        SL_CPU,                 /* Only slabs used for cpu caches */
4227        SL_OBJECTS,             /* Determine allocated objects not slabs */
4228        SL_TOTAL                /* Determine object capacity not slabs */
4229};
4230
4231#define SO_ALL          (1 << SL_ALL)
4232#define SO_PARTIAL      (1 << SL_PARTIAL)
4233#define SO_CPU          (1 << SL_CPU)
4234#define SO_OBJECTS      (1 << SL_OBJECTS)
4235#define SO_TOTAL        (1 << SL_TOTAL)
4236
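/*
 * Gather object and slab counts for the sysfs files.  The SO_* flags
 * select whether cpu slabs, partial slabs or all slabs are inspected and
 * whether objects in use, total object capacity or slab counts are
 * reported, with a per node breakdown appended on NUMA builds.
 */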
4237static ssize_t show_slab_objects(struct kmem_cache *s,
4238                            char *buf, unsigned long flags)
4239{
4240        unsigned long total = 0;
4241        int node;
4242        int x;
4243        unsigned long *nodes;
4244
4245        nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
4246        if (!nodes)
4247                return -ENOMEM;
4248
4249        if (flags & SO_CPU) {
4250                int cpu;
4251
4252                for_each_possible_cpu(cpu) {
4253                        struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
4254                                                               cpu);
4255                        int node;
4256                        struct page *page;
4257
4258                        page = ACCESS_ONCE(c->page);
4259                        if (!page)
4260                                continue;
4261
4262                        node = page_to_nid(page);
4263                        if (flags & SO_TOTAL)
4264                                x = page->objects;
4265                        else if (flags & SO_OBJECTS)
4266                                x = page->inuse;
4267                        else
4268                                x = 1;
4269
4270                        total += x;
4271                        nodes[node] += x;
4272
4273                        page = ACCESS_ONCE(c->partial);
4274                        if (page) {
4275                                x = page->pobjects;
4276                                total += x;
4277                                nodes[node] += x;
4278                        }
4279                }
4280        }
4281
4282        lock_memory_hotplug();
4283#ifdef CONFIG_SLUB_DEBUG
4284        if (flags & SO_ALL) {
4285                for_each_node_state(node, N_NORMAL_MEMORY) {
4286                        struct kmem_cache_node *n = get_node(s, node);
4287
4288                        if (flags & SO_TOTAL)
4289                                x = atomic_long_read(&n->total_objects);
4290                        else if (flags & SO_OBJECTS)
4291                                x = atomic_long_read(&n->total_objects) -
4292                                        count_partial(n, count_free);
4293                        else
4294                                x = atomic_long_read(&n->nr_slabs);
4295                        total += x;
4296                        nodes[node] += x;
4297                }
4298
4299        } else
4300#endif
4301        if (flags & SO_PARTIAL) {
4302                for_each_node_state(node, N_NORMAL_MEMORY) {
4303                        struct kmem_cache_node *n = get_node(s, node);
4304
4305                        if (flags & SO_TOTAL)
4306                                x = count_partial(n, count_total);
4307                        else if (flags & SO_OBJECTS)
4308                                x = count_partial(n, count_inuse);
4309                        else
4310                                x = n->nr_partial;
4311                        total += x;
4312                        nodes[node] += x;
4313                }
4314        }
4315        x = sprintf(buf, "%lu", total);
4316#ifdef CONFIG_NUMA
4317        for_each_node_state(node, N_NORMAL_MEMORY)
4318                if (nodes[node])
4319                        x += sprintf(buf + x, " N%d=%lu",
4320                                        node, nodes[node]);
4321#endif
4322        unlock_memory_hotplug();
4323        kfree(nodes);
4324        return x + sprintf(buf + x, "\n");
4325}
4326
4327#ifdef CONFIG_SLUB_DEBUG
4328static int any_slab_objects(struct kmem_cache *s)
4329{
4330        int node;
4331
4332        for_each_online_node(node) {
4333                struct kmem_cache_node *n = get_node(s, node);
4334
4335                if (!n)
4336                        continue;
4337
4338                if (atomic_long_read(&n->total_objects))
4339                        return 1;
4340        }
4341        return 0;
4342}
4343#endif
4344
4345#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4346#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4347
4348struct slab_attribute {
4349        struct attribute attr;
4350        ssize_t (*show)(struct kmem_cache *s, char *buf);
4351        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
4352};
4353
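/* Declare a read-only (0400) or read-write (0600) sysfs attribute. */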
4354#define SLAB_ATTR_RO(_name) \
4355        static struct slab_attribute _name##_attr = \
4356        __ATTR(_name, 0400, _name##_show, NULL)
4357
4358#define SLAB_ATTR(_name) \
4359        static struct slab_attribute _name##_attr =  \
4360        __ATTR(_name, 0600, _name##_show, _name##_store)
4361
4362static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4363{
4364        return sprintf(buf, "%d\n", s->size);
4365}
4366SLAB_ATTR_RO(slab_size);
4367
4368static ssize_t align_show(struct kmem_cache *s, char *buf)
4369{
4370        return sprintf(buf, "%d\n", s->align);
4371}
4372SLAB_ATTR_RO(align);
4373
4374static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4375{
4376        return sprintf(buf, "%d\n", s->object_size);
4377}
4378SLAB_ATTR_RO(object_size);
4379
4380static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4381{
4382        return sprintf(buf, "%d\n", oo_objects(s->oo));
4383}
4384SLAB_ATTR_RO(objs_per_slab);
4385
4386static ssize_t order_store(struct kmem_cache *s,
4387                                const char *buf, size_t length)
4388{
4389        unsigned long order;
4390        int err;
4391
4392        err = kstrtoul(buf, 10, &order);
4393        if (err)
4394                return err;
4395
4396        if (order > slub_max_order || order < slub_min_order)
4397                return -EINVAL;
4398
4399        calculate_sizes(s, order);
4400        return length;
4401}
4402
4403static ssize_t order_show(struct kmem_cache *s, char *buf)
4404{
4405        return sprintf(buf, "%d\n", oo_order(s->oo));
4406}
4407SLAB_ATTR(order);
4408
4409static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
4410{
4411        return sprintf(buf, "%lu\n", s->min_partial);
4412}
4413
4414static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4415                                 size_t length)
4416{
4417        unsigned long min;
4418        int err;
4419
4420        err = kstrtoul(buf, 10, &min);
4421        if (err)
4422                return err;
4423
4424        set_min_partial(s, min);
4425        return length;
4426}
4427SLAB_ATTR(min_partial);
4428
4429static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4430{
4431        return sprintf(buf, "%u\n", s->cpu_partial);
4432}
4433
4434static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4435                                 size_t length)
4436{
4437        unsigned long objects;
4438        int err;
4439
4440        err = kstrtoul(buf, 10, &objects);
4441        if (err)
4442                return err;
4443        if (objects && !kmem_cache_has_cpu_partial(s))
4444                return -EINVAL;
4445
4446        s->cpu_partial = objects;
4447        flush_all(s);
4448        return length;
4449}
4450SLAB_ATTR(cpu_partial);
4451
4452static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4453{
4454        if (!s->ctor)
4455                return 0;
4456        return sprintf(buf, "%pS\n", s->ctor);
4457}
4458SLAB_ATTR_RO(ctor);
4459
4460static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4461{
4462        return sprintf(buf, "%d\n", s->refcount - 1);
4463}
4464SLAB_ATTR_RO(aliases);
4465
4466static ssize_t partial_show(struct kmem_cache *s, char *buf)
4467{
4468        return show_slab_objects(s, buf, SO_PARTIAL);
4469}
4470SLAB_ATTR_RO(partial);
4471
4472static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
4473{
4474        return show_slab_objects(s, buf, SO_CPU);
4475}
4476SLAB_ATTR_RO(cpu_slabs);
4477
4478static ssize_t objects_show(struct kmem_cache *s, char *buf)
4479{
4480        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
4481}
4482SLAB_ATTR_RO(objects);
4483
4484static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4485{
4486        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
4487}
4488SLAB_ATTR_RO(objects_partial);
4489
4490static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4491{
4492        int objects = 0;
4493        int pages = 0;
4494        int cpu;
4495        int len;
4496
4497        for_each_online_cpu(cpu) {
4498                struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4499
4500                if (page) {
4501                        pages += page->pages;
4502                        objects += page->pobjects;
4503                }
4504        }
4505
4506        len = sprintf(buf, "%d(%d)", objects, pages);
4507
4508#ifdef CONFIG_SMP
4509        for_each_online_cpu(cpu) {
4510                struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4511
4512                if (page && len < PAGE_SIZE - 20)
4513                        len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4514                                page->pobjects, page->pages);
4515        }
4516#endif
4517        return len + sprintf(buf + len, "\n");
4518}
4519SLAB_ATTR_RO(slabs_cpu_partial);
4520
4521static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4522{
4523        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
4524}
4525
4526static ssize_t reclaim_account_store(struct kmem_cache *s,
4527                                const char *buf, size_t length)
4528{
4529        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4530        if (buf[0] == '1')
4531                s->flags |= SLAB_RECLAIM_ACCOUNT;
4532        return length;
4533}
4534SLAB_ATTR(reclaim_account);
4535
4536static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4537{
4538        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4539}
4540SLAB_ATTR_RO(hwcache_align);
4541
4542#ifdef CONFIG_ZONE_DMA
4543static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4544{
4545        return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4546}
4547SLAB_ATTR_RO(cache_dma);
4548#endif
4549
4550static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4551{
4552        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4553}
4554SLAB_ATTR_RO(destroy_by_rcu);
4555
4556static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4557{
4558        return sprintf(buf, "%d\n", s->reserved);
4559}
4560SLAB_ATTR_RO(reserved);
4561
4562#ifdef CONFIG_SLUB_DEBUG
4563static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4564{
4565        return show_slab_objects(s, buf, SO_ALL);
4566}
4567SLAB_ATTR_RO(slabs);
4568
4569static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4570{
4571        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4572}
4573SLAB_ATTR_RO(total_objects);
4574
4575static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4576{
4577        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4578}
4579
4580static ssize_t sanity_checks_store(struct kmem_cache *s,
4581                                const char *buf, size_t length)
4582{
4583        s->flags &= ~SLAB_DEBUG_FREE;
4584        if (buf[0] == '1') {
4585                s->flags &= ~__CMPXCHG_DOUBLE;
4586                s->flags |= SLAB_DEBUG_FREE;
4587        }
4588        return length;
4589}
4590SLAB_ATTR(sanity_checks);
4591
4592static ssize_t trace_show(struct kmem_cache *s, char *buf)
4593{
4594        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4595}
4596
4597static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4598                                                        size_t length)
4599{
4600        s->flags &= ~SLAB_TRACE;
4601        if (buf[0] == '1') {
4602                s->flags &= ~__CMPXCHG_DOUBLE;
4603                s->flags |= SLAB_TRACE;
4604        }
4605        return length;
4606}
4607SLAB_ATTR(trace);
4608
4609static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4610{
4611        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
4612}
4613
4614static ssize_t red_zone_store(struct kmem_cache *s,
4615                                const char *buf, size_t length)
4616{
4617        if (any_slab_objects(s))
4618                return -EBUSY;
4619
4620        s->flags &= ~SLAB_RED_ZONE;
4621        if (buf[0] == '1') {
4622                s->flags &= ~__CMPXCHG_DOUBLE;
4623                s->flags |= SLAB_RED_ZONE;
4624        }
4625        calculate_sizes(s, -1);
4626        return length;
4627}
4628SLAB_ATTR(red_zone);
4629
4630static ssize_t poison_show(struct kmem_cache *s, char *buf)
4631{
4632        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
4633}
4634
4635static ssize_t poison_store(struct kmem_cache *s,
4636                                const char *buf, size_t length)
4637{
4638        if (any_slab_objects(s))
4639                return -EBUSY;
4640
4641        s->flags &= ~SLAB_POISON;
4642        if (buf[0] == '1') {
4643                s->flags &= ~__CMPXCHG_DOUBLE;
4644                s->flags |= SLAB_POISON;
4645        }
4646        calculate_sizes(s, -1);
4647        return length;
4648}
4649SLAB_ATTR(poison);
4650
4651static ssize_t store_user_show(struct kmem_cache *s, char *buf)
4652{
4653        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
4654}
4655
4656static ssize_t store_user_store(struct kmem_cache *s,
4657                                const char *buf, size_t length)
4658{
4659        if (any_slab_objects(s))
4660                return -EBUSY;
4661
4662        s->flags &= ~SLAB_STORE_USER;
4663        if (buf[0] == '1') {
4664                s->flags &= ~__CMPXCHG_DOUBLE;
4665                s->flags |= SLAB_STORE_USER;
4666        }
4667        calculate_sizes(s, -1);
4668        return length;
4669}
4670SLAB_ATTR(store_user);
4671
4672static ssize_t validate_show(struct kmem_cache *s, char *buf)
4673{
4674        return 0;
4675}
4676
4677static ssize_t validate_store(struct kmem_cache *s,
4678                        const char *buf, size_t length)
4679{
4680        int ret = -EINVAL;
4681
4682        if (buf[0] == '1') {
4683                ret = validate_slab_cache(s);
4684                if (ret >= 0)
4685                        ret = length;
4686        }
4687        return ret;
4688}
4689SLAB_ATTR(validate);
4690
4691static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4692{
4693        if (!(s->flags & SLAB_STORE_USER))
4694                return -ENOSYS;
4695        return list_locations(s, buf, TRACK_ALLOC);
4696}
4697SLAB_ATTR_RO(alloc_calls);
4698
4699static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4700{
4701        if (!(s->flags & SLAB_STORE_USER))
4702                return -ENOSYS;
4703        return list_locations(s, buf, TRACK_FREE);
4704}
4705SLAB_ATTR_RO(free_calls);
4706#endif /* CONFIG_SLUB_DEBUG */
4707
4708#ifdef CONFIG_FAILSLAB
4709static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4710{
4711        return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4712}
4713
4714static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4715                                                        size_t length)
4716{
4717        s->flags &= ~SLAB_FAILSLAB;
4718        if (buf[0] == '1')
4719                s->flags |= SLAB_FAILSLAB;
4720        return length;
4721}
4722SLAB_ATTR(failslab);
4723#endif
4724
4725static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4726{
4727        return 0;
4728}
4729
4730static ssize_t shrink_store(struct kmem_cache *s,
4731                        const char *buf, size_t length)
4732{
4733        if (buf[0] == '1') {
4734                int rc = kmem_cache_shrink(s);
4735
4736                if (rc)
4737                        return rc;
4738        } else
4739                return -EINVAL;
4740        return length;
4741}
4742SLAB_ATTR(shrink);
4743
4744#ifdef CONFIG_NUMA
4745static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4746{
4747        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
4748}
4749
4750static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4751                                const char *buf, size_t length)
4752{
4753        unsigned long ratio;
4754        int err;
4755
4756        err = kstrtoul(buf, 10, &ratio);
4757        if (err)
4758                return err;
4759
4760        if (ratio <= 100)
4761                s->remote_node_defrag_ratio = ratio * 10;
4762
4763        return length;
4764}
4765SLAB_ATTR(remote_node_defrag_ratio);
4766#endif
4767
4768#ifdef CONFIG_SLUB_STATS
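/*
 * Per cpu statistics.  show_stat() sums one counter over all online cpus
 * and, on SMP, appends the per cpu breakdown; writing '0' to the matching
 * sysfs file clears the counter on every cpu.
 */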
4769static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4770{
4771        unsigned long sum  = 0;
4772        int cpu;
4773        int len;
4774        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4775
4776        if (!data)
4777                return -ENOMEM;
4778
4779        for_each_online_cpu(cpu) {
4780                unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4781
4782                data[cpu] = x;
4783                sum += x;
4784        }
4785
4786        len = sprintf(buf, "%lu", sum);
4787
4788#ifdef CONFIG_SMP
4789        for_each_online_cpu(cpu) {
4790                if (data[cpu] && len < PAGE_SIZE - 20)
4791                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4792        }
4793#endif
4794        kfree(data);
4795        return len + sprintf(buf + len, "\n");
4796}
4797
4798static void clear_stat(struct kmem_cache *s, enum stat_item si)
4799{
4800        int cpu;
4801
4802        for_each_online_cpu(cpu)
4803                per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4804}
4805
4806#define STAT_ATTR(si, text)                                     \
4807static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
4808{                                                               \
4809        return show_stat(s, buf, si);                           \
4810}                                                               \
4811static ssize_t text##_store(struct kmem_cache *s,               \
4812                                const char *buf, size_t length) \
4813{                                                               \
4814        if (buf[0] != '0')                                      \
4815                return -EINVAL;                                 \
4816        clear_stat(s, si);                                      \
4817        return length;                                          \
4818}                                                               \
4819SLAB_ATTR(text);                                                \
4820
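/*
 * For reference, STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath) below expands to
 * roughly the following pair of sysfs handlers plus the attribute
 * definition (a simplified sketch of the preprocessor output):
 *
 *         static ssize_t alloc_fastpath_show(struct kmem_cache *s, char *buf)
 *         {
 *                 return show_stat(s, buf, ALLOC_FASTPATH);
 *         }
 *         static ssize_t alloc_fastpath_store(struct kmem_cache *s,
 *                                 const char *buf, size_t length)
 *         {
 *                 if (buf[0] != '0')
 *                         return -EINVAL;
 *                 clear_stat(s, ALLOC_FASTPATH);
 *                 return length;
 *         }
 *         SLAB_ATTR(alloc_fastpath);
 *
 * so reading the file reports the counter and writing '0' resets it on
 * every online cpu.
 */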
4821STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4822STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
4823STAT_ATTR(FREE_FASTPATH, free_fastpath);
4824STAT_ATTR(FREE_SLOWPATH, free_slowpath);
4825STAT_ATTR(FREE_FROZEN, free_frozen);
4826STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
4827STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4828STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4829STAT_ATTR(ALLOC_SLAB, alloc_slab);
4830STAT_ATTR(ALLOC_REFILL, alloc_refill);
4831STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4832STAT_ATTR(FREE_SLAB, free_slab);
4833STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4834STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
4835STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4836STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4837STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4838STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4839STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4840STAT_ATTR(ORDER_FALLBACK, order_fallback);
4841STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4842STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4843STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
4844STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4845STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
4846STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
4847#endif
4848
4849static struct attribute *slab_attrs[] = {
4850        &slab_size_attr.attr,
4851        &object_size_attr.attr,
4852        &objs_per_slab_attr.attr,
4853        &order_attr.attr,
4854        &min_partial_attr.attr,
4855        &cpu_partial_attr.attr,
4856        &objects_attr.attr,
4857        &objects_partial_attr.attr,
4858        &partial_attr.attr,
4859        &cpu_slabs_attr.attr,
4860        &ctor_attr.attr,
4861        &aliases_attr.attr,
4862        &align_attr.attr,
4863        &hwcache_align_attr.attr,
4864        &reclaim_account_attr.attr,
4865        &destroy_by_rcu_attr.attr,
4866        &shrink_attr.attr,
4867        &reserved_attr.attr,
4868        &slabs_cpu_partial_attr.attr,
4869#ifdef CONFIG_SLUB_DEBUG
4870        &total_objects_attr.attr,
4871        &slabs_attr.attr,
4872        &sanity_checks_attr.attr,
4873        &trace_attr.attr,
4874        &red_zone_attr.attr,
4875        &poison_attr.attr,
4876        &store_user_attr.attr,
4877        &validate_attr.attr,
4878        &alloc_calls_attr.attr,
4879        &free_calls_attr.attr,
4880#endif
4881#ifdef CONFIG_ZONE_DMA
4882        &cache_dma_attr.attr,
4883#endif
4884#ifdef CONFIG_NUMA
4885        &remote_node_defrag_ratio_attr.attr,
4886#endif
4887#ifdef CONFIG_SLUB_STATS
4888        &alloc_fastpath_attr.attr,
4889        &alloc_slowpath_attr.attr,
4890        &free_fastpath_attr.attr,
4891        &free_slowpath_attr.attr,
4892        &free_frozen_attr.attr,
4893        &free_add_partial_attr.attr,
4894        &free_remove_partial_attr.attr,
4895        &alloc_from_partial_attr.attr,
4896        &alloc_slab_attr.attr,
4897        &alloc_refill_attr.attr,
4898        &alloc_node_mismatch_attr.attr,
4899        &free_slab_attr.attr,
4900        &cpuslab_flush_attr.attr,
4901        &deactivate_full_attr.attr,
4902        &deactivate_empty_attr.attr,
4903        &deactivate_to_head_attr.attr,
4904        &deactivate_to_tail_attr.attr,
4905        &deactivate_remote_frees_attr.attr,
4906        &deactivate_bypass_attr.attr,
4907        &order_fallback_attr.attr,
4908        &cmpxchg_double_fail_attr.attr,
4909        &cmpxchg_double_cpu_fail_attr.attr,
4910        &cpu_partial_alloc_attr.attr,
4911        &cpu_partial_free_attr.attr,
4912        &cpu_partial_node_attr.attr,
4913        &cpu_partial_drain_attr.attr,
4914#endif
4915#ifdef CONFIG_FAILSLAB
4916        &failslab_attr.attr,
4917#endif
4918
4919        NULL
4920};
4921
4922static struct attribute_group slab_attr_group = {
4923        .attrs = slab_attrs,
4924};
4925
4926static ssize_t slab_attr_show(struct kobject *kobj,
4927                                struct attribute *attr,
4928                                char *buf)
4929{
4930        struct slab_attribute *attribute;
4931        struct kmem_cache *s;
4932        int err;
4933
4934        attribute = to_slab_attr(attr);
4935        s = to_slab(kobj);
4936
4937        if (!attribute->show)
4938                return -EIO;
4939
4940        err = attribute->show(s, buf);
4941
4942        return err;
4943}
4944
4945static ssize_t slab_attr_store(struct kobject *kobj,
4946                                struct attribute *attr,
4947                                const char *buf, size_t len)
4948{
4949        struct slab_attribute *attribute;
4950        struct kmem_cache *s;
4951        int err;
4952
4953        attribute = to_slab_attr(attr);
4954        s = to_slab(kobj);
4955
4956        if (!attribute->store)
4957                return -EIO;
4958
4959        err = attribute->store(s, buf, len);
4960#ifdef CONFIG_MEMCG_KMEM
4961        if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
4962                int i;
4963
4964                mutex_lock(&slab_mutex);
4965                if (s->max_attr_size < len)
4966                        s->max_attr_size = len;
4967
4968                /*
4969                 * This is a best effort propagation, so this function's return
4970                 * value will be determined by the parent cache only. This is
4971                 * basically because not all attributes have well-defined
4972                 * rollback semantics - most of the actions will
4973                 * have permanent effects.
4974                 *
4975                 * Returning the error value of any of the children that fail
4976                 * is not 100 % defined, in the sense that users seeing the
4977                 * error code won't be able to know anything about the state of
4978                 * the cache.
4979                 *
4980                 * Only returning the error code for the parent cache at least
4981                 * has well defined semantics. The cache being written to
4982                 * directly either failed or succeeded; only in the latter case
4983                 * do we loop through the descendants with best-effort propagation.
4984                 */
4985                for_each_memcg_cache_index(i) {
4986                        struct kmem_cache *c = cache_from_memcg(s, i);
4987                        if (c)
4988                                attribute->store(c, buf, len);
4989                }
4990                mutex_unlock(&slab_mutex);
4991        }
4992#endif
4993        return err;
4994}
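/*
 * Example of the propagation behaviour above (cache and attribute names
 * are placeholders): when a write such as
 *
 *         # echo <value> > /sys/kernel/slab/<cache>/<attribute>
 *
 * succeeds on a root cache with kmemcg active, the same buffer is stored
 * into every per-memcg child cache returned by cache_from_memcg(). Errors
 * from the children are ignored; only the parent's return value reaches
 * user space.
 */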
4995
4996static void memcg_propagate_slab_attrs(struct kmem_cache *s)
4997{
4998#ifdef CONFIG_MEMCG_KMEM
4999        int i;
5000        char *buffer = NULL;
5001
5002        if (!is_root_cache(s))
5003                return;
5004
5005        /*
5006         * This means no attribute was ever written to this cache, so there
5007         * is no point in copying default values around.
5008         */
5009        if (!s->max_attr_size)
5010                return;
5011
5012        for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5013                char mbuf[64];
5014                char *buf;
5015                struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5016
5017                if (!attr || !attr->store || !attr->show)
5018                        continue;
5019
5020                /*
5021                 * It is really bad that we have to allocate here, so we will
5022                 * do it only as a fallback. If we actually allocate, though,
5023                 * we can just use the allocated buffer until the end.
5024                 *
5025                 * Most slub attribute values are very small, but sysfs allows
5026                 * buffers of up to a page, so page-sized values can
5027                 * theoretically occur.
5028                 */
5029                if (buffer)
5030                        buf = buffer;
5031                else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5032                        buf = mbuf;
5033                else {
5034                        buffer = (char *) get_zeroed_page(GFP_KERNEL);
5035                        if (WARN_ON(!buffer))
5036                                continue;
5037                        buf = buffer;
5038                }
5039
5040                attr->show(s->memcg_params->root_cache, buf);
5041                attr->store(s, buf, strlen(buf));
5042        }
5043
5044        if (buffer)
5045                free_page((unsigned long)buffer);
5046#endif
5047}
5048
5049static const struct sysfs_ops slab_sysfs_ops = {
5050        .show = slab_attr_show,
5051        .store = slab_attr_store,
5052};
5053
5054static struct kobj_type slab_ktype = {
5055        .sysfs_ops = &slab_sysfs_ops,
5056};
5057
5058static int uevent_filter(struct kset *kset, struct kobject *kobj)
5059{
5060        struct kobj_type *ktype = get_ktype(kobj);
5061
5062        if (ktype == &slab_ktype)
5063                return 1;
5064        return 0;
5065}
5066
5067static const struct kset_uevent_ops slab_uevent_ops = {
5068        .filter = uevent_filter,
5069};
5070
5071static struct kset *slab_kset;
5072
5073#define ID_STR_LENGTH 64
5074
5075/* Create a unique string id for a slab cache:
5076 *
5077 * Format       :[flags-]size
5078 */
5079static char *create_unique_id(struct kmem_cache *s)
5080{
5081        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5082        char *p = name;
5083
5084        BUG_ON(!name);
5085
5086        *p++ = ':';
5087        /*
5088         * First flags affecting slabcache operations. We will only
5089         * get here for aliasable slabs so we do not need to support
5090         * too many flags. The flags here must cover all flags that
5091         * are matched during merging to guarantee that the id is
5092         * unique.
5093         */
5094        if (s->flags & SLAB_CACHE_DMA)
5095                *p++ = 'd';
5096        if (s->flags & SLAB_RECLAIM_ACCOUNT)
5097                *p++ = 'a';
5098        if (s->flags & SLAB_DEBUG_FREE)
5099                *p++ = 'F';
5100        if (!(s->flags & SLAB_NOTRACK))
5101                *p++ = 't';
5102        if (p != name + 1)
5103                *p++ = '-';
5104        p += sprintf(p, "%07d", s->size);
5105
5106#ifdef CONFIG_MEMCG_KMEM
5107        if (!is_root_cache(s))
5108                p += sprintf(p, "-%08d",
5109                                memcg_cache_id(s->memcg_params->memcg));
5110#endif
5111
5112        BUG_ON(p > name + ID_STR_LENGTH - 1);
5113        return name;
5114}
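/*
 * Examples of ids produced by create_unique_id() (the sizes are
 * illustrative): a 192 byte cache with no special flags and kmemcheck
 * tracking enabled (SLAB_NOTRACK clear) becomes ":t-0000192"; the same
 * cache with SLAB_CACHE_DMA and SLAB_RECLAIM_ACCOUNT also set becomes
 * ":dat-0000192". A cache that sets none of the flag characters gets no
 * '-' separator at all, e.g. ":0000192".
 */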
5115
5116static int sysfs_slab_add(struct kmem_cache *s)
5117{
5118        int err;
5119        const char *name;
5120        int unmergeable = slab_unmergeable(s);
5121
5122        if (unmergeable) {
5123                /*
5124                 * The slab cache can never be merged so we can use its name as-is.
5125                 * This is typically the case for debug situations. In that
5126                 * case we can catch duplicate names easily.
5127                 */
5128                sysfs_remove_link(&slab_kset->kobj, s->name);
5129                name = s->name;
5130        } else {
5131                /*
5132                 * Create a unique name for the slab as a target
5133                 * for the symlinks.
5134                 */
5135                name = create_unique_id(s);
5136        }
5137
5138        s->kobj.kset = slab_kset;
5139        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
5140        if (err) {
5141                kobject_put(&s->kobj);
5142                return err;
5143        }
5144
5145        err = sysfs_create_group(&s->kobj, &slab_attr_group);
5146        if (err) {
5147                kobject_del(&s->kobj);
5148                kobject_put(&s->kobj);
5149                return err;
5150        }
5151        kobject_uevent(&s->kobj, KOBJ_ADD);
5152        if (!unmergeable) {
5153                /* Setup first alias */
5154                sysfs_slab_alias(s, s->name);
5155                kfree(name);
5156        }
5157        return 0;
5158}
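/*
 * Resulting sysfs layout (the names are illustrative): a mergeable cache
 * appears under its unique id with the human readable name added as a
 * symlink alias, while an unmergeable (e.g. debug-enabled) cache uses its
 * own name directly:
 *
 *         /sys/kernel/slab/:t-0000192
 *         /sys/kernel/slab/kmalloc-192 -> :t-0000192
 */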
5159
5160static void sysfs_slab_remove(struct kmem_cache *s)
5161{
5162        if (slab_state < FULL)
5163                /*
5164                 * Sysfs has not been setup yet so no need to remove the
5165                 * cache from sysfs.
5166                 */
5167                return;
5168
5169        kobject_uevent(&s->kobj, KOBJ_REMOVE);
5170        kobject_del(&s->kobj);
5171        kobject_put(&s->kobj);
5172}
5173
5174/*
5175 * Need to buffer aliases during bootup until sysfs becomes
5176 * available lest we lose that information.
5177 */
5178struct saved_alias {
5179        struct kmem_cache *s;
5180        const char *name;
5181        struct saved_alias *next;
5182};
5183
5184static struct saved_alias *alias_list;
5185
5186static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5187{
5188        struct saved_alias *al;
5189
5190        if (slab_state == FULL) {
5191                /*
5192                 * If we have a leftover link then remove it.
5193                 */
5194                sysfs_remove_link(&slab_kset->kobj, name);
5195                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
5196        }
5197
5198        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
5199        if (!al)
5200                return -ENOMEM;
5201
5202        al->s = s;
5203        al->name = name;
5204        al->next = alias_list;
5205        alias_list = al;
5206        return 0;
5207}
5208
5209static int __init slab_sysfs_init(void)
5210{
5211        struct kmem_cache *s;
5212        int err;
5213
5214        mutex_lock(&slab_mutex);
5215
5216        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5217        if (!slab_kset) {
5218                mutex_unlock(&slab_mutex);
5219                printk(KERN_ERR "Cannot register slab subsystem.\n");
5220                return -ENOSYS;
5221        }
5222
5223        slab_state = FULL;
5224
5225        list_for_each_entry(s, &slab_caches, list) {
5226                err = sysfs_slab_add(s);
5227                if (err)
5228                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
5229                                                " to sysfs\n", s->name);
5230        }
5231
5232        while (alias_list) {
5233                struct saved_alias *al = alias_list;
5234
5235                alias_list = alias_list->next;
5236                err = sysfs_slab_alias(al->s, al->name);
5237                if (err)
5238                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5239                                        " %s to sysfs\n", al->name);
5240                kfree(al);
5241        }
5242
5243        mutex_unlock(&slab_mutex);
5244        resiliency_test();
5245        return 0;
5246}
5247
5248__initcall(slab_sysfs_init);
5249#endif /* CONFIG_SYSFS */
5250
5251/*
5252 * The /proc/slabinfo ABI
5253 */
5254#ifdef CONFIG_SLABINFO
5255void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5256{
5257        unsigned long nr_slabs = 0;
5258        unsigned long nr_objs = 0;
5259        unsigned long nr_free = 0;
5260        int node;
5261
5262        for_each_online_node(node) {
5263                struct kmem_cache_node *n = get_node(s, node);
5264
5265                if (!n)
5266                        continue;
5267
5268                nr_slabs += node_nr_slabs(n);
5269                nr_objs += node_nr_objs(n);
5270                nr_free += count_partial(n, count_free);
5271        }
5272
5273        sinfo->active_objs = nr_objs - nr_free;
5274        sinfo->num_objs = nr_objs;
5275        sinfo->active_slabs = nr_slabs;
5276        sinfo->num_slabs = nr_slabs;
5277        sinfo->objects_per_slab = oo_objects(s->oo);
5278        sinfo->cache_order = oo_order(s->oo);
5279}
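/*
 * Worked example for get_slabinfo() (the numbers are made up): a cache
 * with 10 slabs of 32 objects each and 40 free objects counted on the
 * partial lists reports num_objs = 320, active_objs = 280 and num_slabs =
 * active_slabs = 10; SLUB does not track active slabs separately, so both
 * slab columns of /proc/slabinfo show the same value.
 */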
5280
5281void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5282{
5283}
5284
5285ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5286                       size_t count, loff_t *ppos)
5287{
5288        return -EIO;
5289}
5290#endif /* CONFIG_SLABINFO */
5291