linux/mm/slub.c
<<
>>
Prefs
   1/*
   2 * SLUB: A slab allocator that limits cache line use instead of queuing
   3 * objects in per cpu and per node lists.
   4 *
   5 * The allocator synchronizes using per slab locks and only
   6 * uses a centralized lock to manage a pool of partial slabs.
   7 *
   8 * (C) 2007 SGI, Christoph Lameter
   9 */
  10
  11#include <linux/mm.h>
  12#include <linux/module.h>
  13#include <linux/bit_spinlock.h>
  14#include <linux/interrupt.h>
  15#include <linux/bitops.h>
  16#include <linux/slab.h>
  17#include <linux/proc_fs.h>
  18#include <linux/seq_file.h>
  19#include <linux/cpu.h>
  20#include <linux/cpuset.h>
  21#include <linux/mempolicy.h>
  22#include <linux/ctype.h>
  23#include <linux/debugobjects.h>
  24#include <linux/kallsyms.h>
  25#include <linux/memory.h>
  26#include <linux/math64.h>
  27
  28/*
  29 * Lock order:
  30 *   1. slab_lock(page)
  31 *   2. slab->list_lock
  32 *
  33 *   The slab_lock protects operations on the object of a particular
  34 *   slab and its metadata in the page struct. If the slab lock
  35 *   has been taken then no allocations nor frees can be performed
  36 *   on the objects in the slab nor can the slab be added or removed
  37 *   from the partial or full lists since this would mean modifying
  38 *   the page_struct of the slab.
  39 *
  40 *   The list_lock protects the partial and full list on each node and
  41 *   the partial slab counter. If taken then no new slabs may be added or
  42 *   removed from the lists nor make the number of partial slabs be modified.
  43 *   (Note that the total number of slabs is an atomic value that may be
  44 *   modified without taking the list lock).
  45 *
  46 *   The list_lock is a centralized lock and thus we avoid taking it as
  47 *   much as possible. As long as SLUB does not have to handle partial
  48 *   slabs, operations can continue without any centralized lock. F.e.
  49 *   allocating a long series of objects that fill up slabs does not require
  50 *   the list lock.
  51 *
  52 *   The lock order is sometimes inverted when we are trying to get a slab
  53 *   off a list. We take the list_lock and then look for a page on the list
  54 *   to use. While we do that objects in the slabs may be freed. We can
  55 *   only operate on the slab if we have also taken the slab_lock. So we use
  56 *   a slab_trylock() on the slab. If trylock was successful then no frees
  57 *   can occur anymore and we can use the slab for allocations etc. If the
  58 *   slab_trylock() does not succeed then frees are in progress in the slab and
  59 *   we must stay away from it for a while since we may cause a bouncing
  60 *   cacheline if we try to acquire the lock. So go onto the next slab.
  61 *   If all pages are busy then we may allocate a new slab instead of reusing
   62 *   a partial slab. A new slab has no one operating on it and thus there is
  63 *   no danger of cacheline contention.
  64 *
  65 *   Interrupts are disabled during allocation and deallocation in order to
  66 *   make the slab allocator safe to use in the context of an irq. In addition
  67 *   interrupts are disabled to ensure that the processor does not change
  68 *   while handling per_cpu slabs, due to kernel preemption.
  69 *
  70 * SLUB assigns one slab for allocation to each processor.
  71 * Allocations only occur from these slabs called cpu slabs.
  72 *
  73 * Slabs with free elements are kept on a partial list and during regular
  74 * operations no list for full slabs is used. If an object in a full slab is
  75 * freed then the slab will show up again on the partial lists.
  76 * We track full slabs for debugging purposes though because otherwise we
  77 * cannot scan all objects.
  78 *
  79 * Slabs are freed when they become empty. Teardown and setup is
  80 * minimal so we rely on the page allocators per cpu caches for
  81 * fast frees and allocs.
  82 *
  83 * Overloading of page flags that are otherwise used for LRU management.
  84 *
  85 * PageActive           The slab is frozen and exempt from list processing.
  86 *                      This means that the slab is dedicated to a purpose
  87 *                      such as satisfying allocations for a specific
  88 *                      processor. Objects may be freed in the slab while
  89 *                      it is frozen but slab_free will then skip the usual
  90 *                      list operations. It is up to the processor holding
  91 *                      the slab to integrate the slab into the slab lists
  92 *                      when the slab is no longer needed.
  93 *
  94 *                      One use of this flag is to mark slabs that are
  95 *                      used for allocations. Then such a slab becomes a cpu
  96 *                      slab. The cpu slab may be equipped with an additional
  97 *                      freelist that allows lockless access to
  98 *                      free objects in addition to the regular freelist
  99 *                      that requires the slab lock.
 100 *
 101 * PageError            Slab requires special handling due to debug
 102 *                      options set. This moves slab handling out of
 103 *                      the fast path and disables lockless freelists.
 104 */
 105
 106#ifdef CONFIG_SLUB_DEBUG
 107#define SLABDEBUG 1
 108#else
 109#define SLABDEBUG 0
 110#endif
 111
 112/*
 113 * Issues still to be resolved:
 114 *
 115 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 116 *
 117 * - Variable sizing of the per node arrays
 118 */
 119
 120/* Enable to test recovery from slab corruption on boot */
 121#undef SLUB_RESILIENCY_TEST
 122
 123/*
  124 * Minimum number of partial slabs. These will be left on the partial
 125 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 126 */
 127#define MIN_PARTIAL 5
 128
 129/*
 130 * Maximum number of desirable partial slabs.
 131 * The existence of more partial slabs makes kmem_cache_shrink
  132 * sort the partial list by the number of objects in use.
 133 */
 134#define MAX_PARTIAL 10
 135
 136#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 137                                SLAB_POISON | SLAB_STORE_USER)
 138
 139/*
 140 * Set of flags that will prevent slab merging
 141 */
 142#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 143                SLAB_TRACE | SLAB_DESTROY_BY_RCU)
 144
 145#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 146                SLAB_CACHE_DMA)
 147
 148#ifndef ARCH_KMALLOC_MINALIGN
 149#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 150#endif
 151
 152#ifndef ARCH_SLAB_MINALIGN
 153#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 154#endif
 155
 156/* Internal SLUB flags */
 157#define __OBJECT_POISON         0x80000000 /* Poison object */
 158#define __SYSFS_ADD_DEFERRED    0x40000000 /* Not yet visible via sysfs */
 159
 160static int kmem_size = sizeof(struct kmem_cache);
 161
 162#ifdef CONFIG_SMP
 163static struct notifier_block slab_notifier;
 164#endif
 165
 166static enum {
 167        DOWN,           /* No slab functionality available */
 168        PARTIAL,        /* kmem_cache_open() works but kmalloc does not */
 169        UP,             /* Everything works but does not show up in sysfs */
 170        SYSFS           /* Sysfs up */
 171} slab_state = DOWN;
 172
 173/* A list of all slab caches on the system */
 174static DECLARE_RWSEM(slub_lock);
 175static LIST_HEAD(slab_caches);
 176
 177/*
 178 * Tracking user of a slab.
 179 */
 180struct track {
 181        void *addr;             /* Called from address */
 182        int cpu;                /* Was running on cpu */
 183        int pid;                /* Pid context */
 184        unsigned long when;     /* When did the operation occur */
 185};
 186
 187enum track_item { TRACK_ALLOC, TRACK_FREE };
 188
 189#ifdef CONFIG_SLUB_DEBUG
 190static int sysfs_slab_add(struct kmem_cache *);
 191static int sysfs_slab_alias(struct kmem_cache *, const char *);
 192static void sysfs_slab_remove(struct kmem_cache *);
 193
 194#else
 195static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 196static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 197                                                        { return 0; }
 198static inline void sysfs_slab_remove(struct kmem_cache *s)
 199{
 200        kfree(s);
 201}
 202
 203#endif
 204
 205static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
 206{
 207#ifdef CONFIG_SLUB_STATS
 208        c->stat[si]++;
 209#endif
 210}
 211
 212/********************************************************************
 213 *                      Core slab cache functions
 214 *******************************************************************/
 215
 216int slab_is_available(void)
 217{
 218        return slab_state >= UP;
 219}
 220
 221static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 222{
 223#ifdef CONFIG_NUMA
 224        return s->node[node];
 225#else
 226        return &s->local_node;
 227#endif
 228}
 229
 230static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
 231{
 232#ifdef CONFIG_SMP
 233        return s->cpu_slab[cpu];
 234#else
 235        return &s->cpu_slab;
 236#endif
 237}
 238
 239/* Verify that a pointer has an address that is valid within a slab page */
 240static inline int check_valid_pointer(struct kmem_cache *s,
 241                                struct page *page, const void *object)
 242{
 243        void *base;
 244
 245        if (!object)
 246                return 1;
 247
 248        base = page_address(page);
 249        if (object < base || object >= base + page->objects * s->size ||
 250                (object - base) % s->size) {
 251                return 0;
 252        }
 253
 254        return 1;
 255}
 256
 257/*
 258 * Slow version of get and set free pointer.
 259 *
 260 * This version requires touching the cache lines of kmem_cache which
  261 * we avoid doing in the fast alloc/free paths. There we obtain the offset
 262 * from the page struct.
 263 */
 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
 265{
 266        return *(void **)(object + s->offset);
 267}
 268
 269static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 270{
 271        *(void **)(object + s->offset) = fp;
 272}
 273
 274/* Loop over all objects in a slab */
 275#define for_each_object(__p, __s, __addr, __objects) \
 276        for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
 277                        __p += (__s)->size)
 278
 279/* Scan freelist */
 280#define for_each_free_object(__p, __s, __free) \
 281        for (__p = (__free); __p; __p = get_freepointer((__s), __p))
 282
 283/* Determine object index from a given position */
 284static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 285{
 286        return (p - addr) / s->size;
 287}
 288
 289static inline struct kmem_cache_order_objects oo_make(int order,
 290                                                unsigned long size)
 291{
 292        struct kmem_cache_order_objects x = {
 293                (order << 16) + (PAGE_SIZE << order) / size
 294        };
 295
 296        return x;
 297}
 298
 299static inline int oo_order(struct kmem_cache_order_objects x)
 300{
 301        return x.x >> 16;
 302}
 303
 304static inline int oo_objects(struct kmem_cache_order_objects x)
 305{
 306        return x.x & ((1 << 16) - 1);
 307}
 308
 309#ifdef CONFIG_SLUB_DEBUG
 310/*
 311 * Debug settings:
 312 */
 313#ifdef CONFIG_SLUB_DEBUG_ON
 314static int slub_debug = DEBUG_DEFAULT_FLAGS;
 315#else
 316static int slub_debug;
 317#endif
 318
 319static char *slub_debug_slabs;
 320
 321/*
 322 * Object debugging
 323 */
 324static void print_section(char *text, u8 *addr, unsigned int length)
 325{
 326        int i, offset;
 327        int newline = 1;
 328        char ascii[17];
 329
 330        ascii[16] = 0;
 331
 332        for (i = 0; i < length; i++) {
 333                if (newline) {
 334                        printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
 335                        newline = 0;
 336                }
 337                printk(KERN_CONT " %02x", addr[i]);
 338                offset = i % 16;
 339                ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 340                if (offset == 15) {
 341                        printk(KERN_CONT " %s\n", ascii);
 342                        newline = 1;
 343                }
 344        }
 345        if (!newline) {
 346                i %= 16;
 347                while (i < 16) {
 348                        printk(KERN_CONT "   ");
 349                        ascii[i] = ' ';
 350                        i++;
 351                }
 352                printk(KERN_CONT " %s\n", ascii);
 353        }
 354}
 355
 356static struct track *get_track(struct kmem_cache *s, void *object,
 357        enum track_item alloc)
 358{
 359        struct track *p;
 360
 361        if (s->offset)
 362                p = object + s->offset + sizeof(void *);
 363        else
 364                p = object + s->inuse;
 365
 366        return p + alloc;
 367}
 368
 369static void set_track(struct kmem_cache *s, void *object,
 370                                enum track_item alloc, void *addr)
 371{
 372        struct track *p;
 373
 374        if (s->offset)
 375                p = object + s->offset + sizeof(void *);
 376        else
 377                p = object + s->inuse;
 378
 379        p += alloc;
 380        if (addr) {
 381                p->addr = addr;
 382                p->cpu = smp_processor_id();
 383                p->pid = current->pid;
 384                p->when = jiffies;
 385        } else
 386                memset(p, 0, sizeof(struct track));
 387}
 388
 389static void init_tracking(struct kmem_cache *s, void *object)
 390{
 391        if (!(s->flags & SLAB_STORE_USER))
 392                return;
 393
 394        set_track(s, object, TRACK_FREE, NULL);
 395        set_track(s, object, TRACK_ALLOC, NULL);
 396}
 397
 398static void print_track(const char *s, struct track *t)
 399{
 400        if (!t->addr)
 401                return;
 402
 403        printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 404                s, t->addr, jiffies - t->when, t->cpu, t->pid);
 405}
 406
 407static void print_tracking(struct kmem_cache *s, void *object)
 408{
 409        if (!(s->flags & SLAB_STORE_USER))
 410                return;
 411
 412        print_track("Allocated", get_track(s, object, TRACK_ALLOC));
 413        print_track("Freed", get_track(s, object, TRACK_FREE));
 414}
 415
 416static void print_page_info(struct page *page)
 417{
 418        printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
 419                page, page->objects, page->inuse, page->freelist, page->flags);
 420
 421}
 422
 423static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 424{
 425        va_list args;
 426        char buf[100];
 427
 428        va_start(args, fmt);
 429        vsnprintf(buf, sizeof(buf), fmt, args);
 430        va_end(args);
 431        printk(KERN_ERR "========================================"
 432                        "=====================================\n");
 433        printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
 434        printk(KERN_ERR "----------------------------------------"
 435                        "-------------------------------------\n\n");
 436}
 437
 438static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 439{
 440        va_list args;
 441        char buf[100];
 442
 443        va_start(args, fmt);
 444        vsnprintf(buf, sizeof(buf), fmt, args);
 445        va_end(args);
 446        printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
 447}
 448
 449static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 450{
 451        unsigned int off;       /* Offset of last byte */
 452        u8 *addr = page_address(page);
 453
 454        print_tracking(s, p);
 455
 456        print_page_info(page);
 457
 458        printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
 459                        p, p - addr, get_freepointer(s, p));
 460
 461        if (p > addr + 16)
 462                print_section("Bytes b4", p - 16, 16);
 463
 464        print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
 465
 466        if (s->flags & SLAB_RED_ZONE)
 467                print_section("Redzone", p + s->objsize,
 468                        s->inuse - s->objsize);
 469
 470        if (s->offset)
 471                off = s->offset + sizeof(void *);
 472        else
 473                off = s->inuse;
 474
 475        if (s->flags & SLAB_STORE_USER)
 476                off += 2 * sizeof(struct track);
 477
 478        if (off != s->size)
 479                /* Beginning of the filler is the free pointer */
 480                print_section("Padding", p + off, s->size - off);
 481
 482        dump_stack();
 483}
 484
 485static void object_err(struct kmem_cache *s, struct page *page,
 486                        u8 *object, char *reason)
 487{
 488        slab_bug(s, "%s", reason);
 489        print_trailer(s, page, object);
 490}
 491
 492static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
 493{
 494        va_list args;
 495        char buf[100];
 496
 497        va_start(args, fmt);
 498        vsnprintf(buf, sizeof(buf), fmt, args);
 499        va_end(args);
 500        slab_bug(s, "%s", buf);
 501        print_page_info(page);
 502        dump_stack();
 503}
 504
 505static void init_object(struct kmem_cache *s, void *object, int active)
 506{
 507        u8 *p = object;
 508
 509        if (s->flags & __OBJECT_POISON) {
 510                memset(p, POISON_FREE, s->objsize - 1);
 511                p[s->objsize - 1] = POISON_END;
 512        }
 513
 514        if (s->flags & SLAB_RED_ZONE)
 515                memset(p + s->objsize,
 516                        active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
 517                        s->inuse - s->objsize);
 518}
 519
 520static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 521{
 522        while (bytes) {
 523                if (*start != (u8)value)
 524                        return start;
 525                start++;
 526                bytes--;
 527        }
 528        return NULL;
 529}
 530
 531static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 532                                                void *from, void *to)
 533{
 534        slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 535        memset(from, data, to - from);
 536}
 537
 538static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 539                        u8 *object, char *what,
 540                        u8 *start, unsigned int value, unsigned int bytes)
 541{
 542        u8 *fault;
 543        u8 *end;
 544
 545        fault = check_bytes(start, value, bytes);
 546        if (!fault)
 547                return 1;
 548
 549        end = start + bytes;
 550        while (end > fault && end[-1] == value)
 551                end--;
 552
 553        slab_bug(s, "%s overwritten", what);
 554        printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
 555                                        fault, end - 1, fault[0], value);
 556        print_trailer(s, page, object);
 557
 558        restore_bytes(s, what, value, fault, end);
 559        return 0;
 560}
 561
 562/*
 563 * Object layout:
 564 *
 565 * object address
 566 *      Bytes of the object to be managed.
 567 *      If the freepointer may overlay the object then the free
 568 *      pointer is the first word of the object.
 569 *
 570 *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 571 *      0xa5 (POISON_END)
 572 *
 573 * object + s->objsize
 574 *      Padding to reach word boundary. This is also used for Redzoning.
 575 *      Padding is extended by another word if Redzoning is enabled and
 576 *      objsize == inuse.
 577 *
 578 *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 579 *      0xcc (RED_ACTIVE) for objects in use.
 580 *
 581 * object + s->inuse
 582 *      Meta data starts here.
 583 *
 584 *      A. Free pointer (if we cannot overwrite object on free)
 585 *      B. Tracking data for SLAB_STORE_USER
  586 *      C. Padding to reach required alignment boundary or at minimum
 587 *              one word if debugging is on to be able to detect writes
 588 *              before the word boundary.
 589 *
 590 *      Padding is done using 0x5a (POISON_INUSE)
 591 *
 592 * object + s->size
 593 *      Nothing is used beyond s->size.
 594 *
 595 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 596 * ignored. And therefore no slab options that rely on these boundaries
 597 * may be used with merged slabcaches.
 598 */
 599
 600static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 601{
 602        unsigned long off = s->inuse;   /* The end of info */
 603
 604        if (s->offset)
 605                /* Freepointer is placed after the object. */
 606                off += sizeof(void *);
 607
 608        if (s->flags & SLAB_STORE_USER)
 609                /* We also have user information there */
 610                off += 2 * sizeof(struct track);
 611
 612        if (s->size == off)
 613                return 1;
 614
 615        return check_bytes_and_report(s, page, p, "Object padding",
 616                                p + off, POISON_INUSE, s->size - off);
 617}
 618
 619/* Check the pad bytes at the end of a slab page */
 620static int slab_pad_check(struct kmem_cache *s, struct page *page)
 621{
 622        u8 *start;
 623        u8 *fault;
 624        u8 *end;
 625        int length;
 626        int remainder;
 627
 628        if (!(s->flags & SLAB_POISON))
 629                return 1;
 630
 631        start = page_address(page);
 632        length = (PAGE_SIZE << compound_order(page));
 633        end = start + length;
 634        remainder = length % s->size;
 635        if (!remainder)
 636                return 1;
 637
 638        fault = check_bytes(end - remainder, POISON_INUSE, remainder);
 639        if (!fault)
 640                return 1;
 641        while (end > fault && end[-1] == POISON_INUSE)
 642                end--;
 643
 644        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
 645        print_section("Padding", end - remainder, remainder);
 646
 647        restore_bytes(s, "slab padding", POISON_INUSE, start, end);
 648        return 0;
 649}
 650
 651static int check_object(struct kmem_cache *s, struct page *page,
 652                                        void *object, int active)
 653{
 654        u8 *p = object;
 655        u8 *endobject = object + s->objsize;
 656
 657        if (s->flags & SLAB_RED_ZONE) {
 658                unsigned int red =
 659                        active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
 660
 661                if (!check_bytes_and_report(s, page, object, "Redzone",
 662                        endobject, red, s->inuse - s->objsize))
 663                        return 0;
 664        } else {
 665                if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
 666                        check_bytes_and_report(s, page, p, "Alignment padding",
 667                                endobject, POISON_INUSE, s->inuse - s->objsize);
 668                }
 669        }
 670
 671        if (s->flags & SLAB_POISON) {
 672                if (!active && (s->flags & __OBJECT_POISON) &&
 673                        (!check_bytes_and_report(s, page, p, "Poison", p,
 674                                        POISON_FREE, s->objsize - 1) ||
 675                         !check_bytes_and_report(s, page, p, "Poison",
 676                                p + s->objsize - 1, POISON_END, 1)))
 677                        return 0;
 678                /*
 679                 * check_pad_bytes cleans up on its own.
 680                 */
 681                check_pad_bytes(s, page, p);
 682        }
 683
 684        if (!s->offset && active)
 685                /*
 686                 * Object and freepointer overlap. Cannot check
 687                 * freepointer while object is allocated.
 688                 */
 689                return 1;
 690
 691        /* Check free pointer validity */
 692        if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 693                object_err(s, page, p, "Freepointer corrupt");
 694                /*
 695                 * No choice but to zap it and thus loose the remainder
 696                 * of the free objects in this slab. May cause
 697                 * another error because the object count is now wrong.
 698                 */
 699                set_freepointer(s, p, NULL);
 700                return 0;
 701        }
 702        return 1;
 703}
 704
 705static int check_slab(struct kmem_cache *s, struct page *page)
 706{
 707        int maxobj;
 708
 709        VM_BUG_ON(!irqs_disabled());
 710
 711        if (!PageSlab(page)) {
 712                slab_err(s, page, "Not a valid slab page");
 713                return 0;
 714        }
 715
 716        maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
 717        if (page->objects > maxobj) {
 718                slab_err(s, page, "objects %u > max %u",
 719                        s->name, page->objects, maxobj);
 720                return 0;
 721        }
 722        if (page->inuse > page->objects) {
 723                slab_err(s, page, "inuse %u > max %u",
 724                        s->name, page->inuse, page->objects);
 725                return 0;
 726        }
 727        /* Slab_pad_check fixes things up after itself */
 728        slab_pad_check(s, page);
 729        return 1;
 730}
 731
 732/*
 733 * Determine if a certain object on a page is on the freelist. Must hold the
 734 * slab lock to guarantee that the chains are in a consistent state.
 735 */
 736static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 737{
 738        int nr = 0;
 739        void *fp = page->freelist;
 740        void *object = NULL;
 741        unsigned long max_objects;
 742
 743        while (fp && nr <= page->objects) {
 744                if (fp == search)
 745                        return 1;
 746                if (!check_valid_pointer(s, page, fp)) {
 747                        if (object) {
 748                                object_err(s, page, object,
 749                                        "Freechain corrupt");
 750                                set_freepointer(s, object, NULL);
 751                                break;
 752                        } else {
 753                                slab_err(s, page, "Freepointer corrupt");
 754                                page->freelist = NULL;
 755                                page->inuse = page->objects;
 756                                slab_fix(s, "Freelist cleared");
 757                                return 0;
 758                        }
 759                        break;
 760                }
 761                object = fp;
 762                fp = get_freepointer(s, object);
 763                nr++;
 764        }
 765
 766        max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
 767        if (max_objects > 65535)
 768                max_objects = 65535;
 769
 770        if (page->objects != max_objects) {
 771                slab_err(s, page, "Wrong number of objects. Found %d but "
 772                        "should be %d", page->objects, max_objects);
 773                page->objects = max_objects;
 774                slab_fix(s, "Number of objects adjusted.");
 775        }
 776        if (page->inuse != page->objects - nr) {
 777                slab_err(s, page, "Wrong object count. Counter is %d but "
 778                        "counted were %d", page->inuse, page->objects - nr);
 779                page->inuse = page->objects - nr;
 780                slab_fix(s, "Object count adjusted.");
 781        }
 782        return search == NULL;
 783}
 784
 785static void trace(struct kmem_cache *s, struct page *page, void *object,
 786                                                                int alloc)
 787{
 788        if (s->flags & SLAB_TRACE) {
 789                printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 790                        s->name,
 791                        alloc ? "alloc" : "free",
 792                        object, page->inuse,
 793                        page->freelist);
 794
 795                if (!alloc)
 796                        print_section("Object", (void *)object, s->objsize);
 797
 798                dump_stack();
 799        }
 800}
 801
 802/*
 803 * Tracking of fully allocated slabs for debugging purposes.
 804 */
 805static void add_full(struct kmem_cache_node *n, struct page *page)
 806{
 807        spin_lock(&n->list_lock);
 808        list_add(&page->lru, &n->full);
 809        spin_unlock(&n->list_lock);
 810}
 811
 812static void remove_full(struct kmem_cache *s, struct page *page)
 813{
 814        struct kmem_cache_node *n;
 815
 816        if (!(s->flags & SLAB_STORE_USER))
 817                return;
 818
 819        n = get_node(s, page_to_nid(page));
 820
 821        spin_lock(&n->list_lock);
 822        list_del(&page->lru);
 823        spin_unlock(&n->list_lock);
 824}
 825
 826/* Tracking of the number of slabs for debugging purposes */
 827static inline unsigned long slabs_node(struct kmem_cache *s, int node)
 828{
 829        struct kmem_cache_node *n = get_node(s, node);
 830
 831        return atomic_long_read(&n->nr_slabs);
 832}
 833
 834static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
 835{
 836        struct kmem_cache_node *n = get_node(s, node);
 837
 838        /*
 839         * May be called early in order to allocate a slab for the
 840         * kmem_cache_node structure. Solve the chicken-egg
 841         * dilemma by deferring the increment of the count during
 842         * bootstrap (see early_kmem_cache_node_alloc).
 843         */
 844        if (!NUMA_BUILD || n) {
 845                atomic_long_inc(&n->nr_slabs);
 846                atomic_long_add(objects, &n->total_objects);
 847        }
 848}
 849static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
 850{
 851        struct kmem_cache_node *n = get_node(s, node);
 852
 853        atomic_long_dec(&n->nr_slabs);
 854        atomic_long_sub(objects, &n->total_objects);
 855}
 856
 857/* Object debug checks for alloc/free paths */
 858static void setup_object_debug(struct kmem_cache *s, struct page *page,
 859                                                                void *object)
 860{
 861        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
 862                return;
 863
 864        init_object(s, object, 0);
 865        init_tracking(s, object);
 866}
 867
 868static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
 869                                                void *object, void *addr)
 870{
 871        if (!check_slab(s, page))
 872                goto bad;
 873
 874        if (!on_freelist(s, page, object)) {
 875                object_err(s, page, object, "Object already allocated");
 876                goto bad;
 877        }
 878
 879        if (!check_valid_pointer(s, page, object)) {
 880                object_err(s, page, object, "Freelist Pointer check fails");
 881                goto bad;
 882        }
 883
 884        if (!check_object(s, page, object, 0))
 885                goto bad;
 886
 887        /* Success perform special debug activities for allocs */
 888        if (s->flags & SLAB_STORE_USER)
 889                set_track(s, object, TRACK_ALLOC, addr);
 890        trace(s, page, object, 1);
 891        init_object(s, object, 1);
 892        return 1;
 893
 894bad:
 895        if (PageSlab(page)) {
 896                /*
 897                 * If this is a slab page then lets do the best we can
 898                 * to avoid issues in the future. Marking all objects
 899                 * as used avoids touching the remaining objects.
 900                 */
 901                slab_fix(s, "Marking all objects used");
 902                page->inuse = page->objects;
 903                page->freelist = NULL;
 904        }
 905        return 0;
 906}
 907
 908static int free_debug_processing(struct kmem_cache *s, struct page *page,
 909                                                void *object, void *addr)
 910{
 911        if (!check_slab(s, page))
 912                goto fail;
 913
 914        if (!check_valid_pointer(s, page, object)) {
 915                slab_err(s, page, "Invalid object pointer 0x%p", object);
 916                goto fail;
 917        }
 918
 919        if (on_freelist(s, page, object)) {
 920                object_err(s, page, object, "Object already free");
 921                goto fail;
 922        }
 923
 924        if (!check_object(s, page, object, 1))
 925                return 0;
 926
 927        if (unlikely(s != page->slab)) {
 928                if (!PageSlab(page)) {
 929                        slab_err(s, page, "Attempt to free object(0x%p) "
 930                                "outside of slab", object);
 931                } else if (!page->slab) {
 932                        printk(KERN_ERR
 933                                "SLUB <none>: no slab for object 0x%p.\n",
 934                                                object);
 935                        dump_stack();
 936                } else
 937                        object_err(s, page, object,
 938                                        "page slab pointer corrupt.");
 939                goto fail;
 940        }
 941
 942        /* Special debug activities for freeing objects */
 943        if (!PageSlubFrozen(page) && !page->freelist)
 944                remove_full(s, page);
 945        if (s->flags & SLAB_STORE_USER)
 946                set_track(s, object, TRACK_FREE, addr);
 947        trace(s, page, object, 0);
 948        init_object(s, object, 0);
 949        return 1;
 950
 951fail:
 952        slab_fix(s, "Object at 0x%p not freed", object);
 953        return 0;
 954}
 955
 956static int __init setup_slub_debug(char *str)
 957{
 958        slub_debug = DEBUG_DEFAULT_FLAGS;
 959        if (*str++ != '=' || !*str)
 960                /*
 961                 * No options specified. Switch on full debugging.
 962                 */
 963                goto out;
 964
 965        if (*str == ',')
 966                /*
 967                 * No options but restriction on slabs. This means full
 968                 * debugging for slabs matching a pattern.
 969                 */
 970                goto check_slabs;
 971
 972        slub_debug = 0;
 973        if (*str == '-')
 974                /*
 975                 * Switch off all debugging measures.
 976                 */
 977                goto out;
 978
 979        /*
 980         * Determine which debug features should be switched on
 981         */
 982        for (; *str && *str != ','; str++) {
 983                switch (tolower(*str)) {
 984                case 'f':
 985                        slub_debug |= SLAB_DEBUG_FREE;
 986                        break;
 987                case 'z':
 988                        slub_debug |= SLAB_RED_ZONE;
 989                        break;
 990                case 'p':
 991                        slub_debug |= SLAB_POISON;
 992                        break;
 993                case 'u':
 994                        slub_debug |= SLAB_STORE_USER;
 995                        break;
 996                case 't':
 997                        slub_debug |= SLAB_TRACE;
 998                        break;
 999                default:
1000                        printk(KERN_ERR "slub_debug option '%c' "
1001                                "unknown. skipped\n", *str);
1002                }
1003        }
1004
1005check_slabs:
1006        if (*str == ',')
1007                slub_debug_slabs = str + 1;
1008out:
1009        return 1;
1010}
1011
1012__setup("slub_debug", setup_slub_debug);
1013
1014static unsigned long kmem_cache_flags(unsigned long objsize,
1015        unsigned long flags, const char *name,
1016        void (*ctor)(void *))
1017{
1018        /*
1019         * Enable debugging if selected on the kernel commandline.
1020         */
1021        if (slub_debug && (!slub_debug_slabs ||
1022            strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
1023                        flags |= slub_debug;
1024
1025        return flags;
1026}
1027#else
/*
 * Stub variants for the #else branch, i.e. for builds where the debug
 * implementations above are compiled out. Each one does nothing and
 * returns a fixed value.
 */

/* Nothing to set up for a new object. */
static inline void setup_object_debug(struct kmem_cache *s,
			struct page *page, void *object)
{
}

/* Always returns 0. */
static inline int alloc_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, void *addr)
{
	return 0;
}

/* Always returns 0. */
static inline int free_debug_processing(struct kmem_cache *s,
	struct page *page, void *object, void *addr)
{
	return 0;
}

/* Always returns 1. */
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	return 1;
}

/* Always returns 1. */
static inline int check_object(struct kmem_cache *s, struct page *page,
			void *object, int active)
{
	return 1;
}

/* No full list is maintained. */
static inline void add_full(struct kmem_cache_node *n, struct page *page)
{
}

/* Flags are passed through unchanged. */
static inline unsigned long kmem_cache_flags(unsigned long objsize,
	unsigned long flags, const char *name,
	void (*ctor)(void *))
{
	return flags;
}

/* Lets tests of slub_debug evaluate to a constant 0. */
#define slub_debug 0

/* No per-node slab counters: report 0 and ignore updates. */
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{
	return 0;
}

static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects)
{
}

static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects)
{
}
1056#endif
1057
1058/*
1059 * Slab allocation and freeing
1060 */
1061static inline struct page *alloc_slab_page(gfp_t flags, int node,
1062                                        struct kmem_cache_order_objects oo)
1063{
1064        int order = oo_order(oo);
1065
1066        if (node == -1)
1067                return alloc_pages(flags, order);
1068        else
1069                return alloc_pages_node(node, flags, order);
1070}
1071
1072static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1073{
1074        struct page *page;
1075        struct kmem_cache_order_objects oo = s->oo;
1076
1077        flags |= s->allocflags;
1078
1079        page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node,
1080                                                                        oo);
1081        if (unlikely(!page)) {
1082                oo = s->min;
1083                /*
1084                 * Allocation may have failed due to fragmentation.
1085                 * Try a lower order alloc if possible
1086                 */
1087                page = alloc_slab_page(flags, node, oo);
1088                if (!page)
1089                        return NULL;
1090
1091                stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1092        }
1093        page->objects = oo_objects(oo);
1094        mod_zone_page_state(page_zone(page),
1095                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1096                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1097                1 << oo_order(oo));
1098
1099        return page;
1100}
1101
1102static void setup_object(struct kmem_cache *s, struct page *page,
1103                                void *object)
1104{
1105        setup_object_debug(s, page, object);
1106        if (unlikely(s->ctor))
1107                s->ctor(object);
1108}
1109
1110static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1111{
1112        struct page *page;
1113        void *start;
1114        void *last;
1115        void *p;
1116
1117        BUG_ON(flags & GFP_SLAB_BUG_MASK);
1118
1119        page = allocate_slab(s,
1120                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1121        if (!page)
1122                goto out;
1123
1124        inc_slabs_node(s, page_to_nid(page), page->objects);
1125        page->slab = s;
1126        page->flags |= 1 << PG_slab;
1127        if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1128                        SLAB_STORE_USER | SLAB_TRACE))
1129                __SetPageSlubDebug(page);
1130
1131        start = page_address(page);
1132
1133        if (unlikely(s->flags & SLAB_POISON))
1134                memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
1135
1136        last = start;
1137        for_each_object(p, s, start, page->objects) {
1138                setup_object(s, page, last);
1139                set_freepointer(s, last, p);
1140                last = p;
1141        }
1142        setup_object(s, page, last);
1143        set_freepointer(s, last, NULL);
1144
1145        page->freelist = start;
1146        page->inuse = 0;
1147out:
1148        return page;
1149}
1150
1151static void __free_slab(struct kmem_cache *s, struct page *page)
1152{
1153        int order = compound_order(page);
1154        int pages = 1 << order;
1155
1156        if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1157                void *p;
1158
1159                slab_pad_check(s, page);
1160                for_each_object(p, s, page_address(page),
1161                                                page->objects)
1162                        check_object(s, page, p, 0);
1163                __ClearPageSlubDebug(page);
1164        }
1165
1166        mod_zone_page_state(page_zone(page),
1167                (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1168                NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1169                -pages);
1170
1171        __ClearPageSlab(page);
1172        reset_page_mapcount(page);
1173        __free_pages(page, order);
1174}
1175
1176static void rcu_free_slab(struct rcu_head *h)
1177{
1178        struct page *page;
1179
1180        page = container_of((struct list_head *)h, struct page, lru);
1181        __free_slab(page->slab, page);
1182}
1183
1184static void free_slab(struct kmem_cache *s, struct page *page)
1185{
1186        if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1187                /*
1188                 * RCU free overloads the RCU head over the LRU
1189                 */
1190                struct rcu_head *head = (void *)&page->lru;
1191
1192                call_rcu(head, rcu_free_slab);
1193        } else
1194                __free_slab(s, page);
1195}
1196
1197static void discard_slab(struct kmem_cache *s, struct page *page)
1198{
1199        dec_slabs_node(s, page_to_nid(page), page->objects);
1200        free_slab(s, page);
1201}
1202
1203/*
1204 * Per slab locking using the pagelock
1205 */
1206static __always_inline void slab_lock(struct page *page)
1207{
1208        bit_spin_lock(PG_locked, &page->flags);
1209}
1210
1211static __always_inline void slab_unlock(struct page *page)
1212{
1213        __bit_spin_unlock(PG_locked, &page->flags);
1214}
1215
1216static __always_inline int slab_trylock(struct page *page)
1217{
1218        int rc = 1;
1219
1220        rc = bit_spin_trylock(PG_locked, &page->flags);
1221        return rc;
1222}
1223
1224/*
1225 * Management of partially allocated slabs
1226 */
1227static void add_partial(struct kmem_cache_node *n,
1228                                struct page *page, int tail)
1229{
1230        spin_lock(&n->list_lock);
1231        n->nr_partial++;
1232        if (tail)
1233                list_add_tail(&page->lru, &n->partial);
1234        else
1235                list_add(&page->lru, &n->partial);
1236        spin_unlock(&n->list_lock);
1237}
1238
1239static void remove_partial(struct kmem_cache *s, struct page *page)
1240{
1241        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1242
1243        spin_lock(&n->list_lock);
1244        list_del(&page->lru);
1245        n->nr_partial--;
1246        spin_unlock(&n->list_lock);
1247}
1248
1249/*
1250 * Lock slab and remove from the partial list.
1251 *
1252 * Must hold list_lock.
1253 */
1254static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1255                                                        struct page *page)
1256{
1257        if (slab_trylock(page)) {
1258                list_del(&page->lru);
1259                n->nr_partial--;
1260                __SetPageSlubFrozen(page);
1261                return 1;
1262        }
1263        return 0;
1264}
1265
1266/*
1267 * Try to allocate a partial slab from a specific node.
1268 */
1269static struct page *get_partial_node(struct kmem_cache_node *n)
1270{
1271        struct page *page;
1272
1273        /*
1274         * Racy check. If we mistakenly see no partial slabs then we
1275         * just allocate an empty slab. If we mistakenly try to get a
1276         * partial slab and there is none available then get_partials()
1277         * will return NULL.
1278         */
1279        if (!n || !n->nr_partial)
1280                return NULL;
1281
1282        spin_lock(&n->list_lock);
1283        list_for_each_entry(page, &n->partial, lru)
1284                if (lock_and_freeze_slab(n, page))
1285                        goto out;
1286        page = NULL;
1287out:
1288        spin_unlock(&n->list_lock);
1289        return page;
1290}
1291
1292/*
1293 * Get a page from somewhere. Search in increasing NUMA distances.
1294 */
1295static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1296{
1297#ifdef CONFIG_NUMA
1298        struct zonelist *zonelist;
1299        struct zoneref *z;
1300        struct zone *zone;
1301        enum zone_type high_zoneidx = gfp_zone(flags);
1302        struct page *page;
1303
1304        /*
1305         * The defrag ratio allows a configuration of the tradeoffs between
1306         * inter node defragmentation and node local allocations. A lower
1307         * defrag_ratio increases the tendency to do local allocations
1308         * instead of attempting to obtain partial slabs from other nodes.
1309         *
1310         * If the defrag_ratio is set to 0 then kmalloc() always
1311         * returns node local objects. If the ratio is higher then kmalloc()
1312         * may return off node objects because partial slabs are obtained
1313         * from other nodes and filled up.
1314         *
1315         * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
1316         * defrag_ratio = 1000) then every (well almost) allocation will
1317         * first attempt to defrag slab caches on other nodes. This means
1318         * scanning over all nodes to look for partial slabs which may be
1319         * expensive if we do it every time we are trying to find a slab
1320         * with available objects.
1321         */
1322        if (!s->remote_node_defrag_ratio ||
1323                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
1324                return NULL;
1325
1326        zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1327        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1328                struct kmem_cache_node *n;
1329
1330                n = get_node(s, zone_to_nid(zone));
1331
1332                if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1333                                n->nr_partial > n->min_partial) {
1334                        page = get_partial_node(n);
1335                        if (page)
1336                                return page;
1337                }
1338        }
1339#endif
1340        return NULL;
1341}
1342
1343/*
1344 * Get a partial page, lock it and return it.
1345 */
1346static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1347{
1348        struct page *page;
1349        int searchnode = (node == -1) ? numa_node_id() : node;
1350
1351        page = get_partial_node(get_node(s, searchnode));
1352        if (page || (flags & __GFP_THISNODE))
1353                return page;
1354
1355        return get_any_partial(s, flags);
1356}
1357
1358/*
1359 * Move a page back to the lists.
1360 *
1361 * Must be called with the slab lock held.
1362 *
1363 * On exit the slab lock will have been dropped.
1364 */
1365static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1366{
1367        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1368        struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1369
1370        __ClearPageSlubFrozen(page);
1371        if (page->inuse) {
1372
1373                if (page->freelist) {
1374                        add_partial(n, page, tail);
1375                        stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1376                } else {
1377                        stat(c, DEACTIVATE_FULL);
1378                        if (SLABDEBUG && PageSlubDebug(page) &&
1379                                                (s->flags & SLAB_STORE_USER))
1380                                add_full(n, page);
1381                }
1382                slab_unlock(page);
1383        } else {
1384                stat(c, DEACTIVATE_EMPTY);
1385                if (n->nr_partial < n->min_partial) {
1386                        /*
1387                         * Adding an empty slab to the partial slabs in order
1388                         * to avoid page allocator overhead. This slab needs
1389                         * to come after the other slabs with objects in
1390                         * so that the others get filled first. That way the
1391                         * size of the partial list stays small.
1392                         *
1393                         * kmem_cache_shrink can reclaim any empty slabs from
1394                         * the partial list.
1395                         */
1396                        add_partial(n, page, 1);
1397                        slab_unlock(page);
1398                } else {
1399                        slab_unlock(page);
1400                        stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
1401                        discard_slab(s, page);
1402                }
1403        }
1404}
1405
1406/*
1407 * Remove the cpu slab
1408 */
1409static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1410{
1411        struct page *page = c->page;
1412        int tail = 1;
1413
1414        if (page->freelist)
1415                stat(c, DEACTIVATE_REMOTE_FREES);
1416        /*
1417         * Merge cpu freelist into slab freelist. Typically we get here
1418         * because both freelists are empty. So this is unlikely
1419         * to occur.
1420         */
1421        while (unlikely(c->freelist)) {
1422                void **object;
1423
1424                tail = 0;       /* Hot objects. Put the slab first */
1425
1426                /* Retrieve object from cpu_freelist */
1427                object = c->freelist;
1428                c->freelist = c->freelist[c->offset];
1429
1430                /* And put onto the regular freelist */
1431                object[c->offset] = page->freelist;
1432                page->freelist = object;
1433                page->inuse--;
1434        }
1435        c->page = NULL;
1436        unfreeze_slab(s, page, tail);
1437}
1438
1439static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1440{
1441        stat(c, CPUSLAB_FLUSH);
1442        slab_lock(c->page);
1443        deactivate_slab(s, c);
1444}
1445
1446/*
1447 * Flush cpu slab.
1448 *
1449 * Called from IPI handler with interrupts disabled.
1450 */
1451static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1452{
1453        struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1454
1455        if (likely(c && c->page))
1456                flush_slab(s, c);
1457}
1458
/*
 * Per-cpu callback for flush_all(): @d is the kmem_cache whose slab
 * on the executing cpu is to be flushed.
 */
static void flush_cpu_slab(void *d)
{
	struct kmem_cache *cache = d;

	__flush_cpu_slab(cache, smp_processor_id());
}
1465
1466static void flush_all(struct kmem_cache *s)
1467{
1468        on_each_cpu(flush_cpu_slab, s, 1);
1469}
1470
/*
 * Tell whether the objects cached in per cpu structure @c satisfy the
 * NUMA locality request @node.
 *
 * Returns 1 if they do. That is always the case without CONFIG_NUMA
 * or when @node is -1 (no preference); otherwise c->node must equal
 * @node.
 */
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
	return node == -1 || c->node == node;
#else
	return 1;
#endif
}
1483
1484/*
1485 * Slow path. The lockless freelist is empty or we need to perform
1486 * debugging duties.
1487 *
1488 * Interrupts are disabled.
1489 *
1490 * Processing is still very fast if new objects have been freed to the
1491 * regular freelist. In that case we simply take over the regular freelist
1492 * as the lockless freelist and zap the regular freelist.
1493 *
1494 * If that is not working then we fall back to the partial lists. We take the
1495 * first element of the freelist as the object to allocate now and move the
1496 * rest of the freelist to the lockless freelist.
1497 *
1498 * And if we were unable to get a new slab from the partial slab lists then
1499 * we need to allocate a new slab. This is the slowest path since it involves
1500 * a call to the page allocator and the setup of a new slab.
1501 */
1502static void *__slab_alloc(struct kmem_cache *s,
1503                gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
1504{
1505        void **object;
1506        struct page *new;
1507
1508        /* We handle __GFP_ZERO in the caller */
1509        gfpflags &= ~__GFP_ZERO;
1510
1511        if (!c->page)
1512                goto new_slab;
1513
1514        slab_lock(c->page);
1515        if (unlikely(!node_match(c, node)))
1516                goto another_slab;
1517
1518        stat(c, ALLOC_REFILL);
1519
1520load_freelist:
1521        object = c->page->freelist;
1522        if (unlikely(!object))
1523                goto another_slab;
1524        if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1525                goto debug;
1526
1527        c->freelist = object[c->offset];
1528        c->page->inuse = c->page->objects;
1529        c->page->freelist = NULL;
1530        c->node = page_to_nid(c->page);
1531unlock_out:
1532        slab_unlock(c->page);
1533        stat(c, ALLOC_SLOWPATH);
1534        return object;
1535
1536another_slab:
1537        deactivate_slab(s, c);
1538
1539new_slab:
1540        new = get_partial(s, gfpflags, node);
1541        if (new) {
1542                c->page = new;
1543                stat(c, ALLOC_FROM_PARTIAL);
1544                goto load_freelist;
1545        }
1546
1547        if (gfpflags & __GFP_WAIT)
1548                local_irq_enable();
1549
1550        new = new_slab(s, gfpflags, node);
1551
1552        if (gfpflags & __GFP_WAIT)
1553                local_irq_disable();
1554
1555        if (new) {
1556                c = get_cpu_slab(s, smp_processor_id());
1557                stat(c, ALLOC_SLAB);
1558                if (c->page)
1559                        flush_slab(s, c);
1560                slab_lock(new);
1561                __SetPageSlubFrozen(new);
1562                c->page = new;
1563                goto load_freelist;
1564        }
1565        return NULL;
1566debug:
1567        if (!alloc_debug_processing(s, c->page, object, addr))
1568                goto another_slab;
1569
1570        c->page->inuse++;
1571        c->page->freelist = object[c->offset];
1572        c->node = -1;
1573        goto unlock_out;
1574}
1575
1576/*
1577 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
1578 * have the fastpath folded into their functions. So no function call
1579 * overhead for requests that can be satisfied on the fastpath.
1580 *
1581 * The fastpath works by first checking if the lockless freelist can be used.
1582 * If not then __slab_alloc is called for slow processing.
1583 *
1584 * Otherwise we can simply pick the next object from the lockless free list.
1585 */
1586static __always_inline void *slab_alloc(struct kmem_cache *s,
1587                gfp_t gfpflags, int node, void *addr)
1588{
1589        void **object;
1590        struct kmem_cache_cpu *c;
1591        unsigned long flags;
1592        unsigned int objsize;
1593
1594        local_irq_save(flags);
1595        c = get_cpu_slab(s, smp_processor_id());
1596        objsize = c->objsize;
1597        if (unlikely(!c->freelist || !node_match(c, node)))
1598
1599                object = __slab_alloc(s, gfpflags, node, addr, c);
1600
1601        else {
1602                object = c->freelist;
1603                c->freelist = object[c->offset];
1604                stat(c, ALLOC_FASTPATH);
1605        }
1606        local_irq_restore(flags);
1607
1608        if (unlikely((gfpflags & __GFP_ZERO) && object))
1609                memset(object, 0, objsize);
1610
1611        return object;
1612}
1613
1614void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1615{
1616        return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1617}
1618EXPORT_SYMBOL(kmem_cache_alloc);
1619
1620#ifdef CONFIG_NUMA
1621void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1622{
1623        return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
1624}
1625EXPORT_SYMBOL(kmem_cache_alloc_node);
1626#endif
1627
1628/*
1629 * Slow patch handling. This may still be called frequently since objects
1630 * have a longer lifetime than the cpu slabs in most processing loads.
1631 *
1632 * So we still attempt to reduce cache line usage. Just take the slab
1633 * lock and free the item. If there is no additional partial page
1634 * handling required then we can return immediately.
1635 */
1636static void __slab_free(struct kmem_cache *s, struct page *page,
1637                                void *x, void *addr, unsigned int offset)
1638{
1639        void *prior;
1640        void **object = (void *)x;
1641        struct kmem_cache_cpu *c;
1642
1643        c = get_cpu_slab(s, raw_smp_processor_id());
1644        stat(c, FREE_SLOWPATH);
1645        slab_lock(page);
1646
1647        if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1648                goto debug;
1649
1650checks_ok:
1651        prior = object[offset] = page->freelist;
1652        page->freelist = object;
1653        page->inuse--;
1654
1655        if (unlikely(PageSlubFrozen(page))) {
1656                stat(c, FREE_FROZEN);
1657                goto out_unlock;
1658        }
1659
1660        if (unlikely(!page->inuse))
1661                goto slab_empty;
1662
1663        /*
1664         * Objects left in the slab. If it was not on the partial list before
1665         * then add it.
1666         */
1667        if (unlikely(!prior)) {
1668                add_partial(get_node(s, page_to_nid(page)), page, 1);
1669                stat(c, FREE_ADD_PARTIAL);
1670        }
1671
1672out_unlock:
1673        slab_unlock(page);
1674        return;
1675
1676slab_empty:
1677        if (prior) {
1678                /*
1679                 * Slab still on the partial list.
1680                 */
1681                remove_partial(s, page);
1682                stat(c, FREE_REMOVE_PARTIAL);
1683        }
1684        slab_unlock(page);
1685        stat(c, FREE_SLAB);
1686        discard_slab(s, page);
1687        return;
1688
1689debug:
1690        if (!free_debug_processing(s, page, x, addr))
1691                goto out_unlock;
1692        goto checks_ok;
1693}
1694
1695/*
1696 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
1697 * can perform fastpath freeing without additional function calls.
1698 *
1699 * The fastpath is only possible if we are freeing to the current cpu slab
1700 * of this processor. This typically the case if we have just allocated
1701 * the item before.
1702 *
1703 * If fastpath is not possible then fall back to __slab_free where we deal
1704 * with all sorts of special processing.
1705 */
1706static __always_inline void slab_free(struct kmem_cache *s,
1707                        struct page *page, void *x, void *addr)
1708{
1709        void **object = (void *)x;
1710        struct kmem_cache_cpu *c;
1711        unsigned long flags;
1712
1713        local_irq_save(flags);
1714        c = get_cpu_slab(s, smp_processor_id());
1715        debug_check_no_locks_freed(object, c->objsize);
1716        if (!(s->flags & SLAB_DEBUG_OBJECTS))
1717                debug_check_no_obj_freed(object, s->objsize);
1718        if (likely(page == c->page && c->node >= 0)) {
1719                object[c->offset] = c->freelist;
1720                c->freelist = object;
1721                stat(c, FREE_FASTPATH);
1722        } else
1723                __slab_free(s, page, x, addr, c->offset);
1724
1725        local_irq_restore(flags);
1726}
1727
/*
 * Free object @x back to cache @s. The slab is found through the head
 * page of the object's address; the caller's return address is passed
 * along for debug tracking.
 */
void kmem_cache_free(struct kmem_cache *s, void *x)
{
	struct page *page = virt_to_head_page(x);

	slab_free(s, page, x, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_free);
1737
1738/* Figure out on which slab object the object resides */
1739static struct page *get_object_page(const void *x)
1740{
1741        struct page *page = virt_to_head_page(x);
1742
1743        if (!PageSlab(page))
1744                return NULL;
1745
1746        return page;
1747}
1748
1749/*
1750 * Object placement in a slab is made very easy because we always start at
1751 * offset 0. If we tune the size of the object to the alignment then we can
1752 * get the required alignment by putting one properly sized object after
1753 * another.
1754 *
1755 * Notice that the allocation order determines the sizes of the per cpu
1756 * caches. Each processor has always one slab available for allocations.
1757 * Increasing the allocation order reduces the number of times that slabs
1758 * must be moved on and off the partial lists and is therefore a factor in
1759 * locking overhead.
1760 */
1761
1762/*
1763 * Mininum / Maximum order of slab pages. This influences locking overhead
1764 * and slab fragmentation. A higher order reduces the number of partial slabs
1765 * and increases the number of allocations possible without having to
1766 * take the list_lock.
1767 */
1768static int slub_min_order;
1769static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
1770static int slub_min_objects;
1771
/*
 * Merge control. When this is set, slab caches are not merged.
 * (Could be removed. This was introduced to pacify the merge skeptics.)
 */
static int slub_nomerge;
1777
1778/*
1779 * Calculate the order of allocation given an slab object size.
1780 *
1781 * The order of allocation has significant impact on performance and other
1782 * system components. Generally order 0 allocations should be preferred since
1783 * order 0 does not cause fragmentation in the page allocator. Larger objects
1784 * be problematic to put into order 0 slabs because there may be too much
1785 * unused space left. We go to a higher order if more than 1/16th of the slab
1786 * would be wasted.
1787 *
1788 * In order to reach satisfactory performance we must ensure that a minimum
1789 * number of objects is in one slab. Otherwise we may generate too much
1790 * activity on the partial lists which requires taking the list_lock. This is
1791 * less a concern for large slabs though which are rarely used.
1792 *
1793 * slub_max_order specifies the order where we begin to stop considering the
1794 * number of objects in a slab as critical. If we reach slub_max_order then
1795 * we try to keep the page order as low as possible. So we accept more waste
1796 * of space in favor of a small page order.
1797 *
1798 * Higher order allocations also allow the placement of more objects in a
1799 * slab and thereby reduce object handling overhead. If the user has
1800 * requested a higher mininum order then we start with that one instead of
1801 * the smallest order which will fit the object.
1802 */
1803static inline int slab_order(int size, int min_objects,
1804                                int max_order, int fract_leftover)
1805{
1806        int order;
1807        int rem;
1808        int min_order = slub_min_order;
1809
1810        if ((PAGE_SIZE << min_order) / size > 65535)
1811                return get_order(size * 65535) - 1;
1812
1813        for (order = max(min_order,
1814                                fls(min_objects * size - 1) - PAGE_SHIFT);
1815                        order <= max_order; order++) {
1816
1817                unsigned long slab_size = PAGE_SIZE << order;
1818
1819                if (slab_size < min_objects * size)
1820                        continue;
1821
1822                rem = slab_size % size;
1823
1824                if (rem <= slab_size / fract_leftover)
1825                        break;
1826
1827        }
1828
1829        return order;
1830}
1831
1832static inline int calculate_order(int size)
1833{
1834        int order;
1835        int min_objects;
1836        int fraction;
1837
1838        /*
1839         * Attempt to find best configuration for a slab. This
1840         * works by first attempting to generate a layout with
1841         * the best configuration and backing off gradually.
1842         *
1843         * First we reduce the acceptable waste in a slab. Then
1844         * we reduce the minimum objects required in a slab.
1845         */
1846        min_objects = slub_min_objects;
1847        if (!min_objects)
1848                min_objects = 4 * (fls(nr_cpu_ids) + 1);
1849        while (min_objects > 1) {
1850                fraction = 16;
1851                while (fraction >= 4) {
1852                        order = slab_order(size, min_objects,
1853                                                slub_max_order, fraction);
1854                        if (order <= slub_max_order)
1855                                return order;
1856                        fraction /= 2;
1857                }
1858                min_objects /= 2;
1859        }
1860
1861        /*
1862         * We were unable to place multiple objects in a slab. Now
1863         * lets see if we can place a single object there.
1864         */
1865        order = slab_order(size, 1, slub_max_order, 1);
1866        if (order <= slub_max_order)
1867                return order;
1868
1869        /*
1870         * Doh this slab cannot be placed using slub_max_order.
1871         */
1872        order = slab_order(size, 1, MAX_ORDER, 1);
1873        if (order <= MAX_ORDER)
1874                return order;
1875        return -ENOSYS;
1876}
1877
1878/*
1879 * Figure out what the alignment of the objects will be.
1880 */
1881static unsigned long calculate_alignment(unsigned long flags,
1882                unsigned long align, unsigned long size)
1883{
1884        /*
1885         * If the user wants hardware cache aligned objects then follow that
1886         * suggestion if the object is sufficiently large.
1887         *
1888         * The hardware cache alignment cannot override the specified
1889         * alignment though. If that is greater then use it.
1890         */
1891        if (flags & SLAB_HWCACHE_ALIGN) {
1892                unsigned long ralign = cache_line_size();
1893                while (size <= ralign / 2)
1894                        ralign /= 2;
1895                align = max(align, ralign);
1896        }
1897
1898        if (align < ARCH_SLAB_MINALIGN)
1899                align = ARCH_SLAB_MINALIGN;
1900
1901        return ALIGN(align, sizeof(void *));
1902}
1903
1904static void init_kmem_cache_cpu(struct kmem_cache *s,
1905                        struct kmem_cache_cpu *c)
1906{
1907        c->page = NULL;
1908        c->freelist = NULL;
1909        c->node = 0;
1910        c->offset = s->offset / sizeof(void *);
1911        c->objsize = s->objsize;
1912#ifdef CONFIG_SLUB_STATS
1913        memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
1914#endif
1915}
1916
1917static void
1918init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1919{
1920        n->nr_partial = 0;
1921
1922        /*
1923         * The larger the object size is, the more pages we want on the partial
1924         * list to avoid pounding the page allocator excessively.
1925         */
1926        n->min_partial = ilog2(s->size);
1927        if (n->min_partial < MIN_PARTIAL)
1928                n->min_partial = MIN_PARTIAL;
1929        else if (n->min_partial > MAX_PARTIAL)
1930                n->min_partial = MAX_PARTIAL;
1931
1932        spin_lock_init(&n->list_lock);
1933        INIT_LIST_HEAD(&n->partial);
1934#ifdef CONFIG_SLUB_DEBUG
1935        atomic_long_set(&n->nr_slabs, 0);
1936        atomic_long_set(&n->total_objects, 0);
1937        INIT_LIST_HEAD(&n->full);
1938#endif
1939}
1940
1941#ifdef CONFIG_SMP
1942/*
1943 * Per cpu array for per cpu structures.
1944 *
1945 * The per cpu array places all kmem_cache_cpu structures from one processor
1946 * close together meaning that it becomes possible that multiple per cpu
1947 * structures are contained in one cacheline. This may be particularly
1948 * beneficial for the kmalloc caches.
1949 *
1950 * A desktop system typically has around 60-80 slabs. With 100 here we are
1951 * likely able to get per cpu structures for all caches from the array defined
1952 * here. We must be able to cover all kmalloc caches during bootstrap.
1953 *
1954 * If the per cpu array is exhausted then fall back to kmalloc
1955 * of individual cachelines. No sharing is possible then.
1956 */
1957#define NR_KMEM_CACHE_CPU 100
1958
1959static DEFINE_PER_CPU(struct kmem_cache_cpu,
1960                                kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1961
1962static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1963static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
1964
1965static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1966                                                        int cpu, gfp_t flags)
1967{
1968        struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
1969
1970        if (c)
1971                per_cpu(kmem_cache_cpu_free, cpu) =
1972                                (void *)c->freelist;
1973        else {
1974                /* Table overflow: So allocate ourselves */
1975                c = kmalloc_node(
1976                        ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
1977                        flags, cpu_to_node(cpu));
1978                if (!c)
1979                        return NULL;
1980        }
1981
1982        init_kmem_cache_cpu(s, c);
1983        return c;
1984}
1985
1986static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1987{
1988        if (c < per_cpu(kmem_cache_cpu, cpu) ||
1989                        c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1990                kfree(c);
1991                return;
1992        }
1993        c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
1994        per_cpu(kmem_cache_cpu_free, cpu) = c;
1995}
1996
1997static void free_kmem_cache_cpus(struct kmem_cache *s)
1998{
1999        int cpu;
2000
2001        for_each_online_cpu(cpu) {
2002                struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2003
2004                if (c) {
2005                        s->cpu_slab[cpu] = NULL;
2006                        free_kmem_cache_cpu(c, cpu);
2007                }
2008        }
2009}
2010
2011static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2012{
2013        int cpu;
2014
2015        for_each_online_cpu(cpu) {
2016                struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2017
2018                if (c)
2019                        continue;
2020
2021                c = alloc_kmem_cache_cpu(s, cpu, flags);
2022                if (!c) {
2023                        free_kmem_cache_cpus(s);
2024                        return 0;
2025                }
2026                s->cpu_slab[cpu] = c;
2027        }
2028        return 1;
2029}
2030
2031/*
2032 * Initialize the per cpu array.
2033 */
2034static void init_alloc_cpu_cpu(int cpu)
2035{
2036        int i;
2037
2038        if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
2039                return;
2040
2041        for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2042                free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2043
2044        cpu_set(cpu, kmem_cach_cpu_free_init_once);
2045}
2046
2047static void __init init_alloc_cpu(void)
2048{
2049        int cpu;
2050
2051        for_each_online_cpu(cpu)
2052                init_alloc_cpu_cpu(cpu);
2053  }
2054
2055#else
/* !CONFIG_SMP: there are no separately allocated per cpu structures. */
static inline void free_kmem_cache_cpus(struct kmem_cache *s)
{
}

/* !CONFIG_SMP: no per cpu array that would need to be set up. */
static inline void init_alloc_cpu(void)
{
}
2058
2059static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2060{
2061        init_kmem_cache_cpu(s, &s->cpu_slab);
2062        return 1;
2063}
2064#endif
2065
2066#ifdef CONFIG_NUMA
2067/*
2068 * No kmalloc_node yet so do it by hand. We know that this is the first
2069 * slab on the node for this slabcache. There are no concurrent accesses
2070 * possible.
2071 *
2072 * Note that this function only works on the kmalloc_node_cache
2073 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2074 * memory on a fresh node that has no slab structures yet.
2075 */
2076static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2077                                                           int node)
2078{
2079        struct page *page;
2080        struct kmem_cache_node *n;
2081        unsigned long flags;
2082
2083        BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
2084
2085        page = new_slab(kmalloc_caches, gfpflags, node);
2086
2087        BUG_ON(!page);
2088        if (page_to_nid(page) != node) {
2089                printk(KERN_ERR "SLUB: Unable to allocate memory from "
2090                                "node %d\n", node);
2091                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
2092                                "in order to be able to continue\n");
2093        }
2094
2095        n = page->freelist;
2096        BUG_ON(!n);
2097        page->freelist = get_freepointer(kmalloc_caches, n);
2098        page->inuse++;
2099        kmalloc_caches->node[node] = n;
2100#ifdef CONFIG_SLUB_DEBUG
2101        init_object(kmalloc_caches, n, 1);
2102        init_tracking(kmalloc_caches, n);
2103#endif
2104        init_kmem_cache_node(n, kmalloc_caches);
2105        inc_slabs_node(kmalloc_caches, node, page->objects);
2106
2107        /*
2108         * lockdep requires consistent irq usage for each lock
2109         * so even though there cannot be a race this early in
2110         * the boot sequence, we still disable irqs.
2111         */
2112        local_irq_save(flags);
2113        add_partial(n, page, 0);
2114        local_irq_restore(flags);
2115        return n;
2116}
2117
2118static void free_kmem_cache_nodes(struct kmem_cache *s)
2119{
2120        int node;
2121
2122        for_each_node_state(node, N_NORMAL_MEMORY) {
2123                struct kmem_cache_node *n = s->node[node];
2124                if (n && n != &s->local_node)
2125                        kmem_cache_free(kmalloc_caches, n);
2126                s->node[node] = NULL;
2127        }
2128}
2129
2130static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2131{
2132        int node;
2133        int local_node;
2134
2135        if (slab_state >= UP)
2136                local_node = page_to_nid(virt_to_page(s));
2137        else
2138                local_node = 0;
2139
2140        for_each_node_state(node, N_NORMAL_MEMORY) {
2141                struct kmem_cache_node *n;
2142
2143                if (local_node == node)
2144                        n = &s->local_node;
2145                else {
2146                        if (slab_state == DOWN) {
2147                                n = early_kmem_cache_node_alloc(gfpflags,
2148                                                                node);
2149                                continue;
2150                        }
2151                        n = kmem_cache_alloc_node(kmalloc_caches,
2152                                                        gfpflags, node);
2153
2154                        if (!n) {
2155                                free_kmem_cache_nodes(s);
2156                                return 0;
2157                        }
2158
2159                }
2160                s->node[node] = n;
2161                init_kmem_cache_node(n, s);
2162        }
2163        return 1;
2164}
2165#else
/* !CONFIG_NUMA: the only node structure is embedded, nothing to free. */
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	/* intentionally empty */
}
2169
2170static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2171{
2172        init_kmem_cache_node(&s->local_node, s);
2173        return 1;
2174}
2175#endif
2176
2177/*
2178 * calculate_sizes() determines the order and the distribution of data within
2179 * a slab object.
2180 */
2181static int calculate_sizes(struct kmem_cache *s, int forced_order)
2182{
2183        unsigned long flags = s->flags;
2184        unsigned long size = s->objsize;
2185        unsigned long align = s->align;
2186        int order;
2187
2188        /*
2189         * Round up object size to the next word boundary. We can only
2190         * place the free pointer at word boundaries and this determines
2191         * the possible location of the free pointer.
2192         */
2193        size = ALIGN(size, sizeof(void *));
2194
2195#ifdef CONFIG_SLUB_DEBUG
2196        /*
2197         * Determine if we can poison the object itself. If the user of
2198         * the slab may touch the object after free or before allocation
2199         * then we should never poison the object itself.
2200         */
2201        if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
2202                        !s->ctor)
2203                s->flags |= __OBJECT_POISON;
2204        else
2205                s->flags &= ~__OBJECT_POISON;
2206
2207
2208        /*
2209         * If we are Redzoning then check if there is some space between the
2210         * end of the object and the free pointer. If not then add an
2211         * additional word to have some bytes to store Redzone information.
2212         */
2213        if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2214                size += sizeof(void *);
2215#endif
2216
2217        /*
2218         * With that we have determined the number of bytes in actual use
2219         * by the object. This is the potential offset to the free pointer.
2220         */
2221        s->inuse = size;
2222
2223        if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
2224                s->ctor)) {
2225                /*
2226                 * Relocate free pointer after the object if it is not
2227                 * permitted to overwrite the first word of the object on
2228                 * kmem_cache_free.
2229                 *
2230                 * This is the case if we do RCU, have a constructor or
2231                 * destructor or are poisoning the objects.
2232                 */
2233                s->offset = size;
2234                size += sizeof(void *);
2235        }
2236
2237#ifdef CONFIG_SLUB_DEBUG
2238        if (flags & SLAB_STORE_USER)
2239                /*
2240                 * Need to store information about allocs and frees after
2241                 * the object.
2242                 */
2243                size += 2 * sizeof(struct track);
2244
2245        if (flags & SLAB_RED_ZONE)
2246                /*
2247                 * Add some empty padding so that we can catch
2248                 * overwrites from earlier objects rather than let
2249                 * tracking information or the free pointer be
2250                 * corrupted if an user writes before the start
2251                 * of the object.
2252                 */
2253                size += sizeof(void *);
2254#endif
2255
2256        /*
2257         * Determine the alignment based on various parameters that the
2258         * user specified and the dynamic determination of cache line size
2259         * on bootup.
2260         */
2261        align = calculate_alignment(flags, align, s->objsize);
2262
2263        /*
2264         * SLUB stores one object immediately after another beginning from
2265         * offset 0. In order to align the objects we have to simply size
2266         * each object to conform to the alignment.
2267         */
2268        size = ALIGN(size, align);
2269        s->size = size;
2270        if (forced_order >= 0)
2271                order = forced_order;
2272        else
2273                order = calculate_order(size);
2274
2275        if (order < 0)
2276                return 0;
2277
2278        s->allocflags = 0;
2279        if (order)
2280                s->allocflags |= __GFP_COMP;
2281
2282        if (s->flags & SLAB_CACHE_DMA)
2283                s->allocflags |= SLUB_DMA;
2284
2285        if (s->flags & SLAB_RECLAIM_ACCOUNT)
2286                s->allocflags |= __GFP_RECLAIMABLE;
2287
2288        /*
2289         * Determine the number of objects per slab
2290         */
2291        s->oo = oo_make(order, size);
2292        s->min = oo_make(get_order(size), size);
2293        if (oo_objects(s->oo) > oo_objects(s->max))
2294                s->max = s->oo;
2295
2296        return !!oo_objects(s->oo);
2297
2298}
2299
2300static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2301                const char *name, size_t size,
2302                size_t align, unsigned long flags,
2303                void (*ctor)(void *))
2304{
2305        memset(s, 0, kmem_size);
2306        s->name = name;
2307        s->ctor = ctor;
2308        s->objsize = size;
2309        s->align = align;
2310        s->flags = kmem_cache_flags(size, flags, name, ctor);
2311
2312        if (!calculate_sizes(s, -1))
2313                goto error;
2314
2315        s->refcount = 1;
2316#ifdef CONFIG_NUMA
2317        s->remote_node_defrag_ratio = 1000;
2318#endif
2319        if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2320                goto error;
2321
2322        if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2323                return 1;
2324        free_kmem_cache_nodes(s);
2325error:
2326        if (flags & SLAB_PANIC)
2327                panic("Cannot create slab %s size=%lu realsize=%u "
2328                        "order=%u offset=%u flags=%lx\n",
2329                        s->name, (unsigned long)size, s->size, oo_order(s->oo),
2330                        s->offset, flags);
2331        return 0;
2332}
2333
2334/*
2335 * Check if a given pointer is valid
2336 */
2337int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2338{
2339        struct page *page;
2340
2341        page = get_object_page(object);
2342
2343        if (!page || s != page->slab)
2344                /* No slab or wrong slab */
2345                return 0;
2346
2347        if (!check_valid_pointer(s, page, object))
2348                return 0;
2349
2350        /*
2351         * We could also check if the object is on the slabs freelist.
2352         * But this would be too expensive and it seems that the main
2353         * purpose of kmem_ptr_valid() is to check if the object belongs
2354         * to a certain slab.
2355         */
2356        return 1;
2357}
2358EXPORT_SYMBOL(kmem_ptr_validate);
2359
2360/*
2361 * Determine the size of a slab object
2362 */
2363unsigned int kmem_cache_size(struct kmem_cache *s)
2364{
2365        return s->objsize;
2366}
2367EXPORT_SYMBOL(kmem_cache_size);
2368
2369const char *kmem_cache_name(struct kmem_cache *s)
2370{
2371        return s->name;
2372}
2373EXPORT_SYMBOL(kmem_cache_name);
2374
/*
 * Report a slab error with message @text and, under the slab lock, print
 * every object of @page that is not on the page's freelist together with
 * its tracking information. Does nothing without CONFIG_SLUB_DEBUG.
 */
static void list_slab_objects(struct kmem_cache *s, struct page *page,
							const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *base = page_address(page);
	void *obj;
	DECLARE_BITMAP(map, page->objects);

	bitmap_zero(map, page->objects);
	slab_err(s, page, "%s", text);
	slab_lock(page);

	/* Mark all free objects ... */
	for_each_free_object(obj, s, page->freelist)
		set_bit(slab_index(obj, s, base), map);

	/* ... so that whatever is left unmarked is still in use. */
	for_each_object(obj, s, base, page->objects) {
		if (test_bit(slab_index(obj, s, base), map))
			continue;

		printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
						obj, obj - base);
		print_tracking(s, obj);
	}
	slab_unlock(page);
#endif
}
2400
2401/*
2402 * Attempt to free all partial slabs on a node.
2403 */
2404static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2405{
2406        unsigned long flags;
2407        struct page *page, *h;
2408
2409        spin_lock_irqsave(&n->list_lock, flags);
2410        list_for_each_entry_safe(page, h, &n->partial, lru) {
2411                if (!page->inuse) {
2412                        list_del(&page->lru);
2413                        discard_slab(s, page);
2414                        n->nr_partial--;
2415                } else {
2416                        list_slab_objects(s, page,
2417                                "Objects remaining on kmem_cache_close()");
2418                }
2419        }
2420        spin_unlock_irqrestore(&n->list_lock, flags);
2421}
2422
2423/*
2424 * Release all resources used by a slab cache.
2425 */
2426static inline int kmem_cache_close(struct kmem_cache *s)
2427{
2428        int node;
2429
2430        flush_all(s);
2431
2432        /* Attempt to free all objects */
2433        free_kmem_cache_cpus(s);
2434        for_each_node_state(node, N_NORMAL_MEMORY) {
2435                struct kmem_cache_node *n = get_node(s, node);
2436
2437                free_partial(s, n);
2438                if (n->nr_partial || slabs_node(s, node))
2439                        return 1;
2440        }
2441        free_kmem_cache_nodes(s);
2442        return 0;
2443}
2444
2445/*
2446 * Close a cache and release the kmem_cache structure
2447 * (must be used for caches created using kmem_cache_create)
2448 */
2449void kmem_cache_destroy(struct kmem_cache *s)
2450{
2451        down_write(&slub_lock);
2452        s->refcount--;
2453        if (!s->refcount) {
2454                list_del(&s->list);
2455                up_write(&slub_lock);
2456                if (kmem_cache_close(s)) {
2457                        printk(KERN_ERR "SLUB %s: %s called for cache that "
2458                                "still has objects.\n", s->name, __func__);
2459                        dump_stack();
2460                }
2461                sysfs_slab_remove(s);
2462        } else
2463                up_write(&slub_lock);
2464}
2465EXPORT_SYMBOL(kmem_cache_destroy);
2466
2467/********************************************************************
2468 *              Kmalloc subsystem
2469 *******************************************************************/
2470
2471struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
2472EXPORT_SYMBOL(kmalloc_caches);
2473
2474static int __init setup_slub_min_order(char *str)
2475{
2476        get_option(&str, &slub_min_order);
2477
2478        return 1;
2479}
2480
2481__setup("slub_min_order=", setup_slub_min_order);
2482
2483static int __init setup_slub_max_order(char *str)
2484{
2485        get_option(&str, &slub_max_order);
2486
2487        return 1;
2488}
2489
2490__setup("slub_max_order=", setup_slub_max_order);
2491
2492static int __init setup_slub_min_objects(char *str)
2493{
2494        get_option(&str, &slub_min_objects);
2495
2496        return 1;
2497}
2498
2499__setup("slub_min_objects=", setup_slub_min_objects);
2500
2501static int __init setup_slub_nomerge(char *str)
2502{
2503        slub_nomerge = 1;
2504        return 1;
2505}
2506
2507__setup("slub_nomerge", setup_slub_nomerge);
2508
2509static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2510                const char *name, int size, gfp_t gfp_flags)
2511{
2512        unsigned int flags = 0;
2513
2514        if (gfp_flags & SLUB_DMA)
2515                flags = SLAB_CACHE_DMA;
2516
2517        down_write(&slub_lock);
2518        if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2519                                                                flags, NULL))
2520                goto panic;
2521
2522        list_add(&s->list, &slab_caches);
2523        up_write(&slub_lock);
2524        if (sysfs_slab_add(s))
2525                goto panic;
2526        return s;
2527
2528panic:
2529        panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2530}
2531
2532#ifdef CONFIG_ZONE_DMA
2533static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
2534
2535static void sysfs_add_func(struct work_struct *w)
2536{
2537        struct kmem_cache *s;
2538
2539        down_write(&slub_lock);
2540        list_for_each_entry(s, &slab_caches, list) {
2541                if (s->flags & __SYSFS_ADD_DEFERRED) {
2542                        s->flags &= ~__SYSFS_ADD_DEFERRED;
2543                        sysfs_slab_add(s);
2544                }
2545        }
2546        up_write(&slub_lock);
2547}
2548
2549static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2550
2551static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2552{
2553        struct kmem_cache *s;
2554        char *text;
2555        size_t realsize;
2556
2557        s = kmalloc_caches_dma[index];
2558        if (s)
2559                return s;
2560
2561        /* Dynamically create dma cache */
2562        if (flags & __GFP_WAIT)
2563                down_write(&slub_lock);
2564        else {
2565                if (!down_write_trylock(&slub_lock))
2566                        goto out;
2567        }
2568
2569        if (kmalloc_caches_dma[index])
2570                goto unlock_out;
2571
2572        realsize = kmalloc_caches[index].objsize;
2573        text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2574                         (unsigned int)realsize);
2575        s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2576
2577        if (!s || !text || !kmem_cache_open(s, flags, text,
2578                        realsize, ARCH_KMALLOC_MINALIGN,
2579                        SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2580                kfree(s);
2581                kfree(text);
2582                goto unlock_out;
2583        }
2584
2585        list_add(&s->list, &slab_caches);
2586        kmalloc_caches_dma[index] = s;
2587
2588        schedule_work(&sysfs_add_work);
2589
2590unlock_out:
2591        up_write(&slub_lock);
2592out:
2593        return kmalloc_caches_dma[index];
2594}
2595#endif
2596
2597/*
2598 * Conversion table for small slabs sizes / 8 to the index in the
2599 * kmalloc array. This is necessary for slabs < 192 since we have non power
2600 * of two cache sizes there. The size of larger slabs can be determined using
2601 * fls.
2602 */
2603static s8 size_index[24] = {
2604        3,      /* 8 */
2605        4,      /* 16 */
2606        5,      /* 24 */
2607        5,      /* 32 */
2608        6,      /* 40 */
2609        6,      /* 48 */
2610        6,      /* 56 */
2611        6,      /* 64 */
2612        1,      /* 72 */
2613        1,      /* 80 */
2614        1,      /* 88 */
2615        1,      /* 96 */
2616        7,      /* 104 */
2617        7,      /* 112 */
2618        7,      /* 120 */
2619        7,      /* 128 */
2620        2,      /* 136 */
2621        2,      /* 144 */
2622        2,      /* 152 */
2623        2,      /* 160 */
2624        2,      /* 168 */
2625        2,      /* 176 */
2626        2,      /* 184 */
2627        2       /* 192 */
2628};
2629
2630static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2631{
2632        int index;
2633
2634        if (size <= 192) {
2635                if (!size)
2636                        return ZERO_SIZE_PTR;
2637
2638                index = size_index[(size - 1) / 8];
2639        } else
2640                index = fls(size - 1);
2641
2642#ifdef CONFIG_ZONE_DMA
2643        if (unlikely((flags & SLUB_DMA)))
2644                return dma_kmalloc_cache(index, flags);
2645
2646#endif
2647        return &kmalloc_caches[index];
2648}
2649
2650void *__kmalloc(size_t size, gfp_t flags)
2651{
2652        struct kmem_cache *s;
2653
2654        if (unlikely(size > PAGE_SIZE))
2655                return kmalloc_large(size, flags);
2656
2657        s = get_slab(size, flags);
2658
2659        if (unlikely(ZERO_OR_NULL_PTR(s)))
2660                return s;
2661
2662        return slab_alloc(s, flags, -1, __builtin_return_address(0));
2663}
2664EXPORT_SYMBOL(__kmalloc);
2665
2666static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2667{
2668        struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
2669                                                get_order(size));
2670
2671        if (page)
2672                return page_address(page);
2673        else
2674                return NULL;
2675}
2676
2677#ifdef CONFIG_NUMA
2678void *__kmalloc_node(size_t size, gfp_t flags, int node)
2679{
2680        struct kmem_cache *s;
2681
2682        if (unlikely(size > PAGE_SIZE))
2683                return kmalloc_large_node(size, flags, node);
2684
2685        s = get_slab(size, flags);
2686
2687        if (unlikely(ZERO_OR_NULL_PTR(s)))
2688                return s;
2689
2690        return slab_alloc(s, flags, node, __builtin_return_address(0));
2691}
2692EXPORT_SYMBOL(__kmalloc_node);
2693#endif
2694
2695size_t ksize(const void *object)
2696{
2697        struct page *page;
2698        struct kmem_cache *s;
2699
2700        if (unlikely(object == ZERO_SIZE_PTR))
2701                return 0;
2702
2703        page = virt_to_head_page(object);
2704
2705        if (unlikely(!PageSlab(page))) {
2706                WARN_ON(!PageCompound(page));
2707                return PAGE_SIZE << compound_order(page);
2708        }
2709        s = page->slab;
2710
2711#ifdef CONFIG_SLUB_DEBUG
2712        /*
2713         * Debugging requires use of the padding between object
2714         * and whatever may come after it.
2715         */
2716        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2717                return s->objsize;
2718
2719#endif
2720        /*
2721         * If we have the need to store the freelist pointer
2722         * back there or track user information then we can
2723         * only use the space before that information.
2724         */
2725        if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2726                return s->inuse;
2727        /*
2728         * Else we can use all the padding etc for the allocation
2729         */
2730        return s->size;
2731}
2732
2733void kfree(const void *x)
2734{
2735        struct page *page;
2736        void *object = (void *)x;
2737
2738        if (unlikely(ZERO_OR_NULL_PTR(x)))
2739                return;
2740
2741        page = virt_to_head_page(x);
2742        if (unlikely(!PageSlab(page))) {
2743                BUG_ON(!PageCompound(page));
2744                put_page(page);
2745                return;
2746        }
2747        slab_free(page->slab, page, object, __builtin_return_address(0));
2748}
2749EXPORT_SYMBOL(kfree);
2750
2751/*
2752 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2753 * the remaining slabs by the number of items in use. The slabs with the
2754 * most items in use come first. New allocations will then fill those up
2755 * and thus they can be removed from the partial lists.
2756 *
2757 * The slabs with the least items are placed last. This results in them
2758 * being allocated from last increasing the chance that the last objects
2759 * are freed in them.
2760 */
2761int kmem_cache_shrink(struct kmem_cache *s)
2762{
2763        int node;
2764        int i;
2765        struct kmem_cache_node *n;
2766        struct page *page;
2767        struct page *t;
2768        int objects = oo_objects(s->max);
2769        struct list_head *slabs_by_inuse =
2770                kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
2771        unsigned long flags;
2772
2773        if (!slabs_by_inuse)
2774                return -ENOMEM;
2775
2776        flush_all(s);
2777        for_each_node_state(node, N_NORMAL_MEMORY) {
2778                n = get_node(s, node);
2779
2780                if (!n->nr_partial)
2781                        continue;
2782
2783                for (i = 0; i < objects; i++)
2784                        INIT_LIST_HEAD(slabs_by_inuse + i);
2785
2786                spin_lock_irqsave(&n->list_lock, flags);
2787
2788                /*
2789                 * Build lists indexed by the items in use in each slab.
2790                 *
2791                 * Note that concurrent frees may occur while we hold the
2792                 * list_lock. page->inuse here is the upper limit.
2793                 */
2794                list_for_each_entry_safe(page, t, &n->partial, lru) {
2795                        if (!page->inuse && slab_trylock(page)) {
2796                                /*
2797                                 * Must hold slab lock here because slab_free
2798                                 * may have freed the last object and be
2799                                 * waiting to release the slab.
2800                                 */
2801                                list_del(&page->lru);
2802                                n->nr_partial--;
2803                                slab_unlock(page);
2804                                discard_slab(s, page);
2805                        } else {
2806                                list_move(&page->lru,
2807                                slabs_by_inuse + page->inuse);
2808                        }
2809                }
2810
2811                /*
2812                 * Rebuild the partial list with the slabs filled up most
2813                 * first and the least used slabs at the end.
2814                 */
2815                for (i = objects - 1; i >= 0; i--)
2816                        list_splice(slabs_by_inuse + i, n->partial.prev);
2817
2818                spin_unlock_irqrestore(&n->list_lock, flags);
2819        }
2820
2821        kfree(slabs_by_inuse);
2822        return 0;
2823}
2824EXPORT_SYMBOL(kmem_cache_shrink);
2825
2826#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2827static int slab_mem_going_offline_callback(void *arg)
2828{
2829        struct kmem_cache *s;
2830
2831        down_read(&slub_lock);
2832        list_for_each_entry(s, &slab_caches, list)
2833                kmem_cache_shrink(s);
2834        up_read(&slub_lock);
2835
2836        return 0;
2837}
2838
2839static void slab_mem_offline_callback(void *arg)
2840{
2841        struct kmem_cache_node *n;
2842        struct kmem_cache *s;
2843        struct memory_notify *marg = arg;
2844        int offline_node;
2845
2846        offline_node = marg->status_change_nid;
2847
2848        /*
2849         * If the node still has available memory. we need kmem_cache_node
2850         * for it yet.
2851         */
2852        if (offline_node < 0)
2853                return;
2854
2855        down_read(&slub_lock);
2856        list_for_each_entry(s, &slab_caches, list) {
2857                n = get_node(s, offline_node);
2858                if (n) {
2859                        /*
2860                         * if n->nr_slabs > 0, slabs still exist on the node
2861                         * that is going down. We were unable to free them,
2862                         * and offline_pages() function shoudn't call this
2863                         * callback. So, we must fail.
2864                         */
2865                        BUG_ON(slabs_node(s, offline_node));
2866
2867                        s->node[offline_node] = NULL;
2868                        kmem_cache_free(kmalloc_caches, n);
2869                }
2870        }
2871        up_read(&slub_lock);
2872}
2873
2874static int slab_mem_going_online_callback(void *arg)
2875{
2876        struct kmem_cache_node *n;
2877        struct kmem_cache *s;
2878        struct memory_notify *marg = arg;
2879        int nid = marg->status_change_nid;
2880        int ret = 0;
2881
2882        /*
2883         * If the node's memory is already available, then kmem_cache_node is
2884         * already created. Nothing to do.
2885         */
2886        if (nid < 0)
2887                return 0;
2888
2889        /*
2890         * We are bringing a node online. No memory is available yet. We must
2891         * allocate a kmem_cache_node structure in order to bring the node
2892         * online.
2893         */
2894        down_read(&slub_lock);
2895        list_for_each_entry(s, &slab_caches, list) {
2896                /*
2897                 * XXX: kmem_cache_alloc_node will fallback to other nodes
2898                 *      since memory is not yet available from the node that
2899                 *      is brought up.
2900                 */
2901                n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
2902                if (!n) {
2903                        ret = -ENOMEM;
2904                        goto out;
2905                }
2906                init_kmem_cache_node(n, s);
2907                s->node[nid] = n;
2908        }
2909out:
2910        up_read(&slub_lock);
2911        return ret;
2912}
2913
2914static int slab_memory_callback(struct notifier_block *self,
2915                                unsigned long action, void *arg)
2916{
2917        int ret = 0;
2918
2919        switch (action) {
2920        case MEM_GOING_ONLINE:
2921                ret = slab_mem_going_online_callback(arg);
2922                break;
2923        case MEM_GOING_OFFLINE:
2924                ret = slab_mem_going_offline_callback(arg);
2925                break;
2926        case MEM_OFFLINE:
2927        case MEM_CANCEL_ONLINE:
2928                slab_mem_offline_callback(arg);
2929                break;
2930        case MEM_ONLINE:
2931        case MEM_CANCEL_OFFLINE:
2932                break;
2933        }
2934        if (ret)
2935                ret = notifier_from_errno(ret);
2936        else
2937                ret = NOTIFY_OK;
2938        return ret;
2939}
2940
2941#endif /* CONFIG_MEMORY_HOTPLUG */
2942
2943/********************************************************************
2944 *                      Basic setup of slabs
2945 *******************************************************************/
2946
2947void __init kmem_cache_init(void)
2948{
2949        int i;
2950        int caches = 0;
2951
2952        init_alloc_cpu();
2953
2954#ifdef CONFIG_NUMA
2955        /*
2956         * Must first have the slab cache available for the allocations of the
2957         * struct kmem_cache_node's. There is special bootstrap code in
2958         * kmem_cache_open for slab_state == DOWN.
2959         */
2960        create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2961                sizeof(struct kmem_cache_node), GFP_KERNEL);
2962        kmalloc_caches[0].refcount = -1;
2963        caches++;
2964
2965        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
2966#endif
2967
2968        /* Able to allocate the per node structures */
2969        slab_state = PARTIAL;
2970
2971        /* Caches that are not of the two-to-the-power-of size */
2972        if (KMALLOC_MIN_SIZE <= 64) {
2973                create_kmalloc_cache(&kmalloc_caches[1],
2974                                "kmalloc-96", 96, GFP_KERNEL);
2975                caches++;
2976                create_kmalloc_cache(&kmalloc_caches[2],
2977                                "kmalloc-192", 192, GFP_KERNEL);
2978                caches++;
2979        }
2980
2981        for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
2982                create_kmalloc_cache(&kmalloc_caches[i],
2983                        "kmalloc", 1 << i, GFP_KERNEL);
2984                caches++;
2985        }
2986
2987
2988        /*
2989         * Patch up the size_index table if we have strange large alignment
2990         * requirements for the kmalloc array. This is only the case for
2991         * MIPS it seems. The standard arches will not generate any code here.
2992         *
2993         * Largest permitted alignment is 256 bytes due to the way we
2994         * handle the index determination for the smaller caches.
2995         *
2996         * Make sure that nothing crazy happens if someone starts tinkering
2997         * around with ARCH_KMALLOC_MINALIGN
2998         */
2999        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3000                (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3001
3002        for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
3003                size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
3004
3005        if (KMALLOC_MIN_SIZE == 128) {
3006                /*
3007                 * The 192 byte sized cache is not used if the alignment
3008                 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3009                 * instead.
3010                 */
3011                for (i = 128 + 8; i <= 192; i += 8)
3012                        size_index[(i - 1) / 8] = 8;
3013        }
3014
3015        slab_state = UP;
3016
3017        /* Provide the correct kmalloc names now that the caches are up */
3018        for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
3019                kmalloc_caches[i]. name =
3020                        kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3021
3022#ifdef CONFIG_SMP
3023        register_cpu_notifier(&slab_notifier);
3024        kmem_size = offsetof(struct kmem_cache, cpu_slab) +
3025                                nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
3026#else
3027        kmem_size = sizeof(struct kmem_cache);
3028#endif
3029
3030        printk(KERN_INFO
3031                "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3032                " CPUs=%d, Nodes=%d\n",
3033                caches, cache_line_size(),
3034                slub_min_order, slub_max_order, slub_min_objects,
3035                nr_cpu_ids, nr_node_ids);
3036}
3037
3038/*
3039 * Find a mergeable slab cache
3040 */
3041static int slab_unmergeable(struct kmem_cache *s)
3042{
3043        if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3044                return 1;
3045
3046        if (s->ctor)
3047                return 1;
3048
3049        /*
3050         * We may have set a slab to be unmergeable during bootstrap.
3051         */
3052        if (s->refcount < 0)
3053                return 1;
3054
3055        return 0;
3056}
3057
3058static struct kmem_cache *find_mergeable(size_t size,
3059                size_t align, unsigned long flags, const char *name,
3060                void (*ctor)(void *))
3061{
3062        struct kmem_cache *s;
3063
3064        if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3065                return NULL;
3066
3067        if (ctor)
3068                return NULL;
3069
3070        size = ALIGN(size, sizeof(void *));
3071        align = calculate_alignment(flags, align, size);
3072        size = ALIGN(size, align);
3073        flags = kmem_cache_flags(size, flags, name, NULL);
3074
3075        list_for_each_entry(s, &slab_caches, list) {
3076                if (slab_unmergeable(s))
3077                        continue;
3078
3079                if (size > s->size)
3080                        continue;
3081
3082                if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3083                                continue;
3084                /*
3085                 * Check if alignment is compatible.
3086                 * Courtesy of Adrian Drzewiecki
3087                 */
3088                if ((s->size & ~(align - 1)) != s->size)
3089                        continue;
3090
3091                if (s->size - size >= sizeof(void *))
3092                        continue;
3093
3094                return s;
3095        }
3096        return NULL;
3097}
3098
3099struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3100                size_t align, unsigned long flags, void (*ctor)(void *))
3101{
3102        struct kmem_cache *s;
3103
3104        down_write(&slub_lock);
3105        s = find_mergeable(size, align, flags, name, ctor);
3106        if (s) {
3107                int cpu;
3108
3109                s->refcount++;
3110                /*
3111                 * Adjust the object sizes so that we clear
3112                 * the complete object on kzalloc.
3113                 */
3114                s->objsize = max(s->objsize, (int)size);
3115
3116                /*
3117                 * And then we need to update the object size in the
3118                 * per cpu structures
3119                 */
3120                for_each_online_cpu(cpu)
3121                        get_cpu_slab(s, cpu)->objsize = s->objsize;
3122
3123                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3124                up_write(&slub_lock);
3125
3126                if (sysfs_slab_alias(s, name))
3127                        goto err;
3128                return s;
3129        }
3130
3131        s = kmalloc(kmem_size, GFP_KERNEL);
3132        if (s) {
3133                if (kmem_cache_open(s, GFP_KERNEL, name,
3134                                size, align, flags, ctor)) {
3135                        list_add(&s->list, &slab_caches);
3136                        up_write(&slub_lock);
3137                        if (sysfs_slab_add(s))
3138                                goto err;
3139                        return s;
3140                }
3141                kfree(s);
3142        }
3143        up_write(&slub_lock);
3144
3145err:
3146        if (flags & SLAB_PANIC)
3147                panic("Cannot create slabcache %s\n", name);
3148        else
3149                s = NULL;
3150        return s;
3151}
3152EXPORT_SYMBOL(kmem_cache_create);
3153
3154#ifdef CONFIG_SMP
3155/*
3156 * Use the cpu notifier to insure that the cpu slabs are flushed when
3157 * necessary.
3158 */
3159static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3160                unsigned long action, void *hcpu)
3161{
3162        long cpu = (long)hcpu;
3163        struct kmem_cache *s;
3164        unsigned long flags;
3165
3166        switch (action) {
3167        case CPU_UP_PREPARE:
3168        case CPU_UP_PREPARE_FROZEN:
3169                init_alloc_cpu_cpu(cpu);
3170                down_read(&slub_lock);
3171                list_for_each_entry(s, &slab_caches, list)
3172                        s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3173                                                        GFP_KERNEL);
3174                up_read(&slub_lock);
3175                break;
3176
3177        case CPU_UP_CANCELED:
3178        case CPU_UP_CANCELED_FROZEN:
3179        case CPU_DEAD:
3180        case CPU_DEAD_FROZEN:
3181                down_read(&slub_lock);
3182                list_for_each_entry(s, &slab_caches, list) {
3183                        struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3184
3185                        local_irq_save(flags);
3186                        __flush_cpu_slab(s, cpu);
3187                        local_irq_restore(flags);
3188                        free_kmem_cache_cpu(c, cpu);
3189                        s->cpu_slab[cpu] = NULL;
3190                }
3191                up_read(&slub_lock);
3192                break;
3193        default:
3194                break;
3195        }
3196        return NOTIFY_OK;
3197}
3198
3199static struct notifier_block __cpuinitdata slab_notifier = {
3200        .notifier_call = slab_cpuup_callback
3201};
3202
3203#endif
3204
3205void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
3206{
3207        struct kmem_cache *s;
3208
3209        if (unlikely(size > PAGE_SIZE))
3210                return kmalloc_large(size, gfpflags);
3211
3212        s = get_slab(size, gfpflags);
3213
3214        if (unlikely(ZERO_OR_NULL_PTR(s)))
3215                return s;
3216
3217        return slab_alloc(s, gfpflags, -1, caller);
3218}
3219
3220void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3221                                        int node, void *caller)
3222{
3223        struct kmem_cache *s;
3224
3225        if (unlikely(size > PAGE_SIZE))
3226                return kmalloc_large_node(size, gfpflags, node);
3227
3228        s = get_slab(size, gfpflags);
3229
3230        if (unlikely(ZERO_OR_NULL_PTR(s)))
3231                return s;
3232
3233        return slab_alloc(s, gfpflags, node, caller);
3234}
3235
3236#ifdef CONFIG_SLUB_DEBUG
3237static unsigned long count_partial(struct kmem_cache_node *n,
3238                                        int (*get_count)(struct page *))
3239{
3240        unsigned long flags;
3241        unsigned long x = 0;
3242        struct page *page;
3243
3244        spin_lock_irqsave(&n->list_lock, flags);
3245        list_for_each_entry(page, &n->partial, lru)
3246                x += get_count(page);
3247        spin_unlock_irqrestore(&n->list_lock, flags);
3248        return x;
3249}
3250
3251static int count_inuse(struct page *page)
3252{
3253        return page->inuse;
3254}
3255
3256static int count_total(struct page *page)
3257{
3258        return page->objects;
3259}
3260
3261static int count_free(struct page *page)
3262{
3263        return page->objects - page->inuse;
3264}
3265
3266static int validate_slab(struct kmem_cache *s, struct page *page,
3267                                                unsigned long *map)
3268{
3269        void *p;
3270        void *addr = page_address(page);
3271
3272        if (!check_slab(s, page) ||
3273                        !on_freelist(s, page, NULL))
3274                return 0;
3275
3276        /* Now we know that a valid freelist exists */
3277        bitmap_zero(map, page->objects);
3278
3279        for_each_free_object(p, s, page->freelist) {
3280                set_bit(slab_index(p, s, addr), map);
3281                if (!check_object(s, page, p, 0))
3282                        return 0;
3283        }
3284
3285        for_each_object(p, s, addr, page->objects)
3286                if (!test_bit(slab_index(p, s, addr), map))
3287                        if (!check_object(s, page, p, 1))
3288                                return 0;
3289        return 1;
3290}
3291
3292static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3293                                                unsigned long *map)
3294{
3295        if (slab_trylock(page)) {
3296                validate_slab(s, page, map);
3297                slab_unlock(page);
3298        } else
3299                printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3300                        s->name, page);
3301
3302        if (s->flags & DEBUG_DEFAULT_FLAGS) {
3303                if (!PageSlubDebug(page))
3304                        printk(KERN_ERR "SLUB %s: SlubDebug not set "
3305                                "on slab 0x%p\n", s->name, page);
3306        } else {
3307                if (PageSlubDebug(page))
3308                        printk(KERN_ERR "SLUB %s: SlubDebug set on "
3309                                "slab 0x%p\n", s->name, page);
3310        }
3311}
3312
3313static int validate_slab_node(struct kmem_cache *s,
3314                struct kmem_cache_node *n, unsigned long *map)
3315{
3316        unsigned long count = 0;
3317        struct page *page;
3318        unsigned long flags;
3319
3320        spin_lock_irqsave(&n->list_lock, flags);
3321
3322        list_for_each_entry(page, &n->partial, lru) {
3323                validate_slab_slab(s, page, map);
3324                count++;
3325        }
3326        if (count != n->nr_partial)
3327                printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
3328                        "counter=%ld\n", s->name, count, n->nr_partial);
3329
3330        if (!(s->flags & SLAB_STORE_USER))
3331                goto out;
3332
3333        list_for_each_entry(page, &n->full, lru) {
3334                validate_slab_slab(s, page, map);
3335                count++;
3336        }
3337        if (count != atomic_long_read(&n->nr_slabs))
3338                printk(KERN_ERR "SLUB: %s %ld slabs counted but "
3339                        "counter=%ld\n", s->name, count,
3340                        atomic_long_read(&n->nr_slabs));
3341
3342out:
3343        spin_unlock_irqrestore(&n->list_lock, flags);
3344        return count;
3345}
3346
3347static long validate_slab_cache(struct kmem_cache *s)
3348{
3349        int node;
3350        unsigned long count = 0;
3351        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3352                                sizeof(unsigned long), GFP_KERNEL);
3353
3354        if (!map)
3355                return -ENOMEM;
3356
3357        flush_all(s);
3358        for_each_node_state(node, N_NORMAL_MEMORY) {
3359                struct kmem_cache_node *n = get_node(s, node);
3360
3361                count += validate_slab_node(s, n, map);
3362        }
3363        kfree(map);
3364        return count;
3365}
3366
3367#ifdef SLUB_RESILIENCY_TEST
3368static void resiliency_test(void)
3369{
3370        u8 *p;
3371
3372        printk(KERN_ERR "SLUB resiliency testing\n");
3373        printk(KERN_ERR "-----------------------\n");
3374        printk(KERN_ERR "A. Corruption after allocation\n");
3375
3376        p = kzalloc(16, GFP_KERNEL);
3377        p[16] = 0x12;
3378        printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3379                        " 0x12->0x%p\n\n", p + 16);
3380
3381        validate_slab_cache(kmalloc_caches + 4);
3382
3383        /* Hmmm... The next two are dangerous */
3384        p = kzalloc(32, GFP_KERNEL);
3385        p[32 + sizeof(void *)] = 0x34;
3386        printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3387                        " 0x34 -> -0x%p\n", p);
3388        printk(KERN_ERR
3389                "If allocated object is overwritten then not detectable\n\n");
3390
3391        validate_slab_cache(kmalloc_caches + 5);
3392        p = kzalloc(64, GFP_KERNEL);
3393        p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3394        *p = 0x56;
3395        printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3396                                                                        p);
3397        printk(KERN_ERR
3398                "If allocated object is overwritten then not detectable\n\n");
3399        validate_slab_cache(kmalloc_caches + 6);
3400
3401        printk(KERN_ERR "\nB. Corruption after free\n");
3402        p = kzalloc(128, GFP_KERNEL);
3403        kfree(p);
3404        *p = 0x78;
3405        printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3406        validate_slab_cache(kmalloc_caches + 7);
3407
3408        p = kzalloc(256, GFP_KERNEL);
3409        kfree(p);
3410        p[50] = 0x9a;
3411        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3412                        p);
3413        validate_slab_cache(kmalloc_caches + 8);
3414
3415        p = kzalloc(512, GFP_KERNEL);
3416        kfree(p);
3417        p[512] = 0xab;
3418        printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3419        validate_slab_cache(kmalloc_caches + 9);
3420}
3421#else
/* Built without SLUB_RESILIENCY_TEST: the test is a no-op. */
static void resiliency_test(void) {}
3423#endif
3424
3425/*
3426 * Generate lists of code addresses where slabcache objects are allocated
3427 * and freed.
3428 */
3429
3430struct location {
3431        unsigned long count;
3432        void *addr;
3433        long long sum_time;
3434        long min_time;
3435        long max_time;
3436        long min_pid;
3437        long max_pid;
3438        cpumask_t cpus;
3439        nodemask_t nodes;
3440};
3441
/*
 * Growable array of struct location.  add_location() keeps the entries
 * ordered by address so that it can binary search them.
 */
struct loc_track {
	unsigned long max;	/* capacity of loc[] in entries */
	unsigned long count;	/* entries of loc[] currently in use */
	struct location *loc;	/* the array, obtained from the page allocator */
};
3447
3448static void free_loc_track(struct loc_track *t)
3449{
3450        if (t->max)
3451                free_pages((unsigned long)t->loc,
3452                        get_order(sizeof(struct location) * t->max));
3453}
3454
3455static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
3456{
3457        struct location *l;
3458        int order;
3459
3460        order = get_order(sizeof(struct location) * max);
3461
3462        l = (void *)__get_free_pages(flags, order);
3463        if (!l)
3464                return 0;
3465
3466        if (t->count) {
3467                memcpy(l, t->loc, sizeof(struct location) * t->count);
3468                free_loc_track(t);
3469        }
3470        t->max = max;
3471        t->loc = l;
3472        return 1;
3473}
3474
3475static int add_location(struct loc_track *t, struct kmem_cache *s,
3476                                const struct track *track)
3477{
3478        long start, end, pos;
3479        struct location *l;
3480        void *caddr;
3481        unsigned long age = jiffies - track->when;
3482
3483        start = -1;
3484        end = t->count;
3485
3486        for ( ; ; ) {
3487                pos = start + (end - start + 1) / 2;
3488
3489                /*
3490                 * There is nothing at "end". If we end up there
3491                 * we need to add something to before end.
3492                 */
3493                if (pos == end)
3494                        break;
3495
3496                caddr = t->loc[pos].addr;
3497                if (track->addr == caddr) {
3498
3499                        l = &t->loc[pos];
3500                        l->count++;
3501                        if (track->when) {
3502                                l->sum_time += age;
3503                                if (age < l->min_time)
3504                                        l->min_time = age;
3505                                if (age > l->max_time)
3506                                        l->max_time = age;
3507
3508                                if (track->pid < l->min_pid)
3509                                        l->min_pid = track->pid;
3510                                if (track->pid > l->max_pid)
3511                                        l->max_pid = track->pid;
3512
3513                                cpu_set(track->cpu, l->cpus);
3514                        }
3515                        node_set(page_to_nid(virt_to_page(track)), l->nodes);
3516                        return 1;
3517                }
3518
3519                if (track->addr < caddr)
3520                        end = pos;
3521                else
3522                        start = pos;
3523        }
3524
3525        /*
3526         * Not found. Insert new tracking element.
3527         */
3528        if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
3529                return 0;
3530
3531        l = t->loc + pos;
3532        if (pos < t->count)
3533                memmove(l + 1, l,
3534                        (t->count - pos) * sizeof(struct location));
3535        t->count++;
3536        l->count = 1;
3537        l->addr = track->addr;
3538        l->sum_time = age;
3539        l->min_time = age;
3540        l->max_time = age;
3541        l->min_pid = track->pid;
3542        l->max_pid = track->pid;
3543        cpus_clear(l->cpus);
3544        cpu_set(track->cpu, l->cpus);
3545        nodes_clear(l->nodes);
3546        node_set(page_to_nid(virt_to_page(track)), l->nodes);
3547        return 1;
3548}
3549
3550static void process_slab(struct loc_track *t, struct kmem_cache *s,
3551                struct page *page, enum track_item alloc)
3552{
3553        void *addr = page_address(page);
3554        DECLARE_BITMAP(map, page->objects);
3555        void *p;
3556
3557        bitmap_zero(map, page->objects);
3558        for_each_free_object(p, s, page->freelist)
3559                set_bit(slab_index(p, s, addr), map);
3560
3561        for_each_object(p, s, addr, page->objects)
3562                if (!test_bit(slab_index(p, s, addr), map))
3563                        add_location(t, s, get_track(s, p, alloc));
3564}
3565
3566static int list_locations(struct kmem_cache *s, char *buf,
3567                                        enum track_item alloc)
3568{
3569        int len = 0;
3570        unsigned long i;
3571        struct loc_track t = { 0, 0, NULL };
3572        int node;
3573
3574        if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3575                        GFP_TEMPORARY))
3576                return sprintf(buf, "Out of memory\n");
3577
3578        /* Push back cpu slabs */
3579        flush_all(s);
3580
3581        for_each_node_state(node, N_NORMAL_MEMORY) {
3582                struct kmem_cache_node *n = get_node(s, node);
3583                unsigned long flags;
3584                struct page *page;
3585
3586                if (!atomic_long_read(&n->nr_slabs))
3587                        continue;
3588
3589                spin_lock_irqsave(&n->list_lock, flags);
3590                list_for_each_entry(page, &n->partial, lru)
3591                        process_slab(&t, s, page, alloc);
3592                list_for_each_entry(page, &n->full, lru)
3593                        process_slab(&t, s, page, alloc);
3594                spin_unlock_irqrestore(&n->list_lock, flags);
3595        }
3596
3597        for (i = 0; i < t.count; i++) {
3598                struct location *l = &t.loc[i];
3599
3600                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
3601                        break;
3602                len += sprintf(buf + len, "%7ld ", l->count);
3603
3604                if (l->addr)
3605                        len += sprint_symbol(buf + len, (unsigned long)l->addr);
3606                else
3607                        len += sprintf(buf + len, "<not-available>");
3608
3609                if (l->sum_time != l->min_time) {
3610                        len += sprintf(buf + len, " age=%ld/%ld/%ld",
3611                                l->min_time,
3612                                (long)div_u64(l->sum_time, l->count),
3613                                l->max_time);
3614                } else
3615                        len += sprintf(buf + len, " age=%ld",
3616                                l->min_time);
3617
3618                if (l->min_pid != l->max_pid)
3619                        len += sprintf(buf + len, " pid=%ld-%ld",
3620                                l->min_pid, l->max_pid);
3621                else
3622                        len += sprintf(buf + len, " pid=%ld",
3623                                l->min_pid);
3624
3625                if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3626                                len < PAGE_SIZE - 60) {
3627                        len += sprintf(buf + len, " cpus=");
3628                        len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3629                                        l->cpus);
3630                }
3631
3632                if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3633                                len < PAGE_SIZE - 60) {
3634                        len += sprintf(buf + len, " nodes=");
3635                        len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3636                                        l->nodes);
3637                }
3638
3639                len += sprintf(buf + len, "\n");
3640        }
3641
3642        free_loc_track(&t);
3643        if (!t.count)
3644                len += sprintf(buf, "No data\n");
3645        return len;
3646}
3647
/*
 * Selectors understood by show_slab_objects(). Each enumerator is used
 * as a bit position; the SO_* masks below are what callers combine.
 */
enum slab_stat_type {
        SL_ALL,                 /* All slabs */
        SL_PARTIAL,             /* Only partially allocated slabs */
        SL_CPU,                 /* Only slabs used for cpu caches */
        SL_OBJECTS,             /* Determine allocated objects not slabs */
        SL_TOTAL                /* Determine object capacity not slabs */
};

/* Bit masks built from enum slab_stat_type, passed as the 'flags' argument */
#define SO_ALL          (1 << SL_ALL)
#define SO_PARTIAL      (1 << SL_PARTIAL)
#define SO_CPU          (1 << SL_CPU)
#define SO_OBJECTS      (1 << SL_OBJECTS)
#define SO_TOTAL        (1 << SL_TOTAL)
3661
3662static ssize_t show_slab_objects(struct kmem_cache *s,
3663                            char *buf, unsigned long flags)
3664{
3665        unsigned long total = 0;
3666        int node;
3667        int x;
3668        unsigned long *nodes;
3669        unsigned long *per_cpu;
3670
3671        nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
3672        if (!nodes)
3673                return -ENOMEM;
3674        per_cpu = nodes + nr_node_ids;
3675
3676        if (flags & SO_CPU) {
3677                int cpu;
3678
3679                for_each_possible_cpu(cpu) {
3680                        struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3681
3682                        if (!c || c->node < 0)
3683                                continue;
3684
3685                        if (c->page) {
3686                                        if (flags & SO_TOTAL)
3687                                                x = c->page->objects;
3688                                else if (flags & SO_OBJECTS)
3689                                        x = c->page->inuse;
3690                                else
3691                                        x = 1;
3692
3693                                total += x;
3694                                nodes[c->node] += x;
3695                        }
3696                        per_cpu[c->node]++;
3697                }
3698        }
3699
3700        if (flags & SO_ALL) {
3701                for_each_node_state(node, N_NORMAL_MEMORY) {
3702                        struct kmem_cache_node *n = get_node(s, node);
3703
3704                if (flags & SO_TOTAL)
3705                        x = atomic_long_read(&n->total_objects);
3706                else if (flags & SO_OBJECTS)
3707                        x = atomic_long_read(&n->total_objects) -
3708                                count_partial(n, count_free);
3709
3710                        else
3711                                x = atomic_long_read(&n->nr_slabs);
3712                        total += x;
3713                        nodes[node] += x;
3714                }
3715
3716        } else if (flags & SO_PARTIAL) {
3717                for_each_node_state(node, N_NORMAL_MEMORY) {
3718                        struct kmem_cache_node *n = get_node(s, node);
3719
3720                        if (flags & SO_TOTAL)
3721                                x = count_partial(n, count_total);
3722                        else if (flags & SO_OBJECTS)
3723                                x = count_partial(n, count_inuse);
3724                        else
3725                                x = n->nr_partial;
3726                        total += x;
3727                        nodes[node] += x;
3728                }
3729        }
3730        x = sprintf(buf, "%lu", total);
3731#ifdef CONFIG_NUMA
3732        for_each_node_state(node, N_NORMAL_MEMORY)
3733                if (nodes[node])
3734                        x += sprintf(buf + x, " N%d=%lu",
3735                                        node, nodes[node]);
3736#endif
3737        kfree(nodes);
3738        return x + sprintf(buf + x, "\n");
3739}
3740
3741static int any_slab_objects(struct kmem_cache *s)
3742{
3743        int node;
3744
3745        for_each_online_node(node) {
3746                struct kmem_cache_node *n = get_node(s, node);
3747
3748                if (!n)
3749                        continue;
3750
3751                if (atomic_long_read(&n->total_objects))
3752                        return 1;
3753        }
3754        return 0;
3755}
3756
/*
 * Map the generic sysfs objects back to their SLUB containers:
 * an embedded struct attribute to its slab_attribute and an embedded
 * kobject to its kmem_cache. Both expand to plain expressions (no
 * trailing semicolon) so they can be used anywhere a pointer is expected.
 */
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3759
/*
 * A sysfs attribute of a slab cache. slab_attr_show() and
 * slab_attr_store() dispatch to these callbacks and fail with -EIO
 * when the respective callback is NULL.
 */
struct slab_attribute {
        struct attribute attr;  /* embedded generic attribute, see to_slab_attr() */
        ssize_t (*show)(struct kmem_cache *s, char *buf);       /* read handler */
        ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);    /* write handler */
};
3765
/* Define a read-only attribute <_name>_attr via __ATTR_RO(_name) */
#define SLAB_ATTR_RO(_name) \
        static struct slab_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Define attribute <_name>_attr with mode 0644, wired to
 * <_name>_show() and <_name>_store().
 */
#define SLAB_ATTR(_name) \
        static struct slab_attribute _name##_attr =  \
        __ATTR(_name, 0644, _name##_show, _name##_store)
3772
3773static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3774{
3775        return sprintf(buf, "%d\n", s->size);
3776}
3777SLAB_ATTR_RO(slab_size);
3778
3779static ssize_t align_show(struct kmem_cache *s, char *buf)
3780{
3781        return sprintf(buf, "%d\n", s->align);
3782}
3783SLAB_ATTR_RO(align);
3784
3785static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3786{
3787        return sprintf(buf, "%d\n", s->objsize);
3788}
3789SLAB_ATTR_RO(object_size);
3790
3791static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3792{
3793        return sprintf(buf, "%d\n", oo_objects(s->oo));
3794}
3795SLAB_ATTR_RO(objs_per_slab);
3796
3797static ssize_t order_store(struct kmem_cache *s,
3798                                const char *buf, size_t length)
3799{
3800        unsigned long order;
3801        int err;
3802
3803        err = strict_strtoul(buf, 10, &order);
3804        if (err)
3805                return err;
3806
3807        if (order > slub_max_order || order < slub_min_order)
3808                return -EINVAL;
3809
3810        calculate_sizes(s, order);
3811        return length;
3812}
3813
3814static ssize_t order_show(struct kmem_cache *s, char *buf)
3815{
3816        return sprintf(buf, "%d\n", oo_order(s->oo));
3817}
3818SLAB_ATTR(order);
3819
3820static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3821{
3822        if (s->ctor) {
3823                int n = sprint_symbol(buf, (unsigned long)s->ctor);
3824
3825                return n + sprintf(buf + n, "\n");
3826        }
3827        return 0;
3828}
3829SLAB_ATTR_RO(ctor);
3830
3831static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3832{
3833        return sprintf(buf, "%d\n", s->refcount - 1);
3834}
3835SLAB_ATTR_RO(aliases);
3836
3837static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3838{
3839        return show_slab_objects(s, buf, SO_ALL);
3840}
3841SLAB_ATTR_RO(slabs);
3842
3843static ssize_t partial_show(struct kmem_cache *s, char *buf)
3844{
3845        return show_slab_objects(s, buf, SO_PARTIAL);
3846}
3847SLAB_ATTR_RO(partial);
3848
3849static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3850{
3851        return show_slab_objects(s, buf, SO_CPU);
3852}
3853SLAB_ATTR_RO(cpu_slabs);
3854
3855static ssize_t objects_show(struct kmem_cache *s, char *buf)
3856{
3857        return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
3858}
3859SLAB_ATTR_RO(objects);
3860
3861static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3862{
3863        return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
3864}
3865SLAB_ATTR_RO(objects_partial);
3866
3867static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3868{
3869        return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
3870}
3871SLAB_ATTR_RO(total_objects);
3872
3873static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3874{
3875        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3876}
3877
3878static ssize_t sanity_checks_store(struct kmem_cache *s,
3879                                const char *buf, size_t length)
3880{
3881        s->flags &= ~SLAB_DEBUG_FREE;
3882        if (buf[0] == '1')
3883                s->flags |= SLAB_DEBUG_FREE;
3884        return length;
3885}
3886SLAB_ATTR(sanity_checks);
3887
3888static ssize_t trace_show(struct kmem_cache *s, char *buf)
3889{
3890        return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3891}
3892
3893static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3894                                                        size_t length)
3895{
3896        s->flags &= ~SLAB_TRACE;
3897        if (buf[0] == '1')
3898                s->flags |= SLAB_TRACE;
3899        return length;
3900}
3901SLAB_ATTR(trace);
3902
3903static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3904{
3905        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3906}
3907
3908static ssize_t reclaim_account_store(struct kmem_cache *s,
3909                                const char *buf, size_t length)
3910{
3911        s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3912        if (buf[0] == '1')
3913                s->flags |= SLAB_RECLAIM_ACCOUNT;
3914        return length;
3915}
3916SLAB_ATTR(reclaim_account);
3917
3918static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3919{
3920        return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3921}
3922SLAB_ATTR_RO(hwcache_align);
3923
#ifdef CONFIG_ZONE_DMA
/* sysfs "cache_dma": 1 if SLAB_CACHE_DMA is set, else 0 */
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
        int enabled = (s->flags & SLAB_CACHE_DMA) ? 1 : 0;

        return sprintf(buf, "%d\n", enabled);
}
SLAB_ATTR_RO(cache_dma);
#endif
3931
3932static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3933{
3934        return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3935}
3936SLAB_ATTR_RO(destroy_by_rcu);
3937
3938static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3939{
3940        return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3941}
3942
3943static ssize_t red_zone_store(struct kmem_cache *s,
3944                                const char *buf, size_t length)
3945{
3946        if (any_slab_objects(s))
3947                return -EBUSY;
3948
3949        s->flags &= ~SLAB_RED_ZONE;
3950        if (buf[0] == '1')
3951                s->flags |= SLAB_RED_ZONE;
3952        calculate_sizes(s, -1);
3953        return length;
3954}
3955SLAB_ATTR(red_zone);
3956
3957static ssize_t poison_show(struct kmem_cache *s, char *buf)
3958{
3959        return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3960}
3961
3962static ssize_t poison_store(struct kmem_cache *s,
3963                                const char *buf, size_t length)
3964{
3965        if (any_slab_objects(s))
3966                return -EBUSY;
3967
3968        s->flags &= ~SLAB_POISON;
3969        if (buf[0] == '1')
3970                s->flags |= SLAB_POISON;
3971        calculate_sizes(s, -1);
3972        return length;
3973}
3974SLAB_ATTR(poison);
3975
3976static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3977{
3978        return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3979}
3980
3981static ssize_t store_user_store(struct kmem_cache *s,
3982                                const char *buf, size_t length)
3983{
3984        if (any_slab_objects(s))
3985                return -EBUSY;
3986
3987        s->flags &= ~SLAB_STORE_USER;
3988        if (buf[0] == '1')
3989                s->flags |= SLAB_STORE_USER;
3990        calculate_sizes(s, -1);
3991        return length;
3992}
3993SLAB_ATTR(store_user);
3994
3995static ssize_t validate_show(struct kmem_cache *s, char *buf)
3996{
3997        return 0;
3998}
3999
4000static ssize_t validate_store(struct kmem_cache *s,
4001                        const char *buf, size_t length)
4002{
4003        int ret = -EINVAL;
4004
4005        if (buf[0] == '1') {
4006                ret = validate_slab_cache(s);
4007                if (ret >= 0)
4008                        ret = length;
4009        }
4010        return ret;
4011}
4012SLAB_ATTR(validate);
4013
4014static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4015{
4016        return 0;
4017}
4018
4019static ssize_t shrink_store(struct kmem_cache *s,
4020                        const char *buf, size_t length)
4021{
4022        if (buf[0] == '1') {
4023                int rc = kmem_cache_shrink(s);
4024
4025                if (rc)
4026                        return rc;
4027        } else
4028                return -EINVAL;
4029        return length;
4030}
4031SLAB_ATTR(shrink);
4032
4033static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4034{
4035        if (!(s->flags & SLAB_STORE_USER))
4036                return -ENOSYS;
4037        return list_locations(s, buf, TRACK_ALLOC);
4038}
4039SLAB_ATTR_RO(alloc_calls);
4040
4041static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4042{
4043        if (!(s->flags & SLAB_STORE_USER))
4044                return -ENOSYS;
4045        return list_locations(s, buf, TRACK_FREE);
4046}
4047SLAB_ATTR_RO(free_calls);
4048
#ifdef CONFIG_NUMA
/*
 * sysfs "remote_node_defrag_ratio" (read): the value is kept
 * multiplied by ten internally, so divide before printing.
 */
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
        return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
}

/*
 * sysfs "remote_node_defrag_ratio" (write): accept a decimal value in
 * the range 0..100 and store it multiplied by ten.
 *
 * Returns @length on success, the error from strict_strtoul() if the
 * input does not parse, or -ERANGE if the value exceeds 100 (instead
 * of silently discarding it and reporting success).
 */
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
                                const char *buf, size_t length)
{
        unsigned long ratio;
        int err;

        err = strict_strtoul(buf, 10, &ratio);
        if (err)
                return err;

        if (ratio > 100)
                return -ERANGE;

        s->remote_node_defrag_ratio = ratio * 10;
        return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
#endif
4072
4073#ifdef CONFIG_SLUB_STATS
4074static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4075{
4076        unsigned long sum  = 0;
4077        int cpu;
4078        int len;
4079        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
4080
4081        if (!data)
4082                return -ENOMEM;
4083
4084        for_each_online_cpu(cpu) {
4085                unsigned x = get_cpu_slab(s, cpu)->stat[si];
4086
4087                data[cpu] = x;
4088                sum += x;
4089        }
4090
4091        len = sprintf(buf, "%lu", sum);
4092
4093#ifdef CONFIG_SMP
4094        for_each_online_cpu(cpu) {
4095                if (data[cpu] && len < PAGE_SIZE - 20)
4096                        len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
4097        }
4098#endif
4099        kfree(data);
4100        return len + sprintf(buf + len, "\n");
4101}
4102
/*
 * Generate <text>_show(), which reports statistics item @si through
 * show_stat(), together with the matching read-only attribute
 * <text>_attr.
 */
#define STAT_ATTR(si, text)                                     \
static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
{                                                               \
        return show_stat(s, buf, si);                           \
}                                                               \
SLAB_ATTR_RO(text);                                             \

/* One attribute per statistics item */
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
4128#endif
4129
/*
 * NULL terminated list of all attributes defined above; published for
 * each cache through slab_attr_group. Entries that only exist under a
 * config option are guarded the same way as their definitions.
 */
static struct attribute *slab_attrs[] = {
        &slab_size_attr.attr,
        &object_size_attr.attr,
        &objs_per_slab_attr.attr,
        &order_attr.attr,
        &objects_attr.attr,
        &objects_partial_attr.attr,
        &total_objects_attr.attr,
        &slabs_attr.attr,
        &partial_attr.attr,
        &cpu_slabs_attr.attr,
        &ctor_attr.attr,
        &aliases_attr.attr,
        &align_attr.attr,
        &sanity_checks_attr.attr,
        &trace_attr.attr,
        &hwcache_align_attr.attr,
        &reclaim_account_attr.attr,
        &destroy_by_rcu_attr.attr,
        &red_zone_attr.attr,
        &poison_attr.attr,
        &store_user_attr.attr,
        &validate_attr.attr,
        &shrink_attr.attr,
        &alloc_calls_attr.attr,
        &free_calls_attr.attr,
#ifdef CONFIG_ZONE_DMA
        &cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
        &remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
        &alloc_fastpath_attr.attr,
        &alloc_slowpath_attr.attr,
        &free_fastpath_attr.attr,
        &free_slowpath_attr.attr,
        &free_frozen_attr.attr,
        &free_add_partial_attr.attr,
        &free_remove_partial_attr.attr,
        &alloc_from_partial_attr.attr,
        &alloc_slab_attr.attr,
        &alloc_refill_attr.attr,
        &free_slab_attr.attr,
        &cpuslab_flush_attr.attr,
        &deactivate_full_attr.attr,
        &deactivate_empty_attr.attr,
        &deactivate_to_head_attr.attr,
        &deactivate_to_tail_attr.attr,
        &deactivate_remote_frees_attr.attr,
        &order_fallback_attr.attr,
#endif
        NULL
};
4184
/* Attribute group that sysfs_slab_add() creates on every cache kobject */
static struct attribute_group slab_attr_group = {
        .attrs = slab_attrs,
};
4188
4189static ssize_t slab_attr_show(struct kobject *kobj,
4190                                struct attribute *attr,
4191                                char *buf)
4192{
4193        struct slab_attribute *attribute;
4194        struct kmem_cache *s;
4195        int err;
4196
4197        attribute = to_slab_attr(attr);
4198        s = to_slab(kobj);
4199
4200        if (!attribute->show)
4201                return -EIO;
4202
4203        err = attribute->show(s, buf);
4204
4205        return err;
4206}
4207
4208static ssize_t slab_attr_store(struct kobject *kobj,
4209                                struct attribute *attr,
4210                                const char *buf, size_t len)
4211{
4212        struct slab_attribute *attribute;
4213        struct kmem_cache *s;
4214        int err;
4215
4216        attribute = to_slab_attr(attr);
4217        s = to_slab(kobj);
4218
4219        if (!attribute->store)
4220                return -EIO;
4221
4222        err = attribute->store(s, buf, len);
4223
4224        return err;
4225}
4226
/* kobject release callback: free the kmem_cache that embeds @kobj */
static void kmem_cache_release(struct kobject *kobj)
{
        struct kmem_cache *cache;

        cache = to_slab(kobj);
        kfree(cache);
}
4233
/* Route sysfs reads and writes to the per attribute callbacks */
static struct sysfs_ops slab_sysfs_ops = {
        .show = slab_attr_show,
        .store = slab_attr_store,
};
4238
/*
 * kobject type of a slab cache. The release callback frees the
 * kmem_cache itself (see kmem_cache_release()).
 */
static struct kobj_type slab_ktype = {
        .sysfs_ops = &slab_sysfs_ops,
        .release = kmem_cache_release
};
4243
4244static int uevent_filter(struct kset *kset, struct kobject *kobj)
4245{
4246        struct kobj_type *ktype = get_ktype(kobj);
4247
4248        if (ktype == &slab_ktype)
4249                return 1;
4250        return 0;
4251}
4252
/* uevent operations handed to kset_create_and_add() for the slab kset */
static struct kset_uevent_ops slab_uevent_ops = {
        .filter = uevent_filter,
};
4256
/* The "slab" kset; created by slab_sysfs_init() */
static struct kset *slab_kset;

/* Size in bytes of the buffer allocated by create_unique_id() */
#define ID_STR_LENGTH 64
4260
4261/* Create a unique string id for a slab cache:
4262 *
4263 * Format       :[flags-]size
4264 */
4265static char *create_unique_id(struct kmem_cache *s)
4266{
4267        char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
4268        char *p = name;
4269
4270        BUG_ON(!name);
4271
4272        *p++ = ':';
4273        /*
4274         * First flags affecting slabcache operations. We will only
4275         * get here for aliasable slabs so we do not need to support
4276         * too many flags. The flags here must cover all flags that
4277         * are matched during merging to guarantee that the id is
4278         * unique.
4279         */
4280        if (s->flags & SLAB_CACHE_DMA)
4281                *p++ = 'd';
4282        if (s->flags & SLAB_RECLAIM_ACCOUNT)
4283                *p++ = 'a';
4284        if (s->flags & SLAB_DEBUG_FREE)
4285                *p++ = 'F';
4286        if (p != name + 1)
4287                *p++ = '-';
4288        p += sprintf(p, "%07d", s->size);
4289        BUG_ON(p > name + ID_STR_LENGTH - 1);
4290        return name;
4291}
4292
4293static int sysfs_slab_add(struct kmem_cache *s)
4294{
4295        int err;
4296        const char *name;
4297        int unmergeable;
4298
4299        if (slab_state < SYSFS)
4300                /* Defer until later */
4301                return 0;
4302
4303        unmergeable = slab_unmergeable(s);
4304        if (unmergeable) {
4305                /*
4306                 * Slabcache can never be merged so we can use the name proper.
4307                 * This is typically the case for debug situations. In that
4308                 * case we can catch duplicate names easily.
4309                 */
4310                sysfs_remove_link(&slab_kset->kobj, s->name);
4311                name = s->name;
4312        } else {
4313                /*
4314                 * Create a unique name for the slab as a target
4315                 * for the symlinks.
4316                 */
4317                name = create_unique_id(s);
4318        }
4319
4320        s->kobj.kset = slab_kset;
4321        err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
4322        if (err) {
4323                kobject_put(&s->kobj);
4324                return err;
4325        }
4326
4327        err = sysfs_create_group(&s->kobj, &slab_attr_group);
4328        if (err)
4329                return err;
4330        kobject_uevent(&s->kobj, KOBJ_ADD);
4331        if (!unmergeable) {
4332                /* Setup first alias */
4333                sysfs_slab_alias(s, s->name);
4334                kfree(name);
4335        }
4336        return 0;
4337}
4338
4339static void sysfs_slab_remove(struct kmem_cache *s)
4340{
4341        kobject_uevent(&s->kobj, KOBJ_REMOVE);
4342        kobject_del(&s->kobj);
4343        kobject_put(&s->kobj);
4344}
4345
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
        struct kmem_cache *s;           /* cache the alias refers to */
        const char *name;               /* alias name (pointer is stored, not copied) */
        struct saved_alias *next;       /* next buffered alias */
};

/* Head of the buffered aliases; drained by slab_sysfs_init() */
static struct saved_alias *alias_list;
4357
4358static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
4359{
4360        struct saved_alias *al;
4361
4362        if (slab_state == SYSFS) {
4363                /*
4364                 * If we have a leftover link then remove it.
4365                 */
4366                sysfs_remove_link(&slab_kset->kobj, name);
4367                return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
4368        }
4369
4370        al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
4371        if (!al)
4372                return -ENOMEM;
4373
4374        al->s = s;
4375        al->name = name;
4376        al->next = alias_list;
4377        alias_list = al;
4378        return 0;
4379}
4380
4381static int __init slab_sysfs_init(void)
4382{
4383        struct kmem_cache *s;
4384        int err;
4385
4386        slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4387        if (!slab_kset) {
4388                printk(KERN_ERR "Cannot register slab subsystem.\n");
4389                return -ENOSYS;
4390        }
4391
4392        slab_state = SYSFS;
4393
4394        list_for_each_entry(s, &slab_caches, list) {
4395                err = sysfs_slab_add(s);
4396                if (err)
4397                        printk(KERN_ERR "SLUB: Unable to add boot slab %s"
4398                                                " to sysfs\n", s->name);
4399        }
4400
4401        while (alias_list) {
4402                struct saved_alias *al = alias_list;
4403
4404                alias_list = alias_list->next;
4405                err = sysfs_slab_alias(al->s, al->name);
4406                if (err)
4407                        printk(KERN_ERR "SLUB: Unable to add boot slab alias"
4408                                        " %s to sysfs\n", s->name);
4409                kfree(al);
4410        }
4411
4412        resiliency_test();
4413        return 0;
4414}
4415
4416__initcall(slab_sysfs_init);
4417#endif
4418
4419/*
4420 * The /proc/slabinfo ABI
4421 */
4422#ifdef CONFIG_SLABINFO
/* Emit the version line and the column header of /proc/slabinfo */
static void print_slabinfo_header(struct seq_file *m)
{
        static const char * const parts[] = {
                "slabinfo - version: 2.1\n",
                "# name            <active_objs> <num_objs> <objsize> "
                "<objperslab> <pagesperslab>",
                " : tunables <limit> <batchcount> <sharedfactor>",
                " : slabdata <active_slabs> <num_slabs> <sharedavail>",
        };
        int i;

        for (i = 0; i < ARRAY_SIZE(parts); i++)
                seq_puts(m, parts[i]);
        seq_putc(m, '\n');
}
4432
4433static void *s_start(struct seq_file *m, loff_t *pos)
4434{
4435        loff_t n = *pos;
4436
4437        down_read(&slub_lock);
4438        if (!n)
4439                print_slabinfo_header(m);
4440
4441        return seq_list_start(&slab_caches, *pos);
4442}
4443
4444static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4445{
4446        return seq_list_next(p, &slab_caches, pos);
4447}
4448
/* seq_file stop: drop the read lock on slub_lock taken by s_start() */
static void s_stop(struct seq_file *m, void *p)
{
        up_read(&slub_lock);
}
4453
4454static int s_show(struct seq_file *m, void *p)
4455{
4456        unsigned long nr_partials = 0;
4457        unsigned long nr_slabs = 0;
4458        unsigned long nr_inuse = 0;
4459        unsigned long nr_objs = 0;
4460        unsigned long nr_free = 0;
4461        struct kmem_cache *s;
4462        int node;
4463
4464        s = list_entry(p, struct kmem_cache, list);
4465
4466        for_each_online_node(node) {
4467                struct kmem_cache_node *n = get_node(s, node);
4468
4469                if (!n)
4470                        continue;
4471
4472                nr_partials += n->nr_partial;
4473                nr_slabs += atomic_long_read(&n->nr_slabs);
4474                nr_objs += atomic_long_read(&n->total_objects);
4475                nr_free += count_partial(n, count_free);
4476        }
4477
4478        nr_inuse = nr_objs - nr_free;
4479
4480        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
4481                   nr_objs, s->size, oo_objects(s->oo),
4482                   (1 << oo_order(s->oo)));
4483        seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
4484        seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
4485                   0UL);
4486        seq_putc(m, '\n');
4487        return 0;
4488}
4489
/* seq_file iterator over slab_caches used for /proc/slabinfo */
static const struct seq_operations slabinfo_op = {
        .start = s_start,
        .next = s_next,
        .stop = s_stop,
        .show = s_show,
};
4496
4497static int slabinfo_open(struct inode *inode, struct file *file)
4498{
4499        return seq_open(file, &slabinfo_op);
4500}
4501
/* File operations of /proc/slabinfo; everything but open is generic seq_file */
static const struct file_operations proc_slabinfo_operations = {
        .open           = slabinfo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};
4508
4509static int __init slab_proc_init(void)
4510{
4511        proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4512        return 0;
4513}
4514module_init(slab_proc_init);
4515#endif /* CONFIG_SLABINFO */
4516
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.