linux/mm/slab.c
<<
>>
Prefs
   1/*
   2 * linux/mm/slab.c
   3 * Written by Mark Hemment, 1996/97.
   4 * (markhe@nextd.demon.co.uk)
   5 *
   6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7 *
   8 * Major cleanup, different bufctl logic, per-cpu arrays
   9 *      (c) 2000 Manfred Spraul
  10 *
  11 * Cleanup, make the head arrays unconditional, preparation for NUMA
  12 *      (c) 2002 Manfred Spraul
  13 *
  14 * An implementation of the Slab Allocator as described in outline in;
  15 *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16 *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17 * or with a little more detail in;
  18 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19 *      Jeff Bonwick (Sun Microsystems).
  20 *      Presented at: USENIX Summer 1994 Technical Conference
  21 *
  22 * The memory is organized in caches, one cache for each object type.
  23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
  24 * Each cache consists of many slabs (they are small (usually one
  25 * page long) and always contiguous), and each slab contains multiple
  26 * initialized objects.
  27 *
  28 * This means that your constructor is used only for newly allocated
  29 * slabs and you must pass objects with the same initializations to
  30 * kmem_cache_free.
  31 *
  32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
  33 * normal). If you need a special memory type, then you must create a new
  34 * cache for that memory type.
  35 *
  36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37 *   full slabs with 0 free objects
  38 *   partial slabs
  39 *   empty slabs with no allocated objects
  40 *
  41 * If partial slabs exist, then new allocations come from these slabs,
  42 * otherwise from empty slabs or new slabs are allocated.
  43 *
  44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46 *
  47 * Each cache has a short per-cpu head array, most allocs
  48 * and frees go into that array, and if that array overflows, then 1/2
  49 * of the entries in the array are given back into the global cache.
  50 * The head array is strictly LIFO and should improve the cache hit rates.
  51 * On SMP, it additionally reduces the spinlock operations.
  52 *
  53 * The c_cpuarray may not be read with enabled local interrupts -
  54 * it's changed with a smp_call_function().
  55 *
  56 * SMP synchronization:
  57 *  constructors and destructors are called without any locking.
  58 *  Several members in struct kmem_cache and struct slab never change, they
  59 *      are accessed without any locking.
  60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61 *      and local interrupts are disabled so slab code is preempt-safe.
  62 *  The non-constant members are protected with a per-cache irq spinlock.
  63 *
  64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65 * in 2000 - many ideas in the current implementation are derived from
  66 * his patch.
  67 *
  68 * Further notes from the original documentation:
  69 *
  70 * 11 April '97.  Started multi-threading - markhe
  71 *      The global cache-chain is protected by the mutex 'cache_chain_mutex'.
  72 *      The sem is only needed when accessing/extending the cache-chain, which
  73 *      can never happen inside an interrupt (kmem_cache_create(),
  74 *      kmem_cache_shrink() and kmem_cache_reap()).
  75 *
  76 *      At present, each engine can be growing a cache.  This should be blocked.
  77 *
  78 * 15 March 2005. NUMA slab allocator.
  79 *      Shai Fultheim <shai@scalex86.org>.
  80 *      Shobhit Dayal <shobhit@calsoftinc.com>
  81 *      Alok N Kataria <alokk@calsoftinc.com>
  82 *      Christoph Lameter <christoph@lameter.com>
  83 *
  84 *      Modified the slab allocator to be node aware on NUMA systems.
  85 *      Each node has its own list of partial, free and full slabs.
  86 *      All object allocations for a node occur from node specific slab lists.
  87 */
  88
  89#include        <linux/slab.h>
  90#include        <linux/mm.h>
  91#include        <linux/poison.h>
  92#include        <linux/swap.h>
  93#include        <linux/cache.h>
  94#include        <linux/interrupt.h>
  95#include        <linux/init.h>
  96#include        <linux/compiler.h>
  97#include        <linux/cpuset.h>
  98#include        <linux/seq_file.h>
  99#include        <linux/notifier.h>
 100#include        <linux/kallsyms.h>
 101#include        <linux/cpu.h>
 102#include        <linux/sysctl.h>
 103#include        <linux/module.h>
 104#include        <linux/rcupdate.h>
 105#include        <linux/string.h>
 106#include        <linux/uaccess.h>
 107#include        <linux/nodemask.h>
 108#include        <linux/mempolicy.h>
 109#include        <linux/mutex.h>
 110#include        <linux/fault-inject.h>
 111#include        <linux/rtmutex.h>
 112#include        <linux/reciprocal_div.h>
 113
 114#include        <asm/cacheflush.h>
 115#include        <asm/tlbflush.h>
 116#include        <asm/page.h>
 117
 /*
  * Build-time switches; all three follow CONFIG_DEBUG_SLAB.
  *
  * DEBUG        - 1: kmem_cache_create() honours SLAB_RED_ZONE & SLAB_POISON.
  *                0: faster, smaller code (especially in the critical paths).
  *
  * STATS        - 1: collect stats for /proc/slabinfo.
  *                0: faster, smaller code (especially in the critical paths).
  *
  * FORCED_DEBUG - 1: enables SLAB_RED_ZONE and SLAB_POISON (if possible).
  */

 #if defined(CONFIG_DEBUG_SLAB)
 # define DEBUG                  1
 # define STATS                  1
 # define FORCED_DEBUG           1
 #else
 # define DEBUG                  0
 # define STATS                  0
 # define FORCED_DEBUG           0
 #endif
 137
 /* Shouldn't this be in a header file somewhere? */
 #define BYTES_PER_WORD          (sizeof(void *))
 #define REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))

 /* Fall back to L1_CACHE_BYTES when nothing else provides cache_line_size(). */
 #if !defined(cache_line_size)
 # define cache_line_size()      L1_CACHE_BYTES
 #endif
 145
 #if !defined(ARCH_KMALLOC_MINALIGN)
 /*
  * Minimum alignment enforced for the kmalloc caches.
  *
  * The kmalloc caches are normally cache_line_size() aligned; with DEBUG and
  * FORCED_DEBUG enabled they are only BYTES_PER_WORD aligned.  Architectures
  * that DMA into kmalloc memory and need a guaranteed alignment larger than
  * that of a 64-bit integer can override ARCH_KMALLOC_MINALIGN.
  * Note that increasing this value may disable some debug features.
  */
 # define ARCH_KMALLOC_MINALIGN  __alignof__(unsigned long long)
 #endif
 158
 #if !defined(ARCH_SLAB_MINALIGN)
 /*
  * Minimum alignment enforced for all caches.
  *
  * Meant for architectures that take misalignment faults even on
  * BYTES_PER_WORD aligned buffers.  Includes ARCH_KMALLOC_MINALIGN.
  * If possible, do not set this together with CONFIG_DEBUG_SLAB: it
  * disables some debug features.
  */
 # define ARCH_SLAB_MINALIGN     0
 #endif
 169
 /* Default to SLAB_HWCACHE_ALIGN unless ARCH_KMALLOC_FLAGS is already defined. */
 #if !defined(ARCH_KMALLOC_FLAGS)
 # define ARCH_KMALLOC_FLAGS     SLAB_HWCACHE_ALIGN
 #endif
 173
 /*
  * Legal flag mask for kmem_cache_create().  The DEBUG variant additionally
  * admits SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER.
  */
 #if DEBUG
 # define CREATE_MASK    (SLAB_RED_ZONE | SLAB_POISON | \
                          SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
                          SLAB_STORE_USER | \
                          SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
                          SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #else
 # define CREATE_MASK    (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
                          SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
                          SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #endif
 188
 189/*
 190 * kmem_bufctl_t:
 191 *
 192 * Bufctl's are used for linking objs within a slab
 193 * linked offsets.
 194 *
 195 * This implementation relies on "struct page" for locating the cache &
 196 * slab an object belongs to.
 197 * This allows the bufctl structure to be small (one int), but limits
 198 * the number of objects a slab (not a cache) can contain when off-slab
 199 * bufctls are used. The limit is the size of the largest general cache
 200 * that does not use off-slab slabs.
 201 * For 32bit archs with 4 kB pages, this is 56.
 202 * This is not serious, as it is only for large objects, when it is unwise
 203 * to have too many per slab.
 204 * Note: This limit can be raised by introducing a general cache whose size
 205 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 206 */
 207
 /*
  * Link type for objects within a slab, plus the values reserved at the very
  * top of its range.  SLAB_LIMIT is the largest value below those markers.
  */
 typedef unsigned int kmem_bufctl_t;
 #define BUFCTL_END      ((kmem_bufctl_t)(~0U))
 #define BUFCTL_FREE     ((kmem_bufctl_t)(~0U) - 1)
 #define BUFCTL_ACTIVE   ((kmem_bufctl_t)(~0U) - 2)
 #define SLAB_LIMIT      ((kmem_bufctl_t)(~0U) - 3)
 213
 214/*
 215 * struct slab
 216 *
 217 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 218 * for a slab, or allocated from a general cache.
 219 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 220 */
 221struct slab {
 222        struct list_head list;  /* links the slab into one of the three lists */
 223        unsigned long colouroff;        /* NOTE(review): presumably the colour offset applied to s_mem - confirm */
 224        void *s_mem;            /* address of object 0, including colour offset */
 225        unsigned int inuse;     /* num of objs active in slab */
 226        kmem_bufctl_t free;     /* NOTE(review): presumably index of the first free obj - confirm */
 227        unsigned short nodeid;  /* NOTE(review): presumably the NUMA node owning the slab - confirm */
 228};
 229
 230/*
 231 * struct slab_rcu
 232 *
 233 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 234 * arrange for kmem_freepages to be called via RCU.  This is useful if
 235 * we need to approach a kernel structure obliquely, from its address
 236 * obtained without the usual locking.  We can lock the structure to
 237 * stabilize it and check it's still at the given address, only if we
 238 * can be sure that the memory has not been meanwhile reused for some
 239 * other kind of object (which our subsystem's lock might corrupt).
 240 *
 241 * rcu_read_lock before reading the address, then rcu_read_unlock after
 242 * taking the spinlock within the structure expected at that address.
 243 *
 244 * We assume struct slab_rcu can overlay struct slab when destroying,
 245 * so it must not grow beyond the storage of a struct slab.
 246 */
 246struct slab_rcu {
 247        struct rcu_head head;   /* RCU callback linkage */
 248        struct kmem_cache *cachep;      /* NOTE(review): presumably the cache the pages belong to - confirm in slab_destroy */
 249        void *addr;             /* NOTE(review): presumably the page address handed to kmem_freepages - confirm */
 250};
 251
 252/*
 253 * struct array_cache
 254 *
 255 * Purpose:
 256 * - LIFO ordering, to hand out cache-warm objects from _alloc
 257 * - reduce the number of linked list operations
 258 * - reduce spinlock operations
 259 *
 260 * The limit is stored in the per-cpu structure to reduce the data cache
 261 * footprint.
 262 *
 263 * initarray_cache and initarray_generic statically initialise the first
 264 * four members to avail=0, limit=BOOT_CPUCACHE_ENTRIES, batchcount=1,
 265 * touched=0.
 266 */
 264struct array_cache {
 265        unsigned int avail;     /* NOTE(review): presumably number of pointers currently held in entry[] - confirm */
 266        unsigned int limit;     /* NOTE(review): presumably capacity of entry[] - confirm */
 267        unsigned int batchcount;        /* NOTE(review): presumably objs moved per refill/flush - confirm */
 268        unsigned int touched;   /* NOTE(review): presumably a recently-used marker for reaping - confirm */
 269        spinlock_t lock;
 270        void *entry[0]; /*
 271                         * Must have this definition in here for the proper
 272                         * alignment of array_cache. Also simplifies accessing
 273                         * the entries.
 274                         * [0] is for gcc 2.95. It should really be [].
 275                         */
 276};
 277
 278/*
 279 * bootstrap: The caches do not work without cpuarrays anymore, but the
 280 * cpuarrays are allocated from the generic caches...
 281 *
 282 * struct arraycache_init pairs an array_cache header with a fixed
 283 * BOOT_CPUCACHE_ENTRIES-slot pointer array declared right after it.
 284 */
 282#define BOOT_CPUCACHE_ENTRIES   1
 283struct arraycache_init {
 284        struct array_cache cache;       /* header; ends in the zero-length entry[] */
 285        void *entries[BOOT_CPUCACHE_ENTRIES];   /* NOTE(review): presumably the backing store reached through cache.entry[] - confirm */
 286};
 287
 288/*
 289 * The slab lists for all objects.
 290 * kmem_list3_init() resets every member except free_limit and next_reap.
 291 */
 291struct kmem_list3 {
 292        struct list_head slabs_partial; /* partial list first, better asm code */
 293        struct list_head slabs_full;
 294        struct list_head slabs_free;
 295        unsigned long free_objects;     /* NOTE(review): presumably count of free objs on these lists - confirm */
 296        unsigned int free_limit;        /* NOTE(review): presumably cap on free_objects before slabs are released - confirm */
 297        unsigned int colour_next;       /* Per-node cache coloring */
 298        spinlock_t list_lock;
 299        struct array_cache *shared;     /* shared per node */
 300        struct array_cache **alien;     /* on other nodes */
 301        unsigned long next_reap;        /* updated without locking */
 302        int free_touched;               /* updated without locking */
 303};
 304
 305/*
 306 * Need this for bootstrapping a per node allocator.
 307 */
 308#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
 309struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 310#define CACHE_CACHE 0
 311#define SIZE_AC 1
 312#define SIZE_L3 (1 + MAX_NUMNODES)
 313
 /* Forward declarations of static helpers. */
 static void cache_reap(struct work_struct *unused);
 static int enable_cpucache(struct kmem_cache *cachep);
 static void free_block(struct kmem_cache *cachep, void **objpp,
                        int len, int node);
 static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3,
                           int tofree);
 320
 321/*
 322 * This function must be completely optimized away if a constant is passed to
 323 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 324 */
 325static __always_inline int index_of(const size_t size)
 326{
 327        extern void __bad_size(void);
 328
 329        if (__builtin_constant_p(size)) {
 330                int i = 0;
 331
 332#define CACHE(x) \
 333        if (size <=x) \
 334                return i; \
 335        else \
 336                i++;
 337#include "linux/kmalloc_sizes.h"
 338#undef CACHE
 339                __bad_size();
 340        } else
 341                __bad_size();
 342        return 0;
 343}
 344
 /* Starts out as 1.  NOTE(review): presumably cleared once early boot setup is over - confirm. */
 static int slab_early_init = 1;

 /* Size-class indices (see index_of()) for the two bootstrap structures. */
 #define INDEX_AC        index_of(sizeof(struct arraycache_init))
 #define INDEX_L3        index_of(sizeof(struct kmem_list3))
 349
 350static void kmem_list3_init(struct kmem_list3 *parent)
 351{
 352        INIT_LIST_HEAD(&parent->slabs_full);
 353        INIT_LIST_HEAD(&parent->slabs_partial);
 354        INIT_LIST_HEAD(&parent->slabs_free);
 355        parent->shared = NULL;
 356        parent->alien = NULL;
 357        parent->colour_next = 0;
 358        spin_lock_init(&parent->list_lock);
 359        parent->free_objects = 0;
 360        parent->free_touched = 0;
 361}
 362
 /*
  * MAKE_LIST - re-initialise @listp, then splice the list
  * @cachep->nodelists[@nodeid]->@slab onto it.
  *
  * @slab is a member name of struct kmem_list3 (slabs_full, slabs_partial or
  * slabs_free), not an expression.  @listp is evaluated twice, so pass a
  * side-effect free expression.  @cachep and @nodeid are parenthesised so
  * that arbitrary expressions expand correctly.
  */
 #define MAKE_LIST(cachep, listp, slab, nodeid)                          \
         do {                                                            \
                 INIT_LIST_HEAD(listp);                                  \
                 list_splice(&((cachep)->nodelists[(nodeid)]->slab), listp); \
         } while (0)

 /*
  * MAKE_ALL_LISTS - apply MAKE_LIST to all three slab lists of @ptr, taking
  * the source lists from node @nodeid of @cachep.
  */
 #define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
         do {                                                            \
         MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, (nodeid)); \
         MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, (nodeid)); \
         MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, (nodeid)); \
         } while (0)
 375
 376/*
 377 * struct kmem_cache
 378 *
 379 * manages a cache.
 380 */
 381
 382struct kmem_cache {
 383/* 1) per-cpu data, touched during every alloc/free */
 384        struct array_cache *array[NR_CPUS];     /* indexed by smp_processor_id() in cpu_cache_get() */
 385/* 2) Cache tunables. Protected by cache_chain_mutex */
 386        unsigned int batchcount;
 387        unsigned int limit;
 388        unsigned int shared;
 389
 390        unsigned int buffer_size;       /* per-object stride used by index_to_obj() */
 391        u32 reciprocal_buffer_size;     /* lets obj_to_index() use reciprocal_divide() instead of a divide */
 392/* 3) touched by every alloc & free from the backend */
 393
 394        unsigned int flags;             /* constant flags */
 395        unsigned int num;               /* # of objs per slab */
 396
 397/* 4) cache_grow/shrink */
 398        /* order of pgs per slab (2^n) */
 399        unsigned int gfporder;
 400
 401        /* force GFP flags, e.g. GFP_DMA */
 402        gfp_t gfpflags;
 403
 404        size_t colour;                  /* cache colouring range */
 405        unsigned int colour_off;        /* colour offset */
 406        struct kmem_cache *slabp_cache; /* NOTE(review): presumably the cache supplying off-slab struct slab - confirm */
 407        unsigned int slab_size;
 408        unsigned int dflags;            /* dynamic flags */
 409
 410        /* constructor func */
 411        void (*ctor) (void *, struct kmem_cache *, unsigned long);
 412
 413/* 5) cache creation/removal */
 414        const char *name;               /* printed by __slab_error() */
 415        struct list_head next;          /* NOTE(review): presumably links the cache into cache_chain - confirm */
 416
 417/* 6) statistics */
 418#if STATS
 419        unsigned long num_active;
 420        unsigned long num_allocations;
 421        unsigned long high_mark;
 422        unsigned long grown;
 423        unsigned long reaped;
 424        unsigned long errors;
 425        unsigned long max_freeable;
 426        unsigned long node_allocs;
 427        unsigned long node_frees;
 428        unsigned long node_overflow;
 429        atomic_t allochit;
 430        atomic_t allocmiss;
 431        atomic_t freehit;
 432        atomic_t freemiss;
 433#endif
 434#if DEBUG
 435        /*
 436         * If debugging is enabled, then the allocator can add additional
 437         * fields and/or padding to every object. buffer_size contains the total
 438         * object size including these internal fields, the following two
 439         * variables contain the offset to the user object and its size.
 440         */
 441        int obj_offset;
 442        int obj_size;
 443#endif
 444        /*
 445         * We put nodelists[] at the end of kmem_cache, because we want to size
 446         * this array to nr_node_ids slots instead of MAX_NUMNODES
 447         * (see kmem_cache_init())
 448         * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
 449         * is statically defined, so we reserve the max number of nodes.
 450         */
 451        struct kmem_list3 *nodelists[MAX_NUMNODES];
 452        /*
 453         * Do not add fields after nodelists[]
 454         */
 455};
 456
 /* Flag bit tested by OFF_SLAB() and cache_estimate(): management data lives off the slab. */
 #define CFLGS_OFF_SLAB          (0x80000000UL)
 #define OFF_SLAB(x)             ((x)->flags & CFLGS_OFF_SLAB)

 #define BATCHREFILL_LIMIT       16

 /*
  * Optimization question: fewer reaps means less probability for unnecessary
  * cpucache drain/refill cycles.
  *
  * OTOH the cpuarrays can contain lots of objects,
  * which could lock up otherwise freeable slabs.
  */
 #define REAPTIMEOUT_CPUC        (2 * HZ)
 #define REAPTIMEOUT_LIST3       (4 * HZ)
 470
 /*
  * Statistics hooks.  With STATS enabled they update the counters in
  * struct kmem_cache; otherwise every hook expands to an empty statement
  * and its arguments are not evaluated.
  *
  * In the STATS variants @x may be evaluated more than once, and
  * STATS_SET_FREEABLE evaluates @i twice, so pass side-effect free
  * expressions.  @i is parenthesised so that compound expressions compare
  * and assign as a whole.
  */
 #if STATS
 #define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 #define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 #define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 #define STATS_INC_GROWN(x)      ((x)->grown++)
 #define STATS_ADD_REAPED(x,y)   ((x)->reaped += (y))
 /* Raise high_mark to the current num_active when it has been exceeded. */
 #define STATS_SET_HIGH(x)                                               \
         do {                                                            \
                 if ((x)->num_active > (x)->high_mark)                   \
                         (x)->high_mark = (x)->num_active;               \
         } while (0)
 #define STATS_INC_ERR(x)        ((x)->errors++)
 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 #define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
 #define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
 /* Raise max_freeable to @i when @i is larger. */
 #define STATS_SET_FREEABLE(x, i)                                        \
         do {                                                            \
                 if ((x)->max_freeable < (i))                            \
                         (x)->max_freeable = (i);                        \
         } while (0)
 #define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 #define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 #define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 #define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 #else
 #define STATS_INC_ACTIVE(x)     do { } while (0)
 #define STATS_DEC_ACTIVE(x)     do { } while (0)
 #define STATS_INC_ALLOCED(x)    do { } while (0)
 #define STATS_INC_GROWN(x)      do { } while (0)
 #define STATS_ADD_REAPED(x,y)   do { } while (0)
 #define STATS_SET_HIGH(x)       do { } while (0)
 #define STATS_INC_ERR(x)        do { } while (0)
 #define STATS_INC_NODEALLOCS(x) do { } while (0)
 #define STATS_INC_NODEFREES(x)  do { } while (0)
 #define STATS_INC_ACOVERFLOW(x)   do { } while (0)
 #define STATS_SET_FREEABLE(x, i) do { } while (0)
 #define STATS_INC_ALLOCHIT(x)   do { } while (0)
 #define STATS_INC_ALLOCMISS(x)  do { } while (0)
 #define STATS_INC_FREEHIT(x)    do { } while (0)
 #define STATS_INC_FREEMISS(x)   do { } while (0)
 #endif
 512
 513#if DEBUG
 514
 515/*
 516 * memory layout of objects:
 517 * 0            : objp
 518 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 519 *              the end of an object is aligned with the end of the real
 520 *              allocation. Catches writes behind the end of the allocation.
 521 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 522 *              redzone word.
 523 * cachep->obj_offset: The real object.
 524 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 525 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 526 *                                      [BYTES_PER_WORD long]
 527 */
 528static int obj_offset(struct kmem_cache *cachep)
 529{
 530        return cachep->obj_offset;
 531}
 532
 533static int obj_size(struct kmem_cache *cachep)
 534{
 535        return cachep->obj_size;
 536}
 537
 538static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 539{
 540        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 541        return (unsigned long long*) (objp + obj_offset(cachep) -
 542                                      sizeof(unsigned long long));
 543}
 544
 545static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 546{
 547        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 548        if (cachep->flags & SLAB_STORE_USER)
 549                return (unsigned long long *)(objp + cachep->buffer_size -
 550                                              sizeof(unsigned long long) -
 551                                              REDZONE_ALIGN);
 552        return (unsigned long long *) (objp + cachep->buffer_size -
 553                                       sizeof(unsigned long long));
 554}
 555
 556static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 557{
 558        BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 559        return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
 560}
 561
 562#else
 563
 /*
  * Non-DEBUG stand-ins: objects carry no debug padding, so the offset is 0
  * and the object size equals buffer_size.  The redzone/userword accessors
  * BUG() if they are ever evaluated.
  * obj_size() parenthesises its argument so that pointer expressions such as
  * (caches + 1) expand correctly.
  */
 #define obj_offset(x)                   0
 #define obj_size(cachep)                ((cachep)->buffer_size)
 #define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 #define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 #define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 569
 570#endif
 571
 /*
  * Do not go above this order unless 0 objects fit into the slab.
  * The limit starts out at the LO value.
  */
 #define BREAK_GFP_ORDER_LO      0
 #define BREAK_GFP_ORDER_HI      1
 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 578
 579/*
 580 * Functions for storing/retrieving the cachep and or slab from the page
 581 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 582 * these are used to find the cache which an obj belongs to.
 583 */
 584static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 585{
 586        page->lru.next = (struct list_head *)cache;
 587}
 588
 589static inline struct kmem_cache *page_get_cache(struct page *page)
 590{
 591        page = compound_head(page);
 592        BUG_ON(!PageSlab(page));
 593        return (struct kmem_cache *)page->lru.next;
 594}
 595
 596static inline void page_set_slab(struct page *page, struct slab *slab)
 597{
 598        page->lru.prev = (struct list_head *)slab;
 599}
 600
 601static inline struct slab *page_get_slab(struct page *page)
 602{
 603        BUG_ON(!PageSlab(page));
 604        return (struct slab *)page->lru.prev;
 605}
 606
 /* Cache owning @obj, found through the head page of its address. */
 static inline struct kmem_cache *virt_to_cache(const void *obj)
 {
         return page_get_cache(virt_to_head_page(obj));
 }
 612
 /* Slab owning @obj, found through the head page of its address. */
 static inline struct slab *virt_to_slab(const void *obj)
 {
         return page_get_slab(virt_to_head_page(obj));
 }
 618
 619static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 620                                 unsigned int idx)
 621{
 622        return slab->s_mem + cache->buffer_size * idx;
 623}
 624
 625/*
 626 * We want to avoid an expensive divide : (offset / cache->buffer_size)
 627 *   Using the fact that buffer_size is a constant for a particular cache,
 628 *   we can replace (offset / cache->buffer_size) by
 629 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 630 */
 631static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 632                                        const struct slab *slab, void *obj)
 633{
 634        u32 offset = (obj - slab->s_mem);
 635        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 636}
 637
 638/*
 639 * These are the default caches for kmalloc. Custom caches can have other sizes.
 640 * One entry is generated per CACHE(x) line of <linux/kmalloc_sizes.h>, in
 641 * the order the header lists them.
 642 */
 641struct cache_sizes malloc_sizes[] = {
 642#define CACHE(x) { .cs_size = (x) },
 643#include <linux/kmalloc_sizes.h>
 644        CACHE(ULONG_MAX)        /* sentinel: table walks stop at cs_size == ULONG_MAX */
 645#undef CACHE
 646};
 647EXPORT_SYMBOL(malloc_sizes);
 648
 649/* Must match cache_sizes above. Out of line to keep cache footprint low. */
 650struct cache_names {
 651        char *name;     /* "size-<x>" */
 652        char *name_dma; /* "size-<x>(DMA)" */
 653};
 654
 655static struct cache_names __initdata cache_names[] = {
 656#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 657#include <linux/kmalloc_sizes.h>
 658        {NULL,} /* terminating entry, lines up with the ULONG_MAX entry of malloc_sizes[] */
 659#undef CACHE
 660};
 661
 662static struct arraycache_init initarray_cache __initdata =
 663    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 664static struct arraycache_init initarray_generic =
 665    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 666
 667/* internal cache of cache description objs */
 668static struct kmem_cache cache_cache = {
 669        .batchcount = 1,
 670        .limit = BOOT_CPUCACHE_ENTRIES,
 671        .shared = 1,
 672        .buffer_size = sizeof(struct kmem_cache),
 673        .name = "kmem_cache",
 674};
 675
 /* Marker value compared against an alien-cache pointer in init_lock_keys(). */
 #define BAD_ALIEN_MAGIC         0x01020304UL
 677
 #ifdef CONFIG_LOCKDEP

 /*
  * Slab sometimes uses the kmalloc slabs to store the slab headers
  * for other slabs "off slab".
  * The locking for this is tricky in that it nests within the locks
  * of all other slabs in a few places; to deal with this special
  * locking we put on-slab caches into a separate lock-class.
  *
  * We set lock class for alien array caches which are up during init.
  * The lock annotation will be lost if all cpus of a node go down and
  * then come back up during hotplug.
  */
 static struct lock_class_key on_slab_l3_key;
 static struct lock_class_key on_slab_alc_key;

 /*
  * Walk every malloc_sizes[] entry up to the ULONG_MAX sentinel and, for each
  * node of each on-slab cache, assign the lock classes above to the list
  * lock and to the locks of any alien array caches.
  */
 static inline void init_lock_keys(void)
 {
         struct cache_sizes *s;
         int q;

         for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
                 for_each_node(q) {
                         struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
                         struct array_cache **alc;
                         int r;

                         /* Nothing to annotate: no lists here, or off-slab cache. */
                         if (!l3 || OFF_SLAB(s->cs_cachep))
                                 continue;
                         lockdep_set_class(&l3->list_lock, &on_slab_l3_key);

                         /*
                          * FIXME: This check for BAD_ALIEN_MAGIC
                          * should go away when common slab code is taught to
                          * work even without alien caches.
                          * Currently, non NUMA code returns BAD_ALIEN_MAGIC
                          * for alloc_alien_cache,
                          */
                         alc = l3->alien;
                         if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
                                 continue;
                         for_each_node(r) {
                                 if (alc[r])
                                         lockdep_set_class(&alc[r]->lock,
                                                           &on_slab_alc_key);
                         }
                 }
         }
 }
 #else
 /* Without CONFIG_LOCKDEP there is nothing to annotate. */
 static inline void init_lock_keys(void)
 {
 }
 #endif
 732
 733/*
 734 * 1. Guard access to the cache-chain.
 735 * 2. Protect sanity of cpu_online_map against cpu hotplug events
 736 */
 737static DEFINE_MUTEX(cache_chain_mutex);
 738static struct list_head cache_chain;
 739
 /*
  * chicken and egg problem: delay the per-cpu array allocation
  * until the general caches are up.  g_cpucache_up starts out as NONE;
  * slab_is_available() reports true once it reaches FULL.
  */
 static enum {
         NONE = 0,
         PARTIAL_AC,
         PARTIAL_L3,
         FULL
 } g_cpucache_up;
 750
 751/*
 752 * used by boot code to determine if it can use slab based allocator
 753 */
 754int slab_is_available(void)
 755{
 756        return g_cpucache_up == FULL;
 757}
 758
 759static DEFINE_PER_CPU(struct delayed_work, reap_work);   /* one delayed work item per CPU; NOTE(review): presumably runs cache_reap() - confirm */
 760
 761static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 762{
 763        return cachep->array[smp_processor_id()];
 764}
 765
 766static inline struct kmem_cache *__find_general_cachep(size_t size,
 767                                                        gfp_t gfpflags)
 768{
 769        struct cache_sizes *csizep = malloc_sizes;
 770
 771#if DEBUG
 772        /* This happens if someone tries to call
 773         * kmem_cache_create(), or __kmalloc(), before
 774         * the generic caches are initialized.
 775         */
 776        BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
 777#endif
 778        while (size > csizep->cs_size)
 779                csizep++;
 780
 781        /*
 782         * Really subtle: The last entry with cs->cs_size==ULONG_MAX
 783         * has cs_{dma,}cachep==NULL. Thus no special case
 784         * for large kmalloc calls required.
 785         */
 786#ifdef CONFIG_ZONE_DMA
 787        if (unlikely(gfpflags & GFP_DMA))
 788                return csizep->cs_dmacachep;
 789#endif
 790        return csizep->cs_cachep;
 791}
 792
 793static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 794{
 795        return __find_general_cachep(size, gfpflags);
 796}
 797
 798static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 799{
 800        return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 801}
 802
 803/*
 804 * Calculate the number of objects and left-over bytes for a given buffer size.
 805 */
 806static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 807                           size_t align, int flags, size_t *left_over,
 808                           unsigned int *num)
 809{
 810        int nr_objs;
 811        size_t mgmt_size;
 812        size_t slab_size = PAGE_SIZE << gfporder;
 813
 814        /*
 815         * The slab management structure can be either off the slab or
 816         * on it. For the latter case, the memory allocated for a
 817         * slab is used for:
 818         *
 819         * - The struct slab
 820         * - One kmem_bufctl_t for each object
 821         * - Padding to respect alignment of @align
 822         * - @buffer_size bytes for each object
 823         *
 824         * If the slab management structure is off the slab, then the
 825         * alignment will already be calculated into the size. Because
 826         * the slabs are all pages aligned, the objects will be at the
 827         * correct alignment when allocated.
 828         */
 829        if (flags & CFLGS_OFF_SLAB) {
 830                mgmt_size = 0;
 831                nr_objs = slab_size / buffer_size;
 832
 833                if (nr_objs > SLAB_LIMIT)
 834                        nr_objs = SLAB_LIMIT;
 835        } else {
 836                /*
 837                 * Ignore padding for the initial guess. The padding
 838                 * is at most @align-1 bytes, and @buffer_size is at
 839                 * least @align. In the worst case, this result will
 840                 * be one greater than the number of objects that fit
 841                 * into the memory allocation when taking the padding
 842                 * into account.
 843                 */
 844                nr_objs = (slab_size - sizeof(struct slab)) /
 845                          (buffer_size + sizeof(kmem_bufctl_t));
 846
 847                /*
 848                 * This calculated number will be either the right
 849                 * amount, or one greater than what we want.
 850                 */
 851                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
 852                       > slab_size)
 853                        nr_objs--;
 854
 855                if (nr_objs > SLAB_LIMIT)
 856                        nr_objs = SLAB_LIMIT;
 857
 858                mgmt_size = slab_mgmt_size(nr_objs, align);
 859        }
 860        *num = nr_objs;
 861        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 862}
 863
 864#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 865
 866static void __slab_error(const char *function, struct kmem_cache *cachep,
 867                        char *msg)
 868{
 869        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 870               function, cachep->name, msg);
 871        dump_stack();
 872}
 873
 874/*
 875 * By default on NUMA we use alien caches to stage the freeing of
 876 * objects allocated from other nodes. This causes massive memory
 877 * inefficiencies when using fake NUMA setup to split memory into a
 878 * large number of small nodes, so it can be disabled on the command
 879 * line
 880  */
 881
 882static int use_alien_caches __read_mostly = 1;
 883static int __init noaliencache_setup(char *s)
 884{
 885        use_alien_caches = 0;
 886        return 1;
 887}
 888__setup("noaliencache", noaliencache_setup);
 889
 890#ifdef CONFIG_NUMA
 891/*
 892 * Special reaping functions for NUMA systems called from cache_reap().
 893 * These take care of doing round robin flushing of alien caches (containing
 894 * objects freed on different nodes from which they were allocated) and the
 895 * flushing of remote pcps by calling drain_node_pages.
 896 */
 897static DEFINE_PER_CPU(unsigned long, reap_node);
 898
 899static void init_reap_node(int cpu)
 900{
 901        int node;
 902
 903        node = next_node(cpu_to_node(cpu), node_online_map);
 904        if (node == MAX_NUMNODES)
 905                node = first_node(node_online_map);
 906
 907        per_cpu(reap_node, cpu) = node;
 908}
 909
 910static void next_reap_node(void)
 911{
 912        int node = __get_cpu_var(reap_node);
 913
 914        node = next_node(node, node_online_map);
 915        if (unlikely(node >= MAX_NUMNODES))
 916                node = first_node(node_online_map);
 917        __get_cpu_var(reap_node) = node;
 918}
 919
 920#else
 921#define init_reap_node(cpu) do { } while (0)
 922#define next_reap_node(void) do { } while (0)
 923#endif
 924
 925/*
 926 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 927 * via the workqueue/eventd.
 928 * Add the CPU number into the expiration time to minimize the possibility of
 929 * the CPUs getting into lockstep and contending for the global cache chain
 930 * lock.
 931 */
 932static void __devinit start_cpu_timer(int cpu)
 933{
 934        struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
 935
 936        /*
 937         * When this gets called from do_initcalls via cpucache_init(),
 938         * init_workqueues() has already run, so keventd will be setup
 939         * at that time.
 940         */
 941        if (keventd_up() && reap_work->work.func == NULL) {
 942                init_reap_node(cpu);
 943                INIT_DELAYED_WORK(reap_work, cache_reap);
 944                schedule_delayed_work_on(cpu, reap_work,
 945                                        __round_jiffies_relative(HZ, cpu));
 946        }
 947}
 948
 949static struct array_cache *alloc_arraycache(int node, int entries,
 950                                            int batchcount)
 951{
 952        int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 953        struct array_cache *nc = NULL;
 954
 955        nc = kmalloc_node(memsize, GFP_KERNEL, node);
 956        if (nc) {
 957                nc->avail = 0;
 958                nc->limit = entries;
 959                nc->batchcount = batchcount;
 960                nc->touched = 0;
 961                spin_lock_init(&nc->lock);
 962        }
 963        return nc;
 964}
 965
 966/*
 967 * Transfer objects in one arraycache to another.
 968 * Locking must be handled by the caller.
 969 *
 970 * Return the number of entries transferred.
 971 */
 972static int transfer_objects(struct array_cache *to,
 973                struct array_cache *from, unsigned int max)
 974{
 975        /* Figure out how many entries to transfer */
 976        int nr = min(min(from->avail, max), to->limit - to->avail);
 977
 978        if (!nr)
 979                return 0;
 980
 981        memcpy(to->entry + to->avail, from->entry + from->avail -nr,
 982                        sizeof(void *) *nr);
 983
 984        from->avail -= nr;
 985        to->avail += nr;
 986        to->touched = 1;
 987        return nr;
 988}
 989
 990#ifndef CONFIG_NUMA
 991
 992#define drain_alien_cache(cachep, alien) do { } while (0)
 993#define reap_alien(cachep, l3) do { } while (0)
 994
 995static inline struct array_cache **alloc_alien_cache(int node, int limit)
 996{
 997        return (struct array_cache **)BAD_ALIEN_MAGIC;
 998}
 999
1000static inline void free_alien_cache(struct array_cache **ac_ptr)
1001{
1002}
1003
1004static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1005{
1006        return 0;
1007}
1008
1009static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1010                gfp_t flags)
1011{
1012        return NULL;
1013}
1014
1015static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1016                 gfp_t flags, int nodeid)
1017{
1018        return NULL;
1019}
1020
1021#else   /* CONFIG_NUMA */
1022
1023static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1024static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1025
1026static struct array_cache **alloc_alien_cache(int node, int limit)
1027{
1028        struct array_cache **ac_ptr;
1029        int memsize = sizeof(void *) * nr_node_ids;
1030        int i;
1031
1032        if (limit > 1)
1033                limit = 12;
1034        ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1035        if (ac_ptr) {
1036                for_each_node(i) {
1037                        if (i == node || !node_online(i)) {
1038                                ac_ptr[i] = NULL;
1039                                continue;
1040                        }
1041                        ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1042                        if (!ac_ptr[i]) {
1043                                for (i--; i <= 0; i--)
1044                                        kfree(ac_ptr[i]);
1045                                kfree(ac_ptr);
1046                                return NULL;
1047                        }
1048                }
1049        }
1050        return ac_ptr;
1051}
1052
1053static void free_alien_cache(struct array_cache **ac_ptr)
1054{
1055        int i;
1056
1057        if (!ac_ptr)
1058                return;
1059        for_each_node(i)
1060            kfree(ac_ptr[i]);
1061        kfree(ac_ptr);
1062}
1063
1064static void __drain_alien_cache(struct kmem_cache *cachep,
1065                                struct array_cache *ac, int node)
1066{
1067        struct kmem_list3 *rl3 = cachep->nodelists[node];
1068
1069        if (ac->avail) {
1070                spin_lock(&rl3->list_lock);
1071                /*
1072                 * Stuff objects into the remote nodes shared array first.
1073                 * That way we could avoid the overhead of putting the objects
1074                 * into the free lists and getting them back later.
1075                 */
1076                if (rl3->shared)
1077                        transfer_objects(rl3->shared, ac, ac->limit);
1078
1079                free_block(cachep, ac->entry, ac->avail, node);
1080                ac->avail = 0;
1081                spin_unlock(&rl3->list_lock);
1082        }
1083}
1084
1085/*
1086 * Called from cache_reap() to regularly drain alien caches round robin.
1087 */
1088static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1089{
1090        int node = __get_cpu_var(reap_node);
1091
1092        if (l3->alien) {
1093                struct array_cache *ac = l3->alien[node];
1094
1095                if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1096                        __drain_alien_cache(cachep, ac, node);
1097                        spin_unlock_irq(&ac->lock);
1098                }
1099        }
1100}
1101
1102static void drain_alien_cache(struct kmem_cache *cachep,
1103                                struct array_cache **alien)
1104{
1105        int i = 0;
1106        struct array_cache *ac;
1107        unsigned long flags;
1108
1109        for_each_online_node(i) {
1110                ac = alien[i];
1111                if (ac) {
1112                        spin_lock_irqsave(&ac->lock, flags);
1113                        __drain_alien_cache(cachep, ac, i);
1114                        spin_unlock_irqrestore(&ac->lock, flags);
1115                }
1116        }
1117}
1118
1119static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1120{
1121        struct slab *slabp = virt_to_slab(objp);
1122        int nodeid = slabp->nodeid;
1123        struct kmem_list3 *l3;
1124        struct array_cache *alien = NULL;
1125        int node;
1126
1127        node = numa_node_id();
1128
1129        /*
1130         * Make sure we are not freeing a object from another node to the array
1131         * cache on this cpu.
1132         */
1133        if (likely(slabp->nodeid == node))
1134                return 0;
1135
1136        l3 = cachep->nodelists[node];
1137        STATS_INC_NODEFREES(cachep);
1138        if (l3->alien && l3->alien[nodeid]) {
1139                alien = l3->alien[nodeid];
1140                spin_lock(&alien->lock);
1141                if (unlikely(alien->avail == alien->limit)) {
1142                        STATS_INC_ACOVERFLOW(cachep);
1143                        __drain_alien_cache(cachep, alien, nodeid);
1144                }
1145                alien->entry[alien->avail++] = objp;
1146                spin_unlock(&alien->lock);
1147        } else {
1148                spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1149                free_block(cachep, &objp, 1, nodeid);
1150                spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1151        }
1152        return 1;
1153}
1154#endif
1155
1156static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1157                                    unsigned long action, void *hcpu)
1158{
1159        long cpu = (long)hcpu;
1160        struct kmem_cache *cachep;
1161        struct kmem_list3 *l3 = NULL;
1162        int node = cpu_to_node(cpu);
1163        int memsize = sizeof(struct kmem_list3);
1164
1165        switch (action) {
1166        case CPU_LOCK_ACQUIRE:
1167                mutex_lock(&cache_chain_mutex);
1168                break;
1169        case CPU_UP_PREPARE:
1170        case CPU_UP_PREPARE_FROZEN:
1171                /*
1172                 * We need to do this right in the beginning since
1173                 * alloc_arraycache's are going to use this list.
1174                 * kmalloc_node allows us to add the slab to the right
1175                 * kmem_list3 and not this cpu's kmem_list3
1176                 */
1177
1178                list_for_each_entry(cachep, &cache_chain, next) {
1179                        /*
1180                         * Set up the size64 kmemlist for cpu before we can
1181                         * begin anything. Make sure some other cpu on this
1182                         * node has not already allocated this
1183                         */
1184                        if (!cachep->nodelists[node]) {
1185                                l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1186                                if (!l3)
1187                                        goto bad;
1188                                kmem_list3_init(l3);
1189                                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1190                                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1191
1192                                /*
1193                                 * The l3s don't come and go as CPUs come and
1194                                 * go.  cache_chain_mutex is sufficient
1195                                 * protection here.
1196                                 */
1197                                cachep->nodelists[node] = l3;
1198                        }
1199
1200                        spin_lock_irq(&cachep->nodelists[node]->list_lock);
1201                        cachep->nodelists[node]->free_limit =
1202                                (1 + nr_cpus_node(node)) *
1203                                cachep->batchcount + cachep->num;
1204                        spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1205                }
1206
1207                /*
1208                 * Now we can go ahead with allocating the shared arrays and
1209                 * array caches
1210                 */
1211                list_for_each_entry(cachep, &cache_chain, next) {
1212                        struct array_cache *nc;
1213                        struct array_cache *shared = NULL;
1214                        struct array_cache **alien = NULL;
1215
1216                        nc = alloc_arraycache(node, cachep->limit,
1217                                                cachep->batchcount);
1218                        if (!nc)
1219                                goto bad;
1220                        if (cachep->shared) {
1221                                shared = alloc_arraycache(node,
1222                                        cachep->shared * cachep->batchcount,
1223                                        0xbaadf00d);
1224                                if (!shared)
1225                                        goto bad;
1226                        }
1227                        if (use_alien_caches) {
1228                                alien = alloc_alien_cache(node, cachep->limit);
1229                                if (!alien)
1230                                        goto bad;
1231                        }
1232                        cachep->array[cpu] = nc;
1233                        l3 = cachep->nodelists[node];
1234                        BUG_ON(!l3);
1235
1236                        spin_lock_irq(&l3->list_lock);
1237                        if (!l3->shared) {
1238                                /*
1239                                 * We are serialised from CPU_DEAD or
1240                                 * CPU_UP_CANCELLED by the cpucontrol lock
1241                                 */
1242                                l3->shared = shared;
1243                                shared = NULL;
1244                        }
1245#ifdef CONFIG_NUMA
1246                        if (!l3->alien) {
1247                                l3->alien = alien;
1248                                alien = NULL;
1249                        }
1250#endif
1251                        spin_unlock_irq(&l3->list_lock);
1252                        kfree(shared);
1253                        free_alien_cache(alien);
1254                }
1255                break;
1256        case CPU_ONLINE:
1257        case CPU_ONLINE_FROZEN:
1258                start_cpu_timer(cpu);
1259                break;
1260#ifdef CONFIG_HOTPLUG_CPU
1261        case CPU_DOWN_PREPARE:
1262        case CPU_DOWN_PREPARE_FROZEN:
1263                /*
1264                 * Shutdown cache reaper. Note that the cache_chain_mutex is
1265                 * held so that if cache_reap() is invoked it cannot do
1266                 * anything expensive but will only modify reap_work
1267                 * and reschedule the timer.
1268                */
1269                cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
1270                /* Now the cache_reaper is guaranteed to be not running. */
1271                per_cpu(reap_work, cpu).work.func = NULL;
1272                break;
1273        case CPU_DOWN_FAILED:
1274        case CPU_DOWN_FAILED_FROZEN:
1275                start_cpu_timer(cpu);
1276                break;
1277        case CPU_DEAD:
1278        case CPU_DEAD_FROZEN:
1279                /*
1280                 * Even if all the cpus of a node are down, we don't free the
1281                 * kmem_list3 of any cache. This to avoid a race between
1282                 * cpu_down, and a kmalloc allocation from another cpu for
1283                 * memory from the node of the cpu going down.  The list3
1284                 * structure is usually allocated from kmem_cache_create() and
1285                 * gets destroyed at kmem_cache_destroy().
1286                 */
1287                /* fall thru */
1288#endif
1289        case CPU_UP_CANCELED:
1290        case CPU_UP_CANCELED_FROZEN:
1291                list_for_each_entry(cachep, &cache_chain, next) {
1292                        struct array_cache *nc;
1293                        struct array_cache *shared;
1294                        struct array_cache **alien;
1295                        cpumask_t mask;
1296
1297                        mask = node_to_cpumask(node);
1298                        /* cpu is dead; no one can alloc from it. */
1299                        nc = cachep->array[cpu];
1300                        cachep->array[cpu] = NULL;
1301                        l3 = cachep->nodelists[node];
1302
1303                        if (!l3)
1304                                goto free_array_cache;
1305
1306                        spin_lock_irq(&l3->list_lock);
1307
1308                        /* Free limit for this kmem_list3 */
1309                        l3->free_limit -= cachep->batchcount;
1310                        if (nc)
1311                                free_block(cachep, nc->entry, nc->avail, node);
1312
1313                        if (!cpus_empty(mask)) {
1314                                spin_unlock_irq(&l3->list_lock);
1315                                goto free_array_cache;
1316                        }
1317
1318                        shared = l3->shared;
1319                        if (shared) {
1320                                free_block(cachep, shared->entry,
1321                                           shared->avail, node);
1322                                l3->shared = NULL;
1323                        }
1324
1325                        alien = l3->alien;
1326                        l3->alien = NULL;
1327
1328                        spin_unlock_irq(&l3->list_lock);
1329
1330                        kfree(shared);
1331                        if (alien) {
1332                                drain_alien_cache(cachep, alien);
1333                                free_alien_cache(alien);
1334                        }
1335free_array_cache:
1336                        kfree(nc);
1337                }
1338                /*
1339                 * In the previous loop, all the objects were freed to
1340                 * the respective cache's slabs,  now we can go ahead and
1341                 * shrink each nodelist to its limit.
1342                 */
1343                list_for_each_entry(cachep, &cache_chain, next) {
1344                        l3 = cachep->nodelists[node];
1345                        if (!l3)
1346                                continue;
1347                        drain_freelist(cachep, l3, l3->free_objects);
1348                }
1349                break;
1350        case CPU_LOCK_RELEASE:
1351                mutex_unlock(&cache_chain_mutex);
1352                break;
1353        }
1354        return NOTIFY_OK;
1355bad:
1356        return NOTIFY_BAD;
1357}
1358
1359static struct notifier_block __cpuinitdata cpucache_notifier = {
1360        &cpuup_callback, NULL, 0
1361};
1362
1363/*
1364 * swap the static kmem_list3 with kmalloced memory
1365 */
1366static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1367                        int nodeid)
1368{
1369        struct kmem_list3 *ptr;
1370
1371        ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1372        BUG_ON(!ptr);
1373
1374        local_irq_disable();
1375        memcpy(ptr, list, sizeof(struct kmem_list3));
1376        /*
1377         * Do not assume that spinlocks can be initialized via memcpy:
1378         */
1379        spin_lock_init(&ptr->list_lock);
1380
1381        MAKE_ALL_LISTS(cachep, ptr, nodeid);
1382        cachep->nodelists[nodeid] = ptr;
1383        local_irq_enable();
1384}
1385
1386/*
1387 * Initialisation.  Called after the page allocator have been initialised and
1388 * before smp_init().
1389 */
1390void __init kmem_cache_init(void)
1391{
1392        size_t left_over;
1393        struct cache_sizes *sizes;
1394        struct cache_names *names;
1395        int i;
1396        int order;
1397        int node;
1398
1399        if (num_possible_nodes() == 1)
1400                use_alien_caches = 0;
1401
1402        for (i = 0; i < NUM_INIT_LISTS; i++) {
1403                kmem_list3_init(&initkmem_list3[i]);
1404                if (i < MAX_NUMNODES)
1405                        cache_cache.nodelists[i] = NULL;
1406        }
1407
1408        /*
1409         * Fragmentation resistance on low memory - only use bigger
1410         * page orders on machines with more than 32MB of memory.
1411         */
1412        if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1413                slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1414
1415        /* Bootstrap is tricky, because several objects are allocated
1416         * from caches that do not exist yet:
1417         * 1) initialize the cache_cache cache: it contains the struct
1418         *    kmem_cache structures of all caches, except cache_cache itself:
1419         *    cache_cache is statically allocated.
1420         *    Initially an __init data area is used for the head array and the
1421         *    kmem_list3 structures, it's replaced with a kmalloc allocated
1422         *    array at the end of the bootstrap.
1423         * 2) Create the first kmalloc cache.
1424         *    The struct kmem_cache for the new cache is allocated normally.
1425         *    An __init data area is used for the head array.
1426         * 3) Create the remaining kmalloc caches, with minimally sized
1427         *    head arrays.
1428         * 4) Replace the __init data head arrays for cache_cache and the first
1429         *    kmalloc cache with kmalloc allocated arrays.
1430         * 5) Replace the __init data for kmem_list3 for cache_cache and
1431         *    the other cache's with kmalloc allocated memory.
1432         * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1433         */
1434
1435        node = numa_node_id();
1436
1437        /* 1) create the cache_cache */
1438        INIT_LIST_HEAD(&cache_chain);
1439        list_add(&cache_cache.next, &cache_chain);
1440        cache_cache.colour_off = cache_line_size();
1441        cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1442        cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1443
1444        /*
1445         * struct kmem_cache size depends on nr_node_ids, which
1446         * can be less than MAX_NUMNODES.
1447         */
1448        cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1449                                 nr_node_ids * sizeof(struct kmem_list3 *);
1450#if DEBUG
1451        cache_cache.obj_size = cache_cache.buffer_size;
1452#endif
1453        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1454                                        cache_line_size());
1455        cache_cache.reciprocal_buffer_size =
1456                reciprocal_value(cache_cache.buffer_size);
1457
1458        for (order = 0; order < MAX_ORDER; order++) {
1459                cache_estimate(order, cache_cache.buffer_size,
1460                        cache_line_size(), 0, &left_over, &cache_cache.num);
1461                if (cache_cache.num)
1462                        break;
1463        }
1464        BUG_ON(!cache_cache.num);
1465        cache_cache.gfporder = order;
1466        cache_cache.colour = left_over / cache_cache.colour_off;
1467        cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1468                                      sizeof(struct slab), cache_line_size());
1469
1470        /* 2+3) create the kmalloc caches */
1471        sizes = malloc_sizes;
1472        names = cache_names;
1473
1474        /*
1475         * Initialize the caches that provide memory for the array cache and the
1476         * kmem_list3 structures first.  Without this, further allocations will
1477         * bug.
1478         */
1479
1480        sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1481                                        sizes[INDEX_AC].cs_size,
1482                                        ARCH_KMALLOC_MINALIGN,
1483                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1484                                        NULL, NULL);
1485
1486        if (INDEX_AC != INDEX_L3) {
1487                sizes[INDEX_L3].cs_cachep =
1488                        kmem_cache_create(names[INDEX_L3].name,
1489                                sizes[INDEX_L3].cs_size,
1490                                ARCH_KMALLOC_MINALIGN,
1491                                ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1492                                NULL, NULL);
1493        }
1494
1495        slab_early_init = 0;
1496
1497        while (sizes->cs_size != ULONG_MAX) {
1498                /*
1499                 * For performance, all the general caches are L1 aligned.
1500                 * This should be particularly beneficial on SMP boxes, as it
1501                 * eliminates "false sharing".
1502                 * Note for systems short on memory removing the alignment will
1503                 * allow tighter packing of the smaller caches.
1504                 */
1505                if (!sizes->cs_cachep) {
1506                        sizes->cs_cachep = kmem_cache_create(names->name,
1507                                        sizes->cs_size,
1508                                        ARCH_KMALLOC_MINALIGN,
1509                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1510                                        NULL, NULL);
1511                }
1512#ifdef CONFIG_ZONE_DMA
1513                sizes->cs_dmacachep = kmem_cache_create(
1514                                        names->name_dma,
1515                                        sizes->cs_size,
1516                                        ARCH_KMALLOC_MINALIGN,
1517                                        ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1518                                                SLAB_PANIC,
1519                                        NULL, NULL);
1520#endif
1521                sizes++;
1522                names++;
1523        }
1524        /* 4) Replace the bootstrap head arrays */
1525        {
1526                struct array_cache *ptr;
1527
1528                ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1529
1530                local_irq_disable();
1531                BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1532                memcpy(ptr, cpu_cache_get(&cache_cache),
1533                       sizeof(struct arraycache_init));
1534                /*
1535                 * Do not assume that spinlocks can be initialized via memcpy:
1536                 */
1537                spin_lock_init(&ptr->lock);
1538
1539                cache_cache.array[smp_processor_id()] = ptr;
1540                local_irq_enable();
1541
1542                ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1543
1544                local_irq_disable();
1545                BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1546                       != &initarray_generic.cache);
1547                memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1548                       sizeof(struct arraycache_init));
1549                /*
1550                 * Do not assume that spinlocks can be initialized via memcpy:
1551                 */
1552                spin_lock_init(&ptr->lock);
1553
1554                malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1555                    ptr;
1556                local_irq_enable();
1557        }
1558        /* 5) Replace the bootstrap kmem_list3's */
1559        {
1560                int nid;
1561
1562                /* Replace the static kmem_list3 structures for the boot cpu */
1563                init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1564
1565                for_each_online_node(nid) {
1566                        init_list(malloc_sizes[INDEX_AC].cs_cachep,
1567                                  &initkmem_list3[SIZE_AC + nid], nid);
1568
1569                        if (INDEX_AC != INDEX_L3) {
1570                                init_list(malloc_sizes[INDEX_L3].cs_cachep,
1571                                          &initkmem_list3[SIZE_L3 + nid], nid);
1572                        }
1573                }
1574        }
1575
1576        /* 6) resize the head arrays to their final sizes */
1577        {
1578                struct kmem_cache *cachep;
1579                mutex_lock(&cache_chain_mutex);
1580                list_for_each_entry(cachep, &cache_chain, next)
1581                        if (enable_cpucache(cachep))
1582                                BUG();
1583                mutex_unlock(&cache_chain_mutex);
1584        }
1585
1586        /* Annotate slab for lockdep -- annotate the malloc caches */
1587        init_lock_keys();
1588
1589
1590        /* Done! */
1591        g_cpucache_up = FULL;
1592
1593        /*
1594         * Register a cpu startup notifier callback that initializes
1595         * cpu_cache_get for all new cpus
1596         */
1597        register_cpu_notifier(&cpucache_notifier);
1598
1599        /*
1600         * The reap timers are started later, with a module init call: That part
1601         * of the kernel is not yet operational.
1602         */
1603}
1604
1605static int __init cpucache_init(void)
1606{
1607        int cpu;
1608
1609        /*
1610         * Register the timers that return unneeded pages to the page allocator
1611         */
1612        for_each_online_cpu(cpu)
1613                start_cpu_timer(cpu);
1614        return 0;
1615}
1616__initcall(cpucache_init);
1617
1618/*
1619 * Interface to system's page allocator. No need to hold the cache-lock.
1620 *
1621 * If we requested dmaable memory, we will get it. Even if we
1622 * did not request dmaable memory, we might get it, but that
1623 * would be relatively rare and ignorable.
1624 */
1625static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1626{
1627        struct page *page;
1628        int nr_pages;
1629        int i;
1630
1631#ifndef CONFIG_MMU
1632        /*
1633         * Nommu uses slab's for process anonymous memory allocations, and thus
1634         * requires __GFP_COMP to properly refcount higher order allocations
1635         */
1636        flags |= __GFP_COMP;
1637#endif
1638
1639        flags |= cachep->gfpflags;
1640
1641        page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1642        if (!page)
1643                return NULL;
1644
1645        nr_pages = (1 << cachep->gfporder);
1646        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1647                add_zone_page_state(page_zone(page),
1648                        NR_SLAB_RECLAIMABLE, nr_pages);
1649        else
1650                add_zone_page_state(page_zone(page),
1651                        NR_SLAB_UNRECLAIMABLE, nr_pages);
1652        for (i = 0; i < nr_pages; i++)
1653                __SetPageSlab(page + i);
1654        return page_address(page);
1655}
1656
1657/*
1658 * Interface to system's page release.
1659 */
1660static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1661{
1662        unsigned long i = (1 << cachep->gfporder);
1663        struct page *page = virt_to_page(addr);
1664        const unsigned long nr_freed = i;
1665
1666        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1667                sub_zone_page_state(page_zone(page),
1668                                NR_SLAB_RECLAIMABLE, nr_freed);
1669        else
1670                sub_zone_page_state(page_zone(page),
1671                                NR_SLAB_UNRECLAIMABLE, nr_freed);
1672        while (i--) {
1673                BUG_ON(!PageSlab(page));
1674                __ClearPageSlab(page);
1675                page++;
1676        }
1677        if (current->reclaim_state)
1678                current->reclaim_state->reclaimed_slab += nr_freed;
1679        free_pages((unsigned long)addr, cachep->gfporder);
1680}
1681
1682static void kmem_rcu_free(struct rcu_head *head)
1683{
1684        struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1685        struct kmem_cache *cachep = slab_rcu->cachep;
1686
1687        kmem_freepages(cachep, slab_rcu->addr);
1688        if (OFF_SLAB(cachep))
1689                kmem_cache_free(cachep->slabp_cache, slab_rcu);
1690}
1691
1692#if DEBUG
1693
1694#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * store_stackinfo - record caller information inside an object
 * @cachep: cache the object belongs to
 * @addr: start of the object (before the cache's object offset is applied)
 * @caller: address to record as the caller
 *
 * Objects smaller than five words are left untouched.  Otherwise the object
 * payload is filled with: a start marker, @caller, the current processor
 * id, every word found on the stack (scanning upwards from @caller's own
 * stack slot) that is a kernel text address, and an end marker.  The scan
 * stops at the end of the stack or when only one word of room is left,
 * which is then used for the end marker.
 */
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
			    unsigned long caller)
{
	int remaining = obj_size(cachep);
	unsigned long *out;
	unsigned long *scan;

	out = (unsigned long *)((char *)addr + obj_offset(cachep));

	if (remaining < 5 * sizeof(unsigned long))
		return;

	out[0] = 0x12345678;
	out[1] = caller;
	out[2] = smp_processor_id();
	out += 3;
	remaining -= 3 * sizeof(unsigned long);

	for (scan = &caller; !kstack_end(scan); scan++) {
		unsigned long word = *scan;

		if (!kernel_text_address(word))
			continue;

		*out++ = word;
		remaining -= sizeof(unsigned long);
		/* keep the last word free for the end marker */
		if (remaining <= sizeof(unsigned long))
			break;
	}

	*out = 0x87654321;
}
1726#endif
1727
1728static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1729{
1730        int size = obj_size(cachep);
1731        addr = &((char *)addr)[obj_offset(cachep)];
1732
1733        memset(addr, val, size);
1734        *(unsigned char *)(addr + size - 1) = POISON_END;
1735}
1736
1737static void dump_line(char *data, int offset, int limit)
1738{
1739        int i;
1740        unsigned char error = 0;
1741        int bad_count = 0;
1742
1743        printk(KERN_ERR "%03x:", offset);
1744        for (i = 0; i < limit; i++) {
1745                if (data[offset + i] != POISON_FREE) {
1746                        error = data[offset + i];
1747                        bad_count++;
1748                }
1749                printk(" %02x", (unsigned char)data[offset + i]);
1750        }
1751        printk("\n");
1752
1753        if (bad_count == 1) {
1754                error ^= POISON_FREE;
1755                if (!(error & (error - 1))) {
1756                        printk(KERN_ERR "Single bit error detected. Probably "
1757                                        "bad RAM.\n");
1758#ifdef CONFIG_X86
1759                        printk(KERN_ERR "Run memtest86+ or a similar memory "
1760                                        "test tool.\n");
1761#else
1762                        printk(KERN_ERR "Run a memory test tool.\n");
1763#endif
1764                }
1765        }
1766}
1767#endif
1768
1769#if DEBUG
1770
1771static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1772{
1773        int i, size;
1774        char *realobj;
1775
1776        if (cachep->flags & SLAB_RED_ZONE) {
1777                printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1778                        *dbg_redzone1(cachep, objp),
1779                        *dbg_redzone2(cachep, objp));
1780        }
1781
1782        if (cachep->flags & SLAB_STORE_USER) {
1783                printk(KERN_ERR "Last user: [<%p>]",
1784                        *dbg_userword(cachep, objp));
1785                print_symbol("(%s)",
1786                                (unsigned long)*dbg_userword(cachep, objp));
1787                printk("\n");
1788        }
1789        realobj = (char *)objp + obj_offset(cachep);
1790        size = obj_size(cachep);
1791        for (i = 0; i < size && lines; i += 16, lines--) {
1792                int limit;
1793                limit = 16;
1794                if (i + limit > size)
1795                        limit = size - i;
1796                dump_line(realobj, i, limit);
1797        }
1798}
1799
1800static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1801{
1802        char *realobj;
1803        int size, i;
1804        int lines = 0;
1805
1806        realobj = (char *)objp + obj_offset(cachep);
1807        size = obj_size(cachep);
1808
1809        for (i = 0; i < size; i++) {
1810                char exp = POISON_FREE;
1811                if (i == size - 1)
1812                        exp = POISON_END;
1813                if (realobj[i] != exp) {
1814                        int limit;
1815                        /* Mismatch ! */
1816                        /* Print header */
1817                        if (lines == 0) {
1818                                printk(KERN_ERR
1819                                        "Slab corruption: %s start=%p, len=%d\n",
1820                                        cachep->name, realobj, size);
1821                                print_objinfo(cachep, objp, 0);
1822                        }
1823                        /* Hexdump the affected line */
1824                        i = (i / 16) * 16;
1825                        limit = 16;
1826                        if (i + limit > size)
1827                                limit = size - i;
1828                        dump_line(realobj, i, limit);
1829                        i += 16;
1830                        lines++;
1831                        /* Limit to 5 lines */
1832                        if (lines > 5)
1833                                break;
1834                }
1835        }
1836        if (lines != 0) {
1837                /* Print some data about the neighboring objects, if they
1838                 * exist:
1839                 */
1840                struct slab *slabp = virt_to_slab(objp);
1841                unsigned int objnr;
1842
1843                objnr = obj_to_index(cachep, slabp, objp);
1844                if (objnr) {
1845                        objp = index_to_obj(cachep, slabp, objnr - 1);
1846                        realobj = (char *)objp + obj_offset(cachep);
1847                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1848                               realobj, size);
1849                        print_objinfo(cachep, objp, 2);
1850                }
1851                if (objnr + 1 < cachep->num) {
1852                        objp = index_to_obj(cachep, slabp, objnr + 1);
1853                        realobj = (char *)objp + obj_offset(cachep);
1854                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1855                               realobj, size);
1856                        print_objinfo(cachep, objp, 2);
1857                }
1858        }
1859}
1860#endif
1861
#if DEBUG
/**
 * slab_destroy_objs - run the debug checks on all objects of a dying slab
 * @cachep: cache pointer being destroyed
 * @slabp: slab pointer being destroyed
 *
 * For SLAB_POISON caches each object's poison pattern is verified; with
 * CONFIG_DEBUG_PAGEALLOC, objects of off-slab caches whose buffer size is
 * a multiple of the page size get their pages mapped again instead.
 * For SLAB_RED_ZONE caches both red zone words must read RED_INACTIVE,
 * otherwise a slab error is reported.
 */
static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
{
	int idx;

	for (idx = 0; idx < cachep->num; idx++) {
		void *obj = index_to_obj(cachep, slabp, idx);

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if (OFF_SLAB(cachep) &&
			    cachep->buffer_size % PAGE_SIZE == 0) {
				kernel_map_pages(virt_to_page(obj),
					cachep->buffer_size / PAGE_SIZE, 1);
			} else {
				check_poison_obj(cachep, obj);
			}
#else
			check_poison_obj(cachep, obj);
#endif
		}

		if (!(cachep->flags & SLAB_RED_ZONE))
			continue;

		if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
			slab_error(cachep,
				   "start of a freed object was overwritten");
		if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
			slab_error(cachep,
				   "end of a freed object was overwritten");
	}
}
#else
/* Without DEBUG there is nothing to check. */
static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif
1904
1905/**
1906 * slab_destroy - destroy and release all objects in a slab
1907 * @cachep: cache pointer being destroyed
1908 * @slabp: slab pointer being destroyed
1909 *
1910 * Destroy all the objs in a slab, and release the mem back to the system.
1911 * Before calling the slab must have been unlinked from the cache.  The
1912 * cache-lock is not held/needed.
1913 */
1914static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1915{
1916        void *addr = slabp->s_mem - slabp->colouroff;
1917
1918        slab_destroy_objs(cachep, slabp);
1919        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1920                struct slab_rcu *slab_rcu;
1921
1922                slab_rcu = (struct slab_rcu *)slabp;
1923                slab_rcu->cachep = cachep;
1924                slab_rcu->addr = addr;
1925                call_rcu(&slab_rcu->head, kmem_rcu_free);
1926        } else {
1927                kmem_freepages(cachep, addr);
1928                if (OFF_SLAB(cachep))
1929                        kmem_cache_free(cachep->slabp_cache, slabp);
1930        }
1931}
1932
1933/*
1934 * For setting up all the kmem_list3s for cache whose buffer_size is same as
1935 * size of kmem_list3.
1936 */
1937static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1938{
1939        int node;
1940
1941        for_each_online_node(node) {
1942                cachep->nodelists[node] = &initkmem_list3[index + node];
1943                cachep->nodelists[node]->next_reap = jiffies +
1944                    REAPTIMEOUT_LIST3 +
1945                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1946        }
1947}
1948
1949static void __kmem_cache_destroy(struct kmem_cache *cachep)
1950{
1951        int i;
1952        struct kmem_list3 *l3;
1953
1954        for_each_online_cpu(i)
1955            kfree(cachep->array[i]);
1956
1957        /* NUMA: free the list3 structures */
1958        for_each_online_node(i) {
1959                l3 = cachep->nodelists[i];
1960                if (l3) {
1961                        kfree(l3->shared);
1962                        free_alien_cache(l3->alien);
1963                        kfree(l3);
1964                }
1965        }
1966        kmem_cache_free(&cache_cache, cachep);
1967}
1968
1969
1970/**
1971 * calculate_slab_order - calculate size (page order) of slabs
1972 * @cachep: pointer to the cache that is being created
1973 * @size: size of objects to be created in this cache.
1974 * @align: required alignment for the objects.
1975 * @flags: slab allocation flags
1976 *
1977 * Also calculates the number of objects per slab.
1978 *
1979 * This could be made much more intelligent.  For now, try to avoid using
1980 * high order pages for slabs.  When the gfp() functions are more friendly
1981 * towards high-order requests, this should be changed.
1982 */
1983static size_t calculate_slab_order(struct kmem_cache *cachep,
1984                        size_t size, size_t align, unsigned long flags)
1985{
1986        unsigned long offslab_limit;
1987        size_t left_over = 0;
1988        int gfporder;
1989
1990        for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1991                unsigned int num;
1992                size_t remainder;
1993
1994                cache_estimate(gfporder, size, align, flags, &remainder, &num);
1995                if (!num)
1996                        continue;
1997
1998                if (flags & CFLGS_OFF_SLAB) {
1999                        /*
2000                         * Max number of objs-per-slab for caches which
2001                         * use off-slab slabs. Needed to avoid a possible
2002                         * looping condition in cache_grow().
2003                         */
2004                        offslab_limit = size - sizeof(struct slab);
2005                        offslab_limit /= sizeof(kmem_bufctl_t);
2006
2007                        if (num > offslab_limit)
2008                                break;
2009                }
2010
2011                /* Found something acceptable - save it away */
2012                cachep->num = num;
2013                cachep->gfporder = gfporder;
2014                left_over = remainder;
2015
2016                /*
2017                 * A VFS-reclaimable slab tends to have most allocations
2018                 * as GFP_NOFS and we really don't want to have to be allocating
2019                 * higher-order pages when we are unable to shrink dcache.
2020                 */
2021                if (flags & SLAB_RECLAIM_ACCOUNT)
2022                        break;
2023
2024                /*
2025                 * Large number of objects is good, but very large slabs are
2026                 * currently bad for the gfp()s.
2027                 */
2028                if (gfporder >= slab_break_gfp_order)
2029                        break;
2030
2031                /*
2032                 * Acceptable internal fragmentation?
2033                 */
2034                if (left_over * 8 <= (PAGE_SIZE << gfporder))
2035                        break;
2036        }
2037        return left_over;
2038}
2039
2040static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2041{
2042        if (g_cpucache_up == FULL)
2043                return enable_cpucache(cachep);
2044
2045        if (g_cpucache_up == NONE) {
2046                /*
2047                 * Note: the first kmem_cache_create must create the cache
2048                 * that's used by kmalloc(24), otherwise the creation of
2049                 * further caches will BUG().
2050                 */
2051                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2052
2053                /*
2054                 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2055                 * the first cache, then we need to set up all its list3s,
2056                 * otherwise the creation of further caches will BUG().
2057                 */
2058                set_up_list3s(cachep, SIZE_AC);
2059                if (INDEX_AC == INDEX_L3)
2060                        g_cpucache_up = PARTIAL_L3;
2061                else
2062                        g_cpucache_up = PARTIAL_AC;
2063        } else {
2064                cachep->array[smp_processor_id()] =
2065                        kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2066
2067                if (g_cpucache_up == PARTIAL_AC) {
2068                        set_up_list3s(cachep, SIZE_L3);
2069                        g_cpucache_up = PARTIAL_L3;
2070                } else {
2071                        int node;
2072                        for_each_online_node(node) {
2073                                cachep->nodelists[node] =
2074                                    kmalloc_node(sizeof(struct kmem_list3),
2075                                                GFP_KERNEL, node);
2076                                BUG_ON(!cachep->nodelists[node]);
2077                                kmem_list3_init(cachep->nodelists[node]);
2078                        }
2079                }
2080        }
2081        cachep->nodelists[numa_node_id()]->next_reap =
2082                        jiffies + REAPTIMEOUT_LIST3 +
2083                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2084
2085        cpu_cache_get(cachep)->avail = 0;
2086        cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2087        cpu_cache_get(cachep)->batchcount = 1;
2088        cpu_cache_get(cachep)->touched = 0;
2089        cachep->batchcount = 1;
2090        cachep->limit = BOOT_CPUCACHE_ENTRIES;
2091        return 0;
2092}
2093
2094/**
2095 * kmem_cache_create - Create a cache.
2096 * @name: A string which is used in /proc/slabinfo to identify this cache.
2097 * @size: The size of objects to be created in this cache.
2098 * @align: The required alignment for the objects.
2099 * @flags: SLAB flags
2100 * @ctor: A constructor for the objects.
2101 * @dtor: A destructor for the objects (not implemented anymore).
2102 *
2103 * Returns a ptr to the cache on success, NULL on failure.
2104 * Cannot be called within a int, but can be interrupted.
2105 * The @ctor is run when new pages are allocated by the cache
2106 * and the @dtor is run before the pages are handed back.
2107 *
2108 * @name must be valid until the cache is destroyed. This implies that
2109 * the module calling this has to destroy the cache before getting unloaded.
2110 *
2111 * The flags are
2112 *
2113 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2114 * to catch references to uninitialised memory.
2115 *
2116 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2117 * for buffer overruns.
2118 *
2119 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2120 * cacheline.  This can be beneficial if you're counting cycles as closely
2121 * as davem.
2122 */
2123struct kmem_cache *
2124kmem_cache_create (const char *name, size_t size, size_t align,
2125        unsigned long flags,
2126        void (*ctor)(void*, struct kmem_cache *, unsigned long),
2127        void (*dtor)(void*, struct kmem_cache *, unsigned long))
2128{
2129        size_t left_over, slab_size, ralign;
2130        struct kmem_cache *cachep = NULL, *pc;
2131
2132        /*
2133         * Sanity checks... these are all serious usage bugs.
2134         */
2135        if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2136            size > KMALLOC_MAX_SIZE || dtor) {
2137                printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2138                                name);
2139                BUG();
2140        }
2141
2142        /*
2143         * We use cache_chain_mutex to ensure a consistent view of
2144         * cpu_online_map as well.  Please see cpuup_callback
2145         */
2146        mutex_lock(&cache_chain_mutex);
2147
2148        list_for_each_entry(pc, &cache_chain, next) {
2149                char tmp;
2150                int res;
2151
2152                /*
2153                 * This happens when the module gets unloaded and doesn't
2154                 * destroy its slab cache and no-one else reuses the vmalloc
2155                 * area of the module.  Print a warning.
2156                 */
2157                res = probe_kernel_address(pc->name, tmp);
2158                if (res) {
2159                        printk(KERN_ERR
2160                               "SLAB: cache with size %d has lost its name\n",
2161                               pc->buffer_size);
2162                        continue;
2163                }
2164
2165                if (!strcmp(pc->name, name)) {
2166                        printk(KERN_ERR
2167                               "kmem_cache_create: duplicate cache %s\n", name);
2168                        dump_stack();
2169                        goto oops;
2170                }
2171        }
2172
2173#if DEBUG
2174        WARN_ON(strchr(name, ' '));     /* It confuses parsers */
2175#if FORCED_DEBUG
2176        /*
2177         * Enable redzoning and last user accounting, except for caches with
2178         * large objects, if the increased size would increase the object size
2179         * above the next power of two: caches with object sizes just above a
2180         * power of two have a significant amount of internal fragmentation.
2181         */
2182        if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2183                                                2 * sizeof(unsigned long long)))
2184                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2185        if (!(flags & SLAB_DESTROY_BY_RCU))
2186                flags |= SLAB_POISON;
2187#endif
2188        if (flags & SLAB_DESTROY_BY_RCU)
2189                BUG_ON(flags & SLAB_POISON);
2190#endif
2191        /*
2192         * Always checks flags, a caller might be expecting debug support which
2193         * isn't available.
2194         */
2195        BUG_ON(flags & ~CREATE_MASK);
2196
2197        /*
2198         * Check that size is in terms of words.  This is needed to avoid
2199         * unaligned accesses for some archs when redzoning is used, and makes
2200         * sure any on-slab bufctl's are also correctly aligned.
2201         */
2202        if (size & (BYTES_PER_WORD - 1)) {
2203                size += (BYTES_PER_WORD - 1);
2204                size &= ~(BYTES_PER_WORD - 1);
2205        }
2206
2207        /* calculate the final buffer alignment: */
2208
2209        /* 1) arch recommendation: can be overridden for debug */
2210        if (flags & SLAB_HWCACHE_ALIGN) {
2211                /*
2212                 * Default alignment: as specified by the arch code.  Except if
2213                 * an object is really small, then squeeze multiple objects into
2214                 * one cacheline.
2215                 */
2216                ralign = cache_line_size();
2217                while (size <= ralign / 2)
2218                        ralign /= 2;
2219        } else {
2220                ralign = BYTES_PER_WORD;
2221        }
2222
2223        /*
2224         * Redzoning and user store require word alignment or possibly larger.
2225         * Note this will be overridden by architecture or caller mandated
2226         * alignment if either is greater than BYTES_PER_WORD.
2227         */
2228        if (flags & SLAB_STORE_USER)
2229                ralign = BYTES_PER_WORD;
2230
2231        if (flags & SLAB_RED_ZONE) {
2232                ralign = REDZONE_ALIGN;
2233                /* If redzoning, ensure that the second redzone is suitably
2234                 * aligned, by adjusting the object size accordingly. */
2235                size += REDZONE_ALIGN - 1;
2236                size &= ~(REDZONE_ALIGN - 1);
2237        }
2238
2239        /* 2) arch mandated alignment */
2240        if (ralign < ARCH_SLAB_MINALIGN) {
2241                ralign = ARCH_SLAB_MINALIGN;
2242        }
2243        /* 3) caller mandated alignment */
2244        if (ralign < align) {
2245                ralign = align;
2246        }
2247        /* disable debug if necessary */
2248        if (ralign > __alignof__(unsigned long long))
2249                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2250        /*
2251         * 4) Store it.
2252         */
2253        align = ralign;
2254
2255        /* Get cache's description obj. */
2256        cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2257        if (!cachep)
2258                goto oops;
2259
2260#if DEBUG
2261        cachep->obj_size = size;
2262
2263        /*
2264         * Both debugging options require word-alignment which is calculated
2265         * into align above.
2266         */
2267        if (flags & SLAB_RED_ZONE) {
2268                /* add space for red zone words */
2269                cachep->obj_offset += sizeof(unsigned long long);
2270                size += 2 * sizeof(unsigned long long);
2271        }
2272        if (flags & SLAB_STORE_USER) {
2273                /* user store requires one word storage behind the end of
2274                 * the real object. But if the second red zone needs to be
2275                 * aligned to 64 bits, we must allow that much space.
2276                 */
2277                if (flags & SLAB_RED_ZONE)
2278                        size += REDZONE_ALIGN;
2279                else
2280                        size += BYTES_PER_WORD;
2281        }
2282#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2283        if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2284            && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2285                cachep->obj_offset += PAGE_SIZE - size;
2286                size = PAGE_SIZE;
2287        }
2288#endif
2289#endif
2290
2291        /*
2292         * Determine if the slab management is 'on' or 'off' slab.
2293         * (bootstrapping cannot cope with offslab caches so don't do
2294         * it too early on.)
2295         */
2296        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2297                /*
2298                 * Size is large, assume best to place the slab management obj
2299                 * off-slab (should allow better packing of objs).
2300                 */
2301                flags |= CFLGS_OFF_SLAB;
2302
2303        size = ALIGN(size, align);
2304
2305        left_over = calculate_slab_order(cachep, size, align, flags);
2306
2307        if (!cachep->num) {
2308                printk(KERN_ERR
2309                       "kmem_cache_create: couldn't create cache %s.\n", name);
2310                kmem_cache_free(&cache_cache, cachep);
2311                cachep = NULL;
2312                goto oops;
2313        }
2314        slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2315                          + sizeof(struct slab), align);
2316
2317        /*
2318         * If the slab has been placed off-slab, and we have enough space then
2319         * move it on-slab. This is at the expense of any extra colouring.
2320         */
2321        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2322                flags &= ~CFLGS_OFF_SLAB;
2323                left_over -= slab_size;
2324        }
2325
2326        if (flags & CFLGS_OFF_SLAB) {
2327                /* really off slab. No need for manual alignment */
2328                slab_size =
2329                    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2330        }
2331
2332        cachep->colour_off = cache_line_size();
2333        /* Offset must be a multiple of the alignment. */
2334        if (cachep->colour_off < align)
2335                cachep->colour_off = align;
2336        cachep->colour = left_over / cachep->colour_off;
2337        cachep->slab_size = slab_size;
2338        cachep->flags = flags;
2339        cachep->gfpflags = 0;
2340        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2341                cachep->gfpflags |= GFP_DMA;
2342        cachep->buffer_size = size;
2343        cachep->reciprocal_buffer_size = reciprocal_value(size);
2344
2345        if (flags & CFLGS_OFF_SLAB) {
2346                cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2347                /*
2348                 * This is a possibility for one of the malloc_sizes caches.
2349                 * But since we go off slab only for object size greater than
2350                 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2351                 * this should not happen at all.
2352                 * But leave a BUG_ON for some lucky dude.
2353                 */
2354                BUG_ON(!cachep->slabp_cache);
2355        }
2356        cachep->ctor = ctor;
2357        cachep->name = name;
2358
2359        if (setup_cpu_cache(cachep)) {
2360                __kmem_cache_destroy(cachep);
2361                cachep = NULL;
2362                goto oops;
2363        }
2364
2365        /* cache setup completed, link it into the list */
2366        list_add(&cachep->next, &cache_chain);
2367oops:
2368        if (!cachep && (flags & SLAB_PANIC))
2369                panic("kmem_cache_create(): failed to create slab `%s'\n",
2370                      name);
2371        mutex_unlock(&cache_chain_mutex);
2372        return cachep;
2373}
2374EXPORT_SYMBOL(kmem_cache_create);
2375
#if DEBUG
/* BUG unless interrupts are disabled. */
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

/* BUG unless interrupts are enabled. */
static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

/*
 * On SMP: interrupts must be off and the list_lock of @cachep's list3 for
 * @node must be held.  Checks nothing on UP.
 */
static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[node]->list_lock);
#endif
}

/* Same check as above, for the list3 of the node we are running on. */
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
	check_spinlock_acquired_node(cachep, numa_node_id());
#endif
}

#else
/* Without DEBUG all of the checks compile away. */
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
2409
/* Forward declaration: drain_cpu_caches() below needs it. */
static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
			struct array_cache *ac, int force, int node);
2413
2414static void do_drain(void *arg)
2415{
2416        struct kmem_cache *cachep = arg;
2417        struct array_cache *ac;
2418        int node = numa_node_id();
2419
2420        check_irq_off();
2421        ac = cpu_cache_get(cachep);
2422        spin_lock(&cachep->nodelists[node]->list_lock);
2423        free_block(cachep, ac->entry, ac->avail, node);
2424        spin_unlock(&cachep->nodelists[node]->list_lock);
2425        ac->avail = 0;
2426}
2427
2428static void drain_cpu_caches(struct kmem_cache *cachep)
2429{
2430        struct kmem_list3 *l3;
2431        int node;
2432
2433        on_each_cpu(do_drain, cachep, 1, 1);
2434        check_irq_on();
2435        for_each_online_node(node) {
2436                l3 = cachep->nodelists[node];
2437                if (l3 && l3->alien)
2438                        drain_alien_cache(cachep, l3->alien);
2439        }
2440
2441        for_each_online_node(node) {
2442                l3 = cachep->nodelists[node];
2443                if (l3)
2444                        drain_array(cachep, l3, l3->shared, 1, node);
2445        }
2446}
2447
2448/*
2449 * Remove slabs from the list of free slabs.
2450 * Specify the number of slabs to drain in tofree.
2451 *
2452 * Returns the actual number of slabs released.
2453 */
2454static int drain_freelist(struct kmem_cache *cache,
2455                        struct kmem_list3 *l3, int tofree)
2456{
2457        struct list_head *p;
2458        int nr_freed;
2459        struct slab *slabp;
2460
2461        nr_freed = 0;
2462        while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2463
2464                spin_lock_irq(&l3->list_lock);
2465                p = l3->slabs_free.prev;
2466                if (p == &l3->slabs_free) {
2467                        spin_unlock_irq(&l3->list_lock);
2468                        goto out;
2469                }
2470
2471                slabp = list_entry(p, struct slab, list);
2472#if DEBUG
2473                BUG_ON(slabp->inuse);
2474#endif
2475                list_del(&slabp->list);
2476                /*
2477                 * Safe to drop the lock. The slab is no longer linked
2478                 * to the cache.
2479                 */
2480                l3->free_objects -= cache->num;
2481                spin_unlock_irq(&l3->list_lock);
2482                slab_destroy(cache, slabp);
2483                nr_freed++;
2484        }
2485out:
2486        return nr_freed;
2487}
2488
2489/* Called with cache_chain_mutex held to protect against cpu hotplug */
2490static int __cache_shrink(struct kmem_cache *cachep)
2491{
2492        int ret = 0, i = 0;
2493        struct kmem_list3 *l3;
2494
2495        drain_cpu_caches(cachep);
2496
2497        check_irq_on();
2498        for_each_online_node(i) {
2499                l3 = cachep->nodelists[i];
2500                if (!l3)
2501                        continue;
2502
2503                drain_freelist(cachep, l3, l3->free_objects);
2504
2505                ret += !list_empty(&l3->slabs_full) ||
2506                        !list_empty(&l3->slabs_partial);
2507        }
2508        return (ret ? 1 : 0);
2509}
2510
2511/**
2512 * kmem_cache_shrink - Shrink a cache.
2513 * @cachep: The cache to shrink.
2514 *
2515 * Releases as many slabs as possible for a cache.
2516 * To help debugging, a zero exit status indicates all slabs were released.
2517 */
2518int kmem_cache_shrink(struct kmem_cache *cachep)
2519{
2520        int ret;
2521        BUG_ON(!cachep || in_interrupt());
2522
2523        mutex_lock(&cache_chain_mutex);
2524        ret = __cache_shrink(cachep);
2525        mutex_unlock(&cache_chain_mutex);
2526        return ret;
2527}
2528EXPORT_SYMBOL(kmem_cache_shrink);
2529
2530/**
2531 * kmem_cache_destroy - delete a cache
2532 * @cachep: the cache to destroy
2533 *
2534 * Remove a &struct kmem_cache object from the slab cache.
2535 *
2536 * It is expected this function will be called by a module when it is
2537 * unloaded.  This will remove the cache completely, and avoid a duplicate
2538 * cache being allocated each time a module is loaded and unloaded, if the
2539 * module doesn't have persistent in-kernel storage across loads and unloads.
2540 *
2541 * The cache must be empty before calling this function.
2542 *
2543 * The caller must guarantee that noone will allocate memory from the cache
2544 * during the kmem_cache_destroy().
2545 */
2546void kmem_cache_destroy(struct kmem_cache *cachep)
2547{
2548        BUG_ON(!cachep || in_interrupt());
2549
2550        /* Find the cache in the chain of caches. */
2551        mutex_lock(&cache_chain_mutex);
2552        /*
2553         * the chain is never empty, cache_cache is never destroyed
2554         */
2555        list_del(&cachep->next);
2556        if (__cache_shrink(cachep)) {
2557                slab_error(cachep, "Can't free all objects");
2558                list_add(&cachep->next, &cache_chain);
2559                mutex_unlock(&cache_chain_mutex);
2560                return;
2561        }
2562
2563        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2564                synchronize_rcu();
2565
2566        __kmem_cache_destroy(cachep);
2567        mutex_unlock(&cache_chain_mutex);
2568}
2569EXPORT_SYMBOL(kmem_cache_destroy);
2570
2571/*
2572 * Get the memory for a slab management obj.
2573 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2574 * always come from malloc_sizes caches.  The slab descriptor cannot
2575 * come from the same cache which is getting created because,
2576 * when we are searching for an appropriate cache for these
2577 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2578 * If we are creating a malloc_sizes cache here it would not be visible to
2579 * kmem_find_general_cachep till the initialization is complete.
2580 * Hence we cannot have slabp_cache same as the original cache.
2581 */
2582static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2583                                   int colour_off, gfp_t local_flags,
2584                                   int nodeid)
2585{
2586        struct slab *slabp;
2587
2588        if (OFF_SLAB(cachep)) {
2589                /* Slab management obj is off-slab. */
2590                slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2591                                              local_flags & ~GFP_THISNODE, nodeid);
2592                if (!slabp)
2593                        return NULL;
2594        } else {
2595                slabp = objp + colour_off;
2596                colour_off += cachep->slab_size;
2597        }
2598        slabp->inuse = 0;
2599        slabp->colouroff = colour_off;
2600        slabp->s_mem = objp + colour_off;
2601        slabp->nodeid = nodeid;
2602        return slabp;
2603}
2604
2605static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2606{
2607        return (kmem_bufctl_t *) (slabp + 1);
2608}
2609
2610static void cache_init_objs(struct kmem_cache *cachep,
2611                            struct slab *slabp)
2612{
2613        int i;
2614
2615        for (i = 0; i < cachep->num; i++) {
2616                void *objp = index_to_obj(cachep, slabp, i);
2617#if DEBUG
2618                /* need to poison the objs? */
2619                if (cachep->flags & SLAB_POISON)
2620                        poison_obj(cachep, objp, POISON_FREE);
2621                if (cachep->flags & SLAB_STORE_USER)
2622                        *dbg_userword(cachep, objp) = NULL;
2623
2624                if (cachep->flags & SLAB_RED_ZONE) {
2625                        *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2626                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2627                }
2628                /*
2629                 * Constructors are not allowed to allocate memory from the same
2630                 * cache which they are a constructor for.  Otherwise, deadlock.
2631                 * They must also be threaded.
2632                 */
2633                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2634                        cachep->ctor(objp + obj_offset(cachep), cachep,
2635                                     0);
2636
2637                if (cachep->flags & SLAB_RED_ZONE) {
2638                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2639                                slab_error(cachep, "constructor overwrote the"
2640                                           " end of an object");
2641                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2642                                slab_error(cachep, "constructor overwrote the"
2643                                           " start of an object");
2644                }
2645                if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2646                            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2647                        kernel_map_pages(virt_to_page(objp),
2648                                         cachep->buffer_size / PAGE_SIZE, 0);
2649#else
2650                if (cachep->ctor)
2651                        cachep->ctor(objp, cachep, 0);
2652#endif
2653                slab_bufctl(slabp)[i] = i + 1;
2654        }
2655        slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2656        slabp->free = 0;
2657}
2658
2659static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2660{
2661        if (CONFIG_ZONE_DMA_FLAG) {
2662                if (flags & GFP_DMA)
2663                        BUG_ON(!(cachep->gfpflags & GFP_DMA));
2664                else
2665                        BUG_ON(cachep->gfpflags & GFP_DMA);
2666        }
2667}
2668
2669static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2670                                int nodeid)
2671{
2672        void *objp = index_to_obj(cachep, slabp, slabp->free);
2673        kmem_bufctl_t next;
2674
2675        slabp->inuse++;
2676        next = slab_bufctl(slabp)[slabp->free];
2677#if DEBUG
2678        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2679        WARN_ON(slabp->nodeid != nodeid);
2680#endif
2681        slabp->free = next;
2682
2683        return objp;
2684}
2685
2686static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2687                                void *objp, int nodeid)
2688{
2689        unsigned int objnr = obj_to_index(cachep, slabp, objp);
2690
2691#if DEBUG
2692        /* Verify that the slab belongs to the intended node */
2693        WARN_ON(slabp->nodeid != nodeid);
2694
2695        if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2696                printk(KERN_ERR "slab: double free detected in cache "
2697                                "'%s', objp %p\n", cachep->name, objp);
2698                BUG();
2699        }
2700#endif
2701        slab_bufctl(slabp)[objnr] = slabp->free;
2702        slabp->free = objnr;
2703        slabp->inuse--;
2704}
2705
2706/*
2707 * Map pages beginning at addr to the given cache and slab. This is required
2708 * for the slab allocator to be able to lookup the cache and slab of a
2709 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2710 */
2711static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2712                           void *addr)
2713{
2714        int nr_pages;
2715        struct page *page;
2716
2717        page = virt_to_page(addr);
2718
2719        nr_pages = 1;
2720        if (likely(!PageCompound(page)))
2721                nr_pages <<= cache->gfporder;
2722
2723        do {
2724                page_set_cache(page, cache);
2725                page_set_slab(page, slab);
2726                page++;
2727        } while (--nr_pages);
2728}
2729
2730/*
2731 * Grow (by 1) the number of slabs within a cache.  This is called by
2732 * kmem_cache_alloc() when there are no active objs left in a cache.
2733 */
2734static int cache_grow(struct kmem_cache *cachep,
2735                gfp_t flags, int nodeid, void *objp)
2736{
2737        struct slab *slabp;
2738        size_t offset;
2739        gfp_t local_flags;
2740        struct kmem_list3 *l3;
2741
2742        /*
2743         * Be lazy and only check for valid flags here,  keeping it out of the
2744         * critical path in kmem_cache_alloc().
2745         */
2746        BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
2747
2748        local_flags = (flags & GFP_LEVEL_MASK);
2749        /* Take the l3 list lock to change the colour_next on this node */
2750        check_irq_off();
2751        l3 = cachep->nodelists[nodeid];
2752        spin_lock(&l3->list_lock);
2753
2754        /* Get colour for the slab, and cal the next value. */
2755        offset = l3->colour_next;
2756        l3->colour_next++;
2757        if (l3->colour_next >= cachep->colour)
2758                l3->colour_next = 0;
2759        spin_unlock(&l3->list_lock);
2760
2761        offset *= cachep->colour_off;
2762
2763        if (local_flags & __GFP_WAIT)
2764                local_irq_enable();
2765
2766        /*
2767         * The test for missing atomic flag is performed here, rather than
2768         * the more obvious place, simply to reduce the critical path length
2769         * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2770         * will eventually be caught here (where it matters).
2771         */
2772        kmem_flagcheck(cachep, flags);
2773
2774        /*
2775         * Get mem for the objs.  Attempt to allocate a physical page from
2776         * 'nodeid'.
2777         */
2778        if (!objp)
2779                objp = kmem_getpages(cachep, flags, nodeid);
2780        if (!objp)
2781                goto failed;
2782
2783        /* Get slab management. */
2784        slabp = alloc_slabmgmt(cachep, objp, offset,
2785                        local_flags & ~GFP_THISNODE, nodeid);
2786        if (!slabp)
2787                goto opps1;
2788
2789        slabp->nodeid = nodeid;
2790        slab_map_pages(cachep, slabp, objp);
2791
2792        cache_init_objs(cachep, slabp);
2793
2794        if (local_flags & __GFP_WAIT)
2795                local_irq_disable();
2796        check_irq_off();
2797        spin_lock(&l3->list_lock);
2798
2799        /* Make slab active. */
2800        list_add_tail(&slabp->list, &(l3->slabs_free));
2801        STATS_INC_GROWN(cachep);
2802        l3->free_objects += cachep->num;
2803        spin_unlock(&l3->list_lock);
2804        return 1;
2805opps1:
2806        kmem_freepages(cachep, objp);
2807failed:
2808        if (local_flags & __GFP_WAIT)
2809                local_irq_disable();
2810        return 0;
2811}
2812
2813#if DEBUG
2814
2815/*
2816 * Perform extra freeing checks:
2817 * - detect bad pointers.
2818 * - POISON/RED_ZONE checking
2819 */
2820static void kfree_debugcheck(const void *objp)
2821{
2822        if (!virt_addr_valid(objp)) {
2823                printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2824                       (unsigned long)objp);
2825                BUG();
2826        }
2827}
2828
2829static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2830{
2831        unsigned long long redzone1, redzone2;
2832
2833        redzone1 = *dbg_redzone1(cache, obj);
2834        redzone2 = *dbg_redzone2(cache, obj);
2835
2836        /*
2837         * Redzone is ok.
2838         */
2839        if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2840                return;
2841
2842        if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2843                slab_error(cache, "double free detected");
2844        else
2845                slab_error(cache, "memory outside object was overwritten");
2846
2847        printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2848                        obj, redzone1, redzone2);
2849}
2850
2851static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2852                                   void *caller)
2853{
2854        struct page *page;
2855        unsigned int objnr;
2856        struct slab *slabp;
2857
2858        objp -= obj_offset(cachep);
2859        kfree_debugcheck(objp);
2860        page = virt_to_head_page(objp);
2861
2862        slabp = page_get_slab(page);
2863
2864        if (cachep->flags & SLAB_RED_ZONE) {
2865                verify_redzone_free(cachep, objp);
2866                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2867                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2868        }
2869        if (cachep->flags & SLAB_STORE_USER)
2870                *dbg_userword(cachep, objp) = caller;
2871
2872        objnr = obj_to_index(cachep, slabp, objp);
2873
2874        BUG_ON(objnr >= cachep->num);
2875        BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2876
2877#ifdef CONFIG_DEBUG_SLAB_LEAK
2878        slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2879#endif
2880        if (cachep->flags & SLAB_POISON) {
2881#ifdef CONFIG_DEBUG_PAGEALLOC
2882                if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2883                        store_stackinfo(cachep, objp, (unsigned long)caller);
2884                        kernel_map_pages(virt_to_page(objp),
2885                                         cachep->buffer_size / PAGE_SIZE, 0);
2886                } else {
2887                        poison_obj(cachep, objp, POISON_FREE);
2888                }
2889#else
2890                poison_obj(cachep, objp, POISON_FREE);
2891#endif
2892        }
2893        return objp;
2894}
2895
2896static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2897{
2898        kmem_bufctl_t i;
2899        int entries = 0;
2900
2901        /* Check slab's freelist to see if this obj is there. */
2902        for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2903                entries++;
2904                if (entries > cachep->num || i >= cachep->num)
2905                        goto bad;
2906        }
2907        if (entries != cachep->num - slabp->inuse) {
2908bad:
2909                printk(KERN_ERR "slab: Internal list corruption detected in "
2910                                "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2911                        cachep->name, cachep->num, slabp, slabp->inuse);
2912                for (i = 0;
2913                     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2914                     i++) {
2915                        if (i % 16 == 0)
2916                                printk("\n%03x:", i);
2917                        printk(" %02x", ((unsigned char *)slabp)[i]);
2918                }
2919                printk("\n");
2920                BUG();
2921        }
2922}
2923#else
/*
 * Without DEBUG the free-time checks do nothing;
 * cache_free_debugcheck() just yields the object pointer unchanged.
 */
#define kfree_debugcheck(x)			do { } while (0)
#define cache_free_debugcheck(x, objp, z)	(objp)
#define check_slabp(x, y)			do { } while (0)
2927#endif
2928
2929static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2930{
2931        int batchcount;
2932        struct kmem_list3 *l3;
2933        struct array_cache *ac;
2934        int node;
2935
2936        node = numa_node_id();
2937
2938        check_irq_off();
2939        ac = cpu_cache_get(cachep);
2940retry:
2941        batchcount = ac->batchcount;
2942        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2943                /*
2944                 * If there was little recent activity on this cache, then
2945                 * perform only a partial refill.  Otherwise we could generate
2946                 * refill bouncing.
2947                 */
2948                batchcount = BATCHREFILL_LIMIT;
2949        }
2950        l3 = cachep->nodelists[node];
2951
2952        BUG_ON(ac->avail > 0 || !l3);
2953        spin_lock(&l3->list_lock);
2954
2955        /* See if we can refill from the shared array */
2956        if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2957                goto alloc_done;
2958
2959        while (batchcount > 0) {
2960                struct list_head *entry;
2961                struct slab *slabp;
2962                /* Get slab alloc is to come from. */
2963                entry = l3->slabs_partial.next;
2964                if (entry == &l3->slabs_partial) {
2965                        l3->free_touched = 1;
2966                        entry = l3->slabs_free.next;
2967                        if (entry == &l3->slabs_free)
2968                                goto must_grow;
2969                }
2970
2971                slabp = list_entry(entry, struct slab, list);
2972                check_slabp(cachep, slabp);
2973                check_spinlock_acquired(cachep);
2974
2975                /*
2976                 * The slab was either on partial or free list so
2977                 * there must be at least one object available for
2978                 * allocation.
2979                 */
2980                BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
2981
2982                while (slabp->inuse < cachep->num && batchcount--) {
2983                        STATS_INC_ALLOCED(cachep);
2984                        STATS_INC_ACTIVE(cachep);
2985                        STATS_SET_HIGH(cachep);
2986
2987                        ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2988                                                            node);
2989                }
2990                check_slabp(cachep, slabp);
2991
2992                /* move slabp to correct slabp list: */
2993                list_del(&slabp->list);
2994                if (slabp->free == BUFCTL_END)
2995                        list_add(&slabp->list, &l3->slabs_full);
2996                else
2997                        list_add(&slabp->list, &l3->slabs_partial);
2998        }
2999
3000must_grow:
3001        l3->free_objects -= ac->avail;
3002alloc_done:
3003        spin_unlock(&l3->list_lock);
3004
3005        if (unlikely(!ac->avail)) {
3006                int x;
3007                x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3008
3009                /* cache_grow can reenable interrupts, then ac could change. */
3010                ac = cpu_cache_get(cachep);
3011                if (!x && ac->avail == 0)       /* no objects in sight? abort */
3012                        return NULL;
3013
3014                if (!ac->avail)         /* objects refilled by interrupt? */
3015                        goto retry;
3016        }
3017        ac->touched = 1;
3018        return ac->entry[--ac->avail];
3019}
3020
3021static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3022                                                gfp_t flags)
3023{
3024        might_sleep_if(flags & __GFP_WAIT);
3025#if DEBUG
3026        kmem_flagcheck(cachep, flags);
3027#endif
3028}
3029
#if DEBUG
/*
 * DEBUG-build processing of an object that was just allocated.
 *
 * @objp points at the start of the object including its debug fields;
 * a NULL @objp is passed through unchanged.  Depending on the cache flags
 * the function checks the free-poison pattern (or maps the object's pages
 * back in) and writes the in-use pattern, records @caller in the user
 * word, and verifies and activates the red zones.  With SLAB_POISON the
 * constructor is run here, on the pointer handed to the user.
 *
 * Returns the pointer to hand to the user, i.e. @objp advanced by
 * obj_offset().
 */
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, void *caller)
{
	if (!objp)
		return objp;
	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
			kernel_map_pages(virt_to_page(objp),
					 cachep->buffer_size / PAGE_SIZE, 1);
		else
			check_poison_obj(cachep, objp);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must still carry inactive red zones. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside"
						" object was overwritten");
			printk(KERN_ERR
				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
				objp, *dbg_redzone1(cachep, objp),
				*dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}
#ifdef CONFIG_DEBUG_SLAB_LEAK
	{
		struct slab *slabp;
		unsigned objnr;

		slabp = page_get_slab(virt_to_head_page(objp));
		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
	}
#endif
	objp += obj_offset(cachep);
	if (cachep->ctor && cachep->flags & SLAB_POISON)
		cachep->ctor(objp, cachep, 0);
#if ARCH_SLAB_MINALIGN
	/*
	 * Test the alignment on the full pointer width (a 32-bit cast would
	 * truncate the pointer on 64-bit builds) and pass the alignment as
	 * an int so that it matches the %d conversion.
	 */
	if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1)) {
		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
		       objp, (int)ARCH_SLAB_MINALIGN);
	}
#endif
	return objp;
}
#else
/* Without DEBUG the object pointer is used as is. */
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
3088
3089#ifdef CONFIG_FAILSLAB
3090
3091static struct failslab_attr {
3092
3093        struct fault_attr attr;
3094
3095        u32 ignore_gfp_wait;
3096#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3097        struct dentry *ignore_gfp_wait_file;
3098#endif
3099
3100} failslab = {
3101        .attr = FAULT_ATTR_INITIALIZER,
3102        .ignore_gfp_wait = 1,
3103};
3104
3105static int __init setup_failslab(char *str)
3106{
3107        return setup_fault_attr(&failslab.attr, str);
3108}
3109__setup("failslab=", setup_failslab);
3110
3111static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3112{
3113        if (cachep == &cache_cache)
3114                return 0;
3115        if (flags & __GFP_NOFAIL)
3116                return 0;
3117        if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3118                return 0;
3119
3120        return should_fail(&failslab.attr, obj_size(cachep));
3121}
3122
3123#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3124
3125static int __init failslab_debugfs(void)
3126{
3127        mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3128        struct dentry *dir;
3129        int err;
3130
3131        err = init_fault_attr_dentries(&failslab.attr, "failslab");
3132        if (err)
3133                return err;
3134        dir = failslab.attr.dentries.dir;
3135
3136        failslab.ignore_gfp_wait_file =
3137                debugfs_create_bool("ignore-gfp-wait", mode, dir,
3138                                      &failslab.ignore_gfp_wait);
3139
3140        if (!failslab.ignore_gfp_wait_file) {
3141                err = -ENOMEM;
3142                debugfs_remove(failslab.ignore_gfp_wait_file);
3143                cleanup_fault_attr_dentries(&failslab.attr);
3144        }
3145
3146        return err;
3147}
3148
3149late_initcall(failslab_debugfs);
3150
3151#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3152
3153#else /* CONFIG_FAILSLAB */
3154
3155static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3156{
3157        return 0;
3158}
3159
3160#endif /* CONFIG_FAILSLAB */
3161
3162static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3163{
3164        void *objp;
3165        struct array_cache *ac;
3166
3167        check_irq_off();
3168
3169        ac = cpu_cache_get(cachep);
3170        if (likely(ac->avail)) {
3171                STATS_INC_ALLOCHIT(cachep);
3172                ac->touched = 1;
3173                objp = ac->entry[--ac->avail];
3174        } else {
3175                STATS_INC_ALLOCMISS(cachep);
3176                objp = cache_alloc_refill(cachep, flags);
3177        }
3178        return objp;
3179}
3180
3181#ifdef CONFIG_NUMA
3182/*
3183 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3184 *
3185 * If we are in_interrupt, then process context, including cpusets and
3186 * mempolicy, may not apply and should not be used for allocation policy.
3187 */
3188static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3189{
3190        int nid_alloc, nid_here;
3191
3192        if (in_interrupt() || (flags & __GFP_THISNODE))
3193                return NULL;
3194        nid_alloc = nid_here = numa_node_id();
3195        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3196                nid_alloc = cpuset_mem_spread_node();
3197        else if (current->mempolicy)
3198                nid_alloc = slab_node(current->mempolicy);
3199        if (nid_alloc != nid_here)
3200                return ____cache_alloc_node(cachep, flags, nid_alloc);
3201        return NULL;
3202}
3203
3204/*
3205 * Fallback function if there was no memory available and no objects on a
3206 * certain node and fall back is permitted. First we scan all the
3207 * available nodelists for available objects. If that fails then we
3208 * perform an allocation without specifying a node. This allows the page
3209 * allocator to do its reclaim / fallback magic. We then insert the
3210 * slab into the proper nodelist and then allocate from it.
3211 */
3212static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3213{
3214        struct zonelist *zonelist;
3215        gfp_t local_flags;
3216        struct zone **z;
3217        void *obj = NULL;
3218        int nid;
3219
3220        if (flags & __GFP_THISNODE)
3221                return NULL;
3222
3223        zonelist = &NODE_DATA(slab_node(current->mempolicy))
3224                        ->node_zonelists[gfp_zone(flags)];
3225        local_flags = (flags & GFP_LEVEL_MASK);
3226
3227retry:
3228        /*
3229         * Look through allowed nodes for objects available
3230         * from existing per node queues.
3231         */
3232        for (z = zonelist->zones; *z && !obj; z++) {
3233                nid = zone_to_nid(*z);
3234
3235                if (cpuset_zone_allowed_hardwall(*z, flags) &&
3236                        cache->nodelists[nid] &&
3237                        cache->nodelists[nid]->free_objects)
3238                                obj = ____cache_alloc_node(cache,
3239                                        flags | GFP_THISNODE, nid);
3240        }
3241
3242        if (!obj) {
3243                /*
3244                 * This allocation will be performed within the constraints
3245                 * of the current cpuset / memory policy requirements.
3246                 * We may trigger various forms of reclaim on the allowed
3247                 * set and go into memory reserves if necessary.
3248                 */
3249                if (local_flags & __GFP_WAIT)
3250                        local_irq_enable();
3251                kmem_flagcheck(cache, flags);
3252                obj = kmem_getpages(cache, flags, -1);
3253                if (local_flags & __GFP_WAIT)
3254                        local_irq_disable();
3255                if (obj) {
3256                        /*
3257                         * Insert into the appropriate per node queues
3258                         */
3259                        nid = page_to_nid(virt_to_page(obj));
3260                        if (cache_grow(cache, flags, nid, obj)) {
3261                                obj = ____cache_alloc_node(cache,
3262                                        flags | GFP_THISNODE, nid);
3263                                if (!obj)
3264                                        /*
3265                                         * Another processor may allocate the
3266                                         * objects in the slab since we are
3267                                         * not holding any locks.
3268                                         */
3269                                        goto retry;
3270                        } else {
3271                                /* cache_grow already freed obj */
3272                                obj = NULL;
3273                        }
3274                }
3275        }
3276        return obj;
3277}
3278
3279/*
3280 * A interface to enable slab creation on nodeid
3281 */
3282static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3283                                int nodeid)
3284{
3285        struct list_head *entry;
3286        struct slab *slabp;
3287        struct kmem_list3 *l3;
3288        void *obj;
3289        int x;
3290
3291        l3 = cachep->nodelists[nodeid];
3292        BUG_ON(!l3);
3293
3294retry:
3295        check_irq_off();
3296        spin_lock(&l3->list_lock);
3297        entry = l3->slabs_partial.next;
3298        if (entry == &l3->slabs_partial) {
3299                l3->free_touched = 1;
3300                entry = l3->slabs_free.next;
3301                if (entry == &l3->slabs_free)
3302                        goto must_grow;
3303        }
3304
3305        slabp = list_entry(entry, struct slab, list);
3306        check_spinlock_acquired_node(cachep, nodeid);
3307        check_slabp(cachep, slabp);
3308
3309        STATS_INC_NODEALLOCS(cachep);
3310        STATS_INC_ACTIVE(cachep);
3311        STATS_SET_HIGH(cachep);
3312
3313        BUG_ON(slabp->inuse == cachep->num);
3314
3315        obj = slab_get_obj(cachep, slabp, nodeid);
3316        check_slabp(cachep, slabp);
3317        l3->free_objects--;
3318        /* move slabp to correct slabp list: */
3319        list_del(&slabp->list);
3320
3321        if (slabp->free == BUFCTL_END)
3322                list_add(&slabp->list, &l3->slabs_full);
3323        else
3324                list_add(&slabp->list, &l3->slabs_partial);
3325
3326        spin_unlock(&l3->list_lock);
3327        goto done;
3328
3329must_grow:
3330        spin_unlock(&l3->list_lock);
3331        x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3332        if (x)
3333                goto retry;
3334
3335        return fallback_alloc(cachep, flags);
3336
3337done:
3338        return obj;
3339}
3340
3341/**
3342 * kmem_cache_alloc_node - Allocate an object on the specified node
3343 * @cachep: The cache to allocate from.
3344 * @flags: See kmalloc().
3345 * @nodeid: node number of the target node.
3346 * @caller: return address of caller, used for debug information
3347 *
3348 * Identical to kmem_cache_alloc but it will allocate memory on the given
3349 * node, which can improve the performance for cpu bound structures.
3350 *
3351 * Fallback to other node is possible if __GFP_THISNODE is not set.
3352 */
3353static __always_inline void *
3354__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3355                   void *caller)
3356{
3357        unsigned long save_flags;
3358        void *ptr;
3359
3360        if (should_failslab(cachep, flags))
3361                return NULL;
3362
3363        cache_alloc_debugcheck_before(cachep, flags);
3364        local_irq_save(save_flags);
3365
3366        if (unlikely(nodeid == -1))
3367                nodeid = numa_node_id();
3368
3369        if (unlikely(!cachep->nodelists[nodeid])) {
3370                /* Node not bootstrapped yet */
3371                ptr = fallback_alloc(cachep, flags);
3372                goto out;
3373        }
3374
3375        if (nodeid == numa_node_id()) {
3376                /*
3377                 * Use the locally cached objects if possible.
3378                 * However ____cache_alloc does not allow fallback
3379                 * to other nodes. It may fail while we still have
3380                 * objects on other nodes available.
3381                 */
3382                ptr = ____cache_alloc(cachep, flags);
3383                if (ptr)
3384                        goto out;
3385        }
3386        /* ___cache_alloc_node can fall back to other nodes */
3387        ptr = ____cache_alloc_node(cachep, flags, nodeid);
3388  out:
3389        local_irq_restore(save_flags);
3390        ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3391
3392        return ptr;
3393}
3394
3395static __always_inline void *
3396__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3397{
3398        void *objp;
3399
3400        if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3401                objp = alternate_node_alloc(cache, flags);
3402                if (objp)
3403                        goto out;
3404        }
3405        objp = ____cache_alloc(cache, flags);
3406
3407        /*
3408         * We may just have run out of memory on the local node.
3409         * ____cache_alloc_node() knows how to locate memory on other nodes
3410         */
3411        if (!objp)
3412                objp = ____cache_alloc_node(cache, flags, numa_node_id());
3413
3414  out:
3415        return objp;
3416}
3417#else
3418
3419static __always_inline void *
3420__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3421{
3422        return ____cache_alloc(cachep, flags);
3423}
3424
3425#endif /* CONFIG_NUMA */
3426
3427static __always_inline void *
3428__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3429{
3430        unsigned long save_flags;
3431        void *objp;
3432
3433        if (should_failslab(cachep, flags))
3434                return NULL;
3435
3436        cache_alloc_debugcheck_before(cachep, flags);
3437        local_irq_save(save_flags);
3438        objp = __do_cache_alloc(cachep, flags);
3439        local_irq_restore(save_flags);
3440        objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3441        prefetchw(objp);
3442
3443        return objp;
3444}
3445
3446/*
3447 * Caller needs to acquire correct kmem_list's list_lock
3448 */
3449static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3450                       int node)
3451{
3452        int i;
3453        struct kmem_list3 *l3;
3454
3455        for (i = 0; i < nr_objects; i++) {
3456                void *objp = objpp[i];
3457                struct slab *slabp;
3458
3459                slabp = virt_to_slab(objp);
3460                l3 = cachep->nodelists[node];
3461                list_del(&slabp->list);
3462                check_spinlock_acquired_node(cachep, node);
3463                check_slabp(cachep, slabp);
3464                slab_put_obj(cachep, slabp, objp, node);
3465                STATS_DEC_ACTIVE(cachep);
3466                l3->free_objects++;
3467                check_slabp(cachep, slabp);
3468
3469                /* fixup slab chains */
3470                if (slabp->inuse == 0) {
3471                        if (l3->free_objects > l3->free_limit) {
3472                                l3->free_objects -= cachep->num;
3473                                /* No need to drop any previously held
3474                                 * lock here, even if we have a off-slab slab
3475                                 * descriptor it is guaranteed to come from
3476                                 * a different cache, refer to comments before
3477                                 * alloc_slabmgmt.
3478                                 */
3479                                slab_destroy(cachep, slabp);
3480                        } else {
3481                                list_add(&slabp->list, &l3->slabs_free);
3482                        }
3483                } else {
3484                        /* Unconditionally move a slab to the end of the
3485                         * partial list on free - maximum time for the
3486                         * other objects to be freed, too.
3487                         */
3488                        list_add_tail(&slabp->list, &l3->slabs_partial);
3489                }
3490        }
3491}
3492
3493static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3494{
3495        int batchcount;
3496        struct kmem_list3 *l3;
3497        int node = numa_node_id();
3498
3499        batchcount = ac->batchcount;
3500#if DEBUG
3501        BUG_ON(!batchcount || batchcount > ac->avail);
3502#endif
3503        check_irq_off();
3504        l3 = cachep->nodelists[node];
3505        spin_lock(&l3->list_lock);
3506        if (l3->shared) {
3507                struct array_cache *shared_array = l3->shared;
3508                int max = shared_array->limit - shared_array->avail;
3509                if (max) {
3510                        if (batchcount > max)
3511                                batchcount = max;
3512                        memcpy(&(shared_array->entry[shared_array->avail]),
3513                               ac->entry, sizeof(void *) * batchcount);
3514                        shared_array->avail += batchcount;
3515                        goto free_done;
3516                }
3517        }
3518
3519        free_block(cachep, ac->entry, batchcount, node);
3520free_done:
3521#if STATS
3522        {
3523                int i = 0;
3524                struct list_head *p;
3525
3526                p = l3->slabs_free.next;
3527                while (p != &(l3->slabs_free)) {
3528                        struct slab *slabp;
3529
3530                        slabp = list_entry(p, struct slab, list);
3531                        BUG_ON(slabp->inuse);
3532
3533                        i++;
3534                        p = p->next;
3535                }
3536                STATS_SET_FREEABLE(cachep, i);
3537        }
3538#endif
3539        spin_unlock(&l3->list_lock);
3540        ac->avail -= batchcount;
3541        memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3542}
3543
3544/*
3545 * Release an obj back to its cache. If the obj has a constructed state, it must
3546 * be in this state _before_ it is released.  Called with disabled ints.
3547 */
3548static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3549{
3550        struct array_cache *ac = cpu_cache_get(cachep);
3551
3552        check_irq_off();
3553        objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3554
3555        if (cache_free_alien(cachep, objp))
3556                return;
3557
3558        if (likely(ac->avail < ac->limit)) {
3559                STATS_INC_FREEHIT(cachep);
3560                ac->entry[ac->avail++] = objp;
3561                return;
3562        } else {
3563                STATS_INC_FREEMISS(cachep);
3564                cache_flusharray(cachep, ac);
3565                ac->entry[ac->avail++] = objp;
3566        }
3567}
3568
3569/**
3570 * kmem_cache_alloc - Allocate an object
3571 * @cachep: The cache to allocate from.
3572 * @flags: See kmalloc().
3573 *
3574 * Allocate an object from this cache.  The flags are only relevant
3575 * if the cache has no available objects.
3576 */
3577void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3578{
3579        return __cache_alloc(cachep, flags, __builtin_return_address(0));
3580}
3581EXPORT_SYMBOL(kmem_cache_alloc);
3582
3583/**
3584 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3585 * @cache: The cache to allocate from.
3586 * @flags: See kmalloc().
3587 *
3588 * Allocate an object from this cache and set the allocated memory to zero.
3589 * The flags are only relevant if the cache has no available objects.
3590 */
3591void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3592{
3593        void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3594        if (ret)
3595                memset(ret, 0, obj_size(cache));
3596        return ret;
3597}
3598EXPORT_SYMBOL(kmem_cache_zalloc);
3599
3600/**
3601 * kmem_ptr_validate - check if an untrusted pointer might
3602 *      be a slab entry.
3603 * @cachep: the cache we're checking against
3604 * @ptr: pointer to validate
3605 *
3606 * This verifies that the untrusted pointer looks sane:
3607 * it is _not_ a guarantee that the pointer is actually
3608 * part of the slab cache in question, but it at least
3609 * validates that the pointer can be dereferenced and
3610 * looks half-way sane.
3611 *
3612 * Currently only used for dentry validation.
3613 */
3614int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3615{
3616        unsigned long addr = (unsigned long)ptr;
3617        unsigned long min_addr = PAGE_OFFSET;
3618        unsigned long align_mask = BYTES_PER_WORD - 1;
3619        unsigned long size = cachep->buffer_size;
3620        struct page *page;
3621
3622        if (unlikely(addr < min_addr))
3623                goto out;
3624        if (unlikely(addr > (unsigned long)high_memory - size))
3625                goto out;
3626        if (unlikely(addr & align_mask))
3627                goto out;
3628        if (unlikely(!kern_addr_valid(addr)))
3629                goto out;
3630        if (unlikely(!kern_addr_valid(addr + size - 1)))
3631                goto out;
3632        page = virt_to_page(ptr);
3633        if (unlikely(!PageSlab(page)))
3634                goto out;
3635        if (unlikely(page_get_cache(page) != cachep))
3636                goto out;
3637        return 1;
3638out:
3639        return 0;
3640}
3641
3642#ifdef CONFIG_NUMA
3643void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3644{
3645        return __cache_alloc_node(cachep, flags, nodeid,
3646                        __builtin_return_address(0));
3647}
3648EXPORT_SYMBOL(kmem_cache_alloc_node);
3649
3650static __always_inline void *
3651__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3652{
3653        struct kmem_cache *cachep;
3654
3655        cachep = kmem_find_general_cachep(size, flags);
3656        if (unlikely(cachep == NULL))
3657                return NULL;
3658        return kmem_cache_alloc_node(cachep, flags, node);
3659}
3660
3661#ifdef CONFIG_DEBUG_SLAB
3662void *__kmalloc_node(size_t size, gfp_t flags, int node)
3663{
3664        return __do_kmalloc_node(size, flags, node,
3665                        __builtin_return_address(0));
3666}
3667EXPORT_SYMBOL(__kmalloc_node);
3668
3669void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3670                int node, void *caller)
3671{
3672        return __do_kmalloc_node(size, flags, node, caller);
3673}
3674EXPORT_SYMBOL(__kmalloc_node_track_caller);
3675#else
3676void *__kmalloc_node(size_t size, gfp_t flags, int node)
3677{
3678        return __do_kmalloc_node(size, flags, node, NULL);
3679}
3680EXPORT_SYMBOL(__kmalloc_node);
3681#endif /* CONFIG_DEBUG_SLAB */
3682#endif /* CONFIG_NUMA */
3683
3684/**
3685 * __do_kmalloc - allocate memory
3686 * @size: how many bytes of memory are required.
3687 * @flags: the type of memory to allocate (see kmalloc).
3688 * @caller: function caller for debug tracking of the caller
3689 */
3690static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3691                                          void *caller)
3692{
3693        struct kmem_cache *cachep;
3694
3695        /* If you want to save a few bytes .text space: replace
3696         * __ with kmem_.
3697         * Then kmalloc uses the uninlined functions instead of the inline
3698         * functions.
3699         */
3700        cachep = __find_general_cachep(size, flags);
3701        if (unlikely(cachep == NULL))
3702                return NULL;
3703        return __cache_alloc(cachep, flags, caller);
3704}
3705
3706
3707#ifdef CONFIG_DEBUG_SLAB
3708void *__kmalloc(size_t size, gfp_t flags)
3709{
3710        return __do_kmalloc(size, flags, __builtin_return_address(0));
3711}
3712EXPORT_SYMBOL(__kmalloc);
3713
3714void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3715{
3716        return __do_kmalloc(size, flags, caller);
3717}
3718EXPORT_SYMBOL(__kmalloc_track_caller);
3719
3720#else
3721void *__kmalloc(size_t size, gfp_t flags)
3722{
3723        return __do_kmalloc(size, flags, NULL);
3724}
3725EXPORT_SYMBOL(__kmalloc);
3726#endif
3727
3728/**
3729 * krealloc - reallocate memory. The contents will remain unchanged.
3730 * @p: object to reallocate memory for.
3731 * @new_size: how many bytes of memory are required.
3732 * @flags: the type of memory to allocate.
3733 *
3734 * The contents of the object pointed to are preserved up to the
3735 * lesser of the new and old sizes.  If @p is %NULL, krealloc()
3736 * behaves exactly like kmalloc().  If @size is 0 and @p is not a
3737 * %NULL pointer, the object pointed to is freed.
3738 */
3739void *krealloc(const void *p, size_t new_size, gfp_t flags)
3740{
3741        struct kmem_cache *cache, *new_cache;
3742        void *ret;
3743
3744        if (unlikely(!p))
3745                return kmalloc_track_caller(new_size, flags);
3746
3747        if (unlikely(!new_size)) {
3748                kfree(p);
3749                return NULL;
3750        }
3751
3752        cache = virt_to_cache(p);
3753        new_cache = __find_general_cachep(new_size, flags);
3754
3755        /*
3756         * If new size fits in the current cache, bail out.
3757         */
3758        if (likely(cache == new_cache))
3759                return (void *)p;
3760
3761        /*
3762         * We are on the slow-path here so do not use __cache_alloc
3763         * because it bloats kernel text.
3764         */
3765        ret = kmalloc_track_caller(new_size, flags);
3766        if (ret) {
3767                memcpy(ret, p, min(new_size, ksize(p)));
3768                kfree(p);
3769        }
3770        return ret;
3771}
3772EXPORT_SYMBOL(krealloc);
3773
/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Free an object which was previously allocated from this
 * cache.  It is a BUG if @objp does not belong to @cachep.
 */
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
	unsigned long irq_state;

	BUG_ON(virt_to_cache(objp) != cachep);

	/* __cache_free() has to run with local interrupts disabled. */
	local_irq_save(irq_state);
	debug_check_no_locks_freed(objp, obj_size(cachep));
	__cache_free(cachep, objp);
	local_irq_restore(irq_state);
}
3793EXPORT_SYMBOL(kmem_cache_free);
3794
/**
 * kfree - free previously allocated memory
 * @objp: pointer returned by kmalloc.
 *
 * If @objp is NULL, no operation is performed.
 *
 * Don't free memory not originally allocated by kmalloc()
 * or you will run into trouble.
 */
void kfree(const void *objp)
{
	struct kmem_cache *cachep;
	unsigned long irq_state;

	if (unlikely(!objp))
		return;

	local_irq_save(irq_state);
	kfree_debugcheck(objp);
	/* The owning cache is looked up from the object's address. */
	cachep = virt_to_cache(objp);
	debug_check_no_locks_freed(objp, obj_size(cachep));
	__cache_free(cachep, (void *)objp);
	local_irq_restore(irq_state);
}
3818EXPORT_SYMBOL(kfree);
3819
/* Report obj_size() of @cachep. */
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
	unsigned int size = obj_size(cachep);

	return size;
}
3824EXPORT_SYMBOL(kmem_cache_size);
3825
3826const char *kmem_cache_name(struct kmem_cache *cachep)
3827{
3828        return cachep->name;
3829}
3830EXPORT_SYMBOL_GPL(kmem_cache_name);
3831
3832/*
3833 * This initializes kmem_list3 or resizes varioius caches for all nodes.
3834 */
3835static int alloc_kmemlist(struct kmem_cache *cachep)
3836{
3837        int node;
3838        struct kmem_list3 *l3;
3839        struct array_cache *new_shared;
3840        struct array_cache **new_alien = NULL;
3841
3842        for_each_online_node(node) {
3843
3844                if (use_alien_caches) {
3845                        new_alien = alloc_alien_cache(node, cachep->limit);
3846                        if (!new_alien)
3847                                goto fail;
3848                }
3849
3850                new_shared = NULL;
3851                if (cachep->shared) {
3852                        new_shared = alloc_arraycache(node,
3853                                cachep->shared*cachep->batchcount,
3854                                        0xbaadf00d);
3855                        if (!new_shared) {
3856                                free_alien_cache(new_alien);
3857                                goto fail;
3858                        }
3859                }
3860
3861                l3 = cachep->nodelists[node];
3862                if (l3) {
3863                        struct array_cache *shared = l3->shared;
3864
3865                        spin_lock_irq(&l3->list_lock);
3866
3867                        if (shared)
3868                                free_block(cachep, shared->entry,
3869                                                shared->avail, node);
3870
3871                        l3->shared = new_shared;
3872                        if (!l3->alien) {
3873                                l3->alien = new_alien;
3874                                new_alien = NULL;
3875                        }
3876                        l3->free_limit = (1 + nr_cpus_node(node)) *
3877                                        cachep->batchcount + cachep->num;
3878                        spin_unlock_irq(&l3->list_lock);
3879                        kfree(shared);
3880                        free_alien_cache(new_alien);
3881                        continue;
3882                }
3883                l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3884                if (!l3) {
3885                        free_alien_cache(new_alien);
3886                        kfree(new_shared);
3887                        goto fail;
3888                }
3889
3890                kmem_list3_init(l3);
3891                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3892                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3893                l3->shared = new_shared;
3894                l3->alien = new_alien;
3895                l3->free_limit = (1 + nr_cpus_node(node)) *
3896                                        cachep->batchcount + cachep->num;
3897                cachep->nodelists[node] = l3;
3898        }
3899        return 0;
3900
3901fail:
3902        if (!cachep->next.next) {
3903                /* Cache is not active yet. Roll back what we did */
3904                node--;
3905                while (node >= 0) {
3906                        if (cachep->nodelists[node]) {
3907                                l3 = cachep->nodelists[node];
3908
3909                                kfree(l3->shared);
3910                                free_alien_cache(l3->alien);
3911                                kfree(l3);
3912                                cachep->nodelists[node] = NULL;
3913                        }
3914                        node--;
3915                }
3916        }
3917        return -ENOMEM;
3918}
3919
/*
 * Argument block that do_tune_cpucache() passes to do_ccupdate_local()
 * on every CPU.
 */
struct ccupdate_struct {
	/* cache whose per-cpu array pointers are exchanged */
	struct kmem_cache *cachep;
	/*
	 * Indexed by CPU id.  Holds the replacement array_cache on entry;
	 * do_ccupdate_local() stores the array it displaced in the same slot.
	 */
	struct array_cache *new[NR_CPUS];
};
3924
3925static void do_ccupdate_local(void *info)
3926{
3927        struct ccupdate_struct *new = info;
3928        struct array_cache *old;
3929
3930        check_irq_off();
3931        old = cpu_cache_get(new->cachep);
3932
3933        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3934        new->new[smp_processor_id()] = old;
3935}
3936
3937/* Always called with the cache_chain_mutex held */
3938static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3939                                int batchcount, int shared)
3940{
3941        struct ccupdate_struct *new;
3942        int i;
3943
3944        new = kzalloc(sizeof(*new), GFP_KERNEL);
3945        if (!new)
3946                return -ENOMEM;
3947
3948        for_each_online_cpu(i) {
3949                new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3950                                                batchcount);
3951                if (!new->new[i]) {
3952                        for (i--; i >= 0; i--)
3953                                kfree(new->new[i]);
3954                        kfree(new);
3955                        return -ENOMEM;
3956                }
3957        }
3958        new->cachep = cachep;
3959
3960        on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3961
3962        check_irq_on();
3963        cachep->batchcount = batchcount;
3964        cachep->limit = limit;
3965        cachep->shared = shared;
3966
3967        for_each_online_cpu(i) {
3968                struct array_cache *ccold = new->new[i];
3969                if (!ccold)
3970                        continue;
3971                spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3972                free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3973                spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3974                kfree(ccold);
3975        }
3976        kfree(new);
3977        return alloc_kmemlist(cachep);
3978}
3979
3980/* Called with cache_chain_mutex held always */
3981static int enable_cpucache(struct kmem_cache *cachep)
3982{
3983        int err;
3984        int limit, shared;
3985
3986        /*
3987         * The head array serves three purposes:
3988         * - create a LIFO ordering, i.e. return objects that are cache-warm
3989         * - reduce the number of spinlock operations.
3990         * - reduce the number of linked list operations on the slab and
3991         *   bufctl chains: array operations are cheaper.
3992         * The numbers are guessed, we should auto-tune as described by
3993         * Bonwick.
3994         */
3995        if (cachep->buffer_size > 131072)
3996                limit = 1;
3997        else if (cachep->buffer_size > PAGE_SIZE)
3998                limit = 8;
3999        else if (cachep->buffer_size > 1024)
4000                limit = 24;
4001        else if (cachep->buffer_size > 256)
4002                limit = 54;
4003        else
4004                limit = 120;
4005
4006        /*
4007         * CPU bound tasks (e.g. network routing) can exhibit cpu bound
4008         * allocation behaviour: Most allocs on one cpu, most free operations
4009         * on another cpu. For these cases, an efficient object passing between
4010         * cpus is necessary. This is provided by a shared array. The array
4011         * replaces Bonwick's magazine layer.
4012         * On uniprocessor, it's functionally equivalent (but less efficient)
4013         * to a larger limit. Thus disabled by default.
4014         */
4015        shared = 0;
4016        if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
4017                shared = 8;
4018
4019#if DEBUG
4020        /*
4021         * With debugging enabled, large batchcount lead to excessively long
4022         * periods with disabled local interrupts. Limit the batchcount
4023         */
4024        if (limit > 32)
4025                limit = 32;
4026#endif
4027        err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
4028        if (err)
4029                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4030                       cachep->name, -err);
4031        return err;
4032}
4033
4034/*
4035 * Drain an array if it contains any elements taking the l3 lock only if
4036 * necessary. Note that the l3 listlock also protects the array_cache
4037 * if drain_array() is used on the shared array.
4038 */
4039void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4040                         struct array_cache *ac, int force, int node)
4041{
4042        int tofree;
4043
4044        if (!ac || !ac->avail)
4045                return;
4046        if (ac->touched && !force) {
4047                ac->touched = 0;
4048        } else {
4049                spin_lock_irq(&l3->list_lock);
4050                if (ac->avail) {
4051                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
4052                        if (tofree > ac->avail)
4053                                tofree = (ac->avail + 1) / 2;
4054                        free_block(cachep, ac->entry, tofree, node);
4055                        ac->avail -= tofree;
4056                        memmove(ac->entry, &(ac->entry[tofree]),
4057                                sizeof(void *) * ac->avail);
4058                }
4059                spin_unlock_irq(&l3->list_lock);
4060        }
4061}
4062
4063/**
4064 * cache_reap - Reclaim memory from caches.
4065 * @w: work descriptor
4066 *
4067 * Called from workqueue/eventd every few seconds.
4068 * Purpose:
4069 * - clear the per-cpu caches for this CPU.
4070 * - return freeable pages to the main free memory pool.
4071 *
4072 * If we cannot acquire the cache chain mutex then just give up - we'll try
4073 * again on the next iteration.
4074 */
4075static void cache_reap(struct work_struct *w)
4076{
4077        struct kmem_cache *searchp;
4078        struct kmem_list3 *l3;
4079        int node = numa_node_id();
4080        struct delayed_work *work =
4081                container_of(w, struct delayed_work, work);
4082
4083        if (!mutex_trylock(&cache_chain_mutex))
4084                /* Give up. Setup the next iteration. */
4085                goto out;
4086
4087        list_for_each_entry(searchp, &cache_chain, next) {
4088                check_irq_on();
4089
4090                /*
4091                 * We only take the l3 lock if absolutely necessary and we
4092                 * have established with reasonable certainty that
4093                 * we can do some work if the lock was obtained.
4094                 */
4095                l3 = searchp->nodelists[node];
4096
4097                reap_alien(searchp, l3);
4098
4099                drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4100
4101                /*
4102                 * These are racy checks but it does not matter
4103                 * if we skip one check or scan twice.
4104                 */
4105                if (time_after(l3->next_reap, jiffies))
4106                        goto next;
4107
4108                l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4109
4110                drain_array(searchp, l3, l3->shared, 0, node);
4111
4112                if (l3->free_touched)
4113                        l3->free_touched = 0;
4114                else {
4115                        int freed;
4116
4117                        freed = drain_freelist(searchp, l3, (l3->free_limit +
4118                                5 * searchp->num - 1) / (5 * searchp->num));
4119                        STATS_ADD_REAPED(searchp, freed);
4120                }
4121next:
4122                cond_resched();
4123        }
4124        check_irq_on();
4125        mutex_unlock(&cache_chain_mutex);
4126        next_reap_node();
4127out:
4128        /* Set up the next iteration */
4129        schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4130}
4131
4132#ifdef CONFIG_PROC_FS
4133
/*
 * Emit the slabinfo header to @m: a version line followed by one line of
 * column names.  With STATS enabled the version line is tagged and the
 * globalstat/cpustat columns are appended.
 */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
#if STATS
	const char *version = "slabinfo - version: 2.1 (statistics)\n";
#else
	const char *version = "slabinfo - version: 2.1\n";
#endif

	seq_puts(m, version);
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
		 "<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
	seq_putc(m, '\n');
}
4156
4157static void *s_start(struct seq_file *m, loff_t *pos)
4158{
4159        loff_t n = *pos;
4160        struct list_head *p;
4161
4162        mutex_lock(&cache_chain_mutex);
4163        if (!n)
4164                print_slabinfo_header(m);
4165        p = cache_chain.next;
4166        while (n--) {
4167                p = p->next;
4168                if (p == &cache_chain)
4169                        return NULL;
4170        }
4171        return list_entry(p, struct kmem_cache, next);
4172}
4173
4174static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4175{
4176        struct kmem_cache *cachep = p;
4177        ++*pos;
4178        return cachep->next.next == &cache_chain ?
4179                NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4180}
4181
/*
 * Release cache_chain_mutex, which s_start() acquires.  Neither @m nor @p
 * is used.
 */
static void s_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&cache_chain_mutex);
}
4186
4187static int s_show(struct seq_file *m, void *p)
4188{
4189        struct kmem_cache *cachep = p;
4190        struct slab *slabp;
4191        unsigned long active_objs;
4192        unsigned long num_objs;
4193        unsigned long active_slabs = 0;
4194        unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4195        const char *name;
4196        char *error = NULL;
4197        int node;
4198        struct kmem_list3 *l3;
4199
4200        active_objs = 0;
4201        num_slabs = 0;
4202        for_each_online_node(node) {
4203                l3 = cachep->nodelists[node];
4204                if (!l3)
4205                        continue;
4206
4207                check_irq_on();
4208                spin_lock_irq(&l3->list_lock);
4209
4210                list_for_each_entry(slabp, &l3->slabs_full, list) {
4211                        if (slabp->inuse != cachep->num && !error)
4212                                error = "slabs_full accounting error";
4213                        active_objs += cachep->num;
4214                        active_slabs++;
4215                }
4216                list_for_each_entry(slabp, &l3->slabs_partial, list) {
4217                        if (slabp->inuse == cachep->num && !error)
4218                                error = "slabs_partial inuse accounting error";
4219                        if (!slabp->inuse && !error)
4220                                error = "slabs_partial/inuse accounting error";
4221                        active_objs += slabp->inuse;
4222                        active_slabs++;
4223                }
4224                list_for_each_entry(slabp, &l3->slabs_free, list) {
4225                        if (slabp->inuse && !error)
4226                                error = "slabs_free/inuse accounting error";
4227                        num_slabs++;
4228                }
4229                free_objects += l3->free_objects;
4230                if (l3->shared)
4231                        shared_avail += l3->shared->avail;
4232
4233                spin_unlock_irq(&l3->list_lock);
4234        }
4235        num_slabs += active_slabs;
4236        num_objs = num_slabs * cachep->num;
4237        if (num_objs - active_objs != free_objects && !error)
4238                error = "free_objects accounting error";
4239
4240        name = cachep->name;
4241        if (error)
4242                printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4243
4244        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4245                   name, active_objs, num_objs, cachep->buffer_size,
4246                   cachep->num, (1 << cachep->gfporder));
4247        seq_printf(m, " : tunables %4u %4u %4u",
4248                   cachep->limit, cachep->batchcount, cachep->shared);
4249        seq_printf(m, " : slabdata %6lu %6lu %6lu",
4250                   active_slabs, num_slabs, shared_avail);
4251#if STATS
4252        {                       /* list3 stats */
4253                unsigned long high = cachep->high_mark;
4254                unsigned long allocs = cachep->num_allocations;
4255                unsigned long grown = cachep->grown;
4256                unsigned long reaped = cachep->reaped;
4257                unsigned long errors = cachep->errors;
4258                unsigned long max_freeable = cachep->max_freeable;
4259                unsigned long node_allocs = cachep->node_allocs;
4260                unsigned long node_frees = cachep->node_frees;
4261                unsigned long overflows = cachep->node_overflow;
4262
4263                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
4264                                %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4265                                reaped, errors, max_freeable, node_allocs,
4266                                node_frees, overflows);
4267        }
4268        /* cpu stats */
4269        {
4270                unsigned long allochit = atomic_read(&cachep->allochit);
4271                unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4272                unsigned long freehit = atomic_read(&cachep->freehit);
4273                unsigned long freemiss = atomic_read(&cachep->freemiss);
4274
4275                seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4276                           allochit, allocmiss, freehit, freemiss);
4277        }
4278#endif
4279        seq_putc(m, '\n');
4280        return 0;
4281}
4282
4283/*
4284 * slabinfo_op - iterator that generates /proc/slabinfo
4285 *
4286 * Output layout:
4287 * cache-name
4288 * num-active-objs
4289 * total-objs
4290 * object size
4291 * num-active-slabs
4292 * total-slabs
4293 * num-pages-per-slab
4294 * + further values on SMP and with statistics enabled
4295 */
4296
4297const struct seq_operations slabinfo_op = {
4298        .start = s_start,
4299        .next = s_next,
4300        .stop = s_stop,
4301        .show = s_show,
4302};
4303
4304#define MAX_SLABINFO_WRITE 128
4305/**
4306 * slabinfo_write - Tuning for the slab allocator
4307 * @file: unused
4308 * @buffer: user buffer
4309 * @count: data length
4310 * @ppos: unused
4311 */
4312ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4313                       size_t count, loff_t *ppos)
4314{
4315        char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4316        int limit, batchcount, shared, res;
4317        struct kmem_cache *cachep;
4318
4319        if (count > MAX_SLABINFO_WRITE)
4320                return -EINVAL;
4321        if (copy_from_user(&kbuf, buffer, count))
4322                return -EFAULT;
4323        kbuf[MAX_SLABINFO_WRITE] = '\0';
4324
4325        tmp = strchr(kbuf, ' ');
4326        if (!tmp)
4327                return -EINVAL;
4328        *tmp = '\0';
4329        tmp++;
4330        if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4331                return -EINVAL;
4332
4333        /* Find the cache in the chain of caches. */
4334        mutex_lock(&cache_chain_mutex);
4335        res = -EINVAL;
4336        list_for_each_entry(cachep, &cache_chain, next) {
4337                if (!strcmp(cachep->name, kbuf)) {
4338                        if (limit < 1 || batchcount < 1 ||
4339                                        batchcount > limit || shared < 0) {
4340                                res = 0;
4341                        } else {
4342                                res = do_tune_cpucache(cachep, limit,
4343                                                       batchcount, shared);
4344                        }
4345                        break;
4346                }
4347        }
4348        mutex_unlock(&cache_chain_mutex);
4349        if (res >= 0)
4350                res = count;
4351        return res;
4352}
4353
4354#ifdef CONFIG_DEBUG_SLAB_LEAK
4355
4356static void *leaks_start(struct seq_file *m, loff_t *pos)
4357{
4358        loff_t n = *pos;
4359        struct list_head *p;
4360
4361        mutex_lock(&cache_chain_mutex);
4362        p = cache_chain.next;
4363        while (n--) {
4364                p = p->next;
4365                if (p == &cache_chain)
4366                        return NULL;
4367        }
4368        return list_entry(p, struct kmem_cache, next);
4369}
4370
/*
 * add_caller - account one allocation to caller address @v in table @n.
 *
 * Table layout: n[0] is the capacity in entries, n[1] the number of
 * entries in use, and from n[2] onwards come (address, hits) pairs kept
 * sorted by ascending address.
 *
 * A zero @v is ignored.  If @v is already present its hit count is
 * bumped; otherwise a new pair with a count of 1 is inserted at its
 * sorted position.
 *
 * Returns 1 on success.  Returns 0 when the table is full: n[1] has then
 * been raised to equal n[0] and nothing was inserted.
 */
static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *base;
	int remaining;

	if (v == 0)
		return 1;

	/* Binary search over the sorted pairs. */
	base = n + 2;
	remaining = n[1];
	while (remaining != 0) {
		int half = remaining / 2;
		unsigned long *probe = base + 2 * half;

		if (*probe == v) {
			probe[1]++;
			return 1;
		}
		if (*probe > v) {
			remaining = half;
			continue;
		}
		base = probe + 2;
		remaining -= half + 1;
	}

	/* Not found: base now points at the insertion slot. */
	n[1]++;
	if (n[1] == n[0])
		return 0;

	/* Shift the pairs from the slot onwards up by one pair. */
	memmove(base + 2, base,
		n[1] * 2 * sizeof(unsigned long) -
		(size_t)(base - n) * sizeof(unsigned long));
	base[0] = v;
	base[1] = 1;
	return 1;
}
4400
4401static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4402{
4403        void *p;
4404        int i;
4405        if (n[0] == n[1])
4406                return;
4407        for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4408                if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4409                        continue;
4410                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4411                        return;
4412        }
4413}
4414
/*
 * show_symbol - print @address to @m, symbolically when possible.
 *
 * With CONFIG_KALLSYMS, a successful lookup_symbol_attrs() yields
 * "name+offset/size", followed by " [module]" when a module name was
 * filled in.  Otherwise, or if the lookup fails, the raw pointer value
 * is printed.
 */
static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1];
	unsigned long offset, size;

	if (!lookup_symbol_attrs(address, &size, &offset, modname, name)) {
		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
		if (modname[0] != '\0')
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}
4430
4431static int leaks_show(struct seq_file *m, void *p)
4432{
4433        struct kmem_cache *cachep = p;
4434        struct slab *slabp;
4435        struct kmem_list3 *l3;
4436        const char *name;
4437        unsigned long *n = m->private;
4438        int node;
4439        int i;
4440
4441        if (!(cachep->flags & SLAB_STORE_USER))
4442                return 0;
4443        if (!(cachep->flags & SLAB_RED_ZONE))
4444                return 0;
4445
4446        /* OK, we can do it */
4447
4448        n[1] = 0;
4449
4450        for_each_online_node(node) {
4451                l3 = cachep->nodelists[node];
4452                if (!l3)
4453                        continue;
4454
4455                check_irq_on();
4456                spin_lock_irq(&l3->list_lock);
4457
4458                list_for_each_entry(slabp, &l3->slabs_full, list)
4459                        handle_slab(n, cachep, slabp);
4460                list_for_each_entry(slabp, &l3->slabs_partial, list)
4461                        handle_slab(n, cachep, slabp);
4462                spin_unlock_irq(&l3->list_lock);
4463        }
4464        name = cachep->name;
4465        if (n[0] == n[1]) {
4466                /* Increase the buffer size */
4467                mutex_unlock(&cache_chain_mutex);
4468                m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4469                if (!m->private) {
4470                        /* Too bad, we are really out */
4471                        m->private = n;
4472                        mutex_lock(&cache_chain_mutex);
4473                        return -ENOMEM;
4474                }
4475                *(unsigned long *)m->private = n[0] * 2;
4476                kfree(n);
4477                mutex_lock(&cache_chain_mutex);
4478                /* Now make sure this entry will be retried */
4479                m->count = m->size;
4480                return 0;
4481        }
4482        for (i = 0; i < n[1]; i++) {
4483                seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4484                show_symbol(m, n[2*i+2]);
4485                seq_putc(m, '\n');
4486        }
4487
4488        return 0;
4489}
4490
4491const struct seq_operations slabstats_op = {
4492        .start = leaks_start,
4493        .next = s_next,
4494        .stop = s_stop,
4495        .show = leaks_show,
4496};
4497#endif
4498#endif
4499
/**
 * ksize - get the actual amount of memory allocated for a given object
 * @objp: Pointer to the object
 *
 * kmalloc may round a request up internally and hand out more memory
 * than was asked for. ksize() reports the amount really allocated, and
 * the caller is free to use all of it even though a smaller size was
 * passed to kmalloc. @objp must point to a valid object obtained from
 * kmalloc() or kmem_cache_alloc(), and that object must not be freed
 * while the call is in progress.
 *
 * A NULL @objp yields 0.
 */
size_t ksize(const void *objp)
{
	return unlikely(objp == NULL) ? 0 : obj_size(virt_to_cache(objp));
}
4519
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.