linux/mm/slab.c
<<
>>
Prefs
   1/*
   2 * linux/mm/slab.c
   3 * Written by Mark Hemment, 1996/97.
   4 * (markhe@nextd.demon.co.uk)
   5 *
   6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7 *
   8 * Major cleanup, different bufctl logic, per-cpu arrays
   9 *      (c) 2000 Manfred Spraul
  10 *
  11 * Cleanup, make the head arrays unconditional, preparation for NUMA
  12 *      (c) 2002 Manfred Spraul
  13 *
  14 * An implementation of the Slab Allocator as described in outline in;
  15 *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16 *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17 * or with a little more detail in;
  18 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19 *      Jeff Bonwick (Sun Microsystems).
  20 *      Presented at: USENIX Summer 1994 Technical Conference
  21 *
  22 * The memory is organized in caches, one cache for each object type.
  23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
   24 * Each cache consists of many slabs (they are small (usually one
  25 * page long) and always contiguous), and each slab contains multiple
  26 * initialized objects.
  27 *
  28 * This means, that your constructor is used only for newly allocated
  29 * slabs and you must pass objects with the same initializations to
  30 * kmem_cache_free.
  31 *
  32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
   33 * normal). If you need a special memory type, then you must create a new
  34 * cache for that memory type.
  35 *
  36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37 *   full slabs with 0 free objects
  38 *   partial slabs
  39 *   empty slabs with no allocated objects
  40 *
  41 * If partial slabs exist, then new allocations come from these slabs,
  42 * otherwise from empty slabs or new slabs are allocated.
  43 *
  44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46 *
  47 * Each cache has a short per-cpu head array, most allocs
  48 * and frees go into that array, and if that array overflows, then 1/2
  49 * of the entries in the array are given back into the global cache.
  50 * The head array is strictly LIFO and should improve the cache hit rates.
  51 * On SMP, it additionally reduces the spinlock operations.
  52 *
  53 * The c_cpuarray may not be read with enabled local interrupts -
  54 * it's changed with a smp_call_function().
  55 *
  56 * SMP synchronization:
  57 *  constructors and destructors are called without any locking.
  58 *  Several members in struct kmem_cache and struct slab never change, they
  59 *      are accessed without any locking.
  60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61 *      and local interrupts are disabled so slab code is preempt-safe.
  62 *  The non-constant members are protected with a per-cache irq spinlock.
  63 *
  64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65 * in 2000 - many ideas in the current implementation are derived from
  66 * his patch.
  67 *
  68 * Further notes from the original documentation:
  69 *
  70 * 11 April '97.  Started multi-threading - markhe
  71 *      The global cache-chain is protected by the mutex 'cache_chain_mutex'.
  72 *      The sem is only needed when accessing/extending the cache-chain, which
  73 *      can never happen inside an interrupt (kmem_cache_create(),
  74 *      kmem_cache_shrink() and kmem_cache_reap()).
  75 *
  76 *      At present, each engine can be growing a cache.  This should be blocked.
  77 *
  78 * 15 March 2005. NUMA slab allocator.
  79 *      Shai Fultheim <shai@scalex86.org>.
  80 *      Shobhit Dayal <shobhit@calsoftinc.com>
  81 *      Alok N Kataria <alokk@calsoftinc.com>
  82 *      Christoph Lameter <christoph@lameter.com>
  83 *
  84 *      Modified the slab allocator to be node aware on NUMA systems.
  85 *      Each node has its own list of partial, free and full slabs.
  86 *      All object allocations for a node occur from node specific slab lists.
  87 */
  88
  89#include        <linux/slab.h>
  90#include        <linux/mm.h>
  91#include        <linux/poison.h>
  92#include        <linux/swap.h>
  93#include        <linux/cache.h>
  94#include        <linux/interrupt.h>
  95#include        <linux/init.h>
  96#include        <linux/compiler.h>
  97#include        <linux/cpuset.h>
  98#include        <linux/proc_fs.h>
  99#include        <linux/seq_file.h>
 100#include        <linux/notifier.h>
 101#include        <linux/kallsyms.h>
 102#include        <linux/cpu.h>
 103#include        <linux/sysctl.h>
 104#include        <linux/module.h>
 105#include        <linux/kmemtrace.h>
 106#include        <linux/rcupdate.h>
 107#include        <linux/string.h>
 108#include        <linux/uaccess.h>
 109#include        <linux/nodemask.h>
 110#include        <linux/kmemleak.h>
 111#include        <linux/mempolicy.h>
 112#include        <linux/mutex.h>
 113#include        <linux/fault-inject.h>
 114#include        <linux/rtmutex.h>
 115#include        <linux/reciprocal_div.h>
 116#include        <linux/debugobjects.h>
 117#include        <linux/kmemcheck.h>
 118
 119#include        <asm/cacheflush.h>
 120#include        <asm/tlbflush.h>
 121#include        <asm/page.h>
 122
 123/*
 124 * DEBUG        - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
 125 *                0 for faster, smaller code (especially in the critical paths).
 126 *
 127 * STATS        - 1 to collect stats for /proc/slabinfo.
 128 *                0 for faster, smaller code (especially in the critical paths).
 129 *
 130 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 131 */
 132
 133#ifdef CONFIG_DEBUG_SLAB
 134#define DEBUG           1
 135#define STATS           1
 136#define FORCED_DEBUG    1
 137#else
 138#define DEBUG           0
 139#define STATS           0
 140#define FORCED_DEBUG    0
 141#endif
 142
 143/* Shouldn't this be in a header file somewhere? */
 144#define BYTES_PER_WORD          sizeof(void *)
 145#define REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))
 146
 147#ifndef ARCH_KMALLOC_MINALIGN
 148/*
 149 * Enforce a minimum alignment for the kmalloc caches.
 150 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 151 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 152 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 153 * alignment larger than the alignment of a 64-bit integer.
 154 * ARCH_KMALLOC_MINALIGN allows that.
 155 * Note that increasing this value may disable some debug features.
 156 */
 157#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 158#endif
 159
 160#ifndef ARCH_SLAB_MINALIGN
 161/*
 162 * Enforce a minimum alignment for all caches.
 163 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 164 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 165 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 166 * some debug features.
 167 */
 168#define ARCH_SLAB_MINALIGN 0
 169#endif
 170
 171#ifndef ARCH_KMALLOC_FLAGS
 172#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 173#endif
 174
 175/* Legal flag mask for kmem_cache_create(). */
 176#if DEBUG
 177# define CREATE_MASK    (SLAB_RED_ZONE | \
 178                         SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 179                         SLAB_CACHE_DMA | \
 180                         SLAB_STORE_USER | \
 181                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 182                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
 183                         SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
 184#else
 185# define CREATE_MASK    (SLAB_HWCACHE_ALIGN | \
 186                         SLAB_CACHE_DMA | \
 187                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 188                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
 189                         SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
 190#endif
 191
 192/*
 193 * kmem_bufctl_t:
 194 *
 195 * Bufctl's are used for linking objs within a slab
 196 * linked offsets.
 197 *
 198 * This implementation relies on "struct page" for locating the cache &
 199 * slab an object belongs to.
 200 * This allows the bufctl structure to be small (one int), but limits
 201 * the number of objects a slab (not a cache) can contain when off-slab
 202 * bufctls are used. The limit is the size of the largest general cache
 203 * that does not use off-slab slabs.
  204 * For 32bit archs with 4 kB pages, this is 56.
 205 * This is not serious, as it is only for large objects, when it is unwise
 206 * to have too many per slab.
 207 * Note: This limit can be raised by introducing a general cache whose size
 208 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 209 */
 210
 211typedef unsigned int kmem_bufctl_t;
 212#define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
 213#define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
 214#define BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
 215#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
 216
 217/*
 218 * struct slab
 219 *
 220 * Manages the objs in a slab. Placed either at the beginning of mem allocated
  221 * for a slab, or allocated from a general cache.
  222 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 223 */
 224struct slab {
 225        struct list_head list;
 226        unsigned long colouroff;
 227        void *s_mem;            /* including colour offset */
 228        unsigned int inuse;     /* num of objs active in slab */
 229        kmem_bufctl_t free;
 230        unsigned short nodeid;
 231};
 232
 233/*
 234 * struct slab_rcu
 235 *
 236 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 237 * arrange for kmem_freepages to be called via RCU.  This is useful if
 238 * we need to approach a kernel structure obliquely, from its address
 239 * obtained without the usual locking.  We can lock the structure to
 240 * stabilize it and check it's still at the given address, only if we
 241 * can be sure that the memory has not been meanwhile reused for some
 242 * other kind of object (which our subsystem's lock might corrupt).
 243 *
 244 * rcu_read_lock before reading the address, then rcu_read_unlock after
 245 * taking the spinlock within the structure expected at that address.
 246 *
 247 * We assume struct slab_rcu can overlay struct slab when destroying.
 248 */
 249struct slab_rcu {
 250        struct rcu_head head;
 251        struct kmem_cache *cachep;
 252        void *addr;
 253};
 254
 255/*
 256 * struct array_cache
 257 *
 258 * Purpose:
 259 * - LIFO ordering, to hand out cache-warm objects from _alloc
 260 * - reduce the number of linked list operations
 261 * - reduce spinlock operations
 262 *
 263 * The limit is stored in the per-cpu structure to reduce the data cache
 264 * footprint.
 265 *
 266 */
 267struct array_cache {
 268        unsigned int avail;
 269        unsigned int limit;
 270        unsigned int batchcount;
 271        unsigned int touched;
 272        spinlock_t lock;
 273        void *entry[];  /*
 274                         * Must have this definition in here for the proper
 275                         * alignment of array_cache. Also simplifies accessing
 276                         * the entries.
 277                         */
 278};
 279
 280/*
 281 * bootstrap: The caches do not work without cpuarrays anymore, but the
 282 * cpuarrays are allocated from the generic caches...
 283 */
 284#define BOOT_CPUCACHE_ENTRIES   1
 285struct arraycache_init {
 286        struct array_cache cache;
 287        void *entries[BOOT_CPUCACHE_ENTRIES];
 288};
 289
 290/*
 291 * The slab lists for all objects.
 292 */
 293struct kmem_list3 {
 294        struct list_head slabs_partial; /* partial list first, better asm code */
 295        struct list_head slabs_full;
 296        struct list_head slabs_free;
 297        unsigned long free_objects;
 298        unsigned int free_limit;
 299        unsigned int colour_next;       /* Per-node cache coloring */
 300        spinlock_t list_lock;
 301        struct array_cache *shared;     /* shared per node */
 302        struct array_cache **alien;     /* on other nodes */
 303        unsigned long next_reap;        /* updated without locking */
 304        int free_touched;               /* updated without locking */
 305};
 306
 307/*
 308 * Need this for bootstrapping a per node allocator.
 309 */
 310#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
 311struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 312#define CACHE_CACHE 0
 313#define SIZE_AC MAX_NUMNODES
 314#define SIZE_L3 (2 * MAX_NUMNODES)
 315
 316static int drain_freelist(struct kmem_cache *cache,
 317                        struct kmem_list3 *l3, int tofree);
 318static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 319                        int node);
 320static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 321static void cache_reap(struct work_struct *unused);
 322
 323/*
 324 * This function must be completely optimized away if a constant is passed to
 325 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 326 */
 327static __always_inline int index_of(const size_t size)
 328{
 329        extern void __bad_size(void);
 330
 331        if (__builtin_constant_p(size)) {
 332                int i = 0;
 333
 334#define CACHE(x) \
 335        if (size <=x) \
 336                return i; \
 337        else \
 338                i++;
 339#include <linux/kmalloc_sizes.h>
 340#undef CACHE
 341                __bad_size();
 342        } else
 343                __bad_size();
 344        return 0;
 345}
 346
 347static int slab_early_init = 1;
 348
 349#define INDEX_AC index_of(sizeof(struct arraycache_init))
 350#define INDEX_L3 index_of(sizeof(struct kmem_list3))
 351
 352static void kmem_list3_init(struct kmem_list3 *parent)
 353{
 354        INIT_LIST_HEAD(&parent->slabs_full);
 355        INIT_LIST_HEAD(&parent->slabs_partial);
 356        INIT_LIST_HEAD(&parent->slabs_free);
 357        parent->shared = NULL;
 358        parent->alien = NULL;
 359        parent->colour_next = 0;
 360        spin_lock_init(&parent->list_lock);
 361        parent->free_objects = 0;
 362        parent->free_touched = 0;
 363}
 364
 365#define MAKE_LIST(cachep, listp, slab, nodeid)                          \
 366        do {                                                            \
 367                INIT_LIST_HEAD(listp);                                  \
 368                list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
 369        } while (0)
 370
 371#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
 372        do {                                                            \
 373        MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
 374        MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 375        MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
 376        } while (0)
 377
 378#define CFLGS_OFF_SLAB          (0x80000000UL)
 379#define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 380
 381#define BATCHREFILL_LIMIT       16
 382/*
 383 * Optimization question: fewer reaps means less probability for unnessary
 384 * cpucache drain/refill cycles.
 385 *
 386 * OTOH the cpuarrays can contain lots of objects,
 387 * which could lock up otherwise freeable slabs.
 388 */
 389#define REAPTIMEOUT_CPUC        (2*HZ)
 390#define REAPTIMEOUT_LIST3       (4*HZ)
 391
 392#if STATS
 393#define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 394#define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 395#define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 396#define STATS_INC_GROWN(x)      ((x)->grown++)
 397#define STATS_ADD_REAPED(x,y)   ((x)->reaped += (y))
 398#define STATS_SET_HIGH(x)                                               \
 399        do {                                                            \
 400                if ((x)->num_active > (x)->high_mark)                   \
 401                        (x)->high_mark = (x)->num_active;               \
 402        } while (0)
 403#define STATS_INC_ERR(x)        ((x)->errors++)
 404#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 405#define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
 406#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
 407#define STATS_SET_FREEABLE(x, i)                                        \
 408        do {                                                            \
 409                if ((x)->max_freeable < i)                              \
 410                        (x)->max_freeable = i;                          \
 411        } while (0)
 412#define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 413#define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 414#define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 415#define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 416#else
 417#define STATS_INC_ACTIVE(x)     do { } while (0)
 418#define STATS_DEC_ACTIVE(x)     do { } while (0)
 419#define STATS_INC_ALLOCED(x)    do { } while (0)
 420#define STATS_INC_GROWN(x)      do { } while (0)
 421#define STATS_ADD_REAPED(x,y)   do { } while (0)
 422#define STATS_SET_HIGH(x)       do { } while (0)
 423#define STATS_INC_ERR(x)        do { } while (0)
 424#define STATS_INC_NODEALLOCS(x) do { } while (0)
 425#define STATS_INC_NODEFREES(x)  do { } while (0)
 426#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
 427#define STATS_SET_FREEABLE(x, i) do { } while (0)
 428#define STATS_INC_ALLOCHIT(x)   do { } while (0)
 429#define STATS_INC_ALLOCMISS(x)  do { } while (0)
 430#define STATS_INC_FREEHIT(x)    do { } while (0)
 431#define STATS_INC_FREEMISS(x)   do { } while (0)
 432#endif
 433
 434#if DEBUG
 435
 436/*
 437 * memory layout of objects:
 438 * 0            : objp
 439 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 440 *              the end of an object is aligned with the end of the real
 441 *              allocation. Catches writes behind the end of the allocation.
 442 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 443 *              redzone word.
 444 * cachep->obj_offset: The real object.
 445 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 446 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 447 *                                      [BYTES_PER_WORD long]
 448 */
 449static int obj_offset(struct kmem_cache *cachep)
 450{
 451        return cachep->obj_offset;
 452}
 453
 454static int obj_size(struct kmem_cache *cachep)
 455{
 456        return cachep->obj_size;
 457}
 458
 459static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 460{
 461        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 462        return (unsigned long long*) (objp + obj_offset(cachep) -
 463                                      sizeof(unsigned long long));
 464}
 465
 466static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 467{
 468        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 469        if (cachep->flags & SLAB_STORE_USER)
 470                return (unsigned long long *)(objp + cachep->buffer_size -
 471                                              sizeof(unsigned long long) -
 472                                              REDZONE_ALIGN);
 473        return (unsigned long long *) (objp + cachep->buffer_size -
 474                                       sizeof(unsigned long long));
 475}
 476
 477static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 478{
 479        BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 480        return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
 481}
 482
 483#else
 484
 485#define obj_offset(x)                   0
 486#define obj_size(cachep)                (cachep->buffer_size)
 487#define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 488#define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long long *)NULL;})
 489#define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 490
 491#endif
 492
 493#ifdef CONFIG_TRACING
 494size_t slab_buffer_size(struct kmem_cache *cachep)
 495{
 496        return cachep->buffer_size;
 497}
 498EXPORT_SYMBOL(slab_buffer_size);
 499#endif
 500
 501/*
 502 * Do not go above this order unless 0 objects fit into the slab.
 503 */
 504#define BREAK_GFP_ORDER_HI      1
 505#define BREAK_GFP_ORDER_LO      0
 506static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 507
 508/*
 509 * Functions for storing/retrieving the cachep and or slab from the page
 510 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 511 * these are used to find the cache which an obj belongs to.
 512 */
 513static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 514{
 515        page->lru.next = (struct list_head *)cache;
 516}
 517
 518static inline struct kmem_cache *page_get_cache(struct page *page)
 519{
 520        page = compound_head(page);
 521        BUG_ON(!PageSlab(page));
 522        return (struct kmem_cache *)page->lru.next;
 523}
 524
 525static inline void page_set_slab(struct page *page, struct slab *slab)
 526{
 527        page->lru.prev = (struct list_head *)slab;
 528}
 529
 530static inline struct slab *page_get_slab(struct page *page)
 531{
 532        BUG_ON(!PageSlab(page));
 533        return (struct slab *)page->lru.prev;
 534}
 535
 536static inline struct kmem_cache *virt_to_cache(const void *obj)
 537{
 538        struct page *page = virt_to_head_page(obj);
 539        return page_get_cache(page);
 540}
 541
 542static inline struct slab *virt_to_slab(const void *obj)
 543{
 544        struct page *page = virt_to_head_page(obj);
 545        return page_get_slab(page);
 546}
 547
 548static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 549                                 unsigned int idx)
 550{
 551        return slab->s_mem + cache->buffer_size * idx;
 552}
 553
 554/*
 555 * We want to avoid an expensive divide : (offset / cache->buffer_size)
 556 *   Using the fact that buffer_size is a constant for a particular cache,
 557 *   we can replace (offset / cache->buffer_size) by
 558 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 559 */
 560static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 561                                        const struct slab *slab, void *obj)
 562{
 563        u32 offset = (obj - slab->s_mem);
 564        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 565}
 566
 567/*
 568 * These are the default caches for kmalloc. Custom caches can have other sizes.
 569 */
 570struct cache_sizes malloc_sizes[] = {
 571#define CACHE(x) { .cs_size = (x) },
 572#include <linux/kmalloc_sizes.h>
 573        CACHE(ULONG_MAX)
 574#undef CACHE
 575};
 576EXPORT_SYMBOL(malloc_sizes);
 577
 578/* Must match cache_sizes above. Out of line to keep cache footprint low. */
 579struct cache_names {
 580        char *name;
 581        char *name_dma;
 582};
 583
 584static struct cache_names __initdata cache_names[] = {
 585#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
 586#include <linux/kmalloc_sizes.h>
 587        {NULL,}
 588#undef CACHE
 589};
 590
 591static struct arraycache_init initarray_cache __initdata =
 592    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 593static struct arraycache_init initarray_generic =
 594    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 595
 596/* internal cache of cache description objs */
 597static struct kmem_cache cache_cache = {
 598        .batchcount = 1,
 599        .limit = BOOT_CPUCACHE_ENTRIES,
 600        .shared = 1,
 601        .buffer_size = sizeof(struct kmem_cache),
 602        .name = "kmem_cache",
 603};
 604
 605#define BAD_ALIEN_MAGIC 0x01020304ul
 606
 607/*
 608 * chicken and egg problem: delay the per-cpu array allocation
 609 * until the general caches are up.
 610 */
 611static enum {
 612        NONE,
 613        PARTIAL_AC,
 614        PARTIAL_L3,
 615        EARLY,
 616        FULL
 617} g_cpucache_up;
 618
 619/*
 620 * used by boot code to determine if it can use slab based allocator
 621 */
 622int slab_is_available(void)
 623{
 624        return g_cpucache_up >= EARLY;
 625}
 626
 627#ifdef CONFIG_LOCKDEP
 628
 629/*
 630 * Slab sometimes uses the kmalloc slabs to store the slab headers
 631 * for other slabs "off slab".
 632 * The locking for this is tricky in that it nests within the locks
 633 * of all other slabs in a few places; to deal with this special
 634 * locking we put on-slab caches into a separate lock-class.
 635 *
 636 * We set lock class for alien array caches which are up during init.
 637 * The lock annotation will be lost if all cpus of a node goes down and
 638 * then comes back up during hotplug
 639 */
 640static struct lock_class_key on_slab_l3_key;
 641static struct lock_class_key on_slab_alc_key;
 642
 643static void init_node_lock_keys(int q)
 644{
 645        struct cache_sizes *s = malloc_sizes;
 646
 647        if (g_cpucache_up != FULL)
 648                return;
 649
 650        for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
 651                struct array_cache **alc;
 652                struct kmem_list3 *l3;
 653                int r;
 654
 655                l3 = s->cs_cachep->nodelists[q];
 656                if (!l3 || OFF_SLAB(s->cs_cachep))
 657                        continue;
 658                lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
 659                alc = l3->alien;
 660                /*
 661                 * FIXME: This check for BAD_ALIEN_MAGIC
 662                 * should go away when common slab code is taught to
 663                 * work even without alien caches.
 664                 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
 665                 * for alloc_alien_cache,
 666                 */
 667                if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
 668                        continue;
 669                for_each_node(r) {
 670                        if (alc[r])
 671                                lockdep_set_class(&alc[r]->lock,
 672                                        &on_slab_alc_key);
 673                }
 674        }
 675}
 676
 677static inline void init_lock_keys(void)
 678{
 679        int node;
 680
 681        for_each_node(node)
 682                init_node_lock_keys(node);
 683}
 684#else
 685static void init_node_lock_keys(int q)
 686{
 687}
 688
 689static inline void init_lock_keys(void)
 690{
 691}
 692#endif
 693
 694/*
 695 * Guard access to the cache-chain.
 696 */
 697static DEFINE_MUTEX(cache_chain_mutex);
 698static struct list_head cache_chain;
 699
 700static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
 701
 702static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 703{
 704        return cachep->array[smp_processor_id()];
 705}
 706
 707static inline struct kmem_cache *__find_general_cachep(size_t size,
 708                                                        gfp_t gfpflags)
 709{
 710        struct cache_sizes *csizep = malloc_sizes;
 711
 712#if DEBUG
 713        /* This happens if someone tries to call
 714         * kmem_cache_create(), or __kmalloc(), before
 715         * the generic caches are initialized.
 716         */
 717        BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
 718#endif
 719        if (!size)
 720                return ZERO_SIZE_PTR;
 721
 722        while (size > csizep->cs_size)
 723                csizep++;
 724
 725        /*
 726         * Really subtle: The last entry with cs->cs_size==ULONG_MAX
 727         * has cs_{dma,}cachep==NULL. Thus no special case
 728         * for large kmalloc calls required.
 729         */
 730#ifdef CONFIG_ZONE_DMA
 731        if (unlikely(gfpflags & GFP_DMA))
 732                return csizep->cs_dmacachep;
 733#endif
 734        return csizep->cs_cachep;
 735}
 736
 737static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 738{
 739        return __find_general_cachep(size, gfpflags);
 740}
 741
 742static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 743{
 744        return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 745}
 746
 747/*
 748 * Calculate the number of objects and left-over bytes for a given buffer size.
 749 */
 750static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 751                           size_t align, int flags, size_t *left_over,
 752                           unsigned int *num)
 753{
 754        int nr_objs;
 755        size_t mgmt_size;
 756        size_t slab_size = PAGE_SIZE << gfporder;
 757
 758        /*
 759         * The slab management structure can be either off the slab or
 760         * on it. For the latter case, the memory allocated for a
 761         * slab is used for:
 762         *
 763         * - The struct slab
 764         * - One kmem_bufctl_t for each object
 765         * - Padding to respect alignment of @align
 766         * - @buffer_size bytes for each object
 767         *
 768         * If the slab management structure is off the slab, then the
 769         * alignment will already be calculated into the size. Because
 770         * the slabs are all pages aligned, the objects will be at the
 771         * correct alignment when allocated.
 772         */
 773        if (flags & CFLGS_OFF_SLAB) {
 774                mgmt_size = 0;
 775                nr_objs = slab_size / buffer_size;
 776
 777                if (nr_objs > SLAB_LIMIT)
 778                        nr_objs = SLAB_LIMIT;
 779        } else {
 780                /*
 781                 * Ignore padding for the initial guess. The padding
 782                 * is at most @align-1 bytes, and @buffer_size is at
 783                 * least @align. In the worst case, this result will
 784                 * be one greater than the number of objects that fit
 785                 * into the memory allocation when taking the padding
 786                 * into account.
 787                 */
 788                nr_objs = (slab_size - sizeof(struct slab)) /
 789                          (buffer_size + sizeof(kmem_bufctl_t));
 790
 791                /*
 792                 * This calculated number will be either the right
 793                 * amount, or one greater than what we want.
 794                 */
 795                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
 796                       > slab_size)
 797                        nr_objs--;
 798
 799                if (nr_objs > SLAB_LIMIT)
 800                        nr_objs = SLAB_LIMIT;
 801
 802                mgmt_size = slab_mgmt_size(nr_objs, align);
 803        }
 804        *num = nr_objs;
 805        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 806}
 807
 808#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
 809
 810static void __slab_error(const char *function, struct kmem_cache *cachep,
 811                        char *msg)
 812{
 813        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 814               function, cachep->name, msg);
 815        dump_stack();
 816}
 817
 818/*
 819 * By default on NUMA we use alien caches to stage the freeing of
 820 * objects allocated from other nodes. This causes massive memory
 821 * inefficiencies when using fake NUMA setup to split memory into a
 822 * large number of small nodes, so it can be disabled on the command
 823 * line
 824  */
 825
 826static int use_alien_caches __read_mostly = 1;
 827static int __init noaliencache_setup(char *s)
 828{
 829        use_alien_caches = 0;
 830        return 1;
 831}
 832__setup("noaliencache", noaliencache_setup);
 833
 834#ifdef CONFIG_NUMA
 835/*
 836 * Special reaping functions for NUMA systems called from cache_reap().
 837 * These take care of doing round robin flushing of alien caches (containing
 838 * objects freed on different nodes from which they were allocated) and the
 839 * flushing of remote pcps by calling drain_node_pages.
 840 */
 841static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 842
 843static void init_reap_node(int cpu)
 844{
 845        int node;
 846
 847        node = next_node(cpu_to_node(cpu), node_online_map);
 848        if (node == MAX_NUMNODES)
 849                node = first_node(node_online_map);
 850
 851        per_cpu(slab_reap_node, cpu) = node;
 852}
 853
 854static void next_reap_node(void)
 855{
 856        int node = __get_cpu_var(slab_reap_node);
 857
 858        node = next_node(node, node_online_map);
 859        if (unlikely(node >= MAX_NUMNODES))
 860                node = first_node(node_online_map);
 861        __get_cpu_var(slab_reap_node) = node;
 862}
 863
 864#else
 865#define init_reap_node(cpu) do { } while (0)
 866#define next_reap_node(void) do { } while (0)
 867#endif
 868
 869/*
 870 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 871 * via the workqueue/eventd.
 872 * Add the CPU number into the expiration time to minimize the possibility of
 873 * the CPUs getting into lockstep and contending for the global cache chain
 874 * lock.
 875 */
 876static void __cpuinit start_cpu_timer(int cpu)
 877{
 878        struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
 879
 880        /*
 881         * When this gets called from do_initcalls via cpucache_init(),
 882         * init_workqueues() has already run, so keventd will be setup
 883         * at that time.
 884         */
 885        if (keventd_up() && reap_work->work.func == NULL) {
 886                init_reap_node(cpu);
 887                INIT_DELAYED_WORK(reap_work, cache_reap);
 888                schedule_delayed_work_on(cpu, reap_work,
 889                                        __round_jiffies_relative(HZ, cpu));
 890        }
 891}
 892
 893static struct array_cache *alloc_arraycache(int node, int entries,
 894                                            int batchcount, gfp_t gfp)
 895{
 896        int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 897        struct array_cache *nc = NULL;
 898
 899        nc = kmalloc_node(memsize, gfp, node);
 900        /*
 901         * The array_cache structures contain pointers to free object.
 902         * However, when such objects are allocated or transfered to another
 903         * cache the pointers are not cleared and they could be counted as
 904         * valid references during a kmemleak scan. Therefore, kmemleak must
 905         * not scan such objects.
 906         */
 907        kmemleak_no_scan(nc);
 908        if (nc) {
 909                nc->avail = 0;
 910                nc->limit = entries;
 911                nc->batchcount = batchcount;
 912                nc->touched = 0;
 913                spin_lock_init(&nc->lock);
 914        }
 915        return nc;
 916}
 917
 918/*
 919 * Transfer objects in one arraycache to another.
 920 * Locking must be handled by the caller.
 921 *
 922 * Return the number of entries transferred.
 923 */
 924static int transfer_objects(struct array_cache *to,
 925                struct array_cache *from, unsigned int max)
 926{
 927        /* Figure out how many entries to transfer */
 928        int nr = min(min(from->avail, max), to->limit - to->avail);
 929
 930        if (!nr)
 931                return 0;
 932
 933        memcpy(to->entry + to->avail, from->entry + from->avail -nr,
 934                        sizeof(void *) *nr);
 935
 936        from->avail -= nr;
 937        to->avail += nr;
 938        to->touched = 1;
 939        return nr;
 940}
 941
 942#ifndef CONFIG_NUMA
 943
 944#define drain_alien_cache(cachep, alien) do { } while (0)
 945#define reap_alien(cachep, l3) do { } while (0)
 946
 947static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 948{
 949        return (struct array_cache **)BAD_ALIEN_MAGIC;
 950}
 951
 952static inline void free_alien_cache(struct array_cache **ac_ptr)
 953{
 954}
 955
 956static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 957{
 958        return 0;
 959}
 960
 961static inline void *alternate_node_alloc(struct kmem_cache *cachep,
 962                gfp_t flags)
 963{
 964        return NULL;
 965}
 966
 967static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 968                 gfp_t flags, int nodeid)
 969{
 970        return NULL;
 971}
 972
 973#else   /* CONFIG_NUMA */
 974
 975static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 976static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 977
 978static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 979{
 980        struct array_cache **ac_ptr;
 981        int memsize = sizeof(void *) * nr_node_ids;
 982        int i;
 983
 984        if (limit > 1)
 985                limit = 12;
 986        ac_ptr = kmalloc_node(memsize, gfp, node);
 987        if (ac_ptr) {
 988                for_each_node(i) {
 989                        if (i == node || !node_online(i)) {
 990                                ac_ptr[i] = NULL;
 991                                continue;
 992                        }
 993                        ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 994                        if (!ac_ptr[i]) {
 995                                for (i--; i >= 0; i--)
 996                                        kfree(ac_ptr[i]);
 997                                kfree(ac_ptr);
 998                                return NULL;
 999                        }
1000                }
1001        }
1002        return ac_ptr;
1003}
1004
1005static void free_alien_cache(struct array_cache **ac_ptr)
1006{
1007        int i;
1008
1009        if (!ac_ptr)
1010                return;
1011        for_each_node(i)
1012            kfree(ac_ptr[i]);
1013        kfree(ac_ptr);
1014}
1015
1016static void __drain_alien_cache(struct kmem_cache *cachep,
1017                                struct array_cache *ac, int node)
1018{
1019        struct kmem_list3 *rl3 = cachep->nodelists[node];
1020
1021        if (ac->avail) {
1022                spin_lock(&rl3->list_lock);
1023                /*
1024                 * Stuff objects into the remote nodes shared array first.
1025                 * That way we could avoid the overhead of putting the objects
1026                 * into the free lists and getting them back later.
1027                 */
1028                if (rl3->shared)
1029                        transfer_objects(rl3->shared, ac, ac->limit);
1030
1031                free_block(cachep, ac->entry, ac->avail, node);
1032                ac->avail = 0;
1033                spin_unlock(&rl3->list_lock);
1034        }
1035}
1036
1037/*
1038 * Called from cache_reap() to regularly drain alien caches round robin.
1039 */
1040static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1041{
1042        int node = __get_cpu_var(slab_reap_node);
1043
1044        if (l3->alien) {
1045                struct array_cache *ac = l3->alien[node];
1046
1047                if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1048                        __drain_alien_cache(cachep, ac, node);
1049                        spin_unlock_irq(&ac->lock);
1050                }
1051        }
1052}
1053
1054static void drain_alien_cache(struct kmem_cache *cachep,
1055                                struct array_cache **alien)
1056{
1057        int i = 0;
1058        struct array_cache *ac;
1059        unsigned long flags;
1060
1061        for_each_online_node(i) {
1062                ac = alien[i];
1063                if (ac) {
1064                        spin_lock_irqsave(&ac->lock, flags);
1065                        __drain_alien_cache(cachep, ac, i);
1066                        spin_unlock_irqrestore(&ac->lock, flags);
1067                }
1068        }
1069}
1070
1071static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1072{
1073        struct slab *slabp = virt_to_slab(objp);
1074        int nodeid = slabp->nodeid;
1075        struct kmem_list3 *l3;
1076        struct array_cache *alien = NULL;
1077        int node;
1078
1079        node = numa_node_id();
1080
1081        /*
1082         * Make sure we are not freeing a object from another node to the array
1083         * cache on this cpu.
1084         */
1085        if (likely(slabp->nodeid == node))
1086                return 0;
1087
1088        l3 = cachep->nodelists[node];
1089        STATS_INC_NODEFREES(cachep);
1090        if (l3->alien && l3->alien[nodeid]) {
1091                alien = l3->alien[nodeid];
1092                spin_lock(&alien->lock);
1093                if (unlikely(alien->avail == alien->limit)) {
1094                        STATS_INC_ACOVERFLOW(cachep);
1095                        __drain_alien_cache(cachep, alien, nodeid);
1096                }
1097                alien->entry[alien->avail++] = objp;
1098                spin_unlock(&alien->lock);
1099        } else {
1100                spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1101                free_block(cachep, &objp, 1, nodeid);
1102                spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1103        }
1104        return 1;
1105}
1106#endif
1107
1108static void __cpuinit cpuup_canceled(long cpu)
1109{
1110        struct kmem_cache *cachep;
1111        struct kmem_list3 *l3 = NULL;
1112        int node = cpu_to_node(cpu);
1113        const struct cpumask *mask = cpumask_of_node(node);
1114
1115        list_for_each_entry(cachep, &cache_chain, next) {
1116                struct array_cache *nc;
1117                struct array_cache *shared;
1118                struct array_cache **alien;
1119
1120                /* cpu is dead; no one can alloc from it. */
1121                nc = cachep->array[cpu];
1122                cachep->array[cpu] = NULL;
1123                l3 = cachep->nodelists[node];
1124
1125                if (!l3)
1126                        goto free_array_cache;
1127
1128                spin_lock_irq(&l3->list_lock);
1129
1130                /* Free limit for this kmem_list3 */
1131                l3->free_limit -= cachep->batchcount;
1132                if (nc)
1133                        free_block(cachep, nc->entry, nc->avail, node);
1134
1135                if (!cpumask_empty(mask)) {
1136                        spin_unlock_irq(&l3->list_lock);
1137                        goto free_array_cache;
1138                }
1139
1140                shared = l3->shared;
1141                if (shared) {
1142                        free_block(cachep, shared->entry,
1143                                   shared->avail, node);
1144                        l3->shared = NULL;
1145                }
1146
1147                alien = l3->alien;
1148                l3->alien = NULL;
1149
1150                spin_unlock_irq(&l3->list_lock);
1151
1152                kfree(shared);
1153                if (alien) {
1154                        drain_alien_cache(cachep, alien);
1155                        free_alien_cache(alien);
1156                }
1157free_array_cache:
1158                kfree(nc);
1159        }
1160        /*
1161         * In the previous loop, all the objects were freed to
1162         * the respective cache's slabs,  now we can go ahead and
1163         * shrink each nodelist to its limit.
1164         */
1165        list_for_each_entry(cachep, &cache_chain, next) {
1166                l3 = cachep->nodelists[node];
1167                if (!l3)
1168                        continue;
1169                drain_freelist(cachep, l3, l3->free_objects);
1170        }
1171}
1172
1173static int __cpuinit cpuup_prepare(long cpu)
1174{
1175        struct kmem_cache *cachep;
1176        struct kmem_list3 *l3 = NULL;
1177        int node = cpu_to_node(cpu);
1178        const int memsize = sizeof(struct kmem_list3);
1179
1180        /*
1181         * We need to do this right in the beginning since
1182         * alloc_arraycache's are going to use this list.
1183         * kmalloc_node allows us to add the slab to the right
1184         * kmem_list3 and not this cpu's kmem_list3
1185         */
1186
1187        list_for_each_entry(cachep, &cache_chain, next) {
1188                /*
1189                 * Set up the size64 kmemlist for cpu before we can
1190                 * begin anything. Make sure some other cpu on this
1191                 * node has not already allocated this
1192                 */
1193                if (!cachep->nodelists[node]) {
1194                        l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1195                        if (!l3)
1196                                goto bad;
1197                        kmem_list3_init(l3);
1198                        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1199                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1200
1201                        /*
1202                         * The l3s don't come and go as CPUs come and
1203                         * go.  cache_chain_mutex is sufficient
1204                         * protection here.
1205                         */
1206                        cachep->nodelists[node] = l3;
1207                }
1208
1209                spin_lock_irq(&cachep->nodelists[node]->list_lock);
1210                cachep->nodelists[node]->free_limit =
1211                        (1 + nr_cpus_node(node)) *
1212                        cachep->batchcount + cachep->num;
1213                spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1214        }
1215
1216        /*
1217         * Now we can go ahead with allocating the shared arrays and
1218         * array caches
1219         */
1220        list_for_each_entry(cachep, &cache_chain, next) {
1221                struct array_cache *nc;
1222                struct array_cache *shared = NULL;
1223                struct array_cache **alien = NULL;
1224
1225                nc = alloc_arraycache(node, cachep->limit,
1226                                        cachep->batchcount, GFP_KERNEL);
1227                if (!nc)
1228                        goto bad;
1229                if (cachep->shared) {
1230                        shared = alloc_arraycache(node,
1231                                cachep->shared * cachep->batchcount,
1232                                0xbaadf00d, GFP_KERNEL);
1233                        if (!shared) {
1234                                kfree(nc);
1235                                goto bad;
1236                        }
1237                }
1238                if (use_alien_caches) {
1239                        alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1240                        if (!alien) {
1241                                kfree(shared);
1242                                kfree(nc);
1243                                goto bad;
1244                        }
1245                }
1246                cachep->array[cpu] = nc;
1247                l3 = cachep->nodelists[node];
1248                BUG_ON(!l3);
1249
1250                spin_lock_irq(&l3->list_lock);
1251                if (!l3->shared) {
1252                        /*
1253                         * We are serialised from CPU_DEAD or
1254                         * CPU_UP_CANCELLED by the cpucontrol lock
1255                         */
1256                        l3->shared = shared;
1257                        shared = NULL;
1258                }
1259#ifdef CONFIG_NUMA
1260                if (!l3->alien) {
1261                        l3->alien = alien;
1262                        alien = NULL;
1263                }
1264#endif
1265                spin_unlock_irq(&l3->list_lock);
1266                kfree(shared);
1267                free_alien_cache(alien);
1268        }
1269        init_node_lock_keys(node);
1270
1271        return 0;
1272bad:
1273        cpuup_canceled(cpu);
1274        return -ENOMEM;
1275}
1276
1277static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1278                                    unsigned long action, void *hcpu)
1279{
1280        long cpu = (long)hcpu;
1281        int err = 0;
1282
1283        switch (action) {
1284        case CPU_UP_PREPARE:
1285        case CPU_UP_PREPARE_FROZEN:
1286                mutex_lock(&cache_chain_mutex);
1287                err = cpuup_prepare(cpu);
1288                mutex_unlock(&cache_chain_mutex);
1289                break;
1290        case CPU_ONLINE:
1291        case CPU_ONLINE_FROZEN:
1292                start_cpu_timer(cpu);
1293                break;
1294#ifdef CONFIG_HOTPLUG_CPU
1295        case CPU_DOWN_PREPARE:
1296        case CPU_DOWN_PREPARE_FROZEN:
1297                /*
1298                 * Shutdown cache reaper. Note that the cache_chain_mutex is
1299                 * held so that if cache_reap() is invoked it cannot do
1300                 * anything expensive but will only modify reap_work
1301                 * and reschedule the timer.
1302                */
1303                cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
1304                /* Now the cache_reaper is guaranteed to be not running. */
1305                per_cpu(slab_reap_work, cpu).work.func = NULL;
1306                break;
1307        case CPU_DOWN_FAILED:
1308        case CPU_DOWN_FAILED_FROZEN:
1309                start_cpu_timer(cpu);
1310                break;
1311        case CPU_DEAD:
1312        case CPU_DEAD_FROZEN:
1313                /*
1314                 * Even if all the cpus of a node are down, we don't free the
1315                 * kmem_list3 of any cache. This to avoid a race between
1316                 * cpu_down, and a kmalloc allocation from another cpu for
1317                 * memory from the node of the cpu going down.  The list3
1318                 * structure is usually allocated from kmem_cache_create() and
1319                 * gets destroyed at kmem_cache_destroy().
1320                 */
1321                /* fall through */
1322#endif
1323        case CPU_UP_CANCELED:
1324        case CPU_UP_CANCELED_FROZEN:
1325                mutex_lock(&cache_chain_mutex);
1326                cpuup_canceled(cpu);
1327                mutex_unlock(&cache_chain_mutex);
1328                break;
1329        }
1330        return err ? NOTIFY_BAD : NOTIFY_OK;
1331}
1332
1333static struct notifier_block __cpuinitdata cpucache_notifier = {
1334        &cpuup_callback, NULL, 0
1335};
1336
1337/*
1338 * swap the static kmem_list3 with kmalloced memory
1339 */
1340static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1341                        int nodeid)
1342{
1343        struct kmem_list3 *ptr;
1344
1345        ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1346        BUG_ON(!ptr);
1347
1348        memcpy(ptr, list, sizeof(struct kmem_list3));
1349        /*
1350         * Do not assume that spinlocks can be initialized via memcpy:
1351         */
1352        spin_lock_init(&ptr->list_lock);
1353
1354        MAKE_ALL_LISTS(cachep, ptr, nodeid);
1355        cachep->nodelists[nodeid] = ptr;
1356}
1357
1358/*
1359 * For setting up all the kmem_list3s for cache whose buffer_size is same as
1360 * size of kmem_list3.
1361 */
1362static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1363{
1364        int node;
1365
1366        for_each_online_node(node) {
1367                cachep->nodelists[node] = &initkmem_list3[index + node];
1368                cachep->nodelists[node]->next_reap = jiffies +
1369                    REAPTIMEOUT_LIST3 +
1370                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1371        }
1372}
1373
1374/*
1375 * Initialisation.  Called after the page allocator have been initialised and
1376 * before smp_init().
1377 */
1378void __init kmem_cache_init(void)
1379{
1380        size_t left_over;
1381        struct cache_sizes *sizes;
1382        struct cache_names *names;
1383        int i;
1384        int order;
1385        int node;
1386
1387        if (num_possible_nodes() == 1)
1388                use_alien_caches = 0;
1389
1390        for (i = 0; i < NUM_INIT_LISTS; i++) {
1391                kmem_list3_init(&initkmem_list3[i]);
1392                if (i < MAX_NUMNODES)
1393                        cache_cache.nodelists[i] = NULL;
1394        }
1395        set_up_list3s(&cache_cache, CACHE_CACHE);
1396
1397        /*
1398         * Fragmentation resistance on low memory - only use bigger
1399         * page orders on machines with more than 32MB of memory.
1400         */
1401        if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1402                slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1403
1404        /* Bootstrap is tricky, because several objects are allocated
1405         * from caches that do not exist yet:
1406         * 1) initialize the cache_cache cache: it contains the struct
1407         *    kmem_cache structures of all caches, except cache_cache itself:
1408         *    cache_cache is statically allocated.
1409         *    Initially an __init data area is used for the head array and the
1410         *    kmem_list3 structures, it's replaced with a kmalloc allocated
1411         *    array at the end of the bootstrap.
1412         * 2) Create the first kmalloc cache.
1413         *    The struct kmem_cache for the new cache is allocated normally.
1414         *    An __init data area is used for the head array.
1415         * 3) Create the remaining kmalloc caches, with minimally sized
1416         *    head arrays.
1417         * 4) Replace the __init data head arrays for cache_cache and the first
1418         *    kmalloc cache with kmalloc allocated arrays.
1419         * 5) Replace the __init data for kmem_list3 for cache_cache and
1420         *    the other cache's with kmalloc allocated memory.
1421         * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1422         */
1423
1424        node = numa_node_id();
1425
1426        /* 1) create the cache_cache */
1427        INIT_LIST_HEAD(&cache_chain);
1428        list_add(&cache_cache.next, &cache_chain);
1429        cache_cache.colour_off = cache_line_size();
1430        cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1431        cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1432
1433        /*
1434         * struct kmem_cache size depends on nr_node_ids, which
1435         * can be less than MAX_NUMNODES.
1436         */
1437        cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
1438                                 nr_node_ids * sizeof(struct kmem_list3 *);
1439#if DEBUG
1440        cache_cache.obj_size = cache_cache.buffer_size;
1441#endif
1442        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1443                                        cache_line_size());
1444        cache_cache.reciprocal_buffer_size =
1445                reciprocal_value(cache_cache.buffer_size);
1446
1447        for (order = 0; order < MAX_ORDER; order++) {
1448                cache_estimate(order, cache_cache.buffer_size,
1449                        cache_line_size(), 0, &left_over, &cache_cache.num);
1450                if (cache_cache.num)
1451                        break;
1452        }
1453        BUG_ON(!cache_cache.num);
1454        cache_cache.gfporder = order;
1455        cache_cache.colour = left_over / cache_cache.colour_off;
1456        cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1457                                      sizeof(struct slab), cache_line_size());
1458
1459        /* 2+3) create the kmalloc caches */
1460        sizes = malloc_sizes;
1461        names = cache_names;
1462
1463        /*
1464         * Initialize the caches that provide memory for the array cache and the
1465         * kmem_list3 structures first.  Without this, further allocations will
1466         * bug.
1467         */
1468
1469        sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1470                                        sizes[INDEX_AC].cs_size,
1471                                        ARCH_KMALLOC_MINALIGN,
1472                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1473                                        NULL);
1474
1475        if (INDEX_AC != INDEX_L3) {
1476                sizes[INDEX_L3].cs_cachep =
1477                        kmem_cache_create(names[INDEX_L3].name,
1478                                sizes[INDEX_L3].cs_size,
1479                                ARCH_KMALLOC_MINALIGN,
1480                                ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1481                                NULL);
1482        }
1483
1484        slab_early_init = 0;
1485
1486        while (sizes->cs_size != ULONG_MAX) {
1487                /*
1488                 * For performance, all the general caches are L1 aligned.
1489                 * This should be particularly beneficial on SMP boxes, as it
1490                 * eliminates "false sharing".
1491                 * Note for systems short on memory removing the alignment will
1492                 * allow tighter packing of the smaller caches.
1493                 */
1494                if (!sizes->cs_cachep) {
1495                        sizes->cs_cachep = kmem_cache_create(names->name,
1496                                        sizes->cs_size,
1497                                        ARCH_KMALLOC_MINALIGN,
1498                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1499                                        NULL);
1500                }
1501#ifdef CONFIG_ZONE_DMA
1502                sizes->cs_dmacachep = kmem_cache_create(
1503                                        names->name_dma,
1504                                        sizes->cs_size,
1505                                        ARCH_KMALLOC_MINALIGN,
1506                                        ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1507                                                SLAB_PANIC,
1508                                        NULL);
1509#endif
1510                sizes++;
1511                names++;
1512        }
1513        /* 4) Replace the bootstrap head arrays */
1514        {
1515                struct array_cache *ptr;
1516
1517                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1518
1519                BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1520                memcpy(ptr, cpu_cache_get(&cache_cache),
1521                       sizeof(struct arraycache_init));
1522                /*
1523                 * Do not assume that spinlocks can be initialized via memcpy:
1524                 */
1525                spin_lock_init(&ptr->lock);
1526
1527                cache_cache.array[smp_processor_id()] = ptr;
1528
1529                ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1530
1531                BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1532                       != &initarray_generic.cache);
1533                memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1534                       sizeof(struct arraycache_init));
1535                /*
1536                 * Do not assume that spinlocks can be initialized via memcpy:
1537                 */
1538                spin_lock_init(&ptr->lock);
1539
1540                malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1541                    ptr;
1542        }
1543        /* 5) Replace the bootstrap kmem_list3's */
1544        {
1545                int nid;
1546
1547                for_each_online_node(nid) {
1548                        init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1549
1550                        init_list(malloc_sizes[INDEX_AC].cs_cachep,
1551                                  &initkmem_list3[SIZE_AC + nid], nid);
1552
1553                        if (INDEX_AC != INDEX_L3) {
1554                                init_list(malloc_sizes[INDEX_L3].cs_cachep,
1555                                          &initkmem_list3[SIZE_L3 + nid], nid);
1556                        }
1557                }
1558        }
1559
1560        g_cpucache_up = EARLY;
1561}
1562
1563void __init kmem_cache_init_late(void)
1564{
1565        struct kmem_cache *cachep;
1566
1567        /* 6) resize the head arrays to their final sizes */
1568        mutex_lock(&cache_chain_mutex);
1569        list_for_each_entry(cachep, &cache_chain, next)
1570                if (enable_cpucache(cachep, GFP_NOWAIT))
1571                        BUG();
1572        mutex_unlock(&cache_chain_mutex);
1573
1574        /* Done! */
1575        g_cpucache_up = FULL;
1576
1577        /* Annotate slab for lockdep -- annotate the malloc caches */
1578        init_lock_keys();
1579
1580        /*
1581         * Register a cpu startup notifier callback that initializes
1582         * cpu_cache_get for all new cpus
1583         */
1584        register_cpu_notifier(&cpucache_notifier);
1585
1586        /*
1587         * The reap timers are started later, with a module init call: That part
1588         * of the kernel is not yet operational.
1589         */
1590}
1591
1592static int __init cpucache_init(void)
1593{
1594        int cpu;
1595
1596        /*
1597         * Register the timers that return unneeded pages to the page allocator
1598         */
1599        for_each_online_cpu(cpu)
1600                start_cpu_timer(cpu);
1601        return 0;
1602}
1603__initcall(cpucache_init);
1604
1605/*
1606 * Interface to system's page allocator. No need to hold the cache-lock.
1607 *
1608 * If we requested dmaable memory, we will get it. Even if we
1609 * did not request dmaable memory, we might get it, but that
1610 * would be relatively rare and ignorable.
1611 */
1612static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1613{
1614        struct page *page;
1615        int nr_pages;
1616        int i;
1617
1618#ifndef CONFIG_MMU
1619        /*
1620         * Nommu uses slab's for process anonymous memory allocations, and thus
1621         * requires __GFP_COMP to properly refcount higher order allocations
1622         */
1623        flags |= __GFP_COMP;
1624#endif
1625
1626        flags |= cachep->gfpflags;
1627        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1628                flags |= __GFP_RECLAIMABLE;
1629
1630        page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1631        if (!page)
1632                return NULL;
1633
1634        nr_pages = (1 << cachep->gfporder);
1635        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1636                add_zone_page_state(page_zone(page),
1637                        NR_SLAB_RECLAIMABLE, nr_pages);
1638        else
1639                add_zone_page_state(page_zone(page),
1640                        NR_SLAB_UNRECLAIMABLE, nr_pages);
1641        for (i = 0; i < nr_pages; i++)
1642                __SetPageSlab(page + i);
1643
1644        if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1645                kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1646
1647                if (cachep->ctor)
1648                        kmemcheck_mark_uninitialized_pages(page, nr_pages);
1649                else
1650                        kmemcheck_mark_unallocated_pages(page, nr_pages);
1651        }
1652
1653        return page_address(page);
1654}
1655
1656/*
1657 * Interface to system's page release.
1658 */
1659static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1660{
1661        unsigned long i = (1 << cachep->gfporder);
1662        struct page *page = virt_to_page(addr);
1663        const unsigned long nr_freed = i;
1664
1665        kmemcheck_free_shadow(page, cachep->gfporder);
1666
1667        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1668                sub_zone_page_state(page_zone(page),
1669                                NR_SLAB_RECLAIMABLE, nr_freed);
1670        else
1671                sub_zone_page_state(page_zone(page),
1672                                NR_SLAB_UNRECLAIMABLE, nr_freed);
1673        while (i--) {
1674                BUG_ON(!PageSlab(page));
1675                __ClearPageSlab(page);
1676                page++;
1677        }
1678        if (current->reclaim_state)
1679                current->reclaim_state->reclaimed_slab += nr_freed;
1680        free_pages((unsigned long)addr, cachep->gfporder);
1681}
1682
1683static void kmem_rcu_free(struct rcu_head *head)
1684{
1685        struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1686        struct kmem_cache *cachep = slab_rcu->cachep;
1687
1688        kmem_freepages(cachep, slab_rcu->addr);
1689        if (OFF_SLAB(cachep))
1690                kmem_cache_free(cachep->slabp_cache, slab_rcu);
1691}
1692
1693#if DEBUG
1694
1695#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Record a small trace inside an object's payload area.
 *
 * Layout written at the object's payload (offset obj_offset(cachep)):
 *   0x12345678, @caller, current CPU id, then every word found on the
 *   stack (walking up from the location of @caller) that
 *   kernel_text_address() accepts, and finally 0x87654321.
 *
 * Objects smaller than five longs are left untouched.
 */
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
                            unsigned long caller)
{
        unsigned long *out;
        unsigned long *sp;
        int room = obj_size(cachep);

        out = (unsigned long *)((char *)addr + obj_offset(cachep));

        if (room < 5 * sizeof(unsigned long))
                return;

        out[0] = 0x12345678;
        out[1] = caller;
        out[2] = smp_processor_id();
        out += 3;
        room -= 3 * sizeof(unsigned long);

        /* Scan the stack upwards, starting at our own argument slot. */
        for (sp = &caller; !kstack_end(sp); ) {
                unsigned long word = *sp++;

                if (!kernel_text_address(word))
                        continue;

                *out++ = word;
                room -= sizeof(unsigned long);
                /* Keep one slot in reserve for the end marker. */
                if (room <= sizeof(unsigned long))
                        break;
        }

        *out = 0x87654321;
}
1727#endif
1728
1729static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1730{
1731        int size = obj_size(cachep);
1732        addr = &((char *)addr)[obj_offset(cachep)];
1733
1734        memset(addr, val, size);
1735        *(unsigned char *)(addr + size - 1) = POISON_END;
1736}
1737
1738static void dump_line(char *data, int offset, int limit)
1739{
1740        int i;
1741        unsigned char error = 0;
1742        int bad_count = 0;
1743
1744        printk(KERN_ERR "%03x:", offset);
1745        for (i = 0; i < limit; i++) {
1746                if (data[offset + i] != POISON_FREE) {
1747                        error = data[offset + i];
1748                        bad_count++;
1749                }
1750                printk(" %02x", (unsigned char)data[offset + i]);
1751        }
1752        printk("\n");
1753
1754        if (bad_count == 1) {
1755                error ^= POISON_FREE;
1756                if (!(error & (error - 1))) {
1757                        printk(KERN_ERR "Single bit error detected. Probably "
1758                                        "bad RAM.\n");
1759#ifdef CONFIG_X86
1760                        printk(KERN_ERR "Run memtest86+ or a similar memory "
1761                                        "test tool.\n");
1762#else
1763                        printk(KERN_ERR "Run a memory test tool.\n");
1764#endif
1765                }
1766        }
1767}
1768#endif
1769
1770#if DEBUG
1771
1772static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1773{
1774        int i, size;
1775        char *realobj;
1776
1777        if (cachep->flags & SLAB_RED_ZONE) {
1778                printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
1779                        *dbg_redzone1(cachep, objp),
1780                        *dbg_redzone2(cachep, objp));
1781        }
1782
1783        if (cachep->flags & SLAB_STORE_USER) {
1784                printk(KERN_ERR "Last user: [<%p>]",
1785                        *dbg_userword(cachep, objp));
1786                print_symbol("(%s)",
1787                                (unsigned long)*dbg_userword(cachep, objp));
1788                printk("\n");
1789        }
1790        realobj = (char *)objp + obj_offset(cachep);
1791        size = obj_size(cachep);
1792        for (i = 0; i < size && lines; i += 16, lines--) {
1793                int limit;
1794                limit = 16;
1795                if (i + limit > size)
1796                        limit = size - i;
1797                dump_line(realobj, i, limit);
1798        }
1799}
1800
1801static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1802{
1803        char *realobj;
1804        int size, i;
1805        int lines = 0;
1806
1807        realobj = (char *)objp + obj_offset(cachep);
1808        size = obj_size(cachep);
1809
1810        for (i = 0; i < size; i++) {
1811                char exp = POISON_FREE;
1812                if (i == size - 1)
1813                        exp = POISON_END;
1814                if (realobj[i] != exp) {
1815                        int limit;
1816                        /* Mismatch ! */
1817                        /* Print header */
1818                        if (lines == 0) {
1819                                printk(KERN_ERR
1820                                        "Slab corruption: %s start=%p, len=%d\n",
1821                                        cachep->name, realobj, size);
1822                                print_objinfo(cachep, objp, 0);
1823                        }
1824                        /* Hexdump the affected line */
1825                        i = (i / 16) * 16;
1826                        limit = 16;
1827                        if (i + limit > size)
1828                                limit = size - i;
1829                        dump_line(realobj, i, limit);
1830                        i += 16;
1831                        lines++;
1832                        /* Limit to 5 lines */
1833                        if (lines > 5)
1834                                break;
1835                }
1836        }
1837        if (lines != 0) {
1838                /* Print some data about the neighboring objects, if they
1839                 * exist:
1840                 */
1841                struct slab *slabp = virt_to_slab(objp);
1842                unsigned int objnr;
1843
1844                objnr = obj_to_index(cachep, slabp, objp);
1845                if (objnr) {
1846                        objp = index_to_obj(cachep, slabp, objnr - 1);
1847                        realobj = (char *)objp + obj_offset(cachep);
1848                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1849                               realobj, size);
1850                        print_objinfo(cachep, objp, 2);
1851                }
1852                if (objnr + 1 < cachep->num) {
1853                        objp = index_to_obj(cachep, slabp, objnr + 1);
1854                        realobj = (char *)objp + obj_offset(cachep);
1855                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1856                               realobj, size);
1857                        print_objinfo(cachep, objp, 2);
1858                }
1859        }
1860}
1861#endif
1862
#if DEBUG
/*
 * Debug-only consistency pass run over every object of a slab that is
 * about to be destroyed: poisoned caches get their poison pattern checked
 * and red-zoned caches must still have both red zones set to RED_INACTIVE.
 */
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
        unsigned int idx;

        for (idx = 0; idx < cachep->num; idx++) {
                void *objp = index_to_obj(cachep, slabp, idx);

                if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
                        /*
                         * Off-slab caches whose objects are whole pages get
                         * kernel_map_pages(..., 1) instead of a poison
                         * check.
                         */
                        if (OFF_SLAB(cachep) &&
                            (cachep->buffer_size % PAGE_SIZE) == 0)
                                kernel_map_pages(virt_to_page(objp),
                                        cachep->buffer_size / PAGE_SIZE, 1);
                        else
#endif
                                check_poison_obj(cachep, objp);
                }

                if (!(cachep->flags & SLAB_RED_ZONE))
                        continue;

                if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
                        slab_error(cachep, "start of a freed object "
                                   "was overwritten");
                if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
                        slab_error(cachep, "end of a freed object "
                                   "was overwritten");
        }
}
#else
/* Without DEBUG there is nothing to verify. */
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif
1897
1898/**
1899 * slab_destroy - destroy and release all objects in a slab
1900 * @cachep: cache pointer being destroyed
1901 * @slabp: slab pointer being destroyed
1902 *
1903 * Destroy all the objs in a slab, and release the mem back to the system.
1904 * Before calling the slab must have been unlinked from the cache.  The
1905 * cache-lock is not held/needed.
1906 */
1907static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1908{
1909        void *addr = slabp->s_mem - slabp->colouroff;
1910
1911        slab_destroy_debugcheck(cachep, slabp);
1912        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1913                struct slab_rcu *slab_rcu;
1914
1915                slab_rcu = (struct slab_rcu *)slabp;
1916                slab_rcu->cachep = cachep;
1917                slab_rcu->addr = addr;
1918                call_rcu(&slab_rcu->head, kmem_rcu_free);
1919        } else {
1920                kmem_freepages(cachep, addr);
1921                if (OFF_SLAB(cachep))
1922                        kmem_cache_free(cachep->slabp_cache, slabp);
1923        }
1924}
1925
1926static void __kmem_cache_destroy(struct kmem_cache *cachep)
1927{
1928        int i;
1929        struct kmem_list3 *l3;
1930
1931        for_each_online_cpu(i)
1932            kfree(cachep->array[i]);
1933
1934        /* NUMA: free the list3 structures */
1935        for_each_online_node(i) {
1936                l3 = cachep->nodelists[i];
1937                if (l3) {
1938                        kfree(l3->shared);
1939                        free_alien_cache(l3->alien);
1940                        kfree(l3);
1941                }
1942        }
1943        kmem_cache_free(&cache_cache, cachep);
1944}
1945
1946
1947/**
1948 * calculate_slab_order - calculate size (page order) of slabs
1949 * @cachep: pointer to the cache that is being created
1950 * @size: size of objects to be created in this cache.
1951 * @align: required alignment for the objects.
1952 * @flags: slab allocation flags
1953 *
1954 * Also calculates the number of objects per slab.
1955 *
1956 * This could be made much more intelligent.  For now, try to avoid using
1957 * high order pages for slabs.  When the gfp() functions are more friendly
1958 * towards high-order requests, this should be changed.
1959 */
1960static size_t calculate_slab_order(struct kmem_cache *cachep,
1961                        size_t size, size_t align, unsigned long flags)
1962{
1963        unsigned long offslab_limit;
1964        size_t left_over = 0;
1965        int gfporder;
1966
1967        for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1968                unsigned int num;
1969                size_t remainder;
1970
1971                cache_estimate(gfporder, size, align, flags, &remainder, &num);
1972                if (!num)
1973                        continue;
1974
1975                if (flags & CFLGS_OFF_SLAB) {
1976                        /*
1977                         * Max number of objs-per-slab for caches which
1978                         * use off-slab slabs. Needed to avoid a possible
1979                         * looping condition in cache_grow().
1980                         */
1981                        offslab_limit = size - sizeof(struct slab);
1982                        offslab_limit /= sizeof(kmem_bufctl_t);
1983
1984                        if (num > offslab_limit)
1985                                break;
1986                }
1987
1988                /* Found something acceptable - save it away */
1989                cachep->num = num;
1990                cachep->gfporder = gfporder;
1991                left_over = remainder;
1992
1993                /*
1994                 * A VFS-reclaimable slab tends to have most allocations
1995                 * as GFP_NOFS and we really don't want to have to be allocating
1996                 * higher-order pages when we are unable to shrink dcache.
1997                 */
1998                if (flags & SLAB_RECLAIM_ACCOUNT)
1999                        break;
2000
2001                /*
2002                 * Large number of objects is good, but very large slabs are
2003                 * currently bad for the gfp()s.
2004                 */
2005                if (gfporder >= slab_break_gfp_order)
2006                        break;
2007
2008                /*
2009                 * Acceptable internal fragmentation?
2010                 */
2011                if (left_over * 8 <= (PAGE_SIZE << gfporder))
2012                        break;
2013        }
2014        return left_over;
2015}
2016
2017static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2018{
2019        if (g_cpucache_up == FULL)
2020                return enable_cpucache(cachep, gfp);
2021
2022        if (g_cpucache_up == NONE) {
2023                /*
2024                 * Note: the first kmem_cache_create must create the cache
2025                 * that's used by kmalloc(24), otherwise the creation of
2026                 * further caches will BUG().
2027                 */
2028                cachep->array[smp_processor_id()] = &initarray_generic.cache;
2029
2030                /*
2031                 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2032                 * the first cache, then we need to set up all its list3s,
2033                 * otherwise the creation of further caches will BUG().
2034                 */
2035                set_up_list3s(cachep, SIZE_AC);
2036                if (INDEX_AC == INDEX_L3)
2037                        g_cpucache_up = PARTIAL_L3;
2038                else
2039                        g_cpucache_up = PARTIAL_AC;
2040        } else {
2041                cachep->array[smp_processor_id()] =
2042                        kmalloc(sizeof(struct arraycache_init), gfp);
2043
2044                if (g_cpucache_up == PARTIAL_AC) {
2045                        set_up_list3s(cachep, SIZE_L3);
2046                        g_cpucache_up = PARTIAL_L3;
2047                } else {
2048                        int node;
2049                        for_each_online_node(node) {
2050                                cachep->nodelists[node] =
2051                                    kmalloc_node(sizeof(struct kmem_list3),
2052                                                gfp, node);
2053                                BUG_ON(!cachep->nodelists[node]);
2054                                kmem_list3_init(cachep->nodelists[node]);
2055                        }
2056                }
2057        }
2058        cachep->nodelists[numa_node_id()]->next_reap =
2059                        jiffies + REAPTIMEOUT_LIST3 +
2060                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2061
2062        cpu_cache_get(cachep)->avail = 0;
2063        cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2064        cpu_cache_get(cachep)->batchcount = 1;
2065        cpu_cache_get(cachep)->touched = 0;
2066        cachep->batchcount = 1;
2067        cachep->limit = BOOT_CPUCACHE_ENTRIES;
2068        return 0;
2069}
2070
/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.  If @flags
 * contains %SLAB_PANIC, a failure panics instead of returning NULL.
 * Cannot be called from interrupt context, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * A NULL @name, a call from interrupt context, or a @size below
 * BYTES_PER_WORD or above KMALLOC_MAX_SIZE is treated as a usage bug and
 * triggers BUG().  Asking for a @name that is already in use by another
 * cache is reported and fails the call.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
 * therefore applications must manage it themselves.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
        unsigned long flags, void (*ctor)(void *))
{
        size_t left_over, slab_size, ralign;
        /* cachep stays NULL on every failure path that reaches "oops". */
        struct kmem_cache *cachep = NULL, *pc;
        gfp_t gfp;

        /*
         * Sanity checks... these are all serious usage bugs.
         */
        if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
            size > KMALLOC_MAX_SIZE) {
                printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
                                name);
                BUG();
        }

        /*
         * We use cache_chain_mutex to ensure a consistent view of
         * cpu_online_mask as well.  Please see cpuup_callback
         *
         * The lock is only taken once slab_is_available(); the matching
         * unlock at "oops" is guarded by the same test.
         */
        if (slab_is_available()) {
                get_online_cpus();
                mutex_lock(&cache_chain_mutex);
        }

        /* Refuse to create a second cache with an already used name. */
        list_for_each_entry(pc, &cache_chain, next) {
                char tmp;
                int res;

                /*
                 * This happens when the module gets unloaded and doesn't
                 * destroy its slab cache and no-one else reuses the vmalloc
                 * area of the module.  Print a warning.
                 */
                res = probe_kernel_address(pc->name, tmp);
                if (res) {
                        printk(KERN_ERR
                               "SLAB: cache with size %d has lost its name\n",
                               pc->buffer_size);
                        continue;
                }

                if (!strcmp(pc->name, name)) {
                        printk(KERN_ERR
                               "kmem_cache_create: duplicate cache %s\n", name);
                        dump_stack();
                        goto oops;
                }
        }

#if DEBUG
        WARN_ON(strchr(name, ' '));     /* It confuses parsers */
#if FORCED_DEBUG
        /*
         * Enable redzoning and last user accounting, except for caches with
         * large objects, if the increased size would increase the object size
         * above the next power of two: caches with object sizes just above a
         * power of two have a significant amount of internal fragmentation.
         */
        if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
                                                2 * sizeof(unsigned long long)))
                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
        if (!(flags & SLAB_DESTROY_BY_RCU))
                flags |= SLAB_POISON;
#endif
        /* Poisoning and RCU-deferred destruction must not be combined. */
        if (flags & SLAB_DESTROY_BY_RCU)
                BUG_ON(flags & SLAB_POISON);
#endif
        /*
         * Always checks flags, a caller might be expecting debug support which
         * isn't available.
         */
        BUG_ON(flags & ~CREATE_MASK);

        /*
         * Check that size is in terms of words.  This is needed to avoid
         * unaligned accesses for some archs when redzoning is used, and makes
         * sure any on-slab bufctl's are also correctly aligned.
         */
        if (size & (BYTES_PER_WORD - 1)) {
                /* Round size up to the next multiple of BYTES_PER_WORD. */
                size += (BYTES_PER_WORD - 1);
                size &= ~(BYTES_PER_WORD - 1);
        }

        /* calculate the final buffer alignment: */

        /* 1) arch recommendation: can be overridden for debug */
        if (flags & SLAB_HWCACHE_ALIGN) {
                /*
                 * Default alignment: as specified by the arch code.  Except if
                 * an object is really small, then squeeze multiple objects into
                 * one cacheline.
                 */
                ralign = cache_line_size();
                while (size <= ralign / 2)
                        ralign /= 2;
        } else {
                ralign = BYTES_PER_WORD;
        }

        /*
         * Redzoning and user store require word alignment or possibly larger.
         * Note this will be overridden by architecture or caller mandated
         * alignment if either is greater than BYTES_PER_WORD.
         */
        if (flags & SLAB_STORE_USER)
                ralign = BYTES_PER_WORD;

        if (flags & SLAB_RED_ZONE) {
                ralign = REDZONE_ALIGN;
                /* If redzoning, ensure that the second redzone is suitably
                 * aligned, by adjusting the object size accordingly. */
                size += REDZONE_ALIGN - 1;
                size &= ~(REDZONE_ALIGN - 1);
        }

        /* 2) arch mandated alignment */
        if (ralign < ARCH_SLAB_MINALIGN) {
                ralign = ARCH_SLAB_MINALIGN;
        }
        /* 3) caller mandated alignment */
        if (ralign < align) {
                ralign = align;
        }
        /*
         * disable debug if necessary: red zones and the user word are
         * dropped when the final alignment exceeds that of an
         * unsigned long long.
         */
        if (ralign > __alignof__(unsigned long long))
                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        /*
         * 4) Store it.
         */
        align = ralign;

        /* GFP_NOWAIT is used for as long as slab_is_available() is false. */
        if (slab_is_available())
                gfp = GFP_KERNEL;
        else
                gfp = GFP_NOWAIT;

        /* Get cache's description obj. */
        cachep = kmem_cache_zalloc(&cache_cache, gfp);
        if (!cachep)
                goto oops;

#if DEBUG
        /* Remember the payload size before debug fields are added to it. */
        cachep->obj_size = size;

        /*
         * Both debugging options require word-alignment which is calculated
         * into align above.
         */
        if (flags & SLAB_RED_ZONE) {
                /* add space for red zone words */
                cachep->obj_offset += sizeof(unsigned long long);
                size += 2 * sizeof(unsigned long long);
        }
        if (flags & SLAB_STORE_USER) {
                /* user store requires one word storage behind the end of
                 * the real object. But if the second red zone needs to be
                 * aligned to 64 bits, we must allow that much space.
                 */
                if (flags & SLAB_RED_ZONE)
                        size += REDZONE_ALIGN;
                else
                        size += BYTES_PER_WORD;
        }
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
        /*
         * Grow qualifying objects to exactly one page, with the payload
         * placed at the end of that page.
         */
        if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
            && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
                cachep->obj_offset += PAGE_SIZE - size;
                size = PAGE_SIZE;
        }
#endif
#endif

        /*
         * Determine if the slab management is 'on' or 'off' slab.
         * (bootstrapping cannot cope with offslab caches so don't do
         * it too early on. Always use on-slab management when
         * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
         */
        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
            !(flags & SLAB_NOLEAKTRACE))
                /*
                 * Size is large, assume best to place the slab management obj
                 * off-slab (should allow better packing of objs).
                 */
                flags |= CFLGS_OFF_SLAB;

        size = ALIGN(size, align);

        /* Picks cachep->gfporder and cachep->num; returns the unused bytes. */
        left_over = calculate_slab_order(cachep, size, align, flags);

        /* num is still zero if no page order could hold the objects. */
        if (!cachep->num) {
                printk(KERN_ERR
                       "kmem_cache_create: couldn't create cache %s.\n", name);
                kmem_cache_free(&cache_cache, cachep);
                cachep = NULL;
                goto oops;
        }
        /* Size of the slab descriptor plus one bufctl per object, aligned. */
        slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
                          + sizeof(struct slab), align);

        /*
         * If the slab has been placed off-slab, and we have enough space then
         * move it on-slab. This is at the expense of any extra colouring.
         */
        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
                flags &= ~CFLGS_OFF_SLAB;
                left_over -= slab_size;
        }

        if (flags & CFLGS_OFF_SLAB) {
                /* really off slab. No need for manual alignment */
                slab_size =
                    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
                /* If we're going to use the generic kernel_map_pages()
                 * poisoning, then it's going to smash the contents of
                 * the redzone and userword anyhow, so switch them off.
                 */
                if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
        }

        cachep->colour_off = cache_line_size();
        /* Offset must be a multiple of the alignment. */
        if (cachep->colour_off < align)
                cachep->colour_off = align;
        /* Number of colour offsets that fit into the left-over space. */
        cachep->colour = left_over / cachep->colour_off;
        cachep->slab_size = slab_size;
        cachep->flags = flags;
        cachep->gfpflags = 0;
        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
                cachep->gfpflags |= GFP_DMA;
        cachep->buffer_size = size;
        cachep->reciprocal_buffer_size = reciprocal_value(size);

        if (flags & CFLGS_OFF_SLAB) {
                /* Off-slab descriptors come from a general cache. */
                cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
                /*
                 * This is a possibility for one of the malloc_sizes caches.
                 * But since we go off slab only for object size greater than
                 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
                 * this should not happen at all.
                 * But leave a BUG_ON for some lucky dude.
                 */
                BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
        }
        cachep->ctor = ctor;
        cachep->name = name;

        /* On failure, release everything that was set up for this cache. */
        if (setup_cpu_cache(cachep, gfp)) {
                __kmem_cache_destroy(cachep);
                cachep = NULL;
                goto oops;
        }

        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
oops:
        /* Common exit for success and failure; cachep is NULL on failure. */
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                      name);
        if (slab_is_available()) {
                mutex_unlock(&cache_chain_mutex);
                put_online_cpus();
        }
        return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
2373
#if DEBUG
/* BUG unless local interrupts are currently disabled. */
static void check_irq_off(void)
{
        BUG_ON(!irqs_disabled());
}

/* BUG unless local interrupts are currently enabled. */
static void check_irq_on(void)
{
        BUG_ON(irqs_disabled());
}

/*
 * On SMP, check that interrupts are off and that the list_lock of the
 * current node's lists is held.  Does nothing on UP.
 */
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
        struct kmem_list3 *l3;

        check_irq_off();
        l3 = cachep->nodelists[numa_node_id()];
        assert_spin_locked(&l3->list_lock);
#endif
}

/*
 * Same as check_spinlock_acquired(), but for the lists of an explicitly
 * given @node.
 */
static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
        struct kmem_list3 *l3;

        check_irq_off();
        l3 = cachep->nodelists[node];
        assert_spin_locked(&l3->list_lock);
#endif
}

#else
/* Without DEBUG the checks compile away to nothing. */
#define check_irq_off()                         do { } while (0)
#define check_irq_on()                          do { } while (0)
#define check_spinlock_acquired(x)              do { } while (0)
#define check_spinlock_acquired_node(x, y)      do { } while (0)
#endif
2407
/*
 * Forward declaration: drain_cpu_caches() calls drain_array(), whose
 * definition is not in this part of the file.
 */
static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
                        struct array_cache *ac, int force, int node);
2411
2412static void do_drain(void *arg)
2413{
2414        struct kmem_cache *cachep = arg;
2415        struct array_cache *ac;
2416        int node = numa_node_id();
2417
2418        check_irq_off();
2419        ac = cpu_cache_get(cachep);
2420        spin_lock(&cachep->nodelists[node]->list_lock);
2421        free_block(cachep, ac->entry, ac->avail, node);
2422        spin_unlock(&cachep->nodelists[node]->list_lock);
2423        ac->avail = 0;
2424}
2425
2426static void drain_cpu_caches(struct kmem_cache *cachep)
2427{
2428        struct kmem_list3 *l3;
2429        int node;
2430
2431        on_each_cpu(do_drain, cachep, 1);
2432        check_irq_on();
2433        for_each_online_node(node) {
2434                l3 = cachep->nodelists[node];
2435                if (l3 && l3->alien)
2436                        drain_alien_cache(cachep, l3->alien);
2437        }
2438
2439        for_each_online_node(node) {
2440                l3 = cachep->nodelists[node];
2441                if (l3)
2442                        drain_array(cachep, l3, l3->shared, 1, node);
2443        }
2444}
2445
2446/*
2447 * Remove slabs from the list of free slabs.
2448 * Specify the number of slabs to drain in tofree.
2449 *
2450 * Returns the actual number of slabs released.
2451 */
2452static int drain_freelist(struct kmem_cache *cache,
2453                        struct kmem_list3 *l3, int tofree)
2454{
2455        struct list_head *p;
2456        int nr_freed;
2457        struct slab *slabp;
2458
2459        nr_freed = 0;
2460        while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2461
2462                spin_lock_irq(&l3->list_lock);
2463                p = l3->slabs_free.prev;
2464                if (p == &l3->slabs_free) {
2465                        spin_unlock_irq(&l3->list_lock);
2466                        goto out;
2467                }
2468
2469                slabp = list_entry(p, struct slab, list);
2470#if DEBUG
2471                BUG_ON(slabp->inuse);
2472#endif
2473                list_del(&slabp->list);
2474                /*
2475                 * Safe to drop the lock. The slab is no longer linked
2476                 * to the cache.
2477                 */
2478                l3->free_objects -= cache->num;
2479                spin_unlock_irq(&l3->list_lock);
2480                slab_destroy(cache, slabp);
2481                nr_freed++;
2482        }
2483out:
2484        return nr_freed;
2485}
2486
2487/* Called with cache_chain_mutex held to protect against cpu hotplug */
2488static int __cache_shrink(struct kmem_cache *cachep)
2489{
2490        int ret = 0, i = 0;
2491        struct kmem_list3 *l3;
2492
2493        drain_cpu_caches(cachep);
2494
2495        check_irq_on();
2496        for_each_online_node(i) {
2497                l3 = cachep->nodelists[i];
2498                if (!l3)
2499                        continue;
2500
2501                drain_freelist(cachep, l3, l3->free_objects);
2502
2503                ret += !list_empty(&l3->slabs_full) ||
2504                        !list_empty(&l3->slabs_partial);
2505        }
2506        return (ret ? 1 : 0);
2507}
2508
2509/**
2510 * kmem_cache_shrink - Shrink a cache.
2511 * @cachep: The cache to shrink.
2512 *
2513 * Releases as many slabs as possible for a cache.
2514 * To help debugging, a zero exit status indicates all slabs were released.
2515 */
2516int kmem_cache_shrink(struct kmem_cache *cachep)
2517{
2518        int ret;
2519        BUG_ON(!cachep || in_interrupt());
2520
2521        get_online_cpus();
2522        mutex_lock(&cache_chain_mutex);
2523        ret = __cache_shrink(cachep);
2524        mutex_unlock(&cache_chain_mutex);
2525        put_online_cpus();
2526        return ret;
2527}
2528EXPORT_SYMBOL(kmem_cache_shrink);
2529
2530/**
2531 * kmem_cache_destroy - delete a cache
2532 * @cachep: the cache to destroy
2533 *
2534 * Remove a &struct kmem_cache object from the slab cache.
2535 *
2536 * It is expected this function will be called by a module when it is
2537 * unloaded.  This will remove the cache completely, and avoid a duplicate
2538 * cache being allocated each time a module is loaded and unloaded, if the
2539 * module doesn't have persistent in-kernel storage across loads and unloads.
2540 *
2541 * The cache must be empty before calling this function.
2542 *
2543 * The caller must guarantee that noone will allocate memory from the cache
2544 * during the kmem_cache_destroy().
2545 */
2546void kmem_cache_destroy(struct kmem_cache *cachep)
2547{
2548        BUG_ON(!cachep || in_interrupt());
2549
2550        /* Find the cache in the chain of caches. */
2551        get_online_cpus();
2552        mutex_lock(&cache_chain_mutex);
2553        /*
2554         * the chain is never empty, cache_cache is never destroyed
2555         */
2556        list_del(&cachep->next);
2557        if (__cache_shrink(cachep)) {
2558                slab_error(cachep, "Can't free all objects");
2559                list_add(&cachep->next, &cache_chain);
2560                mutex_unlock(&cache_chain_mutex);
2561                put_online_cpus();
2562                return;
2563        }
2564
2565        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2566                rcu_barrier();
2567
2568        __kmem_cache_destroy(cachep);
2569        mutex_unlock(&cache_chain_mutex);
2570        put_online_cpus();
2571}
2572EXPORT_SYMBOL(kmem_cache_destroy);
2573
2574/*
2575 * Get the memory for a slab management obj.
2576 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2577 * always come from malloc_sizes caches.  The slab descriptor cannot
2578 * come from the same cache which is getting created because,
2579 * when we are searching for an appropriate cache for these
2580 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2581 * If we are creating a malloc_sizes cache here it would not be visible to
2582 * kmem_find_general_cachep till the initialization is complete.
2583 * Hence we cannot have slabp_cache same as the original cache.
2584 */
2585static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2586                                   int colour_off, gfp_t local_flags,
2587                                   int nodeid)
2588{
2589        struct slab *slabp;
2590
2591        if (OFF_SLAB(cachep)) {
2592                /* Slab management obj is off-slab. */
2593                slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2594                                              local_flags, nodeid);
2595                /*
2596                 * If the first object in the slab is leaked (it's allocated
2597                 * but no one has a reference to it), we want to make sure
2598                 * kmemleak does not treat the ->s_mem pointer as a reference
2599                 * to the object. Otherwise we will not report the leak.
2600                 */
2601                kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2602                                   local_flags);
2603                if (!slabp)
2604                        return NULL;
2605        } else {
2606                slabp = objp + colour_off;
2607                colour_off += cachep->slab_size;
2608        }
2609        slabp->inuse = 0;
2610        slabp->colouroff = colour_off;
2611        slabp->s_mem = objp + colour_off;
2612        slabp->nodeid = nodeid;
2613        slabp->free = 0;
2614        return slabp;
2615}
2616
2617static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2618{
2619        return (kmem_bufctl_t *) (slabp + 1);
2620}
2621
2622static void cache_init_objs(struct kmem_cache *cachep,
2623                            struct slab *slabp)
2624{
2625        int i;
2626
2627        for (i = 0; i < cachep->num; i++) {
2628                void *objp = index_to_obj(cachep, slabp, i);
2629#if DEBUG
2630                /* need to poison the objs? */
2631                if (cachep->flags & SLAB_POISON)
2632                        poison_obj(cachep, objp, POISON_FREE);
2633                if (cachep->flags & SLAB_STORE_USER)
2634                        *dbg_userword(cachep, objp) = NULL;
2635
2636                if (cachep->flags & SLAB_RED_ZONE) {
2637                        *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2638                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2639                }
2640                /*
2641                 * Constructors are not allowed to allocate memory from the same
2642                 * cache which they are a constructor for.  Otherwise, deadlock.
2643                 * They must also be threaded.
2644                 */
2645                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2646                        cachep->ctor(objp + obj_offset(cachep));
2647
2648                if (cachep->flags & SLAB_RED_ZONE) {
2649                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2650                                slab_error(cachep, "constructor overwrote the"
2651                                           " end of an object");
2652                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2653                                slab_error(cachep, "constructor overwrote the"
2654                                           " start of an object");
2655                }
2656                if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2657                            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2658                        kernel_map_pages(virt_to_page(objp),
2659                                         cachep->buffer_size / PAGE_SIZE, 0);
2660#else
2661                if (cachep->ctor)
2662                        cachep->ctor(objp);
2663#endif
2664                slab_bufctl(slabp)[i] = i + 1;
2665        }
2666        slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2667}
2668
2669static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2670{
2671        if (CONFIG_ZONE_DMA_FLAG) {
2672                if (flags & GFP_DMA)
2673                        BUG_ON(!(cachep->gfpflags & GFP_DMA));
2674                else
2675                        BUG_ON(cachep->gfpflags & GFP_DMA);
2676        }
2677}
2678
2679static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2680                                int nodeid)
2681{
2682        void *objp = index_to_obj(cachep, slabp, slabp->free);
2683        kmem_bufctl_t next;
2684
2685        slabp->inuse++;
2686        next = slab_bufctl(slabp)[slabp->free];
2687#if DEBUG
2688        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2689        WARN_ON(slabp->nodeid != nodeid);
2690#endif
2691        slabp->free = next;
2692
2693        return objp;
2694}
2695
2696static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2697                                void *objp, int nodeid)
2698{
2699        unsigned int objnr = obj_to_index(cachep, slabp, objp);
2700
2701#if DEBUG
2702        /* Verify that the slab belongs to the intended node */
2703        WARN_ON(slabp->nodeid != nodeid);
2704
2705        if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2706                printk(KERN_ERR "slab: double free detected in cache "
2707                                "'%s', objp %p\n", cachep->name, objp);
2708                BUG();
2709        }
2710#endif
2711        slab_bufctl(slabp)[objnr] = slabp->free;
2712        slabp->free = objnr;
2713        slabp->inuse--;
2714}
2715
2716/*
2717 * Map pages beginning at addr to the given cache and slab. This is required
2718 * for the slab allocator to be able to lookup the cache and slab of a
2719 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2720 */
2721static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2722                           void *addr)
2723{
2724        int nr_pages;
2725        struct page *page;
2726
2727        page = virt_to_page(addr);
2728
2729        nr_pages = 1;
2730        if (likely(!PageCompound(page)))
2731                nr_pages <<= cache->gfporder;
2732
2733        do {
2734                page_set_cache(page, cache);
2735                page_set_slab(page, slab);
2736                page++;
2737        } while (--nr_pages);
2738}
2739
2740/*
2741 * Grow (by 1) the number of slabs within a cache.  This is called by
2742 * kmem_cache_alloc() when there are no active objs left in a cache.
2743 */
2744static int cache_grow(struct kmem_cache *cachep,
2745                gfp_t flags, int nodeid, void *objp)
2746{
2747        struct slab *slabp;
2748        size_t offset;
2749        gfp_t local_flags;
2750        struct kmem_list3 *l3;
2751
2752        /*
2753         * Be lazy and only check for valid flags here,  keeping it out of the
2754         * critical path in kmem_cache_alloc().
2755         */
2756        BUG_ON(flags & GFP_SLAB_BUG_MASK);
2757        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2758
2759        /* Take the l3 list lock to change the colour_next on this node */
2760        check_irq_off();
2761        l3 = cachep->nodelists[nodeid];
2762        spin_lock(&l3->list_lock);
2763
2764        /* Get colour for the slab, and cal the next value. */
2765        offset = l3->colour_next;
2766        l3->colour_next++;
2767        if (l3->colour_next >= cachep->colour)
2768                l3->colour_next = 0;
2769        spin_unlock(&l3->list_lock);
2770
2771        offset *= cachep->colour_off;
2772
2773        if (local_flags & __GFP_WAIT)
2774                local_irq_enable();
2775
2776        /*
2777         * The test for missing atomic flag is performed here, rather than
2778         * the more obvious place, simply to reduce the critical path length
2779         * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2780         * will eventually be caught here (where it matters).
2781         */
2782        kmem_flagcheck(cachep, flags);
2783
2784        /*
2785         * Get mem for the objs.  Attempt to allocate a physical page from
2786         * 'nodeid'.
2787         */
2788        if (!objp)
2789                objp = kmem_getpages(cachep, local_flags, nodeid);
2790        if (!objp)
2791                goto failed;
2792
2793        /* Get slab management. */
2794        slabp = alloc_slabmgmt(cachep, objp, offset,
2795                        local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2796        if (!slabp)
2797                goto opps1;
2798
2799        slab_map_pages(cachep, slabp, objp);
2800
2801        cache_init_objs(cachep, slabp);
2802
2803        if (local_flags & __GFP_WAIT)
2804                local_irq_disable();
2805        check_irq_off();
2806        spin_lock(&l3->list_lock);
2807
2808        /* Make slab active. */
2809        list_add_tail(&slabp->list, &(l3->slabs_free));
2810        STATS_INC_GROWN(cachep);
2811        l3->free_objects += cachep->num;
2812        spin_unlock(&l3->list_lock);
2813        return 1;
2814opps1:
2815        kmem_freepages(cachep, objp);
2816failed:
2817        if (local_flags & __GFP_WAIT)
2818                local_irq_disable();
2819        return 0;
2820}
2821
2822#if DEBUG
2823
2824/*
2825 * Perform extra freeing checks:
2826 * - detect bad pointers.
2827 * - POISON/RED_ZONE checking
2828 */
2829static void kfree_debugcheck(const void *objp)
2830{
2831        if (!virt_addr_valid(objp)) {
2832                printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2833                       (unsigned long)objp);
2834                BUG();
2835        }
2836}
2837
2838static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2839{
2840        unsigned long long redzone1, redzone2;
2841
2842        redzone1 = *dbg_redzone1(cache, obj);
2843        redzone2 = *dbg_redzone2(cache, obj);
2844
2845        /*
2846         * Redzone is ok.
2847         */
2848        if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2849                return;
2850
2851        if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2852                slab_error(cache, "double free detected");
2853        else
2854                slab_error(cache, "memory outside object was overwritten");
2855
2856        printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
2857                        obj, redzone1, redzone2);
2858}
2859
2860static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2861                                   void *caller)
2862{
2863        struct page *page;
2864        unsigned int objnr;
2865        struct slab *slabp;
2866
2867        BUG_ON(virt_to_cache(objp) != cachep);
2868
2869        objp -= obj_offset(cachep);
2870        kfree_debugcheck(objp);
2871        page = virt_to_head_page(objp);
2872
2873        slabp = page_get_slab(page);
2874
2875        if (cachep->flags & SLAB_RED_ZONE) {
2876                verify_redzone_free(cachep, objp);
2877                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2878                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2879        }
2880        if (cachep->flags & SLAB_STORE_USER)
2881                *dbg_userword(cachep, objp) = caller;
2882
2883        objnr = obj_to_index(cachep, slabp, objp);
2884
2885        BUG_ON(objnr >= cachep->num);
2886        BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2887
2888#ifdef CONFIG_DEBUG_SLAB_LEAK
2889        slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2890#endif
2891        if (cachep->flags & SLAB_POISON) {
2892#ifdef CONFIG_DEBUG_PAGEALLOC
2893                if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2894                        store_stackinfo(cachep, objp, (unsigned long)caller);
2895                        kernel_map_pages(virt_to_page(objp),
2896                                         cachep->buffer_size / PAGE_SIZE, 0);
2897                } else {
2898                        poison_obj(cachep, objp, POISON_FREE);
2899                }
2900#else
2901                poison_obj(cachep, objp, POISON_FREE);
2902#endif
2903        }
2904        return objp;
2905}
2906
2907static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2908{
2909        kmem_bufctl_t i;
2910        int entries = 0;
2911
2912        /* Check slab's freelist to see if this obj is there. */
2913        for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2914                entries++;
2915                if (entries > cachep->num || i >= cachep->num)
2916                        goto bad;
2917        }
2918        if (entries != cachep->num - slabp->inuse) {
2919bad:
2920                printk(KERN_ERR "slab: Internal list corruption detected in "
2921                                "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2922                        cachep->name, cachep->num, slabp, slabp->inuse);
2923                for (i = 0;
2924                     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2925                     i++) {
2926                        if (i % 16 == 0)
2927                                printk("\n%03x:", i);
2928                        printk(" %02x", ((unsigned char *)slabp)[i]);
2929                }
2930                printk("\n");
2931                BUG();
2932        }
2933}
2934#else
/*
 * Non-DEBUG stubs: the checks expand to nothing and
 * cache_free_debugcheck() simply evaluates to the object pointer.
 */
#define kfree_debugcheck(ptr) do { } while (0)
#define cache_free_debugcheck(cachep, objp, caller) (objp)
#define check_slabp(cachep, slabp) do { } while (0)
2938#endif
2939
2940static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2941{
2942        int batchcount;
2943        struct kmem_list3 *l3;
2944        struct array_cache *ac;
2945        int node;
2946
2947retry:
2948        check_irq_off();
2949        node = numa_node_id();
2950        ac = cpu_cache_get(cachep);
2951        batchcount = ac->batchcount;
2952        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2953                /*
2954                 * If there was little recent activity on this cache, then
2955                 * perform only a partial refill.  Otherwise we could generate
2956                 * refill bouncing.
2957                 */
2958                batchcount = BATCHREFILL_LIMIT;
2959        }
2960        l3 = cachep->nodelists[node];
2961
2962        BUG_ON(ac->avail > 0 || !l3);
2963        spin_lock(&l3->list_lock);
2964
2965        /* See if we can refill from the shared array */
2966        if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2967                goto alloc_done;
2968
2969        while (batchcount > 0) {
2970                struct list_head *entry;
2971                struct slab *slabp;
2972                /* Get slab alloc is to come from. */
2973                entry = l3->slabs_partial.next;
2974                if (entry == &l3->slabs_partial) {
2975                        l3->free_touched = 1;
2976                        entry = l3->slabs_free.next;
2977                        if (entry == &l3->slabs_free)
2978                                goto must_grow;
2979                }
2980
2981                slabp = list_entry(entry, struct slab, list);
2982                check_slabp(cachep, slabp);
2983                check_spinlock_acquired(cachep);
2984
2985                /*
2986                 * The slab was either on partial or free list so
2987                 * there must be at least one object available for
2988                 * allocation.
2989                 */
2990                BUG_ON(slabp->inuse >= cachep->num);
2991
2992                while (slabp->inuse < cachep->num && batchcount--) {
2993                        STATS_INC_ALLOCED(cachep);
2994                        STATS_INC_ACTIVE(cachep);
2995                        STATS_SET_HIGH(cachep);
2996
2997                        ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2998                                                            node);
2999                }
3000                check_slabp(cachep, slabp);
3001
3002                /* move slabp to correct slabp list: */
3003                list_del(&slabp->list);
3004                if (slabp->free == BUFCTL_END)
3005                        list_add(&slabp->list, &l3->slabs_full);
3006                else
3007                        list_add(&slabp->list, &l3->slabs_partial);
3008        }
3009
3010must_grow:
3011        l3->free_objects -= ac->avail;
3012alloc_done:
3013        spin_unlock(&l3->list_lock);
3014
3015        if (unlikely(!ac->avail)) {
3016                int x;
3017                x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3018
3019                /* cache_grow can reenable interrupts, then ac could change. */
3020                ac = cpu_cache_get(cachep);
3021                if (!x && ac->avail == 0)       /* no objects in sight? abort */
3022                        return NULL;
3023
3024                if (!ac->avail)         /* objects refilled by interrupt? */
3025                        goto retry;
3026        }
3027        ac->touched = 1;
3028        return ac->entry[--ac->avail];
3029}
3030
3031static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3032                                                gfp_t flags)
3033{
3034        might_sleep_if(flags & __GFP_WAIT);
3035#if DEBUG
3036        kmem_flagcheck(cachep, flags);
3037#endif
3038}
3039
#if DEBUG
/*
 * DEBUG-only processing of a freshly allocated object.
 *
 * @cachep: cache the object came from
 * @flags:  gfp flags of the allocation (not examined here)
 * @objp:   raw object start, or NULL if the allocation failed
 * @caller: address recorded as the owner when SLAB_STORE_USER is set
 *
 * Verifies/rewrites the poison pattern, records the caller, checks and arms
 * the red zones, marks the object active for leak tracking, runs the
 * constructor for poisoned caches and warns about misaligned objects.
 *
 * Returns the pointer to hand to the user (objp advanced by obj_offset()),
 * or NULL if @objp was NULL.
 */
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, void *caller)
{
	if (!objp)
		return objp;
	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
			kernel_map_pages(virt_to_page(objp),
					 cachep->buffer_size / PAGE_SIZE, 1);
		else
			check_poison_obj(cachep, objp);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* A free object must still carry inactive red zones. */
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside"
						" object was overwritten");
			printk(KERN_ERR
				"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
				objp, *dbg_redzone1(cachep, objp),
				*dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}
#ifdef CONFIG_DEBUG_SLAB_LEAK
	{
		struct slab *slabp;
		unsigned objnr;

		slabp = page_get_slab(virt_to_head_page(objp));
		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
	}
#endif
	objp += obj_offset(cachep);
	/* Poisoned caches run their constructor at allocation time. */
	if (cachep->ctor && cachep->flags & SLAB_POISON)
		cachep->ctor(objp);
#if ARCH_SLAB_MINALIGN
	/*
	 * Use a pointer-sized integer for the alignment test; casting the
	 * pointer to u32 is a size-mismatched cast on 64-bit builds.
	 */
	if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1)) {
		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
		       objp, ARCH_SLAB_MINALIGN);
	}
#endif
	return objp;
}
#else
/* Non-DEBUG builds: no post-allocation checks, the object is used as is. */
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
3098
3099static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3100{
3101        if (cachep == &cache_cache)
3102                return false;
3103
3104        return should_failslab(obj_size(cachep), flags);
3105}
3106
3107static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3108{
3109        void *objp;
3110        struct array_cache *ac;
3111
3112        check_irq_off();
3113
3114        ac = cpu_cache_get(cachep);
3115        if (likely(ac->avail)) {
3116                STATS_INC_ALLOCHIT(cachep);
3117                ac->touched = 1;
3118                objp = ac->entry[--ac->avail];
3119        } else {
3120                STATS_INC_ALLOCMISS(cachep);
3121                objp = cache_alloc_refill(cachep, flags);
3122                /*
3123                 * the 'ac' may be updated by cache_alloc_refill(),
3124                 * and kmemleak_erase() requires its correct value.
3125                 */
3126                ac = cpu_cache_get(cachep);
3127        }
3128        /*
3129         * To avoid a false negative, if an object that is in one of the
3130         * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3131         * treat the array pointers as a reference to the object.
3132         */
3133        if (objp)
3134                kmemleak_erase(&ac->entry[ac->avail]);
3135        return objp;
3136}
3137
3138#ifdef CONFIG_NUMA
3139/*
3140 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3141 *
3142 * If we are in_interrupt, then process context, including cpusets and
3143 * mempolicy, may not apply and should not be used for allocation policy.
3144 */
3145static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3146{
3147        int nid_alloc, nid_here;
3148
3149        if (in_interrupt() || (flags & __GFP_THISNODE))
3150                return NULL;
3151        nid_alloc = nid_here = numa_node_id();
3152        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3153                nid_alloc = cpuset_mem_spread_node();
3154        else if (current->mempolicy)
3155                nid_alloc = slab_node(current->mempolicy);
3156        if (nid_alloc != nid_here)
3157                return ____cache_alloc_node(cachep, flags, nid_alloc);
3158        return NULL;
3159}
3160
3161/*
3162 * Fallback function if there was no memory available and no objects on a
3163 * certain node and fall back is permitted. First we scan all the
3164 * available nodelists for available objects. If that fails then we
3165 * perform an allocation without specifying a node. This allows the page
3166 * allocator to do its reclaim / fallback magic. We then insert the
3167 * slab into the proper nodelist and then allocate from it.
3168 */
3169static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3170{
3171        struct zonelist *zonelist;
3172        gfp_t local_flags;
3173        struct zoneref *z;
3174        struct zone *zone;
3175        enum zone_type high_zoneidx = gfp_zone(flags);
3176        void *obj = NULL;
3177        int nid;
3178
3179        if (flags & __GFP_THISNODE)
3180                return NULL;
3181
3182        zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3183        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3184
3185retry:
3186        /*
3187         * Look through allowed nodes for objects available
3188         * from existing per node queues.
3189         */
3190        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3191                nid = zone_to_nid(zone);
3192
3193                if (cpuset_zone_allowed_hardwall(zone, flags) &&
3194                        cache->nodelists[nid] &&
3195                        cache->nodelists[nid]->free_objects) {
3196                                obj = ____cache_alloc_node(cache,
3197                                        flags | GFP_THISNODE, nid);
3198                                if (obj)
3199                                        break;
3200                }
3201        }
3202
3203        if (!obj) {
3204                /*
3205                 * This allocation will be performed within the constraints
3206                 * of the current cpuset / memory policy requirements.
3207                 * We may trigger various forms of reclaim on the allowed
3208                 * set and go into memory reserves if necessary.
3209                 */
3210                if (local_flags & __GFP_WAIT)
3211                        local_irq_enable();
3212                kmem_flagcheck(cache, flags);
3213                obj = kmem_getpages(cache, local_flags, numa_node_id());
3214                if (local_flags & __GFP_WAIT)
3215                        local_irq_disable();
3216                if (obj) {
3217                        /*
3218                         * Insert into the appropriate per node queues
3219                         */
3220                        nid = page_to_nid(virt_to_page(obj));
3221                        if (cache_grow(cache, flags, nid, obj)) {
3222                                obj = ____cache_alloc_node(cache,
3223                                        flags | GFP_THISNODE, nid);
3224                                if (!obj)
3225                                        /*
3226                                         * Another processor may allocate the
3227                                         * objects in the slab since we are
3228                                         * not holding any locks.
3229                                         */
3230                                        goto retry;
3231                        } else {
3232                                /* cache_grow already freed obj */
3233                                obj = NULL;
3234                        }
3235                }
3236        }
3237        return obj;
3238}
3239
3240/*
3241 * A interface to enable slab creation on nodeid
3242 */
3243static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3244                                int nodeid)
3245{
3246        struct list_head *entry;
3247        struct slab *slabp;
3248        struct kmem_list3 *l3;
3249        void *obj;
3250        int x;
3251
3252        l3 = cachep->nodelists[nodeid];
3253        BUG_ON(!l3);
3254
3255retry:
3256        check_irq_off();
3257        spin_lock(&l3->list_lock);
3258        entry = l3->slabs_partial.next;
3259        if (entry == &l3->slabs_partial) {
3260                l3->free_touched = 1;
3261                entry = l3->slabs_free.next;
3262                if (entry == &l3->slabs_free)
3263                        goto must_grow;
3264        }
3265
3266        slabp = list_entry(entry, struct slab, list);
3267        check_spinlock_acquired_node(cachep, nodeid);
3268        check_slabp(cachep, slabp);
3269
3270        STATS_INC_NODEALLOCS(cachep);
3271        STATS_INC_ACTIVE(cachep);
3272        STATS_SET_HIGH(cachep);
3273
3274        BUG_ON(slabp->inuse == cachep->num);
3275
3276        obj = slab_get_obj(cachep, slabp, nodeid);
3277        check_slabp(cachep, slabp);
3278        l3->free_objects--;
3279        /* move slabp to correct slabp list: */
3280        list_del(&slabp->list);
3281
3282        if (slabp->free == BUFCTL_END)
3283                list_add(&slabp->list, &l3->slabs_full);
3284        else
3285                list_add(&slabp->list, &l3->slabs_partial);
3286
3287        spin_unlock(&l3->list_lock);
3288        goto done;
3289
3290must_grow:
3291        spin_unlock(&l3->list_lock);
3292        x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3293        if (x)
3294                goto retry;
3295
3296        return fallback_alloc(cachep, flags);
3297
3298done:
3299        return obj;
3300}
3301
3302/**
3303 * kmem_cache_alloc_node - Allocate an object on the specified node
3304 * @cachep: The cache to allocate from.
3305 * @flags: See kmalloc().
3306 * @nodeid: node number of the target node.
3307 * @caller: return address of caller, used for debug information
3308 *
3309 * Identical to kmem_cache_alloc but it will allocate memory on the given
3310 * node, which can improve the performance for cpu bound structures.
3311 *
3312 * Fallback to other node is possible if __GFP_THISNODE is not set.
3313 */
3314static __always_inline void *
3315__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3316                   void *caller)
3317{
3318        unsigned long save_flags;
3319        void *ptr;
3320
3321        flags &= gfp_allowed_mask;
3322
3323        lockdep_trace_alloc(flags);
3324
3325        if (slab_should_failslab(cachep, flags))
3326                return NULL;
3327
3328        cache_alloc_debugcheck_before(cachep, flags);
3329        local_irq_save(save_flags);
3330
3331        if (nodeid == -1)
3332                nodeid = numa_node_id();
3333
3334        if (unlikely(!cachep->nodelists[nodeid])) {
3335                /* Node not bootstrapped yet */
3336                ptr = fallback_alloc(cachep, flags);
3337                goto out;
3338        }
3339
3340        if (nodeid == numa_node_id()) {
3341                /*
3342                 * Use the locally cached objects if possible.
3343                 * However ____cache_alloc does not allow fallback
3344                 * to other nodes. It may fail while we still have
3345                 * objects on other nodes available.
3346                 */
3347                ptr = ____cache_alloc(cachep, flags);
3348                if (ptr)
3349                        goto out;
3350        }
3351        /* ___cache_alloc_node can fall back to other nodes */
3352        ptr = ____cache_alloc_node(cachep, flags, nodeid);
3353  out:
3354        local_irq_restore(save_flags);
3355        ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3356        kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3357                                 flags);
3358
3359        if (likely(ptr))
3360                kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3361
3362        if (unlikely((flags & __GFP_ZERO) && ptr))
3363                memset(ptr, 0, obj_size(cachep));
3364
3365        return ptr;
3366}
3367
3368static __always_inline void *
3369__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3370{
3371        void *objp;
3372
3373        if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3374                objp = alternate_node_alloc(cache, flags);
3375                if (objp)
3376                        goto out;
3377        }
3378        objp = ____cache_alloc(cache, flags);
3379
3380        /*
3381         * We may just have run out of memory on the local node.
3382         * ____cache_alloc_node() knows how to locate memory on other nodes
3383         */
3384        if (!objp)
3385                objp = ____cache_alloc_node(cache, flags, numa_node_id());
3386
3387  out:
3388        return objp;
3389}
3390#else
3391
3392static __always_inline void *
3393__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3394{
3395        return ____cache_alloc(cachep, flags);
3396}
3397
3398#endif /* CONFIG_NUMA */
3399
3400static __always_inline void *
3401__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3402{
3403        unsigned long save_flags;
3404        void *objp;
3405
3406        flags &= gfp_allowed_mask;
3407
3408        lockdep_trace_alloc(flags);
3409
3410        if (slab_should_failslab(cachep, flags))
3411                return NULL;
3412
3413        cache_alloc_debugcheck_before(cachep, flags);
3414        local_irq_save(save_flags);
3415        objp = __do_cache_alloc(cachep, flags);
3416        local_irq_restore(save_flags);
3417        objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3418        kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3419                                 flags);
3420        prefetchw(objp);
3421
3422        if (likely(objp))
3423                kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3424
3425        if (unlikely((flags & __GFP_ZERO) && objp))
3426                memset(objp, 0, obj_size(cachep));
3427
3428        return objp;
3429}
3430
3431/*
3432 * Caller needs to acquire correct kmem_list's list_lock
3433 */
3434static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3435                       int node)
3436{
3437        int i;
3438        struct kmem_list3 *l3;
3439
3440        for (i = 0; i < nr_objects; i++) {
3441                void *objp = objpp[i];
3442                struct slab *slabp;
3443
3444                slabp = virt_to_slab(objp);
3445                l3 = cachep->nodelists[node];
3446                list_del(&slabp->list);
3447                check_spinlock_acquired_node(cachep, node);
3448                check_slabp(cachep, slabp);
3449                slab_put_obj(cachep, slabp, objp, node);
3450                STATS_DEC_ACTIVE(cachep);
3451                l3->free_objects++;
3452                check_slabp(cachep, slabp);
3453
3454                /* fixup slab chains */
3455                if (slabp->inuse == 0) {
3456                        if (l3->free_objects > l3->free_limit) {
3457                                l3->free_objects -= cachep->num;
3458                                /* No need to drop any previously held
3459                                 * lock here, even if we have a off-slab slab
3460                                 * descriptor it is guaranteed to come from
3461                                 * a different cache, refer to comments before
3462                                 * alloc_slabmgmt.
3463                                 */
3464                                slab_destroy(cachep, slabp);
3465                        } else {
3466                                list_add(&slabp->list, &l3->slabs_free);
3467                        }
3468                } else {
3469                        /* Unconditionally move a slab to the end of the
3470                         * partial list on free - maximum time for the
3471                         * other objects to be freed, too.
3472                         */
3473                        list_add_tail(&slabp->list, &l3->slabs_partial);
3474                }
3475        }
3476}
3477
3478static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3479{
3480        int batchcount;
3481        struct kmem_list3 *l3;
3482        int node = numa_node_id();
3483
3484        batchcount = ac->batchcount;
3485#if DEBUG
3486        BUG_ON(!batchcount || batchcount > ac->avail);
3487#endif
3488        check_irq_off();
3489        l3 = cachep->nodelists[node];
3490        spin_lock(&l3->list_lock);
3491        if (l3->shared) {
3492                struct array_cache *shared_array = l3->shared;
3493                int max = shared_array->limit - shared_array->avail;
3494                if (max) {
3495                        if (batchcount > max)
3496                                batchcount = max;
3497                        memcpy(&(shared_array->entry[shared_array->avail]),
3498                               ac->entry, sizeof(void *) * batchcount);
3499                        shared_array->avail += batchcount;
3500                        goto free_done;
3501                }
3502        }
3503
3504        free_block(cachep, ac->entry, batchcount, node);
3505free_done:
3506#if STATS
3507        {
3508                int i = 0;
3509                struct list_head *p;
3510
3511                p = l3->slabs_free.next;
3512                while (p != &(l3->slabs_free)) {
3513                        struct slab *slabp;
3514
3515                        slabp = list_entry(p, struct slab, list);
3516                        BUG_ON(slabp->inuse);
3517
3518                        i++;
3519                        p = p->next;
3520                }
3521                STATS_SET_FREEABLE(cachep, i);
3522        }
3523#endif
3524        spin_unlock(&l3->list_lock);
3525        ac->avail -= batchcount;
3526        memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3527}
3528
3529/*
3530 * Release an obj back to its cache. If the obj has a constructed state, it must
3531 * be in this state _before_ it is released.  Called with disabled ints.
3532 */
3533static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3534{
3535        struct array_cache *ac = cpu_cache_get(cachep);
3536
3537        check_irq_off();
3538        kmemleak_free_recursive(objp, cachep->flags);
3539        objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3540
3541        kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3542
3543        /*
3544         * Skip calling cache_free_alien() when the platform is not numa.
3545         * This will avoid cache misses that happen while accessing slabp (which
3546         * is per page memory  reference) to get nodeid. Instead use a global
3547         * variable to skip the call, which is mostly likely to be present in
3548         * the cache.
3549         */
3550        if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3551                return;
3552
3553        if (likely(ac->avail < ac->limit)) {
3554                STATS_INC_FREEHIT(cachep);
3555                ac->entry[ac->avail++] = objp;
3556                return;
3557        } else {
3558                STATS_INC_FREEMISS(cachep);
3559                cache_flusharray(cachep, ac);
3560                ac->entry[ac->avail++] = objp;
3561        }
3562}
3563
3564/**
3565 * kmem_cache_alloc - Allocate an object
3566 * @cachep: The cache to allocate from.
3567 * @flags: See kmalloc().
3568 *
3569 * Allocate an object from this cache.  The flags are only relevant
3570 * if the cache has no available objects.
3571 */
3572void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3573{
3574        void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3575
3576        trace_kmem_cache_alloc(_RET_IP_, ret,
3577                               obj_size(cachep), cachep->buffer_size, flags);
3578
3579        return ret;
3580}
3581EXPORT_SYMBOL(kmem_cache_alloc);
3582
3583#ifdef CONFIG_TRACING
3584void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3585{
3586        return __cache_alloc(cachep, flags, __builtin_return_address(0));
3587}
3588EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3589#endif
3590
3591/**
3592 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3593 * @cachep: the cache we're checking against
3594 * @ptr: pointer to validate
3595 *
3596 * This verifies that the untrusted pointer looks sane;
3597 * it is _not_ a guarantee that the pointer is actually
3598 * part of the slab cache in question, but it at least
3599 * validates that the pointer can be dereferenced and
3600 * looks half-way sane.
3601 *
3602 * Currently only used for dentry validation.
3603 */
3604int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3605{
3606        unsigned long addr = (unsigned long)ptr;
3607        unsigned long min_addr = PAGE_OFFSET;
3608        unsigned long align_mask = BYTES_PER_WORD - 1;
3609        unsigned long size = cachep->buffer_size;
3610        struct page *page;
3611
3612        if (unlikely(addr < min_addr))
3613                goto out;
3614        if (unlikely(addr > (unsigned long)high_memory - size))
3615                goto out;
3616        if (unlikely(addr & align_mask))
3617                goto out;
3618        if (unlikely(!kern_addr_valid(addr)))
3619                goto out;
3620        if (unlikely(!kern_addr_valid(addr + size - 1)))
3621                goto out;
3622        page = virt_to_page(ptr);
3623        if (unlikely(!PageSlab(page)))
3624                goto out;
3625        if (unlikely(page_get_cache(page) != cachep))
3626                goto out;
3627        return 1;
3628out:
3629        return 0;
3630}
3631
3632#ifdef CONFIG_NUMA
3633void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3634{
3635        void *ret = __cache_alloc_node(cachep, flags, nodeid,
3636                                       __builtin_return_address(0));
3637
3638        trace_kmem_cache_alloc_node(_RET_IP_, ret,
3639                                    obj_size(cachep), cachep->buffer_size,
3640                                    flags, nodeid);
3641
3642        return ret;
3643}
3644EXPORT_SYMBOL(kmem_cache_alloc_node);
3645
3646#ifdef CONFIG_TRACING
3647void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3648                                    gfp_t flags,
3649                                    int nodeid)
3650{
3651        return __cache_alloc_node(cachep, flags, nodeid,
3652                                  __builtin_return_address(0));
3653}
3654EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
3655#endif
3656
3657static __always_inline void *
3658__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3659{
3660        struct kmem_cache *cachep;
3661        void *ret;
3662
3663        cachep = kmem_find_general_cachep(size, flags);
3664        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3665                return cachep;
3666        ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
3667
3668        trace_kmalloc_node((unsigned long) caller, ret,
3669                           size, cachep->buffer_size, flags, node);
3670
3671        return ret;
3672}
3673
3674#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3675void *__kmalloc_node(size_t size, gfp_t flags, int node)
3676{
3677        return __do_kmalloc_node(size, flags, node,
3678                        __builtin_return_address(0));
3679}
3680EXPORT_SYMBOL(__kmalloc_node);
3681
3682void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3683                int node, unsigned long caller)
3684{
3685        return __do_kmalloc_node(size, flags, node, (void *)caller);
3686}
3687EXPORT_SYMBOL(__kmalloc_node_track_caller);
3688#else
3689void *__kmalloc_node(size_t size, gfp_t flags, int node)
3690{
3691        return __do_kmalloc_node(size, flags, node, NULL);
3692}
3693EXPORT_SYMBOL(__kmalloc_node);
3694#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3695#endif /* CONFIG_NUMA */
3696
3697/**
3698 * __do_kmalloc - allocate memory
3699 * @size: how many bytes of memory are required.
3700 * @flags: the type of memory to allocate (see kmalloc).
3701 * @caller: function caller for debug tracking of the caller
3702 */
3703static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3704                                          void *caller)
3705{
3706        struct kmem_cache *cachep;
3707        void *ret;
3708
3709        /* If you want to save a few bytes .text space: replace
3710         * __ with kmem_.
3711         * Then kmalloc uses the uninlined functions instead of the inline
3712         * functions.
3713         */
3714        cachep = __find_general_cachep(size, flags);
3715        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3716                return cachep;
3717        ret = __cache_alloc(cachep, flags, caller);
3718
3719        trace_kmalloc((unsigned long) caller, ret,
3720                      size, cachep->buffer_size, flags);
3721
3722        return ret;
3723}
3724
3725
3726#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3727void *__kmalloc(size_t size, gfp_t flags)
3728{
3729        return __do_kmalloc(size, flags, __builtin_return_address(0));
3730}
3731EXPORT_SYMBOL(__kmalloc);
3732
3733void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3734{
3735        return __do_kmalloc(size, flags, (void *)caller);
3736}
3737EXPORT_SYMBOL(__kmalloc_track_caller);
3738
3739#else
3740void *__kmalloc(size_t size, gfp_t flags)
3741{
3742        return __do_kmalloc(size, flags, NULL);
3743}
3744EXPORT_SYMBOL(__kmalloc);
3745#endif
3746
3747/**
3748 * kmem_cache_free - Deallocate an object
3749 * @cachep: The cache the allocation was from.
3750 * @objp: The previously allocated object.
3751 *
3752 * Free an object which was previously allocated from this
3753 * cache.
3754 */
3755void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3756{
3757        unsigned long flags;
3758
3759        local_irq_save(flags);
3760        debug_check_no_locks_freed(objp, obj_size(cachep));
3761        if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3762                debug_check_no_obj_freed(objp, obj_size(cachep));
3763        __cache_free(cachep, objp);
3764        local_irq_restore(flags);
3765
3766        trace_kmem_cache_free(_RET_IP_, objp);
3767}
3768EXPORT_SYMBOL(kmem_cache_free);
3769
3770/**
3771 * kfree - free previously allocated memory
3772 * @objp: pointer returned by kmalloc.
3773 *
3774 * If @objp is NULL, no operation is performed.
3775 *
3776 * Don't free memory not originally allocated by kmalloc()
3777 * or you will run into trouble.
3778 */
3779void kfree(const void *objp)
3780{
3781        struct kmem_cache *c;
3782        unsigned long flags;
3783
3784        trace_kfree(_RET_IP_, objp);
3785
3786        if (unlikely(ZERO_OR_NULL_PTR(objp)))
3787                return;
3788        local_irq_save(flags);
3789        kfree_debugcheck(objp);
3790        c = virt_to_cache(objp);
3791        debug_check_no_locks_freed(objp, obj_size(c));
3792        debug_check_no_obj_freed(objp, obj_size(c));
3793        __cache_free(c, (void *)objp);
3794        local_irq_restore(flags);
3795}
3796EXPORT_SYMBOL(kfree);
3797
/* Report obj_size() of @cachep. */
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
	unsigned int size = obj_size(cachep);

	return size;
}
EXPORT_SYMBOL(kmem_cache_size);
3803
3804const char *kmem_cache_name(struct kmem_cache *cachep)
3805{
3806        return cachep->name;
3807}
3808EXPORT_SYMBOL_GPL(kmem_cache_name);
3809
3810/*
3811 * This initializes kmem_list3 or resizes various caches for all nodes.
3812 */
3813static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3814{
3815        int node;
3816        struct kmem_list3 *l3;
3817        struct array_cache *new_shared;
3818        struct array_cache **new_alien = NULL;
3819
3820        for_each_online_node(node) {
3821
3822                if (use_alien_caches) {
3823                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3824                        if (!new_alien)
3825                                goto fail;
3826                }
3827
3828                new_shared = NULL;
3829                if (cachep->shared) {
3830                        new_shared = alloc_arraycache(node,
3831                                cachep->shared*cachep->batchcount,
3832                                        0xbaadf00d, gfp);
3833                        if (!new_shared) {
3834                                free_alien_cache(new_alien);
3835                                goto fail;
3836                        }
3837                }
3838
3839                l3 = cachep->nodelists[node];
3840                if (l3) {
3841                        struct array_cache *shared = l3->shared;
3842
3843                        spin_lock_irq(&l3->list_lock);
3844
3845                        if (shared)
3846                                free_block(cachep, shared->entry,
3847                                                shared->avail, node);
3848
3849                        l3->shared = new_shared;
3850                        if (!l3->alien) {
3851                                l3->alien = new_alien;
3852                                new_alien = NULL;
3853                        }
3854                        l3->free_limit = (1 + nr_cpus_node(node)) *
3855                                        cachep->batchcount + cachep->num;
3856                        spin_unlock_irq(&l3->list_lock);
3857                        kfree(shared);
3858                        free_alien_cache(new_alien);
3859                        continue;
3860                }
3861                l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3862                if (!l3) {
3863                        free_alien_cache(new_alien);
3864                        kfree(new_shared);
3865                        goto fail;
3866                }
3867
3868                kmem_list3_init(l3);
3869                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3870                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3871                l3->shared = new_shared;
3872                l3->alien = new_alien;
3873                l3->free_limit = (1 + nr_cpus_node(node)) *
3874                                        cachep->batchcount + cachep->num;
3875                cachep->nodelists[node] = l3;
3876        }
3877        return 0;
3878
3879fail:
3880        if (!cachep->next.next) {
3881                /* Cache is not active yet. Roll back what we did */
3882                node--;
3883                while (node >= 0) {
3884                        if (cachep->nodelists[node]) {
3885                                l3 = cachep->nodelists[node];
3886
3887                                kfree(l3->shared);
3888                                free_alien_cache(l3->alien);
3889                                kfree(l3);
3890                                cachep->nodelists[node] = NULL;
3891                        }
3892                        node--;
3893                }
3894        }
3895        return -ENOMEM;
3896}
3897
3898struct ccupdate_struct {
3899        struct kmem_cache *cachep;
3900        struct array_cache *new[NR_CPUS];
3901};
3902
3903static void do_ccupdate_local(void *info)
3904{
3905        struct ccupdate_struct *new = info;
3906        struct array_cache *old;
3907
3908        check_irq_off();
3909        old = cpu_cache_get(new->cachep);
3910
3911        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3912        new->new[smp_processor_id()] = old;
3913}
3914
3915/* Always called with the cache_chain_mutex held */
3916static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3917                                int batchcount, int shared, gfp_t gfp)
3918{
3919        struct ccupdate_struct *new;
3920        int i;
3921
3922        new = kzalloc(sizeof(*new), gfp);
3923        if (!new)
3924                return -ENOMEM;
3925
3926        for_each_online_cpu(i) {
3927                new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3928                                                batchcount, gfp);
3929                if (!new->new[i]) {
3930                        for (i--; i >= 0; i--)
3931                                kfree(new->new[i]);
3932                        kfree(new);
3933                        return -ENOMEM;
3934                }
3935        }
3936        new->cachep = cachep;
3937
3938        on_each_cpu(do_ccupdate_local, (void *)new, 1);
3939
3940        check_irq_on();
3941        cachep->batchcount = batchcount;
3942        cachep->limit = limit;
3943        cachep->shared = shared;
3944
3945        for_each_online_cpu(i) {
3946                struct array_cache *ccold = new->new[i];
3947                if (!ccold)
3948                        continue;
3949                spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3950                free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3951                spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3952                kfree(ccold);
3953        }
3954        kfree(new);
3955        return alloc_kmemlist(cachep, gfp);
3956}
3957
3958/* Called with cache_chain_mutex held always */
3959static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3960{
3961        int err;
3962        int limit, shared;
3963
3964        /*
3965         * The head array serves three purposes:
3966         * - create a LIFO ordering, i.e. return objects that are cache-warm
3967         * - reduce the number of spinlock operations.
3968         * - reduce the number of linked list operations on the slab and
3969         *   bufctl chains: array operations are cheaper.
3970         * The numbers are guessed, we should auto-tune as described by
3971         * Bonwick.
3972         */
3973        if (cachep->buffer_size > 131072)
3974                limit = 1;
3975        else if (cachep->buffer_size > PAGE_SIZE)
3976                limit = 8;
3977        else if (cachep->buffer_size > 1024)
3978                limit = 24;
3979        else if (cachep->buffer_size > 256)
3980                limit = 54;
3981        else
3982                limit = 120;
3983
3984        /*
3985         * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3986         * allocation behaviour: Most allocs on one cpu, most free operations
3987         * on another cpu. For these cases, an efficient object passing between
3988         * cpus is necessary. This is provided by a shared array. The array
3989         * replaces Bonwick's magazine layer.
3990         * On uniprocessor, it's functionally equivalent (but less efficient)
3991         * to a larger limit. Thus disabled by default.
3992         */
3993        shared = 0;
3994        if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
3995                shared = 8;
3996
3997#if DEBUG
3998        /*
3999         * With debugging enabled, large batchcount lead to excessively long
4000         * periods with disabled local interrupts. Limit the batchcount
4001         */
4002        if (limit > 32)
4003                limit = 32;
4004#endif
4005        err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
4006        if (err)
4007                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4008                       cachep->name, -err);
4009        return err;
4010}
4011
4012/*
4013 * Drain an array if it contains any elements taking the l3 lock only if
4014 * necessary. Note that the l3 listlock also protects the array_cache
4015 * if drain_array() is used on the shared array.
4016 */
4017void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4018                         struct array_cache *ac, int force, int node)
4019{
4020        int tofree;
4021
4022        if (!ac || !ac->avail)
4023                return;
4024        if (ac->touched && !force) {
4025                ac->touched = 0;
4026        } else {
4027                spin_lock_irq(&l3->list_lock);
4028                if (ac->avail) {
4029                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
4030                        if (tofree > ac->avail)
4031                                tofree = (ac->avail + 1) / 2;
4032                        free_block(cachep, ac->entry, tofree, node);
4033                        ac->avail -= tofree;
4034                        memmove(ac->entry, &(ac->entry[tofree]),
4035                                sizeof(void *) * ac->avail);
4036                }
4037                spin_unlock_irq(&l3->list_lock);
4038        }
4039}
4040
4041/**
4042 * cache_reap - Reclaim memory from caches.
4043 * @w: work descriptor
4044 *
4045 * Called from workqueue/eventd every few seconds.
4046 * Purpose:
4047 * - clear the per-cpu caches for this CPU.
4048 * - return freeable pages to the main free memory pool.
4049 *
4050 * If we cannot acquire the cache chain mutex then just give up - we'll try
4051 * again on the next iteration.
4052 */
4053static void cache_reap(struct work_struct *w)
4054{
4055        struct kmem_cache *searchp;
4056        struct kmem_list3 *l3;
4057        int node = numa_node_id();
4058        struct delayed_work *work = to_delayed_work(w);
4059
4060        if (!mutex_trylock(&cache_chain_mutex))
4061                /* Give up. Setup the next iteration. */
4062                goto out;
4063
4064        list_for_each_entry(searchp, &cache_chain, next) {
4065                check_irq_on();
4066
4067                /*
4068                 * We only take the l3 lock if absolutely necessary and we
4069                 * have established with reasonable certainty that
4070                 * we can do some work if the lock was obtained.
4071                 */
4072                l3 = searchp->nodelists[node];
4073
4074                reap_alien(searchp, l3);
4075
4076                drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4077
4078                /*
4079                 * These are racy checks but it does not matter
4080                 * if we skip one check or scan twice.
4081                 */
4082                if (time_after(l3->next_reap, jiffies))
4083                        goto next;
4084
4085                l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4086
4087                drain_array(searchp, l3, l3->shared, 0, node);
4088
4089                if (l3->free_touched)
4090                        l3->free_touched = 0;
4091                else {
4092                        int freed;
4093
4094                        freed = drain_freelist(searchp, l3, (l3->free_limit +
4095                                5 * searchp->num - 1) / (5 * searchp->num));
4096                        STATS_ADD_REAPED(searchp, freed);
4097                }
4098next:
4099                cond_resched();
4100        }
4101        check_irq_on();
4102        mutex_unlock(&cache_chain_mutex);
4103        next_reap_node();
4104out:
4105        /* Set up the next iteration */
4106        schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
4107}
4108
4109#ifdef CONFIG_SLABINFO
4110
/*
 * Emit the slabinfo header: a version line followed by one line naming
 * the columns (extended with statistics columns when STATS is set).
 */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
#if STATS
	const char *version = "slabinfo - version: 2.1 (statistics)\n";
#else
	const char *version = "slabinfo - version: 2.1\n";
#endif

	seq_puts(m, version);
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
		 "<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
	seq_putc(m, '\n');
}
4133
4134static void *s_start(struct seq_file *m, loff_t *pos)
4135{
4136        loff_t n = *pos;
4137
4138        mutex_lock(&cache_chain_mutex);
4139        if (!n)
4140                print_slabinfo_header(m);
4141
4142        return seq_list_start(&cache_chain, *pos);
4143}
4144
4145static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4146{
4147        return seq_list_next(p, &cache_chain, pos);
4148}
4149
4150static void s_stop(struct seq_file *m, void *p)
4151{
4152        mutex_unlock(&cache_chain_mutex);
4153}
4154
4155static int s_show(struct seq_file *m, void *p)
4156{
4157        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4158        struct slab *slabp;
4159        unsigned long active_objs;
4160        unsigned long num_objs;
4161        unsigned long active_slabs = 0;
4162        unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4163        const char *name;
4164        char *error = NULL;
4165        int node;
4166        struct kmem_list3 *l3;
4167
4168        active_objs = 0;
4169        num_slabs = 0;
4170        for_each_online_node(node) {
4171                l3 = cachep->nodelists[node];
4172                if (!l3)
4173                        continue;
4174
4175                check_irq_on();
4176                spin_lock_irq(&l3->list_lock);
4177
4178                list_for_each_entry(slabp, &l3->slabs_full, list) {
4179                        if (slabp->inuse != cachep->num && !error)
4180                                error = "slabs_full accounting error";
4181                        active_objs += cachep->num;
4182                        active_slabs++;
4183                }
4184                list_for_each_entry(slabp, &l3->slabs_partial, list) {
4185                        if (slabp->inuse == cachep->num && !error)
4186                                error = "slabs_partial inuse accounting error";
4187                        if (!slabp->inuse && !error)
4188                                error = "slabs_partial/inuse accounting error";
4189                        active_objs += slabp->inuse;
4190                        active_slabs++;
4191                }
4192                list_for_each_entry(slabp, &l3->slabs_free, list) {
4193                        if (slabp->inuse && !error)
4194                                error = "slabs_free/inuse accounting error";
4195                        num_slabs++;
4196                }
4197                free_objects += l3->free_objects;
4198                if (l3->shared)
4199                        shared_avail += l3->shared->avail;
4200
4201                spin_unlock_irq(&l3->list_lock);
4202        }
4203        num_slabs += active_slabs;
4204        num_objs = num_slabs * cachep->num;
4205        if (num_objs - active_objs != free_objects && !error)
4206                error = "free_objects accounting error";
4207
4208        name = cachep->name;
4209        if (error)
4210                printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4211
4212        seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4213                   name, active_objs, num_objs, cachep->buffer_size,
4214                   cachep->num, (1 << cachep->gfporder));
4215        seq_printf(m, " : tunables %4u %4u %4u",
4216                   cachep->limit, cachep->batchcount, cachep->shared);
4217        seq_printf(m, " : slabdata %6lu %6lu %6lu",
4218                   active_slabs, num_slabs, shared_avail);
4219#if STATS
4220        {                       /* list3 stats */
4221                unsigned long high = cachep->high_mark;
4222                unsigned long allocs = cachep->num_allocations;
4223                unsigned long grown = cachep->grown;
4224                unsigned long reaped = cachep->reaped;
4225                unsigned long errors = cachep->errors;
4226                unsigned long max_freeable = cachep->max_freeable;
4227                unsigned long node_allocs = cachep->node_allocs;
4228                unsigned long node_frees = cachep->node_frees;
4229                unsigned long overflows = cachep->node_overflow;
4230
4231                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
4232                                %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4233                                reaped, errors, max_freeable, node_allocs,
4234                                node_frees, overflows);
4235        }
4236        /* cpu stats */
4237        {
4238                unsigned long allochit = atomic_read(&cachep->allochit);
4239                unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4240                unsigned long freehit = atomic_read(&cachep->freehit);
4241                unsigned long freemiss = atomic_read(&cachep->freemiss);
4242
4243                seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4244                           allochit, allocmiss, freehit, freemiss);
4245        }
4246#endif
4247        seq_putc(m, '\n');
4248        return 0;
4249}
4250
4251/*
4252 * slabinfo_op - iterator that generates /proc/slabinfo
4253 *
4254 * Output layout:
4255 * cache-name
4256 * num-active-objs
4257 * total-objs
4258 * object size
4259 * num-active-slabs
4260 * total-slabs
4261 * num-pages-per-slab
4262 * + further values on SMP and with statistics enabled
4263 */
4264
4265static const struct seq_operations slabinfo_op = {
4266        .start = s_start,
4267        .next = s_next,
4268        .stop = s_stop,
4269        .show = s_show,
4270};
4271
4272#define MAX_SLABINFO_WRITE 128
4273/**
4274 * slabinfo_write - Tuning for the slab allocator
4275 * @file: unused
4276 * @buffer: user buffer
4277 * @count: data length
4278 * @ppos: unused
4279 */
4280ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4281                       size_t count, loff_t *ppos)
4282{
4283        char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4284        int limit, batchcount, shared, res;
4285        struct kmem_cache *cachep;
4286
4287        if (count > MAX_SLABINFO_WRITE)
4288                return -EINVAL;
4289        if (copy_from_user(&kbuf, buffer, count))
4290                return -EFAULT;
4291        kbuf[MAX_SLABINFO_WRITE] = '\0';
4292
4293        tmp = strchr(kbuf, ' ');
4294        if (!tmp)
4295                return -EINVAL;
4296        *tmp = '\0';
4297        tmp++;
4298        if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4299                return -EINVAL;
4300
4301        /* Find the cache in the chain of caches. */
4302        mutex_lock(&cache_chain_mutex);
4303        res = -EINVAL;
4304        list_for_each_entry(cachep, &cache_chain, next) {
4305                if (!strcmp(cachep->name, kbuf)) {
4306                        if (limit < 1 || batchcount < 1 ||
4307                                        batchcount > limit || shared < 0) {
4308                                res = 0;
4309                        } else {
4310                                res = do_tune_cpucache(cachep, limit,
4311                                                       batchcount, shared,
4312                                                       GFP_KERNEL);
4313                        }
4314                        break;
4315                }
4316        }
4317        mutex_unlock(&cache_chain_mutex);
4318        if (res >= 0)
4319                res = count;
4320        return res;
4321}
4322
4323static int slabinfo_open(struct inode *inode, struct file *file)
4324{
4325        return seq_open(file, &slabinfo_op);
4326}
4327
4328static const struct file_operations proc_slabinfo_operations = {
4329        .open           = slabinfo_open,
4330        .read           = seq_read,
4331        .write          = slabinfo_write,
4332        .llseek         = seq_lseek,
4333        .release        = seq_release,
4334};
4335
4336#ifdef CONFIG_DEBUG_SLAB_LEAK
4337
4338static void *leaks_start(struct seq_file *m, loff_t *pos)
4339{
4340        mutex_lock(&cache_chain_mutex);
4341        return seq_list_start(&cache_chain, *pos);
4342}
4343
/*
 * add_caller - account one more occurrence of value @v in table @n
 *
 * Table layout: n[0] is the capacity and n[1] the number of entries in
 * use, both counted in pairs of unsigned long; the entries follow from
 * n[2] on as (value, count) pairs kept sorted by ascending value.
 *
 * A zero @v is ignored. If @v is already present its count is bumped;
 * otherwise a new pair with count 1 is inserted at its sorted position.
 *
 * Returns 1 on success (or when @v was ignored). Returns 0 when the table
 * is full; in that case n[1] has been advanced so that n[0] == n[1],
 * which is the condition the callers test for.
 */
static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *entries = n + 2;
	unsigned long *slot;
	unsigned long *end;
	int lo, hi;

	if (!v)
		return 1;

	/* Binary search over the sorted pairs in [lo, hi). */
	lo = 0;
	hi = n[1];
	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;
		unsigned long *probe = entries + 2 * mid;

		if (probe[0] == v) {
			probe[1]++;
			return 1;
		}
		if (probe[0] > v)
			hi = mid;
		else
			lo = mid + 1;
	}
	slot = entries + 2 * lo;

	if (++n[1] == n[0])
		return 0;

	/*
	 * n[1] now includes the new pair, so n + 2 * n[1] is where the
	 * previously stored pairs end. Shift everything from the insertion
	 * point up by one pair.
	 */
	end = n + 2 * n[1];
	memmove(slot + 2, slot, (end - slot) * sizeof(unsigned long));
	slot[0] = v;
	slot[1] = 1;
	return 1;
}
4373
4374static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4375{
4376        void *p;
4377        int i;
4378        if (n[0] == n[1])
4379                return;
4380        for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4381                if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4382                        continue;
4383                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4384                        return;
4385        }
4386}
4387
/*
 * show_symbol - print @address to @m in symbolic form if possible
 *
 * With CONFIG_KALLSYMS, a successful lookup_symbol_attrs() yields
 * "name+offset/size", followed by " [module]" when a module name was
 * returned. Without it, or when the lookup fails, the raw pointer value
 * is printed instead.
 */
static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	char name[KSYM_NAME_LEN];
	char modname[MODULE_NAME_LEN];
	unsigned long size, offset;

	if (!lookup_symbol_attrs(address, &size, &offset, modname, name)) {
		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
		if (modname[0] != '\0')
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}
4403
4404static int leaks_show(struct seq_file *m, void *p)
4405{
4406        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4407        struct slab *slabp;
4408        struct kmem_list3 *l3;
4409        const char *name;
4410        unsigned long *n = m->private;
4411        int node;
4412        int i;
4413
4414        if (!(cachep->flags & SLAB_STORE_USER))
4415                return 0;
4416        if (!(cachep->flags & SLAB_RED_ZONE))
4417                return 0;
4418
4419        /* OK, we can do it */
4420
4421        n[1] = 0;
4422
4423        for_each_online_node(node) {
4424                l3 = cachep->nodelists[node];
4425                if (!l3)
4426                        continue;
4427
4428                check_irq_on();
4429                spin_lock_irq(&l3->list_lock);
4430
4431                list_for_each_entry(slabp, &l3->slabs_full, list)
4432                        handle_slab(n, cachep, slabp);
4433                list_for_each_entry(slabp, &l3->slabs_partial, list)
4434                        handle_slab(n, cachep, slabp);
4435                spin_unlock_irq(&l3->list_lock);
4436        }
4437        name = cachep->name;
4438        if (n[0] == n[1]) {
4439                /* Increase the buffer size */
4440                mutex_unlock(&cache_chain_mutex);
4441                m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4442                if (!m->private) {
4443                        /* Too bad, we are really out */
4444                        m->private = n;
4445                        mutex_lock(&cache_chain_mutex);
4446                        return -ENOMEM;
4447                }
4448                *(unsigned long *)m->private = n[0] * 2;
4449                kfree(n);
4450                mutex_lock(&cache_chain_mutex);
4451                /* Now make sure this entry will be retried */
4452                m->count = m->size;
4453                return 0;
4454        }
4455        for (i = 0; i < n[1]; i++) {
4456                seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4457                show_symbol(m, n[2*i+2]);
4458                seq_putc(m, '\n');
4459        }
4460
4461        return 0;
4462}
4463
4464static const struct seq_operations slabstats_op = {
4465        .start = leaks_start,
4466        .next = s_next,
4467        .stop = s_stop,
4468        .show = leaks_show,
4469};
4470
4471static int slabstats_open(struct inode *inode, struct file *file)
4472{
4473        unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4474        int ret = -ENOMEM;
4475        if (n) {
4476                ret = seq_open(file, &slabstats_op);
4477                if (!ret) {
4478                        struct seq_file *m = file->private_data;
4479                        *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4480                        m->private = n;
4481                        n = NULL;
4482                }
4483                kfree(n);
4484        }
4485        return ret;
4486}
4487
4488static const struct file_operations proc_slabstats_operations = {
4489        .open           = slabstats_open,
4490        .read           = seq_read,
4491        .llseek         = seq_lseek,
4492        .release        = seq_release_private,
4493};
4494#endif
4495
4496static int __init slab_proc_init(void)
4497{
4498        proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4499#ifdef CONFIG_DEBUG_SLAB_LEAK
4500        proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4501#endif
4502        return 0;
4503}
4504module_init(slab_proc_init);
4505#endif
4506
4507/**
4508 * ksize - get the actual amount of memory allocated for a given object
4509 * @objp: Pointer to the object
4510 *
4511 * kmalloc may internally round up allocations and return more memory
4512 * than requested. ksize() can be used to determine the actual amount of
4513 * memory allocated. The caller may use this additional memory, even though
4514 * a smaller amount of memory was initially specified with the kmalloc call.
4515 * The caller must guarantee that objp points to a valid object previously
4516 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4517 * must not be freed during the duration of the call.
4518 */
4519size_t ksize(const void *objp)
4520{
4521        BUG_ON(!objp);
4522        if (unlikely(objp == ZERO_SIZE_PTR))
4523                return 0;
4524
4525        return obj_size(virt_to_cache(objp));
4526}
4527EXPORT_SYMBOL(ksize);
4528
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.