   1/*
   2 * linux/mm/percpu.c - percpu memory allocator
   3 *
   4 * Copyright (C) 2009           SUSE Linux Products GmbH
   5 * Copyright (C) 2009           Tejun Heo <tj@kernel.org>
   6 *
   7 * This file is released under the GPLv2.
   8 *
   9 * This is the percpu allocator which can handle both static and dynamic
  10 * areas.  Percpu areas are allocated in chunks in the vmalloc area.  Each
  11 * chunk consists of num_possible_cpus() units and the first chunk
  12 * is used for static percpu variables in the kernel image (special
  13 * boot time alloc/init handling necessary as these areas need to be
  14 * brought up before allocation services are running).  Units grow as
  15 * necessary and all units grow or shrink in unison.  When a chunk is
  16 * filled up, another chunk is allocated, again in the vmalloc area:
  17 *
  18 *  c0                           c1                         c2
  19 *  -------------------          -------------------        ------------
  20 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
  21 *  -------------------  ......  -------------------  ....  ------------
  22 *
  23 * Allocation is done in offset-size areas of the single unit space, i.e.,
  24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  25 * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
  26 * percpu base registers UNIT_SIZE apart.
  27 *
  28 * There are usually many small percpu allocations, many of them as
  29 * small as 4 bytes.  The allocator organizes chunks into lists
  30 * according to free size and tries to allocate from the fullest one.
  31 * Each chunk keeps the maximum contiguous area size hint which is
  32 * guaranteed to be equal to or larger than the maximum contiguous
  33 * area in the chunk.  This helps the allocator not to iterate the
  34 * chunk maps unnecessarily.
  35 *
  36 * Allocation state in each chunk is kept using an array of integers
  37 * on chunk->map.  A positive value in the map represents a free
  38 * region and negative allocated.  Allocation inside a chunk is done
  39 * by scanning this map sequentially and serving the first matching
  40 * entry.  This is mostly copied from the percpu_modalloc() allocator.
  41 * Chunks are also linked into an rb tree to ease address to chunk
  42 * mapping during free.
  43 *
  44 * To use this allocator, arch code should do the following.
  45 *
  46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
  47 *
  48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  49 *   regular address to percpu pointer and back if they need to be
  50 *   different from the default
  51 *
  52 * - use pcpu_setup_first_chunk() during percpu area initialization to
  53 *   setup the first chunk containing the kernel static percpu area
  54 */
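
    /*
     * An illustrative example added for exposition (not from the original
     * sources), assuming a 64k unit: a brand new chunk starts with a
     * single map entry { 65536 } covering the whole free unit.  After a
     * 4k allocation at offset 0 followed by a 256 byte allocation, the
     * map becomes { -4096, -256, 61184 } - two allocated areas and the
     * remaining free area.  Offsets are implicit: each area starts at the
     * running sum of the absolute values of the preceding entries.
     */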
  55
  56#include <linux/bitmap.h>
  57#include <linux/bootmem.h>
  58#include <linux/list.h>
  59#include <linux/mm.h>
  60#include <linux/module.h>
  61#include <linux/mutex.h>
  62#include <linux/percpu.h>
  63#include <linux/pfn.h>
  64#include <linux/rbtree.h>
  65#include <linux/slab.h>
  66#include <linux/spinlock.h>
  67#include <linux/vmalloc.h>
  68#include <linux/workqueue.h>
  69
  70#include <asm/cacheflush.h>
  71#include <asm/sections.h>
  72#include <asm/tlbflush.h>
  73
  74#define PCPU_SLOT_BASE_SHIFT            5       /* 1-31 share the same slot */
  75#define PCPU_DFL_MAP_ALLOC              16      /* start a map with 16 ents */
  76
  77/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
  78#ifndef __addr_to_pcpu_ptr
  79#define __addr_to_pcpu_ptr(addr)                                        \
  80        (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr  \
  81                 + (unsigned long)__per_cpu_start)
  82#endif
  83#ifndef __pcpu_ptr_to_addr
  84#define __pcpu_ptr_to_addr(ptr)                                         \
  85        (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr   \
  86                 - (unsigned long)__per_cpu_start)
  87#endif
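
    /*
     * Note for illustration (not from the original sources): with the
     * default mapping above, an address inside unit 0 of the first chunk,
     * say pcpu_base_addr + off, translates to the percpu pointer
     * __per_cpu_start + off.  Dynamic percpu pointers thus live in the
     * same "offset from __per_cpu_start" space as static percpu symbols,
     * so the same per-cpu offset arithmetic works for both.
     */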
  88
  89struct pcpu_chunk {
  90        struct list_head        list;           /* linked to pcpu_slot lists */
  91        struct rb_node          rb_node;        /* key is chunk->vm->addr */
  92        int                     free_size;      /* free bytes in the chunk */
  93        int                     contig_hint;    /* max contiguous size hint */
  94        struct vm_struct        *vm;            /* mapped vmalloc region */
  95        int                     map_used;       /* # of map entries used */
  96        int                     map_alloc;      /* # of map entries allocated */
  97        int                     *map;           /* allocation map */
  98        bool                    immutable;      /* no [de]population allowed */
  99        struct page             **page;         /* points to page array */
 100        struct page             *page_ar[];     /* #cpus * UNIT_PAGES */
 101};
 102
 103static int pcpu_unit_pages __read_mostly;
 104static int pcpu_unit_size __read_mostly;
 105static int pcpu_chunk_size __read_mostly;
 106static int pcpu_nr_slots __read_mostly;
 107static size_t pcpu_chunk_struct_size __read_mostly;
 108
 109/* the address of the first chunk which starts with the kernel static area */
 110void *pcpu_base_addr __read_mostly;
 111EXPORT_SYMBOL_GPL(pcpu_base_addr);
 112
 113/* optional reserved chunk, only accessible for reserved allocations */
 114static struct pcpu_chunk *pcpu_reserved_chunk;
 115/* offset limit of the reserved chunk */
 116static int pcpu_reserved_chunk_limit;
 117
 118/*
 119 * Synchronization rules.
 120 *
 121 * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 122 * protects allocation/reclaim paths, chunks and chunk->page arrays.
 123 * The latter is a spinlock and protects the index data structures -
 124 * chunk slots, rbtree, chunks and area maps in chunks.
 125 *
 126 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 127 * pcpu_lock is grabbed and released as necessary.  All actual memory
 128 * allocations are done using GFP_KERNEL with pcpu_lock released.
 129 *
 130 * Free path accesses and alters only the index data structures, so it
 131 * can be safely called from atomic context.  When memory needs to be
 132 * returned to the system, free path schedules reclaim_work which
 133 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
 134 * reclaimed, releases both locks and frees the chunks.  Note that it's
 135 * necessary to grab both locks to remove a chunk from circulation as
 136 * allocation path might be referencing the chunk with only
 137 * pcpu_alloc_mutex locked.
 138 */
 139static DEFINE_MUTEX(pcpu_alloc_mutex);  /* protects whole alloc and reclaim */
 140static DEFINE_SPINLOCK(pcpu_lock);      /* protects index data structures */
 141
 142static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 143static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
 144
 145/* reclaim work to release fully free chunks, scheduled from free path */
 146static void pcpu_reclaim(struct work_struct *work);
 147static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
 148
 149static int __pcpu_size_to_slot(int size)
 150{
 151        int highbit = fls(size);        /* size is in bytes */
 152        return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
 153}
 154
 155static int pcpu_size_to_slot(int size)
 156{
 157        if (size == pcpu_unit_size)
 158                return pcpu_nr_slots - 1;
 159        return __pcpu_size_to_slot(size);
 160}
 161
 162static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 163{
 164        if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
 165                return 0;
 166
 167        return pcpu_size_to_slot(chunk->free_size);
 168}
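
    /*
     * Worked example for illustration (not from the original sources):
     * with PCPU_SLOT_BASE_SHIFT == 5, __pcpu_size_to_slot(100) is
     * max(fls(100) - 5 + 2, 1) == 4 and __pcpu_size_to_slot(3000) == 9.
     * A fully free chunk (free_size == pcpu_unit_size) always goes to the
     * last slot, pcpu_nr_slots - 1, while a chunk whose free or contiguous
     * space is smaller than sizeof(int) falls into slot 0.
     */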
 169
 170static int pcpu_page_idx(unsigned int cpu, int page_idx)
 171{
 172        return cpu * pcpu_unit_pages + page_idx;
 173}
 174
 175static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
 176                                      unsigned int cpu, int page_idx)
 177{
 178        return &chunk->page[pcpu_page_idx(cpu, page_idx)];
 179}
 180
 181static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 182                                     unsigned int cpu, int page_idx)
 183{
 184        return (unsigned long)chunk->vm->addr +
 185                (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
 186}
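
    /*
     * Worked example for illustration (not from the original sources),
     * assuming 4k pages and pcpu_unit_pages == 16 (a 64k unit): page 3 of
     * cpu 2 lives at index 2 * 16 + 3 == 35 in chunk->page[] and at
     * chunk->vm->addr + (35 << PAGE_SHIFT), i.e. 140k into the chunk's
     * vmalloc area.
     */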
 187
 188static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 189                                     int page_idx)
 190{
 191        return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 192}
 193
 194/**
 195 * pcpu_mem_alloc - allocate memory
 196 * @size: bytes to allocate
 197 *
 198 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 199 * kzalloc() is used; otherwise, vmalloc() is used.  The returned
 200 * memory is always zeroed.
 201 *
 202 * CONTEXT:
 203 * Does GFP_KERNEL allocation.
 204 *
 205 * RETURNS:
 206 * Pointer to the allocated area on success, NULL on failure.
 207 */
 208static void *pcpu_mem_alloc(size_t size)
 209{
 210        if (size <= PAGE_SIZE)
 211                return kzalloc(size, GFP_KERNEL);
 212        else {
 213                void *ptr = vmalloc(size);
 214                if (ptr)
 215                        memset(ptr, 0, size);
 216                return ptr;
 217        }
 218}
 219
 220/**
 221 * pcpu_mem_free - free memory
 222 * @ptr: memory to free
 223 * @size: size of the area
 224 *
 225 * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
 226 */
 227static void pcpu_mem_free(void *ptr, size_t size)
 228{
 229        if (size <= PAGE_SIZE)
 230                kfree(ptr);
 231        else
 232                vfree(ptr);
 233}
 234
 235/**
 236 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 237 * @chunk: chunk of interest
 238 * @oslot: the previous slot it was on
 239 *
 240 * This function is called after an allocation or free changed @chunk.
 241 * New slot according to the changed state is determined and @chunk is
 242 * moved to the slot.  Note that the reserved chunk is never put on
 243 * chunk slots.
 244 *
 245 * CONTEXT:
 246 * pcpu_lock.
 247 */
 248static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 249{
 250        int nslot = pcpu_chunk_slot(chunk);
 251
 252        if (chunk != pcpu_reserved_chunk && oslot != nslot) {
 253                if (oslot < nslot)
 254                        list_move(&chunk->list, &pcpu_slot[nslot]);
 255                else
 256                        list_move_tail(&chunk->list, &pcpu_slot[nslot]);
 257        }
 258}
 259
 260static struct rb_node **pcpu_chunk_rb_search(void *addr,
 261                                             struct rb_node **parentp)
 262{
 263        struct rb_node **p = &pcpu_addr_root.rb_node;
 264        struct rb_node *parent = NULL;
 265        struct pcpu_chunk *chunk;
 266
 267        while (*p) {
 268                parent = *p;
 269                chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
 270
 271                if (addr < chunk->vm->addr)
 272                        p = &(*p)->rb_left;
 273                else if (addr > chunk->vm->addr)
 274                        p = &(*p)->rb_right;
 275                else
 276                        break;
 277        }
 278
 279        if (parentp)
 280                *parentp = parent;
 281        return p;
 282}
 283
 284/**
 285 * pcpu_chunk_addr_search - search for chunk containing specified address
 286 * @addr: address to search for
 287 *
 288 * Look for the chunk which might contain @addr.  More specifically, it
 289 * searches for the chunk with the highest start address which isn't
 290 * beyond @addr.
 291 *
 292 * CONTEXT:
 293 * pcpu_lock.
 294 *
 295 * RETURNS:
 296 * Pointer to the found chunk.
 297 */
 298static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 299{
 300        struct rb_node *n, *parent;
 301        struct pcpu_chunk *chunk;
 302
 303        /* is it in the reserved chunk? */
 304        if (pcpu_reserved_chunk) {
 305                void *start = pcpu_reserved_chunk->vm->addr;
 306
 307                if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
 308                        return pcpu_reserved_chunk;
 309        }
 310
 311        /* nah... search the regular ones */
 312        n = *pcpu_chunk_rb_search(addr, &parent);
 313        if (!n) {
 314                /* no exactly matching chunk, the parent is the closest */
 315                n = parent;
 316                BUG_ON(!n);
 317        }
 318        chunk = rb_entry(n, struct pcpu_chunk, rb_node);
 319
 320        if (addr < chunk->vm->addr) {
 321                /* the parent was the next one, look for the previous one */
 322                n = rb_prev(n);
 323                BUG_ON(!n);
 324                chunk = rb_entry(n, struct pcpu_chunk, rb_node);
 325        }
 326
 327        return chunk;
 328}
 329
 330/**
 331 * pcpu_chunk_addr_insert - insert chunk into address rb tree
 332 * @new: chunk to insert
 333 *
 334 * Insert @new into address rb tree.
 335 *
 336 * CONTEXT:
 337 * pcpu_lock.
 338 */
 339static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 340{
 341        struct rb_node **p, *parent;
 342
 343        p = pcpu_chunk_rb_search(new->vm->addr, &parent);
 344        BUG_ON(*p);
 345        rb_link_node(&new->rb_node, parent, p);
 346        rb_insert_color(&new->rb_node, &pcpu_addr_root);
 347}
 348
 349/**
 350 * pcpu_extend_area_map - extend area map for allocation
 351 * @chunk: target chunk
 352 *
 353 * Extend area map of @chunk so that it can accommodate an allocation.
 354 * A single allocation can split an area into three areas, so this
 355 * function makes sure that @chunk->map has at least two extra slots.
 356 *
 357 * CONTEXT:
 358 * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
 359 * if area map is extended.
 360 *
 361 * RETURNS:
 362 * 0 if noop, 1 if successfully extended, -errno on failure.
 363 */
 364static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 365{
 366        int new_alloc;
 367        int *new;
 368        size_t size;
 369
 370        /* has enough? */
 371        if (chunk->map_alloc >= chunk->map_used + 2)
 372                return 0;
 373
 374        spin_unlock_irq(&pcpu_lock);
 375
 376        new_alloc = PCPU_DFL_MAP_ALLOC;
 377        while (new_alloc < chunk->map_used + 2)
 378                new_alloc *= 2;
 379
 380        new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
 381        if (!new) {
 382                spin_lock_irq(&pcpu_lock);
 383                return -ENOMEM;
 384        }
 385
 386        /*
 387         * Acquire pcpu_lock and switch to new area map.  Only free
 388         * could have happened in between, so map_used couldn't have
 389         * grown.
 390         */
 391        spin_lock_irq(&pcpu_lock);
 392        BUG_ON(new_alloc < chunk->map_used + 2);
 393
 394        size = chunk->map_alloc * sizeof(chunk->map[0]);
 395        memcpy(new, chunk->map, size);
 396
 397        /*
 398         * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
 399         * one of the first chunks and still using static map.
 400         */
 401        if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
 402                pcpu_mem_free(chunk->map, size);
 403
 404        chunk->map_alloc = new_alloc;
 405        chunk->map = new;
 406        return 0;
 407}
 408
 409/**
 410 * pcpu_split_block - split a map block
 411 * @chunk: chunk of interest
 412 * @i: index of map block to split
 413 * @head: head size in bytes (can be 0)
 414 * @tail: tail size in bytes (can be 0)
 415 *
 416 * Split the @i'th map block into two or three blocks.  If @head is
 417 * non-zero, a block of @head bytes is inserted before block @i, moving
 418 * it to @i+1 and reducing its size by @head bytes.
 419 *
 420 * If @tail is non-zero, the target block, which can be @i or @i+1
 421 * depending on @head, is reduced by @tail bytes and @tail byte block
 422 * is inserted after the target block.
 423 *
 424 * @chunk->map must have enough free slots to accommodate the split.
 425 *
 426 * CONTEXT:
 427 * pcpu_lock.
 428 */
 429static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 430                             int head, int tail)
 431{
 432        int nr_extra = !!head + !!tail;
 433
 434        BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
 435
 436        /* insert new subblocks */
 437        memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 438                sizeof(chunk->map[0]) * (chunk->map_used - i));
 439        chunk->map_used += nr_extra;
 440
 441        if (head) {
 442                chunk->map[i + 1] = chunk->map[i] - head;
 443                chunk->map[i++] = head;
 444        }
 445        if (tail) {
 446                chunk->map[i++] -= tail;
 447                chunk->map[i] = tail;
 448        }
 449}
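
    /*
     * Worked example for illustration (not from the original sources):
     * splitting a 1024 byte free block with head == 64 and tail == 448
     * turns the single map entry { 1024 } into { 64, 512, 448 }; the
     * caller (pcpu_alloc_area()) then negates the middle entry to mark
     * the 512 byte area allocated, leaving the head and tail free.
     */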
 450
 451/**
 452 * pcpu_alloc_area - allocate area from a pcpu_chunk
 453 * @chunk: chunk of interest
 454 * @size: wanted size in bytes
 455 * @align: wanted align
 456 *
 457 * Try to allocate @size bytes area aligned at @align from @chunk.
 458 * Note that this function only allocates the offset.  It doesn't
 459 * populate or map the area.
 460 *
 461 * @chunk->map must have at least two free slots.
 462 *
 463 * CONTEXT:
 464 * pcpu_lock.
 465 *
 466 * RETURNS:
 467 * Allocated offset in @chunk on success, -1 if no matching area is
 468 * found.
 469 */
 470static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 471{
 472        int oslot = pcpu_chunk_slot(chunk);
 473        int max_contig = 0;
 474        int i, off;
 475
 476        for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 477                bool is_last = i + 1 == chunk->map_used;
 478                int head, tail;
 479
 480                /* extra for alignment requirement */
 481                head = ALIGN(off, align) - off;
 482                BUG_ON(i == 0 && head != 0);
 483
 484                if (chunk->map[i] < 0)
 485                        continue;
 486                if (chunk->map[i] < head + size) {
 487                        max_contig = max(chunk->map[i], max_contig);
 488                        continue;
 489                }
 490
 491                /*
 492                 * If head is small or the previous block is free,
 493                 * merge'em.  Note that 'small' is defined as smaller
 494                 * than sizeof(int), which is very small but isn't too
 495                 * uncommon for percpu allocations.
 496                 */
 497                if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
 498                        if (chunk->map[i - 1] > 0)
 499                                chunk->map[i - 1] += head;
 500                        else {
 501                                chunk->map[i - 1] -= head;
 502                                chunk->free_size -= head;
 503                        }
 504                        chunk->map[i] -= head;
 505                        off += head;
 506                        head = 0;
 507                }
 508
 509                /* if tail is small, just keep it around */
 510                tail = chunk->map[i] - head - size;
 511                if (tail < sizeof(int))
 512                        tail = 0;
 513
 514                /* split if warranted */
 515                if (head || tail) {
 516                        pcpu_split_block(chunk, i, head, tail);
 517                        if (head) {
 518                                i++;
 519                                off += head;
 520                                max_contig = max(chunk->map[i - 1], max_contig);
 521                        }
 522                        if (tail)
 523                                max_contig = max(chunk->map[i + 1], max_contig);
 524                }
 525
 526                /* update hint and mark allocated */
 527                if (is_last)
 528                        chunk->contig_hint = max_contig; /* fully scanned */
 529                else
 530                        chunk->contig_hint = max(chunk->contig_hint,
 531                                                 max_contig);
 532
 533                chunk->free_size -= chunk->map[i];
 534                chunk->map[i] = -chunk->map[i];
 535
 536                pcpu_chunk_relocate(chunk, oslot);
 537                return off;
 538        }
 539
 540        chunk->contig_hint = max_contig;        /* fully scanned */
 541        pcpu_chunk_relocate(chunk, oslot);
 542
 543        /* tell the upper layer that this chunk has no matching area */
 544        return -1;
 545}
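
    /*
     * Worked trace for illustration (not from the original sources):
     * with map { -4096, -256, 61184 }, a request for 512 bytes aligned to
     * 1024 skips the two allocated areas, reaches the free area at offset
     * 4352 and needs head = ALIGN(4352, 1024) - 4352 = 768 and
     * tail = 61184 - 768 - 512 = 59904.  After the split and negation the
     * map reads { -4096, -256, 768, -512, 59904 } and the returned offset
     * is 5120.
     */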
 546
 547/**
 548 * pcpu_free_area - free area to a pcpu_chunk
 549 * @chunk: chunk of interest
 550 * @freeme: offset of area to free
 551 *
 552 * Free the area starting at @freeme in @chunk.  Note that this function
 553 * only modifies the allocation map.  It doesn't depopulate or unmap
 554 * the area.
 555 *
 556 * CONTEXT:
 557 * pcpu_lock.
 558 */
 559static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 560{
 561        int oslot = pcpu_chunk_slot(chunk);
 562        int i, off;
 563
 564        for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
 565                if (off == freeme)
 566                        break;
 567        BUG_ON(off != freeme);
 568        BUG_ON(chunk->map[i] > 0);
 569
 570        chunk->map[i] = -chunk->map[i];
 571        chunk->free_size += chunk->map[i];
 572
 573        /* merge with previous? */
 574        if (i > 0 && chunk->map[i - 1] >= 0) {
 575                chunk->map[i - 1] += chunk->map[i];
 576                chunk->map_used--;
 577                memmove(&chunk->map[i], &chunk->map[i + 1],
 578                        (chunk->map_used - i) * sizeof(chunk->map[0]));
 579                i--;
 580        }
 581        /* merge with next? */
 582        if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
 583                chunk->map[i] += chunk->map[i + 1];
 584                chunk->map_used--;
 585                memmove(&chunk->map[i + 1], &chunk->map[i + 2],
 586                        (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
 587        }
 588
 589        chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
 590        pcpu_chunk_relocate(chunk, oslot);
 591}
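
    /*
     * Worked example for illustration (not from the original sources):
     * freeing the 256 byte area in the map { -4096, -256, 61184 } first
     * flips its entry to 256 and then merges it with the free neighbour
     * on the right, leaving { -4096, 61440 }.  Free areas are coalesced
     * here so the map never keeps two free entries in a row.
     */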
 592
 593/**
 594 * pcpu_unmap - unmap pages out of a pcpu_chunk
 595 * @chunk: chunk of interest
 596 * @page_start: page index of the first page to unmap
 597 * @page_end: page index of the last page to unmap + 1
 598 * @flush: whether to flush cache and tlb or not
 599 *
 600 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 601 * If @flush is true, vcache is flushed before unmapping and tlb
 602 * after.
 603 */
 604static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 605                       bool flush)
 606{
 607        unsigned int last = num_possible_cpus() - 1;
 608        unsigned int cpu;
 609
 610        /* unmap must not be done on immutable chunk */
 611        WARN_ON(chunk->immutable);
 612
 613        /*
 614         * Each flushing trial can be very expensive, issue flush on
 615         * the whole region at once rather than doing it for each cpu.
 616         * This could be overkill but is more scalable.
 617         */
 618        if (flush)
 619                flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
 620                                   pcpu_chunk_addr(chunk, last, page_end));
 621
 622        for_each_possible_cpu(cpu)
 623                unmap_kernel_range_noflush(
 624                                pcpu_chunk_addr(chunk, cpu, page_start),
 625                                (page_end - page_start) << PAGE_SHIFT);
 626
 627        /* ditto as flush_cache_vunmap() */
 628        if (flush)
 629                flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
 630                                       pcpu_chunk_addr(chunk, last, page_end));
 631}
 632
 633/**
 634 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 635 * @chunk: chunk to depopulate
 636 * @off: offset to the area to depopulate
 637 * @size: size of the area to depopulate in bytes
 638 * @flush: whether to flush cache and tlb or not
 639 *
 640 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 641 * from @chunk.  If @flush is true, vcache is flushed before unmapping
 642 * and tlb after.
 643 *
 644 * CONTEXT:
 645 * pcpu_alloc_mutex.
 646 */
 647static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 648                                  bool flush)
 649{
 650        int page_start = PFN_DOWN(off);
 651        int page_end = PFN_UP(off + size);
 652        int unmap_start = -1;
 653        int uninitialized_var(unmap_end);
 654        unsigned int cpu;
 655        int i;
 656
 657        for (i = page_start; i < page_end; i++) {
 658                for_each_possible_cpu(cpu) {
 659                        struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 660
 661                        if (!*pagep)
 662                                continue;
 663
 664                        __free_page(*pagep);
 665
 666                        /*
 667                         * If it's partial depopulation, it might get
 668                         * populated or depopulated again.  Mark the
 669                         * page gone.
 670                         */
 671                        *pagep = NULL;
 672
 673                        unmap_start = unmap_start < 0 ? i : unmap_start;
 674                        unmap_end = i + 1;
 675                }
 676        }
 677
 678        if (unmap_start >= 0)
 679                pcpu_unmap(chunk, unmap_start, unmap_end, flush);
 680}
 681
 682/**
 683 * pcpu_map - map pages into a pcpu_chunk
 684 * @chunk: chunk of interest
 685 * @page_start: page index of the first page to map
 686 * @page_end: page index of the last page to map + 1
 687 *
 688 * For each cpu, map pages [@page_start,@page_end) into @chunk.
 689 * vcache is flushed afterwards.
 690 */
 691static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 692{
 693        unsigned int last = num_possible_cpus() - 1;
 694        unsigned int cpu;
 695        int err;
 696
 697        /* map must not be done on immutable chunk */
 698        WARN_ON(chunk->immutable);
 699
 700        for_each_possible_cpu(cpu) {
 701                err = map_kernel_range_noflush(
 702                                pcpu_chunk_addr(chunk, cpu, page_start),
 703                                (page_end - page_start) << PAGE_SHIFT,
 704                                PAGE_KERNEL,
 705                                pcpu_chunk_pagep(chunk, cpu, page_start));
 706                if (err < 0)
 707                        return err;
 708        }
 709
 710        /* flush at once, please read comments in pcpu_unmap() */
 711        flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 712                         pcpu_chunk_addr(chunk, last, page_end));
 713        return 0;
 714}
 715
 716/**
 717 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 718 * @chunk: chunk of interest
 719 * @off: offset to the area to populate
 720 * @size: size of the area to populate in bytes
 721 *
 722 * For each cpu, populate and map pages [@page_start,@page_end) into
 723 * @chunk.  The area is cleared on return.
 724 *
 725 * CONTEXT:
 726 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 727 */
 728static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 729{
 730        const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 731        int page_start = PFN_DOWN(off);
 732        int page_end = PFN_UP(off + size);
 733        int map_start = -1;
 734        int uninitialized_var(map_end);
 735        unsigned int cpu;
 736        int i;
 737
 738        for (i = page_start; i < page_end; i++) {
 739                if (pcpu_chunk_page_occupied(chunk, i)) {
 740                        if (map_start >= 0) {
 741                                if (pcpu_map(chunk, map_start, map_end))
 742                                        goto err;
 743                                map_start = -1;
 744                        }
 745                        continue;
 746                }
 747
 748                map_start = map_start < 0 ? i : map_start;
 749                map_end = i + 1;
 750
 751                for_each_possible_cpu(cpu) {
 752                        struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 753
 754                        *pagep = alloc_pages_node(cpu_to_node(cpu),
 755                                                  alloc_mask, 0);
 756                        if (!*pagep)
 757                                goto err;
 758                }
 759        }
 760
 761        if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
 762                goto err;
 763
 764        for_each_possible_cpu(cpu)
 765                memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 766                       size);
 767
 768        return 0;
 769err:
 770        /* likely under heavy memory pressure, give memory back */
 771        pcpu_depopulate_chunk(chunk, off, size, true);
 772        return -ENOMEM;
 773}
 774
 775static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 776{
 777        if (!chunk)
 778                return;
 779        if (chunk->vm)
 780                free_vm_area(chunk->vm);
 781        pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
 782        kfree(chunk);
 783}
 784
 785static struct pcpu_chunk *alloc_pcpu_chunk(void)
 786{
 787        struct pcpu_chunk *chunk;
 788
 789        chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
 790        if (!chunk)
 791                return NULL;
 792
 793        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 794        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 795        chunk->map[chunk->map_used++] = pcpu_unit_size;
 796        chunk->page = chunk->page_ar;
 797
 798        chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 799        if (!chunk->vm) {
 800                free_pcpu_chunk(chunk);
 801                return NULL;
 802        }
 803
 804        INIT_LIST_HEAD(&chunk->list);
 805        chunk->free_size = pcpu_unit_size;
 806        chunk->contig_hint = pcpu_unit_size;
 807
 808        return chunk;
 809}
 810
 811/**
 812 * pcpu_alloc - the percpu allocator
 813 * @size: size of area to allocate in bytes
 814 * @align: alignment of area (max PAGE_SIZE)
 815 * @reserved: allocate from the reserved chunk if available
 816 *
 817 * Allocate percpu area of @size bytes aligned at @align.
 818 *
 819 * CONTEXT:
 820 * Does GFP_KERNEL allocation.
 821 *
 822 * RETURNS:
 823 * Percpu pointer to the allocated area on success, NULL on failure.
 824 */
 825static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 826{
 827        struct pcpu_chunk *chunk;
 828        int slot, off;
 829
 830        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 831                WARN(true, "illegal size (%zu) or align (%zu) for "
 832                     "percpu allocation\n", size, align);
 833                return NULL;
 834        }
 835
 836        mutex_lock(&pcpu_alloc_mutex);
 837        spin_lock_irq(&pcpu_lock);
 838
 839        /* serve reserved allocations from the reserved chunk if available */
 840        if (reserved && pcpu_reserved_chunk) {
 841                chunk = pcpu_reserved_chunk;
 842                if (size > chunk->contig_hint ||
 843                    pcpu_extend_area_map(chunk) < 0)
 844                        goto fail_unlock;
 845                off = pcpu_alloc_area(chunk, size, align);
 846                if (off >= 0)
 847                        goto area_found;
 848                goto fail_unlock;
 849        }
 850
 851restart:
 852        /* search through normal chunks */
 853        for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 854                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 855                        if (size > chunk->contig_hint)
 856                                continue;
 857
 858                        switch (pcpu_extend_area_map(chunk)) {
 859                        case 0:
 860                                break;
 861                        case 1:
 862                                goto restart;   /* pcpu_lock dropped, restart */
 863                        default:
 864                                goto fail_unlock;
 865                        }
 866
 867                        off = pcpu_alloc_area(chunk, size, align);
 868                        if (off >= 0)
 869                                goto area_found;
 870                }
 871        }
 872
 873        /* hmmm... no space left, create a new chunk */
 874        spin_unlock_irq(&pcpu_lock);
 875
 876        chunk = alloc_pcpu_chunk();
 877        if (!chunk)
 878                goto fail_unlock_mutex;
 879
 880        spin_lock_irq(&pcpu_lock);
 881        pcpu_chunk_relocate(chunk, -1);
 882        pcpu_chunk_addr_insert(chunk);
 883        goto restart;
 884
 885area_found:
 886        spin_unlock_irq(&pcpu_lock);
 887
 888        /* populate, map and clear the area */
 889        if (pcpu_populate_chunk(chunk, off, size)) {
 890                spin_lock_irq(&pcpu_lock);
 891                pcpu_free_area(chunk, off);
 892                goto fail_unlock;
 893        }
 894
 895        mutex_unlock(&pcpu_alloc_mutex);
 896
 897        return __addr_to_pcpu_ptr(chunk->vm->addr + off);
 898
 899fail_unlock:
 900        spin_unlock_irq(&pcpu_lock);
 901fail_unlock_mutex:
 902        mutex_unlock(&pcpu_alloc_mutex);
 903        return NULL;
 904}
 905
 906/**
 907 * __alloc_percpu - allocate dynamic percpu area
 908 * @size: size of area to allocate in bytes
 909 * @align: alignment of area (max PAGE_SIZE)
 910 *
 911 * Allocate percpu area of @size bytes aligned at @align.  Might
 912 * sleep.  Might trigger writeouts.
 913 *
 914 * CONTEXT:
 915 * Does GFP_KERNEL allocation.
 916 *
 917 * RETURNS:
 918 * Percpu pointer to the allocated area on success, NULL on failure.
 919 */
 920void *__alloc_percpu(size_t size, size_t align)
 921{
 922        return pcpu_alloc(size, align, false);
 923}
 924EXPORT_SYMBOL_GPL(__alloc_percpu);
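
    /*
     * Usage sketch for illustration (not from the original sources; the
     * caller-side variables cnt, total and cpu are hypothetical):
     *
     *     unsigned long *cnt;
     *     unsigned long total = 0;
     *     int cpu;
     *
     *     cnt = __alloc_percpu(sizeof(*cnt), __alignof__(*cnt));
     *     if (!cnt)
     *             return -ENOMEM;
     *     ...
     *     for_each_possible_cpu(cpu)
     *             total += *per_cpu_ptr(cnt, cpu);
     *     free_percpu(cnt);
     *
     * The returned value is a percpu pointer; each cpu's copy is reached
     * through per_cpu_ptr() rather than by dereferencing it directly.
     */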
 925
 926/**
 927 * __alloc_reserved_percpu - allocate reserved percpu area
 928 * @size: size of area to allocate in bytes
 929 * @align: alignment of area (max PAGE_SIZE)
 930 *
 931 * Allocate percpu area of @size bytes aligned at @align from reserved
 932 * percpu area if arch has set it up; otherwise, allocation is served
 933 * from the same dynamic area.  Might sleep.  Might trigger writeouts.
 934 *
 935 * CONTEXT:
 936 * Does GFP_KERNEL allocation.
 937 *
 938 * RETURNS:
 939 * Percpu pointer to the allocated area on success, NULL on failure.
 940 */
 941void *__alloc_reserved_percpu(size_t size, size_t align)
 942{
 943        return pcpu_alloc(size, align, true);
 944}
 945
 946/**
 947 * pcpu_reclaim - reclaim fully free chunks, workqueue function
 948 * @work: unused
 949 *
 950 * Reclaim all fully free chunks except for the first one.
 951 *
 952 * CONTEXT:
 953 * workqueue context.
 954 */
 955static void pcpu_reclaim(struct work_struct *work)
 956{
 957        LIST_HEAD(todo);
 958        struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 959        struct pcpu_chunk *chunk, *next;
 960
 961        mutex_lock(&pcpu_alloc_mutex);
 962        spin_lock_irq(&pcpu_lock);
 963
 964        list_for_each_entry_safe(chunk, next, head, list) {
 965                WARN_ON(chunk->immutable);
 966
 967                /* spare the first one */
 968                if (chunk == list_first_entry(head, struct pcpu_chunk, list))
 969                        continue;
 970
 971                rb_erase(&chunk->rb_node, &pcpu_addr_root);
 972                list_move(&chunk->list, &todo);
 973        }
 974
 975        spin_unlock_irq(&pcpu_lock);
 976        mutex_unlock(&pcpu_alloc_mutex);
 977
 978        list_for_each_entry_safe(chunk, next, &todo, list) {
 979                pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 980                free_pcpu_chunk(chunk);
 981        }
 982}
 983
 984/**
 985 * free_percpu - free percpu area
 986 * @ptr: pointer to area to free
 987 *
 988 * Free percpu area @ptr.
 989 *
 990 * CONTEXT:
 991 * Can be called from atomic context.
 992 */
 993void free_percpu(void *ptr)
 994{
 995        void *addr = __pcpu_ptr_to_addr(ptr);
 996        struct pcpu_chunk *chunk;
 997        unsigned long flags;
 998        int off;
 999
1000        if (!ptr)
1001                return;
1002
1003        spin_lock_irqsave(&pcpu_lock, flags);
1004
1005        chunk = pcpu_chunk_addr_search(addr);
1006        off = addr - chunk->vm->addr;
1007
1008        pcpu_free_area(chunk, off);
1009
1010        /* if there is more than one fully free chunk, wake up the grim reaper */
1011        if (chunk->free_size == pcpu_unit_size) {
1012                struct pcpu_chunk *pos;
1013
1014                list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1015                        if (pos != chunk) {
1016                                schedule_work(&pcpu_reclaim_work);
1017                                break;
1018                        }
1019        }
1020
1021        spin_unlock_irqrestore(&pcpu_lock, flags);
1022}
1023EXPORT_SYMBOL_GPL(free_percpu);
1024
1025/**
1026 * pcpu_setup_first_chunk - initialize the first percpu chunk
1027 * @get_page_fn: callback to fetch page pointer
1028 * @static_size: the size of static percpu area in bytes
1029 * @reserved_size: the size of reserved percpu area in bytes
1030 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1031 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1032 * @base_addr: mapped address, NULL for auto
1033 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
1034 *
1035 * Initialize the first percpu chunk which contains the kernel static
1036 * percpu area.  This function is to be called from arch percpu area
1037 * setup path.  The first two parameters are mandatory.  The rest are
1038 * optional.
1039 *
1040 * @get_page_fn() should return pointer to percpu page given cpu
1041 * number and page number.  It should at least return enough pages to
1042 * cover the static area.  The returned pages for static area should
1043 * have been initialized with valid data.  If @unit_size is specified,
1044 * it can also return pages after the static area.  NULL return
1045 * indicates end of pages for the cpu.  Note that @get_page_fn() must
1046 * return the same number of pages for all cpus.
1047 *
1048 * @reserved_size, if non-zero, specifies the amount of bytes to
1049 * reserve after the static area in the first chunk.  This reserves
1050 * the first chunk such that it's available only through reserved
1051 * percpu allocation.  This is primarily used to serve module percpu
1052 * static areas on architectures where the addressing model has
1053 * limited offset range for symbol relocations to guarantee module
1054 * percpu symbols fall inside the relocatable range.
1055 *
1056 * @dyn_size, if non-negative, determines the number of bytes
1057 * available for dynamic allocation in the first chunk.  Specifying
1058 * a non-negative value makes percpu leave alone the area beyond
1059 * @static_size + @reserved_size + @dyn_size.
1060 *
1061 * @unit_size, if non-negative, specifies unit size and must be
1062 * aligned to PAGE_SIZE and equal to or larger than @static_size +
1063 * @reserved_size + @dyn_size (if @dyn_size is non-negative).
1064 *
1065 * Non-null @base_addr means that the caller already allocated virtual
1066 * region for the first chunk and mapped it.  percpu must not mess
1067 * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
1068 * @populate_pte_fn doesn't make any sense.
1069 *
1070 * @populate_pte_fn is used to populate the pagetable.  NULL means the
1071 * caller already populated the pagetable.
1072 *
1073 * If the first chunk ends up with both reserved and dynamic areas, it
1074 * is served by two chunks - one to serve the core static and reserved
1075 * areas and the other for the dynamic area.  They share the same vm
1076 * and page map but use different area allocation maps to stay away
1077 * from each other.  The latter chunk is circulated in the chunk slots
1078 * and available for dynamic allocation like any other chunk.
1079 *
1080 * RETURNS:
1081 * The determined pcpu_unit_size which can be used to initialize
1082 * percpu access.
1083 */
1084size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1085                                     size_t static_size, size_t reserved_size,
1086                                     ssize_t dyn_size, ssize_t unit_size,
1087                                     void *base_addr,
1088                                     pcpu_populate_pte_fn_t populate_pte_fn)
1089{
1090        static struct vm_struct first_vm;
1091        static int smap[2], dmap[2];
1092        size_t size_sum = static_size + reserved_size +
1093                          (dyn_size >= 0 ? dyn_size : 0);
1094        struct pcpu_chunk *schunk, *dchunk = NULL;
1095        unsigned int cpu;
1096        int nr_pages;
1097        int err, i;
1098
1099        /* sanity checks */
1100        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1101                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1102        BUG_ON(!static_size);
1103        if (unit_size >= 0) {
1104                BUG_ON(unit_size < size_sum);
1105                BUG_ON(unit_size & ~PAGE_MASK);
1106                BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1107        } else
1108                BUG_ON(base_addr);
1109        BUG_ON(base_addr && populate_pte_fn);
1110
1111        if (unit_size >= 0)
1112                pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1113        else
1114                pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
1115                                        PFN_UP(size_sum));
1116
1117        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1118        pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
1119        pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1120                + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
1121
1122        if (dyn_size < 0)
1123                dyn_size = pcpu_unit_size - static_size - reserved_size;
1124
1125        /*
1126         * Allocate chunk slots.  The additional last slot is for
1127         * empty chunks.
1128         */
1129        pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1130        pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
1131        for (i = 0; i < pcpu_nr_slots; i++)
1132                INIT_LIST_HEAD(&pcpu_slot[i]);
1133
1134        /*
1135         * Initialize static chunk.  If reserved_size is zero, the
1136         * static chunk covers static area + dynamic allocation area
1137         * in the first chunk.  If reserved_size is not zero, it
1138         * covers static area + reserved area (mostly used for module
1139         * static percpu allocation).
1140         */
1141        schunk = alloc_bootmem(pcpu_chunk_struct_size);
1142        INIT_LIST_HEAD(&schunk->list);
1143        schunk->vm = &first_vm;
1144        schunk->map = smap;
1145        schunk->map_alloc = ARRAY_SIZE(smap);
1146        schunk->page = schunk->page_ar;
1147
1148        if (reserved_size) {
1149                schunk->free_size = reserved_size;
1150                pcpu_reserved_chunk = schunk;   /* not for dynamic alloc */
1151        } else {
1152                schunk->free_size = dyn_size;
1153                dyn_size = 0;                   /* dynamic area covered */
1154        }
1155        schunk->contig_hint = schunk->free_size;
1156
1157        schunk->map[schunk->map_used++] = -static_size;
1158        if (schunk->free_size)
1159                schunk->map[schunk->map_used++] = schunk->free_size;
1160
1161        pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1162
1163        /* init dynamic chunk if necessary */
1164        if (dyn_size) {
1165                dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
1166                INIT_LIST_HEAD(&dchunk->list);
1167                dchunk->vm = &first_vm;
1168                dchunk->map = dmap;
1169                dchunk->map_alloc = ARRAY_SIZE(dmap);
1170                dchunk->page = schunk->page_ar; /* share page map with schunk */
1171
1172                dchunk->contig_hint = dchunk->free_size = dyn_size;
1173                dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1174                dchunk->map[dchunk->map_used++] = dchunk->free_size;
1175        }
1176
1177        /* allocate vm address */
1178        first_vm.flags = VM_ALLOC;
1179        first_vm.size = pcpu_chunk_size;
1180
1181        if (!base_addr)
1182                vm_area_register_early(&first_vm, PAGE_SIZE);
1183        else {
1184                /*
1185                 * Pages already mapped.  No need to remap into
1186                 * vmalloc area.  In this case the first chunks can't
1187                 * be mapped or unmapped by percpu and are marked
1188                 * immutable.
1189                 */
1190                first_vm.addr = base_addr;
1191                schunk->immutable = true;
1192                if (dchunk)
1193                        dchunk->immutable = true;
1194        }
1195
1196        /* assign pages */
1197        nr_pages = -1;
1198        for_each_possible_cpu(cpu) {
1199                for (i = 0; i < pcpu_unit_pages; i++) {
1200                        struct page *page = get_page_fn(cpu, i);
1201
1202                        if (!page)
1203                                break;
1204                        *pcpu_chunk_pagep(schunk, cpu, i) = page;
1205                }
1206
1207                BUG_ON(i < PFN_UP(static_size));
1208
1209                if (nr_pages < 0)
1210                        nr_pages = i;
1211                else
1212                        BUG_ON(nr_pages != i);
1213        }
1214
1215        /* map them */
1216        if (populate_pte_fn) {
1217                for_each_possible_cpu(cpu)
1218                        for (i = 0; i < nr_pages; i++)
1219                                populate_pte_fn(pcpu_chunk_addr(schunk,
1220                                                                cpu, i));
1221
1222                err = pcpu_map(schunk, 0, nr_pages);
1223                if (err)
1224                        panic("failed to setup static percpu area, err=%d\n",
1225                              err);
1226        }
1227
1228        /* link the first chunk in */
1229        if (!dchunk) {
1230                pcpu_chunk_relocate(schunk, -1);
1231                pcpu_chunk_addr_insert(schunk);
1232        } else {
1233                pcpu_chunk_relocate(dchunk, -1);
1234                pcpu_chunk_addr_insert(dchunk);
1235        }
1236
1237        /* we're done */
1238        pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
1239        return pcpu_unit_size;
1240}
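
    /*
     * Worked example for illustration (not from the original sources):
     * with static_size = 128k, reserved_size = 8k and dyn_size = 20k, the
     * static chunk ends up with map { -131072, 8192 } and becomes the
     * reserved chunk (limit 139264), while the dynamic chunk gets
     * map { -139264, 20480 } and is the one linked into the chunk slots
     * for ordinary allocations.  Both share first_vm and the same page
     * array.
     */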
1241
1242/*
1243 * Embedding first chunk setup helper.
1244 */
1245static void *pcpue_ptr __initdata;
1246static size_t pcpue_size __initdata;
1247static size_t pcpue_unit_size __initdata;
1248
1249static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1250{
1251        size_t off = (size_t)pageno << PAGE_SHIFT;
1252
1253        if (off >= pcpue_size)
1254                return NULL;
1255
1256        return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
1257}
1258
1259/**
1260 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1261 * @static_size: the size of static percpu area in bytes
1262 * @reserved_size: the size of reserved percpu area in bytes
1263 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1264 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1265 *
1266 * This is a helper to ease setting up embedded first percpu chunk and
1267 * can be called where pcpu_setup_first_chunk() is expected.
1268 *
1269 * If this function is used to setup the first chunk, it is allocated
1270 * as a contiguous area using bootmem allocator and used as-is without
1271 * being mapped into vmalloc area.  This enables the first chunk to
1272 * piggy back on the linear physical mapping which often uses larger
1273 * page size.
1274 *
1275 * When @dyn_size is positive, dynamic area might be larger than
1276 * specified to fill page alignment.  Also, when @dyn_size is auto,
1277 * @dyn_size does not fill the whole first chunk but only what's
1278 * necessary for page alignment after static and reserved areas.
1279 *
1280 * If the needed size is smaller than the minimum or specified unit
1281 * size, the leftover is returned to the bootmem allocator.
1282 *
1283 * RETURNS:
1284 * The determined pcpu_unit_size which can be used to initialize
1285 * percpu access on success, -errno on failure.
1286 */
1287ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1288                                      ssize_t dyn_size, ssize_t unit_size)
1289{
1290        unsigned int cpu;
1291
1292        /* determine parameters and allocate */
1293        pcpue_size = PFN_ALIGN(static_size + reserved_size +
1294                               (dyn_size >= 0 ? dyn_size : 0));
1295        if (dyn_size != 0)
1296                dyn_size = pcpue_size - static_size - reserved_size;
1297
1298        if (unit_size >= 0) {
1299                BUG_ON(unit_size < pcpue_size);
1300                pcpue_unit_size = unit_size;
1301        } else
1302                pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1303
1304        pcpue_ptr = __alloc_bootmem_nopanic(
1305                                        num_possible_cpus() * pcpue_unit_size,
1306                                        PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1307        if (!pcpue_ptr)
1308                return -ENOMEM;
1309
1310        /* return the leftover and copy */
1311        for_each_possible_cpu(cpu) {
1312                void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
1313
1314                free_bootmem(__pa(ptr + pcpue_size),
1315                             pcpue_unit_size - pcpue_size);
1316                memcpy(ptr, __per_cpu_load, static_size);
1317        }
1318
1319        /* we're ready, commit */
1320        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1321                pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
1322
1323        return pcpu_setup_first_chunk(pcpue_get_page, static_size,
1324                                      reserved_size, dyn_size,
1325                                      pcpue_unit_size, pcpue_ptr, NULL);
1326}
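
    /*
     * Sketch of a typical caller for illustration (not from the original
     * sources; the exact arch glue varies and reserved_size is a
     * placeholder):
     *
     *     ssize_t unit_size;
     *     unsigned long delta;
     *     unsigned int cpu;
     *
     *     unit_size = pcpu_embed_first_chunk(__per_cpu_end - __per_cpu_start,
     *                                        reserved_size, -1, -1);
     *     if (unit_size < 0)
     *             panic("percpu: embedding first chunk failed");
     *
     *     delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
     *     for_each_possible_cpu(cpu)
     *             __per_cpu_offset[cpu] = delta + cpu * unit_size;
     *
     * i.e. each cpu's offset is recorded so that static percpu symbols and
     * pointers returned by this allocator resolve to that cpu's unit in
     * the first (and later) chunks.
     */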
1327