linux/mm/vmalloc.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmalloc.c
   3 *
   4 *  Copyright (C) 1993  Linus Torvalds
   5 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   6 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
   7 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
   8 *  Numa awareness, Christoph Lameter, SGI, June 2005
   9 */
  10
  11#include <linux/vmalloc.h>
  12#include <linux/mm.h>
  13#include <linux/module.h>
  14#include <linux/highmem.h>
  15#include <linux/slab.h>
  16#include <linux/spinlock.h>
  17#include <linux/interrupt.h>
  18#include <linux/proc_fs.h>
  19#include <linux/seq_file.h>
  20#include <linux/debugobjects.h>
  21#include <linux/kallsyms.h>
  22#include <linux/list.h>
  23#include <linux/rbtree.h>
  24#include <linux/radix-tree.h>
  25#include <linux/rcupdate.h>
  26
  27#include <asm/atomic.h>
  28#include <asm/uaccess.h>
  29#include <asm/tlbflush.h>
  30
  31
  32/*** Page table manipulation functions ***/
  33
  34static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
  35{
  36        pte_t *pte;
  37
  38        pte = pte_offset_kernel(pmd, addr);
  39        do {
  40                pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
  41                WARN_ON(!pte_none(ptent) && !pte_present(ptent));
  42        } while (pte++, addr += PAGE_SIZE, addr != end);
  43}
  44
  45static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
  46{
  47        pmd_t *pmd;
  48        unsigned long next;
  49
  50        pmd = pmd_offset(pud, addr);
  51        do {
  52                next = pmd_addr_end(addr, end);
  53                if (pmd_none_or_clear_bad(pmd))
  54                        continue;
  55                vunmap_pte_range(pmd, addr, next);
  56        } while (pmd++, addr = next, addr != end);
  57}
  58
  59static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
  60{
  61        pud_t *pud;
  62        unsigned long next;
  63
  64        pud = pud_offset(pgd, addr);
  65        do {
  66                next = pud_addr_end(addr, end);
  67                if (pud_none_or_clear_bad(pud))
  68                        continue;
  69                vunmap_pmd_range(pud, addr, next);
  70        } while (pud++, addr = next, addr != end);
  71}
  72
  73static void vunmap_page_range(unsigned long addr, unsigned long end)
  74{
  75        pgd_t *pgd;
  76        unsigned long next;
  77
  78        BUG_ON(addr >= end);
  79        pgd = pgd_offset_k(addr);
  80        do {
  81                next = pgd_addr_end(addr, end);
  82                if (pgd_none_or_clear_bad(pgd))
  83                        continue;
  84                vunmap_pud_range(pgd, addr, next);
  85        } while (pgd++, addr = next, addr != end);
  86}
  87
  88static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
  89                unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  90{
  91        pte_t *pte;
  92
  93        /*
  94         * nr is a running index into the array which helps higher level
  95         * callers keep track of where we're up to.
  96         */
  97
  98        pte = pte_alloc_kernel(pmd, addr);
  99        if (!pte)
 100                return -ENOMEM;
 101        do {
 102                struct page *page = pages[*nr];
 103
 104                if (WARN_ON(!pte_none(*pte)))
 105                        return -EBUSY;
 106                if (WARN_ON(!page))
 107                        return -ENOMEM;
 108                set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
 109                (*nr)++;
 110        } while (pte++, addr += PAGE_SIZE, addr != end);
 111        return 0;
 112}
 113
 114static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 115                unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 116{
 117        pmd_t *pmd;
 118        unsigned long next;
 119
 120        pmd = pmd_alloc(&init_mm, pud, addr);
 121        if (!pmd)
 122                return -ENOMEM;
 123        do {
 124                next = pmd_addr_end(addr, end);
 125                if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
 126                        return -ENOMEM;
 127        } while (pmd++, addr = next, addr != end);
 128        return 0;
 129}
 130
 131static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
 132                unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 133{
 134        pud_t *pud;
 135        unsigned long next;
 136
 137        pud = pud_alloc(&init_mm, pgd, addr);
 138        if (!pud)
 139                return -ENOMEM;
 140        do {
 141                next = pud_addr_end(addr, end);
 142                if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
 143                        return -ENOMEM;
 144        } while (pud++, addr = next, addr != end);
 145        return 0;
 146}
 147
 148/*
 149 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 150 * will have pfns corresponding to the "pages" array.
 151 *
 152 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 153 */
 154static int vmap_page_range(unsigned long addr, unsigned long end,
 155                                pgprot_t prot, struct page **pages)
 156{
 157        pgd_t *pgd;
 158        unsigned long next;
 159        int err = 0;
 160        int nr = 0;
 161
 162        BUG_ON(addr >= end);
 163        pgd = pgd_offset_k(addr);
 164        do {
 165                next = pgd_addr_end(addr, end);
 166                err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
 167                if (err)
 168                        break;
 169        } while (pgd++, addr = next, addr != end);
 170        flush_cache_vmap(addr, end);
 171
 172        if (unlikely(err))
 173                return err;
 174        return nr;
 175}
 176
 177static inline int is_vmalloc_or_module_addr(const void *x)
 178{
 179        /*
 180         * ARM, x86-64 and sparc64 put modules in a special place,
 181         * and fall back on vmalloc() if that fails. Others
 182         * just put it in the vmalloc space.
 183         */
 184#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
 185        unsigned long addr = (unsigned long)x;
 186        if (addr >= MODULES_VADDR && addr < MODULES_END)
 187                return 1;
 188#endif
 189        return is_vmalloc_addr(x);
 190}
 191
 192/*
 193 * Walk a vmap address to the struct page it maps.
 194 */
 195struct page *vmalloc_to_page(const void *vmalloc_addr)
 196{
 197        unsigned long addr = (unsigned long) vmalloc_addr;
 198        struct page *page = NULL;
 199        pgd_t *pgd = pgd_offset_k(addr);
 200
 201        /*
 202         * XXX we might need to change this if we add VIRTUAL_BUG_ON for
 203         * architectures that do not vmalloc module space
 204         */
 205        VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
 206
 207        if (!pgd_none(*pgd)) {
 208                pud_t *pud = pud_offset(pgd, addr);
 209                if (!pud_none(*pud)) {
 210                        pmd_t *pmd = pmd_offset(pud, addr);
 211                        if (!pmd_none(*pmd)) {
 212                                pte_t *ptep, pte;
 213
 214                                ptep = pte_offset_map(pmd, addr);
 215                                pte = *ptep;
 216                                if (pte_present(pte))
 217                                        page = pte_page(pte);
 218                                pte_unmap(ptep);
 219                        }
 220                }
 221        }
 222        return page;
 223}
 224EXPORT_SYMBOL(vmalloc_to_page);
 225
 226/*
 227 * Map a vmalloc()-space virtual address to the physical page frame number.
 228 */
 229unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 230{
 231        return page_to_pfn(vmalloc_to_page(vmalloc_addr));
 232}
 233EXPORT_SYMBOL(vmalloc_to_pfn);
 234
 235
 236/*** Global kva allocator ***/
 237
/* Bits stored in vmap_area.flags. */
#define VM_LAZY_FREE    0x01    /* set on free; area awaits a lazy purge */
#define VM_LAZY_FREEING 0x02    /* set by the purge once it has picked the area up */
#define VM_VM_AREA      0x04    /* NOTE(review): not used in this part of the file */

/*
 * Descriptor for one allocated range [va_start, va_end) of kernel virtual
 * address space. Each descriptor is linked into both the global rbtree and
 * the global address-sorted list below.
 */
struct vmap_area {
        unsigned long va_start;
        unsigned long va_end;
        unsigned long flags;            /* VM_* bits above */
        struct rb_node rb_node;         /* address sorted rbtree */
        struct list_head list;          /* address sorted list */
        struct list_head purge_list;    /* "lazy purge" list */
        void *private;                  /* NOTE(review): owner not visible here */
        struct rcu_head rcu_head;       /* defers the kfree() of this descriptor */
};

/* Taken around every rbtree/list insertion, removal and lookup in this file. */
static DEFINE_SPINLOCK(vmap_area_lock);
static struct rb_root vmap_area_root = RB_ROOT;
/* Entries are added/removed with the _rcu list helpers and walked under RCU. */
static LIST_HEAD(vmap_area_list);
 256
 257static struct vmap_area *__find_vmap_area(unsigned long addr)
 258{
 259        struct rb_node *n = vmap_area_root.rb_node;
 260
 261        while (n) {
 262                struct vmap_area *va;
 263
 264                va = rb_entry(n, struct vmap_area, rb_node);
 265                if (addr < va->va_start)
 266                        n = n->rb_left;
 267                else if (addr > va->va_start)
 268                        n = n->rb_right;
 269                else
 270                        return va;
 271        }
 272
 273        return NULL;
 274}
 275
 276static void __insert_vmap_area(struct vmap_area *va)
 277{
 278        struct rb_node **p = &vmap_area_root.rb_node;
 279        struct rb_node *parent = NULL;
 280        struct rb_node *tmp;
 281
 282        while (*p) {
 283                struct vmap_area *tmp;
 284
 285                parent = *p;
 286                tmp = rb_entry(parent, struct vmap_area, rb_node);
 287                if (va->va_start < tmp->va_end)
 288                        p = &(*p)->rb_left;
 289                else if (va->va_end > tmp->va_start)
 290                        p = &(*p)->rb_right;
 291                else
 292                        BUG();
 293        }
 294
 295        rb_link_node(&va->rb_node, parent, p);
 296        rb_insert_color(&va->rb_node, &vmap_area_root);
 297
 298        /* address-sort this list so it is usable like the vmlist */
 299        tmp = rb_prev(&va->rb_node);
 300        if (tmp) {
 301                struct vmap_area *prev;
 302                prev = rb_entry(tmp, struct vmap_area, rb_node);
 303                list_add_rcu(&va->list, &prev->list);
 304        } else
 305                list_add_rcu(&va->list, &vmap_area_list);
 306}
 307
 308static void purge_vmap_area_lazy(void);
 309
 310/*
 311 * Allocate a region of KVA of the specified size and alignment, within the
 312 * vstart and vend.
 313 */
 314static struct vmap_area *alloc_vmap_area(unsigned long size,
 315                                unsigned long align,
 316                                unsigned long vstart, unsigned long vend,
 317                                int node, gfp_t gfp_mask)
 318{
 319        struct vmap_area *va;
 320        struct rb_node *n;
 321        unsigned long addr;
 322        int purged = 0;
 323
 324        BUG_ON(size & ~PAGE_MASK);
 325
 326        va = kmalloc_node(sizeof(struct vmap_area),
 327                        gfp_mask & GFP_RECLAIM_MASK, node);
 328        if (unlikely(!va))
 329                return ERR_PTR(-ENOMEM);
 330
 331retry:
 332        addr = ALIGN(vstart, align);
 333
 334        spin_lock(&vmap_area_lock);
 335        /* XXX: could have a last_hole cache */
 336        n = vmap_area_root.rb_node;
 337        if (n) {
 338                struct vmap_area *first = NULL;
 339
 340                do {
 341                        struct vmap_area *tmp;
 342                        tmp = rb_entry(n, struct vmap_area, rb_node);
 343                        if (tmp->va_end >= addr) {
 344                                if (!first && tmp->va_start < addr + size)
 345                                        first = tmp;
 346                                n = n->rb_left;
 347                        } else {
 348                                first = tmp;
 349                                n = n->rb_right;
 350                        }
 351                } while (n);
 352
 353                if (!first)
 354                        goto found;
 355
 356                if (first->va_end < addr) {
 357                        n = rb_next(&first->rb_node);
 358                        if (n)
 359                                first = rb_entry(n, struct vmap_area, rb_node);
 360                        else
 361                                goto found;
 362                }
 363
 364                while (addr + size > first->va_start && addr + size <= vend) {
 365                        addr = ALIGN(first->va_end + PAGE_SIZE, align);
 366
 367                        n = rb_next(&first->rb_node);
 368                        if (n)
 369                                first = rb_entry(n, struct vmap_area, rb_node);
 370                        else
 371                                goto found;
 372                }
 373        }
 374found:
 375        if (addr + size > vend) {
 376                spin_unlock(&vmap_area_lock);
 377                if (!purged) {
 378                        purge_vmap_area_lazy();
 379                        purged = 1;
 380                        goto retry;
 381                }
 382                if (printk_ratelimit())
 383                        printk(KERN_WARNING "vmap allocation failed: "
 384                                 "use vmalloc=<size> to increase size.\n");
 385                return ERR_PTR(-EBUSY);
 386        }
 387
 388        BUG_ON(addr & (align-1));
 389
 390        va->va_start = addr;
 391        va->va_end = addr + size;
 392        va->flags = 0;
 393        __insert_vmap_area(va);
 394        spin_unlock(&vmap_area_lock);
 395
 396        return va;
 397}
 398
 399static void rcu_free_va(struct rcu_head *head)
 400{
 401        struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
 402
 403        kfree(va);
 404}
 405
 406static void __free_vmap_area(struct vmap_area *va)
 407{
 408        BUG_ON(RB_EMPTY_NODE(&va->rb_node));
 409        rb_erase(&va->rb_node, &vmap_area_root);
 410        RB_CLEAR_NODE(&va->rb_node);
 411        list_del_rcu(&va->list);
 412
 413        call_rcu(&va->rcu_head, rcu_free_va);
 414}
 415
 416/*
 417 * Free a region of KVA allocated by alloc_vmap_area
 418 */
 419static void free_vmap_area(struct vmap_area *va)
 420{
 421        spin_lock(&vmap_area_lock);
 422        __free_vmap_area(va);
 423        spin_unlock(&vmap_area_lock);
 424}
 425
 426/*
 427 * Clear the pagetable entries of a given vmap_area
 428 */
 429static void unmap_vmap_area(struct vmap_area *va)
 430{
 431        vunmap_page_range(va->va_start, va->va_end);
 432}
 433
 434/*
 435 * lazy_max_pages is the maximum amount of virtual address space we gather up
 436 * before attempting to purge with a TLB flush.
 437 *
 438 * There is a tradeoff here: a larger number will cover more kernel page tables
 439 * and take slightly longer to purge, but it will linearly reduce the number of
 440 * global TLB flushes that must be performed. It would seem natural to scale
 441 * this number up linearly with the number of CPUs (because vmapping activity
 442 * could also scale linearly with the number of CPUs), however it is likely
 443 * that in practice, workloads might be constrained in other ways that mean
 444 * vmap activity will not scale linearly with CPUs. Also, I want to be
 445 * conservative and not introduce a big latency on huge systems, so go with
 446 * a less aggressive log scale. It will still be an improvement over the old
 447 * code, and it will be simple to change the scale factor if we find that it
 448 * becomes a problem on bigger systems.
 449 */
 450static unsigned long lazy_max_pages(void)
 451{
 452        unsigned int log;
 453
 454        log = fls(num_online_cpus());
 455
 456        return log * (32UL * 1024 * 1024 / PAGE_SIZE);
 457}
 458
 459static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
 460
 461/*
 462 * Purges all lazily-freed vmap areas.
 463 *
 464 * If sync is 0 then don't purge if there is already a purge in progress.
 465 * If force_flush is 1, then flush kernel TLBs between *start and *end even
 466 * if we found no lazy vmap areas to unmap (callers can use this to optimise
 467 * their own TLB flushing).
 468 * Returns with *start = min(*start, lowest purged address)
 469 *              *end = max(*end, highest purged address)
 470 */
 471static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 472                                        int sync, int force_flush)
 473{
 474        static DEFINE_SPINLOCK(purge_lock);
 475        LIST_HEAD(valist);
 476        struct vmap_area *va;
 477        int nr = 0;
 478
 479        /*
 480         * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
 481         * should not expect such behaviour. This just simplifies locking for
 482         * the case that isn't actually used at the moment anyway.
 483         */
 484        if (!sync && !force_flush) {
 485                if (!spin_trylock(&purge_lock))
 486                        return;
 487        } else
 488                spin_lock(&purge_lock);
 489
 490        rcu_read_lock();
 491        list_for_each_entry_rcu(va, &vmap_area_list, list) {
 492                if (va->flags & VM_LAZY_FREE) {
 493                        if (va->va_start < *start)
 494                                *start = va->va_start;
 495                        if (va->va_end > *end)
 496                                *end = va->va_end;
 497                        nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
 498                        unmap_vmap_area(va);
 499                        list_add_tail(&va->purge_list, &valist);
 500                        va->flags |= VM_LAZY_FREEING;
 501                        va->flags &= ~VM_LAZY_FREE;
 502                }
 503        }
 504        rcu_read_unlock();
 505
 506        if (nr) {
 507                BUG_ON(nr > atomic_read(&vmap_lazy_nr));
 508                atomic_sub(nr, &vmap_lazy_nr);
 509        }
 510
 511        if (nr || force_flush)
 512                flush_tlb_kernel_range(*start, *end);
 513
 514        if (nr) {
 515                spin_lock(&vmap_area_lock);
 516                list_for_each_entry(va, &valist, purge_list)
 517                        __free_vmap_area(va);
 518                spin_unlock(&vmap_area_lock);
 519        }
 520        spin_unlock(&purge_lock);
 521}
 522
 523/*
 524 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 525 * is already purging.
 526 */
 527static void try_purge_vmap_area_lazy(void)
 528{
 529        unsigned long start = ULONG_MAX, end = 0;
 530
 531        __purge_vmap_area_lazy(&start, &end, 0, 0);
 532}
 533
 534/*
 535 * Kick off a purge of the outstanding lazy areas.
 536 */
 537static void purge_vmap_area_lazy(void)
 538{
 539        unsigned long start = ULONG_MAX, end = 0;
 540
 541        __purge_vmap_area_lazy(&start, &end, 1, 0);
 542}
 543
 544/*
 545 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
 546 * called for the correct range previously.
 547 */
 548static void free_unmap_vmap_area_noflush(struct vmap_area *va)
 549{
 550        va->flags |= VM_LAZY_FREE;
 551        atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
 552        if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
 553                try_purge_vmap_area_lazy();
 554}
 555
 556/*
 557 * Free and unmap a vmap area
 558 */
 559static void free_unmap_vmap_area(struct vmap_area *va)
 560{
 561        flush_cache_vunmap(va->va_start, va->va_end);
 562        free_unmap_vmap_area_noflush(va);
 563}
 564
 565static struct vmap_area *find_vmap_area(unsigned long addr)
 566{
 567        struct vmap_area *va;
 568
 569        spin_lock(&vmap_area_lock);
 570        va = __find_vmap_area(addr);
 571        spin_unlock(&vmap_area_lock);
 572
 573        return va;
 574}
 575
 576static void free_unmap_vmap_area_addr(unsigned long addr)
 577{
 578        struct vmap_area *va;
 579
 580        va = find_vmap_area(addr);
 581        BUG_ON(!va);
 582        free_unmap_vmap_area(va);
 583}
 584
 585
 586/*** Per cpu kva allocator ***/
 587
 588/*
 589 * vmap space is limited especially on 32 bit architectures. Ensure there is
 590 * room for at least 16 percpu vmap blocks per CPU.
 591 */
 592/*
 593 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 594 * to #define VMALLOC_SPACE             (VMALLOC_END-VMALLOC_START). Guess
 595 * instead (we just need a rough idea)
 596 */
/* Rough guess at the vmalloc space: 128MB on 32-bit, 128GB otherwise. */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE           (128UL*1024*1024)
#else
#define VMALLOC_SPACE           (128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
/* Largest request (in pages) served from a vmap block; see vm_map_ram(). */
#define VMAP_MAX_ALLOC          BITS_PER_LONG   /* 256K with 4K pages and 64-bit longs */
#define VMAP_BBMAP_BITS_MAX     1024    /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
/*
 * Pages per vmap block: 1/16 of each CPU's share of the guessed vmalloc
 * space, clamped to [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
 */
#define VMAP_BBMAP_BITS         VMAP_MIN(VMAP_BBMAP_BITS_MAX,           \
                                        VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
                                                VMALLOC_PAGES / NR_CPUS / 16))

/* Size in bytes of one vmap block; also used as its alignment. */
#define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)

/* vm_unmap_aliases() does nothing until this is true; it is set outside this part of the file. */
static bool vmap_initialized __read_mostly = false;
 616
/*
 * Per-cpu queue of vmap blocks (one instance per CPU, see the
 * DEFINE_PER_CPU below).
 */
struct vmap_block_queue {
        spinlock_t lock;                /* taken around updates of the two lists */
        struct list_head free;          /* blocks that still have unallocated pages */
        struct list_head dirty;         /* blocks with at least one freed (dirty) page */
        unsigned int nr_dirty;          /* NOTE(review): not updated in this part of the file */
};
 623
/*
 * One VMAP_BLOCK_SIZE chunk of KVA that is carved up, VMAP_BBMAP_BITS pages
 * in total, by vb_alloc() and vb_free().
 */
struct vmap_block {
        spinlock_t lock;                /* held while the maps and counters change */
        struct vmap_area *va;           /* the KVA range backing this block */
        struct vmap_block_queue *vbq;   /* per-cpu queue the block was created on */
        unsigned long free, dirty;      /* pages never handed out / pages freed again */
        DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);     /* pages handed out */
        DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);     /* pages freed again */
        union {
                /* Queue linkage while the block is live... */
                struct {
                        struct list_head free_list;
                        struct list_head dirty_list;
                };
                /* ...reused for the deferred kfree() once it is unlinked. */
                struct rcu_head rcu_head;
        };
};
 639
 640/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
 641static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
 642
 643/*
 644 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
 645 * in the free path. Could get rid of this if we change the API to return a
 646 * "cookie" from alloc, to be passed to free. But no big deal yet.
 647 */
 648static DEFINE_SPINLOCK(vmap_block_tree_lock);
 649static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
 650
 651/*
 652 * We should probably have a fallback mechanism to allocate virtual memory
 653 * out of partially filled vmap blocks. However vmap block sizing should be
 654 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 655 * big problem.
 656 */
 657
 658static unsigned long addr_to_vb_idx(unsigned long addr)
 659{
 660        addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
 661        addr /= VMAP_BLOCK_SIZE;
 662        return addr;
 663}
 664
 665static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 666{
 667        struct vmap_block_queue *vbq;
 668        struct vmap_block *vb;
 669        struct vmap_area *va;
 670        unsigned long vb_idx;
 671        int node, err;
 672
 673        node = numa_node_id();
 674
 675        vb = kmalloc_node(sizeof(struct vmap_block),
 676                        gfp_mask & GFP_RECLAIM_MASK, node);
 677        if (unlikely(!vb))
 678                return ERR_PTR(-ENOMEM);
 679
 680        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
 681                                        VMALLOC_START, VMALLOC_END,
 682                                        node, gfp_mask);
 683        if (unlikely(IS_ERR(va))) {
 684                kfree(vb);
 685                return ERR_PTR(PTR_ERR(va));
 686        }
 687
 688        err = radix_tree_preload(gfp_mask);
 689        if (unlikely(err)) {
 690                kfree(vb);
 691                free_vmap_area(va);
 692                return ERR_PTR(err);
 693        }
 694
 695        spin_lock_init(&vb->lock);
 696        vb->va = va;
 697        vb->free = VMAP_BBMAP_BITS;
 698        vb->dirty = 0;
 699        bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
 700        bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
 701        INIT_LIST_HEAD(&vb->free_list);
 702        INIT_LIST_HEAD(&vb->dirty_list);
 703
 704        vb_idx = addr_to_vb_idx(va->va_start);
 705        spin_lock(&vmap_block_tree_lock);
 706        err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
 707        spin_unlock(&vmap_block_tree_lock);
 708        BUG_ON(err);
 709        radix_tree_preload_end();
 710
 711        vbq = &get_cpu_var(vmap_block_queue);
 712        vb->vbq = vbq;
 713        spin_lock(&vbq->lock);
 714        list_add(&vb->free_list, &vbq->free);
 715        spin_unlock(&vbq->lock);
 716        put_cpu_var(vmap_cpu_blocks);
 717
 718        return vb;
 719}
 720
 721static void rcu_free_vb(struct rcu_head *head)
 722{
 723        struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
 724
 725        kfree(vb);
 726}
 727
 728static void free_vmap_block(struct vmap_block *vb)
 729{
 730        struct vmap_block *tmp;
 731        unsigned long vb_idx;
 732
 733        spin_lock(&vb->vbq->lock);
 734        if (!list_empty(&vb->free_list))
 735                list_del(&vb->free_list);
 736        if (!list_empty(&vb->dirty_list))
 737                list_del(&vb->dirty_list);
 738        spin_unlock(&vb->vbq->lock);
 739
 740        vb_idx = addr_to_vb_idx(vb->va->va_start);
 741        spin_lock(&vmap_block_tree_lock);
 742        tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
 743        spin_unlock(&vmap_block_tree_lock);
 744        BUG_ON(tmp != vb);
 745
 746        free_unmap_vmap_area_noflush(vb->va);
 747        call_rcu(&vb->rcu_head, rcu_free_vb);
 748}
 749
 750static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 751{
 752        struct vmap_block_queue *vbq;
 753        struct vmap_block *vb;
 754        unsigned long addr = 0;
 755        unsigned int order;
 756
 757        BUG_ON(size & ~PAGE_MASK);
 758        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 759        order = get_order(size);
 760
 761again:
 762        rcu_read_lock();
 763        vbq = &get_cpu_var(vmap_block_queue);
 764        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
 765                int i;
 766
 767                spin_lock(&vb->lock);
 768                i = bitmap_find_free_region(vb->alloc_map,
 769                                                VMAP_BBMAP_BITS, order);
 770
 771                if (i >= 0) {
 772                        addr = vb->va->va_start + (i << PAGE_SHIFT);
 773                        BUG_ON(addr_to_vb_idx(addr) !=
 774                                        addr_to_vb_idx(vb->va->va_start));
 775                        vb->free -= 1UL << order;
 776                        if (vb->free == 0) {
 777                                spin_lock(&vbq->lock);
 778                                list_del_init(&vb->free_list);
 779                                spin_unlock(&vbq->lock);
 780                        }
 781                        spin_unlock(&vb->lock);
 782                        break;
 783                }
 784                spin_unlock(&vb->lock);
 785        }
 786        put_cpu_var(vmap_cpu_blocks);
 787        rcu_read_unlock();
 788
 789        if (!addr) {
 790                vb = new_vmap_block(gfp_mask);
 791                if (IS_ERR(vb))
 792                        return vb;
 793                goto again;
 794        }
 795
 796        return (void *)addr;
 797}
 798
 799static void vb_free(const void *addr, unsigned long size)
 800{
 801        unsigned long offset;
 802        unsigned long vb_idx;
 803        unsigned int order;
 804        struct vmap_block *vb;
 805
 806        BUG_ON(size & ~PAGE_MASK);
 807        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 808
 809        flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
 810
 811        order = get_order(size);
 812
 813        offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
 814
 815        vb_idx = addr_to_vb_idx((unsigned long)addr);
 816        rcu_read_lock();
 817        vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
 818        rcu_read_unlock();
 819        BUG_ON(!vb);
 820
 821        spin_lock(&vb->lock);
 822        bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
 823        if (!vb->dirty) {
 824                spin_lock(&vb->vbq->lock);
 825                list_add(&vb->dirty_list, &vb->vbq->dirty);
 826                spin_unlock(&vb->vbq->lock);
 827        }
 828        vb->dirty += 1UL << order;
 829        if (vb->dirty == VMAP_BBMAP_BITS) {
 830                BUG_ON(vb->free || !list_empty(&vb->free_list));
 831                spin_unlock(&vb->lock);
 832                free_vmap_block(vb);
 833        } else
 834                spin_unlock(&vb->lock);
 835}
 836
 837/**
 838 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 839 *
 840 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 841 * to amortize TLB flushing overheads. What this means is that any page you
 842 * have now, may, in a former life, have been mapped into kernel virtual
 843 * address by the vmap layer and so there might be some CPUs with TLB entries
 844 * still referencing that page (additional to the regular 1:1 kernel mapping).
 845 *
 846 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 847 * be sure that none of the pages we have control over will have any aliases
 848 * from the vmap layer.
 849 */
 850void vm_unmap_aliases(void)
 851{
 852        unsigned long start = ULONG_MAX, end = 0;
 853        int cpu;
 854        int flush = 0;
 855
 856        if (unlikely(!vmap_initialized))
 857                return;
 858
 859        for_each_possible_cpu(cpu) {
 860                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
 861                struct vmap_block *vb;
 862
 863                rcu_read_lock();
 864                list_for_each_entry_rcu(vb, &vbq->free, free_list) {
 865                        int i;
 866
 867                        spin_lock(&vb->lock);
 868                        i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
 869                        while (i < VMAP_BBMAP_BITS) {
 870                                unsigned long s, e;
 871                                int j;
 872                                j = find_next_zero_bit(vb->dirty_map,
 873                                        VMAP_BBMAP_BITS, i);
 874
 875                                s = vb->va->va_start + (i << PAGE_SHIFT);
 876                                e = vb->va->va_start + (j << PAGE_SHIFT);
 877                                vunmap_page_range(s, e);
 878                                flush = 1;
 879
 880                                if (s < start)
 881                                        start = s;
 882                                if (e > end)
 883                                        end = e;
 884
 885                                i = j;
 886                                i = find_next_bit(vb->dirty_map,
 887                                                        VMAP_BBMAP_BITS, i);
 888                        }
 889                        spin_unlock(&vb->lock);
 890                }
 891                rcu_read_unlock();
 892        }
 893
 894        __purge_vmap_area_lazy(&start, &end, 1, flush);
 895}
 896EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 897
 898/**
 899 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 900 * @mem: the pointer returned by vm_map_ram
 901 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 902 */
 903void vm_unmap_ram(const void *mem, unsigned int count)
 904{
 905        unsigned long size = count << PAGE_SHIFT;
 906        unsigned long addr = (unsigned long)mem;
 907
 908        BUG_ON(!addr);
 909        BUG_ON(addr < VMALLOC_START);
 910        BUG_ON(addr > VMALLOC_END);
 911        BUG_ON(addr & (PAGE_SIZE-1));
 912
 913        debug_check_no_locks_freed(mem, size);
 914
 915        if (likely(count <= VMAP_MAX_ALLOC))
 916                vb_free(mem, size);
 917        else
 918                free_unmap_vmap_area_addr(addr);
 919}
 920EXPORT_SYMBOL(vm_unmap_ram);
 921
 922/**
 923 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 924 * @pages: an array of pointers to the pages to be mapped
 925 * @count: number of pages
 926 * @node: prefer to allocate data structures on this node
 927 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
 928 *
 929 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 930 */
 931void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
 932{
 933        unsigned long size = count << PAGE_SHIFT;
 934        unsigned long addr;
 935        void *mem;
 936
 937        if (likely(count <= VMAP_MAX_ALLOC)) {
 938                mem = vb_alloc(size, GFP_KERNEL);
 939                if (IS_ERR(mem))
 940                        return NULL;
 941                addr = (unsigned long)mem;
 942        } else {
 943                struct vmap_area *va;
 944                va = alloc_vmap_area(size, PAGE_SIZE,
 945                                VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
 946                if (IS_ERR(va))
 947                        return NULL;
 948
 949                addr = va->va_start;
 950                mem = (void *)addr;
 951        }
 952        if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
 953                vm_unmap_ram(mem, count);
 954                return NULL;
 955        }
 956        return mem;
 957}
 958EXPORT_SYMBOL(vm_map_ram);
 959
 960void __init vmalloc_init(void)
 961{
 962        int i;
 963
 964        for_each_possible_cpu(i) {
 965                struct vmap_block_queue *vbq;
 966
 967                vbq = &per_cpu(vmap_block_queue, i);
 968                spin_lock_init(&vbq->lock);
 969                INIT_LIST_HEAD(&vbq->free);
 970                INIT_LIST_HEAD(&vbq->dirty);
 971                vbq->nr_dirty = 0;
 972        }
 973
 974        vmap_initialized = true;
 975}
 976
 977void unmap_kernel_range(unsigned long addr, unsigned long size)
 978{
 979        unsigned long end = addr + size;
 980        vunmap_page_range(addr, end);
 981        flush_tlb_kernel_range(addr, end);
 982}
 983
 984int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 985{
 986        unsigned long addr = (unsigned long)area->addr;
 987        unsigned long end = addr + area->size - PAGE_SIZE;
 988        int err;
 989
 990        err = vmap_page_range(addr, end, prot, *pages);
 991        if (err > 0) {
 992                *pages += err;
 993                err = 0;
 994        }
 995
 996        return err;
 997}
 998EXPORT_SYMBOL_GPL(map_vm_area);
 999
1000/*** Old vmalloc interfaces ***/
1001DEFINE_RWLOCK(vmlist_lock);
1002struct vm_struct *vmlist;
1003
1004static struct vm_struct *__get_vm_area_node(unsigned long size,
1005                unsigned long flags, unsigned long start, unsigned long end,
1006                int node, gfp_t gfp_mask, void *caller)
1007{
1008        static struct vmap_area *va;
1009        struct vm_struct *area;
1010        struct vm_struct *tmp, **p;
1011        unsigned long align = 1;
1012
1013        BUG_ON(in_interrupt());
1014        if (flags & VM_IOREMAP) {
1015                int bit = fls(size);
1016
1017                if (bit > IOREMAP_MAX_ORDER)
1018                        bit = IOREMAP_MAX_ORDER;
1019                else if (bit < PAGE_SHIFT)
1020                        bit = PAGE_SHIFT;
1021
1022                align = 1ul << bit;
1023        }
1024
1025        size = PAGE_ALIGN(size);
1026        if (unlikely(!size))
1027                return NULL;
1028
1029        area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1030        if (unlikely(!area))
1031                return NULL;
1032
1033        /*
1034         * We always allocate a guard page.
1035         */
1036        size += PAGE_SIZE;
1037
1038        va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1039        if (IS_ERR(va)) {
1040                kfree(area);
1041                return NULL;
1042        }
1043
1044        area->flags = flags;
1045        area->addr = (void *)va->va_start;
1046        area->size = size;
1047        area->pages = NULL;
1048        area->nr_pages = 0;
1049        area->phys_addr = 0;
1050        area->caller = caller;
1051        va->private = area;
1052        va->flags |= VM_VM_AREA;
1053
1054        write_lock(&vmlist_lock);
1055        for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1056                if (tmp->addr >= area->addr)
1057                        break;
1058        }
1059        area->next = *p;
1060        *p = area;
1061        write_unlock(&vmlist_lock);
1062
1063        return area;
1064}
1065
1066struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1067                                unsigned long start, unsigned long end)
1068{
1069        return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
1070                                                __builtin_return_address(0));
1071}
1072EXPORT_SYMBOL_GPL(__get_vm_area);
1073
1074/**
1075 *      get_vm_area  -  reserve a contiguous kernel virtual area
1076 *      @size:          size of the area
1077 *      @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
1078 *
1079 *      Search an area of @size in the kernel virtual mapping area,
1080 *      and reserved it for out purposes.  Returns the area descriptor
1081 *      on success or %NULL on failure.
1082 */
1083struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1084{
1085        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
1086                                -1, GFP_KERNEL, __builtin_return_address(0));
1087}
1088
1089struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1090                                void *caller)
1091{
1092        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
1093                                                -1, GFP_KERNEL, caller);
1094}
1095
1096struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1097                                   int node, gfp_t gfp_mask)
1098{
1099        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
1100                                  gfp_mask, __builtin_return_address(0));
1101}
1102
1103static struct vm_struct *find_vm_area(const void *addr)
1104{
1105        struct vmap_area *va;
1106
1107        va = find_vmap_area((unsigned long)addr);
1108        if (va && va->flags & VM_VM_AREA)
1109                return va->private;
1110
1111        return NULL;
1112}
1113
1114/**
1115 *      remove_vm_area  -  find and remove a continuous kernel virtual area
1116 *      @addr:          base address
1117 *
1118 *      Search for the kernel VM area starting at @addr, and remove it.
1119 *      This function returns the found VM area, but using it is NOT safe
1120 *      on SMP machines, except for its size or flags.
1121 */
1122struct vm_struct *remove_vm_area(const void *addr)
1123{
1124        struct vmap_area *va;
1125
1126        va = find_vmap_area((unsigned long)addr);
1127        if (va && va->flags & VM_VM_AREA) {
1128                struct vm_struct *vm = va->private;
1129                struct vm_struct *tmp, **p;
1130                free_unmap_vmap_area(va);
1131                vm->size -= PAGE_SIZE;
1132
1133                write_lock(&vmlist_lock);
1134                for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1135                        ;
1136                *p = tmp->next;
1137                write_unlock(&vmlist_lock);
1138
1139                return vm;
1140        }
1141        return NULL;
1142}
1143
1144static void __vunmap(const void *addr, int deallocate_pages)
1145{
1146        struct vm_struct *area;
1147
1148        if (!addr)
1149                return;
1150
1151        if ((PAGE_SIZE-1) & (unsigned long)addr) {
1152                WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
1153                return;
1154        }
1155
1156        area = remove_vm_area(addr);
1157        if (unlikely(!area)) {
1158                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1159                                addr);
1160                return;
1161        }
1162
1163        debug_check_no_locks_freed(addr, area->size);
1164        debug_check_no_obj_freed(addr, area->size);
1165
1166        if (deallocate_pages) {
1167                int i;
1168
1169                for (i = 0; i < area->nr_pages; i++) {
1170                        struct page *page = area->pages[i];
1171
1172                        BUG_ON(!page);
1173                        __free_page(page);
1174                }
1175
1176                if (area->flags & VM_VPAGES)
1177                        vfree(area->pages);
1178                else
1179                        kfree(area->pages);
1180        }
1181
1182        kfree(area);
1183        return;
1184}
1185
/**
 *	vfree  -  release memory allocated by vmalloc()
 *	@addr:		memory base address
 *
 *	Release the virtually continuous memory area that starts at @addr
 *	and was obtained from vmalloc(), vmalloc_32() or __vmalloc(),
 *	including its backing pages.  Passing %NULL is a no-op.
 *
 *	Must not be called in interrupt context (enforced with BUG_ON()).
 */
void vfree(const void *addr)
{
	const int free_pages_too = 1;

	BUG_ON(in_interrupt());
	__vunmap(addr, free_pages_too);
}
EXPORT_SYMBOL(vfree);
1202
/**
 *	vunmap  -  release virtual mapping obtained by vmap()
 *	@addr:		memory base address
 *
 *	Release the virtually contiguous memory area that starts at @addr
 *	and was created from the page array passed to vmap().  The pages
 *	themselves are not freed.
 *
 *	Must not be called in interrupt context (enforced with BUG_ON()).
 */
void vunmap(const void *addr)
{
	const int keep_pages = 0;

	BUG_ON(in_interrupt());
	__vunmap(addr, keep_pages);
}
EXPORT_SYMBOL(vunmap);
1218
1219/**
1220 *      vmap  -  map an array of pages into virtually contiguous space
1221 *      @pages:         array of page pointers
1222 *      @count:         number of pages to map
1223 *      @flags:         vm_area->flags
1224 *      @prot:          page protection for the mapping
1225 *
1226 *      Maps @count pages from @pages into contiguous kernel virtual
1227 *      space.
1228 */
1229void *vmap(struct page **pages, unsigned int count,
1230                unsigned long flags, pgprot_t prot)
1231{
1232        struct vm_struct *area;
1233
1234        if (count > num_physpages)
1235                return NULL;
1236
1237        area = get_vm_area_caller((count << PAGE_SHIFT), flags,
1238                                        __builtin_return_address(0));
1239        if (!area)
1240                return NULL;
1241
1242        if (map_vm_area(area, prot, &pages)) {
1243                vunmap(area->addr);
1244                return NULL;
1245        }
1246
1247        return area->addr;
1248}
1249EXPORT_SYMBOL(vmap);
1250
1251static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1252                            int node, void *caller);
1253static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1254                                 pgprot_t prot, int node, void *caller)
1255{
1256        struct page **pages;
1257        unsigned int nr_pages, array_size, i;
1258
1259        nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1260        array_size = (nr_pages * sizeof(struct page *));
1261
1262        area->nr_pages = nr_pages;
1263        /* Please note that the recursion is strictly bounded. */
1264        if (array_size > PAGE_SIZE) {
1265                pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
1266                                PAGE_KERNEL, node, caller);
1267                area->flags |= VM_VPAGES;
1268        } else {
1269                pages = kmalloc_node(array_size,
1270                                (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
1271                                node);
1272        }
1273        area->pages = pages;
1274        area->caller = caller;
1275        if (!area->pages) {
1276                remove_vm_area(area->addr);
1277                kfree(area);
1278                return NULL;
1279        }
1280
1281        for (i = 0; i < area->nr_pages; i++) {
1282                struct page *page;
1283
1284                if (node < 0)
1285                        page = alloc_page(gfp_mask);
1286                else
1287                        page = alloc_pages_node(node, gfp_mask, 0);
1288
1289                if (unlikely(!page)) {
1290                        /* Successfully allocated i pages, free them in __vunmap() */
1291                        area->nr_pages = i;
1292                        goto fail;
1293                }
1294                area->pages[i] = page;
1295        }
1296
1297        if (map_vm_area(area, prot, &pages))
1298                goto fail;
1299        return area->addr;
1300
1301fail:
1302        vfree(area->addr);
1303        return NULL;
1304}
1305
1306void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1307{
1308        return __vmalloc_area_node(area, gfp_mask, prot, -1,
1309                                        __builtin_return_address(0));
1310}
1311
1312/**
1313 *      __vmalloc_node  -  allocate virtually contiguous memory
1314 *      @size:          allocation size
1315 *      @gfp_mask:      flags for the page level allocator
1316 *      @prot:          protection mask for the allocated pages
1317 *      @node:          node to use for allocation or -1
1318 *      @caller:        caller's return address
1319 *
1320 *      Allocate enough pages to cover @size from the page level
1321 *      allocator with @gfp_mask flags.  Map them into contiguous
1322 *      kernel virtual space, using a pagetable protection of @prot.
1323 */
1324static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1325                                                int node, void *caller)
1326{
1327        struct vm_struct *area;
1328
1329        size = PAGE_ALIGN(size);
1330        if (!size || (size >> PAGE_SHIFT) > num_physpages)
1331                return NULL;
1332
1333        area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
1334                                                node, gfp_mask, caller);
1335
1336        if (!area)
1337                return NULL;
1338
1339        return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1340}
1341
1342void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1343{
1344        return __vmalloc_node(size, gfp_mask, prot, -1,
1345                                __builtin_return_address(0));
1346}
1347EXPORT_SYMBOL(__vmalloc);
1348
1349/**
1350 *      vmalloc  -  allocate virtually contiguous memory
1351 *      @size:          allocation size
1352 *      Allocate enough pages to cover @size from the page level
1353 *      allocator and map them into contiguous kernel virtual space.
1354 *
1355 *      For tight control over page level allocator and protection flags
1356 *      use __vmalloc() instead.
1357 */
1358void *vmalloc(unsigned long size)
1359{
1360        return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1361                                        -1, __builtin_return_address(0));
1362}
1363EXPORT_SYMBOL(vmalloc);
1364
1365/**
1366 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1367 * @size: allocation size
1368 *
1369 * The resulting memory area is zeroed so it can be mapped to userspace
1370 * without leaking data.
1371 */
1372void *vmalloc_user(unsigned long size)
1373{
1374        struct vm_struct *area;
1375        void *ret;
1376
1377        ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
1378        if (ret) {
1379                area = find_vm_area(ret);
1380                area->flags |= VM_USERMAP;
1381        }
1382        return ret;
1383}
1384EXPORT_SYMBOL(vmalloc_user);
1385
1386/**
1387 *      vmalloc_node  -  allocate memory on a specific node
1388 *      @size:          allocation size
1389 *      @node:          numa node
1390 *
1391 *      Allocate enough pages to cover @size from the page level
1392 *      allocator and map them into contiguous kernel virtual space.
1393 *
1394 *      For tight control over page level allocator and protection flags
1395 *      use __vmalloc() instead.
1396 */
1397void *vmalloc_node(unsigned long size, int node)
1398{
1399        return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1400                                        node, __builtin_return_address(0));
1401}
1402EXPORT_SYMBOL(vmalloc_node);
1403
1404#ifndef PAGE_KERNEL_EXEC
1405# define PAGE_KERNEL_EXEC PAGE_KERNEL
1406#endif
1407
1408/**
1409 *      vmalloc_exec  -  allocate virtually contiguous, executable memory
1410 *      @size:          allocation size
1411 *
1412 *      Kernel-internal function to allocate enough pages to cover @size
1413 *      the page level allocator and map them into contiguous and
1414 *      executable kernel virtual space.
1415 *
1416 *      For tight control over page level allocator and protection flags
1417 *      use __vmalloc() instead.
1418 */
1419
1420void *vmalloc_exec(unsigned long size)
1421{
1422        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
1423}
1424
/*
 * Allocator flags used by vmalloc_32() and vmalloc_32_user().  The
 * expansions are parenthesized so the macro behaves as a single operand
 * whatever operator it is combined with at the use site.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 (GFP_KERNEL)
#endif
1432
1433/**
1434 *      vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
1435 *      @size:          allocation size
1436 *
1437 *      Allocate enough 32bit PA addressable pages to cover @size from the
1438 *      page level allocator and map them into contiguous kernel virtual space.
1439 */
1440void *vmalloc_32(unsigned long size)
1441{
1442        return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL);
1443}
1444EXPORT_SYMBOL(vmalloc_32);
1445
1446/**
1447 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1448 *      @size:          allocation size
1449 *
1450 * The resulting memory area is 32bit addressable and zeroed so it can be
1451 * mapped to userspace without leaking data.
1452 */
1453void *vmalloc_32_user(unsigned long size)
1454{
1455        struct vm_struct *area;
1456        void *ret;
1457
1458        ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
1459        if (ret) {
1460                area = find_vm_area(ret);
1461                area->flags |= VM_USERMAP;
1462        }
1463        return ret;
1464}
1465EXPORT_SYMBOL(vmalloc_32_user);
1466
1467long vread(char *buf, char *addr, unsigned long count)
1468{
1469        struct vm_struct *tmp;
1470        char *vaddr, *buf_start = buf;
1471        unsigned long n;
1472
1473        /* Don't allow overflow */
1474        if ((unsigned long) addr + count < count)
1475                count = -(unsigned long) addr;
1476
1477        read_lock(&vmlist_lock);
1478        for (tmp = vmlist; tmp; tmp = tmp->next) {
1479                vaddr = (char *) tmp->addr;
1480                if (addr >= vaddr + tmp->size - PAGE_SIZE)
1481                        continue;
1482                while (addr < vaddr) {
1483                        if (count == 0)
1484                                goto finished;
1485                        *buf = '\0';
1486                        buf++;
1487                        addr++;
1488                        count--;
1489                }
1490                n = vaddr + tmp->size - PAGE_SIZE - addr;
1491                do {
1492                        if (count == 0)
1493                                goto finished;
1494                        *buf = *addr;
1495                        buf++;
1496                        addr++;
1497                        count--;
1498                } while (--n > 0);
1499        }
1500finished:
1501        read_unlock(&vmlist_lock);
1502        return buf - buf_start;
1503}
1504
1505long vwrite(char *buf, char *addr, unsigned long count)
1506{
1507        struct vm_struct *tmp;
1508        char *vaddr, *buf_start = buf;
1509        unsigned long n;
1510
1511        /* Don't allow overflow */
1512        if ((unsigned long) addr + count < count)
1513                count = -(unsigned long) addr;
1514
1515        read_lock(&vmlist_lock);
1516        for (tmp = vmlist; tmp; tmp = tmp->next) {
1517                vaddr = (char *) tmp->addr;
1518                if (addr >= vaddr + tmp->size - PAGE_SIZE)
1519                        continue;
1520                while (addr < vaddr) {
1521                        if (count == 0)
1522                                goto finished;
1523                        buf++;
1524                        addr++;
1525                        count--;
1526                }
1527                n = vaddr + tmp->size - PAGE_SIZE - addr;
1528                do {
1529                        if (count == 0)
1530                                goto finished;
1531                        *addr = *buf;
1532                        buf++;
1533                        addr++;
1534                        count--;
1535                } while (--n > 0);
1536        }
1537finished:
1538        read_unlock(&vmlist_lock);
1539        return buf - buf_start;
1540}
1541
1542/**
1543 *      remap_vmalloc_range  -  map vmalloc pages to userspace
1544 *      @vma:           vma to cover (map full range of vma)
1545 *      @addr:          vmalloc memory
1546 *      @pgoff:         number of pages into addr before first page to map
1547 *
1548 *      Returns:        0 for success, -Exxx on failure
1549 *
1550 *      This function checks that addr is a valid vmalloc'ed area, and
1551 *      that it is big enough to cover the vma. Will return failure if
1552 *      that criteria isn't met.
1553 *
1554 *      Similar to remap_pfn_range() (see mm/memory.c)
1555 */
1556int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1557                                                unsigned long pgoff)
1558{
1559        struct vm_struct *area;
1560        unsigned long uaddr = vma->vm_start;
1561        unsigned long usize = vma->vm_end - vma->vm_start;
1562
1563        if ((PAGE_SIZE-1) & (unsigned long)addr)
1564                return -EINVAL;
1565
1566        area = find_vm_area(addr);
1567        if (!area)
1568                return -EINVAL;
1569
1570        if (!(area->flags & VM_USERMAP))
1571                return -EINVAL;
1572
1573        if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
1574                return -EINVAL;
1575
1576        addr += pgoff << PAGE_SHIFT;
1577        do {
1578                struct page *page = vmalloc_to_page(addr);
1579                int ret;
1580
1581                ret = vm_insert_page(vma, uaddr, page);
1582                if (ret)
1583                        return ret;
1584
1585                uaddr += PAGE_SIZE;
1586                addr += PAGE_SIZE;
1587                usize -= PAGE_SIZE;
1588        } while (usize > 0);
1589
1590        /* Prevent "things" like memory migration? VM_flags need a cleanup... */
1591        vma->vm_flags |= VM_RESERVED;
1592
1593        return 0;
1594}
1595EXPORT_SYMBOL(remap_vmalloc_range);
1596
/*
 * Default no-op vmalloc_sync_all().  It is declared weak so that an
 * architecture supplying its own definition overrides this stub.
 */
void __attribute__((weak)) vmalloc_sync_all(void)
{
	/* intentionally empty */
}
1604
1605
1606static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
1607{
1608        /* apply_to_page_range() does all the hard work. */
1609        return 0;
1610}
1611
1612/**
1613 *      alloc_vm_area - allocate a range of kernel address space
1614 *      @size:          size of the area
1615 *
1616 *      Returns:        NULL on failure, vm_struct on success
1617 *
1618 *      This function reserves a range of kernel address space, and
1619 *      allocates pagetables to map that range.  No actual mappings
1620 *      are created.  If the kernel address space is not shared
1621 *      between processes, it syncs the pagetable across all
1622 *      processes.
1623 */
1624struct vm_struct *alloc_vm_area(size_t size)
1625{
1626        struct vm_struct *area;
1627
1628        area = get_vm_area_caller(size, VM_IOREMAP,
1629                                __builtin_return_address(0));
1630        if (area == NULL)
1631                return NULL;
1632
1633        /*
1634         * This ensures that page tables are constructed for this region
1635         * of kernel virtual address space and mapped into init_mm.
1636         */
1637        if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
1638                                area->size, f, NULL)) {
1639                free_vm_area(area);
1640                return NULL;
1641        }
1642
1643        /* Make sure the pagetables are constructed in process kernel
1644           mappings */
1645        vmalloc_sync_all();
1646
1647        return area;
1648}
1649EXPORT_SYMBOL_GPL(alloc_vm_area);
1650
1651void free_vm_area(struct vm_struct *area)
1652{
1653        struct vm_struct *ret;
1654        ret = remove_vm_area(area->addr);
1655        BUG_ON(ret != area);
1656        kfree(area);
1657}
1658EXPORT_SYMBOL_GPL(free_vm_area);
1659
1660
1661#ifdef CONFIG_PROC_FS
1662static void *s_start(struct seq_file *m, loff_t *pos)
1663{
1664        loff_t n = *pos;
1665        struct vm_struct *v;
1666
1667        read_lock(&vmlist_lock);
1668        v = vmlist;
1669        while (n > 0 && v) {
1670                n--;
1671                v = v->next;
1672        }
1673        if (!n)
1674                return v;
1675
1676        return NULL;
1677
1678}
1679
1680static void *s_next(struct seq_file *m, void *p, loff_t *pos)
1681{
1682        struct vm_struct *v = p;
1683
1684        ++*pos;
1685        return v->next;
1686}
1687
1688static void s_stop(struct seq_file *m, void *p)
1689{
1690        read_unlock(&vmlist_lock);
1691}
1692
1693static void show_numa_info(struct seq_file *m, struct vm_struct *v)
1694{
1695        if (NUMA_BUILD) {
1696                unsigned int nr, *counters = m->private;
1697
1698                if (!counters)
1699                        return;
1700
1701                memset(counters, 0, nr_node_ids * sizeof(unsigned int));
1702
1703                for (nr = 0; nr < v->nr_pages; nr++)
1704                        counters[page_to_nid(v->pages[nr])]++;
1705
1706                for_each_node_state(nr, N_HIGH_MEMORY)
1707                        if (counters[nr])
1708                                seq_printf(m, " N%u=%u", nr, counters[nr]);
1709        }
1710}
1711
1712static int s_show(struct seq_file *m, void *p)
1713{
1714        struct vm_struct *v = p;
1715
1716        seq_printf(m, "0x%p-0x%p %7ld",
1717                v->addr, v->addr + v->size, v->size);
1718
1719        if (v->caller) {
1720                char buff[KSYM_SYMBOL_LEN];
1721
1722                seq_putc(m, ' ');
1723                sprint_symbol(buff, (unsigned long)v->caller);
1724                seq_puts(m, buff);
1725        }
1726
1727        if (v->nr_pages)
1728                seq_printf(m, " pages=%d", v->nr_pages);
1729
1730        if (v->phys_addr)
1731                seq_printf(m, " phys=%lx", v->phys_addr);
1732
1733        if (v->flags & VM_IOREMAP)
1734                seq_printf(m, " ioremap");
1735
1736        if (v->flags & VM_ALLOC)
1737                seq_printf(m, " vmalloc");
1738
1739        if (v->flags & VM_MAP)
1740                seq_printf(m, " vmap");
1741
1742        if (v->flags & VM_USERMAP)
1743                seq_printf(m, " user");
1744
1745        if (v->flags & VM_VPAGES)
1746                seq_printf(m, " vpages");
1747
1748        show_numa_info(m, v);
1749        seq_putc(m, '\n');
1750        return 0;
1751}
1752
1753static const struct seq_operations vmalloc_op = {
1754        .start = s_start,
1755        .next = s_next,
1756        .stop = s_stop,
1757        .show = s_show,
1758};
1759
1760static int vmalloc_open(struct inode *inode, struct file *file)
1761{
1762        unsigned int *ptr = NULL;
1763        int ret;
1764
1765        if (NUMA_BUILD)
1766                ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
1767        ret = seq_open(file, &vmalloc_op);
1768        if (!ret) {
1769                struct seq_file *m = file->private_data;
1770                m->private = ptr;
1771        } else
1772                kfree(ptr);
1773        return ret;
1774}
1775
1776static const struct file_operations proc_vmalloc_operations = {
1777        .open           = vmalloc_open,
1778        .read           = seq_read,
1779        .llseek         = seq_lseek,
1780        .release        = seq_release_private,
1781};
1782
1783static int __init proc_vmalloc_init(void)
1784{
1785        proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
1786        return 0;
1787}
1788module_init(proc_vmalloc_init);
1789#endif
1790
1791