linux-old/mm/page_alloc.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/page_alloc.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *  Swap reorganised 29.12.95, Stephen Tweedie
   6 */
   7
   8#include <linux/config.h>
   9#include <linux/mm.h>
  10#include <linux/kernel_stat.h>
  11#include <linux/swap.h>
  12#include <linux/swapctl.h>
  13#include <linux/interrupt.h>
  14#include <linux/init.h>
  15#include <linux/pagemap.h>
  16
  17#include <asm/dma.h>
  18#include <asm/uaccess.h> /* for copy_to/from_user */
  19#include <asm/pgtable.h>
  20
  /* Swap page counter; defined here but not otherwise touched in this file. */
  int nr_swap_pages = 0;

  /*
   * Number of pages currently on the free lists.  Updated by the
   * __free_pages_ok() and RMQUEUE_TYPE() macros below.
   */
  int nr_free_pages = 0;
  23
  24/*
  25 * Free area management
  26 *
  27 * The free_area_list arrays point to the queue heads of the free areas
  28 * of different sizes
  29 */
  30
  /*
   * NR_MEM_LISTS: number of block orders kept per memory type.
   * The AP1000 (AP+) needs to allocate 8MB contiguous, aligned chunks of
   * RAM for its ring buffers, so it gets two extra orders.
   */
  #if !CONFIG_AP1000
  #define NR_MEM_LISTS 10
  #else
  #define NR_MEM_LISTS 12
  #endif

  /* NR_MEM_TYPES: GFP_DMA memory versus everything else, for now. */
  #define NR_MEM_TYPES 2
  39
  /*
   * Head of one free list.
   *
   * The leading next/prev members MUST match the start of "struct page":
   * a pointer to this structure is cast to a struct page pointer so that
   * the head can be linked into the same circular list as the pages.
   */
  struct free_area_struct {
          struct page *next;      /* first entry, or the head itself when empty */
          struct page *prev;      /* last entry, or the head itself when empty */
          unsigned int *map;      /* bitmap toggled as blocks are taken and freed */
          unsigned long count;    /* number of entries currently queued */
  };
  47
  48#define memory_head(x) ((struct page *)(x))
  49
  50static struct free_area_struct free_area[NR_MEM_TYPES][NR_MEM_LISTS];
  51
  52static inline void init_mem_queue(struct free_area_struct * head)
  53{
  54        head->next = memory_head(head);
  55        head->prev = memory_head(head);
  56}
  57
  58static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
  59{
  60        struct page * next = head->next;
  61
  62        entry->prev = memory_head(head);
  63        entry->next = next;
  64        next->prev = entry;
  65        head->next = entry;
  66        head->count++;
  67}
  68
  69static inline void remove_mem_queue(struct page * entry)
  70{
  71        struct page * next = entry->next;
  72        struct page * prev = entry->prev;
  73        next->prev = prev;
  74        prev->next = next;
  75}
  76
  77/*
  78 * Free_page() adds the page to the free lists. This is optimized for
  79 * fast normal cases (no error jumps taken normally).
  80 *
  81 * The way to optimize jumps for gcc-2.2.2 is to:
  82 *  - select the "normal" case and put it inside the if () { XXX }
  83 *  - no else-statements if you can avoid them
  84 *
  85 * With the above two rules, you get a straight-line execution path
  86 * for the normal case, giving better asm-code.
  87 */
  88
  89/*
  90 * Buddy system. Hairy. You really aren't expected to understand this
  91 *
  92 * Hint: -mask = 1+~mask
  93 */
  94spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
  95
  /* Page descriptor for page-frame number @x. */
  #define list(x) (mem_map+(x))

  /*
   * Put the block starting at page-frame @map_nr on the free list of @area,
   * first merging it with its buddy for as long as the buddy pair's bit in
   * the bitmap says a merge is possible.
   *
   * All four arguments must be lvalues; they are modified while merging:
   *   @map_nr  page-frame number of the block (masked down on each merge)
   *   @mask    (~0UL) << order of the block   (shifted left on each merge)
   *   @area    free list for the current order (advanced on each merge)
   *   @index   bit number of the buddy pair in @area->map (halved on each merge)
   *
   * Because -mask == 1 << order, "nr_free_pages -= mask" ADDS the block size
   * and "map_nr ^ -mask" is the page-frame number of the buddy.
   *
   * Merging stops when the toggled bit was previously clear, or when
   * mask + (1 << (NR_MEM_LISTS-1)) reaches zero, i.e. at the largest order.
   *
   * The body is wrapped in do { } while (0) so that the macro is a single
   * statement and behaves correctly under an unbraced if/else.  No locking
   * is done here.
   */
  #define __free_pages_ok(map_nr, mask, area, index)              \
  do {                                                            \
          nr_free_pages -= (mask);                                \
          while ((mask) + (1 << (NR_MEM_LISTS-1))) {              \
                  if (!test_and_change_bit((index), (area)->map)) \
                          break;                                  \
                  (area)->count--;                                \
                  remove_mem_queue(list((map_nr) ^ -(mask)));     \
                  (mask) <<= 1;                                   \
                  (area)++;                                       \
                  (index) >>= 1;                                  \
                  (map_nr) &= (mask);                             \
          }                                                       \
          add_mem_queue(area, list(map_nr));                      \
  } while (0)
 110
 111static void free_local_pages(struct page * page) {
 112        unsigned long order = page->offset;
 113        unsigned int type = PageDMA(page) ? 1 : 0;
 114        struct free_area_struct *area;
 115        unsigned long map_nr = page - mem_map;
 116        unsigned long mask = (~0UL) << order;
 117        unsigned long index = map_nr >> (1 + order);
 118
 119        area = free_area[type] + order;
 120        __free_pages_ok(map_nr, mask, area, index);
 121}
 122
 123static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
 124{
 125        struct free_area_struct *area;
 126        unsigned long index;
 127        unsigned long mask;
 128        unsigned long flags;
 129        struct page * page;
 130
 131        if (current->flags & PF_FREE_PAGES)
 132                goto local_freelist;
 133 back_local_freelist:
 134
 135        index = map_nr >> (1 + order);
 136        mask = (~0UL) << order;
 137        map_nr &= mask;
 138
 139        spin_lock_irqsave(&page_alloc_lock, flags);
 140        area = free_area[type] + order;
 141        __free_pages_ok(map_nr, mask, area, index);
 142        spin_unlock_irqrestore(&page_alloc_lock, flags);
 143        return;
 144
 145 local_freelist:
 146        /*
 147         * This is a little subtle: if the allocation order
 148         * wanted is major than zero we'd better take all the pages
 149         * local since we must deal with fragmentation too and we
 150         * can't rely on the nr_local_pages information.
 151         */
 152        if ((current->nr_local_pages && !current->allocation_order) ||
 153            in_interrupt())
 154                goto back_local_freelist;
 155
 156        page = mem_map + map_nr;
 157        list_add((struct list_head *) page, &current->local_pages);
 158        page->offset = order;
 159        current->nr_local_pages++;
 160}
 161
 162void __free_pages(struct page *page, unsigned long order)
 163{
 164        if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
 165                if (PageSwapCache(page))
 166                        panic ("Freeing swap cache page");
 167                page->flags &= ~(1 << PG_referenced);
 168                free_pages_ok(page - mem_map, order, PageDMA(page) ? 1 : 0);
 169                return;
 170        }
 171}
 172
 173void free_pages(unsigned long addr, unsigned long order)
 174{
 175        unsigned long map_nr = MAP_NR(addr);
 176
 177        if (map_nr < max_mapnr)
 178                __free_pages(mem_map + map_nr, order);
 179}
 180
 181/*
 182 * Some ugly macros to speed up __get_free_pages()..
 183 */
 /* Toggle the bit for the buddy pair containing page @index at @order. */
 #define MARK_USED(index, order, area) \
         change_bit((index) >> (1+(order)), (area)->map)

 /* Address for page-frame number @x: PAGE_OFFSET plus its byte offset. */
 #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
 /*
  * Try to take a block of 2^@order pages from the free lists of memory
  * type @type, searching from @order upwards through NR_MEM_LISTS-1.
  *
  * On success the macro does NOT fall through.  It unlinks the first block
  * found, updates the bitmap, nr_free_pages and the list count, gives the
  * unused part of a larger block back via EXPAND(), releases
  * page_alloc_lock and RETURNS ADDRESS(map_nr) from the calling function.
  * If every list is empty, control simply continues after the macro.
  *
  * Hidden requirements on the caller: a variable named "flags" holding the
  * saved interrupt state must be in scope, page_alloc_lock must be held,
  * and the enclosing function must return an address-sized integer.
  *
  * The parameters are parenthesized at every use so that an expression
  * argument cannot change the meaning of the expansion.  The macro
  * declares locals named area, new_order, prev, ret and map_nr; do not
  * pass expressions that use those names.
  */
 #define RMQUEUE_TYPE(order, type) \
 do { struct free_area_struct * area = free_area[(type)]+(order); \
      unsigned long new_order = (order); \
         do { struct page *prev = memory_head(area), *ret = prev->next; \
                 if (memory_head(area) != ret) { \
                         unsigned long map_nr; \
                         (prev->next = ret->next)->prev = prev; \
                         map_nr = ret - mem_map; \
                         MARK_USED(map_nr, new_order, area); \
                         nr_free_pages -= 1 << (order); \
                         area->count--; \
                         EXPAND(ret, map_nr, (order), new_order, area); \
                         spin_unlock_irqrestore(&page_alloc_lock, flags); \
                         return ADDRESS(map_nr); \
                 } \
                 new_order++; area++; \
         } while (new_order < NR_MEM_LISTS); \
 } while (0)
 205
 /*
  * Split a block of order @high down to order @low.
  *
  * On each step the order is lowered by one, the first half of the block
  * is queued on the free list for that order and its buddy bit toggled,
  * and @map/@index move on to the second half.  When the loop ends, @map
  * points at the page that is handed out and its use count is set to 1.
  *
  * @map, @index, @high and @area must be lvalues; all four are modified.
  * @area must point at the free list for order @high on entry.
  *
  * The parameters are parenthesized at every use so that an expression
  * argument (for @low in particular) cannot change the meaning of the
  * expansion.
  */
 #define EXPAND(map,index,low,high,area) \
 do { unsigned long size = 1 << (high); \
         while ((high) > (low)) { \
                 (area)--; (high)--; size >>= 1; \
                 add_mem_queue((area), (map)); \
                 MARK_USED((index), (high), (area)); \
                 (index) += size; \
                 (map) += size; \
         } \
         atomic_set(&(map)->count, 1); \
 } while (0)
 217
 218static void refile_local_pages(void)
 219{
 220        if (current->nr_local_pages) {
 221                struct page * page;
 222                struct list_head * entry;
 223                int nr_pages = current->nr_local_pages;
 224
 225                while ((entry = current->local_pages.next) != &current->local_pages) {
 226                        list_del(entry);
 227                        page = (struct page *) entry;
 228                        free_local_pages(page);
 229                        if (!nr_pages--)
 230                                panic("__get_free_pages local_pages list corrupted I");
 231                }
 232                if (nr_pages)
 233                        panic("__get_free_pages local_pages list corrupted II");
 234                current->nr_local_pages = 0;
 235        }
 236}
 237
 238unsigned long __get_free_pages(int gfp_mask, unsigned long order)
 239{
 240        unsigned long flags;
 241
 242        if (order >= NR_MEM_LISTS)
 243                goto out;
 244
 245#ifdef ATOMIC_MEMORY_DEBUGGING
 246        if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
 247                static int count = 0;
 248                if (++count < 5) {
 249                        printk("gfp called nonatomically from interrupt %p\n",
 250                                __builtin_return_address(0));
 251                }
 252                goto out;
 253        }
 254#endif
 255
 256        /*
 257         * Acquire lock before reading nr_free_pages to make sure it
 258         * won't change from under us.
 259         */
 260        spin_lock_irqsave(&page_alloc_lock, flags);
 261
 262        /*
 263         * If this is a recursive call, we'd better
 264         * do our best to just allocate things without
 265         * further thought.
 266         */
 267        if (!(current->flags & PF_MEMALLOC)) {
 268                extern struct wait_queue * kswapd_wait;
 269
 270                if (nr_free_pages > freepages.low)
 271                        goto ok_to_allocate;
 272
 273                if (waitqueue_active(&kswapd_wait))
 274                        wake_up_interruptible(&kswapd_wait);
 275
 276                /* Do we have to block or can we proceed? */
 277                if (nr_free_pages > freepages.min)
 278                        goto ok_to_allocate;
 279                if (gfp_mask & __GFP_WAIT) {
 280                        int freed;
 281                        /*
 282                         * If the task is ok to sleep it's fine also
 283                         * if we release irq here.
 284                         */
 285                        spin_unlock_irq(&page_alloc_lock);
 286
 287                        current->flags |= PF_MEMALLOC|PF_FREE_PAGES;
 288                        current->allocation_order = order;
 289                        freed = try_to_free_pages(gfp_mask);
 290                        current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES);
 291
 292                        spin_lock_irq(&page_alloc_lock);
 293                        refile_local_pages();
 294
 295                        /*
 296                         * Re-check we're still low on memory after we blocked
 297                         * for some time. Somebody may have released lots of
 298                         * memory from under us while we was trying to free
 299                         * the pages. We check against pages_high to be sure
 300                         * to succeed only if lots of memory is been released.
 301                         */
 302                        if (nr_free_pages > freepages.high)
 303                                goto ok_to_allocate;
 304
 305                        if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 306                                goto nopage;
 307                }
 308        }
 309ok_to_allocate:
 310        /* if it's not a dma request, try non-dma first */
 311        if (!(gfp_mask & __GFP_DMA))
 312                RMQUEUE_TYPE(order, 0);
 313        RMQUEUE_TYPE(order, 1);
 314 nopage:
 315        spin_unlock_irqrestore(&page_alloc_lock, flags);
 316 out:
 317        return 0;
 318}
 319
 320/*
 321 * Show free area list (used inside shift_scroll-lock stuff)
 322 * We also calculate the percentage fragmentation. We do this by counting the
 323 * memory on each free list with the exception of the first item on the list.
 324 */
 325void show_free_areas(void)
 326{
 327        unsigned long order, flags;
 328        unsigned type;
 329
 330        spin_lock_irqsave(&page_alloc_lock, flags);
 331        printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 332        printk("Free: %d (%d %d %d)\n",
 333                nr_free_pages,
 334                freepages.min,
 335                freepages.low,
 336                freepages.high);
 337        for (type = 0; type < NR_MEM_TYPES; type++) {
 338                unsigned long total = 0;
 339                printk("%sDMA: ", type ? "" : "Non");
 340                for (order=0 ; order < NR_MEM_LISTS; order++) {
 341                        unsigned long nr = free_area[type][order].count;
 342
 343                        total += nr * ((PAGE_SIZE>>10) << order);
 344                        printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
 345                }
 346                printk("= %lukB)\n", total);
 347        }
 348        spin_unlock_irqrestore(&page_alloc_lock, flags);
 349#ifdef SWAP_CACHE_INFO
 350        show_swap_cache_info();
 351#endif  
 352}
 353
 /* Round @x up to the next multiple of sizeof(long). */
 #define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
 355
 356/*
 357 * set up the free-area data structures:
 358 *   - mark all pages reserved
 359 *   - mark all memory queues empty
 360 *   - clear the memory bitmaps
 361 */
 362unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_mem)
 363{
 364        mem_map_t * p;
 365        unsigned long i, j;
 366
 367        /*
 368         * Select nr of pages we try to keep free for important stuff
 369         * with a minimum of 10 pages and a maximum of 256 pages, so
 370         * that we don't waste too much memory on large systems.
 371         * This is fairly arbitrary, but based on some behaviour
 372         * analysis.
 373         */
 374        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
 375        if (i < 10)
 376                i = 10;
 377        if (i > 256)
 378                i = 256;
 379        freepages.min = i;
 380        freepages.low = i * 2;
 381        freepages.high = i * 3;
 382        mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
 383        p = mem_map + MAP_NR(end_mem);
 384        start_mem = LONG_ALIGN((unsigned long) p);
 385        memset(mem_map, 0, start_mem - (unsigned long) mem_map);
 386        do {
 387                --p;
 388                atomic_set(&p->count, 0);
 389                p->flags = (1 << PG_DMA) | (1 << PG_reserved);
 390        } while (p > mem_map);
 391
 392        for (j = 0; j < NR_MEM_TYPES; j++) {
 393                unsigned long mask = PAGE_MASK;
 394                for (i = 0 ; i < NR_MEM_LISTS ; i++) {
 395                        unsigned long bitmap_size;
 396                        init_mem_queue(free_area[j]+i);
 397                        mask += mask;
 398                        end_mem = (end_mem + ~mask) & mask;
 399                        bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
 400                        bitmap_size = (bitmap_size + 7) >> 3;
 401                        bitmap_size = LONG_ALIGN(bitmap_size);
 402                        free_area[j][i].map = (unsigned int *) start_mem;
 403                        memset((void *) start_mem, 0, bitmap_size);
 404                        start_mem += bitmap_size;
 405                }
 406        }
 407        return start_mem;
 408}
 409
 410/* 
 411 * Primitive swap readahead code. We simply read an aligned block of
 412 * (1 << page_cluster) entries in the swap area. This method is chosen
 413 * because it doesn't cost us any seek time.  We also make sure to queue
 414 * the 'original' request together with the readahead ones...  
 415 */
 416void swapin_readahead(unsigned long entry)
 417{
 418        int i;
 419        struct page *new_page;
 420        unsigned long offset = SWP_OFFSET(entry);
 421        struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
 422        
 423        offset = (offset >> page_cluster) << page_cluster;
 424
 425        i = 1 << page_cluster;
 426        do {
 427                /* Don't read-ahead past the end of the swap area */
 428                if (offset >= swapdev->max)
 429                        break;
 430                /* Don't block on I/O for read-ahead */
 431                if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
 432                        break;
 433                /* Don't read in bad or busy pages */
 434                if (!swapdev->swap_map[offset])
 435                        break;
 436                if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
 437                        break;
 438                if (test_bit(offset, swapdev->swap_lockmap))
 439                        break;
 440
 441                /* Ok, do the async read-ahead now */
 442                new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
 443                if (new_page != NULL)
 444                        __free_page(new_page);
 445                offset++;
 446        } while (--i);
 447        return;
 448}
 449
 450/*
 451 * The tests may look silly, but it essentially makes sure that
 452 * no other process did a swap-in on us just as we were waiting.
 453 *
 454 * Also, don't bother to add to the swap cache if this page-in
 455 * was due to a write access.
 456 */
 457int swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
 458        pte_t * page_table, unsigned long entry, int write_access)
 459{
 460        unsigned long page;
 461        struct page *page_map = lookup_swap_cache(entry);
 462
 463        if (!page_map) {
 464                swapin_readahead(entry);
 465                page_map = read_swap_cache(entry);
 466        }
 467        if (pte_val(*page_table) != entry) {
 468                if (page_map)
 469                        free_page_and_swap_cache(page_address(page_map));
 470                return 1;
 471        }
 472        if (!page_map)
 473                return -1;
 474
 475        page = page_address(page_map);
 476        vma->vm_mm->rss++;
 477        tsk->min_flt++;
 478        swap_free(entry);
 479
 480        if (!write_access || is_page_shared(page_map)) {
 481                set_pte(page_table, mk_pte(page, vma->vm_page_prot));
 482                return 1;
 483        }
 484
 485        /*
 486         * The page is unshared and we're going to dirty it - so tear
 487         * down the swap cache and give exclusive access to the page to
 488         * this process.
 489         */
 490        delete_from_swap_cache(page_map);
 491        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
 492        return 1;
 493}
 494
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.