linux-bk/mm/page_alloc.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/page_alloc.c
   3 *
   4 *  Manages the free list, the system allocates free pages here.
   5 *  Note that kmalloc() lives in slab.c
   6 *
   7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8 *  Swap reorganised 29.12.95, Stephen Tweedie
   9 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13 */
  14
  15#include <linux/config.h>
  16#include <linux/kernel_stat.h>
  17#include <linux/mm.h>
  18#include <linux/swap.h>
  19#include <linux/interrupt.h>
  20#include <linux/pagemap.h>
  21#include <linux/bootmem.h>
  22#include <linux/compiler.h>
  23#include <linux/module.h>
  24#include <linux/suspend.h>
  25#include <linux/pagevec.h>
  26#include <linux/blkdev.h>
  27
  28unsigned long totalram_pages;
  29unsigned long totalhigh_pages;
  30int nr_swap_pages;
  31pg_data_t *pgdat_list;
  32
  33/*
  34 * Used by page_zone() to look up the address of the struct zone whose
  35 * id is encoded in the upper bits of page->flags
  36 */
  37struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
  38EXPORT_SYMBOL(zone_table);
  39
  40static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
  41static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
  42static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
  43static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
  44
  45/*
  46 * Temporary debugging check for pages not lying within a given zone.
  47 */
  48static inline int bad_range(struct zone *zone, struct page *page)
  49{
  50        if (page_to_pfn(page) >= zone->zone_start_pfn + zone->size)
  51                return 1;
  52        if (page_to_pfn(page) < zone->zone_start_pfn)
  53                return 1;
  54        if (zone != page_zone(page))
  55                return 1;
  56        return 0;
  57}
  58
  59/*
  60 * Freeing function for a buddy system allocator.
  61 *
  62 * The concept of a buddy system is to maintain direct-mapped table
  63 * (containing bit values) for memory blocks of various "orders".
  64 * The bottom level table contains the map for the smallest allocatable
  65 * units of memory (here, pages), and each level above it describes
  66 * pairs of units from the levels below, hence, "buddies".
  67 * At a high level, all that happens here is marking the table entry
  68 * at the bottom level available, and propagating the changes upward
  69 * as necessary, plus some accounting needed to play nicely with other
  70 * parts of the VM system.
  71 * At each level, we keep one bit for each pair of blocks, which
  72 * is set to 1 iff only one of the pair is allocated.  So when we
  73 * are allocating or freeing one, we can derive the state of the
  74 * other.  That is, if we allocate a small block, and both were   
  75 * free, the remainder of the region must be split into blocks.   
  76 * If a block is freed, and its buddy is also free, then this
  77 * triggers coalescing into a block of larger size.            
  78 *
  79 * -- wli
  80 */
  81
  82void __free_pages_ok (struct page *page, unsigned int order)
  83{
  84        unsigned long index, page_idx, mask, flags;
  85        free_area_t *area;
  86        struct page *base;
  87        struct zone *zone;
  88
  89        KERNEL_STAT_ADD(pgfree, 1<<order);
  90
  91        BUG_ON(PageLRU(page));
  92        BUG_ON(PagePrivate(page));
  93        BUG_ON(page->mapping != NULL);
  94        BUG_ON(PageLocked(page));
  95        BUG_ON(PageActive(page));
  96        BUG_ON(PageWriteback(page));
  97        BUG_ON(page->pte.direct != 0);
  98        if (PageDirty(page))
  99                ClearPageDirty(page);
 100        BUG_ON(page_count(page) != 0);
 101
 102        if (unlikely(current->flags & PF_FREE_PAGES)) {
 103                if (!current->nr_local_pages && !in_interrupt()) {
 104                        list_add(&page->list, &current->local_pages);
 105                        page->index = order;
 106                        current->nr_local_pages++;
 107                        goto out;
 108                }
 109        }
 110
 111        zone = page_zone(page);
 112
 113        mask = (~0UL) << order;
 114        base = zone->zone_mem_map;
 115        page_idx = page - base;
 116        if (page_idx & ~mask)
 117                BUG();
 118        index = page_idx >> (1 + order);
 119        area = zone->free_area + order;
 120
 121        spin_lock_irqsave(&zone->lock, flags);
 122        zone->free_pages -= mask;
 123        while (mask + (1 << (MAX_ORDER-1))) {
 124                struct page *buddy1, *buddy2;
 125
 126                BUG_ON(area >= zone->free_area + MAX_ORDER);
 127                if (!__test_and_change_bit(index, area->map))
 128                        /*
 129                         * the buddy page is still allocated.
 130                         */
 131                        break;
 132                /*
 133                 * Move the buddy up one level.
 134                 * This code is taking advantage of the identity:
 135                 *      -mask = 1+~mask
 136                 */
 137                buddy1 = base + (page_idx ^ -mask);
 138                buddy2 = base + page_idx;
 139                BUG_ON(bad_range(zone, buddy1));
 140                BUG_ON(bad_range(zone, buddy2));
 141                list_del(&buddy1->list);
 142                mask <<= 1;
 143                area++;
 144                index >>= 1;
 145                page_idx &= mask;
 146        }
 147        list_add(&(base + page_idx)->list, &area->free_list);
 148        spin_unlock_irqrestore(&zone->lock, flags);
 149out:
 150        return;
 151}
 152
 153#define MARK_USED(index, order, area) \
 154        __change_bit((index) >> (1+(order)), (area)->map)
 155
 156static inline struct page *
 157expand(struct zone *zone, struct page *page,
 158         unsigned long index, int low, int high, free_area_t * area)
 159{
 160        unsigned long size = 1 << high;
 161
 162        while (high > low) {
 163                BUG_ON(bad_range(zone, page));
 164                area--;
 165                high--;
 166                size >>= 1;
 167                list_add(&page->list, &area->free_list);
 168                MARK_USED(index, high, area);
 169                index += size;
 170                page += size;
 171        }
 172        BUG_ON(bad_range(zone, page));
 173        return page;
 174}
 175
 176/*
 177 * This page is about to be returned from the page allocator
 178 */
 179static inline void prep_new_page(struct page *page)
 180{
 181        BUG_ON(page->mapping);
 182        BUG_ON(PagePrivate(page));
 183        BUG_ON(PageLocked(page));
 184        BUG_ON(PageLRU(page));
 185        BUG_ON(PageActive(page));
 186        BUG_ON(PageDirty(page));
 187        BUG_ON(PageWriteback(page));
 188        BUG_ON(page->pte.direct != 0);
 189        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 190                        1 << PG_referenced | 1 << PG_arch_1 |
 191                        1 << PG_checked);
 192        set_page_count(page, 1);
 193}
 194
 195static struct page *rmqueue(struct zone *zone, unsigned int order)
 196{
 197        free_area_t * area = zone->free_area + order;
 198        unsigned int curr_order = order;
 199        struct list_head *head, *curr;
 200        unsigned long flags;
 201        struct page *page;
 202
 203        spin_lock_irqsave(&zone->lock, flags);
 204        do {
 205                head = &area->free_list;
 206                curr = head->next;
 207
 208                if (curr != head) {
 209                        unsigned int index;
 210
 211                        page = list_entry(curr, struct page, list);
 212                        BUG_ON(bad_range(zone, page));
 213                        list_del(curr);
 214                        index = page - zone->zone_mem_map;
 215                        if (curr_order != MAX_ORDER-1)
 216                                MARK_USED(index, curr_order, area);
 217                        zone->free_pages -= 1UL << order;
 218
 219                        page = expand(zone, page, index, order, curr_order, area);
 220                        spin_unlock_irqrestore(&zone->lock, flags);
 221
 222                        if (bad_range(zone, page))
 223                                BUG();
 224                        prep_new_page(page);
 225                        return page;    
 226                }
 227                curr_order++;
 228                area++;
 229        } while (curr_order < MAX_ORDER);
 230        spin_unlock_irqrestore(&zone->lock, flags);
 231
 232        return NULL;
 233}
 234
 235#ifdef CONFIG_SOFTWARE_SUSPEND
 236int is_head_of_free_region(struct page *page)
 237{
 238        struct zone *zone = page_zone(page);
 239        unsigned long flags;
 240        int order;
 241        struct list_head *curr;
 242
 243        /*
 244         * Should not matter as we need quiescent system for
 245         * suspend anyway, but...
 246         */
 247        spin_lock_irqsave(&zone->lock, flags);
 248        for (order = MAX_ORDER - 1; order >= 0; --order)
 249                list_for_each(curr, &zone->free_area[order].free_list)
 250                        if (page == list_entry(curr, struct page, list)) {
 251                                spin_unlock_irqrestore(&zone->lock, flags);
 252                                return 1 << order;
 253                        }
 254        spin_unlock_irqrestore(&zone->lock, flags);
 255        return 0;
 256}
 257#endif /* CONFIG_SOFTWARE_SUSPEND */
 258
 259static /* inline */ struct page *
 260balance_classzone(struct zone* classzone, unsigned int gfp_mask,
 261                        unsigned int order, int * freed)
 262{
 263        struct page * page = NULL;
 264        int __freed = 0;
 265
 266        BUG_ON(in_interrupt());
 267
 268        current->allocation_order = order;
 269        current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
 270
 271        __freed = try_to_free_pages(classzone, gfp_mask, order);
 272
 273        current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
 274
 275        if (current->nr_local_pages) {
 276                struct list_head * entry, * local_pages;
 277                struct page * tmp;
 278                int nr_pages;
 279
 280                local_pages = &current->local_pages;
 281
 282                if (likely(__freed)) {
 283                        /* pick from the last inserted so we're lifo */
 284                        entry = local_pages->next;
 285                        do {
 286                                tmp = list_entry(entry, struct page, list);
 287                                if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
 288                                        list_del(entry);
 289                                        page = tmp;
 290                                        current->nr_local_pages--;
 291                                        prep_new_page(page);
 292                                        break;
 293                                }
 294                        } while ((entry = entry->next) != local_pages);
 295                }
 296
 297                nr_pages = current->nr_local_pages;
 298                /* free in reverse order so that the global order will be lifo */
 299                while ((entry = local_pages->prev) != local_pages) {
 300                        list_del(entry);
 301                        tmp = list_entry(entry, struct page, list);
 302                        __free_pages_ok(tmp, tmp->index);
 303                        if (!nr_pages--)
 304                                BUG();
 305                }
 306                current->nr_local_pages = 0;
 307        }
 308        *freed = __freed;
 309        return page;
 310}
 311
 312/*
 313 * This is the 'heart' of the zoned buddy allocator:
 314 */
 315struct page *
 316__alloc_pages(unsigned int gfp_mask, unsigned int order,
 317                struct zonelist *zonelist)
 318{
 319        unsigned long min;
 320        struct zone **zones, *classzone;
 321        struct page * page;
 322        int freed, i;
 323
 324        KERNEL_STAT_ADD(pgalloc, 1<<order);
 325
 326        zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 327        classzone = zones[0]; 
 328        if (classzone == NULL)    /* no zones in the zonelist */
 329                return NULL;
 330
 331        /* Go through the zonelist once, looking for a zone with enough free */
 332        min = 1UL << order;
 333        for (i = 0; zones[i] != NULL; i++) {
 334                struct zone *z = zones[i];
 335
 336                /* the incremental min is allegedly to discourage fallback */
 337                min += z->pages_low;
 338                if (z->free_pages > min || z->free_pages >= z->pages_high) {
 339                        page = rmqueue(z, order);
 340                        if (page)
 341                                return page;
 342                }
 343        }
 344
 345        classzone->need_balance = 1;
 346        mb();
 347        /* we're somewhat low on memory, failed to find what we needed */
 348        if (waitqueue_active(&kswapd_wait))
 349                wake_up_interruptible(&kswapd_wait);
 350
 351        /* Go through the zonelist again, taking __GFP_HIGH into account */
 352        min = 1UL << order;
 353        for (i = 0; zones[i] != NULL; i++) {
 354                unsigned long local_min;
 355                struct zone *z = zones[i];
 356
 357                local_min = z->pages_min;
 358                if (gfp_mask & __GFP_HIGH)
 359                        local_min >>= 2;
 360                min += local_min;
 361                if (z->free_pages > min || z->free_pages >= z->pages_high) {
 362                        page = rmqueue(z, order);
 363                        if (page)
 364                                return page;
 365                }
 366        }
 367
 368        /* here we're in the low on memory slow path */
 369
 370rebalance:
 371        if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
 372                /* go through the zonelist yet again, ignoring mins */
 373                for (i = 0; zones[i] != NULL; i++) {
 374                        struct zone *z = zones[i];
 375
 376                        page = rmqueue(z, order);
 377                        if (page)
 378                                return page;
 379                }
 380nopage:
 381                if (!(current->flags & PF_NOWARN)) {
 382                        printk("%s: page allocation failure."
 383                                " order:%d, mode:0x%x\n",
 384                                current->comm, order, gfp_mask);
 385                }
 386                return NULL;
 387        }
 388
 389        /* Atomic allocations - we can't balance anything */
 390        if (!(gfp_mask & __GFP_WAIT))
 391                goto nopage;
 392
 393        KERNEL_STAT_INC(allocstall);
 394        page = balance_classzone(classzone, gfp_mask, order, &freed);
 395        if (page)
 396                return page;
 397
 398        /* go through the zonelist yet one more time */
 399        min = 1UL << order;
 400        for (i = 0; zones[i] != NULL; i++) {
 401                struct zone *z = zones[i];
 402
 403                min += z->pages_min;
 404                if (z->free_pages > min || z->free_pages >= z->pages_high) {
 405                        page = rmqueue(z, order);
 406                        if (page)
 407                                return page;
 408                }
 409        }
 410
 411        /* Don't let big-order allocations loop */
 412        if (order > 3)
 413                goto nopage;
 414
 415        /* Yield for kswapd, and try again */
 416        yield();
 417        goto rebalance;
 418}
 419
 420/*
 421 * Common helper functions.
 422 */
 423unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 424{
 425        struct page * page;
 426
 427        page = alloc_pages(gfp_mask, order);
 428        if (!page)
 429                return 0;
 430        return (unsigned long) page_address(page);
 431}
 432
 433unsigned long get_zeroed_page(unsigned int gfp_mask)
 434{
 435        struct page * page;
 436
 437        page = alloc_pages(gfp_mask, 0);
 438        if (page) {
 439                void *address = page_address(page);
 440                clear_page(address);
 441                return (unsigned long) address;
 442        }
 443        return 0;
 444}
 445
 446void __pagevec_free(struct pagevec *pvec)
 447{
 448        int i = pagevec_count(pvec);
 449
 450        while (--i >= 0)
 451                __free_pages_ok(pvec->pages[i], 0);
 452}
 453
 454void __free_pages(struct page *page, unsigned int order)
 455{
 456        if (!PageReserved(page) && put_page_testzero(page))
 457                __free_pages_ok(page, order);
 458}
 459
 460void free_pages(unsigned long addr, unsigned int order)
 461{
 462        if (addr != 0) {
 463                BUG_ON(!virt_addr_valid(addr));
 464                __free_pages(virt_to_page(addr), order);
 465        }
 466}
 467
 468/*
 469 * Total amount of free (allocatable) RAM:
 470 */
 471unsigned int nr_free_pages(void)
 472{
 473        unsigned int sum = 0;
 474        struct zone *zone;
 475
 476        for_each_zone(zone)
 477                sum += zone->free_pages;
 478
 479        return sum;
 480}
 481
 482static unsigned int nr_free_zone_pages(int offset)
 483{
 484        pg_data_t *pgdat;
 485        unsigned int sum = 0;
 486
 487        for_each_pgdat(pgdat) {
 488                struct zonelist *zonelist = pgdat->node_zonelists + offset;
 489                struct zone **zonep = zonelist->zones;
 490                struct zone *zone;
 491
 492                for (zone = *zonep++; zone; zone = *zonep++) {
 493                        unsigned long size = zone->size;
 494                        unsigned long high = zone->pages_high;
 495                        if (size > high)
 496                                sum += size - high;
 497                }
 498        }
 499
 500        return sum;
 501}
 502
 503/*
 504 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 505 */
 506unsigned int nr_free_buffer_pages(void)
 507{
 508        return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
 509}
 510
 511/*
 512 * Amount of free RAM allocatable within all zones
 513 */
 514unsigned int nr_free_pagecache_pages(void)
 515{
 516        return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
 517}
 518
 519#if CONFIG_HIGHMEM
 520unsigned int nr_free_highpages (void)
 521{
 522        pg_data_t *pgdat;
 523        unsigned int pages = 0;
 524
 525        for_each_pgdat(pgdat)
 526                pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 527
 528        return pages;
 529}
 530#endif
 531
 532/*
 533 * Accumulate the page_state information across all CPUs.
 534 * The result is unavoidably approximate - it can change
 535 * during and after execution of this function.
 536 */
 537struct page_state page_states[NR_CPUS] __cacheline_aligned;
 538EXPORT_SYMBOL(page_states);
 539
 540void get_page_state(struct page_state *ret)
 541{
 542        int pcpu;
 543
 544        memset(ret, 0, sizeof(*ret));
 545        for (pcpu = 0; pcpu < NR_CPUS; pcpu++) {
 546                struct page_state *ps;
 547
 548                if (!cpu_online(pcpu))
 549                        continue;
 550
 551                ps = &page_states[pcpu];
 552                ret->nr_dirty += ps->nr_dirty;
 553                ret->nr_writeback += ps->nr_writeback;
 554                ret->nr_pagecache += ps->nr_pagecache;
 555                ret->nr_page_table_pages += ps->nr_page_table_pages;
 556                ret->nr_reverse_maps += ps->nr_reverse_maps;
 557                ret->nr_mapped += ps->nr_mapped;
 558                ret->nr_slab += ps->nr_slab;
 559        }
 560}
 561
 562void get_zone_counts(unsigned long *active, unsigned long *inactive)
 563{
 564        struct zone *zone;
 565
 566        *active = 0;
 567        *inactive = 0;
 568        for_each_zone(zone) {
 569                *active += zone->nr_active;
 570                *inactive += zone->nr_inactive;
 571        }
 572}
 573
 574unsigned long get_page_cache_size(void)
 575{
 576        struct page_state ps;
 577
 578        get_page_state(&ps);
 579        return ps.nr_pagecache;
 580}
 581
 582void si_meminfo(struct sysinfo *val)
 583{
 584        val->totalram = totalram_pages;
 585        val->sharedram = 0;
 586        val->freeram = nr_free_pages();
 587        val->bufferram = nr_blockdev_pages();
 588#ifdef CONFIG_HIGHMEM
 589        val->totalhigh = totalhigh_pages;
 590        val->freehigh = nr_free_highpages();
 591#else
 592        val->totalhigh = 0;
 593        val->freehigh = 0;
 594#endif
 595        val->mem_unit = PAGE_SIZE;
 596}
 597
 598#define K(x) ((x) << (PAGE_SHIFT-10))
 599
 600/*
 601 * Show free area list (used inside shift_scroll-lock stuff)
 602 * We also calculate the percentage fragmentation. We do this by counting the
 603 * memory on each free list with the exception of the first item on the list.
 604 */
 605void show_free_areas(void)
 606{
 607        pg_data_t *pgdat;
 608        struct page_state ps;
 609        int type;
 610        unsigned long active;
 611        unsigned long inactive;
 612
 613        get_page_state(&ps);
 614        get_zone_counts(&active, &inactive);
 615
 616        printk("Free pages:      %6dkB (%6dkB HighMem)\n",
 617                K(nr_free_pages()),
 618                K(nr_free_highpages()));
 619
 620        for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
 621                for (type = 0; type < MAX_NR_ZONES; ++type) {
 622                        struct zone *zone = &pgdat->node_zones[type];
 623                        printk("Zone:%s"
 624                                " freepages:%6lukB"
 625                                " min:%6lukB"
 626                                " low:%6lukB"
 627                                " high:%6lukB"
 628                                " active:%6lukB"
 629                                " inactive:%6lukB"
 630                                "\n",
 631                                zone->name,
 632                                K(zone->free_pages),
 633                                K(zone->pages_min),
 634                                K(zone->pages_low),
 635                                K(zone->pages_high),
 636                                K(zone->nr_active),
 637                                K(zone->nr_inactive)
 638                                );
 639                }
 640
 641        printk("( Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u )\n",
 642                active,
 643                inactive,
 644                ps.nr_dirty,
 645                ps.nr_writeback,
 646                nr_free_pages());
 647
 648        for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
 649                for (type = 0; type < MAX_NR_ZONES; type++) {
 650                        struct list_head *elem;
 651                        struct zone *zone = &pgdat->node_zones[type];
 652                        unsigned long nr, flags, order, total = 0;
 653
 654                        if (!zone->size)
 655                                continue;
 656
 657                        spin_lock_irqsave(&zone->lock, flags);
 658                        for (order = 0; order < MAX_ORDER; order++) {
 659                                nr = 0;
 660                                list_for_each(elem, &zone->free_area[order].free_list)
 661                                        ++nr;
 662                                total += nr << order;
 663                                printk("%lu*%lukB ", nr, K(1UL) << order);
 664                        }
 665                        spin_unlock_irqrestore(&zone->lock, flags);
 666                        printk("= %lukB)\n", K(total));
 667                }
 668
 669        show_swap_cache_info();
 670}
 671
 672/*
 673 * Builds allocation fallback zone lists.
 674 */
 675static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
 676{
 677        switch (k) {
 678                struct zone *zone;
 679        default:
 680                BUG();
 681        case ZONE_HIGHMEM:
 682                zone = pgdat->node_zones + ZONE_HIGHMEM;
 683                if (zone->size) {
 684#ifndef CONFIG_HIGHMEM
 685                        BUG();
 686#endif
 687                        zonelist->zones[j++] = zone;
 688                }
 689        case ZONE_NORMAL:
 690                zone = pgdat->node_zones + ZONE_NORMAL;
 691                if (zone->size)
 692                        zonelist->zones[j++] = zone;
 693        case ZONE_DMA:
 694                zone = pgdat->node_zones + ZONE_DMA;
 695                if (zone->size)
 696                        zonelist->zones[j++] = zone;
 697        }
 698
 699        return j;
 700}
 701
 702static void __init build_zonelists(pg_data_t *pgdat)
 703{
 704        int i, j, k, node, local_node;
 705
 706        local_node = pgdat->node_id;
 707        printk("Building zonelist for node : %d\n", local_node);
 708        for (i = 0; i <= GFP_ZONEMASK; i++) {
 709                struct zonelist *zonelist;
 710
 711                zonelist = pgdat->node_zonelists + i;
 712                memset(zonelist, 0, sizeof(*zonelist));
 713
 714                j = 0;
 715                k = ZONE_NORMAL;
 716                if (i & __GFP_HIGHMEM)
 717                        k = ZONE_HIGHMEM;
 718                if (i & __GFP_DMA)
 719                        k = ZONE_DMA;
 720
 721                j = build_zonelists_node(pgdat, zonelist, j, k);
 722                /*
 723                 * Now we build the zonelist so that it contains the zones
 724                 * of all the other nodes.
 725                 * We don't want to pressure a particular node, so when
 726                 * building the zones for node N, we make sure that the
 727                 * zones coming right after the local ones are those from
 728                 * node N+1 (modulo N)
 729                 */
 730                for (node = local_node + 1; node < numnodes; node++)
 731                        j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
 732                for (node = 0; node < local_node; node++)
 733                        j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
 734 
 735                zonelist->zones[j++] = NULL;
 736        } 
 737}
 738
 739void __init build_all_zonelists(void)
 740{
 741        int i;
 742
 743        for(i = 0 ; i < numnodes ; i++)
 744                build_zonelists(NODE_DATA(i));
 745}
 746
 747void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
 748        unsigned long *zholes_size)
 749{
 750        unsigned long realtotalpages, totalpages = 0;
 751        int i;
 752
 753        for (i = 0; i < MAX_NR_ZONES; i++)
 754                totalpages += zones_size[i];
 755        pgdat->node_size = totalpages;
 756
 757        realtotalpages = totalpages;
 758        if (zholes_size)
 759                for (i = 0; i < MAX_NR_ZONES; i++)
 760                        realtotalpages -= zholes_size[i];
 761        printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 762}
 763
 764/*
 765 * Helper functions to size the waitqueue hash table.
 766 * Essentially these want to choose hash table sizes sufficiently
 767 * large so that collisions trying to wait on pages are rare.
 768 * But in fact, the number of active page waitqueues on typical
 769 * systems is ridiculously low, less than 200. So this is even
 770 * conservative, even though it seems large.
 771 *
 772 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 773 * waitqueues, i.e. the size of the waitq table given the number of pages.
 774 */
 775#define PAGES_PER_WAITQUEUE     256
 776
 777static inline unsigned long wait_table_size(unsigned long pages)
 778{
 779        unsigned long size = 1;
 780
 781        pages /= PAGES_PER_WAITQUEUE;
 782
 783        while (size < pages)
 784                size <<= 1;
 785
 786        /*
 787         * Once we have dozens or even hundreds of threads sleeping
 788         * on IO we've got bigger problems than wait queue collision.
 789         * Limit the size of the wait table to a reasonable size.
 790         */
 791        size = min(size, 4096UL);
 792
 793        return size;
 794}
 795
 796/*
 797 * This is an integer logarithm so that shifts can be used later
 798 * to extract the more random high bits from the multiplicative
 799 * hash function before the remainder is taken.
 800 */
 801static inline unsigned long wait_table_bits(unsigned long size)
 802{
 803        return ffz(~size);
 804}
 805
 806#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 807
 808/*
 809 * Set up the zone data structures:
 810 *   - mark all pages reserved
 811 *   - mark all memory queues empty
 812 *   - clear the memory bitmaps
 813 */
 814void __init free_area_init_core(pg_data_t *pgdat,
 815                unsigned long *zones_size, unsigned long *zholes_size)
 816{
 817        unsigned long i, j;
 818        unsigned long local_offset;
 819        const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 820        int nid = pgdat->node_id;
 821        struct page *lmem_map = pgdat->node_mem_map;
 822        unsigned long zone_start_pfn = pgdat->node_start_pfn;
 823
 824        pgdat->nr_zones = 0;
 825        local_offset = 0;                /* offset within lmem_map */
 826        for (j = 0; j < MAX_NR_ZONES; j++) {
 827                struct zone *zone = pgdat->node_zones + j;
 828                unsigned long mask;
 829                unsigned long size, realsize;
 830
 831                zone_table[nid * MAX_NR_ZONES + j] = zone;
 832                realsize = size = zones_size[j];
 833                if (zholes_size)
 834                        realsize -= zholes_size[j];
 835
 836                printk("  %s zone: %lu pages\n", zone_names[j], realsize);
 837                zone->size = size;
 838                zone->name = zone_names[j];
 839                spin_lock_init(&zone->lock);
 840                spin_lock_init(&zone->lru_lock);
 841                zone->zone_pgdat = pgdat;
 842                zone->free_pages = 0;
 843                zone->need_balance = 0;
 844                INIT_LIST_HEAD(&zone->active_list);
 845                INIT_LIST_HEAD(&zone->inactive_list);
 846                atomic_set(&zone->refill_counter, 0);
 847                zone->nr_active = 0;
 848                zone->nr_inactive = 0;
 849                if (!size)
 850                        continue;
 851
 852                /*
 853                 * The per-page waitqueue mechanism uses hashed waitqueues
 854                 * per zone.
 855                 */
 856                zone->wait_table_size = wait_table_size(size);
 857                zone->wait_table_bits =
 858                        wait_table_bits(zone->wait_table_size);
 859                zone->wait_table = (wait_queue_head_t *)
 860                        alloc_bootmem_node(pgdat, zone->wait_table_size
 861                                                * sizeof(wait_queue_head_t));
 862
 863                for(i = 0; i < zone->wait_table_size; ++i)
 864                        init_waitqueue_head(zone->wait_table + i);
 865
 866                pgdat->nr_zones = j+1;
 867
 868                mask = (realsize / zone_balance_ratio[j]);
 869                if (mask < zone_balance_min[j])
 870                        mask = zone_balance_min[j];
 871                else if (mask > zone_balance_max[j])
 872                        mask = zone_balance_max[j];
 873                zone->pages_min = mask;
 874                zone->pages_low = mask*2;
 875                zone->pages_high = mask*3;
 876
 877                zone->zone_mem_map = lmem_map + local_offset;
 878                zone->zone_start_pfn = zone_start_pfn;
 879
 880                if ((zone_start_pfn) & (zone_required_alignment-1))
 881                        printk("BUG: wrong zone alignment, it will crash\n");
 882
 883                /*
 884                 * Initially all pages are reserved - free ones are freed
 885                 * up by free_all_bootmem() once the early boot process is
 886                 * done. Non-atomic initialization, single-pass.
 887                 */
 888                for (i = 0; i < size; i++) {
 889                        struct page *page = lmem_map + local_offset + i;
 890                        set_page_zone(page, nid * MAX_NR_ZONES + j);
 891                        set_page_count(page, 0);
 892                        SetPageReserved(page);
 893                        INIT_LIST_HEAD(&page->list);
 894                        if (j != ZONE_HIGHMEM)
 895                                /*
 896                                 * The shift left won't overflow because the
 897                                 * ZONE_NORMAL is below 4G.
 898                                 */
 899                                set_page_address(page, __va(zone_start_pfn << PAGE_SHIFT));
 900                        zone_start_pfn++;
 901                }
 902
 903                local_offset += size;
 904                for (i = 0; ; i++) {
 905                        unsigned long bitmap_size;
 906
 907                        INIT_LIST_HEAD(&zone->free_area[i].free_list);
 908                        if (i == MAX_ORDER-1) {
 909                                zone->free_area[i].map = NULL;
 910                                break;
 911                        }
 912
 913                        /*
 914                         * Page buddy system uses "index >> (i+1)",
 915                         * where "index" is at most "size-1".
 916                         *
 917                         * The extra "+3" is to round down to byte
 918                         * size (8 bits per byte assumption). Thus
 919                         * we get "(size-1) >> (i+4)" as the last byte
 920                         * we can access.
 921                         *
 922                         * The "+1" is because we want to round the
 923                         * byte allocation up rather than down. So
 924                         * we should have had a "+7" before we shifted
 925                         * down by three. Also, we have to add one as
 926                         * we actually _use_ the last bit (it's [0,n]
 927                         * inclusive, not [0,n[).
 928                         *
 929                         * So we actually had +7+1 before we shift
 930                         * down by 3. But (n+8) >> 3 == (n >> 3) + 1
 931                         * (modulo overflows, which we do not have).
 932                         *
 933                         * Finally, we LONG_ALIGN because all bitmap
 934                         * operations are on longs.
 935                         */
 936                        bitmap_size = (size-1) >> (i+4);
 937                        bitmap_size = LONG_ALIGN(bitmap_size+1);
 938                        zone->free_area[i].map = 
 939                          (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
 940                }
 941        }
 942}
 943
 944#ifndef CONFIG_DISCONTIGMEM
 945void __init free_area_init(unsigned long *zones_size)
 946{
 947        free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, NULL);
 948        mem_map = contig_page_data.node_mem_map;
 949}
 950#endif
 951
 952static int __init setup_mem_frac(char *str)
 953{
 954        int j = 0;
 955
 956        while (get_option(&str, &zone_balance_ratio[j++]) == 2);
 957        printk("setup_mem_frac: ");
 958        for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
 959        printk("\n");
 960        return 1;
 961}
 962
 963__setup("memfrac=", setup_mem_frac);
 964
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.