linux-old/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  The pageout daemon, decides which pages to evict (swap out) and
   5 *  does the actual work of freeing them.
   6 *
   7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8 *
   9 *  Swap reorganised 29.12.95, Stephen Tweedie.
  10 *  kswapd added: 7.1.96  sct
  11 *  Removed kswapd_ctl limits, and swap out as many pages as needed
  12 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  13 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  14 *  Multiqueue VM started 5.8.00, Rik van Riel.
  15 */
  16
  17#include <linux/slab.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/swap.h>
  20#include <linux/swapctl.h>
  21#include <linux/smp_lock.h>
  22#include <linux/pagemap.h>
  23#include <linux/init.h>
  24#include <linux/highmem.h>
  25#include <linux/file.h>
  26
  27#include <asm/pgalloc.h>
  28
  29/*
  30 * "vm_passes" is the number of vm passes before failing the
  31 * memory balancing. Take into account 3 passes are needed
  32 * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
  33 * of the inactive list at each pass.
  34 */
  35int vm_passes = 60;
  36
  37/*
  38 * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
  39 * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
  40 * scan 1/6 of the inactive lists during a normal aging round.
  41 */
  42int vm_cache_scan_ratio = 6;
  43
  44/*
  45 * "vm_mapped_ratio" controls the pageout rate, the smaller, the earlier
  46 * we'll start to pageout.
  47 */
  48int vm_mapped_ratio = 100;
  49
  50/*
  51 * "vm_lru_balance_ratio" controls the balance between active and
  52 * inactive cache. The bigger vm_balance is, the easier the
  53 * active cache will grow, because we'll rotate the active list
  54 * slowly. A value of 2 means we'll go towards a balance of
  55 * 1/3 of the cache being inactive.
  56 */
  57int vm_lru_balance_ratio = 2;
  58
  59/*
  60 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
  61 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
  62 * the unused-inode, dentry and dquot caches will be freed during a normal
  63 * aging round.
  64 */
  65int vm_vfs_scan_ratio = 6;
  66
  67/*
  68 * "vm_anon_lru" select if to immdiatly insert anon pages in the
  69 * lru. Immediatly means as soon as they're allocated during the
  70 * page faults.
  71 *
  72 * If this is set to 0, they're inserted only after the first
  73 * swapout.
  74 *
  75 * Having anon pages immediatly inserted in the lru allows the
  76 * VM to know better when it's worthwhile to start swapping
  77 * anonymous ram, it will start to swap earlier and it should
  78 * swap smoother and faster, but it will decrease scalability
  79 * on the >16-ways of an order of magnitude. Big SMP/NUMA
  80 * definitely can't take an hit on a global spinlock at
  81 * every anon page allocation. So this is off by default.
  82 *
  83 * Low ram machines that swaps all the time want to turn
  84 * this on (i.e. set to 1).
  85 */
  86int vm_anon_lru = 0;
  87
  88/*
  89 * The swap-out function returns 1 if it successfully
  90 * scanned all the pages it was asked to (`count').
  91 * It returns zero if it couldn't do anything,
  92 *
  93 * rss may decrease because pages are shared, but this
  94 * doesn't count as having freed a page.
  95 */
  96
  97/* mm->page_table_lock is held. mmap_sem is not held */
  98static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
  99{
 100        pte_t pte;
 101        swp_entry_t entry;
 102
 103        /* Don't look at this pte if it's been accessed recently. */
 104        if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
 105                mark_page_accessed(page);
 106                return 0;
 107        }
 108
 109        /* Don't bother unmapping pages that are active */
 110        if (PageActive(page))
 111                return 0;
 112
 113        /* Don't bother replenishing zones not under pressure.. */
 114        if (!memclass(page_zone(page), classzone))
 115                return 0;
 116
 117        if (TryLockPage(page))
 118                return 0;
 119
 120        /* From this point on, the odds are that we're going to
 121         * nuke this pte, so read and clear the pte.  This hook
 122         * is needed on CPUs which update the accessed and dirty
 123         * bits in hardware.
 124         */
 125        flush_cache_page(vma, address);
 126        pte = ptep_get_and_clear(page_table);
 127        flush_tlb_page(vma, address);
 128
 129        if (pte_dirty(pte))
 130                set_page_dirty(page);
 131
 132        /*
 133         * Is the page already in the swap cache? If so, then
 134         * we can just drop our reference to it without doing
 135         * any IO - it's already up-to-date on disk.
 136         */
 137        if (PageSwapCache(page)) {
 138                entry.val = page->index;
 139                swap_duplicate(entry);
 140set_swap_pte:
 141                set_pte(page_table, swp_entry_to_pte(entry));
 142drop_pte:
 143                mm->rss--;
 144                UnlockPage(page);
 145                {
 146                        int freeable = page_count(page) - !!page->buffers <= 2;
 147                        page_cache_release(page);
 148                        return freeable;
 149                }
 150        }
 151
 152        /*
 153         * Is it a clean page? Then it must be recoverable
 154         * by just paging it in again, and we can just drop
 155         * it..  or if it's dirty but has backing store,
 156         * just mark the page dirty and drop it.
 157         *
 158         * However, this won't actually free any real
 159         * memory, as the page will just be in the page cache
 160         * somewhere, and as such we should just continue
 161         * our scan.
 162         *
 163         * Basically, this just makes it possible for us to do
 164         * some real work in the future in "refill_inactive()".
 165         */
 166        if (page->mapping)
 167                goto drop_pte;
 168        if (!PageDirty(page))
 169                goto drop_pte;
 170
 171        /*
 172         * Anonymous buffercache pages can be left behind by
 173         * concurrent truncate and pagefault.
 174         */
 175        if (page->buffers)
 176                goto preserve;
 177
 178        /*
 179         * This is a dirty, swappable page.  First of all,
 180         * get a suitable swap entry for it, and make sure
 181         * we have the swap cache set up to associate the
 182         * page with that swap entry.
 183         */
 184        for (;;) {
 185                entry = get_swap_page();
 186                if (!entry.val)
 187                        break;
 188                /* Add it to the swap cache and mark it dirty
 189                 * (adding to the page cache will clear the dirty
 190                 * and uptodate bits, so we need to do it again)
 191                 */
 192                if (add_to_swap_cache(page, entry) == 0) {
 193                        SetPageUptodate(page);
 194                        set_page_dirty(page);
 195                        goto set_swap_pte;
 196                }
 197                /* Raced with "speculative" read_swap_cache_async */
 198                swap_free(entry);
 199        }
 200
 201        /* No swap space left */
 202preserve:
 203        set_pte(page_table, pte);
 204        UnlockPage(page);
 205        return 0;
 206}
 207
 208/* mm->page_table_lock is held. mmap_sem is not held */
 209static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
 210{
 211        pte_t * pte;
 212        unsigned long pmd_end;
 213
 214        if (pmd_none(*dir))
 215                return count;
 216        if (pmd_bad(*dir)) {
 217                pmd_ERROR(*dir);
 218                pmd_clear(dir);
 219                return count;
 220        }
 221        
 222        pte = pte_offset(dir, address);
 223        
 224        pmd_end = (address + PMD_SIZE) & PMD_MASK;
 225        if (end > pmd_end)
 226                end = pmd_end;
 227
 228        do {
 229                if (pte_present(*pte)) {
 230                        struct page *page = pte_page(*pte);
 231
 232                        if (VALID_PAGE(page) && !PageReserved(page)) {
 233                                count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
 234                                if (!count) {
 235                                        address += PAGE_SIZE;
 236                                        break;
 237                                }
 238                        }
 239                }
 240                address += PAGE_SIZE;
 241                pte++;
 242        } while (address && (address < end));
 243        mm->swap_address = address;
 244        return count;
 245}
 246
 247/* mm->page_table_lock is held. mmap_sem is not held */
 248static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
 249{
 250        pmd_t * pmd;
 251        unsigned long pgd_end;
 252
 253        if (pgd_none(*dir))
 254                return count;
 255        if (pgd_bad(*dir)) {
 256                pgd_ERROR(*dir);
 257                pgd_clear(dir);
 258                return count;
 259        }
 260
 261        pmd = pmd_offset(dir, address);
 262
 263        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 264        if (pgd_end && (end > pgd_end))
 265                end = pgd_end;
 266        
 267        do {
 268                count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
 269                if (!count)
 270                        break;
 271                address = (address + PMD_SIZE) & PMD_MASK;
 272                pmd++;
 273        } while (address && (address < end));
 274        return count;
 275}
 276
 277/* mm->page_table_lock is held. mmap_sem is not held */
 278static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
 279{
 280        pgd_t *pgdir;
 281        unsigned long end;
 282
 283        /* Don't swap out areas which are reserved */
 284        if (vma->vm_flags & VM_RESERVED)
 285                return count;
 286
 287        pgdir = pgd_offset(mm, address);
 288
 289        end = vma->vm_end;
 290        BUG_ON(address >= end);
 291        do {
 292                count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
 293                if (!count)
 294                        break;
 295                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 296                pgdir++;
 297        } while (address && (address < end));
 298        return count;
 299}
 300
 301/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
 302struct mm_struct *swap_mm = &init_mm;
 303
 304/*
 305 * Returns remaining count of pages to be swapped out by followup call.
 306 */
 307static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
 308{
 309        unsigned long address;
 310        struct vm_area_struct* vma;
 311
 312        /*
 313         * Find the proper vm-area after freezing the vma chain 
 314         * and ptes.
 315         */
 316        spin_lock(&mm->page_table_lock);
 317        address = mm->swap_address;
 318        if (address == TASK_SIZE || swap_mm != mm) {
 319                /* We raced: don't count this mm but try again */
 320                ++*mmcounter;
 321                goto out_unlock;
 322        }
 323        vma = find_vma(mm, address);
 324        if (vma) {
 325                if (address < vma->vm_start)
 326                        address = vma->vm_start;
 327
 328                for (;;) {
 329                        count = swap_out_vma(mm, vma, address, count, classzone);
 330                        vma = vma->vm_next;
 331                        if (!vma)
 332                                break;
 333                        if (!count)
 334                                goto out_unlock;
 335                        address = vma->vm_start;
 336                }
 337        }
 338        /* Indicate that we reached the end of address space */
 339        mm->swap_address = TASK_SIZE;
 340
 341out_unlock:
 342        spin_unlock(&mm->page_table_lock);
 343        return count;
 344}
 345
 346static int FASTCALL(swap_out(zone_t * classzone));
 347static int fastcall swap_out(zone_t * classzone)
 348{
 349        int counter, nr_pages = SWAP_CLUSTER_MAX;
 350        struct mm_struct *mm;
 351
 352        counter = mmlist_nr << 1;
 353        do {
 354                if (unlikely(current->need_resched)) {
 355                        __set_current_state(TASK_RUNNING);
 356                        schedule();
 357                }
 358
 359                spin_lock(&mmlist_lock);
 360                mm = swap_mm;
 361                while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
 362                        mm->swap_address = 0;
 363                        mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
 364                        if (mm == swap_mm)
 365                                goto empty;
 366                        swap_mm = mm;
 367                }
 368
 369                /* Make sure the mm doesn't disappear when we drop the lock.. */
 370                atomic_inc(&mm->mm_users);
 371                spin_unlock(&mmlist_lock);
 372
 373                nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
 374
 375                mmput(mm);
 376
 377                if (!nr_pages)
 378                        return 1;
 379        } while (--counter >= 0);
 380
 381        return 0;
 382
 383empty:
 384        spin_unlock(&mmlist_lock);
 385        return 0;
 386}
 387
 388static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
 389static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
 390static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
 391{
 392        struct list_head * entry;
 393        int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
 394        int max_mapped = vm_mapped_ratio * nr_pages;
 395
 396        while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
 397                struct page * page;
 398
 399                if (unlikely(current->need_resched)) {
 400                        spin_unlock(&pagemap_lru_lock);
 401                        __set_current_state(TASK_RUNNING);
 402                        schedule();
 403                        spin_lock(&pagemap_lru_lock);
 404                        continue;
 405                }
 406
 407                page = list_entry(entry, struct page, lru);
 408
 409                BUG_ON(!PageLRU(page));
 410                BUG_ON(PageActive(page));
 411
 412                list_del(entry);
 413                list_add(entry, &inactive_list);
 414
 415                /*
 416                 * Zero page counts can happen because we unlink the pages
 417                 * _after_ decrementing the usage count..
 418                 */
 419                if (unlikely(!page_count(page)))
 420                        continue;
 421
 422                if (!memclass(page_zone(page), classzone))
 423                        continue;
 424
 425                max_scan--;
 426
 427                /* Racy check to avoid trylocking when not worthwhile */
 428                if (!page->buffers && (page_count(page) != 1 || !page->mapping))
 429                        goto page_mapped;
 430
 431                /*
 432                 * The page is locked. IO in progress?
 433                 * Move it to the back of the list.
 434                 */
 435                if (unlikely(TryLockPage(page))) {
 436                        if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
 437                                page_cache_get(page);
 438                                spin_unlock(&pagemap_lru_lock);
 439                                wait_on_page(page);
 440                                page_cache_release(page);
 441                                spin_lock(&pagemap_lru_lock);
 442                        }
 443                        continue;
 444                }
 445
 446                if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
 447                        /*
 448                         * It is not critical here to write it only if
 449                         * the page is unmapped beause any direct writer
 450                         * like O_DIRECT would set the PG_dirty bitflag
 451                         * on the phisical page after having successfully
 452                         * pinned it and after the I/O to the page is finished,
 453                         * so the direct writes to the page cannot get lost.
 454                         */
 455                        int (*writepage)(struct page *);
 456
 457                        writepage = page->mapping->a_ops->writepage;
 458                        if ((gfp_mask & __GFP_FS) && writepage) {
 459                                ClearPageDirty(page);
 460                                SetPageLaunder(page);
 461                                page_cache_get(page);
 462                                spin_unlock(&pagemap_lru_lock);
 463
 464                                writepage(page);
 465                                page_cache_release(page);
 466
 467                                spin_lock(&pagemap_lru_lock);
 468                                continue;
 469                        }
 470                }
 471
 472                /*
 473                 * If the page has buffers, try to free the buffer mappings
 474                 * associated with this page. If we succeed we try to free
 475                 * the page as well.
 476                 */
 477                if (page->buffers) {
 478                        spin_unlock(&pagemap_lru_lock);
 479
 480                        /* avoid to free a locked page */
 481                        page_cache_get(page);
 482
 483                        if (try_to_release_page(page, gfp_mask)) {
 484                                if (!page->mapping) {
 485                                        /*
 486                                         * We must not allow an anon page
 487                                         * with no buffers to be visible on
 488                                         * the LRU, so we unlock the page after
 489                                         * taking the lru lock
 490                                         */
 491                                        spin_lock(&pagemap_lru_lock);
 492                                        UnlockPage(page);
 493                                        __lru_cache_del(page);
 494
 495                                        /* effectively free the page here */
 496                                        page_cache_release(page);
 497
 498                                        if (--nr_pages)
 499                                                continue;
 500                                        break;
 501                                } else {
 502                                        /*
 503                                         * The page is still in pagecache so undo the stuff
 504                                         * before the try_to_release_page since we've not
 505                                         * finished and we can now try the next step.
 506                                         */
 507                                        page_cache_release(page);
 508
 509                                        spin_lock(&pagemap_lru_lock);
 510                                }
 511                        } else {
 512                                /* failed to drop the buffers so stop here */
 513                                UnlockPage(page);
 514                                page_cache_release(page);
 515
 516                                spin_lock(&pagemap_lru_lock);
 517                                continue;
 518                        }
 519                }
 520
 521                spin_lock(&pagecache_lock);
 522
 523                /*
 524                 * This is the non-racy check for busy page.
 525                 * It is critical to check PageDirty _after_ we made sure
 526                 * the page is freeable so not in use by anybody.
 527                 * At this point we're guaranteed that page->buffers is NULL,
 528                 * nobody can refill page->buffers under us because we still
 529                 * hold the page lock.
 530                 */
 531                if (!page->mapping || page_count(page) > 1) {
 532                        spin_unlock(&pagecache_lock);
 533                        UnlockPage(page);
 534page_mapped:
 535                        if (--max_mapped < 0) {
 536                                spin_unlock(&pagemap_lru_lock);
 537
 538                                nr_pages -= kmem_cache_reap(gfp_mask);
 539                                if (nr_pages <= 0)
 540                                        goto out;
 541
 542                                shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
 543                                shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
 544#ifdef CONFIG_QUOTA
 545                                shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
 546#endif
 547
 548                                if (!*failed_swapout)
 549                                        *failed_swapout = !swap_out(classzone);
 550
 551                                max_mapped = nr_pages * vm_mapped_ratio;
 552
 553                                spin_lock(&pagemap_lru_lock);
 554                                refill_inactive(nr_pages, classzone);
 555                        }
 556                        continue;
 557                        
 558                }
 559                if (PageDirty(page)) {
 560                        spin_unlock(&pagecache_lock);
 561                        UnlockPage(page);
 562                        continue;
 563                }
 564
 565                __lru_cache_del(page);
 566
 567                /* point of no return */
 568                if (likely(!PageSwapCache(page))) {
 569                        __remove_inode_page(page);
 570                        spin_unlock(&pagecache_lock);
 571                } else {
 572                        swp_entry_t swap;
 573                        swap.val = page->index;
 574                        __delete_from_swap_cache(page);
 575                        spin_unlock(&pagecache_lock);
 576                        swap_free(swap);
 577                }
 578
 579                UnlockPage(page);
 580
 581                /* effectively free the page here */
 582                page_cache_release(page);
 583
 584                if (--nr_pages)
 585                        continue;
 586                break;
 587        }
 588        spin_unlock(&pagemap_lru_lock);
 589
 590 out:
 591        return nr_pages;
 592}
 593
 594/*
 595 * This moves pages from the active list to
 596 * the inactive list.
 597 *
 598 * We move them the other way when we see the
 599 * reference bit on the page.
 600 */
 601static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
 602{
 603        struct list_head * entry;
 604        unsigned long ratio;
 605
 606        ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
 607
 608        entry = active_list.prev;
 609        while (ratio && entry != &active_list) {
 610                struct page * page;
 611
 612                page = list_entry(entry, struct page, lru);
 613                entry = entry->prev;
 614                if (PageTestandClearReferenced(page)) {
 615                        list_del(&page->lru);
 616                        list_add(&page->lru, &active_list);
 617                        continue;
 618                }
 619
 620                ratio--;
 621
 622                del_page_from_active_list(page);
 623                add_page_to_inactive_list(page);
 624                SetPageReferenced(page);
 625        }
 626
 627        if (entry != &active_list) {
 628                list_del(&active_list);
 629                list_add(&active_list, entry);
 630        }
 631}
 632
 633static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
 634static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
 635{
 636        nr_pages -= kmem_cache_reap(gfp_mask);
 637        if (nr_pages <= 0)
 638                goto out;
 639
 640        spin_lock(&pagemap_lru_lock);
 641        refill_inactive(nr_pages, classzone);
 642
 643        nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
 644
 645out:
 646        return nr_pages;
 647}
 648
 649static int check_classzone_need_balance(zone_t * classzone);
 650
 651int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
 652{
 653        gfp_mask = pf_gfp_mask(gfp_mask);
 654
 655        for (;;) {
 656                int tries = vm_passes;
 657                int failed_swapout = !(gfp_mask & __GFP_IO);
 658                int nr_pages = SWAP_CLUSTER_MAX;
 659
 660                do {
 661                        nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
 662                        if (nr_pages <= 0)
 663                                return 1;
 664                        shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
 665                        shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
 666#ifdef CONFIG_QUOTA
 667                        shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
 668#endif
 669                        if (!failed_swapout)
 670                                failed_swapout = !swap_out(classzone);
 671                } while (--tries);
 672
 673#ifdef  CONFIG_OOM_KILLER
 674        out_of_memory();
 675#else
 676        if (likely(current->pid != 1))
 677                break;
 678        if (!check_classzone_need_balance(classzone))
 679                break;
 680
 681        __set_current_state(TASK_RUNNING);
 682        yield();
 683#endif
 684        }
 685
 686        return 0;
 687}
 688
 689int fastcall try_to_free_pages(unsigned int gfp_mask)
 690{
 691        pg_data_t *pgdat;
 692        zonelist_t *zonelist;
 693        unsigned long pf_free_pages;
 694        int error = 0;
 695
 696        pf_free_pages = current->flags & PF_FREE_PAGES;
 697        current->flags &= ~PF_FREE_PAGES;
 698
 699        for_each_pgdat(pgdat) {
 700                zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
 701                error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
 702        }
 703
 704        current->flags |= pf_free_pages;
 705        return error;
 706}
 707
 708DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 709
 710static int check_classzone_need_balance(zone_t * classzone)
 711{
 712        zone_t * first_zone;
 713        int class_idx = zone_idx(classzone);
 714
 715        first_zone = classzone->zone_pgdat->node_zones;
 716        while (classzone >= first_zone) {
 717                if (classzone->free_pages > classzone->watermarks[class_idx].high)
 718                        return 0;
 719                classzone--;
 720        }
 721        return 1;
 722}
 723
 724static int kswapd_balance_pgdat(pg_data_t * pgdat)
 725{
 726        int need_more_balance = 0, i;
 727        zone_t * zone;
 728
 729        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 730                zone = pgdat->node_zones + i;
 731                if (unlikely(current->need_resched))
 732                        schedule();
 733                if (!zone->need_balance || !zone->size)
 734                        continue;
 735                if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
 736                        zone->need_balance = 0;
 737                        __set_current_state(TASK_INTERRUPTIBLE);
 738                        schedule_timeout(HZ*5);
 739                        continue;
 740                }
 741                if (check_classzone_need_balance(zone))
 742                        need_more_balance = 1;
 743                else
 744                        zone->need_balance = 0;
 745        }
 746
 747        return need_more_balance;
 748}
 749
 750static void kswapd_balance(void)
 751{
 752        int need_more_balance;
 753        pg_data_t * pgdat;
 754
 755        do {
 756                need_more_balance = 0;
 757
 758                for_each_pgdat(pgdat)
 759                        need_more_balance |= kswapd_balance_pgdat(pgdat);
 760        } while (need_more_balance);
 761}
 762
 763static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
 764{
 765        zone_t * zone;
 766        int i;
 767
 768        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 769                zone = pgdat->node_zones + i;
 770                if (!zone->need_balance || !zone->size)
 771                        continue;
 772                return 0;
 773        }
 774
 775        return 1;
 776}
 777
 778static int kswapd_can_sleep(void)
 779{
 780        pg_data_t * pgdat;
 781
 782        for_each_pgdat(pgdat) {
 783                if (!kswapd_can_sleep_pgdat(pgdat))
 784                        return 0;
 785        }
 786
 787        return 1;
 788}
 789
 790/*
 791 * The background pageout daemon, started as a kernel thread
 792 * from the init process. 
 793 *
 794 * This basically trickles out pages so that we have _some_
 795 * free memory available even if there is no other activity
 796 * that frees anything up. This is needed for things like routing
 797 * etc, where we otherwise might have all activity going on in
 798 * asynchronous contexts that cannot page things out.
 799 *
 800 * If there are applications that are active memory-allocators
 801 * (most normal use), this basically shouldn't matter.
 802 */
 803int kswapd(void *unused)
 804{
 805        struct task_struct *tsk = current;
 806        DECLARE_WAITQUEUE(wait, tsk);
 807
 808        daemonize();
 809        strcpy(tsk->comm, "kswapd");
 810        sigfillset(&tsk->blocked);
 811        
 812        /*
 813         * Tell the memory management that we're a "memory allocator",
 814         * and that if we need more memory we should get access to it
 815         * regardless (see "__alloc_pages()"). "kswapd" should
 816         * never get caught in the normal page freeing logic.
 817         *
 818         * (Kswapd normally doesn't need memory anyway, but sometimes
 819         * you need a small amount of memory in order to be able to
 820         * page out something else, and this flag essentially protects
 821         * us from recursively trying to free more memory as we're
 822         * trying to free the first piece of memory in the first place).
 823         */
 824        tsk->flags |= PF_MEMALLOC;
 825
 826        /*
 827         * Kswapd main loop.
 828         */
 829        for (;;) {
 830                __set_current_state(TASK_INTERRUPTIBLE);
 831                add_wait_queue(&kswapd_wait, &wait);
 832
 833                mb();
 834                if (kswapd_can_sleep())
 835                        schedule();
 836
 837                __set_current_state(TASK_RUNNING);
 838                remove_wait_queue(&kswapd_wait, &wait);
 839
 840                /*
 841                 * If we actually get into a low-memory situation,
 842                 * the processes needing more memory will wake us
 843                 * up on a more timely basis.
 844                 */
 845                kswapd_balance();
 846                run_task_queue(&tq_disk);
 847        }
 848}
 849
 850static int __init kswapd_init(void)
 851{
 852        printk("Starting kswapd\n");
 853        swap_setup();
 854        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 855        return 0;
 856}
 857
 858module_init(kswapd_init)
 859
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.