linux-old/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *
   6 *  Swap reorganised 29.12.95, Stephen Tweedie.
   7 *  kswapd added: 7.1.96  sct
   8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9 *  to bring the system back to free_pages_high: 2.4.97, Rik van Riel.
  10 *  Version: $Id: vmscan.c,v 1.23 1997/04/12 04:31:05 davem Exp $
  11 */
  12
  13#include <linux/mm.h>
  14#include <linux/sched.h>
  15#include <linux/head.h>
  16#include <linux/kernel.h>
  17#include <linux/kernel_stat.h>
  18#include <linux/errno.h>
  19#include <linux/string.h>
  20#include <linux/stat.h>
  21#include <linux/swap.h>
  22#include <linux/fs.h>
  23#include <linux/swapctl.h>
  24#include <linux/smp_lock.h>
  25#include <linux/slab.h>
  26
  27#include <asm/dma.h>
  28#include <asm/system.h> /* for cli()/sti() */
  29#include <asm/uaccess.h> /* for copy_to/from_user */
  30#include <asm/bitops.h>
  31#include <asm/pgtable.h>
  32
  33/* 
  34 * When are we next due for a page scan? 
  35 */
  36static int next_swap_jiffies = 0;
  37
  38/* 
  39 * How often do we do a pageout scan during normal conditions?
  40 * Default is four times a second.
  41 */
  42int swapout_interval = HZ / 4;
  43
  44/* 
  45 * The wait queue for waking up the pageout daemon:
  46 */
  47static struct wait_queue * kswapd_wait = NULL;
  48
  49/* 
  50 * We avoid doing a reschedule if the pageout daemon is already awake;
  51 */
  52static int kswapd_awake = 0;
  53
  54static void init_swap_timer(void);
  55
  56/*
  57 * The swap-out functions return 1 if they successfully
  58 * threw something out, and we got a free page. It returns
  59 * zero if it couldn't do anything, and any other value
  60 * indicates it decreased rss, but the page was shared.
  61 *
  62 * NOTE! If it sleeps, it *must* return 1 to make sure we
  63 * don't continue with the swap-out. Otherwise we may be
  64 * using a process that no longer actually exists (it might
  65 * have died while we slept).
  66 */
  67static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
  68        unsigned long address, pte_t * page_table, int dma, int wait)
  69{
  70        pte_t pte;
  71        unsigned long entry;
  72        unsigned long page;
  73        struct page * page_map;
  74
  75        pte = *page_table;
  76        if (!pte_present(pte))
  77                return 0;
  78        page = pte_page(pte);
  79        if (MAP_NR(page) >= max_mapnr)
  80                return 0;
  81
  82        page_map = mem_map + MAP_NR(page);
  83        if (PageReserved(page_map)
  84            || PageLocked(page_map)
  85            || (dma && !PageDMA(page_map)))
  86                return 0;
  87        /* Deal with page aging.  Pages age from being unused; they
  88         * rejuvenate on being accessed.  Only swap old pages (age==0
  89         * is oldest). */
  90        if ((pte_dirty(pte) && delete_from_swap_cache(page_map)) 
  91            || pte_young(pte))  {
  92                set_pte(page_table, pte_mkold(pte));
  93                touch_page(page_map);
  94                return 0;
  95        }
  96        age_page(page_map);
  97        if (page_map->age)
  98                return 0;
  99        if (pte_dirty(pte)) {
 100                if (vma->vm_ops && vma->vm_ops->swapout) {
 101                        pid_t pid = tsk->pid;
 102                        vma->vm_mm->rss--;
 103                        if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
 104                                kill_proc(pid, SIGBUS, 1);
 105                } else {
 106                        if (atomic_read(&page_map->count) != 1)
 107                                return 0;
 108                        if (!(entry = get_swap_page()))
 109                                return 0;
 110                        vma->vm_mm->rss--;
 111                        flush_cache_page(vma, address);
 112                        set_pte(page_table, __pte(entry));
 113                        flush_tlb_page(vma, address);
 114                        tsk->nswap++;
 115                        rw_swap_page(WRITE, entry, (char *) page, wait);
 116                }
 117                free_page(page);
 118                return 1;       /* we slept: the process may not exist any more */
 119        }
 120        if ((entry = find_in_swap_cache(page_map)))  {
 121                if (atomic_read(&page_map->count) != 1) {
 122                        set_pte(page_table, pte_mkdirty(pte));
 123                        printk("Aiee.. duplicated cached swap-cache entry\n");
 124                        return 0;
 125                }
 126                vma->vm_mm->rss--;
 127                flush_cache_page(vma, address);
 128                set_pte(page_table, __pte(entry));
 129                flush_tlb_page(vma, address);
 130                free_page(page);
 131                return 1;
 132        } 
 133        vma->vm_mm->rss--;
 134        flush_cache_page(vma, address);
 135        pte_clear(page_table);
 136        flush_tlb_page(vma, address);
 137        entry = page_unuse(page);
 138        free_page(page);
 139        return entry;
 140}
 141
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process has had recently, so we
 * won't swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 *       swap block search, not a hint of how many blocks to swap with
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
 155
 156static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
 157        pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
 158{
 159        pte_t * pte;
 160        unsigned long pmd_end;
 161
 162        if (pmd_none(*dir))
 163                return 0;
 164        if (pmd_bad(*dir)) {
 165                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 166                pmd_clear(dir);
 167                return 0;
 168        }
 169        
 170        pte = pte_offset(dir, address);
 171        
 172        pmd_end = (address + PMD_SIZE) & PMD_MASK;
 173        if (end > pmd_end)
 174                end = pmd_end;
 175
 176        do {
 177                int result;
 178                tsk->swap_address = address + PAGE_SIZE;
 179                result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
 180                if (result)
 181                        return result;
 182                address += PAGE_SIZE;
 183                pte++;
 184        } while (address < end);
 185        return 0;
 186}
 187
 188static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
 189        pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
 190{
 191        pmd_t * pmd;
 192        unsigned long pgd_end;
 193
 194        if (pgd_none(*dir))
 195                return 0;
 196        if (pgd_bad(*dir)) {
 197                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 198                pgd_clear(dir);
 199                return 0;
 200        }
 201
 202        pmd = pmd_offset(dir, address);
 203
 204        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 205        if (end > pgd_end)
 206                end = pgd_end;
 207        
 208        do {
 209                int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
 210                if (result)
 211                        return result;
 212                address = (address + PMD_SIZE) & PMD_MASK;
 213                pmd++;
 214        } while (address < end);
 215        return 0;
 216}
 217
 218static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 219        pgd_t *pgdir, unsigned long start, int dma, int wait)
 220{
 221        unsigned long end;
 222
 223        /* Don't swap out areas like shared memory which have their
 224            own separate swapping mechanism or areas which are locked down */
 225        if (vma->vm_flags & (VM_SHM | VM_LOCKED))
 226                return 0;
 227
 228        end = vma->vm_end;
 229        while (start < end) {
 230                int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
 231                if (result)
 232                        return result;
 233                start = (start + PGDIR_SIZE) & PGDIR_MASK;
 234                pgdir++;
 235        }
 236        return 0;
 237}
 238
 239static int swap_out_process(struct task_struct * p, int dma, int wait)
 240{
 241        unsigned long address;
 242        struct vm_area_struct* vma;
 243
 244        /*
 245         * Go through process' page directory.
 246         */
 247        address = p->swap_address;
 248        p->swap_address = 0;
 249
 250        /*
 251         * Find the proper vm-area
 252         */
 253        vma = find_vma(p->mm, address);
 254        if (!vma)
 255                return 0;
 256        if (address < vma->vm_start)
 257                address = vma->vm_start;
 258
 259        for (;;) {
 260                int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
 261                if (result)
 262                        return result;
 263                vma = vma->vm_next;
 264                if (!vma)
 265                        break;
 266                address = vma->vm_start;
 267        }
 268        p->swap_address = 0;
 269        return 0;
 270}
 271
 272/*
 273 * Select the task with maximal swap_cnt and try to swap out a page.
 274 * N.B. This function returns only 0 or 1.  Return values != 1 from
 275 * the lower level routines result in continued processing.
 276 */
 277static int swap_out(unsigned int priority, int dma, int wait)
 278{
 279        struct task_struct * p, * pbest;
 280        int counter, assign, max_cnt;
 281
 282        /* 
 283         * We make one or two passes through the task list, indexed by 
 284         * assign = {0, 1}:
 285         *   Pass 1: select the swappable task with maximal swap_cnt.
 286         *   Pass 2: assign new swap_cnt values, then select as above.
 287         * With this approach, there's no need to remember the last task
 288         * swapped out.  If the swap-out fails, we clear swap_cnt so the 
 289         * task won't be selected again until all others have been tried.
 290         */
 291        counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
 292        for (; counter >= 0; counter--) {
 293                assign = 0;
 294                max_cnt = 0;
 295                pbest = NULL;
 296        select:
 297                read_lock(&tasklist_lock);
 298                p = init_task.next_task;
 299                for (; p != &init_task; p = p->next_task) {
 300                        if (!p->swappable)
 301                                continue;
 302                        if (p->mm->rss <= 0)
 303                                continue;
 304                        if (assign) {
 305                                /* 
 306                                 * If we didn't select a task on pass 1, 
 307                                 * assign each task a new swap_cnt.
 308                                 * Normalise the number of pages swapped
 309                                 * by multiplying by (RSS / 1MB)
 310                                 */
 311                                p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
 312                        }
 313                        if (p->swap_cnt > max_cnt) {
 314                                max_cnt = p->swap_cnt;
 315                                pbest = p;
 316                        }
 317                }
 318                read_unlock(&tasklist_lock);
 319                if (!pbest) {
 320                        if (!assign) {
 321                                assign = 1;
 322                                goto select;
 323                        }
 324                        goto out;
 325                }
 326                pbest->swap_cnt--;
 327
 328                switch (swap_out_process(pbest, dma, wait)) {
 329                case 0:
 330                        /*
 331                         * Clear swap_cnt so we don't look at this task
 332                         * again until we've tried all of the others.
 333                         * (We didn't block, so the task is still here.)
 334                         */
 335                        pbest->swap_cnt = 0;
 336                        break;
 337                case 1:
 338                        return 1;
 339                default:
 340                        break;
 341                };
 342        }
 343out:
 344        return 0;
 345}
 346
 347/*
 348 * We are much more aggressive about trying to swap out than we used
 349 * to be.  This works out OK, because we now do proper aging on page
 350 * contents. 
 351 */
 352static inline int do_try_to_free_page(int priority, int dma, int wait)
 353{
 354        static int state = 0;
 355        int i=6;
 356        int stop;
 357
 358        /* Always trim SLAB caches when memory gets low. */
 359        (void) kmem_cache_reap(0, dma, wait);
 360
 361        /* we don't try as hard if we're not waiting.. */
 362        stop = 3;
 363        if (wait)
 364                stop = 0;
 365        switch (state) {
 366                do {
 367                case 0:
 368                        if (shrink_mmap(i, dma))
 369                                return 1;
 370                        state = 1;
 371                case 1:
 372                        if (shm_swap(i, dma))
 373                                return 1;
 374                        state = 2;
 375                default:
 376                        if (swap_out(i, dma, wait))
 377                                return 1;
 378                        state = 0;
 379                i--;
 380                } while ((i - stop) >= 0);
 381        }
 382        return 0;
 383}
 384
 385/*
 386 * This is REALLY ugly.
 387 *
 388 * We need to make the locks finer granularity, but right
 389 * now we need this so that we can do page allocations
 390 * without holding the kernel lock etc.
 391 */
 392int try_to_free_page(int priority, int dma, int wait)
 393{
 394        int retval;
 395
 396        lock_kernel();
 397        retval = do_try_to_free_page(priority,dma,wait);
 398        unlock_kernel();
 399        return retval;
 400}
 401
 402/*
 403 * Before we start the kernel thread, print out the 
 404 * kswapd initialization message (otherwise the init message 
 405 * may be printed in the middle of another driver's init 
 406 * message).  It looks very bad when that happens.
 407 */
 408void kswapd_setup(void)
 409{
 410       int i;
 411       char *revision="$Revision: 1.23 $", *s, *e;
 412
 413       if ((s = strchr(revision, ':')) &&
 414           (e = strchr(s, '$')))
 415               s++, i = e - s;
 416       else
 417               s = revision, i = -1;
 418       printk ("Starting kswapd v%.*s\n", i, s);
 419}
 420
 421/*
 422 * The background pageout daemon.
 423 * Started as a kernel thread from the init process.
 424 */
 425int kswapd(void *unused)
 426{
 427        current->session = 1;
 428        current->pgrp = 1;
 429        sprintf(current->comm, "kswapd");
 430        current->blocked = ~0UL;
 431        
 432        /*
 433         *      As a kernel thread we want to tamper with system buffers
 434         *      and other internals and thus be subject to the SMP locking
 435         *      rules. (On a uniprocessor box this does nothing).
 436         */
 437        lock_kernel();
 438
 439        /* Give kswapd a realtime priority. */
 440        current->policy = SCHED_FIFO;
 441        current->priority = 32;  /* Fixme --- we need to standardise our
 442                                    namings for POSIX.4 realtime scheduling
 443                                    priorities.  */
 444
 445        init_swap_timer();
 446        
 447        while (1) {
 448                kswapd_awake = 0;
 449                current->signal = 0;
 450                run_task_queue(&tq_disk);
 451                interruptible_sleep_on(&kswapd_wait);
 452                kswapd_awake = 1;
 453                swapstats.wakeups++;
 454                /* Do the background pageout: 
 455                 * We now only swap out as many pages as needed.
 456                 * When we are truly low on memory, we swap out
 457                 * synchronously (WAIT == 1).  -- Rik.
 458                 */
 459                while(nr_free_pages < min_free_pages)
 460                        try_to_free_page(GFP_KERNEL, 0, 1);
 461                while((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_low)
 462                        try_to_free_page(GFP_KERNEL, 0, 1);
 463                while((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_high)
 464                        try_to_free_page(GFP_KERNEL, 0, 0);
 465        }
 466}
 467
 468/* 
 469 * The swap_tick function gets called on every clock tick.
 470 */
 471
 472void swap_tick(void)
 473{
 474        int want_wakeup = 0, memory_low = 0;
 475        int pages = nr_free_pages + atomic_read(&nr_async_pages);
 476
 477        if (pages < free_pages_low)
 478                memory_low = want_wakeup = 1;
 479        else if (pages < free_pages_high && jiffies >= next_swap_jiffies)
 480                want_wakeup = 1;
 481
 482        if (want_wakeup) { 
 483                if (!kswapd_awake) {
 484                        wake_up(&kswapd_wait);
 485                        need_resched = 1;
 486                }
 487                /* Set the next wake-up time */
 488                next_swap_jiffies = jiffies;
 489                if (!memory_low) 
 490                        next_swap_jiffies += swapout_interval;
 491        }
 492        timer_active |= (1<<SWAP_TIMER);
 493}
 494
 495/* 
 496 * Initialise the swap timer
 497 */
 498
 499void init_swap_timer(void)
 500{
 501        timer_table[SWAP_TIMER].expires = 0;
 502        timer_table[SWAP_TIMER].fn = swap_tick;
 503        timer_active |= (1<<SWAP_TIMER);
 504}
 505
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.