linux-bk/mm/memory.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/memory.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 */
   6
   7/*
   8 * demand-loading started 01.12.91 - seems it is high on the list of
   9 * things wanted, and it should be easy to implement. - Linus
  10 */
  11
  12/*
   13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14 * pages started 02.12.91, seems to work. - Linus.
  15 *
  16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17 * would have taken more than the 6M I have free, but it worked well as
  18 * far as I could see.
  19 *
  20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21 */
  22
  23/*
  24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25 * thought has to go into this. Oh, well..
  26 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27 *              Found it. Everything seems to work now.
  28 * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29 */
  30
  31/*
  32 * 05.04.94  -  Multi-page memory management added for v1.1.
  33 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34 *
  35 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
  36 *              (Gerhard.Wichert@pdb.siemens.de)
  37 */
  38
  39#include <linux/kernel_stat.h>
  40#include <linux/mm.h>
  41#include <linux/mman.h>
  42#include <linux/swap.h>
  43#include <linux/iobuf.h>
  44#include <linux/highmem.h>
  45#include <linux/pagemap.h>
  46
  47#include <asm/pgalloc.h>
  48#include <asm/rmap.h>
  49#include <asm/uaccess.h>
  50#include <asm/tlb.h>
  51#include <asm/tlbflush.h>
  52
  53#include <linux/swapops.h>
  54
/*
 * File-scope memory-layout globals.  Nothing in this part of the file
 * assigns them; they are only defined here.
 */
   55#ifndef CONFIG_DISCONTIGMEM
   56/* use the per-pgdat data instead for discontigmem - mbligh */
/* Only defined when CONFIG_DISCONTIGMEM is not set. */
   57unsigned long max_mapnr;
   58struct page *mem_map;
   59#endif
   60
/* NOTE(review): presumably the count of physical pages - confirm at the setter. */
   61unsigned long num_physpages;
/* NOTE(review): presumably the end of directly mapped memory - confirm. */
   62void * high_memory;
/* NOTE(review): presumably the first struct page of highmem - confirm. */
   63struct page *highmem_start_page;
  64
  65/*
  66 * We special-case the C-O-W ZERO_PAGE, because it's such
  67 * a common occurrence (no need to read the page to know
  68 * that it's zero - better for the cache and memory subsystem).
  69 */
  70static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
  71{
  72        if (from == ZERO_PAGE(address)) {
  73                clear_user_highpage(to, address);
  74                return;
  75        }
  76        copy_user_highpage(to, from, address);
  77}
  78
  79/*
  80 * Note: this doesn't free the actual pages themselves. That
  81 * has been handled earlier when unmapping all the memory regions.
  82 */
  83static inline void free_one_pmd(mmu_gather_t *tlb, pmd_t * dir)
  84{
  85        struct page *page;
  86
  87        if (pmd_none(*dir))
  88                return;
  89        if (pmd_bad(*dir)) {
  90                pmd_ERROR(*dir);
  91                pmd_clear(dir);
  92                return;
  93        }
  94        page = pmd_page(*dir);
  95        pmd_clear(dir);
  96        pgtable_remove_rmap(page);
  97        pte_free_tlb(tlb, page);
  98}
  99
 100static inline void free_one_pgd(mmu_gather_t *tlb, pgd_t * dir)
 101{
 102        int j;
 103        pmd_t * pmd;
 104
 105        if (pgd_none(*dir))
 106                return;
 107        if (pgd_bad(*dir)) {
 108                pgd_ERROR(*dir);
 109                pgd_clear(dir);
 110                return;
 111        }
 112        pmd = pmd_offset(dir, 0);
 113        pgd_clear(dir);
 114        for (j = 0; j < PTRS_PER_PMD ; j++) {
 115                prefetchw(pmd+j+(PREFETCH_STRIDE/16));
 116                free_one_pmd(tlb, pmd+j);
 117        }
 118        pmd_free_tlb(tlb, pmd);
 119}
 120
 121/*
 122 * This function clears all user-level page tables of a process - this
 123 * is needed by execve(), so that old pages aren't in the way.
 124 *
 125 * Must be called with pagetable lock held.
 126 */
 127void clear_page_tables(mmu_gather_t *tlb, unsigned long first, int nr)
 128{
 129        pgd_t * page_dir = tlb->mm->pgd;
 130
 131        page_dir += first;
 132        do {
 133                free_one_pgd(tlb, page_dir);
 134                page_dir++;
 135        } while (--nr);
 136}
 137
 138pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 139{
 140        if (!pmd_present(*pmd)) {
 141                struct page *new;
 142
 143                spin_unlock(&mm->page_table_lock);
 144                new = pte_alloc_one(mm, address);
 145                spin_lock(&mm->page_table_lock);
 146                if (!new)
 147                        return NULL;
 148
 149                /*
 150                 * Because we dropped the lock, we should re-check the
 151                 * entry, as somebody else could have populated it..
 152                 */
 153                if (pmd_present(*pmd)) {
 154                        pte_free(new);
 155                        goto out;
 156                }
 157                pgtable_add_rmap(new, mm, address);
 158                pmd_populate(mm, pmd, new);
 159        }
 160out:
 161        if (pmd_present(*pmd))
 162                return pte_offset_map(pmd, address);
 163        return NULL;
 164}
 165
 166pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 167{
 168        if (!pmd_present(*pmd)) {
 169                pte_t *new;
 170
 171                spin_unlock(&mm->page_table_lock);
 172                new = pte_alloc_one_kernel(mm, address);
 173                spin_lock(&mm->page_table_lock);
 174                if (!new)
 175                        return NULL;
 176
 177                /*
 178                 * Because we dropped the lock, we should re-check the
 179                 * entry, as somebody else could have populated it..
 180                 */
 181                if (pmd_present(*pmd)) {
 182                        pte_free_kernel(new);
 183                        goto out;
 184                }
 185                pgtable_add_rmap(virt_to_page(new), mm, address);
 186                pmd_populate_kernel(mm, pmd, new);
 187        }
 188out:
 189        return pte_offset_kernel(pmd, address);
 190}
 191#define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
 192#define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
 193
  194/*
  195 * copy one vm_area from one task to the other. Assumes the page tables
  196 * already present in the new task to be cleared in the whole range
  197 * covered by this vma.
  198 *
  199 * 08Jan98 Merged into one routine from several inline routines to reduce
  200 *         variable count and make things faster. -jj
  201 *
  202 * dst->page_table_lock is held on entry and exit,
  203 * but may be dropped within pmd_alloc() and pte_alloc_map().
 *
 * src->page_table_lock is taken here around each pte-table walk.
 *
 * Hugetlb vmas are handed to copy_hugetlb_page_range() instead.
 *
 * Returns 0 on success, or -ENOMEM when a destination pmd or pte table
 * cannot be allocated; in that case entries already copied are left in
 * place.
  204 */
  205int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
  206                        struct vm_area_struct *vma)
  207{
  208        pgd_t * src_pgd, * dst_pgd;
  209        unsigned long address = vma->vm_start;
  210        unsigned long end = vma->vm_end;
             /* Non-zero when the vma is not VM_SHARED but has VM_MAYWRITE set. */
  211        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  212
  213        if (is_vm_hugetlb_page(vma))
  214                return copy_hugetlb_page_range(dst, src, vma);
  215
             /* Start one slot early: the loop below increments before use. */
  216        src_pgd = pgd_offset(src, address)-1;
  217        dst_pgd = pgd_offset(dst, address)-1;
  218
  219        for (;;) {
  220                pmd_t * src_pmd, * dst_pmd;
  221
  222                src_pgd++; dst_pgd++;
  223                
  224                /* copy_pmd_range */
  225                
                     /*
                      * An empty source pgd jumps into the middle of the
                      * bad-entry branch below, sharing its "advance to the
                      * next pgd" code without reporting or clearing anything.
                      */
  226                if (pgd_none(*src_pgd))
  227                        goto skip_copy_pmd_range;
  228                if (pgd_bad(*src_pgd)) {
  229                        pgd_ERROR(*src_pgd);
  230                        pgd_clear(src_pgd);
  231skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                             /* !address: the rounded-up address wrapped to 0. */
  232                        if (!address || (address >= end))
  233                                goto out;
  234                        continue;
  235                }
  236
  237                src_pmd = pmd_offset(src_pgd, address);
  238                dst_pmd = pmd_alloc(dst, dst_pgd, address);
  239                if (!dst_pmd)
  240                        goto nomem;
  241
  242                do {
  243                        pte_t * src_pte, * dst_pte;
  244                
  245                        /* copy_pte_range */
  246                
                             /* Same trick as above: an empty pmd reuses the skip code. */
  247                        if (pmd_none(*src_pmd))
  248                                goto skip_copy_pte_range;
  249                        if (pmd_bad(*src_pmd)) {
  250                                pmd_ERROR(*src_pmd);
  251                                pmd_clear(src_pmd);
  252skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
  253                                if (address >= end)
  254                                        goto out;
  255                                goto cont_copy_pmd_range;
  256                        }
  257
  258                        dst_pte = pte_alloc_map(dst, dst_pmd, address);
  259                        if (!dst_pte)
  260                                goto nomem;
                             /* Source lock is held for the whole walk of this pte table. */
  261                        spin_lock(&src->page_table_lock);                       
  262                        src_pte = pte_offset_map_nested(src_pmd, address);
  263                        do {
  264                                pte_t pte = *src_pte;
  265                                struct page *ptepage;
  266                                unsigned long pfn;
  267                                
  268                                /* copy_one_pte */
  269
  270                                if (pte_none(pte))
  271                                        goto cont_copy_pte_range_noset;
  272                                /* pte contains position in swap, so copy. */
  273                                if (!pte_present(pte)) {
  274                                        swap_duplicate(pte_to_swp_entry(pte));
  275                                        set_pte(dst_pte, pte);
  276                                        goto cont_copy_pte_range_noset;
  277                                }
                                     /*
                                      * ptepage is set here as well because the
                                      * invalid-pfn path below still passes it to
                                      * page_add_rmap().
                                      */
  278                                ptepage = pte_page(pte);
  279                                pfn = pte_pfn(pte);
                                     /*
                                      * Invalid pfns and reserved pages are copied
                                      * as-is: no write-protect, no page reference,
                                      * no rss accounting.
                                      */
  280                                if (!pfn_valid(pfn))
  281                                        goto cont_copy_pte_range;
  282                                ptepage = pfn_to_page(pfn);
  283                                if (PageReserved(ptepage))
  284                                        goto cont_copy_pte_range;
  285
  286                                /* If it's a COW mapping, write protect it both in the parent and the child */
  287                                if (cow) {
  288                                        ptep_set_wrprotect(src_pte);
                                             /* Re-read so the child gets the protected value. */
  289                                        pte = *src_pte;
  290                                }
  291
  292                                /* If it's a shared mapping, mark it clean in the child */
  293                                if (vma->vm_flags & VM_SHARED)
  294                                        pte = pte_mkclean(pte);
  295                                pte = pte_mkold(pte);
  296                                get_page(ptepage);
  297                                dst->rss++;
  298
  299cont_copy_pte_range:            set_pte(dst_pte, pte);
  300                                page_add_rmap(ptepage, dst_pte);
  301cont_copy_pte_range_noset:      address += PAGE_SIZE;
  302                                if (address >= end) {
  303                                        pte_unmap_nested(src_pte);
  304                                        pte_unmap(dst_pte);
                                             /* out_unlock releases the source lock taken above. */
  305                                        goto out_unlock;
  306                                }
  307                                src_pte++;
  308                                dst_pte++;
                             /* Stop once the advanced pointer has no offset bits left under the mask. */
  309                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);
                             /* Both pointers were advanced past the last entry handled: unmap via that entry. */
  310                        pte_unmap_nested(src_pte-1);
  311                        pte_unmap(dst_pte-1);
  312                        spin_unlock(&src->page_table_lock);
  313                
  314cont_copy_pmd_range:    src_pmd++;
  315                        dst_pmd++;
  316                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
  317        }
     /* Reached only from inside the pte loop, with src->page_table_lock held. */
  318out_unlock:
  319        spin_unlock(&src->page_table_lock);
  320out:
  321        return 0;
     /* Both jumps here happen while src->page_table_lock is not held. */
  322nomem:
  323        return -ENOMEM;
  324}
 325
 326static void zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
 327{
 328        unsigned long offset;
 329        pte_t *ptep;
 330
 331        if (pmd_none(*pmd))
 332                return;
 333        if (pmd_bad(*pmd)) {
 334                pmd_ERROR(*pmd);
 335                pmd_clear(pmd);
 336                return;
 337        }
 338        ptep = pte_offset_map(pmd, address);
 339        offset = address & ~PMD_MASK;
 340        if (offset + size > PMD_SIZE)
 341                size = PMD_SIZE - offset;
 342        size &= PAGE_MASK;
 343        for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
 344                pte_t pte = *ptep;
 345                if (pte_none(pte))
 346                        continue;
 347                if (pte_present(pte)) {
 348                        unsigned long pfn = pte_pfn(pte);
 349
 350                        pte = ptep_get_and_clear(ptep);
 351                        tlb_remove_tlb_entry(tlb, ptep, address+offset);
 352                        if (pfn_valid(pfn)) {
 353                                struct page *page = pfn_to_page(pfn);
 354                                if (!PageReserved(page)) {
 355                                        if (pte_dirty(pte))
 356                                                set_page_dirty(page);
 357                                        tlb->freed++;
 358                                        page_remove_rmap(page, ptep);
 359                                        tlb_remove_page(tlb, page);
 360                                }
 361                        }
 362                } else {
 363                        free_swap_and_cache(pte_to_swp_entry(pte));
 364                        pte_clear(ptep);
 365                }
 366        }
 367        pte_unmap(ptep-1);
 368}
 369
 370static void zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size)
 371{
 372        pmd_t * pmd;
 373        unsigned long end;
 374
 375        if (pgd_none(*dir))
 376                return;
 377        if (pgd_bad(*dir)) {
 378                pgd_ERROR(*dir);
 379                pgd_clear(dir);
 380                return;
 381        }
 382        pmd = pmd_offset(dir, address);
 383        end = address + size;
 384        if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
 385                end = ((address + PGDIR_SIZE) & PGDIR_MASK);
 386        do {
 387                zap_pte_range(tlb, pmd, address, end - address);
 388                address = (address + PMD_SIZE) & PMD_MASK; 
 389                pmd++;
 390        } while (address < end);
 391}
 392
 393void unmap_page_range(mmu_gather_t *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end)
 394{
 395        pgd_t * dir;
 396
 397        BUG_ON(address >= end);
 398
 399        dir = pgd_offset(vma->vm_mm, address);
 400        tlb_start_vma(tlb, vma);
 401        do {
 402                zap_pmd_range(tlb, dir, address, end - address);
 403                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 404                dir++;
 405        } while (address && (address < end));
 406        tlb_end_vma(tlb, vma);
 407}
 408
 409/* Dispose of an entire mmu_gather_t per rescheduling point */
 410#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
 411#define ZAP_BLOCK_SIZE  (FREE_PTE_NR * PAGE_SIZE)
 412#endif
 413
 414/* For UP, 256 pages at a time gives nice low latency */
 415#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
 416#define ZAP_BLOCK_SIZE  (256 * PAGE_SIZE)
 417#endif
 418
 419/* No preempt: go for the best straight-line efficiency */
 420#if !defined(CONFIG_PREEMPT)
 421#define ZAP_BLOCK_SIZE  (~(0UL))
 422#endif
 423
 424/**
 425 * zap_page_range - remove user pages in a given range
 426 * @vma: vm_area_struct holding the applicable pages
 427 * @address: starting address of pages to zap
 428 * @size: number of bytes to zap
 429 */
 430void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size)
 431{
 432        struct mm_struct *mm = vma->vm_mm;
 433        mmu_gather_t *tlb;
 434        unsigned long end, block;
 435
 436        spin_lock(&mm->page_table_lock);
 437
 438        /*
 439         * This was once a long-held spinlock.  Now we break the
 440         * work up into ZAP_BLOCK_SIZE units and relinquish the
 441         * lock after each interation.  This drastically lowers
 442         * lock contention and allows for a preemption point.
 443         */
 444        while (size) {
 445                block = (size > ZAP_BLOCK_SIZE) ? ZAP_BLOCK_SIZE : size;
 446                end = address + block;
 447 
 448                flush_cache_range(vma, address, end);
 449                tlb = tlb_gather_mmu(mm, 0);
 450                unmap_page_range(tlb, vma, address, end);
 451                tlb_finish_mmu(tlb, address, end);
 452 
 453                cond_resched_lock(&mm->page_table_lock);
 454 
 455                address += block;
 456                size -= block;
 457        }
 458
 459        spin_unlock(&mm->page_table_lock);
 460}
 461
 462/*
 463 * Do a quick page-table lookup for a single page.
 464 * mm->page_table_lock must be held.
 465 */
 466static inline struct page *
 467follow_page(struct mm_struct *mm, unsigned long address, int write) 
 468{
 469        pgd_t *pgd;
 470        pmd_t *pmd;
 471        pte_t *ptep, pte;
 472        unsigned long pfn;
 473
 474        pgd = pgd_offset(mm, address);
 475        if (pgd_none(*pgd) || pgd_bad(*pgd))
 476                goto out;
 477
 478        pmd = pmd_offset(pgd, address);
 479        if (pmd_none(*pmd) || pmd_bad(*pmd))
 480                goto out;
 481
 482        ptep = pte_offset_map(pmd, address);
 483        if (!ptep)
 484                goto out;
 485
 486        pte = *ptep;
 487        pte_unmap(ptep);
 488        if (pte_present(pte)) {
 489                if (!write || (pte_write(pte) && pte_dirty(pte))) {
 490                        pfn = pte_pfn(pte);
 491                        if (pfn_valid(pfn))
 492                                return pfn_to_page(pfn);
 493                }
 494        }
 495
 496out:
 497        return 0;
 498}
 499
 500/* 
 501 * Given a physical address, is there a useful struct page pointing to
 502 * it?  This may become more complex in the future if we start dealing
 503 * with IO-aperture pages in kiobufs.
 504 */
 505
 506static inline struct page *get_page_map(struct page *page)
 507{
 508        if (!pfn_valid(page_to_pfn(page)))
 509                return 0;
 510        return page;
 511}
 512
 513
 514int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 515                unsigned long start, int len, int write, int force,
 516                struct page **pages, struct vm_area_struct **vmas)
 517{
 518        int i;
 519        unsigned int flags;
 520
 521        /* 
 522         * Require read or write permissions.
 523         * If 'force' is set, we only require the "MAY" flags.
 524         */
 525        flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 526        flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 527        i = 0;
 528
 529        do {
 530                struct vm_area_struct * vma;
 531
 532                vma = find_extend_vma(mm, start);
 533
 534                if (!vma || (pages && (vma->vm_flags & VM_IO))
 535                                || !(flags & vma->vm_flags))
 536                        return i ? : -EFAULT;
 537
 538                if (is_vm_hugetlb_page(vma)) {
 539                        i = follow_hugetlb_page(mm, vma, pages, vmas,
 540                                                &start, &len, i);
 541                        continue;
 542                }
 543                spin_lock(&mm->page_table_lock);
 544                do {
 545                        struct page *map;
 546                        while (!(map = follow_page(mm, start, write))) {
 547                                spin_unlock(&mm->page_table_lock);
 548                                switch (handle_mm_fault(mm,vma,start,write)) {
 549                                case VM_FAULT_MINOR:
 550                                        tsk->min_flt++;
 551                                        break;
 552                                case VM_FAULT_MAJOR:
 553                                        tsk->maj_flt++;
 554                                        break;
 555                                case VM_FAULT_SIGBUS:
 556                                        return i ? i : -EFAULT;
 557                                case VM_FAULT_OOM:
 558                                        return i ? i : -ENOMEM;
 559                                default:
 560                                        BUG();
 561                                }
 562                                spin_lock(&mm->page_table_lock);
 563                        }
 564                        if (pages) {
 565                                pages[i] = get_page_map(map);
 566                                if (!pages[i]) {
 567                                        spin_unlock(&mm->page_table_lock);
 568                                        while (i--)
 569                                                page_cache_release(pages[i]);
 570                                        i = -EFAULT;
 571                                        goto out;
 572                                }
 573                                page_cache_get(pages[i]);
 574                        }
 575                        if (vmas)
 576                                vmas[i] = vma;
 577                        i++;
 578                        start += PAGE_SIZE;
 579                        len--;
 580                } while(len && start < vma->vm_end);
 581                spin_unlock(&mm->page_table_lock);
 582        } while(len);
 583out:
 584        return i;
 585}
 586
 587/*
 588 * Force in an entire range of pages from the current process's user VA,
 589 * and pin them in physical memory.  
 590 */
 591#define dprintk(x...)
 592
 593int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
 594{
 595        int pgcount, err;
 596        struct mm_struct *      mm;
 597        
 598        /* Make sure the iobuf is not already mapped somewhere. */
 599        if (iobuf->nr_pages)
 600                return -EINVAL;
 601
 602        mm = current->mm;
 603        dprintk ("map_user_kiobuf: begin\n");
 604        
 605        pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE;
 606        /* mapping 0 bytes is not permitted */
 607        if (!pgcount) BUG();
 608        err = expand_kiobuf(iobuf, pgcount);
 609        if (err)
 610                return err;
 611
 612        iobuf->locked = 0;
 613        iobuf->offset = va & (PAGE_SIZE-1);
 614        iobuf->length = len;
 615        
 616        /* Try to fault in all of the necessary pages */
 617        down_read(&mm->mmap_sem);
 618        /* rw==READ means read from disk, write into memory area */
 619        err = get_user_pages(current, mm, va, pgcount,
 620                        (rw==READ), 0, iobuf->maplist, NULL);
 621        up_read(&mm->mmap_sem);
 622        if (err < 0) {
 623                unmap_kiobuf(iobuf);
 624                dprintk ("map_user_kiobuf: end %d\n", err);
 625                return err;
 626        }
 627        iobuf->nr_pages = err;
 628        while (pgcount--) {
 629                /* FIXME: flush superflous for rw==READ,
 630                 * probably wrong function for rw==WRITE
 631                 */
 632                flush_dcache_page(iobuf->maplist[pgcount]);
 633        }
 634        dprintk ("map_user_kiobuf: end OK\n");
 635        return 0;
 636}
 637
 638/*
 639 * Mark all of the pages in a kiobuf as dirty 
 640 *
 641 * We need to be able to deal with short reads from disk: if an IO error
 642 * occurs, the number of bytes read into memory may be less than the
 643 * size of the kiobuf, so we have to stop marking pages dirty once the
 644 * requested byte count has been reached.
 645 */
 646
 647void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
 648{
 649        int index, offset, remaining;
 650        struct page *page;
 651        
 652        index = iobuf->offset >> PAGE_SHIFT;
 653        offset = iobuf->offset & ~PAGE_MASK;
 654        remaining = bytes;
 655        if (remaining > iobuf->length)
 656                remaining = iobuf->length;
 657        
 658        while (remaining > 0 && index < iobuf->nr_pages) {
 659                page = iobuf->maplist[index];
 660                
 661                if (!PageReserved(page))
 662                        set_page_dirty(page);
 663
 664                remaining -= (PAGE_SIZE - offset);
 665                offset = 0;
 666                index++;
 667        }
 668}
 669
 670/*
 671 * Unmap all of the pages referenced by a kiobuf.  We release the pages,
 672 * and unlock them if they were locked. 
 673 */
 674
 675void unmap_kiobuf (struct kiobuf *iobuf) 
 676{
 677        int i;
 678        struct page *map;
 679        
 680        for (i = 0; i < iobuf->nr_pages; i++) {
 681                map = iobuf->maplist[i];
 682                if (map) {
 683                        if (iobuf->locked)
 684                                unlock_page(map);
 685                        /* FIXME: cache flush missing for rw==READ
 686                         * FIXME: call the correct reference counting function
 687                         */
 688                        page_cache_release(map);
 689                }
 690        }
 691        
 692        iobuf->nr_pages = 0;
 693        iobuf->locked = 0;
 694}
 695
 696
 697/*
 698 * Lock down all of the pages of a kiovec for IO.
 699 *
 700 * If any page is mapped twice in the kiovec, we return the error -EINVAL.
 701 *
 702 * The optional wait parameter causes the lock call to block until all
 703 * pages can be locked if set.  If wait==0, the lock operation is
 704 * aborted if any locked pages are found and -EAGAIN is returned.
 705 */
 706
 707int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
 708{
 709        struct kiobuf *iobuf;
 710        int i, j;
 711        struct page *page, **ppage;
 712        int doublepage = 0;
 713        int repeat = 0;
 714        
 715 repeat:
 716        
 717        for (i = 0; i < nr; i++) {
 718                iobuf = iovec[i];
 719
 720                if (iobuf->locked)
 721                        continue;
 722
 723                ppage = iobuf->maplist;
 724                for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
 725                        page = *ppage;
 726                        if (!page)
 727                                continue;
 728                        
 729                        if (TestSetPageLocked(page)) {
 730                                while (j--) {
 731                                        struct page *tmp = *--ppage;
 732                                        if (tmp)
 733                                                unlock_page(tmp);
 734                                }
 735                                goto retry;
 736                        }
 737                }
 738                iobuf->locked = 1;
 739        }
 740
 741        return 0;
 742        
 743 retry:
 744        
 745        /* 
 746         * We couldn't lock one of the pages.  Undo the locking so far,
 747         * wait on the page we got to, and try again.  
 748         */
 749        
 750        unlock_kiovec(nr, iovec);
 751        if (!wait)
 752                return -EAGAIN;
 753        
 754        /* 
 755         * Did the release also unlock the page we got stuck on?
 756         */
 757        if (!PageLocked(page)) {
 758                /* 
 759                 * If so, we may well have the page mapped twice
 760                 * in the IO address range.  Bad news.  Of
 761                 * course, it _might_ just be a coincidence,
 762                 * but if it happens more than once, chances
 763                 * are we have a double-mapped page. 
 764                 */
 765                if (++doublepage >= 3) 
 766                        return -EINVAL;
 767                
 768                /* Try again...  */
 769                wait_on_page_locked(page);
 770        }
 771        
 772        if (++repeat < 16)
 773                goto repeat;
 774        return -EAGAIN;
 775}
 776
 777/*
 778 * Unlock all of the pages of a kiovec after IO.
 779 */
 780
 781int unlock_kiovec(int nr, struct kiobuf *iovec[])
 782{
 783        struct kiobuf *iobuf;
 784        int i, j;
 785        struct page *page, **ppage;
 786        
 787        for (i = 0; i < nr; i++) {
 788                iobuf = iovec[i];
 789
 790                if (!iobuf->locked)
 791                        continue;
 792                iobuf->locked = 0;
 793                
 794                ppage = iobuf->maplist;
 795                for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
 796                        page = *ppage;
 797                        if (!page)
 798                                continue;
 799                        unlock_page(page);
 800                }
 801        }
 802        return 0;
 803}
 804
 805static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
 806                                     unsigned long size, pgprot_t prot)
 807{
 808        unsigned long end;
 809
 810        address &= ~PMD_MASK;
 811        end = address + size;
 812        if (end > PMD_SIZE)
 813                end = PMD_SIZE;
 814        do {
 815                pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
 816                BUG_ON(!pte_none(*pte));
 817                set_pte(pte, zero_pte);
 818                address += PAGE_SIZE;
 819                pte++;
 820        } while (address && (address < end));
 821}
 822
 823static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
 824                                    unsigned long size, pgprot_t prot)
 825{
 826        unsigned long end;
 827
 828        address &= ~PGDIR_MASK;
 829        end = address + size;
 830        if (end > PGDIR_SIZE)
 831                end = PGDIR_SIZE;
 832        do {
 833                pte_t * pte = pte_alloc_map(mm, pmd, address);
 834                if (!pte)
 835                        return -ENOMEM;
 836                zeromap_pte_range(pte, address, end - address, prot);
 837                pte_unmap(pte);
 838                address = (address + PMD_SIZE) & PMD_MASK;
 839                pmd++;
 840        } while (address && (address < end));
 841        return 0;
 842}
 843
 844int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, pgprot_t prot)
 845{
 846        int error = 0;
 847        pgd_t * dir;
 848        unsigned long beg = address;
 849        unsigned long end = address + size;
 850        struct mm_struct *mm = vma->vm_mm;
 851
 852        dir = pgd_offset(mm, address);
 853        flush_cache_range(vma, beg, end);
 854        if (address >= end)
 855                BUG();
 856
 857        spin_lock(&mm->page_table_lock);
 858        do {
 859                pmd_t *pmd = pmd_alloc(mm, dir, address);
 860                error = -ENOMEM;
 861                if (!pmd)
 862                        break;
 863                error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
 864                if (error)
 865                        break;
 866                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 867                dir++;
 868        } while (address && (address < end));
 869        flush_tlb_range(vma, beg, end);
 870        spin_unlock(&mm->page_table_lock);
 871        return error;
 872}
 873
 874/*
 875 * maps a range of physical memory into the requested pages. the old
 876 * mappings are removed. any references to nonexistent pages results
 877 * in null mappings (currently treated as "copy-on-access")
 878 */
 879static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 880        unsigned long phys_addr, pgprot_t prot)
 881{
 882        unsigned long end;
 883        unsigned long pfn;
 884
 885        address &= ~PMD_MASK;
 886        end = address + size;
 887        if (end > PMD_SIZE)
 888                end = PMD_SIZE;
 889        pfn = phys_addr >> PAGE_SHIFT;
 890        do {
 891                BUG_ON(!pte_none(*pte));
 892                if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
 893                        set_pte(pte, pfn_pte(pfn, prot));
 894                address += PAGE_SIZE;
 895                pfn++;
 896                pte++;
 897        } while (address && (address < end));
 898}
 899
 900static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
 901        unsigned long phys_addr, pgprot_t prot)
 902{
 903        unsigned long base, end;
 904
 905        base = address & PGDIR_MASK;
 906        address &= ~PGDIR_MASK;
 907        end = address + size;
 908        if (end > PGDIR_SIZE)
 909                end = PGDIR_SIZE;
 910        phys_addr -= address;
 911        do {
 912                pte_t * pte = pte_alloc_map(mm, pmd, base + address);
 913                if (!pte)
 914                        return -ENOMEM;
 915                remap_pte_range(pte, base + address, end - address, address + phys_addr, prot);
 916                pte_unmap(pte);
 917                address = (address + PMD_SIZE) & PMD_MASK;
 918                pmd++;
 919        } while (address && (address < end));
 920        return 0;
 921}
 922
 923/*  Note: this is only safe if the mm semaphore is held when called. */
 924int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
 925{
 926        int error = 0;
 927        pgd_t * dir;
 928        unsigned long beg = from;
 929        unsigned long end = from + size;
 930        struct mm_struct *mm = vma->vm_mm;
 931
 932        phys_addr -= from;
 933        dir = pgd_offset(mm, from);
 934        flush_cache_range(vma, beg, end);
 935        if (from >= end)
 936                BUG();
 937
 938        spin_lock(&mm->page_table_lock);
 939        do {
 940                pmd_t *pmd = pmd_alloc(mm, dir, from);
 941                error = -ENOMEM;
 942                if (!pmd)
 943                        break;
 944                error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
 945                if (error)
 946                        break;
 947                from = (from + PGDIR_SIZE) & PGDIR_MASK;
 948                dir++;
 949        } while (from && (from < end));
 950        flush_tlb_range(vma, beg, end);
 951        spin_unlock(&mm->page_table_lock);
 952        return error;
 953}
 954
 955/*
 956 * Establish a new mapping:
 957 *  - flush the old one
 958 *  - update the page tables
 959 *  - inform the TLB about the new one
 960 *
 961 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
 962 */
 963static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
 964{
 965        set_pte(page_table, entry);
 966        flush_tlb_page(vma, address);
 967        update_mmu_cache(vma, address, entry);
 968}
 969
 970/*
 971 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
 972 */
 973static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
 974                pte_t *page_table)
 975{
 976        flush_page_to_ram(new_page);
 977        flush_cache_page(vma, address);
 978        establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 979}
 980
 981/*
 982 * This routine handles present pages, when users try to write
 983 * to a shared page. It is done by copying the page to a new address
 984 * and decrementing the shared-page counter for the old page.
 985 *
 986 * Goto-purists beware: the only reason for goto's here is that it results
 987 * in better assembly code.. The "default" path will see no jumps at all.
 988 *
 989 * Note that this routine assumes that the protection checks have been
 990 * done by the caller (the low-level page fault routine in most cases).
 991 * Thus we can safely just mark it writable once we've done any necessary
 992 * COW.
 993 *
 994 * We also mark the page dirty at this point even though the page will
 995 * change only once the write actually happens. This avoids a few races,
 996 * and potentially makes it more efficient.
 997 *
 998 * We hold the mm semaphore and the page_table_lock on entry and exit
 999 * with the page_table_lock released.
1000 */
1001static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1002        unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
1003{
1004        struct page *old_page, *new_page;
1005        unsigned long pfn = pte_pfn(pte);
1006
1007        if (!pfn_valid(pfn))
1008                goto bad_wp_page;
1009        old_page = pfn_to_page(pfn);
1010
1011        if (!TestSetPageLocked(old_page)) {
1012                int reuse = can_share_swap_page(old_page);
1013                unlock_page(old_page);
1014                if (reuse) {
1015                        flush_cache_page(vma, address);
1016                        establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
1017                        pte_unmap(page_table);
1018                        spin_unlock(&mm->page_table_lock);
1019                        return VM_FAULT_MINOR;
1020                }
1021        }
1022        pte_unmap(page_table);
1023
1024        /*
1025         * Ok, we need to copy. Oh, well..
1026         */
1027        page_cache_get(old_page);
1028        spin_unlock(&mm->page_table_lock);
1029
1030        new_page = alloc_page(GFP_HIGHUSER);
1031        if (!new_page)
1032                goto no_mem;
1033        copy_cow_page(old_page,new_page,address);
1034
1035        /*
1036         * Re-check the pte - we dropped the lock
1037         */
1038        spin_lock(&mm->page_table_lock);
1039        page_table = pte_offset_map(pmd, address);
1040        if (pte_same(*page_table, pte)) {
1041                if (PageReserved(old_page))
1042                        ++mm->rss;
1043                page_remove_rmap(old_page, page_table);
1044                break_cow(vma, new_page, address, page_table);
1045                page_add_rmap(new_page, page_table);
1046                lru_cache_add(new_page);
1047
1048                /* Free the old page.. */
1049                new_page = old_page;
1050        }
1051        pte_unmap(page_table);
1052        spin_unlock(&mm->page_table_lock);
1053        page_cache_release(new_page);
1054        page_cache_release(old_page);
1055        return VM_FAULT_MINOR;
1056
1057bad_wp_page:
1058        pte_unmap(page_table);
1059        spin_unlock(&mm->page_table_lock);
1060        printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address);
1061        /*
1062         * This should really halt the system so it can be debugged or
1063         * at least the kernel stops what it's doing before it corrupts
1064         * data, but for the moment just pretend this is OOM.
1065         */
1066        return VM_FAULT_OOM;
1067no_mem:
1068        page_cache_release(old_page);
1069        return VM_FAULT_OOM;
1070}
1071
1072static void vmtruncate_list(struct list_head *head, unsigned long pgoff)
1073{
1074        unsigned long start, end, len, diff;
1075        struct vm_area_struct *vma;
1076        struct list_head *curr;
1077
1078        list_for_each(curr, head) {
1079                vma = list_entry(curr, struct vm_area_struct, shared);
1080                start = vma->vm_start;
1081                end = vma->vm_end;
1082                len = end - start;
1083
1084                /* mapping wholly truncated? */
1085                if (vma->vm_pgoff >= pgoff) {
1086                        zap_page_range(vma, start, len);
1087                        continue;
1088                }
1089
1090                /* mapping wholly unaffected? */
1091                len = len >> PAGE_SHIFT;
1092                diff = pgoff - vma->vm_pgoff;
1093                if (diff >= len)
1094                        continue;
1095
1096                /* Ok, partially affected.. */
1097                start += diff << PAGE_SHIFT;
1098                len = (len - diff) << PAGE_SHIFT;
1099                zap_page_range(vma, start, len);
1100        }
1101}
1102
1103/*
1104 * Handle all mappings that got truncated by a "truncate()"
1105 * system call.
1106 *
1107 * NOTE! We have to be ready to update the memory sharing
1108 * between the file and the memory map for a potential last
1109 * incomplete page.  Ugly, but necessary.
1110 */
1111int vmtruncate(struct inode * inode, loff_t offset)
1112{
1113        unsigned long pgoff;
1114        struct address_space *mapping = inode->i_mapping;
1115        unsigned long limit;
1116
1117        if (inode->i_size < offset)
1118                goto do_expand;
1119        inode->i_size = offset;
1120        spin_lock(&mapping->i_shared_lock);
1121        if (list_empty(&mapping->i_mmap) && list_empty(&mapping->i_mmap_shared))
1122                goto out_unlock;
1123
1124        pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1125        if (!list_empty(&mapping->i_mmap))
1126                vmtruncate_list(&mapping->i_mmap, pgoff);
1127        if (!list_empty(&mapping->i_mmap_shared))
1128                vmtruncate_list(&mapping->i_mmap_shared, pgoff);
1129
1130out_unlock:
1131        spin_unlock(&mapping->i_shared_lock);
1132        truncate_inode_pages(mapping, offset);
1133        goto out_truncate;
1134
1135do_expand:
1136        limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1137        if (limit != RLIM_INFINITY && offset > limit)
1138                goto out_sig;
1139        if (offset > inode->i_sb->s_maxbytes)
1140                goto out;
1141        inode->i_size = offset;
1142
1143out_truncate:
1144        if (inode->i_op && inode->i_op->truncate)
1145                inode->i_op->truncate(inode);
1146        return 0;
1147out_sig:
1148        send_sig(SIGXFSZ, current, 0);
1149out:
1150        return -EFBIG;
1151}
1152
1153/* 
1154 * Primitive swap readahead code. We simply read an aligned block of
1155 * (1 << page_cluster) entries in the swap area. This method is chosen
1156 * because it doesn't cost us any seek time.  We also make sure to queue
1157 * the 'original' request together with the readahead ones...  
1158 */
1159void swapin_readahead(swp_entry_t entry)
1160{
1161        int i, num;
1162        struct page *new_page;
1163        unsigned long offset;
1164
1165        /*
1166         * Get the number of handles we should do readahead io to.
1167         */
1168        num = valid_swaphandles(entry, &offset);
1169        for (i = 0; i < num; offset++, i++) {
1170                /* Ok, do the async read-ahead now */
1171                new_page = read_swap_cache_async(swp_entry(swp_type(entry), offset));
1172                if (!new_page)
1173                        break;
1174                page_cache_release(new_page);
1175        }
1176        return;
1177}
1178
1179/*
1180 * We hold the mm semaphore and the page_table_lock on entry and
1181 * should release the pagetable lock on exit..
1182 */
1183static int do_swap_page(struct mm_struct * mm,
1184        struct vm_area_struct * vma, unsigned long address,
1185        pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
1186{
1187        struct page *page;
1188        swp_entry_t entry = pte_to_swp_entry(orig_pte);
1189        pte_t pte;
1190        int ret = VM_FAULT_MINOR;
1191
1192        pte_unmap(page_table);
1193        spin_unlock(&mm->page_table_lock);
1194        page = lookup_swap_cache(entry);
1195        if (!page) {
1196                swapin_readahead(entry);
1197                page = read_swap_cache_async(entry);
1198                if (!page) {
1199                        /*
1200                         * Back out if somebody else faulted in this pte while
1201                         * we released the page table lock.
1202                         */
1203                        spin_lock(&mm->page_table_lock);
1204                        page_table = pte_offset_map(pmd, address);
1205                        if (pte_same(*page_table, orig_pte))
1206                                ret = VM_FAULT_OOM;
1207                        else
1208                                ret = VM_FAULT_MINOR;
1209                        pte_unmap(page_table);
1210                        spin_unlock(&mm->page_table_lock);
1211                        return ret;
1212                }
1213
1214                /* Had to read the page from swap area: Major fault */
1215                ret = VM_FAULT_MAJOR;
1216                KERNEL_STAT_INC(pgmajfault);
1217        }
1218
1219        mark_page_accessed(page);
1220        lock_page(page);
1221
1222        /*
1223         * Back out if somebody else faulted in this pte while we
1224         * released the page table lock.
1225         */
1226        spin_lock(&mm->page_table_lock);
1227        page_table = pte_offset_map(pmd, address);
1228        if (!pte_same(*page_table, orig_pte)) {
1229                pte_unmap(page_table);
1230                spin_unlock(&mm->page_table_lock);
1231                unlock_page(page);
1232                page_cache_release(page);
1233                return VM_FAULT_MINOR;
1234        }
1235
1236        /* The page isn't present yet, go ahead with the fault. */
1237                
1238        swap_free(entry);
1239        if (vm_swap_full())
1240                remove_exclusive_swap_page(page);
1241
1242        mm->rss++;
1243        pte = mk_pte(page, vma->vm_page_prot);
1244        if (write_access && can_share_swap_page(page))
1245                pte = pte_mkdirty(pte_mkwrite(pte));
1246        unlock_page(page);
1247
1248        flush_page_to_ram(page);
1249        flush_icache_page(vma, page);
1250        set_pte(page_table, pte);
1251        page_add_rmap(page, page_table);
1252
1253        /* No need to invalidate - it was non-present before */
1254        update_mmu_cache(vma, address, pte);
1255        pte_unmap(page_table);
1256        spin_unlock(&mm->page_table_lock);
1257        return ret;
1258}
1259
1260/*
1261 * We are called with the MM semaphore and page_table_lock
1262 * spinlock held to protect against concurrent faults in
1263 * multithreaded programs. 
1264 */
1265static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr)
1266{
1267        pte_t entry;
1268        struct page * page = ZERO_PAGE(addr);
1269
1270        /* Read-only mapping of ZERO_PAGE. */
1271        entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1272
1273        /* ..except if it's a write access */
1274        if (write_access) {
1275                /* Allocate our own private page. */
1276                pte_unmap(page_table);
1277                spin_unlock(&mm->page_table_lock);
1278
1279                page = alloc_page(GFP_HIGHUSER);
1280                if (!page)
1281                        goto no_mem;
1282                clear_user_highpage(page, addr);
1283
1284                spin_lock(&mm->page_table_lock);
1285                page_table = pte_offset_map(pmd, addr);
1286
1287                if (!pte_none(*page_table)) {
1288                        pte_unmap(page_table);
1289                        page_cache_release(page);
1290                        spin_unlock(&mm->page_table_lock);
1291                        return VM_FAULT_MINOR;
1292                }
1293                mm->rss++;
1294                flush_page_to_ram(page);
1295                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
1296                lru_cache_add(page);
1297                mark_page_accessed(page);
1298        }
1299
1300        set_pte(page_table, entry);
1301        page_add_rmap(page, page_table); /* ignores ZERO_PAGE */
1302        pte_unmap(page_table);
1303
1304        /* No need to invalidate - it was non-present before */
1305        update_mmu_cache(vma, addr, entry);
1306        spin_unlock(&mm->page_table_lock);
1307        return VM_FAULT_MINOR;
1308
1309no_mem:
1310        return VM_FAULT_OOM;
1311}
1312
1313/*
1314 * do_no_page() tries to create a new page mapping. It aggressively
1315 * tries to share with existing pages, but makes a separate copy if
1316 * the "write_access" parameter is true in order to avoid the next
1317 * page fault.
1318 *
1319 * As this is called only for pages that do not currently exist, we
1320 * do not need to flush old virtual caches or the TLB.
1321 *
1322 * This is called with the MM semaphore held and the page table
1323 * spinlock held. Exit with the spinlock released.
1324 */
1325static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
1326        unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
1327{
1328        struct page * new_page;
1329        pte_t entry;
1330
1331        if (!vma->vm_ops || !vma->vm_ops->nopage)
1332                return do_anonymous_page(mm, vma, page_table, pmd, write_access, address);
1333        pte_unmap(page_table);
1334        spin_unlock(&mm->page_table_lock);
1335
1336        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
1337
1338        /* no page was available -- either SIGBUS or OOM */
1339        if (new_page == NOPAGE_SIGBUS)
1340                return VM_FAULT_SIGBUS;
1341        if (new_page == NOPAGE_OOM)
1342                return VM_FAULT_OOM;
1343
1344        /*
1345         * Should we do an early C-O-W break?
1346         */
1347        if (write_access && !(vma->vm_flags & VM_SHARED)) {
1348                struct page * page = alloc_page(GFP_HIGHUSER);
1349                if (!page) {
1350                        page_cache_release(new_page);
1351                        return VM_FAULT_OOM;
1352                }
1353                copy_user_highpage(page, new_page, address);
1354                page_cache_release(new_page);
1355                lru_cache_add(page);
1356                new_page = page;
1357        }
1358
1359        spin_lock(&mm->page_table_lock);
1360        page_table = pte_offset_map(pmd, address);
1361
1362        /*
1363         * This silly early PAGE_DIRTY setting removes a race
1364         * due to the bad i386 page protection. But it's valid
1365         * for other architectures too.
1366         *
1367         * Note that if write_access is true, we either now have
1368         * an exclusive copy of the page, or this is a shared mapping,
1369         * so we can make it writable and dirty to avoid having to
1370         * handle that later.
1371         */
1372        /* Only go through if we didn't race with anybody else... */
1373        if (pte_none(*page_table)) {
1374                ++mm->rss;
1375                flush_page_to_ram(new_page);
1376                flush_icache_page(vma, new_page);
1377                entry = mk_pte(new_page, vma->vm_page_prot);
1378                if (write_access)
1379                        entry = pte_mkwrite(pte_mkdirty(entry));
1380                set_pte(page_table, entry);
1381                page_add_rmap(new_page, page_table);
1382                pte_unmap(page_table);
1383        } else {
1384                /* One of our sibling threads was faster, back out. */
1385                pte_unmap(page_table);
1386                page_cache_release(new_page);
1387                spin_unlock(&mm->page_table_lock);
1388                return VM_FAULT_MINOR;
1389        }
1390
1391        /* no need to invalidate: a not-present page shouldn't be cached */
1392        update_mmu_cache(vma, address, entry);
1393        spin_unlock(&mm->page_table_lock);
1394        return VM_FAULT_MAJOR;
1395}
1396
1397/*
1398 * These routines also need to handle stuff like marking pages dirty
1399 * and/or accessed for architectures that don't do it in hardware (most
1400 * RISC architectures).  The early dirtying is also good on the i386.
1401 *
1402 * There is also a hook called "update_mmu_cache()" that architectures
1403 * with external mmu caches can use to update those (ie the Sparc or
1404 * PowerPC hashed page tables that act as extended TLBs).
1405 *
1406 * Note the "page_table_lock". It is to protect against kswapd removing
1407 * pages from under us. Note that kswapd only ever _removes_ pages, never
1408 * adds them. As such, once we have noticed that the page is not present,
1409 * we can drop the lock early.
1410 *
1411 * The adding of pages is protected by the MM semaphore (which we hold),
1412 * so we don't need to worry about a page being suddenly been added into
1413 * our VM.
1414 *
1415 * We enter with the pagetable spinlock held, we are supposed to
1416 * release it when done.
1417 */
1418static inline int handle_pte_fault(struct mm_struct *mm,
1419        struct vm_area_struct * vma, unsigned long address,
1420        int write_access, pte_t *pte, pmd_t *pmd)
1421{
1422        pte_t entry;
1423
1424        entry = *pte;
1425        if (!pte_present(entry)) {
1426                /*
1427                 * If it truly wasn't present, we know that kswapd
1428                 * and the PTE updates will not touch it later. So
1429                 * drop the lock.
1430                 */
1431                if (pte_none(entry))
1432                        return do_no_page(mm, vma, address, write_access, pte, pmd);
1433                return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
1434        }
1435
1436        if (write_access) {
1437                if (!pte_write(entry))
1438                        return do_wp_page(mm, vma, address, pte, pmd, entry);
1439
1440                entry = pte_mkdirty(entry);
1441        }
1442        entry = pte_mkyoung(entry);
1443        establish_pte(vma, address, pte, entry);
1444        pte_unmap(pte);
1445        spin_unlock(&mm->page_table_lock);
1446        return VM_FAULT_MINOR;
1447}
1448
1449/*
1450 * By the time we get here, we already hold the mm semaphore
1451 */
1452int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
1453        unsigned long address, int write_access)
1454{
1455        pgd_t *pgd;
1456        pmd_t *pmd;
1457
1458        current->state = TASK_RUNNING;
1459        pgd = pgd_offset(mm, address);
1460
1461        KERNEL_STAT_INC(pgfault);
1462        /*
1463         * We need the page table lock to synchronize with kswapd
1464         * and the SMP-safe atomic PTE updates.
1465         */
1466        spin_lock(&mm->page_table_lock);
1467        pmd = pmd_alloc(mm, pgd, address);
1468
1469        if (pmd) {
1470                pte_t * pte = pte_alloc_map(mm, pmd, address);
1471                if (pte)
1472                        return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
1473        }
1474        spin_unlock(&mm->page_table_lock);
1475        return VM_FAULT_OOM;
1476}
1477
1478/*
1479 * Allocate page middle directory.
1480 *
1481 * We've already handled the fast-path in-line, and we own the
1482 * page table lock.
1483 *
1484 * On a two-level page table, this ends up actually being entirely
1485 * optimized away.
1486 */
1487pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
1488{
1489        pmd_t *new;
1490
1491        spin_unlock(&mm->page_table_lock);
1492        new = pmd_alloc_one(mm, address);
1493        spin_lock(&mm->page_table_lock);
1494        if (!new)
1495                return NULL;
1496
1497        /*
1498         * Because we dropped the lock, we should re-check the
1499         * entry, as somebody else could have populated it..
1500         */
1501        if (pgd_present(*pgd)) {
1502                pmd_free(new);
1503                goto out;
1504        }
1505        pgd_populate(mm, pgd, new);
1506out:
1507        return pmd_offset(pgd, address);
1508}
1509
1510int make_pages_present(unsigned long addr, unsigned long end)
1511{
1512        int ret, len, write;
1513        struct vm_area_struct * vma;
1514
1515        vma = find_vma(current->mm, addr);
1516        write = (vma->vm_flags & VM_WRITE) != 0;
1517        if (addr >= end)
1518                BUG();
1519        if (end > vma->vm_end)
1520                BUG();
1521        len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
1522        ret = get_user_pages(current, current->mm, addr,
1523                        len, write, 0, NULL, NULL);
1524        return ret == len ? 0 : -1;
1525}
1526
1527/* 
1528 * Map a vmalloc()-space virtual address to the physical page.
1529 */
1530struct page * vmalloc_to_page(void * vmalloc_addr)
1531{
1532        unsigned long addr = (unsigned long) vmalloc_addr;
1533        struct page *page = NULL;
1534        pgd_t *pgd = pgd_offset_k(addr);
1535        pmd_t *pmd;
1536        pte_t *ptep, pte;
1537  
1538        if (!pgd_none(*pgd)) {
1539                pmd = pmd_offset(pgd, addr);
1540                if (!pmd_none(*pmd)) {
1541                        preempt_disable();
1542                        ptep = pte_offset_map(pmd, addr);
1543                        pte = *ptep;
1544                        if (pte_present(pte))
1545                                page = pte_page(pte);
1546                        pte_unmap(ptep);
1547                        preempt_enable();
1548                }
1549        }
1550        return page;
1551}
1552
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.