linux-old/mm/memory.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/memory.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 */
   6
   7/*
   8 * demand-loading started 01.12.91 - seems it is high on the list of
   9 * things wanted, and it should be easy to implement. - Linus
  10 */
  11
/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */
  22
  23/*
  24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25 * thought has to go into this. Oh, well..
  26 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27 *              Found it. Everything seems to work now.
  28 * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29 */
  30
  31/*
  32 * 05.04.94  -  Multi-page memory management added for v1.1.
  33 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34 */
  35
  36#include <linux/mm.h>
  37#include <linux/mman.h>
  38#include <linux/swap.h>
  39#include <linux/smp_lock.h>
  40
  41#include <asm/uaccess.h>
  42#include <asm/pgtable.h>
  43
/*
 * Exclusive upper bound used throughout this file when validating a
 * MAP_NR() page number before it is used to index mem_map[].
 */
unsigned long max_mapnr = 0;
/* Not referenced in this part of the file; presumably the count of physical pages - TODO confirm. */
unsigned long num_physpages = 0;
/* Not referenced in this part of the file; presumably the top of directly mapped memory - TODO confirm. */
void * high_memory = NULL;
  47
  48/*
  49 * We special-case the C-O-W ZERO_PAGE, because it's such
  50 * a common occurrence (no need to read the page to know
  51 * that it's zero - better for the cache and memory subsystem).
  52 */
  53static inline void copy_cow_page(unsigned long from, unsigned long to)
  54{
  55        if (from == ZERO_PAGE(to)) {
  56                clear_page(to);
  57                return;
  58        }
  59        copy_page(to, from);
  60}
  61
/*
 * Per-page bookkeeping array.  This file indexes it with MAP_NR(addr)
 * (after checking against max_mapnr) and uses the entries' 'count' and
 * 'offset' fields and the PageReserved()/PageSwapCache() tests.
 */
mem_map_t * mem_map = NULL;
  63
  64/*
  65 * Note: this doesn't free the actual pages themselves. That
  66 * has been handled earlier when unmapping all the memory regions.
  67 */
  68static inline void free_one_pmd(pmd_t * dir)
  69{
  70        pte_t * pte;
  71
  72        if (pmd_none(*dir))
  73                return;
  74        if (pmd_bad(*dir)) {
  75                printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  76                pmd_clear(dir);
  77                return;
  78        }
  79        pte = pte_offset(dir, 0);
  80        pmd_clear(dir);
  81        pte_free(pte);
  82}
  83
  84static inline void free_one_pgd(pgd_t * dir)
  85{
  86        int j;
  87        pmd_t * pmd;
  88
  89        if (pgd_none(*dir))
  90                return;
  91        if (pgd_bad(*dir)) {
  92                printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
  93                pgd_clear(dir);
  94                return;
  95        }
  96        pmd = pmd_offset(dir, 0);
  97        pgd_clear(dir);
  98        for (j = 0; j < PTRS_PER_PMD ; j++)
  99                free_one_pmd(pmd+j);
 100        pmd_free(pmd);
 101}
 102
 103/* Low and high watermarks for page table cache.
 104   The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
 105 */
/* [0] is the low watermark, [1] the high one; both are handed to do_check_pgt_cache(). */
int pgt_cache_water[2] = { 25, 50 };
 107
 108/* Returns the number of pages freed */
 109int check_pgt_cache(void)
 110{
 111        return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
 112}
 113
 114
 115/*
 116 * This function clears all user-level page tables of a process - this
 117 * is needed by execve(), so that old pages aren't in the way.
 118 */
 119void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
 120{
 121        pgd_t * page_dir = mm->pgd;
 122
 123        if (page_dir && page_dir != swapper_pg_dir) {
 124                page_dir += first;
 125                do {
 126                        free_one_pgd(page_dir);
 127                        page_dir++;
 128                } while (--nr);
 129
 130                /* keep the page table cache within bounds */
 131                check_pgt_cache();
 132        }
 133}
 134
/*
 * This function just frees the page directory - the
 * page tables themselves have been freed earlier by
 * clear_page_tables().
 */
 140void free_page_tables(struct mm_struct * mm)
 141{
 142        pgd_t * page_dir = mm->pgd;
 143
 144        if (page_dir) {
 145                if (page_dir == swapper_pg_dir)
 146                        goto out_bad;
 147                pgd_free(page_dir);
 148        }
 149        return;
 150
 151out_bad:
 152        printk(KERN_ERR
 153                "free_page_tables: Trying to free kernel pgd\n");
 154        return;
 155}
 156
 157int new_page_tables(struct task_struct * tsk)
 158{
 159        pgd_t * new_pg;
 160
 161        if (!(new_pg = pgd_alloc()))
 162                return -ENOMEM;
 163        SET_PAGE_DIR(tsk, new_pg);
 164        tsk->mm->pgd = new_pg;
 165        return 0;
 166}
 167
/*
 * Byte-offset masks for a pointer into a pte/pmd table.  copy_page_range()
 * ANDs an incremented entry pointer with these and stops its loop when the
 * result is zero, i.e. when the pointer has stepped onto a table boundary
 * (this presumes tables are aligned to their own size - TODO confirm).
 */
#define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
 170
 171/*
 172 * copy one vm_area from one task to the other. Assumes the page tables
 173 * already present in the new task to be cleared in the whole range
 174 * covered by this vma.
 175 *
 176 * 08Jan98 Merged into one routine from several inline routines to reduce
 177 *         variable count and make things faster. -jj
 178 */
/*
 * dst: mm receiving the copied mappings
 * src: mm the mappings are copied from
 * vma: area whose [vm_start, vm_end) range is walked
 *
 * Returns 0 once the whole range has been walked, or -ENOMEM when a
 * pmd or pte table for dst cannot be allocated.
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
{
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        /* True only for mappings that may be written but are not shared. */
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        /* Start one slot early: the loop pre-increments both pointers. */
        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;

        for (;;) {
                pmd_t * src_pmd, * dst_pmd;

                src_pgd++; dst_pgd++;

                /* copy_pmd_range */

                if (pgd_none(*src_pgd))
                        goto skip_copy_pmd_range;
                if (pgd_bad(*src_pgd)) {
                        printk("copy_pmd_range: bad pgd (%08lx)\n", 
                                pgd_val(*src_pgd));
                        pgd_clear(src_pgd);
                        /*
                         * Shared by the "empty" and "bad" cases: move on to
                         * the start of the next pgd-sized region.
                         */
skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                        if (address >= end)
                                goto out;
                        continue;
                }
                if (pgd_none(*dst_pgd)) {
                        if (!pmd_alloc(dst_pgd, 0))
                                goto nomem;
                }

                src_pmd = pmd_offset(src_pgd, address);
                dst_pmd = pmd_offset(dst_pgd, address);

                do {
                        pte_t * src_pte, * dst_pte;

                        /* copy_pte_range */

                        if (pmd_none(*src_pmd))
                                goto skip_copy_pte_range;
                        if (pmd_bad(*src_pmd)) {
                                printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
                                pmd_clear(src_pmd);
                                /*
                                 * Shared by the "empty" and "bad" cases: move
                                 * on to the start of the next pmd-sized region.
                                 */
skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
                                if (address >= end)
                                        goto out;
                                goto cont_copy_pmd_range;
                        }
                        if (pmd_none(*dst_pmd)) {
                                if (!pte_alloc(dst_pmd, 0))
                                        goto nomem;
                        }

                        src_pte = pte_offset(src_pmd, address);
                        dst_pte = pte_offset(dst_pmd, address);

                        do {
                                pte_t pte = *src_pte;
                                unsigned long page_nr;

                                /* copy_one_pte */

                                if (pte_none(pte))
                                        goto cont_copy_pte_range;
                                /*
                                 * Non-empty but not present: pass the raw
                                 * value to swap_duplicate() and copy the
                                 * entry unchanged.
                                 */
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_val(pte));
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                page_nr = MAP_NR(pte_page(pte));
                                /*
                                 * Pages outside mem_map or marked reserved are
                                 * copied verbatim, without touching a count.
                                 */
                                if (page_nr >= max_mapnr || 
                                    PageReserved(mem_map+page_nr)) {
                                        set_pte(dst_pte, pte);
                                        goto cont_copy_pte_range;
                                }
                                /* If it's a COW mapping, write protect it both in the parent and the child */
                                if (cow) {
                                        pte = pte_wrprotect(pte);
                                        set_pte(src_pte, pte);
                                }
                                /* If it's a shared mapping, mark it clean in the child */
                                if (vma->vm_flags & VM_SHARED)
                                        pte = pte_mkclean(pte);
                                set_pte(dst_pte, pte_mkold(pte));
                                /* The page now has one more mapping. */
                                atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:            address += PAGE_SIZE;
                                if (address >= end)
                                        goto out;
                                src_pte++;
                                dst_pte++;
                        /* Stop when src_pte steps onto a pte-table boundary. */
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:    src_pmd++;
                        dst_pmd++;
                /* Stop when src_pmd steps onto a pmd-table boundary. */
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
        }
out:
        return 0;

nomem:
        return -ENOMEM;
}
 286
 287/*
 288 * Return indicates whether a page was freed so caller can adjust rss
 289 */
 290static inline int free_pte(pte_t page)
 291{
 292        if (pte_present(page)) {
 293                unsigned long addr = pte_page(page);
 294                if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
 295                        return 0;
 296                /* 
 297                 * free_page() used to be able to clear swap cache
 298                 * entries.  We may now have to do it manually.  
 299                 */
 300                free_page_and_swap_cache(addr);
 301                return 1;
 302        }
 303        swap_free(pte_val(page));
 304        return 0;
 305}
 306
 307static inline void forget_pte(pte_t page)
 308{
 309        if (!pte_none(page)) {
 310                printk("forget_pte: old mapping existed!\n");
 311                free_pte(page);
 312        }
 313}
 314
 315static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 316{
 317        pte_t * pte;
 318        int freed;
 319
 320        if (pmd_none(*pmd))
 321                return 0;
 322        if (pmd_bad(*pmd)) {
 323                printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 324                pmd_clear(pmd);
 325                return 0;
 326        }
 327        pte = pte_offset(pmd, address);
 328        address &= ~PMD_MASK;
 329        if (address + size > PMD_SIZE)
 330                size = PMD_SIZE - address;
 331        size >>= PAGE_SHIFT;
 332        freed = 0;
 333        for (;;) {
 334                pte_t page;
 335                if (!size)
 336                        break;
 337                page = *pte;
 338                pte++;
 339                size--;
 340                if (pte_none(page))
 341                        continue;
 342                pte_clear(pte-1);
 343                freed += free_pte(page);
 344        }
 345        return freed;
 346}
 347
 348static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 349{
 350        pmd_t * pmd;
 351        unsigned long end;
 352        int freed;
 353
 354        if (pgd_none(*dir))
 355                return 0;
 356        if (pgd_bad(*dir)) {
 357                printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 358                pgd_clear(dir);
 359                return 0;
 360        }
 361        pmd = pmd_offset(dir, address);
 362        address &= ~PGDIR_MASK;
 363        end = address + size;
 364        if (end > PGDIR_SIZE)
 365                end = PGDIR_SIZE;
 366        freed = 0;
 367        do {
 368                freed += zap_pte_range(pmd, address, end - address);
 369                address = (address + PMD_SIZE) & PMD_MASK; 
 370                pmd++;
 371        } while (address < end);
 372        return freed;
 373}
 374
 375/*
 376 * remove user pages in a given range.
 377 */
 378void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 379{
 380        pgd_t * dir;
 381        unsigned long end = address + size;
 382        int freed = 0;
 383
 384        dir = pgd_offset(mm, address);
 385        while (address < end) {
 386                freed += zap_pmd_range(dir, address, end - address);
 387                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 388                dir++;
 389        }
 390        /*
 391         * Update rss for the mm_struct (not necessarily current->mm)
 392         */
 393        if (mm->rss > 0) {
 394                mm->rss -= freed;
 395                if (mm->rss < 0)
 396                        mm->rss = 0;
 397        }
 398}
 399
 400static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
 401                                     unsigned long size, pgprot_t prot)
 402{
 403        unsigned long end;
 404
 405        address &= ~PMD_MASK;
 406        end = address + size;
 407        if (end > PMD_SIZE)
 408                end = PMD_SIZE;
 409        do {
 410                pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address),
 411                                               prot));
 412                pte_t oldpage = *pte;
 413                set_pte(pte, zero_pte);
 414                forget_pte(oldpage);
 415                address += PAGE_SIZE;
 416                pte++;
 417        } while (address < end);
 418}
 419
 420static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
 421                                    unsigned long size, pgprot_t prot)
 422{
 423        unsigned long end;
 424
 425        address &= ~PGDIR_MASK;
 426        end = address + size;
 427        if (end > PGDIR_SIZE)
 428                end = PGDIR_SIZE;
 429        do {
 430                pte_t * pte = pte_alloc(pmd, address);
 431                if (!pte)
 432                        return -ENOMEM;
 433                zeromap_pte_range(pte, address, end - address, prot);
 434                address = (address + PMD_SIZE) & PMD_MASK;
 435                pmd++;
 436        } while (address < end);
 437        return 0;
 438}
 439
 440int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 441{
 442        int error = 0;
 443        pgd_t * dir;
 444        unsigned long beg = address;
 445        unsigned long end = address + size;
 446
 447        dir = pgd_offset(current->mm, address);
 448        flush_cache_range(current->mm, beg, end);
 449        while (address < end) {
 450                pmd_t *pmd = pmd_alloc(dir, address);
 451                error = -ENOMEM;
 452                if (!pmd)
 453                        break;
 454                error = zeromap_pmd_range(pmd, address, end - address, prot);
 455                if (error)
 456                        break;
 457                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 458                dir++;
 459        }
 460        flush_tlb_range(current->mm, beg, end);
 461        return error;
 462}
 463
 464/*
 465 * maps a range of physical memory into the requested pages. the old
 466 * mappings are removed. any references to nonexistent pages results
 467 * in null mappings (currently treated as "copy-on-access")
 468 */
 469static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 470        unsigned long phys_addr, pgprot_t prot)
 471{
 472        unsigned long end;
 473
 474        address &= ~PMD_MASK;
 475        end = address + size;
 476        if (end > PMD_SIZE)
 477                end = PMD_SIZE;
 478        do {
 479                unsigned long mapnr;
 480                pte_t oldpage = *pte;
 481                pte_clear(pte);
 482
 483                mapnr = MAP_NR(__va(phys_addr));
 484                if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
 485                        set_pte(pte, mk_pte_phys(phys_addr, prot));
 486                forget_pte(oldpage);
 487                address += PAGE_SIZE;
 488                phys_addr += PAGE_SIZE;
 489                pte++;
 490        } while (address < end);
 491}
 492
 493static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 494        unsigned long phys_addr, pgprot_t prot)
 495{
 496        unsigned long end;
 497
 498        address &= ~PGDIR_MASK;
 499        end = address + size;
 500        if (end > PGDIR_SIZE)
 501                end = PGDIR_SIZE;
 502        phys_addr -= address;
 503        do {
 504                pte_t * pte = pte_alloc(pmd, address);
 505                if (!pte)
 506                        return -ENOMEM;
 507                remap_pte_range(pte, address, end - address, address + phys_addr, prot);
 508                address = (address + PMD_SIZE) & PMD_MASK;
 509                pmd++;
 510        } while (address < end);
 511        return 0;
 512}
 513
 514int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
 515{
 516        int error = 0;
 517        pgd_t * dir;
 518        unsigned long beg = from;
 519        unsigned long end = from + size;
 520
 521        phys_addr -= from;
 522        dir = pgd_offset(current->mm, from);
 523        flush_cache_range(current->mm, beg, end);
 524        while (from < end) {
 525                pmd_t *pmd = pmd_alloc(dir, from);
 526                error = -ENOMEM;
 527                if (!pmd)
 528                        break;
 529                error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
 530                if (error)
 531                        break;
 532                from = (from + PGDIR_SIZE) & PGDIR_MASK;
 533                dir++;
 534        }
 535        flush_tlb_range(current->mm, beg, end);
 536        return error;
 537}
 538
 539/*
 540 * sanity-check function..
 541 */
 542static void put_page(pte_t * page_table, pte_t pte)
 543{
 544        if (!pte_none(*page_table)) {
 545                free_page_and_swap_cache(pte_page(pte));
 546                return;
 547        }
 548/* no need for flush_tlb */
 549        set_pte(page_table, pte);
 550}
 551
 552/*
 553 * This routine is used to map in a page into an address space: needed by
 554 * execve() for the initial stack and environment pages.
 555 */
 556unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 557{
 558        pgd_t * pgd;
 559        pmd_t * pmd;
 560        pte_t * pte;
 561
 562        if (MAP_NR(page) >= max_mapnr)
 563                printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 564        if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
 565                printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 566        pgd = pgd_offset(tsk->mm,address);
 567        pmd = pmd_alloc(pgd, address);
 568        if (!pmd) {
 569                free_page(page);
 570                force_sig(SIGKILL, tsk);
 571                return 0;
 572        }
 573        pte = pte_alloc(pmd, address);
 574        if (!pte) {
 575                free_page(page);
 576                force_sig(SIGKILL, tsk);
 577                return 0;
 578        }
 579        if (!pte_none(*pte)) {
 580                printk("put_dirty_page: pte %08lx already exists\n",
 581                       pte_val(*pte));
 582                free_page(page);
 583                return 0;
 584        }
 585        flush_page_to_ram(page);
 586        set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 587/* no need for flush_tlb */
 588        return page;
 589}
 590
 591/*
 592 * This routine handles present pages, when users try to write
 593 * to a shared page. It is done by copying the page to a new address
 594 * and decrementing the shared-page counter for the old page.
 595 *
 596 * Goto-purists beware: the only reason for goto's here is that it results
 597 * in better assembly code.. The "default" path will see no jumps at all.
 598 *
 599 * Note that this routine assumes that the protection checks have been
 600 * done by the caller (the low-level page fault routine in most cases).
 601 * Thus we can safely just mark it writable once we've done any necessary
 602 * COW.
 603 *
 604 * We also mark the page dirty at this point even though the page will
 605 * change only once the write actually happens. This avoids a few races,
 606 * and potentially makes it more efficient.
 607 */
/*
 * tsk:        task taking the fault (its min_flt counter is bumped)
 * vma:        area containing the faulting address
 * address:    faulting virtual address
 * page_table: pte slot for that address
 *
 * Returns 1 when the fault is resolved or there is nothing left to do,
 * -1 for a bogus page or when no new page could be allocated.
 * unlock_kernel() is called on every return path.
 */
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
        unsigned long address, pte_t *page_table)
{
        pte_t pte;
        unsigned long old_page, new_page;
        struct page * page_map;

        /* Snapshot the pte, then allocate; the snapshot is re-checked below. */
        pte = *page_table;
        new_page = __get_free_page(GFP_USER);
        /* Did swap_out() unmap the protected page while we slept? */
        if (pte_val(*page_table) != pte_val(pte))
                goto end_wp_page;
        if (!pte_present(pte))
                goto end_wp_page;
        if (pte_write(pte))
                goto end_wp_page;
        old_page = pte_page(pte);
        if (MAP_NR(old_page) >= max_mapnr)
                goto bad_wp_page;
        tsk->min_flt++;
        page_map = mem_map + MAP_NR(old_page);

        /*
         * We can avoid the copy if:
         * - we're the only user (count == 1)
         * - the only other user is the swap cache,
         *   and the only swap cache user is itself,
         *   in which case we can remove the page
         *   from the swap cache.
         */
        switch (atomic_read(&page_map->count)) {
        case 2:
                if (!PageSwapCache(page_map))
                        break;
                if (swap_count(page_map->offset) != 1)
                        break;
                delete_from_swap_cache(page_map);
                /* FallThrough */
        case 1:
                /* Sole user: make the existing page writable and dirty in place. */
                flush_cache_page(vma, address);
                set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
                flush_tlb_page(vma, address);
                /* Also the exit for the early "nothing to do" checks above. */
end_wp_page:
                /*
                 * We can release the kernel lock now.. Now swap_out will see
                 * a dirty page and so won't get confused and flush_tlb_page
                 * won't SMP race. -Andrea
                 */
                unlock_kernel();

                /* The page allocated up front was not needed. */
                if (new_page)
                        free_page(new_page);
                return 1;
        }

        /* A real copy is required from here on. */
        if (!new_page)
                goto no_new_page;

        if (PageReserved(page_map))
                ++vma->vm_mm->rss;
        copy_cow_page(old_page,new_page);
        flush_page_to_ram(old_page);
        flush_page_to_ram(new_page);
        flush_cache_page(vma, address);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
        flush_tlb_page(vma, address);
        unlock_kernel();
        /* Drop this mapping's reference on the old page. */
        __free_page(page_map);
        return 1;

bad_wp_page:
        printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
no_new_page:
        unlock_kernel();
        if (new_page)
                free_page(new_page);
        return -1;
}
 686
 687/*
 688 * This function zeroes out partial mmap'ed pages at truncation time..
 689 */
 690static void partial_clear(struct vm_area_struct *vma, unsigned long address)
 691{
 692        pgd_t *page_dir;
 693        pmd_t *page_middle;
 694        pte_t *page_table, pte;
 695
 696        page_dir = pgd_offset(vma->vm_mm, address);
 697        if (pgd_none(*page_dir))
 698                return;
 699        if (pgd_bad(*page_dir)) {
 700                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 701                pgd_clear(page_dir);
 702                return;
 703        }
 704        page_middle = pmd_offset(page_dir, address);
 705        if (pmd_none(*page_middle))
 706                return;
 707        if (pmd_bad(*page_middle)) {
 708                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 709                pmd_clear(page_middle);
 710                return;
 711        }
 712        page_table = pte_offset(page_middle, address);
 713        pte = *page_table;
 714        if (!pte_present(pte))
 715                return;
 716        flush_cache_page(vma, address);
 717        address &= ~PAGE_MASK;
 718        address += pte_page(pte);
 719        if (MAP_NR(address) >= max_mapnr)
 720                return;
 721        memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 722        flush_page_to_ram(pte_page(pte));
 723}
 724
 725static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long offset)
 726{
 727        do {
 728                struct mm_struct *mm = mpnt->vm_mm;
 729                unsigned long start = mpnt->vm_start;
 730                unsigned long end = mpnt->vm_end;
 731                unsigned long len = end - start;
 732                unsigned long diff;
 733
 734                /* mapping wholly truncated? */
 735                if (mpnt->vm_offset >= offset) {
 736                        flush_cache_range(mm, start, end);
 737                        zap_page_range(mm, start, len);
 738                        flush_tlb_range(mm, start, end);
 739                        continue;
 740                }
 741                /* mapping wholly unaffected? */
 742                diff = offset - mpnt->vm_offset;
 743                if (diff >= len)
 744                        continue;
 745                /* Ok, partially affected.. */
 746                start += diff;
 747                len = (len - diff) & PAGE_MASK;
 748                if (start & ~PAGE_MASK) {
 749                        partial_clear(mpnt, start);
 750                        start = (start + ~PAGE_MASK) & PAGE_MASK;
 751                }
 752                flush_cache_range(mm, start, end);
 753                zap_page_range(mm, start, len);
 754                flush_tlb_range(mm, start, end);
 755        } while ((mpnt = mpnt->vm_next_share) != NULL);
 756}
 757
 758/*
 759 * Handle all mappings that got truncated by a "truncate()"
 760 * system call.
 761 *
 762 * NOTE! We have to be ready to update the memory sharing
 763 * between the file and the memory map for a potential last
 764 * incomplete page.  Ugly, but necessary.
 765 */
 766void vmtruncate(struct inode * inode, unsigned long offset)
 767{
 768        truncate_inode_pages(inode, offset);
 769        if (inode->i_mmap)
 770                vmtruncate_list(inode->i_mmap, offset);
 771        if (inode->i_mmap_shared)
 772                vmtruncate_list(inode->i_mmap_shared, offset);
 773}
 774
 775
 776/*
 777 * This is called with the kernel lock held, we need
 778 * to return without it.
 779 */
 780static int do_swap_page(struct task_struct * tsk, 
 781        struct vm_area_struct * vma, unsigned long address,
 782        pte_t * page_table, pte_t entry, int write_access)
 783{
 784        int ret = 1;
 785        if (!vma->vm_ops || !vma->vm_ops->swapin) {
 786                ret = swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 787                flush_page_to_ram(pte_page(*page_table));
 788        } else {
 789                pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 790                if (pte_val(*page_table) != pte_val(entry)) {
 791                        free_page(pte_page(page));
 792                } else {
 793                        if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
 794                            !(vma->vm_flags & VM_SHARED))
 795                                page = pte_wrprotect(page);
 796                        ++vma->vm_mm->rss;
 797                        ++tsk->maj_flt;
 798                        flush_page_to_ram(pte_page(page));
 799                        set_pte(page_table, page);
 800                }
 801        }
 802        unlock_kernel();
 803        return ret;
 804}
 805
 806/*
 807 * This only needs the MM semaphore
 808 */
 809static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
 810{
 811        pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 812        if (write_access) {
 813                unsigned long page = __get_free_page(GFP_USER);
 814                if (!page)
 815                        return -1;
 816                clear_page(page);
 817                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 818                vma->vm_mm->rss++;
 819                tsk->min_flt++;
 820                flush_page_to_ram(page);
 821        }
 822        put_page(page_table, entry);
 823        return 1;
 824}
 825
 826/*
 827 * do_no_page() tries to create a new page mapping. It aggressively
 828 * tries to share with existing pages, but makes a separate copy if
 829 * the "write_access" parameter is true in order to avoid the next
 830 * page fault.
 831 *
 832 * As this is called only for pages that do not currently exist, we
 833 * do not need to flush old virtual caches or the TLB.
 834 *
 835 * This is called with the MM semaphore and the kernel lock held.
 836 * We need to release the kernel lock as soon as possible..
 837 */
 838static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 839        unsigned long address, int write_access, pte_t *page_table)
 840{
 841        unsigned long page;
 842        pte_t entry;
 843
 844        if (!vma->vm_ops || !vma->vm_ops->nopage) {
 845                unlock_kernel();
 846                return do_anonymous_page(tsk, vma, page_table, write_access,
 847                                         address);
 848        }
 849
 850        /*
 851         * The third argument is "no_share", which tells the low-level code
 852         * to copy, not share the page even if sharing is possible.  It's
 853         * essentially an early COW detection.
 854         */
 855        page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
 856                (vma->vm_flags & VM_SHARED)?0:write_access);
 857
 858        unlock_kernel();
 859        if (!page)
 860                return 0;
 861        if (page == -1)
 862                return -1;
 863
 864        ++tsk->maj_flt;
 865        ++vma->vm_mm->rss;
 866        /*
 867         * This silly early PAGE_DIRTY setting removes a race
 868         * due to the bad i386 page protection. But it's valid
 869         * for other architectures too.
 870         *
 871         * Note that if write_access is true, we either now have
 872         * an exclusive copy of the page, or this is a shared mapping,
 873         * so we can make it writable and dirty to avoid having to
 874         * handle that later.
 875         */
 876        flush_page_to_ram(page);
 877        entry = mk_pte(page, vma->vm_page_prot);
 878        if (write_access) {
 879                entry = pte_mkwrite(pte_mkdirty(entry));
 880        } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
 881                   !(vma->vm_flags & VM_SHARED))
 882                entry = pte_wrprotect(entry);
 883        put_page(page_table, entry);
 884        /* no need to invalidate: a not-present page shouldn't be cached */
 885        return 1;
 886}
 887
 888/*
 889 * These routines also need to handle stuff like marking pages dirty
 890 * and/or accessed for architectures that don't do it in hardware (most
 891 * RISC architectures).  The early dirtying is also good on the i386.
 892 *
 893 * There is also a hook called "update_mmu_cache()" that architectures
 894 * with external mmu caches can use to update those (ie the Sparc or
 895 * PowerPC hashed page tables that act as extended TLBs).
 896 */
 897static inline int handle_pte_fault(struct task_struct *tsk,
 898        struct vm_area_struct * vma, unsigned long address,
 899        int write_access, pte_t * pte)
 900{
 901        pte_t entry;
 902
 903        lock_kernel();
 904        entry = *pte;
 905
 906        if (!pte_present(entry)) {
 907                if (pte_none(entry))
 908                        return do_no_page(tsk, vma, address, write_access, pte);
 909                return do_swap_page(tsk, vma, address, pte, entry, write_access);
 910        }
 911
 912        entry = pte_mkyoung(entry);
 913        set_pte(pte, entry);
 914        flush_tlb_page(vma, address);
 915        if (write_access) {
 916                if (!pte_write(entry))
 917                        return do_wp_page(tsk, vma, address, pte);
 918
 919                entry = pte_mkdirty(entry);
 920                set_pte(pte, entry);
 921                flush_tlb_page(vma, address);
 922        }
 923        unlock_kernel();
 924        return 1;
 925}
 926
 927/*
 928 * By the time we get here, we already hold the mm semaphore
 929 */
 930int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
 931        unsigned long address, int write_access)
 932{
 933        pgd_t *pgd;
 934        pmd_t *pmd;
 935        pte_t * pte;
 936        int ret;
 937
 938        current->state = TASK_RUNNING;
 939        pgd = pgd_offset(vma->vm_mm, address);
 940        pmd = pmd_alloc(pgd, address);
 941        if (!pmd)
 942                return -1;
 943        pte = pte_alloc(pmd, address);
 944        if (!pte)
 945                return -1;
 946        ret = handle_pte_fault(tsk, vma, address, write_access, pte);
 947        if (ret > 0)
 948                update_mmu_cache(vma, address, *pte);
 949        return ret;
 950}
 951
 952/*
 953 * Simplistic page force-in..
 954 */
 955int make_pages_present(unsigned long addr, unsigned long end)
 956{
 957        int write;
 958        struct vm_area_struct * vma;
 959
 960        vma = find_vma(current->mm, addr);
 961        write = (vma->vm_flags & VM_WRITE) != 0;
 962        while (addr < end) {
 963                if (handle_mm_fault(current, vma, addr, write) < 0)
 964                        return -1;
 965                addr += PAGE_SIZE;
 966        }
 967        return 0;
 968}
 969
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.