linux-old/mm/memory.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/memory.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 */
   6
   7/*
   8 * demand-loading started 01.12.91 - seems it is high on the list of
   9 * things wanted, and it should be easy to implement. - Linus
  10 */
  11
  12/*
  13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14 * pages started 02.12.91, seems to work. - Linus.
  15 *
  16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17 * would have taken more than the 6M I have free, but it worked well as
  18 * far as I could see.
  19 *
  20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21 */
  22
  23/*
  24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25 * thought has to go into this. Oh, well..
  26 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27 *              Found it. Everything seems to work now.
  28 * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29 */
  30
  31/*
  32 * 05.04.94  -  Multi-page memory management added for v1.1.
  33 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34 */
  35
  36#include <linux/signal.h>
  37#include <linux/sched.h>
  38#include <linux/head.h>
  39#include <linux/kernel.h>
  40#include <linux/errno.h>
  41#include <linux/string.h>
  42#include <linux/types.h>
  43#include <linux/ptrace.h>
  44#include <linux/mman.h>
  45#include <linux/mm.h>
  46#include <linux/swap.h>
  47
  48#include <asm/system.h>
  49#include <asm/uaccess.h>
  50#include <asm/pgtable.h>
  51#include <asm/string.h>
  52
  53unsigned long max_mapnr = 0;        /* MAP_NR() values >= this are treated as outside mem_map by the checks in this file */
  54unsigned long num_physpages = 0;    /* not referenced in this file; presumably the physical page count - TODO confirm */
  55void * high_memory = NULL;  /* not referenced in this file; presumably the top of directly usable memory - TODO confirm */
  56
  57/*
  58 * We special-case the C-O-W ZERO_PAGE, because it's such
  59 * a common occurrence (no need to read the page to know
  60 * that it's zero - better for the cache and memory subsystem).
  61 */
  62static inline void copy_cow_page(unsigned long from, unsigned long to)
  63{
  64        if (from == ZERO_PAGE) {
  65                clear_page(to);
  66                return;
  67        }
  68        copy_page(to, from);
  69}
  70
  71#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)  /* number of pgd entries spanning TASK_SIZE; entries from this index up are copied from init_mm in new_page_tables() */
  72
  73mem_map_t * mem_map = NULL; /* per-page descriptor array, indexed by MAP_NR(); this file reads each entry's ->count and tests it with PageReserved() */
  74
  75/*
  76 * oom() prints a message (so that the user knows why the process died),
  77 * and gives the process an untrappable SIGKILL.
  78 */
  79void oom(struct task_struct * task)
  80{
  81        printk("\nOut of memory for %s.\n", task->comm);
  82        task->sig->action[SIGKILL-1].sa_handler = NULL;
  83        task->blocked &= ~(1<<(SIGKILL-1));
  84        send_sig(SIGKILL,task,1);
  85}
  86
  87/*
  88 * Note: this doesn't free the actual pages themselves. That
  89 * has been handled earlier when unmapping all the memory regions.
  90 */
  91static inline void free_one_pmd(pmd_t * dir)
  92{
  93        pte_t * pte;
  94
  95        if (pmd_none(*dir))
  96                return;
  97        if (pmd_bad(*dir)) {
  98                printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  99                pmd_clear(dir);
 100                return;
 101        }
 102        pte = pte_offset(dir, 0);
 103        pmd_clear(dir);
 104        pte_free(pte);
 105}
 106
 107static inline void free_one_pgd(pgd_t * dir)
 108{
 109        int j;
 110        pmd_t * pmd;
 111
 112        if (pgd_none(*dir))
 113                return;
 114        if (pgd_bad(*dir)) {
 115                printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 116                pgd_clear(dir);
 117                return;
 118        }
 119        pmd = pmd_offset(dir, 0);
 120        pgd_clear(dir);
 121        for (j = 0; j < PTRS_PER_PMD ; j++)
 122                free_one_pmd(pmd+j);
 123        pmd_free(pmd);
 124}
 125        
 126/*
 127 * This function clears all user-level page tables of a process - this
 128 * is needed by execve(), so that old pages aren't in the way.
 129 */
 130void clear_page_tables(struct task_struct * tsk)
 131{
 132        int i;
 133        pgd_t * page_dir;
 134
 135        page_dir = tsk->mm->pgd;
 136        if (!page_dir || page_dir == swapper_pg_dir) {
 137                printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 138                return;
 139        }
 140        for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 141                free_one_pgd(page_dir + i);
 142}
 143
 144/*
 145 * This function frees up all page tables of a process when it exits. It
 146 * is the same as "clear_page_tables()", except it also frees the old
 147 * page table directory.
 148 */
 149void free_page_tables(struct mm_struct * mm)
 150{
 151        int i;
 152        pgd_t * page_dir;
 153
 154        page_dir = mm->pgd;
 155        if (page_dir) {
 156                if (page_dir == swapper_pg_dir) {
 157                        printk("free_page_tables: Trying to free kernel pgd\n");
 158                        return;
 159                }
 160                for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 161                        free_one_pgd(page_dir + i);
 162                pgd_free(page_dir);
 163        }
 164}
 165
 166int new_page_tables(struct task_struct * tsk)
 167{
 168        pgd_t * page_dir, * new_pg;
 169
 170        if (!(new_pg = pgd_alloc()))
 171                return -ENOMEM;
 172        page_dir = pgd_offset(&init_mm, 0);
 173        memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
 174               (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
 175        SET_PAGE_DIR(tsk, new_pg);
 176        tsk->mm->pgd = new_pg;
 177        return 0;
 178}
 179
 180static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
 181{
 182        pte_t pte = *old_pte;
 183        unsigned long page_nr;
 184
 185        if (pte_none(pte))
 186                return;
 187        if (!pte_present(pte)) {
 188                swap_duplicate(pte_val(pte));
 189                set_pte(new_pte, pte);
 190                return;
 191        }
 192        page_nr = MAP_NR(pte_page(pte));
 193        if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) {
 194                set_pte(new_pte, pte);
 195                return;
 196        }
 197        if (cow)
 198                pte = pte_wrprotect(pte);
 199        if (delete_from_swap_cache(&mem_map[page_nr]))
 200                pte = pte_mkdirty(pte);
 201        set_pte(new_pte, pte_mkold(pte));
 202        set_pte(old_pte, pte);
 203        atomic_inc(&mem_map[page_nr].count);
 204}
 205
 206static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
 207{
 208        pte_t * src_pte, * dst_pte;
 209        unsigned long end;
 210
 211        if (pmd_none(*src_pmd))
 212                return 0;
 213        if (pmd_bad(*src_pmd)) {
 214                printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 215                pmd_clear(src_pmd);
 216                return 0;
 217        }
 218        src_pte = pte_offset(src_pmd, address);
 219        if (pmd_none(*dst_pmd)) {
 220                if (!pte_alloc(dst_pmd, 0))
 221                        return -ENOMEM;
 222        }
 223        dst_pte = pte_offset(dst_pmd, address);
 224        address &= ~PMD_MASK;
 225        end = address + size;
 226        if (end >= PMD_SIZE)
 227                end = PMD_SIZE;
 228        do {
 229                /* I would like to switch arguments here, to make it
 230                 * consistent with copy_xxx_range and memcpy syntax.
 231                 */
 232                copy_one_pte(src_pte++, dst_pte++, cow);
 233                address += PAGE_SIZE;
 234        } while (address < end);
 235        return 0;
 236}
 237
 238static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
 239{
 240        pmd_t * src_pmd, * dst_pmd;
 241        unsigned long end;
 242        int error = 0;
 243
 244        if (pgd_none(*src_pgd))
 245                return 0;
 246        if (pgd_bad(*src_pgd)) {
 247                printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 248                pgd_clear(src_pgd);
 249                return 0;
 250        }
 251        src_pmd = pmd_offset(src_pgd, address);
 252        if (pgd_none(*dst_pgd)) {
 253                if (!pmd_alloc(dst_pgd, 0))
 254                        return -ENOMEM;
 255        }
 256        dst_pmd = pmd_offset(dst_pgd, address);
 257        address &= ~PGDIR_MASK;
 258        end = address + size;
 259        if (end > PGDIR_SIZE)
 260                end = PGDIR_SIZE;
 261        do {
 262                error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 263                if (error)
 264                        break;
 265                address = (address + PMD_SIZE) & PMD_MASK; 
 266        } while (address < end);
 267        return error;
 268}
 269
 270/*
 271 * copy one vm_area from one task to the other. Assumes the page tables
 272 * already present in the new task to be cleared in the whole range
 273 * covered by this vma.
 274 */
 275int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 276                        struct vm_area_struct *vma)
 277{
 278        pgd_t * src_pgd, * dst_pgd;
 279        unsigned long address = vma->vm_start;
 280        unsigned long end = vma->vm_end;
 281        int error = 0, cow;
 282
 283        cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 284        src_pgd = pgd_offset(src, address);
 285        dst_pgd = pgd_offset(dst, address);
 286        while (address < end) {
 287                error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 288                if (error)
 289                        break;
 290                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 291        }
 292        return error;
 293}
 294
 295/*
 296 * Return indicates whether a page was freed so caller can adjust rss
 297 */
 298static inline int free_pte(pte_t page)
 299{
 300        if (pte_present(page)) {
 301                unsigned long addr = pte_page(page);
 302                if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
 303                        return 0;
 304                free_page(addr);
 305                return 1;
 306        }
 307        swap_free(pte_val(page));
 308        return 0;
 309}
 310
 311static inline void forget_pte(pte_t page)
 312{
 313        if (!pte_none(page)) {
 314                printk("forget_pte: old mapping existed!\n");
 315                free_pte(page);
 316        }
 317}
 318
 319static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 320{
 321        pte_t * pte;
 322        int freed;
 323
 324        if (pmd_none(*pmd))
 325                return 0;
 326        if (pmd_bad(*pmd)) {
 327                printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 328                pmd_clear(pmd);
 329                return 0;
 330        }
 331        pte = pte_offset(pmd, address);
 332        address &= ~PMD_MASK;
 333        if (address + size > PMD_SIZE)
 334                size = PMD_SIZE - address;
 335        size >>= PAGE_SHIFT;
 336        freed = 0;
 337        for (;;) {
 338                pte_t page;
 339                if (!size)
 340                        break;
 341                page = *pte;
 342                pte++;
 343                size--;
 344                if (pte_none(page))
 345                        continue;
 346                pte_clear(pte-1);
 347                freed += free_pte(page);
 348        }
 349        return freed;
 350}
 351
 352static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 353{
 354        pmd_t * pmd;
 355        unsigned long end;
 356        int freed;
 357
 358        if (pgd_none(*dir))
 359                return 0;
 360        if (pgd_bad(*dir)) {
 361                printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 362                pgd_clear(dir);
 363                return 0;
 364        }
 365        pmd = pmd_offset(dir, address);
 366        address &= ~PGDIR_MASK;
 367        end = address + size;
 368        if (end > PGDIR_SIZE)
 369                end = PGDIR_SIZE;
 370        freed = 0;
 371        do {
 372                freed += zap_pte_range(pmd, address, end - address);
 373                address = (address + PMD_SIZE) & PMD_MASK; 
 374                pmd++;
 375        } while (address < end);
 376        return freed;
 377}
 378
 379/*
 380 * remove user pages in a given range.
 381 */
 382void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 383{
 384        pgd_t * dir;
 385        unsigned long end = address + size;
 386        int freed = 0;
 387
 388        dir = pgd_offset(mm, address);
 389        while (address < end) {
 390                freed += zap_pmd_range(dir, address, end - address);
 391                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 392                dir++;
 393        }
 394        /*
 395         * Update rss for the mm_struct (not necessarily current->mm)
 396         */
 397        if (mm->rss > 0) {
 398                mm->rss -= freed;
 399                if (mm->rss < 0)
 400                        mm->rss = 0;
 401        }
 402}
 403
 404static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 405{
 406        unsigned long end;
 407
 408        address &= ~PMD_MASK;
 409        end = address + size;
 410        if (end > PMD_SIZE)
 411                end = PMD_SIZE;
 412        do {
 413                pte_t oldpage = *pte;
 414                set_pte(pte, zero_pte);
 415                forget_pte(oldpage);
 416                address += PAGE_SIZE;
 417                pte++;
 418        } while (address < end);
 419}
 420
 421static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 422{
 423        unsigned long end;
 424
 425        address &= ~PGDIR_MASK;
 426        end = address + size;
 427        if (end > PGDIR_SIZE)
 428                end = PGDIR_SIZE;
 429        do {
 430                pte_t * pte = pte_alloc(pmd, address);
 431                if (!pte)
 432                        return -ENOMEM;
 433                zeromap_pte_range(pte, address, end - address, zero_pte);
 434                address = (address + PMD_SIZE) & PMD_MASK;
 435                pmd++;
 436        } while (address < end);
 437        return 0;
 438}
 439
 440int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 441{
 442        int error = 0;
 443        pgd_t * dir;
 444        unsigned long beg = address;
 445        unsigned long end = address + size;
 446        pte_t zero_pte;
 447
 448        zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 449        dir = pgd_offset(current->mm, address);
 450        flush_cache_range(current->mm, beg, end);
 451        while (address < end) {
 452                pmd_t *pmd = pmd_alloc(dir, address);
 453                error = -ENOMEM;
 454                if (!pmd)
 455                        break;
 456                error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 457                if (error)
 458                        break;
 459                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 460                dir++;
 461        }
 462        flush_tlb_range(current->mm, beg, end);
 463        return error;
 464}
 465
 466/*
 467 * maps a range of physical memory into the requested pages. the old
 468 * mappings are removed. any references to nonexistent pages results
 469 * in null mappings (currently treated as "copy-on-access")
 470 */
 471static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 472        unsigned long phys_addr, pgprot_t prot)
 473{
 474        unsigned long end;
 475
 476        address &= ~PMD_MASK;
 477        end = address + size;
 478        if (end > PMD_SIZE)
 479                end = PMD_SIZE;
 480        do {
 481                unsigned long mapnr;
 482                pte_t oldpage = *pte;
 483                pte_clear(pte);
 484
 485                mapnr = MAP_NR(__va(phys_addr));
 486                if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
 487                        set_pte(pte, mk_pte_phys(phys_addr, prot));
 488                forget_pte(oldpage);
 489                address += PAGE_SIZE;
 490                phys_addr += PAGE_SIZE;
 491                pte++;
 492        } while (address < end);
 493}
 494
 495static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 496        unsigned long phys_addr, pgprot_t prot)
 497{
 498        unsigned long end;
 499
 500        address &= ~PGDIR_MASK;
 501        end = address + size;
 502        if (end > PGDIR_SIZE)
 503                end = PGDIR_SIZE;
 504        phys_addr -= address;
 505        do {
 506                pte_t * pte = pte_alloc(pmd, address);
 507                if (!pte)
 508                        return -ENOMEM;
 509                remap_pte_range(pte, address, end - address, address + phys_addr, prot);
 510                address = (address + PMD_SIZE) & PMD_MASK;
 511                pmd++;
 512        } while (address < end);
 513        return 0;
 514}
 515
 516int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
 517{
 518        int error = 0;
 519        pgd_t * dir;
 520        unsigned long beg = from;
 521        unsigned long end = from + size;
 522
 523        phys_addr -= from;
 524        dir = pgd_offset(current->mm, from);
 525        flush_cache_range(current->mm, beg, end);
 526        while (from < end) {
 527                pmd_t *pmd = pmd_alloc(dir, from);
 528                error = -ENOMEM;
 529                if (!pmd)
 530                        break;
 531                error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
 532                if (error)
 533                        break;
 534                from = (from + PGDIR_SIZE) & PGDIR_MASK;
 535                dir++;
 536        }
 537        flush_tlb_range(current->mm, beg, end);
 538        return error;
 539}
 540
 541/*
 542 * sanity-check function..
 543 */
 544static void put_page(pte_t * page_table, pte_t pte)
 545{
 546        if (!pte_none(*page_table)) {
 547                free_page(pte_page(pte));
 548                return;
 549        }
 550/* no need for flush_tlb */
 551        set_pte(page_table, pte);
 552}
 553
 554/*
 555 * This routine is used to map in a page into an address space: needed by
 556 * execve() for the initial stack and environment pages.
 557 */
 558unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 559{
 560        pgd_t * pgd;
 561        pmd_t * pmd;
 562        pte_t * pte;
 563
 564        if (MAP_NR(page) >= max_mapnr)
 565                printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 566        if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
 567                printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 568        pgd = pgd_offset(tsk->mm,address);
 569        pmd = pmd_alloc(pgd, address);
 570        if (!pmd) {
 571                free_page(page);
 572                oom(tsk);
 573                return 0;
 574        }
 575        pte = pte_alloc(pmd, address);
 576        if (!pte) {
 577                free_page(page);
 578                oom(tsk);
 579                return 0;
 580        }
 581        if (!pte_none(*pte)) {
 582                printk("put_dirty_page: page already exists\n");
 583                free_page(page);
 584                return 0;
 585        }
 586        flush_page_to_ram(page);
 587        set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 588/* no need for flush_tlb */
 589        return page;
 590}
 591
 592/*
 593 * This routine handles present pages, when users try to write
 594 * to a shared page. It is done by copying the page to a new address
 595 * and decrementing the shared-page counter for the old page.
 596 *
 597 * Goto-purists beware: the only reason for goto's here is that it results
 598 * in better assembly code.. The "default" path will see no jumps at all.
 599 *
 600 * Note that this routine assumes that the protection checks have been
 601 * done by the caller (the low-level page fault routine in most cases).
 602 * Thus we can safely just mark it writable once we've done any necessary
 603 * COW.
 604 *
 605 * We also mark the page dirty at this point even though the page will
 606 * change only once the write actually happens. This avoids a few races,
 607 * and potentially makes it more efficient.
 608 */
 609static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 610        unsigned long address, int write_access, pte_t *page_table)
 611{
 612        pte_t pte;
 613        unsigned long old_page, new_page;
 614
 615        new_page = __get_free_page(GFP_KERNEL);
 616        pte = *page_table;
 617        if (!pte_present(pte))
 618                goto end_wp_page;
 619        if (pte_write(pte))
 620                goto end_wp_page;
 621        old_page = pte_page(pte);
 622        if (MAP_NR(old_page) >= max_mapnr)
 623                goto bad_wp_page;
 624        tsk->min_flt++;
 625        /*
 626         * Do we need to copy?
 627         */
 628        if (atomic_read(&mem_map[MAP_NR(old_page)].count) != 1) {
 629                if (new_page) {
 630                        if (PageReserved(mem_map + MAP_NR(old_page)))
 631                                ++vma->vm_mm->rss;
 632                        copy_cow_page(old_page,new_page);
 633                        flush_page_to_ram(old_page);
 634                        flush_page_to_ram(new_page);
 635                        flush_cache_page(vma, address);
 636                        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 637                        free_page(old_page);
 638                        flush_tlb_page(vma, address);
 639                        return;
 640                }
 641                flush_cache_page(vma, address);
 642                set_pte(page_table, BAD_PAGE);
 643                flush_tlb_page(vma, address);
 644                free_page(old_page);
 645                oom(tsk);
 646                return;
 647        }
 648        flush_cache_page(vma, address);
 649        set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 650        flush_tlb_page(vma, address);
 651        if (new_page)
 652                free_page(new_page);
 653        return;
 654bad_wp_page:
 655        printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 656        send_sig(SIGKILL, tsk, 1);
 657end_wp_page:
 658        if (new_page)
 659                free_page(new_page);
 660        return;
 661}
 662
 663/*
 664 * This function zeroes out partial mmap'ed pages at truncation time..
 665 */
 666static void partial_clear(struct vm_area_struct *vma, unsigned long address)
 667{
 668        pgd_t *page_dir;
 669        pmd_t *page_middle;
 670        pte_t *page_table, pte;
 671
 672        page_dir = pgd_offset(vma->vm_mm, address);
 673        if (pgd_none(*page_dir))
 674                return;
 675        if (pgd_bad(*page_dir)) {
 676                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 677                pgd_clear(page_dir);
 678                return;
 679        }
 680        page_middle = pmd_offset(page_dir, address);
 681        if (pmd_none(*page_middle))
 682                return;
 683        if (pmd_bad(*page_middle)) {
 684                printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 685                pmd_clear(page_middle);
 686                return;
 687        }
 688        page_table = pte_offset(page_middle, address);
 689        pte = *page_table;
 690        if (!pte_present(pte))
 691                return;
 692        flush_cache_page(vma, address);
 693        address &= ~PAGE_MASK;
 694        address += pte_page(pte);
 695        if (MAP_NR(address) >= max_mapnr)
 696                return;
 697        memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 698        flush_page_to_ram(pte_page(pte));
 699}
 700
 701/*
 702 * Handle all mappings that got truncated by a "truncate()"
 703 * system call.
 704 *
 705 * NOTE! We have to be ready to update the memory sharing
 706 * between the file and the memory map for a potential last
 707 * incomplete page.  Ugly, but necessary.
 708 */
 709void vmtruncate(struct inode * inode, unsigned long offset)
 710{
 711        struct vm_area_struct * mpnt;
 712
 713        truncate_inode_pages(inode, offset);
 714        if (!inode->i_mmap)
 715                return;
 716        mpnt = inode->i_mmap;
 717        do {
 718                struct mm_struct *mm = mpnt->vm_mm;
 719                unsigned long start = mpnt->vm_start;
 720                unsigned long end = mpnt->vm_end;
 721                unsigned long len = end - start;
 722                unsigned long diff;
 723
 724                /* mapping wholly truncated? */
 725                if (mpnt->vm_offset >= offset) {
 726                        flush_cache_range(mm, start, end);
 727                        zap_page_range(mm, start, len);
 728                        flush_tlb_range(mm, start, end);
 729                        continue;
 730                }
 731                /* mapping wholly unaffected? */
 732                diff = offset - mpnt->vm_offset;
 733                if (diff >= len)
 734                        continue;
 735                /* Ok, partially affected.. */
 736                start += diff;
 737                len = (len - diff) & PAGE_MASK;
 738                if (start & ~PAGE_MASK) {
 739                        partial_clear(mpnt, start);
 740                        start = (start + ~PAGE_MASK) & PAGE_MASK;
 741                }
 742                flush_cache_range(mm, start, end);
 743                zap_page_range(mm, start, len);
 744                flush_tlb_range(mm, start, end);
 745        } while ((mpnt = mpnt->vm_next_share) != NULL);
 746}
 747
 748
 749static inline void do_swap_page(struct task_struct * tsk, 
 750        struct vm_area_struct * vma, unsigned long address,
 751        pte_t * page_table, pte_t entry, int write_access)
 752{
 753        pte_t page;
 754
 755        if (!vma->vm_ops || !vma->vm_ops->swapin) {
 756                swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 757                flush_page_to_ram(pte_page(*page_table));
 758                return;
 759        }
 760        page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 761        if (pte_val(*page_table) != pte_val(entry)) {
 762                free_page(pte_page(page));
 763                return;
 764        }
 765        if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
 766            !(vma->vm_flags & VM_SHARED))
 767                page = pte_wrprotect(page);
 768        ++vma->vm_mm->rss;
 769        ++tsk->maj_flt;
 770        flush_page_to_ram(pte_page(page));
 771        set_pte(page_table, page);
 772        return;
 773}
 774
 775/*
 776 * do_no_page() tries to create a new page mapping. It aggressively
 777 * tries to share with existing pages, but makes a separate copy if
 778 * the "write_access" parameter is true in order to avoid the next
 779 * page fault.
 780 *
 781 * As this is called only for pages that do not currently exist, we
 782 * do not need to flush old virtual caches or the TLB.
 783 */
 784static void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 785        unsigned long address, int write_access, pte_t *page_table, pte_t entry)
 786{
 787        unsigned long page;
 788
 789        if (!pte_none(entry))
 790                goto swap_page;
 791        address &= PAGE_MASK;
 792        if (!vma->vm_ops || !vma->vm_ops->nopage)
 793                goto anonymous_page;
 794        /*
 795         * The third argument is "no_share", which tells the low-level code
 796         * to copy, not share the page even if sharing is possible.  It's
 797         * essentially an early COW detection 
 798         */
 799        page = vma->vm_ops->nopage(vma, address, 
 800                (vma->vm_flags & VM_SHARED)?0:write_access);
 801        if (!page)
 802                goto sigbus;
 803        ++tsk->maj_flt;
 804        ++vma->vm_mm->rss;
 805        /*
 806         * This silly early PAGE_DIRTY setting removes a race
 807         * due to the bad i386 page protection. But it's valid
 808         * for other architectures too.
 809         *
 810         * Note that if write_access is true, we either now have
 811         * an exclusive copy of the page, or this is a shared mapping,
 812         * so we can make it writable and dirty to avoid having to
 813         * handle that later.
 814         */
 815        flush_page_to_ram(page);
 816        entry = mk_pte(page, vma->vm_page_prot);
 817        if (write_access) {
 818                entry = pte_mkwrite(pte_mkdirty(entry));
 819        } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
 820                   !(vma->vm_flags & VM_SHARED))
 821                entry = pte_wrprotect(entry);
 822        put_page(page_table, entry);
 823        /* no need to invalidate: a not-present page shouldn't be cached */
 824        return;
 825
 826anonymous_page:
 827        entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
 828        if (write_access) {
 829                unsigned long page = __get_free_page(GFP_KERNEL);
 830                if (!page)
 831                        goto sigbus;
 832                clear_page(page);
 833                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 834                vma->vm_mm->rss++;
 835                tsk->min_flt++;
 836                flush_page_to_ram(page);
 837        }
 838        put_page(page_table, entry);
 839        return;
 840
 841sigbus:
 842        force_sig(SIGBUS, current);
 843        put_page(page_table, BAD_PAGE);
 844        /* no need to invalidate, wasn't present */
 845        return;
 846
 847swap_page:
 848        do_swap_page(tsk, vma, address, page_table, entry, write_access);
 849        return;
 850}
 851
 852/*
 853 * These routines also need to handle stuff like marking pages dirty
 854 * and/or accessed for architectures that don't do it in hardware (most
 855 * RISC architectures).  The early dirtying is also good on the i386.
 856 *
 857 * There is also a hook called "update_mmu_cache()" that architectures
 858 * with external mmu caches can use to update those (ie the Sparc or
 859 * PowerPC hashed page tables that act as extended TLBs).
 860 */
 861static inline void handle_pte_fault(struct task_struct *tsk,
 862        struct vm_area_struct * vma, unsigned long address,
 863        int write_access, pte_t * pte)
 864{
 865        pte_t entry = *pte;
 866
 867        if (!pte_present(entry)) {
 868                do_no_page(tsk, vma, address, write_access, pte, entry);
 869                return;
 870        }
 871        set_pte(pte, pte_mkyoung(entry));
 872        flush_tlb_page(vma, address);
 873        if (!write_access)
 874                return;
 875        if (pte_write(entry)) {
 876                set_pte(pte, pte_mkdirty(entry));
 877                flush_tlb_page(vma, address);
 878                return;
 879        }
 880        do_wp_page(tsk, vma, address, write_access, pte);
 881}
 882
 883void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
 884        unsigned long address, int write_access)
 885{
 886        pgd_t *pgd;
 887        pmd_t *pmd;
 888        pte_t *pte;
 889
 890        pgd = pgd_offset(vma->vm_mm, address);
 891        pmd = pmd_alloc(pgd, address);
 892        if (!pmd)
 893                goto no_memory;
 894        pte = pte_alloc(pmd, address);
 895        if (!pte)
 896                goto no_memory;
 897        handle_pte_fault(tsk, vma, address, write_access, pte);
 898        update_mmu_cache(vma, address, *pte);
 899        return;
 900no_memory:
 901        oom(tsk);
 902}
 903
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.