linux-bk/mm/rmap.c
<<
>>
Prefs
   1/*
   2 * mm/rmap.c - physical to virtual reverse mappings
   3 *
   4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5 * Released under the General Public License (GPL).
   6 *
   7 *
   8 * Simple, low overhead pte-based reverse mapping scheme.
   9 * This is kept modular because we may want to experiment
  10 * with object-based reverse mapping schemes. Please try
  11 * to keep this thing as modular as possible.
  12 */
  13
  14/*
  15 * Locking:
  16 * - the page->pte.chain is protected by the PG_chainlock bit,
  17 *   which nests within the zone->lru_lock, then the
  18 *   mm->page_table_lock, and then the page lock.
  19 * - because swapout locking is opposite to the locking order
  20 *   in the page fault path, the swapout path uses trylocks
  21 *   on the mm->page_table_lock
  22 */
  23#include <linux/mm.h>
  24#include <linux/pagemap.h>
  25#include <linux/swapops.h>
  26#include <linux/slab.h>
  27#include <linux/init.h>
  28#include <linux/rmap-locking.h>
  29
  30#include <asm/pgalloc.h>
  31#include <asm/rmap.h>
  32#include <asm/tlb.h>
  33#include <asm/tlbflush.h>
  34
  35/* #define DEBUG_RMAP */
  36
  37/*
  38 * Shared pages have a chain of pte_chain structures, used to locate
  39 * all the mappings to this page. We only need a pointer to the pte
  40 * here, the page struct for the page table page contains the process
  41 * it belongs to and the offset within that process.
  42 *
  43 * We use an array of pte pointers in this structure to minimise cache misses
  44 * while traversing reverse maps.
  45 */
  46#define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t))
  47
  48struct pte_chain {
  49        struct pte_chain *next;
  50        pte_addr_t ptes[NRPTE];
  51};
  52
  53static kmem_cache_t     *pte_chain_cache;
  54
  55/*
  56 * pte_chain list management policy:
  57 *
  58 * - If a page has a pte_chain list then it is shared by at least two processes,
  59 *   because a single sharing uses PageDirect. (Well, this isn't true yet,
  60 *   coz this code doesn't collapse singletons back to PageDirect on the remove
  61 *   path).
  62 * - A pte_chain list has free space only in the head member - all succeeding
  63 *   members are 100% full.
  64 * - If the head element has free space, it occurs in its leading slots.
  65 * - All free space in the pte_chain is at the start of the head member.
  66 * - Insertion into the pte_chain puts a pte pointer in the last free slot of
  67 *   the head member.
  68 * - Removal from a pte chain moves the head pte of the head member onto the
  69 *   victim pte and frees the head member if it became empty.
  70 */
  71
  72/**
  73 * pte_chain_alloc - allocate a pte_chain struct
  74 *
  75 * Returns a pointer to a fresh pte_chain structure. Allocates new
  76 * pte_chain structures as required.
  77 * Caller needs to hold the page's pte_chain_lock.
  78 */
  79static inline struct pte_chain *pte_chain_alloc(void)
  80{
  81        struct pte_chain *ret;
  82
  83        ret = kmem_cache_alloc(pte_chain_cache, GFP_ATOMIC);
  84#ifdef DEBUG_RMAP
  85        {
  86                int i;
  87                for (i = 0; i < NRPTE; i++)
  88                        BUG_ON(ret->ptes[i]);
  89                BUG_ON(ret->next);
  90        }
  91#endif
  92        return ret;
  93}
  94
  95/**
  96 * pte_chain_free - free pte_chain structure
  97 * @pte_chain: pte_chain struct to free
  98 */
  99static inline void pte_chain_free(struct pte_chain *pte_chain)
 100{
 101        pte_chain->next = NULL;
 102        kmem_cache_free(pte_chain_cache, pte_chain);
 103}
 104
 105/**
 106 ** VM stuff below this comment
 107 **/
 108
 109/**
 110 * page_referenced - test if the page was referenced
 111 * @page: the page to test
 112 *
 113 * Quick test_and_clear_referenced for all mappings to a page,
 114 * returns the number of processes which referenced the page.
 115 * Caller needs to hold the pte_chain_lock.
 116 *
 117 * If the page has a single-entry pte_chain, collapse that back to a PageDirect
 118 * representation.  This way, it's only done under memory pressure.
 119 */
 120int page_referenced(struct page * page)
 121{
 122        struct pte_chain * pc;
 123        int referenced = 0;
 124
 125        if (TestClearPageReferenced(page))
 126                referenced++;
 127
 128        if (PageDirect(page)) {
 129                pte_t *pte = rmap_ptep_map(page->pte.direct);
 130                if (ptep_test_and_clear_young(pte))
 131                        referenced++;
 132                rmap_ptep_unmap(pte);
 133        } else {
 134                int nr_chains = 0;
 135
 136                /* Check all the page tables mapping this page. */
 137                for (pc = page->pte.chain; pc; pc = pc->next) {
 138                        int i;
 139
 140                        for (i = NRPTE-1; i >= 0; i--) {
 141                                pte_addr_t pte_paddr = pc->ptes[i];
 142                                pte_t *p;
 143
 144                                if (!pte_paddr)
 145                                        break;
 146                                p = rmap_ptep_map(pte_paddr);
 147                                if (ptep_test_and_clear_young(p))
 148                                        referenced++;
 149                                rmap_ptep_unmap(p);
 150                                nr_chains++;
 151                        }
 152                }
 153                if (nr_chains == 1) {
 154                        pc = page->pte.chain;
 155                        page->pte.direct = pc->ptes[NRPTE-1];
 156                        SetPageDirect(page);
 157                        pc->ptes[NRPTE-1] = 0;
 158                        pte_chain_free(pc);
 159                }
 160        }
 161        return referenced;
 162}
 163
 164/**
 165 * page_add_rmap - add reverse mapping entry to a page
 166 * @page: the page to add the mapping to
 167 * @ptep: the page table entry mapping this page
 168 *
 169 * Add a new pte reverse mapping to a page.
 170 * The caller needs to hold the mm->page_table_lock.
 171 */
 172void page_add_rmap(struct page * page, pte_t * ptep)
 173{
 174        pte_addr_t pte_paddr = ptep_to_paddr(ptep);
 175        struct pte_chain *pte_chain;
 176        int i;
 177
 178#ifdef DEBUG_RMAP
 179        if (!page || !ptep)
 180                BUG();
 181        if (!pte_present(*ptep))
 182                BUG();
 183        if (!ptep_to_mm(ptep))
 184                BUG();
 185#endif
 186
 187        if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 188                return;
 189
 190        pte_chain_lock(page);
 191
 192#ifdef DEBUG_RMAP
 193        /*
 194         * This stuff needs help to get up to highmem speed.
 195         */
 196        {
 197                struct pte_chain * pc;
 198                if (PageDirect(page)) {
 199                        if (page->pte.direct == pte_paddr)
 200                                BUG();
 201                } else {
 202                        for (pc = page->pte.chain; pc; pc = pc->next) {
 203                                for (i = 0; i < NRPTE; i++) {
 204                                        pte_addr_t p = pc->ptes[i];
 205
 206                                        if (p && p == pte_paddr)
 207                                                BUG();
 208                                }
 209                        }
 210                }
 211        }
 212#endif
 213
 214        if (page->pte.direct == 0) {
 215                page->pte.direct = pte_paddr;
 216                SetPageDirect(page);
 217                inc_page_state(nr_mapped);
 218                goto out;
 219        }
 220
 221        if (PageDirect(page)) {
 222                /* Convert a direct pointer into a pte_chain */
 223                ClearPageDirect(page);
 224                pte_chain = pte_chain_alloc();
 225                pte_chain->ptes[NRPTE-1] = page->pte.direct;
 226                pte_chain->ptes[NRPTE-2] = pte_paddr;
 227                page->pte.direct = 0;
 228                page->pte.chain = pte_chain;
 229                goto out;
 230        }
 231
 232        pte_chain = page->pte.chain;
 233        if (pte_chain->ptes[0]) {       /* It's full */
 234                struct pte_chain *new;
 235
 236                new = pte_chain_alloc();
 237                new->next = pte_chain;
 238                page->pte.chain = new;
 239                new->ptes[NRPTE-1] = pte_paddr;
 240                goto out;
 241        }
 242
 243        BUG_ON(!pte_chain->ptes[NRPTE-1]);
 244
 245        for (i = NRPTE-2; i >= 0; i--) {
 246                if (!pte_chain->ptes[i]) {
 247                        pte_chain->ptes[i] = pte_paddr;
 248                        goto out;
 249                }
 250        }
 251        BUG();
 252out:
 253        pte_chain_unlock(page);
 254        inc_page_state(nr_reverse_maps);
 255        return;
 256}
 257
 258/**
 259 * page_remove_rmap - take down reverse mapping to a page
 260 * @page: page to remove mapping from
 261 * @ptep: page table entry to remove
 262 *
 263 * Removes the reverse mapping from the pte_chain of the page,
 264 * after that the caller can clear the page table entry and free
 265 * the page.
 266 * Caller needs to hold the mm->page_table_lock.
 267 */
 268void page_remove_rmap(struct page * page, pte_t * ptep)
 269{
 270        pte_addr_t pte_paddr = ptep_to_paddr(ptep);
 271        struct pte_chain *pc;
 272
 273        if (!page || !ptep)
 274                BUG();
 275        if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 276                return;
 277
 278        pte_chain_lock(page);
 279
 280        BUG_ON(page->pte.direct == 0);
 281 
 282        if (PageDirect(page)) {
 283                if (page->pte.direct == pte_paddr) {
 284                        page->pte.direct = 0;
 285                        dec_page_state(nr_reverse_maps);
 286                        ClearPageDirect(page);
 287                        goto out;
 288                }
 289        } else {
 290                struct pte_chain *start = page->pte.chain;
 291                int victim_i = -1;
 292
 293                for (pc = start; pc; pc = pc->next) {
 294                        int i;
 295
 296                        if (pc->next)
 297                                prefetch(pc->next);
 298                        for (i = 0; i < NRPTE; i++) {
 299                                pte_addr_t pa = pc->ptes[i];
 300
 301                                if (!pa)
 302                                        continue;
 303                                if (victim_i == -1)
 304                                        victim_i = i;
 305                                if (pa != pte_paddr)
 306                                        continue;
 307                                pc->ptes[i] = start->ptes[victim_i];
 308                                dec_page_state(nr_reverse_maps);
 309                                start->ptes[victim_i] = 0;
 310                                if (victim_i == NRPTE-1) {
 311                                        /* Emptied a pte_chain */
 312                                        page->pte.chain = start->next;
 313                                        pte_chain_free(start);
 314                                } else {
 315                                        /* Do singleton->PageDirect here */
 316                                }
 317                                goto out;
 318                        }
 319                }
 320        }
 321#ifdef DEBUG_RMAP
 322        /* Not found. This should NEVER happen! */
 323        printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
 324        printk(KERN_ERR "page_remove_rmap: only found: ");
 325        if (PageDirect(page)) {
 326                printk("%llx", (u64)page->pte.direct);
 327        } else {
 328                for (pc = page->pte.chain; pc; pc = pc->next) {
 329                        int i;
 330                        for (i = 0; i < NRPTE; i++)
 331                                printk(" %d:%llx", i, (u64)pc->ptes[i]);
 332                }
 333        }
 334        printk("\n");
 335        printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
 336#endif
 337
 338out:
 339        pte_chain_unlock(page);
 340        if (!page_mapped(page))
 341                dec_page_state(nr_mapped);
 342        return;
 343}
 344
 345/**
 346 * try_to_unmap_one - worker function for try_to_unmap
 347 * @page: page to unmap
 348 * @ptep: page table entry to unmap from page
 349 *
 350 * Internal helper function for try_to_unmap, called for each page
 351 * table entry mapping a page. Because locking order here is opposite
 352 * to the locking order used by the page fault path, we use trylocks.
 353 * Locking:
 354 *      zone->lru_lock                  page_launder()
 355 *          page lock                   page_launder(), trylock
 356 *              pte_chain_lock          page_launder()
 357 *                  mm->page_table_lock try_to_unmap_one(), trylock
 358 */
 359static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
 360static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
 361{
 362        pte_t *ptep = rmap_ptep_map(paddr);
 363        unsigned long address = ptep_to_address(ptep);
 364        struct mm_struct * mm = ptep_to_mm(ptep);
 365        struct vm_area_struct * vma;
 366        pte_t pte;
 367        int ret;
 368
 369        if (!mm)
 370                BUG();
 371
 372        /*
 373         * We need the page_table_lock to protect us from page faults,
 374         * munmap, fork, etc...
 375         */
 376        if (!spin_trylock(&mm->page_table_lock)) {
 377                rmap_ptep_unmap(ptep);
 378                return SWAP_AGAIN;
 379        }
 380
 381
 382        /* During mremap, it's possible pages are not in a VMA. */
 383        vma = find_vma(mm, address);
 384        if (!vma) {
 385                ret = SWAP_FAIL;
 386                goto out_unlock;
 387        }
 388
 389        /* The page is mlock()d, we cannot swap it out. */
 390        if (vma->vm_flags & VM_LOCKED) {
 391                ret = SWAP_FAIL;
 392                goto out_unlock;
 393        }
 394
 395        /* Nuke the page table entry. */
 396        pte = ptep_get_and_clear(ptep);
 397        flush_tlb_page(vma, address);
 398        flush_cache_page(vma, address);
 399
 400        /* Store the swap location in the pte. See handle_pte_fault() ... */
 401        if (PageSwapCache(page)) {
 402                swp_entry_t entry = { .val = page->index };
 403                swap_duplicate(entry);
 404                set_pte(ptep, swp_entry_to_pte(entry));
 405        }
 406
 407        /* Move the dirty bit to the physical page now the pte is gone. */
 408        if (pte_dirty(pte))
 409                set_page_dirty(page);
 410
 411        mm->rss--;
 412        page_cache_release(page);
 413        ret = SWAP_SUCCESS;
 414
 415out_unlock:
 416        rmap_ptep_unmap(ptep);
 417        spin_unlock(&mm->page_table_lock);
 418        return ret;
 419}
 420
 421/**
 422 * try_to_unmap - try to remove all page table mappings to a page
 423 * @page: the page to get unmapped
 424 *
 425 * Tries to remove all the page table entries which are mapping this
 426 * page, used in the pageout path.  Caller must hold zone->lru_lock
 427 * and the page lock.  Return values are:
 428 *
 429 * SWAP_SUCCESS - we succeeded in removing all mappings
 430 * SWAP_AGAIN   - we missed a trylock, try again later
 431 * SWAP_FAIL    - the page is unswappable
 432 * SWAP_ERROR   - an error occurred
 433 */
 434int try_to_unmap(struct page * page)
 435{
 436        struct pte_chain *pc, *next_pc, *start;
 437        int ret = SWAP_SUCCESS;
 438        int victim_i = -1;
 439
 440        /* This page should not be on the pageout lists. */
 441        if (PageReserved(page))
 442                BUG();
 443        if (!PageLocked(page))
 444                BUG();
 445        /* We need backing store to swap out a page. */
 446        if (!page->mapping)
 447                BUG();
 448
 449        if (PageDirect(page)) {
 450                ret = try_to_unmap_one(page, page->pte.direct);
 451                if (ret == SWAP_SUCCESS) {
 452                        page->pte.direct = 0;
 453                        dec_page_state(nr_reverse_maps);
 454                        ClearPageDirect(page);
 455                }
 456                goto out;
 457        }               
 458
 459        start = page->pte.chain;
 460        for (pc = start; pc; pc = next_pc) {
 461                int i;
 462
 463                next_pc = pc->next;
 464                if (next_pc)
 465                        prefetch(next_pc);
 466                for (i = 0; i < NRPTE; i++) {
 467                        pte_addr_t pte_paddr = pc->ptes[i];
 468
 469                        if (!pte_paddr)
 470                                continue;
 471                        if (victim_i == -1) 
 472                                victim_i = i;
 473
 474                        switch (try_to_unmap_one(page, pte_paddr)) {
 475                        case SWAP_SUCCESS:
 476                                /*
 477                                 * Release a slot.  If we're releasing the
 478                                 * first pte in the first pte_chain then
 479                                 * pc->ptes[i] and start->ptes[victim_i] both
 480                                 * refer to the same thing.  It works out.
 481                                 */
 482                                pc->ptes[i] = start->ptes[victim_i];
 483                                start->ptes[victim_i] = 0;
 484                                dec_page_state(nr_reverse_maps);
 485                                victim_i++;
 486                                if (victim_i == NRPTE) {
 487                                        page->pte.chain = start->next;
 488                                        pte_chain_free(start);
 489                                        start = page->pte.chain;
 490                                        victim_i = 0;
 491                                }
 492                                break;
 493                        case SWAP_AGAIN:
 494                                /* Skip this pte, remembering status. */
 495                                ret = SWAP_AGAIN;
 496                                continue;
 497                        case SWAP_FAIL:
 498                                ret = SWAP_FAIL;
 499                                goto out;
 500                        case SWAP_ERROR:
 501                                ret = SWAP_ERROR;
 502                                goto out;
 503                        }
 504                }
 505        }
 506out:
 507        if (!page_mapped(page))
 508                dec_page_state(nr_mapped);
 509        return ret;
 510}
 511
 512/**
 513 ** No more VM stuff below this comment, only pte_chain helper
 514 ** functions.
 515 **/
 516
 517static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
 518{
 519        struct pte_chain *pc = p;
 520
 521        memset(pc, 0, sizeof(*pc));
 522}
 523
 524void __init pte_chain_init(void)
 525{
 526        pte_chain_cache = kmem_cache_create(    "pte_chain",
 527                                                sizeof(struct pte_chain),
 528                                                0,
 529                                                0,
 530                                                pte_chain_ctor,
 531                                                NULL);
 532
 533        if (!pte_chain_cache)
 534                panic("failed to create pte_chain cache!\n");
 535}
 536
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.