linux-bk/mm/highmem.c
<<
>>
Prefs
   1/*
   2 * High memory handling common code and variables.
   3 *
   4 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
   5 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
   6 *
   7 *
   8 * Redesigned the x86 32-bit VM architecture to deal with
   9 * 64-bit physical space. With current x86 CPUs this
  10 * means up to 64 Gigabytes physical RAM.
  11 *
  12 * Rewrote high memory support to move the page cache into
  13 * high memory. Implemented permanent (schedulable) kmaps
  14 * based on Linus' idea.
  15 *
  16 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
  17 */
  18
  19#include <linux/mm.h>
  20#include <linux/bio.h>
  21#include <linux/pagemap.h>
  22#include <linux/mempool.h>
  23#include <linux/blkdev.h>
  24#include <linux/init.h>
  25#include <asm/pgalloc.h>
  26
  27static mempool_t *page_pool, *isa_page_pool;
  28
  29static void *page_pool_alloc(int gfp_mask, void *data)
  30{
  31        int gfp = gfp_mask | (int) (long) data;
  32
  33        return alloc_page(gfp);
  34}
  35
  36static void page_pool_free(void *page, void *data)
  37{
  38        __free_page(page);
  39}
  40
  41/*
  42 * Virtual_count is not a pure "count".
  43 *  0 means that it is not mapped, and has not been mapped
  44 *    since a TLB flush - it is usable.
  45 *  1 means that there are no users, but it has been mapped
  46 *    since the last TLB flush - so we can't use it.
  47 *  n means that there are (n-1) current users of it.
  48 */
  49#ifdef CONFIG_HIGHMEM
  50static int pkmap_count[LAST_PKMAP];
  51static unsigned int last_pkmap_nr;
  52static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
  53
  54pte_t * pkmap_page_table;
  55
  56static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
  57
  58static void flush_all_zero_pkmaps(void)
  59{
  60        int i;
  61
  62        flush_cache_all();
  63
  64        for (i = 0; i < LAST_PKMAP; i++) {
  65                struct page *page;
  66
  67                /*
  68                 * zero means we don't have anything to do,
  69                 * >1 means that it is still in use. Only
  70                 * a count of 1 means that it is free but
  71                 * needs to be unmapped
  72                 */
  73                if (pkmap_count[i] != 1)
  74                        continue;
  75                pkmap_count[i] = 0;
  76
  77                /* sanity check */
  78                if (pte_none(pkmap_page_table[i]))
  79                        BUG();
  80
  81                /*
  82                 * Don't need an atomic fetch-and-clear op here;
  83                 * no-one has the page mapped, and cannot get at
  84                 * its virtual address (and hence PTE) without first
  85                 * getting the kmap_lock (which is held here).
  86                 * So no dangers, even with speculative execution.
  87                 */
  88                page = pte_page(pkmap_page_table[i]);
  89                pte_clear(&pkmap_page_table[i]);
  90
  91                page->virtual = NULL;
  92        }
  93        flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
  94}
  95
  96static inline unsigned long map_new_virtual(struct page *page)
  97{
  98        unsigned long vaddr;
  99        int count;
 100
 101start:
 102        count = LAST_PKMAP;
 103        /* Find an empty entry */
 104        for (;;) {
 105                last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
 106                if (!last_pkmap_nr) {
 107                        flush_all_zero_pkmaps();
 108                        count = LAST_PKMAP;
 109                }
 110                if (!pkmap_count[last_pkmap_nr])
 111                        break;  /* Found a usable entry */
 112                if (--count)
 113                        continue;
 114
 115                /*
 116                 * Sleep for somebody else to unmap their entries
 117                 */
 118                {
 119                        DECLARE_WAITQUEUE(wait, current);
 120
 121                        current->state = TASK_UNINTERRUPTIBLE;
 122                        add_wait_queue(&pkmap_map_wait, &wait);
 123                        spin_unlock(&kmap_lock);
 124                        schedule();
 125                        remove_wait_queue(&pkmap_map_wait, &wait);
 126                        spin_lock(&kmap_lock);
 127
 128                        /* Somebody else might have mapped it while we slept */
 129                        if (page->virtual)
 130                                return (unsigned long) page->virtual;
 131
 132                        /* Re-start */
 133                        goto start;
 134                }
 135        }
 136        vaddr = PKMAP_ADDR(last_pkmap_nr);
 137        set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
 138
 139        pkmap_count[last_pkmap_nr] = 1;
 140        page->virtual = (void *) vaddr;
 141
 142        return vaddr;
 143}
 144
 145void *kmap_high(struct page *page)
 146{
 147        unsigned long vaddr;
 148
 149        /*
 150         * For highmem pages, we can't trust "virtual" until
 151         * after we have the lock.
 152         *
 153         * We cannot call this from interrupts, as it may block
 154         */
 155        spin_lock(&kmap_lock);
 156        vaddr = (unsigned long) page->virtual;
 157        if (!vaddr)
 158                vaddr = map_new_virtual(page);
 159        pkmap_count[PKMAP_NR(vaddr)]++;
 160        if (pkmap_count[PKMAP_NR(vaddr)] < 2)
 161                BUG();
 162        spin_unlock(&kmap_lock);
 163        return (void*) vaddr;
 164}
 165
 166void kunmap_high(struct page *page)
 167{
 168        unsigned long vaddr;
 169        unsigned long nr;
 170        int need_wakeup;
 171
 172        spin_lock(&kmap_lock);
 173        vaddr = (unsigned long) page->virtual;
 174        if (!vaddr)
 175                BUG();
 176        nr = PKMAP_NR(vaddr);
 177
 178        /*
 179         * A count must never go down to zero
 180         * without a TLB flush!
 181         */
 182        need_wakeup = 0;
 183        switch (--pkmap_count[nr]) {
 184        case 0:
 185                BUG();
 186        case 1:
 187                /*
 188                 * Avoid an unnecessary wake_up() function call.
 189                 * The common case is pkmap_count[] == 1, but
 190                 * no waiters.
 191                 * The tasks queued in the wait-queue are guarded
 192                 * by both the lock in the wait-queue-head and by
 193                 * the kmap_lock.  As the kmap_lock is held here,
 194                 * no need for the wait-queue-head's lock.  Simply
 195                 * test if the queue is empty.
 196                 */
 197                need_wakeup = waitqueue_active(&pkmap_map_wait);
 198        }
 199        spin_unlock(&kmap_lock);
 200
 201        /* do wake-up, if needed, race-free outside of the spin lock */
 202        if (need_wakeup)
 203                wake_up(&pkmap_map_wait);
 204}
 205
 206#define POOL_SIZE       64
 207
 208static __init int init_emergency_pool(void)
 209{
 210        struct sysinfo i;
 211        si_meminfo(&i);
 212        si_swapinfo(&i);
 213        
 214        if (!i.totalhigh)
 215                return 0;
 216
 217        page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
 218        if (!page_pool)
 219                BUG();
 220        printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
 221
 222        return 0;
 223}
 224
 225__initcall(init_emergency_pool);
 226
 227/*
 228 * highmem version, map in to vec
 229 */
 230static inline void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
 231{
 232        unsigned long flags;
 233        unsigned char *vto;
 234
 235        local_irq_save(flags);
 236        vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
 237        memcpy(vto + to->bv_offset, vfrom, to->bv_len);
 238        kunmap_atomic(vto, KM_BOUNCE_READ);
 239        local_irq_restore(flags);
 240}
 241
 242#else /* CONFIG_HIGHMEM */
 243
 244#define bounce_copy_vec(to, vfrom)      \
 245        memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
 246
 247#endif
 248
 249#define ISA_POOL_SIZE   16
 250
 251/*
 252 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
 253 * as the max address, so check if the pool has already been created.
 254 */
 255int init_emergency_isa_pool(void)
 256{
 257        if (isa_page_pool)
 258                return 0;
 259
 260        isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
 261        if (!isa_page_pool)
 262                BUG();
 263
 264        printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
 265        return 0;
 266}
 267
 268/*
 269 * Simple bounce buffer support for highmem pages. Depending on the
 270 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 271 * always, it will do the Right Thing
 272 */
 273static inline void copy_to_high_bio_irq(struct bio *to, struct bio *from)
 274{
 275        unsigned char *vfrom;
 276        struct bio_vec *tovec, *fromvec;
 277        int i;
 278
 279        __bio_for_each_segment(tovec, to, i, 0) {
 280                fromvec = from->bi_io_vec + i;
 281
 282                /*
 283                 * not bounced
 284                 */
 285                if (tovec->bv_page == fromvec->bv_page)
 286                        continue;
 287
 288                vfrom = page_address(fromvec->bv_page) + fromvec->bv_offset;
 289
 290                bounce_copy_vec(tovec, vfrom);
 291        }
 292}
 293
 294static void bounce_end_io(struct bio *bio, mempool_t *pool)
 295{
 296        struct bio *bio_orig = bio->bi_private;
 297        struct bio_vec *bvec, *org_vec;
 298        int i;
 299
 300        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 301                goto out_eio;
 302
 303        set_bit(BIO_UPTODATE, &bio_orig->bi_flags);
 304
 305        /*
 306         * free up bounce indirect pages used
 307         */
 308        __bio_for_each_segment(bvec, bio, i, 0) {
 309                org_vec = bio_orig->bi_io_vec + i;
 310                if (bvec->bv_page == org_vec->bv_page)
 311                        continue;
 312
 313                mempool_free(bvec->bv_page, pool);      
 314        }
 315
 316out_eio:
 317        bio_endio(bio_orig, bio_orig->bi_size, 0);
 318        bio_put(bio);
 319}
 320
 321static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
 322{
 323        if (bio->bi_size)
 324                return 1;
 325
 326        bounce_end_io(bio, page_pool);
 327        return 0;
 328}
 329
 330static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
 331{
 332        if (bio->bi_size)
 333                return 1;
 334
 335        bounce_end_io(bio, isa_page_pool);
 336        return 0;
 337}
 338
 339static inline void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
 340{
 341        struct bio *bio_orig = bio->bi_private;
 342
 343        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 344                copy_to_high_bio_irq(bio_orig, bio);
 345
 346        bounce_end_io(bio, pool);
 347}
 348
 349static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
 350{
 351        if (bio->bi_size)
 352                return 1;
 353
 354        __bounce_end_io_read(bio, page_pool);
 355        return 0;
 356}
 357
 358static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
 359{
 360        if (bio->bi_size)
 361                return 1;
 362
 363        __bounce_end_io_read(bio, isa_page_pool);
 364        return 0;
 365}
 366
 367void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 368{
 369        struct page *page;
 370        struct bio *bio = NULL;
 371        int i, rw = bio_data_dir(*bio_orig), bio_gfp;
 372        struct bio_vec *to, *from;
 373        mempool_t *pool;
 374        unsigned long pfn = q->bounce_pfn;
 375        int gfp = q->bounce_gfp;
 376
 377        BUG_ON((*bio_orig)->bi_idx);
 378
 379        /*
 380         * for non-isa bounce case, just check if the bounce pfn is equal
 381         * to or bigger than the highest pfn in the system -- in that case,
 382         * don't waste time iterating over bio segments
 383         */
 384        if (!(gfp & GFP_DMA)) {
 385                if (pfn >= blk_max_pfn)
 386                        return;
 387
 388                bio_gfp = GFP_NOHIGHIO;
 389                pool = page_pool;
 390        } else {
 391                BUG_ON(!isa_page_pool);
 392                bio_gfp = GFP_NOIO;
 393                pool = isa_page_pool;
 394        }
 395
 396        bio_for_each_segment(from, *bio_orig, i) {
 397                page = from->bv_page;
 398
 399                /*
 400                 * is destination page below bounce pfn?
 401                 */
 402                if ((page - page_zone(page)->zone_mem_map) + (page_zone(page)->zone_start_pfn) < pfn)
 403                        continue;
 404
 405                /*
 406                 * irk, bounce it
 407                 */
 408                if (!bio)
 409                        bio = bio_alloc(bio_gfp, (*bio_orig)->bi_vcnt);
 410
 411                to = bio->bi_io_vec + i;
 412
 413                to->bv_page = mempool_alloc(pool, gfp);
 414                to->bv_len = from->bv_len;
 415                to->bv_offset = from->bv_offset;
 416
 417                if (rw & WRITE) {
 418                        char *vto, *vfrom;
 419
 420                        vto = page_address(to->bv_page) + to->bv_offset;
 421                        vfrom = kmap(from->bv_page) + from->bv_offset;
 422                        memcpy(vto, vfrom, to->bv_len);
 423                        kunmap(from->bv_page);
 424                }
 425        }
 426
 427        /*
 428         * no pages bounced
 429         */
 430        if (!bio)
 431                return;
 432
 433        /*
 434         * at least one page was bounced, fill in possible non-highmem
 435         * pages
 436         */
 437        bio_for_each_segment(from, *bio_orig, i) {
 438                to = &bio->bi_io_vec[i];
 439                if (!to->bv_page) {
 440                        to->bv_page = from->bv_page;
 441                        to->bv_len = from->bv_len;
 442                        to->bv_offset = to->bv_offset;
 443                }
 444        }
 445
 446        bio->bi_bdev = (*bio_orig)->bi_bdev;
 447        bio->bi_sector = (*bio_orig)->bi_sector;
 448        bio->bi_rw = (*bio_orig)->bi_rw;
 449
 450        bio->bi_vcnt = (*bio_orig)->bi_vcnt;
 451        bio->bi_idx = 0;
 452        bio->bi_size = (*bio_orig)->bi_size;
 453
 454        if (pool == page_pool) {
 455                if (rw & WRITE)
 456                        bio->bi_end_io = bounce_end_io_write;
 457                else
 458                        bio->bi_end_io = bounce_end_io_read;
 459        } else {
 460                if (rw & WRITE)
 461                        bio->bi_end_io = bounce_end_io_write_isa;
 462                else
 463                        bio->bi_end_io = bounce_end_io_read_isa;
 464        }
 465
 466        bio->bi_private = *bio_orig;
 467        *bio_orig = bio;
 468}
 469
 470#if CONFIG_DEBUG_HIGHMEM
 471void check_highmem_ptes(void)
 472{
 473        int idx, type;
 474
 475        for (type = 0; type < KM_TYPE_NR; type++) {
 476                idx = type + KM_TYPE_NR*smp_processor_id();
 477                if (!pte_none(*(kmap_pte-idx))) {
 478                        printk("scheduling with KM_TYPE %d held!\n", type);
 479                        BUG();
 480                }
 481        }
 482}
 483#endif
 484
 485
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.