linux-bk/mm/readahead.c
<<
>>
Prefs
   1/*
   2 * mm/readahead.c - address_space-level file readahead.
   3 *
   4 * Copyright (C) 2002, Linus Torvalds
   5 *
   6 * 09Apr2002    akpm@zip.com.au
   7 *              Initial version.
   8 */
   9
  10#include <linux/kernel.h>
  11#include <linux/fs.h>
  12#include <linux/mm.h>
  13#include <linux/blkdev.h>
  14#include <linux/backing-dev.h>
  15#include <linux/pagevec.h>
  16
  17struct backing_dev_info default_backing_dev_info = {
  18        .ra_pages       = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
  19        .state          = 0,
  20};
  21
  22/*
  23 * Return max readahead size for this inode in number-of-pages.
  24 */
  25static inline unsigned long get_max_readahead(struct file *file)
  26{
  27        return file->f_ra.ra_pages;
  28}
  29
  30static inline unsigned long get_min_readahead(struct file *file)
  31{
  32        return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
  33}
  34
  35static int
  36read_pages(struct file *file, struct address_space *mapping,
  37                struct list_head *pages, unsigned nr_pages)
  38{
  39        unsigned page_idx;
  40        struct pagevec lru_pvec;
  41
  42        pagevec_init(&lru_pvec);
  43
  44        if (mapping->a_ops->readpages)
  45                return mapping->a_ops->readpages(mapping, pages, nr_pages);
  46
  47        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
  48                struct page *page = list_entry(pages->prev, struct page, list);
  49                list_del(&page->list);
  50                if (!add_to_page_cache(page, mapping, page->index)) {
  51                        mapping->a_ops->readpage(file, page);
  52                        if (!pagevec_add(&lru_pvec, page))
  53                                __pagevec_lru_add(&lru_pvec);
  54                } else {
  55                        page_cache_release(page);
  56                }
  57        }
  58        pagevec_lru_add(&lru_pvec);
  59        return 0;
  60}
  61
  62/*
  63 * Readahead design.
  64 *
  65 * The fields in struct file_ra_state represent the most-recently-executed
  66 * readahead attempt:
  67 *
  68 * start:       Page index at which we started the readahead
  69 * size:        Number of pages in that read
  70 *              Together, these form the "current window".
  71 *              Together, start and size represent the `readahead window'.
  72 * next_size:   The number of pages to read on the next readahead miss.
  73 *              Has the magical value -1UL if readahead has been disabled.
  74 * prev_page:   The page which the readahead algorithm most-recently inspected.
  75 *              prev_page is mainly an optimisation: if page_cache_readahead
  76 *              sees that it is again being called for a page which it just
  77 *              looked at, it can return immediately without making any state
  78 *              changes.
  79 * ahead_start,
  80 * ahead_size:  Together, these form the "ahead window".
  81 * ra_pages:    The externally controlled max readahead for this fd.
  82 *
  83 * The readahead code manages two windows - the "current" and the "ahead"
  84 * windows.  The intent is that while the application is walking the pages
  85 * in the current window, I/O is underway on the ahead window.  When the
  86 * current window is fully traversed, it is replaced by the ahead window
  87 * and the ahead window is invalidated.  When this copying happens, the
  88 * new current window's pages are probably still locked.  When I/O has
  89 * completed, we submit a new batch of I/O, creating a new ahead window.
  90 *
  91 * So:
  92 *
  93 *   ----|----------------|----------------|-----
  94 *       ^start           ^start+size
  95 *                        ^ahead_start     ^ahead_start+ahead_size
  96 *
  97 *         ^ When this page is read, we submit I/O for the
  98 *           ahead window.
  99 *
 100 * A `readahead hit' occurs when a read request is made against a page which is
 101 * inside the current window.  Hits are good, and the window size (next_size)
 102 * is grown aggressively when hits occur.  Two pages are added to the next
 103 * window size on each hit, which will end up doubling the next window size by
 104 * the time I/O is submitted for it.
 105 *
 106 * If readahead hits are more sparse (say, the application is only reading
 107 * every second page) then the window will build more slowly.
 108 *
 109 * On a readahead miss (the application seeked away) the readahead window is
 110 * shrunk by 25%.  We don't want to drop it too aggressively, because it is a
 111 * good assumption that an application which has built a good readahead window
 112 * will continue to perform linear reads.  Either at the new file position, or
 113 * at the old one after another seek.
 114 *
 115 * There is a special-case: if the first page which the application tries to
 116 * read happens to be the first page of the file, it is assumed that a linear
 117 * read is about to happen and the window is immediately set to half of the
 118 * device maximum.
 119 * 
 120 * A page request at (start + size) is not a miss at all - it's just a part of
 121 * sequential file reading.
 122 *
 123 * This function is to be called for every page which is read, rather than when
 124 * it is time to perform readahead.  This is so the readahead algorithm can
 125 * centrally work out the access patterns.  This could be costly with many tiny
 126 * read()s, so we specifically optimise for that case with prev_page.
 127 */
 128
 129/*
 130 * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 131 * the pages first, then submits them all for I/O. This avoids the very bad
 132 * behaviour which would occur if page allocations are causing VM writeback.
 133 * We really don't want to intermingle reads and writes like that.
 134 *
 135 * Returns the number of pages which actually had IO started against them.
 136 */
 137int do_page_cache_readahead(struct file *file,
 138                        unsigned long offset, unsigned long nr_to_read)
 139{
 140        struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 141        struct inode *inode = mapping->host;
 142        struct page *page;
 143        unsigned long end_index;        /* The last page we want to read */
 144        LIST_HEAD(page_pool);
 145        int page_idx;
 146        int ret = 0;
 147
 148        if (inode->i_size == 0)
 149                goto out;
 150
 151        end_index = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT);
 152
 153        /*
 154         * Preallocate as many pages as we will need.
 155         */
 156        read_lock(&mapping->page_lock);
 157        for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 158                unsigned long page_offset = offset + page_idx;
 159                
 160                if (page_offset > end_index)
 161                        break;
 162
 163                page = radix_tree_lookup(&mapping->page_tree, page_offset);
 164                if (page)
 165                        continue;
 166
 167                read_unlock(&mapping->page_lock);
 168                page = page_cache_alloc(mapping);
 169                read_lock(&mapping->page_lock);
 170                if (!page)
 171                        break;
 172                page->index = page_offset;
 173                list_add(&page->list, &page_pool);
 174                ret++;
 175        }
 176        read_unlock(&mapping->page_lock);
 177
 178        /*
 179         * Now start the IO.  We ignore I/O errors - if the page is not
 180         * uptodate then the caller will launch readpage again, and
 181         * will then handle the error.
 182         */
 183        if (ret) {
 184                read_pages(file, mapping, &page_pool, ret);
 185                blk_run_queues();
 186        }
 187        BUG_ON(!list_empty(&page_pool));
 188out:
 189        return ret;
 190}
 191
 192/*
 193 * Check how effective readahead is being.  If the amount of started IO is
 194 * less than expected then the file is partly or fully in pagecache and
 195 * readahead isn't helping.  Shrink the window.
 196 *
 197 * But don't shrink it too much - the application may read the same page
 198 * occasionally.
 199 */
 200static inline void
 201check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
 202                        pgoff_t actual, pgoff_t orig_next_size)
 203{
 204        if (actual == 0) {
 205                if (orig_next_size > 1) {
 206                        ra->next_size = orig_next_size - 1;
 207                        if (ra->ahead_size)
 208                                ra->ahead_size = ra->next_size;
 209                } else {
 210                        ra->next_size = -1UL;
 211                }
 212        }
 213}
 214
 215/*
 216 * page_cache_readahead is the main function.  If performs the adaptive
 217 * readahead window size management and submits the readahead I/O.
 218 */
 219void page_cache_readahead(struct file *file, unsigned long offset)
 220{
 221        struct file_ra_state *ra = &file->f_ra;
 222        unsigned max;
 223        unsigned min;
 224        unsigned orig_next_size;
 225        unsigned actual;
 226
 227        /*
 228         * Here we detect the case where the application is performing
 229         * sub-page sized reads.  We avoid doing extra work and bogusly
 230         * perturbing the readahead window expansion logic.
 231         * If next_size is zero, this is the very first read for this
 232         * file handle, or the window is maximally shrunk.
 233         */
 234        if (offset == ra->prev_page) {
 235                if (ra->next_size != 0)
 236                        goto out;
 237        }
 238
 239        if (ra->next_size == -1UL)
 240                goto out;       /* Maximally shrunk */
 241
 242        max = get_max_readahead(file);
 243        if (max == 0)
 244                goto out;       /* No readahead */
 245
 246        min = get_min_readahead(file);
 247        orig_next_size = ra->next_size;
 248
 249        if (ra->next_size == 0 && offset == 0) {
 250                /*
 251                 * Special case - first read from first page.
 252                 * We'll assume it's a whole-file read, and
 253                 * grow the window fast.
 254                 */
 255                ra->next_size = max / 2;
 256                goto do_io;
 257        }
 258
 259        ra->prev_page = offset;
 260
 261        if (offset >= ra->start && offset <= (ra->start + ra->size)) {
 262                /*
 263                 * A readahead hit.  Either inside the window, or one
 264                 * page beyond the end.  Expand the next readahead size.
 265                 */
 266                ra->next_size += 2;
 267        } else {
 268                /*
 269                 * A miss - lseek, pread, etc.  Shrink the readahead
 270                 * window by 25%.
 271                 */
 272                ra->next_size -= ra->next_size / 4;
 273        }
 274
 275        if (ra->next_size > max)
 276                ra->next_size = max;
 277        if (ra->next_size < min)
 278                ra->next_size = min;
 279
 280        /*
 281         * Is this request outside the current window?
 282         */
 283        if (offset < ra->start || offset >= (ra->start + ra->size)) {
 284                /*
 285                 * A miss against the current window.  Have we merely
 286                 * advanced into the ahead window?
 287                 */
 288                if (offset == ra->ahead_start) {
 289                        /*
 290                         * Yes, we have.  The ahead window now becomes
 291                         * the current window.
 292                         */
 293                        ra->start = ra->ahead_start;
 294                        ra->size = ra->ahead_size;
 295                        ra->prev_page = ra->start;
 296                        ra->ahead_start = 0;
 297                        ra->ahead_size = 0;
 298                        /*
 299                         * Control now returns, probably to sleep until I/O
 300                         * completes against the first ahead page.
 301                         * When the second page in the old ahead window is
 302                         * requested, control will return here and more I/O
 303                         * will be submitted to build the new ahead window.
 304                         */
 305                        goto out;
 306                }
 307do_io:
 308                /*
 309                 * This is the "unusual" path.  We come here during
 310                 * startup or after an lseek.  We invalidate the
 311                 * ahead window and get some I/O underway for the new
 312                 * current window.
 313                 */
 314                ra->start = offset;
 315                ra->size = ra->next_size;
 316                ra->ahead_start = 0;            /* Invalidate these */
 317                ra->ahead_size = 0;
 318
 319                actual = do_page_cache_readahead(file, offset, ra->size);
 320                check_ra_success(ra, ra->size, actual, orig_next_size);
 321        } else {
 322                /*
 323                 * This read request is within the current window.  It is time
 324                 * to submit I/O for the ahead window while the application is
 325                 * crunching through the current window.
 326                 */
 327                if (ra->ahead_start == 0) {
 328                        ra->ahead_start = ra->start + ra->size;
 329                        ra->ahead_size = ra->next_size;
 330                        actual = do_page_cache_readahead(file,
 331                                        ra->ahead_start, ra->ahead_size);
 332                        check_ra_success(ra, ra->ahead_size,
 333                                        actual, orig_next_size);
 334                }
 335        }
 336out:
 337        return;
 338}
 339
 340/*
 341 * For mmap reads (typically executables) the access pattern is fairly random,
 342 * but somewhat ascending.  So readaround favours pages beyond the target one.
 343 * We also boost the window size, as it can easily shrink due to misses.
 344 */
 345void page_cache_readaround(struct file *file, unsigned long offset)
 346{
 347        struct file_ra_state *ra = &file->f_ra;
 348
 349        if (ra->next_size != -1UL) {
 350                const unsigned long min = get_min_readahead(file) * 2;
 351                unsigned long target;
 352                unsigned long backward;
 353
 354                /*
 355                 * If next_size is zero then leave it alone, because that's a
 356                 * readahead startup state.
 357                 */
 358                if (ra->next_size && ra->next_size < min)
 359                        ra->next_size = min;
 360
 361                target = offset;
 362                backward = ra->next_size / 4;
 363
 364                if (backward > target)
 365                        target = 0;
 366                else
 367                        target -= backward;
 368                page_cache_readahead(file, target);
 369        }
 370}
 371
 372/*
 373 * handle_ra_miss() is called when it is known that a page which should have
 374 * been present in the pagecache (we just did some readahead there) was in fact
 375 * not found.  This will happen if it was evicted by the VM (readahead
 376 * thrashing) or if the readahead window is maximally shrunk.
 377 *
 378 * If the window has been maximally shrunk (next_size == 0) then bump it up
 379 * again to resume readahead.
 380 *
 381 * Otherwise we're thrashing, so shrink the readahead window by three pages.
 382 * This is because it is grown by two pages on a readahead hit.  Theory being
 383 * that the readahead window size will stabilise around the maximum level at
 384 * which there is no thrashing.
 385 */
 386void handle_ra_miss(struct file *file)
 387{
 388        struct file_ra_state *ra = &file->f_ra;
 389        const unsigned long min = get_min_readahead(file);
 390
 391        if (ra->next_size == -1UL) {
 392                ra->next_size = min;
 393        } else {
 394                ra->next_size -= 3;
 395                if (ra->next_size < min)
 396                        ra->next_size = min;
 397        }
 398}
 399
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.