1/* 2 * mm/readahead.c - address_space-level file readahead. 3 * 4 * Copyright (C) 2002, Linus Torvalds 5 * 6 * 09Apr2002 akpm@zip.com.au 7 * Initial version. 8 */ 9 10#include <linux/kernel.h> 11#include <linux/fs.h> 12#include <linux/mm.h> 13#include <linux/blkdev.h> 14#include <linux/backing-dev.h> 15#include <linux/pagevec.h> 16 17struct backing_dev_info default_backing_dev_info = { 18 .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, 19 .state = 0, 20}; 21 22/* 23 * Return max readahead size for this inode in number-of-pages. 24 */ 25static inline unsigned long get_max_readahead(struct file *file) 26{ 27 return file->f_ra.ra_pages; 28} 29 30static inline unsigned long get_min_readahead(struct file *file) 31{ 32 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; 33} 34 35static int 36read_pages(struct file *file, struct address_space *mapping, 37 struct list_head *pages, unsigned nr_pages) 38{ 39 unsigned page_idx; 40 struct pagevec lru_pvec; 41 42 pagevec_init(&lru_pvec); 43 44 if (mapping->a_ops->readpages) 45 return mapping->a_ops->readpages(mapping, pages, nr_pages); 46 47 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 48 struct page *page = list_entry(pages->prev, struct page, list); 49 list_del(&page->list); 50 if (!add_to_page_cache(page, mapping, page->index)) { 51 mapping->a_ops->readpage(file, page); 52 if (!pagevec_add(&lru_pvec, page)) 53 __pagevec_lru_add(&lru_pvec); 54 } else { 55 page_cache_release(page); 56 } 57 } 58 pagevec_lru_add(&lru_pvec); 59 return 0; 60} 61 62/* 63 * Readahead design. 64 * 65 * The fields in struct file_ra_state represent the most-recently-executed 66 * readahead attempt: 67 * 68 * start: Page index at which we started the readahead 69 * size: Number of pages in that read 70 * Together, these form the "current window". 71 * Together, start and size represent the `readahead window'. 72 * next_size: The number of pages to read on the next readahead miss. 73 * Has the magical value -1UL if readahead has been disabled. 74 * prev_page: The page which the readahead algorithm most-recently inspected. 75 * prev_page is mainly an optimisation: if page_cache_readahead 76 * sees that it is again being called for a page which it just 77 * looked at, it can return immediately without making any state 78 * changes. 79 * ahead_start, 80 * ahead_size: Together, these form the "ahead window". 81 * ra_pages: The externally controlled max readahead for this fd. 82 * 83 * The readahead code manages two windows - the "current" and the "ahead" 84 * windows. The intent is that while the application is walking the pages 85 * in the current window, I/O is underway on the ahead window. When the 86 * current window is fully traversed, it is replaced by the ahead window 87 * and the ahead window is invalidated. When this copying happens, the 88 * new current window's pages are probably still locked. When I/O has 89 * completed, we submit a new batch of I/O, creating a new ahead window. 90 * 91 * So: 92 * 93 * ----|----------------|----------------|----- 94 * ^start ^start+size 95 * ^ahead_start ^ahead_start+ahead_size 96 * 97 * ^ When this page is read, we submit I/O for the 98 * ahead window. 99 * 100 * A `readahead hit' occurs when a read request is made against a page which is 101 * inside the current window. Hits are good, and the window size (next_size) 102 * is grown aggressively when hits occur. Two pages are added to the next 103 * window size on each hit, which will end up doubling the next window size by 104 * the time I/O is submitted for it. 105 * 106 * If readahead hits are more sparse (say, the application is only reading 107 * every second page) then the window will build more slowly. 108 * 109 * On a readahead miss (the application seeked away) the readahead window is 110 * shrunk by 25%. We don't want to drop it too aggressively, because it is a 111 * good assumption that an application which has built a good readahead window 112 * will continue to perform linear reads. Either at the new file position, or 113 * at the old one after another seek. 114 * 115 * There is a special-case: if the first page which the application tries to 116 * read happens to be the first page of the file, it is assumed that a linear 117 * read is about to happen and the window is immediately set to half of the 118 * device maximum. 119 * 120 * A page request at (start + size) is not a miss at all - it's just a part of 121 * sequential file reading. 122 * 123 * This function is to be called for every page which is read, rather than when 124 * it is time to perform readahead. This is so the readahead algorithm can 125 * centrally work out the access patterns. This could be costly with many tiny 126 * read()s, so we specifically optimise for that case with prev_page. 127 */ 128 129/* 130 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 131 * the pages first, then submits them all for I/O. This avoids the very bad 132 * behaviour which would occur if page allocations are causing VM writeback. 133 * We really don't want to intermingle reads and writes like that. 134 * 135 * Returns the number of pages which actually had IO started against them. 136 */ 137int do_page_cache_readahead(struct file *file, 138 unsigned long offset, unsigned long nr_to_read) 139{ 140 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 141 struct inode *inode = mapping->host; 142 struct page *page; 143 unsigned long end_index; /* The last page we want to read */ 144 LIST_HEAD(page_pool); 145 int page_idx; 146 int ret = 0; 147 148 if (inode->i_size == 0) 149 goto out; 150 151 end_index = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT); 152 153 /* 154 * Preallocate as many pages as we will need. 155 */ 156 read_lock(&mapping->page_lock); 157 for (page_idx = 0; page_idx < nr_to_read; page_idx++) { 158 unsigned long page_offset = offset + page_idx; 159 160 if (page_offset > end_index) 161 break; 162 163 page = radix_tree_lookup(&mapping->page_tree, page_offset); 164 if (page) 165 continue; 166 167 read_unlock(&mapping->page_lock); 168 page = page_cache_alloc(mapping); 169 read_lock(&mapping->page_lock); 170 if (!page) 171 break; 172 page->index = page_offset; 173 list_add(&page->list, &page_pool); 174 ret++; 175 } 176 read_unlock(&mapping->page_lock); 177 178 /* 179 * Now start the IO. We ignore I/O errors - if the page is not 180 * uptodate then the caller will launch readpage again, and 181 * will then handle the error. 182 */ 183 if (ret) { 184 read_pages(file, mapping, &page_pool, ret); 185 blk_run_queues(); 186 } 187 BUG_ON(!list_empty(&page_pool)); 188out: 189 return ret; 190} 191 192/* 193 * Check how effective readahead is being. If the amount of started IO is 194 * less than expected then the file is partly or fully in pagecache and 195 * readahead isn't helping. Shrink the window. 196 * 197 * But don't shrink it too much - the application may read the same page 198 * occasionally. 199 */ 200static inline void 201check_ra_success(struct file_ra_state *ra, pgoff_t attempt, 202 pgoff_t actual, pgoff_t orig_next_size) 203{ 204 if (actual == 0) { 205 if (orig_next_size > 1) { 206 ra->next_size = orig_next_size - 1; 207 if (ra->ahead_size) 208 ra->ahead_size = ra->next_size; 209 } else { 210 ra->next_size = -1UL; 211 } 212 } 213} 214 215/* 216 * page_cache_readahead is the main function. If performs the adaptive 217 * readahead window size management and submits the readahead I/O. 218 */ 219void page_cache_readahead(struct file *file, unsigned long offset) 220{ 221 struct file_ra_state *ra = &file->f_ra; 222 unsigned max; 223 unsigned min; 224 unsigned orig_next_size; 225 unsigned actual; 226 227 /* 228 * Here we detect the case where the application is performing 229 * sub-page sized reads. We avoid doing extra work and bogusly 230 * perturbing the readahead window expansion logic. 231 * If next_size is zero, this is the very first read for this 232 * file handle, or the window is maximally shrunk. 233 */ 234 if (offset == ra->prev_page) { 235 if (ra->next_size != 0) 236 goto out; 237 } 238 239 if (ra->next_size == -1UL) 240 goto out; /* Maximally shrunk */ 241 242 max = get_max_readahead(file); 243 if (max == 0) 244 goto out; /* No readahead */ 245 246 min = get_min_readahead(file); 247 orig_next_size = ra->next_size; 248 249 if (ra->next_size == 0 && offset == 0) { 250 /* 251 * Special case - first read from first page. 252 * We'll assume it's a whole-file read, and 253 * grow the window fast. 254 */ 255 ra->next_size = max / 2; 256 goto do_io; 257 } 258 259 ra->prev_page = offset; 260 261 if (offset >= ra->start && offset <= (ra->start + ra->size)) { 262 /* 263 * A readahead hit. Either inside the window, or one 264 * page beyond the end. Expand the next readahead size. 265 */ 266 ra->next_size += 2; 267 } else { 268 /* 269 * A miss - lseek, pread, etc. Shrink the readahead 270 * window by 25%. 271 */ 272 ra->next_size -= ra->next_size / 4; 273 } 274 275 if (ra->next_size > max) 276 ra->next_size = max; 277 if (ra->next_size < min) 278 ra->next_size = min; 279 280 /* 281 * Is this request outside the current window? 282 */ 283 if (offset < ra->start || offset >= (ra->start + ra->size)) { 284 /* 285 * A miss against the current window. Have we merely 286 * advanced into the ahead window? 287 */ 288 if (offset == ra->ahead_start) { 289 /* 290 * Yes, we have. The ahead window now becomes 291 * the current window. 292 */ 293 ra->start = ra->ahead_start; 294 ra->size = ra->ahead_size; 295 ra->prev_page = ra->start; 296 ra->ahead_start = 0; 297 ra->ahead_size = 0; 298 /* 299 * Control now returns, probably to sleep until I/O 300 * completes against the first ahead page. 301 * When the second page in the old ahead window is 302 * requested, control will return here and more I/O 303 * will be submitted to build the new ahead window. 304 */ 305 goto out; 306 } 307do_io: 308 /* 309 * This is the "unusual" path. We come here during 310 * startup or after an lseek. We invalidate the 311 * ahead window and get some I/O underway for the new 312 * current window. 313 */ 314 ra->start = offset; 315 ra->size = ra->next_size; 316 ra->ahead_start = 0; /* Invalidate these */ 317 ra->ahead_size = 0; 318 319 actual = do_page_cache_readahead(file, offset, ra->size); 320 check_ra_success(ra, ra->size, actual, orig_next_size); 321 } else { 322 /* 323 * This read request is within the current window. It is time 324 * to submit I/O for the ahead window while the application is 325 * crunching through the current window. 326 */ 327 if (ra->ahead_start == 0) { 328 ra->ahead_start = ra->start + ra->size; 329 ra->ahead_size = ra->next_size; 330 actual = do_page_cache_readahead(file, 331 ra->ahead_start, ra->ahead_size); 332 check_ra_success(ra, ra->ahead_size, 333 actual, orig_next_size); 334 } 335 } 336out: 337 return; 338} 339 340/* 341 * For mmap reads (typically executables) the access pattern is fairly random, 342 * but somewhat ascending. So readaround favours pages beyond the target one. 343 * We also boost the window size, as it can easily shrink due to misses. 344 */ 345void page_cache_readaround(struct file *file, unsigned long offset) 346{ 347 struct file_ra_state *ra = &file->f_ra; 348 349 if (ra->next_size != -1UL) { 350 const unsigned long min = get_min_readahead(file) * 2; 351 unsigned long target; 352 unsigned long backward; 353 354 /* 355 * If next_size is zero then leave it alone, because that's a 356 * readahead startup state. 357 */ 358 if (ra->next_size && ra->next_size < min) 359 ra->next_size = min; 360 361 target = offset; 362 backward = ra->next_size / 4; 363 364 if (backward > target) 365 target = 0; 366 else 367 target -= backward; 368 page_cache_readahead(file, target); 369 } 370} 371 372/* 373 * handle_ra_miss() is called when it is known that a page which should have 374 * been present in the pagecache (we just did some readahead there) was in fact 375 * not found. This will happen if it was evicted by the VM (readahead 376 * thrashing) or if the readahead window is maximally shrunk. 377 * 378 * If the window has been maximally shrunk (next_size == 0) then bump it up 379 * again to resume readahead. 380 * 381 * Otherwise we're thrashing, so shrink the readahead window by three pages. 382 * This is because it is grown by two pages on a readahead hit. Theory being 383 * that the readahead window size will stabilise around the maximum level at 384 * which there is no thrashing. 385 */ 386void handle_ra_miss(struct file *file) 387{ 388 struct file_ra_state *ra = &file->f_ra; 389 const unsigned long min = get_min_readahead(file); 390 391 if (ra->next_size == -1UL) { 392 ra->next_size = min; 393 } else { 394 ra->next_size -= 3; 395 if (ra->next_size < min) 396 ra->next_size = min; 397 } 398} 399

