1
2
3
4
5
6
7
8
9
10
11
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/fs.h>
15#include <linux/mm.h>
16#include <linux/highmem.h>
17#include <linux/pagemap.h>
18#include <linux/bio.h>
19#include <linux/wait.h>
20#include <linux/err.h>
21#include <linux/buffer_head.h>
22#include <linux/rwsem.h>
23#include <asm/atomic.h>
24
25
26
27
28
/*
 * Upper bound on the size of a BIO assembled by this file (presumably in
 * bytes: dio_new_bio() divides it by PAGE_SIZE to obtain the number of
 * bio_vec slots to allocate).
 */
#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE

/*
 * Capacity of the dio->pages[] array, and therefore the largest number of
 * user pages requested from get_user_pages() by one dio_refill_pages() call.
 * struct dio is declared on the stack in direct_io_worker(), so this value
 * also determines how much stack that array consumes.
 */
#define DIO_PAGES 64
36
37struct dio {
38
39 struct bio *bio;
40 struct bio_vec *bvec;
41 struct inode *inode;
42 int rw;
43 unsigned blkbits;
44 sector_t block_in_file;
45 unsigned blocks_available;
46 sector_t final_block_in_request;
47 unsigned first_block_in_page;
48 int boundary;
49 int reap_counter;
50 get_blocks_t *get_blocks;
51 sector_t last_block_in_bio;
52 sector_t next_block_in_bio;
53 struct buffer_head map_bh;
54
55
56 int curr_page;
57 int total_pages;
58 unsigned long curr_user_address;
59
60
61 struct page *pages[DIO_PAGES];
62 unsigned head;
63 unsigned tail;
64 int page_errors;
65
66
67 atomic_t bio_count;
68 spinlock_t bio_list_lock;
69 struct bio *bio_list;
70 struct task_struct *waiter;
71};
72
73
74
75
76static inline unsigned dio_pages_present(struct dio *dio)
77{
78 return dio->tail - dio->head;
79}
80
81
82
83
84static int dio_refill_pages(struct dio *dio)
85{
86 int ret;
87 int nr_pages;
88
89 nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
90 down_read(¤t->mm->mmap_sem);
91 ret = get_user_pages(
92 current,
93 current->mm,
94 dio->curr_user_address,
95 nr_pages,
96 dio->rw == READ,
97 0,
98 &dio->pages[0],
99 NULL);
100 up_read(¤t->mm->mmap_sem);
101
102 if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
103
104
105
106
107
108 if (dio->page_errors == 0)
109 dio->page_errors = ret;
110 dio->pages[0] = ZERO_PAGE(dio->cur_user_address);
111 dio->head = 0;
112 dio->tail = 1;
113 ret = 0;
114 goto out;
115 }
116
117 if (ret >= 0) {
118 dio->curr_user_address += ret * PAGE_SIZE;
119 dio->curr_page += ret;
120 dio->head = 0;
121 dio->tail = ret;
122 ret = 0;
123 }
124out:
125 return ret;
126}
127
128
129
130
131
132
133
134static struct page *dio_get_page(struct dio *dio)
135{
136 if (dio_pages_present(dio) == 0) {
137 int ret;
138
139 ret = dio_refill_pages(dio);
140 if (ret)
141 return ERR_PTR(ret);
142 BUG_ON(dio_pages_present(dio) == 0);
143 }
144 return dio->pages[dio->head++];
145}
146
147
148
149
150
151
152
153
154static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
155{
156 struct dio *dio = bio->bi_private;
157 unsigned long flags;
158
159 if (bio->bi_size)
160 return 1;
161
162 spin_lock_irqsave(&dio->bio_list_lock, flags);
163 bio->bi_private = dio->bio_list;
164 dio->bio_list = bio;
165 if (dio->waiter)
166 wake_up_process(dio->waiter);
167 spin_unlock_irqrestore(&dio->bio_list_lock, flags);
168 return 0;
169}
170
171static int
172dio_bio_alloc(struct dio *dio, struct block_device *bdev,
173 sector_t first_sector, int nr_vecs)
174{
175 struct bio *bio;
176
177 bio = bio_alloc(GFP_KERNEL, nr_vecs);
178 if (bio == NULL)
179 return -ENOMEM;
180
181 bio->bi_bdev = bdev;
182 bio->bi_vcnt = nr_vecs;
183 bio->bi_idx = 0;
184 bio->bi_size = 0;
185 bio->bi_sector = first_sector;
186 bio->bi_io_vec[0].bv_page = NULL;
187 bio->bi_end_io = dio_bio_end_io;
188
189 dio->bio = bio;
190 dio->bvec = NULL;
191 return 0;
192}
193
194static void dio_bio_submit(struct dio *dio)
195{
196 struct bio *bio = dio->bio;
197
198 bio->bi_vcnt = bio->bi_idx;
199 bio->bi_idx = 0;
200 bio->bi_private = dio;
201 atomic_inc(&dio->bio_count);
202 submit_bio(dio->rw, bio);
203
204 dio->bio = NULL;
205 dio->bvec = NULL;
206 dio->boundary = 0;
207}
208
209
210
211
/*
 * Drop the references on every page still sitting in the page queue.
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio) != 0) {
		struct page *unused = dio_get_page(dio);

		page_cache_release(unused);
	}
}
217
218
219
220
/*
 * Sleep until at least one completed BIO is on dio->bio_list, then unlink
 * the head of that list and return it.
 *
 * Completed BIOs are queued by dio_bio_end_io(), which also wakes
 * dio->waiter.  The list is linked through bio->bi_private.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&dio->bio_list_lock, flags);
	while (dio->bio_list == NULL) {
		/*
		 * Change task state before re-checking the list so that a
		 * wake_up_process() issued between the check and schedule()
		 * is not lost.
		 */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (dio->bio_list == NULL) {
			/* Tell dio_bio_end_io() whom to wake. */
			dio->waiter = current;
			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
			/* NOTE(review): presumably kicks queued I/O - confirm */
			blk_run_queues();
			schedule();
			spin_lock_irqsave(&dio->bio_list_lock, flags);
			dio->waiter = NULL;
		}
		set_current_state(TASK_RUNNING);
	}
	/* Pop the head; bi_private holds the next list element. */
	bio = dio->bio_list;
	dio->bio_list = bio->bi_private;
	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
	return bio;
}
244
245
246
247
248static int dio_bio_complete(struct dio *dio, struct bio *bio)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct bio_vec *bvec = bio->bi_io_vec;
252 int page_no;
253
254 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
255 struct page *page = bvec[page_no].bv_page;
256
257 if (dio->rw == READ)
258 set_page_dirty(page);
259 page_cache_release(page);
260 }
261 atomic_dec(&dio->bio_count);
262 bio_put(bio);
263 return uptodate ? 0 : -EIO;
264}
265
266
267
268
269static int dio_await_completion(struct dio *dio)
270{
271 int ret = 0;
272
273 if (dio->bio)
274 dio_bio_submit(dio);
275
276 while (atomic_read(&dio->bio_count)) {
277 struct bio *bio = dio_await_one(dio);
278 int ret2;
279
280 ret2 = dio_bio_complete(dio, bio);
281 if (ret == 0)
282 ret = ret2;
283 }
284 return ret;
285}
286
287
288
289
290
291
292
293
294static int dio_bio_reap(struct dio *dio)
295{
296 int ret = 0;
297
298 if (dio->reap_counter++ >= 64) {
299 while (dio->bio_list) {
300 unsigned long flags;
301 struct bio *bio;
302
303 spin_lock_irqsave(&dio->bio_list_lock, flags);
304 bio = dio->bio_list;
305 dio->bio_list = bio->bi_private;
306 spin_unlock_irqrestore(&dio->bio_list_lock, flags);
307 ret = dio_bio_complete(dio, bio);
308 }
309 dio->reap_counter = 0;
310 }
311 return ret;
312}
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
/*
 * Make sure a block mapping is available for dio->block_in_file.
 *
 * Does nothing while blocks from the previous mapping remain
 * (dio->blocks_available != 0).  Otherwise it calls the filesystem's
 * get_blocks callback for the rest of the current segment and stores the
 * result in dio->map_bh.
 *
 * Returns 0 on success, the deferred dio->page_errors value if a user-page
 * fault was recorded earlier, or the error returned by get_blocks.
 */
static int get_more_blocks(struct dio *dio)
{
	int ret;
	struct buffer_head *map_bh = &dio->map_bh;

	/* Still working through the previous mapping. */
	if (dio->blocks_available)
		return 0;

	/*
	 * A user-page fault was recorded by dio_refill_pages() and the
	 * mapping that was outstanding at the time has now been used up:
	 * report the error instead of mapping any more blocks.
	 */
	if (dio->page_errors) {
		ret = dio->page_errors;
		goto out;
	}

	map_bh->b_state = 0;
	map_bh->b_size = 0;
	BUG_ON(dio->block_in_file >= dio->final_block_in_request);
	/* The last argument asks for block creation only when writing. */
	ret = (*dio->get_blocks)(dio->inode, dio->block_in_file,
			dio->final_block_in_request - dio->block_in_file,
			map_bh, dio->rw == WRITE);
	if (ret)
		goto out;

	if (buffer_mapped(map_bh)) {
		/* The mapping must cover a non-zero, whole number of blocks. */
		BUG_ON(map_bh->b_size == 0);
		BUG_ON((map_bh->b_size & ((1 << dio->blkbits) - 1)) != 0);

		dio->blocks_available = map_bh->b_size >> dio->blkbits;

		/* For newly allocated blocks, call the helper on each one. */
		if (buffer_new(map_bh)) {
			sector_t block = map_bh->b_blocknr;
			unsigned i;

			for (i = 0; i < dio->blocks_available; i++)
				unmap_underlying_metadata(map_bh->b_bdev,
							block++);
		}
	} else {
		/* An unmapped result (a hole) is only legal for reads. */
		BUG_ON(dio->rw != READ);
		/* Flush whatever BIO was being built before the hole. */
		if (dio->bio)
			dio_bio_submit(dio);
	}
	dio->next_block_in_bio = map_bh->b_blocknr;
out:
	return ret;
}
387
388
389
390
391static void dio_prep_bio(struct dio *dio)
392{
393 if (dio->bio == NULL)
394 return;
395
396 if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
397 dio->boundary ||
398 dio->last_block_in_bio != dio->next_block_in_bio - 1)
399 dio_bio_submit(dio);
400}
401
402
403
404
405static int dio_new_bio(struct dio *dio)
406{
407 sector_t sector;
408 int ret;
409
410 ret = dio_bio_reap(dio);
411 if (ret)
412 goto out;
413 sector = dio->next_block_in_bio << (dio->blkbits - 9);
414 ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector,
415 DIO_BIO_MAX_SIZE / PAGE_SIZE);
416 dio->boundary = 0;
417out:
418 return ret;
419}
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
/*
 * Process one iovec segment: walk the user pages of the segment and, block
 * by block, attach them to BIOs aimed at the disk blocks reported by
 * get_more_blocks().
 *
 * The outer loop takes one user page per iteration; the inner loop consumes
 * that page in chunks of contiguous mapped blocks.  BIOs are submitted by
 * dio_prep_bio() / get_more_blocks() as they fill up or become
 * discontiguous; the last one may still be under construction on return.
 *
 * Returns 0 on success, or the first error from dio_get_page(),
 * get_more_blocks() or dio_new_bio().
 */
int do_direct_IO(struct dio *dio)
{
	const unsigned blkbits = dio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	int ret;

	/* The I/O can start at any block offset within the first page */
	block_in_page = dio->first_block_in_page;

	while (dio->block_in_file < dio->final_block_in_request) {
		int new_page;	/* Need to start a new bio_vec for this page */

		page = dio_get_page(dio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		new_page = 1;
		while (block_in_page < blocks_per_page) {
			struct bio *bio;
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			ret = get_more_blocks(dio);
			if (ret)
				goto fail_release;

			/* Unmapped block (a hole): zero it in the user page */
			if (!buffer_mapped(&dio->map_bh)) {
				char *kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + (block_in_page << blkbits),
						0, 1 << blkbits);
				flush_dcache_page(page);
				kunmap_atomic(kaddr, KM_USER0);
				dio->block_in_file++;
				dio->next_block_in_bio++;
				block_in_page++;
				goto next_block;
			}

			/* Submit the current BIO if it cannot take this block */
			dio_prep_bio(dio);
			if (dio->bio == NULL) {
				ret = dio_new_bio(dio);
				if (ret)
					goto fail_release;
				/* A fresh BIO needs a fresh bio_vec as well */
				new_page = 1;
			}

			bio = dio->bio;
			if (new_page) {
				/*
				 * Claim the next bio_vec slot for this page.
				 * The BIO takes its own page reference, which
				 * dio_bio_complete() releases.
				 */
				dio->bvec = &bio->bi_io_vec[bio->bi_idx];
				page_cache_get(page);
				dio->bvec->bv_page = page;
				dio->bvec->bv_len = 0;
				dio->bvec->bv_offset = block_in_page << blkbits;
				bio->bi_idx++;
				new_page = 0;
			}

			/*
			 * Work out how many blocks to add now: limited by the
			 * mapping, by the room left in this page, and by the
			 * end of the request.
			 */
			this_chunk_blocks = dio->blocks_available;
			u = (PAGE_SIZE - (dio->bvec->bv_offset + dio->bvec->bv_len)) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = dio->final_block_in_request - dio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			/* Account for the chunk in the bio_vec, BIO and dio */
			dio->bvec->bv_len += this_chunk_bytes;
			bio->bi_size += this_chunk_bytes;
			dio->next_block_in_bio += this_chunk_blocks;
			dio->last_block_in_bio = dio->next_block_in_bio - 1;
			dio->boundary = buffer_boundary(&dio->map_bh);
			dio->block_in_file += this_chunk_blocks;
			block_in_page += this_chunk_blocks;
			dio->blocks_available -= this_chunk_blocks;
next_block:
			if (dio->block_in_file > dio->final_block_in_request)
				BUG();
			if (dio->block_in_file == dio->final_block_in_request)
				break;
		}
		/* Later pages are consumed from their first block */
		block_in_page = 0;
		/* Drop the reference handed over by dio_get_page() */
		page_cache_release(page);
	}
	ret = 0;
	goto out;
fail_release:
	page_cache_release(page);
out:
	return ret;
}
533
534int
535direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
536 loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
537{
538 const unsigned blkbits = inode->i_blkbits;
539 unsigned long user_addr;
540 int seg, ret2, ret = 0;
541 struct dio dio;
542 size_t bytes, tot_bytes = 0;
543
544 dio.bio = NULL;
545 dio.bvec = NULL;
546 dio.inode = inode;
547 dio.rw = rw;
548 dio.blkbits = blkbits;
549 dio.block_in_file = offset >> blkbits;
550 dio.blocks_available = 0;
551
552 dio.boundary = 0;
553 dio.reap_counter = 0;
554 dio.get_blocks = get_blocks;
555 dio.last_block_in_bio = -1;
556 dio.next_block_in_bio = -1;
557
558 dio.page_errors = 0;
559
560
561 atomic_set(&dio.bio_count, 0);
562 spin_lock_init(&dio.bio_list_lock);
563 dio.bio_list = NULL;
564 dio.waiter = NULL;
565
566 for (seg = 0; seg < nr_segs; seg++) {
567 user_addr = (unsigned long)iov[seg].iov_base;
568 bytes = iov[seg].iov_len;
569
570
571 dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
572 dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
573
574 dio.head = 0;
575 dio.tail = 0;
576 dio.curr_page = 0;
577
578 dio.total_pages = 0;
579 if (user_addr & (PAGE_SIZE-1)) {
580 dio.total_pages++;
581 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
582 }
583 dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
584 dio.curr_user_address = user_addr;
585
586 ret = do_direct_IO(&dio);
587
588 if (ret) {
589 dio_cleanup(&dio);
590 break;
591 }
592
593 tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
594 dio.block_in_file) << blkbits);
595
596 }
597
598 ret2 = dio_await_completion(&dio);
599 if (ret == 0)
600 ret = ret2;
601 if (ret == 0)
602 ret = dio.page_errors;
603 if (ret == 0)
604 ret = tot_bytes;
605
606 return ret;
607}
608
609
610
611
612int
613generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
614 loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
615{
616 int seg;
617 size_t size;
618 unsigned long addr;
619 struct address_space *mapping = inode->i_mapping;
620 unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
621 ssize_t retval = -EINVAL;
622
623 if (offset & blocksize_mask) {
624 goto out;
625 }
626
627
628 for (seg = 0; seg < nr_segs; seg++) {
629 addr = (unsigned long)iov[seg].iov_base;
630 size = iov[seg].iov_len;
631 if ((addr & blocksize_mask) || (size & blocksize_mask))
632 goto out;
633 }
634
635 if (mapping->nrpages) {
636 retval = filemap_fdatawrite(mapping);
637 if (retval == 0)
638 retval = filemap_fdatawait(mapping);
639 if (retval)
640 goto out;
641 }
642
643 retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
644out:
645 return retval;
646}
647
648ssize_t
649generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
650 loff_t offset, unsigned long nr_segs)
651{
652 struct address_space *mapping = inode->i_mapping;
653 ssize_t retval;
654
655 retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
656 if (inode->i_mapping->nrpages)
657 invalidate_inode_pages2(inode->i_mapping);
658 return retval;
659}
660