1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/fs.h>
26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/pagemap.h>
30#include <linux/bio.h>
31#include <linux/wait.h>
32#include <linux/err.h>
33#include <linux/blkdev.h>
34#include <linux/buffer_head.h>
35#include <linux/rwsem.h>
36#include <linux/uio.h>
37#include <asm/atomic.h>
38
39
40
41
42
43#define DIO_PAGES 64
44
45
46
47
48
49
50
51
52
53
54
55
56
57struct dio {
58
59 struct bio *bio;
60 struct inode *inode;
61 int rw;
62 unsigned blkbits;
63 unsigned blkfactor;
64
65
66
67
68 unsigned start_zero_done;
69
70
71 int pages_in_io;
72 sector_t block_in_file;
73
74 unsigned blocks_available;
75 sector_t final_block_in_request;
76 unsigned first_block_in_page;
77 int boundary;
78 int reap_counter;
79 get_blocks_t *get_blocks;
80 dio_iodone_t *end_io;
81 sector_t final_block_in_bio;
82 sector_t next_block_for_io;
83
84 struct buffer_head map_bh;
85
86
87
88
89
90
91 struct page *cur_page;
92 unsigned cur_page_offset;
93 unsigned cur_page_len;
94 sector_t cur_page_block;
95
96
97
98
99 int curr_page;
100 int total_pages;
101 unsigned long curr_user_address;
102
103
104
105
106
107 struct page *pages[DIO_PAGES];
108 unsigned head;
109 unsigned tail;
110 int page_errors;
111
112
113 atomic_t bio_count;
114 atomic_t bios_in_flight;
115 spinlock_t bio_list_lock;
116 struct bio *bio_list;
117 struct task_struct *waiter;
118
119
120 struct kiocb *iocb;
121 int is_async;
122 int result;
123};
124
125
126
127
128static inline unsigned dio_pages_present(struct dio *dio)
129{
130 return dio->tail - dio->head;
131}
132
133
134
135
136static int dio_refill_pages(struct dio *dio)
137{
138 int ret;
139 int nr_pages;
140
141 nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
142 down_read(¤t->mm->mmap_sem);
143 ret = get_user_pages(
144 current,
145 current->mm,
146 dio->curr_user_address,
147 nr_pages,
148 dio->rw == READ,
149 0,
150 &dio->pages[0],
151 NULL);
152 up_read(¤t->mm->mmap_sem);
153
154 if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
155
156
157
158
159
160 if (dio->page_errors == 0)
161 dio->page_errors = ret;
162 dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
163 dio->head = 0;
164 dio->tail = 1;
165 ret = 0;
166 goto out;
167 }
168
169 if (ret >= 0) {
170 dio->curr_user_address += ret * PAGE_SIZE;
171 dio->curr_page += ret;
172 dio->head = 0;
173 dio->tail = ret;
174 ret = 0;
175 }
176out:
177 return ret;
178}
179
180
181
182
183
184
185
186static struct page *dio_get_page(struct dio *dio)
187{
188 if (dio_pages_present(dio) == 0) {
189 int ret;
190
191 ret = dio_refill_pages(dio);
192 if (ret)
193 return ERR_PTR(ret);
194 BUG_ON(dio_pages_present(dio) == 0);
195 }
196 return dio->pages[dio->head++];
197}
198
199
200
201
202
203
204
205static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
206{
207 if (dio->end_io)
208 dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
209}
210
211
212
213
214
215static void finished_one_bio(struct dio *dio)
216{
217 if (atomic_dec_and_test(&dio->bio_count)) {
218 if (dio->is_async) {
219 dio_complete(dio, dio->block_in_file << dio->blkbits,
220 dio->result);
221 aio_complete(dio->iocb, dio->result, 0);
222 kfree(dio);
223 }
224 }
225}
226
227static int dio_bio_complete(struct dio *dio, struct bio *bio);
228
229
230
231static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
232{
233 struct dio *dio = bio->bi_private;
234
235 if (bio->bi_size)
236 return 1;
237
238
239 dio_bio_complete(dio, bio);
240 return 0;
241}
242
243
244
245
246
247
248
249
250static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
251{
252 struct dio *dio = bio->bi_private;
253 unsigned long flags;
254
255 if (bio->bi_size)
256 return 1;
257
258 spin_lock_irqsave(&dio->bio_list_lock, flags);
259 bio->bi_private = dio->bio_list;
260 dio->bio_list = bio;
261 atomic_dec(&dio->bios_in_flight);
262 if (dio->waiter && atomic_read(&dio->bios_in_flight) == 0)
263 wake_up_process(dio->waiter);
264 spin_unlock_irqrestore(&dio->bio_list_lock, flags);
265 return 0;
266}
267
268static int
269dio_bio_alloc(struct dio *dio, struct block_device *bdev,
270 sector_t first_sector, int nr_vecs)
271{
272 struct bio *bio;
273
274 bio = bio_alloc(GFP_KERNEL, nr_vecs);
275 if (bio == NULL)
276 return -ENOMEM;
277
278 bio->bi_bdev = bdev;
279 bio->bi_sector = first_sector;
280 if (dio->is_async)
281 bio->bi_end_io = dio_bio_end_aio;
282 else
283 bio->bi_end_io = dio_bio_end_io;
284
285 dio->bio = bio;
286 return 0;
287}
288
289
290
291
292
293
294static void dio_bio_submit(struct dio *dio)
295{
296 struct bio *bio = dio->bio;
297
298 bio->bi_private = dio;
299 atomic_inc(&dio->bio_count);
300 atomic_inc(&dio->bios_in_flight);
301 if (dio->is_async && dio->rw == READ)
302 bio_set_pages_dirty(bio);
303 submit_bio(dio->rw, bio);
304
305 dio->bio = NULL;
306 dio->boundary = 0;
307}
308
309
310
311
/*
 * Release every pinned page that is still sitting unused in the page queue.
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio) != 0) {
		struct page *unused = dio_get_page(dio);

		page_cache_release(unused);
	}
}
317
318
319
320
321static struct bio *dio_await_one(struct dio *dio)
322{
323 unsigned long flags;
324 struct bio *bio;
325
326 spin_lock_irqsave(&dio->bio_list_lock, flags);
327 while (dio->bio_list == NULL) {
328 set_current_state(TASK_UNINTERRUPTIBLE);
329 if (dio->bio_list == NULL) {
330 dio->waiter = current;
331 spin_unlock_irqrestore(&dio->bio_list_lock, flags);
332 blk_run_queues();
333 io_schedule();
334 spin_lock_irqsave(&dio->bio_list_lock, flags);
335 dio->waiter = NULL;
336 }
337 set_current_state(TASK_RUNNING);
338 }
339 bio = dio->bio_list;
340 dio->bio_list = bio->bi_private;
341 spin_unlock_irqrestore(&dio->bio_list_lock, flags);
342 return bio;
343}
344
345
346
347
348static int dio_bio_complete(struct dio *dio, struct bio *bio)
349{
350 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
351 struct bio_vec *bvec = bio->bi_io_vec;
352 int page_no;
353
354 if (!uptodate)
355 dio->result = -EIO;
356
357 if (dio->is_async && dio->rw == READ) {
358 bio_check_pages_dirty(bio);
359 } else {
360 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
361 struct page *page = bvec[page_no].bv_page;
362
363 if (dio->rw == READ)
364 set_page_dirty_lock(page);
365 page_cache_release(page);
366 }
367 bio_put(bio);
368 }
369 finished_one_bio(dio);
370 return uptodate ? 0 : -EIO;
371}
372
373
374
375
376static int dio_await_completion(struct dio *dio)
377{
378 int ret = 0;
379
380 if (dio->bio)
381 dio_bio_submit(dio);
382
383 while (atomic_read(&dio->bio_count)) {
384 struct bio *bio = dio_await_one(dio);
385 int ret2;
386
387 ret2 = dio_bio_complete(dio, bio);
388 if (ret == 0)
389 ret = ret2;
390 }
391 return ret;
392}
393
394
395
396
397
398
399
400
401static int dio_bio_reap(struct dio *dio)
402{
403 int ret = 0;
404
405 if (dio->reap_counter++ >= 64) {
406 while (dio->bio_list) {
407 unsigned long flags;
408 struct bio *bio;
409
410 spin_lock_irqsave(&dio->bio_list_lock, flags);
411 bio = dio->bio_list;
412 dio->bio_list = bio->bi_private;
413 spin_unlock_irqrestore(&dio->bio_list_lock, flags);
414 ret = dio_bio_complete(dio, bio);
415 }
416 dio->reap_counter = 0;
417 }
418 return ret;
419}
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444static int get_more_blocks(struct dio *dio)
445{
446 int ret;
447 struct buffer_head *map_bh = &dio->map_bh;
448 sector_t fs_startblk;
449 unsigned long fs_count;
450 unsigned long dio_count;
451 unsigned long blkmask;
452
453
454
455
456
457 ret = dio->page_errors;
458 if (ret == 0) {
459 map_bh->b_state = 0;
460 map_bh->b_size = 0;
461 BUG_ON(dio->block_in_file >= dio->final_block_in_request);
462 fs_startblk = dio->block_in_file >> dio->blkfactor;
463 dio_count = dio->final_block_in_request - dio->block_in_file;
464 fs_count = dio_count >> dio->blkfactor;
465 blkmask = (1 << dio->blkfactor) - 1;
466 if (dio_count & blkmask)
467 fs_count++;
468
469 ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
470 map_bh, dio->rw == WRITE);
471 }
472 return ret;
473}
474
475
476
477
478static int dio_new_bio(struct dio *dio, sector_t start_sector)
479{
480 sector_t sector;
481 int ret, nr_pages;
482
483 ret = dio_bio_reap(dio);
484 if (ret)
485 goto out;
486 sector = start_sector << (dio->blkbits - 9);
487 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
488 BUG_ON(nr_pages <= 0);
489 ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
490 dio->boundary = 0;
491out:
492 return ret;
493}
494
495
496
497
498
499
500
501
502static int dio_bio_add_page(struct dio *dio)
503{
504 int ret;
505
506 ret = bio_add_page(dio->bio, dio->cur_page,
507 dio->cur_page_len, dio->cur_page_offset);
508 if (ret == dio->cur_page_len) {
509 dio->pages_in_io--;
510 page_cache_get(dio->cur_page);
511 dio->final_block_in_bio = dio->cur_page_block +
512 (dio->cur_page_len >> dio->blkbits);
513 ret = 0;
514 } else {
515 ret = 1;
516 }
517 return ret;
518}
519
520
521
522
523
524
525
526
527
528
529
530static int dio_send_cur_page(struct dio *dio)
531{
532 int ret = 0;
533
534 if (dio->bio) {
535
536
537
538 if (dio->final_block_in_bio != dio->cur_page_block)
539 dio_bio_submit(dio);
540
541
542
543
544 if (dio->boundary)
545 dio_bio_submit(dio);
546 }
547
548 if (dio->bio == NULL) {
549 ret = dio_new_bio(dio, dio->cur_page_block);
550 if (ret)
551 goto out;
552 }
553
554 if (dio_bio_add_page(dio) != 0) {
555 dio_bio_submit(dio);
556 ret = dio_new_bio(dio, dio->cur_page_block);
557 if (ret == 0) {
558 ret = dio_bio_add_page(dio);
559 BUG_ON(ret != 0);
560 }
561 }
562out:
563 return ret;
564}
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583static int
584submit_page_section(struct dio *dio, struct page *page,
585 unsigned offset, unsigned len, sector_t blocknr)
586{
587 int ret = 0;
588
589
590
591
592 if ( (dio->cur_page == page) &&
593 (dio->cur_page_offset + dio->cur_page_len == offset) &&
594 (dio->cur_page_block +
595 (dio->cur_page_len >> dio->blkbits) == blocknr)) {
596 dio->cur_page_len += len;
597
598
599
600
601
602 if (dio->boundary) {
603 ret = dio_send_cur_page(dio);
604 page_cache_release(dio->cur_page);
605 dio->cur_page = NULL;
606 }
607 goto out;
608 }
609
610
611
612
613 if (dio->cur_page) {
614 ret = dio_send_cur_page(dio);
615 page_cache_release(dio->cur_page);
616 dio->cur_page = NULL;
617 if (ret)
618 goto out;
619 }
620
621 page_cache_get(page);
622 dio->cur_page = page;
623 dio->cur_page_offset = offset;
624 dio->cur_page_len = len;
625 dio->cur_page_block = blocknr;
626out:
627 return ret;
628}
629
630
631
632
633
634
635static void clean_blockdev_aliases(struct dio *dio)
636{
637 unsigned i;
638
639 for (i = 0; i < dio->blocks_available; i++) {
640 unmap_underlying_metadata(dio->map_bh.b_bdev,
641 dio->map_bh.b_blocknr + i);
642 }
643}
644
645
646
647
648
649
650
651
652
653
654static void dio_zero_block(struct dio *dio, int end)
655{
656 unsigned dio_blocks_per_fs_block;
657 unsigned this_chunk_blocks;
658 unsigned this_chunk_bytes;
659 struct page *page;
660
661 dio->start_zero_done = 1;
662 if (!dio->blkfactor || !buffer_new(&dio->map_bh))
663 return;
664
665 dio_blocks_per_fs_block = 1 << dio->blkfactor;
666 this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
667
668 if (!this_chunk_blocks)
669 return;
670
671
672
673
674
675 if (end)
676 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
677
678 this_chunk_bytes = this_chunk_blocks << dio->blkbits;
679
680 page = ZERO_PAGE(dio->curr_user_address);
681 if (submit_page_section(dio, page, 0, this_chunk_bytes,
682 dio->next_block_for_io))
683 return;
684
685 dio->next_block_for_io += this_chunk_blocks;
686}
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704static int do_direct_IO(struct dio *dio)
705{
706 const unsigned blkbits = dio->blkbits;
707 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
708 struct page *page;
709 unsigned block_in_page;
710 struct buffer_head *map_bh = &dio->map_bh;
711 int ret = 0;
712
713
714 block_in_page = dio->first_block_in_page;
715
716 while (dio->block_in_file < dio->final_block_in_request) {
717 page = dio_get_page(dio);
718 if (IS_ERR(page)) {
719 ret = PTR_ERR(page);
720 goto out;
721 }
722
723 while (block_in_page < blocks_per_page) {
724 unsigned offset_in_page = block_in_page << blkbits;
725 unsigned this_chunk_bytes;
726 unsigned this_chunk_blocks;
727 unsigned u;
728
729 if (dio->blocks_available == 0) {
730
731
732
733 unsigned long blkmask;
734 unsigned long dio_remainder;
735
736 ret = get_more_blocks(dio);
737 if (ret) {
738 page_cache_release(page);
739 goto out;
740 }
741 if (!buffer_mapped(map_bh))
742 goto do_holes;
743
744 dio->blocks_available =
745 map_bh->b_size >> dio->blkbits;
746 dio->next_block_for_io =
747 map_bh->b_blocknr << dio->blkfactor;
748 if (buffer_new(map_bh))
749 clean_blockdev_aliases(dio);
750
751 if (!dio->blkfactor)
752 goto do_holes;
753
754 blkmask = (1 << dio->blkfactor) - 1;
755 dio_remainder = (dio->block_in_file & blkmask);
756
757
758
759
760
761
762
763
764
765
766
767
768 if (!buffer_new(map_bh))
769 dio->next_block_for_io += dio_remainder;
770 dio->blocks_available -= dio_remainder;
771 }
772do_holes:
773
774 if (!buffer_mapped(map_bh)) {
775 char *kaddr;
776
777 if (dio->block_in_file >=
778 i_size_read(dio->inode)>>blkbits) {
779
780 page_cache_release(page);
781 goto out;
782 }
783 kaddr = kmap_atomic(page, KM_USER0);
784 memset(kaddr + (block_in_page << blkbits),
785 0, 1 << blkbits);
786 flush_dcache_page(page);
787 kunmap_atomic(kaddr, KM_USER0);
788 dio->block_in_file++;
789 block_in_page++;
790 goto next_block;
791 }
792
793
794
795
796
797
798 if (unlikely(dio->blkfactor && !dio->start_zero_done))
799 dio_zero_block(dio, 0);
800
801
802
803
804
805 this_chunk_blocks = dio->blocks_available;
806 u = (PAGE_SIZE - offset_in_page) >> blkbits;
807 if (this_chunk_blocks > u)
808 this_chunk_blocks = u;
809 u = dio->final_block_in_request - dio->block_in_file;
810 if (this_chunk_blocks > u)
811 this_chunk_blocks = u;
812 this_chunk_bytes = this_chunk_blocks << blkbits;
813 BUG_ON(this_chunk_bytes == 0);
814
815 dio->boundary = buffer_boundary(map_bh);
816 ret = submit_page_section(dio, page, offset_in_page,
817 this_chunk_bytes, dio->next_block_for_io);
818 if (ret) {
819 page_cache_release(page);
820 goto out;
821 }
822 dio->next_block_for_io += this_chunk_blocks;
823
824 dio->block_in_file += this_chunk_blocks;
825 block_in_page += this_chunk_blocks;
826 dio->blocks_available -= this_chunk_blocks;
827next_block:
828 if (dio->block_in_file > dio->final_block_in_request)
829 BUG();
830 if (dio->block_in_file == dio->final_block_in_request)
831 break;
832 }
833
834
835 page_cache_release(page);
836 block_in_page = 0;
837 }
838out:
839 return ret;
840}
841
842static int
843direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
844 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
845 unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io)
846{
847 unsigned long user_addr;
848 int seg;
849 int ret = 0;
850 int ret2;
851 struct dio *dio;
852 size_t bytes;
853
854 dio = kmalloc(sizeof(*dio), GFP_KERNEL);
855 if (!dio)
856 return -ENOMEM;
857 dio->is_async = !is_sync_kiocb(iocb);
858
859 dio->bio = NULL;
860 dio->inode = inode;
861 dio->rw = rw;
862 dio->blkbits = blkbits;
863 dio->blkfactor = inode->i_blkbits - blkbits;
864 dio->start_zero_done = 0;
865 dio->block_in_file = offset >> blkbits;
866 dio->blocks_available = 0;
867
868 dio->cur_page = NULL;
869
870 dio->boundary = 0;
871 dio->reap_counter = 0;
872 dio->get_blocks = get_blocks;
873 dio->end_io = end_io;
874 dio->map_bh.b_private = NULL;
875 dio->final_block_in_bio = -1;
876 dio->next_block_for_io = -1;
877
878 dio->page_errors = 0;
879 dio->result = 0;
880 dio->iocb = iocb;
881
882
883
884
885
886
887
888
889
890 atomic_set(&dio->bio_count, 1);
891 atomic_set(&dio->bios_in_flight, 0);
892 spin_lock_init(&dio->bio_list_lock);
893 dio->bio_list = NULL;
894 dio->waiter = NULL;
895
896 dio->pages_in_io = 0;
897 for (seg = 0; seg < nr_segs; seg++)
898 dio->pages_in_io += (iov[seg].iov_len >> blkbits) + 2;
899
900 for (seg = 0; seg < nr_segs; seg++) {
901 user_addr = (unsigned long)iov[seg].iov_base;
902 bytes = iov[seg].iov_len;
903
904
905 dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
906 dio->final_block_in_request = dio->block_in_file +
907 (bytes >> blkbits);
908
909 dio->head = 0;
910 dio->tail = 0;
911 dio->curr_page = 0;
912
913 dio->total_pages = 0;
914 if (user_addr & (PAGE_SIZE-1)) {
915 dio->total_pages++;
916 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
917 }
918 dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
919 dio->curr_user_address = user_addr;
920
921 ret = do_direct_IO(dio);
922
923 dio->result += iov[seg].iov_len -
924 ((dio->final_block_in_request - dio->block_in_file) <<
925 blkbits);
926
927 if (ret) {
928 dio_cleanup(dio);
929 break;
930 }
931 }
932
933
934
935
936
937 dio_zero_block(dio, 1);
938
939 if (dio->cur_page) {
940 ret2 = dio_send_cur_page(dio);
941 if (ret == 0)
942 ret = ret2;
943 page_cache_release(dio->cur_page);
944 dio->cur_page = NULL;
945 }
946 if (dio->bio)
947 dio_bio_submit(dio);
948
949
950
951
952
953 dio_cleanup(dio);
954
955
956
957
958
959 if (dio->is_async) {
960 if (ret == 0)
961 ret = dio->result;
962 finished_one_bio(dio);
963 blk_run_queues();
964 } else {
965 finished_one_bio(dio);
966 ret2 = dio_await_completion(dio);
967 if (ret == 0)
968 ret = ret2;
969 if (ret == 0)
970 ret = dio->page_errors;
971 if (ret == 0 && dio->result) {
972 loff_t i_size = i_size_read(inode);
973
974 ret = dio->result;
975
976
977
978
979 if (rw == READ && (offset + ret > i_size))
980 ret = i_size - offset;
981 }
982 dio_complete(dio, offset, ret);
983 kfree(dio);
984 }
985 return ret;
986}
987
988
989
990
991int
992blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
993 struct block_device *bdev, const struct iovec *iov, loff_t offset,
994 unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io)
995{
996 int seg;
997 size_t size;
998 unsigned long addr;
999 unsigned blkbits = inode->i_blkbits;
1000 unsigned bdev_blkbits = 0;
1001 unsigned blocksize_mask = (1 << blkbits) - 1;
1002 ssize_t retval = -EINVAL;
1003
1004 if (bdev)
1005 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
1006
1007 if (offset & blocksize_mask) {
1008 if (bdev)
1009 blkbits = bdev_blkbits;
1010 blocksize_mask = (1 << blkbits) - 1;
1011 if (offset & blocksize_mask)
1012 goto out;
1013 }
1014
1015
1016 for (seg = 0; seg < nr_segs; seg++) {
1017 addr = (unsigned long)iov[seg].iov_base;
1018 size = iov[seg].iov_len;
1019 if ((addr & blocksize_mask) || (size & blocksize_mask)) {
1020 if (bdev)
1021 blkbits = bdev_blkbits;
1022 blocksize_mask = (1 << blkbits) - 1;
1023 if ((addr & blocksize_mask) || (size & blocksize_mask))
1024 goto out;
1025 }
1026 }
1027
1028 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1029 nr_segs, blkbits, get_blocks, end_io);
1030out:
1031 return retval;
1032}
1033
1034EXPORT_SYMBOL(blockdev_direct_IO);
1035