1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/fs.h>
26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/pagemap.h>
30#include <linux/task_io_accounting_ops.h>
31#include <linux/bio.h>
32#include <linux/wait.h>
33#include <linux/err.h>
34#include <linux/blkdev.h>
35#include <linux/buffer_head.h>
36#include <linux/rwsem.h>
37#include <linux/uio.h>
38#include <linux/atomic.h>
39#include <linux/prefetch.h>
40
41
42
43
44
/*
 * Number of slots in the dio->pages[] array.  dio_refill_pages() also uses
 * it as the upper bound on how many user pages it requests from
 * get_user_pages_fast() in a single call.
 */
#define DIO_PAGES 64
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * Submission-side state of one direct-I/O request.  An instance lives on the
 * stack of do_blockdev_direct_IO() and is only touched while the request is
 * being mapped and turned into bios.
 */
struct dio_submit {
	struct bio *bio;		/* bio being built; set by dio_bio_alloc(),
					 * reset to NULL by dio_bio_submit() */
	unsigned blkbits;		/* log2 of the block size the I/O is done in */
	unsigned blkfactor;		/* inode i_blkbits minus blkbits, i.e. log2 of
					 * I/O blocks per filesystem block; zero when
					 * the two block sizes are equal */
	unsigned start_zero_done;	/* set to 1 by the first dio_zero_block()
					 * call; do_direct_IO() tests it before
					 * zero-padding the start of a block */
	int pages_in_io;		/* estimate of pages still to be put into
					 * bios; used to size newly allocated bios */
	size_t size;			/* sum of iov_len over the segments
					 * processed so far */
	sector_t block_in_file;		/* current file position, in units of
					 * (1 << blkbits) bytes */
	unsigned blocks_available;	/* blocks left in the current mapping
					 * returned by get_block */
	int reap_counter;		/* counts dio_bio_reap() calls; completed
					 * bios are reaped once it has reached 64 */
	sector_t final_block_in_request;/* one past the last file block of the
					 * current iovec segment */
	unsigned first_block_in_page;	/* block index inside the first user page
					 * at which the current segment starts */
	int boundary;			/* taken from buffer_boundary(map_bh);
					 * when set the pending bio is submitted
					 * rather than grown further */
	get_block_t *get_block;		/* filesystem block-mapping callback */
	dio_submit_t *submit_io;	/* optional replacement for submit_bio() */

	loff_t logical_offset_in_bio;	/* file offset of the first byte in the
					 * bio currently being built */
	sector_t final_block_in_bio;	/* device block just past the last block
					 * added to the current bio */
	sector_t next_block_for_io;	/* next device block (blkbits units) to
					 * issue I/O against */

	/*
	 * The page most recently handed to submit_page_section().  It is held
	 * back so that a following section of the same page can be merged
	 * into it before it is added to a bio.
	 */
	struct page *cur_page;		/* the deferred page, or NULL */
	unsigned cur_page_offset;	/* byte offset of the section in cur_page */
	unsigned cur_page_len;		/* byte length of the section */
	sector_t cur_page_block;	/* device block the section starts at */
	loff_t cur_page_fs_offset;	/* file offset the section starts at */

	/*
	 * Progress through the user pages of the current iovec segment.
	 */
	int curr_page;			/* pages of this segment fetched so far */
	int total_pages;		/* pages spanned by this segment */
	unsigned long curr_user_address;/* user address the next refill starts at */

	/*
	 * Window into dio->pages[]: entries [head, tail) have been fetched by
	 * dio_refill_pages() and not yet handed out by dio_get_page().
	 */
	unsigned head;
	unsigned tail;
};
114
115
/*
 * Per-request state that must outlive the submitting function: it is
 * allocated from dio_cache and freed by whoever drops the last reference
 * (see dio_bio_end_aio() and do_blockdev_direct_IO()).
 */
struct dio {
	int flags;			/* DIO_* flags passed in by the caller */
	int rw;				/* READ, or WRITE_ODIRECT for any write */
	struct inode *inode;		/* inode the I/O is performed on */
	loff_t i_size;			/* i_size sampled when the request started */
	dio_iodone_t *end_io;		/* optional completion callback */

	void *private;			/* map_bh->b_private as left by the last
					 * get_block call; passed to end_io */

	/* bio_lock protects refcount, bio_list and waiter */
	spinlock_t bio_lock;
	int page_errors;		/* first error returned by
					 * get_user_pages_fast(), or 0 */
	int is_async;			/* non-zero when completion is delivered
					 * from bio end_io instead of being
					 * waited for by the submitter */
	int io_error;			/* -EIO once any bio completed without
					 * BIO_UPTODATE */
	unsigned long refcount;		/* 1 for the submitter plus 1 for every
					 * submitted bio not yet completed */
	struct bio *bio_list;		/* completed bios of a synchronous
					 * request, chained through bi_private */
	struct task_struct *waiter;	/* task sleeping in dio_await_one() */

	/* fields used to report the outcome */
	struct kiocb *iocb;		/* kiocb of the request */
	ssize_t result;			/* bytes for which I/O was set up */

	/*
	 * Page staging area filled by dio_refill_pages().  It is kept last:
	 * the allocation path zeroes only the members that precede it
	 * (memset up to offsetof(struct dio, pages)).
	 */
	struct page *pages[DIO_PAGES];
} ____cacheline_aligned_in_smp;
145
/* Slab cache that struct dio objects are allocated from; set up in dio_init(). */
static struct kmem_cache *dio_cache __read_mostly;
147
148
149
150
151static inline unsigned dio_pages_present(struct dio_submit *sdio)
152{
153 return sdio->tail - sdio->head;
154}
155
156
157
158
159static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
160{
161 int ret;
162 int nr_pages;
163
164 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
165 ret = get_user_pages_fast(
166 sdio->curr_user_address,
167 nr_pages,
168 dio->rw == READ,
169 &dio->pages[0]);
170
171 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
172 struct page *page = ZERO_PAGE(0);
173
174
175
176
177
178 if (dio->page_errors == 0)
179 dio->page_errors = ret;
180 page_cache_get(page);
181 dio->pages[0] = page;
182 sdio->head = 0;
183 sdio->tail = 1;
184 ret = 0;
185 goto out;
186 }
187
188 if (ret >= 0) {
189 sdio->curr_user_address += ret * PAGE_SIZE;
190 sdio->curr_page += ret;
191 sdio->head = 0;
192 sdio->tail = ret;
193 ret = 0;
194 }
195out:
196 return ret;
197}
198
199
200
201
202
203
204
205static inline struct page *dio_get_page(struct dio *dio,
206 struct dio_submit *sdio)
207{
208 if (dio_pages_present(sdio) == 0) {
209 int ret;
210
211 ret = dio_refill_pages(dio, sdio);
212 if (ret)
213 return ERR_PTR(ret);
214 BUG_ON(dio_pages_present(sdio) == 0);
215 }
216 return dio->pages[sdio->head++];
217}
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
233{
234 ssize_t transferred = 0;
235
236
237
238
239
240
241
242 if (ret == -EIOCBQUEUED)
243 ret = 0;
244
245 if (dio->result) {
246 transferred = dio->result;
247
248
249 if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
250 transferred = dio->i_size - offset;
251 }
252
253 if (ret == 0)
254 ret = dio->page_errors;
255 if (ret == 0)
256 ret = dio->io_error;
257 if (ret == 0)
258 ret = transferred;
259
260 if (dio->end_io && dio->result) {
261 dio->end_io(dio->iocb, offset, transferred,
262 dio->private, ret, is_async);
263 } else {
264 if (is_async)
265 aio_complete(dio->iocb, ret, 0);
266 inode_dio_done(dio->inode);
267 }
268
269 return ret;
270}
271
272static int dio_bio_complete(struct dio *dio, struct bio *bio);
273
274
275
276static void dio_bio_end_aio(struct bio *bio, int error)
277{
278 struct dio *dio = bio->bi_private;
279 unsigned long remaining;
280 unsigned long flags;
281
282
283 dio_bio_complete(dio, bio);
284
285 spin_lock_irqsave(&dio->bio_lock, flags);
286 remaining = --dio->refcount;
287 if (remaining == 1 && dio->waiter)
288 wake_up_process(dio->waiter);
289 spin_unlock_irqrestore(&dio->bio_lock, flags);
290
291 if (remaining == 0) {
292 dio_complete(dio, dio->iocb->ki_pos, 0, true);
293 kmem_cache_free(dio_cache, dio);
294 }
295}
296
297
298
299
300
301
302
303
304static void dio_bio_end_io(struct bio *bio, int error)
305{
306 struct dio *dio = bio->bi_private;
307 unsigned long flags;
308
309 spin_lock_irqsave(&dio->bio_lock, flags);
310 bio->bi_private = dio->bio_list;
311 dio->bio_list = bio;
312 if (--dio->refcount == 1 && dio->waiter)
313 wake_up_process(dio->waiter);
314 spin_unlock_irqrestore(&dio->bio_lock, flags);
315}
316
317
318
319
320
321
322
323
324
325
326void dio_end_io(struct bio *bio, int error)
327{
328 struct dio *dio = bio->bi_private;
329
330 if (dio->is_async)
331 dio_bio_end_aio(bio, error);
332 else
333 dio_bio_end_io(bio, error);
334}
335EXPORT_SYMBOL_GPL(dio_end_io);
336
337static inline void
338dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
339 struct block_device *bdev,
340 sector_t first_sector, int nr_vecs)
341{
342 struct bio *bio;
343
344
345
346
347
348 bio = bio_alloc(GFP_KERNEL, nr_vecs);
349
350 bio->bi_bdev = bdev;
351 bio->bi_sector = first_sector;
352 if (dio->is_async)
353 bio->bi_end_io = dio_bio_end_aio;
354 else
355 bio->bi_end_io = dio_bio_end_io;
356
357 sdio->bio = bio;
358 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
359}
360
361
362
363
364
365
366
367
368static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
369{
370 struct bio *bio = sdio->bio;
371 unsigned long flags;
372
373 bio->bi_private = dio;
374
375 spin_lock_irqsave(&dio->bio_lock, flags);
376 dio->refcount++;
377 spin_unlock_irqrestore(&dio->bio_lock, flags);
378
379 if (dio->is_async && dio->rw == READ)
380 bio_set_pages_dirty(bio);
381
382 if (sdio->submit_io)
383 sdio->submit_io(dio->rw, bio, dio->inode,
384 sdio->logical_offset_in_bio);
385 else
386 submit_bio(dio->rw, bio);
387
388 sdio->bio = NULL;
389 sdio->boundary = 0;
390 sdio->logical_offset_in_bio = 0;
391}
392
393
394
395
/*
 * Drop the references on every page that was fetched into dio->pages[] but
 * never consumed.
 */
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
	while (dio_pages_present(sdio)) {
		struct page *leftover = dio_get_page(dio, sdio);

		page_cache_release(leftover);
	}
}
401
402
403
404
405
406
407
408static struct bio *dio_await_one(struct dio *dio)
409{
410 unsigned long flags;
411 struct bio *bio = NULL;
412
413 spin_lock_irqsave(&dio->bio_lock, flags);
414
415
416
417
418
419
420
421 while (dio->refcount > 1 && dio->bio_list == NULL) {
422 __set_current_state(TASK_UNINTERRUPTIBLE);
423 dio->waiter = current;
424 spin_unlock_irqrestore(&dio->bio_lock, flags);
425 io_schedule();
426
427 spin_lock_irqsave(&dio->bio_lock, flags);
428 dio->waiter = NULL;
429 }
430 if (dio->bio_list) {
431 bio = dio->bio_list;
432 dio->bio_list = bio->bi_private;
433 }
434 spin_unlock_irqrestore(&dio->bio_lock, flags);
435 return bio;
436}
437
438
439
440
441static int dio_bio_complete(struct dio *dio, struct bio *bio)
442{
443 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
444 struct bio_vec *bvec = bio->bi_io_vec;
445 int page_no;
446
447 if (!uptodate)
448 dio->io_error = -EIO;
449
450 if (dio->is_async && dio->rw == READ) {
451 bio_check_pages_dirty(bio);
452 } else {
453 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
454 struct page *page = bvec[page_no].bv_page;
455
456 if (dio->rw == READ && !PageCompound(page))
457 set_page_dirty_lock(page);
458 page_cache_release(page);
459 }
460 bio_put(bio);
461 }
462 return uptodate ? 0 : -EIO;
463}
464
465
466
467
468
469
470
471
472static void dio_await_completion(struct dio *dio)
473{
474 struct bio *bio;
475 do {
476 bio = dio_await_one(dio);
477 if (bio)
478 dio_bio_complete(dio, bio);
479 } while (bio);
480}
481
482
483
484
485
486
487
488
489static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
490{
491 int ret = 0;
492
493 if (sdio->reap_counter++ >= 64) {
494 while (dio->bio_list) {
495 unsigned long flags;
496 struct bio *bio;
497 int ret2;
498
499 spin_lock_irqsave(&dio->bio_lock, flags);
500 bio = dio->bio_list;
501 dio->bio_list = bio->bi_private;
502 spin_unlock_irqrestore(&dio->bio_lock, flags);
503 ret2 = dio_bio_complete(dio, bio);
504 if (ret == 0)
505 ret = ret2;
506 }
507 sdio->reap_counter = 0;
508 }
509 return ret;
510}
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
536 struct buffer_head *map_bh)
537{
538 int ret;
539 sector_t fs_startblk;
540 sector_t fs_endblk;
541 unsigned long fs_count;
542 int create;
543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
544
545
546
547
548
549 ret = dio->page_errors;
550 if (ret == 0) {
551 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
552 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
553 fs_endblk = (sdio->final_block_in_request - 1) >>
554 sdio->blkfactor;
555 fs_count = fs_endblk - fs_startblk + 1;
556
557 map_bh->b_state = 0;
558 map_bh->b_size = fs_count << i_blkbits;
559
560
561
562
563
564
565
566
567
568
569
570
571 create = dio->rw & WRITE;
572 if (dio->flags & DIO_SKIP_HOLES) {
573 if (sdio->block_in_file < (i_size_read(dio->inode) >>
574 sdio->blkbits))
575 create = 0;
576 }
577
578 ret = (*sdio->get_block)(dio->inode, fs_startblk,
579 map_bh, create);
580
581
582 dio->private = map_bh->b_private;
583 }
584 return ret;
585}
586
587
588
589
590static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
591 sector_t start_sector, struct buffer_head *map_bh)
592{
593 sector_t sector;
594 int ret, nr_pages;
595
596 ret = dio_bio_reap(dio, sdio);
597 if (ret)
598 goto out;
599 sector = start_sector << (sdio->blkbits - 9);
600 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
601 nr_pages = min(nr_pages, BIO_MAX_PAGES);
602 BUG_ON(nr_pages <= 0);
603 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
604 sdio->boundary = 0;
605out:
606 return ret;
607}
608
609
610
611
612
613
614
615
616static inline int dio_bio_add_page(struct dio_submit *sdio)
617{
618 int ret;
619
620 ret = bio_add_page(sdio->bio, sdio->cur_page,
621 sdio->cur_page_len, sdio->cur_page_offset);
622 if (ret == sdio->cur_page_len) {
623
624
625
626 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
627 sdio->pages_in_io--;
628 page_cache_get(sdio->cur_page);
629 sdio->final_block_in_bio = sdio->cur_page_block +
630 (sdio->cur_page_len >> sdio->blkbits);
631 ret = 0;
632 } else {
633 ret = 1;
634 }
635 return ret;
636}
637
638
639
640
641
642
643
644
645
646
647
648static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
649 struct buffer_head *map_bh)
650{
651 int ret = 0;
652
653 if (sdio->bio) {
654 loff_t cur_offset = sdio->cur_page_fs_offset;
655 loff_t bio_next_offset = sdio->logical_offset_in_bio +
656 sdio->bio->bi_size;
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672 if (sdio->final_block_in_bio != sdio->cur_page_block ||
673 cur_offset != bio_next_offset)
674 dio_bio_submit(dio, sdio);
675
676
677
678
679 else if (sdio->boundary)
680 dio_bio_submit(dio, sdio);
681 }
682
683 if (sdio->bio == NULL) {
684 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
685 if (ret)
686 goto out;
687 }
688
689 if (dio_bio_add_page(sdio) != 0) {
690 dio_bio_submit(dio, sdio);
691 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
692 if (ret == 0) {
693 ret = dio_bio_add_page(sdio);
694 BUG_ON(ret != 0);
695 }
696 }
697out:
698 return ret;
699}
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718static inline int
719submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
720 unsigned offset, unsigned len, sector_t blocknr,
721 struct buffer_head *map_bh)
722{
723 int ret = 0;
724
725 if (dio->rw & WRITE) {
726
727
728
729 task_io_account_write(len);
730 }
731
732
733
734
735 if (sdio->cur_page == page &&
736 sdio->cur_page_offset + sdio->cur_page_len == offset &&
737 sdio->cur_page_block +
738 (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
739 sdio->cur_page_len += len;
740
741
742
743
744
745 if (sdio->boundary) {
746 ret = dio_send_cur_page(dio, sdio, map_bh);
747 page_cache_release(sdio->cur_page);
748 sdio->cur_page = NULL;
749 }
750 goto out;
751 }
752
753
754
755
756 if (sdio->cur_page) {
757 ret = dio_send_cur_page(dio, sdio, map_bh);
758 page_cache_release(sdio->cur_page);
759 sdio->cur_page = NULL;
760 if (ret)
761 goto out;
762 }
763
764 page_cache_get(page);
765 sdio->cur_page = page;
766 sdio->cur_page_offset = offset;
767 sdio->cur_page_len = len;
768 sdio->cur_page_block = blocknr;
769 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
770out:
771 return ret;
772}
773
774
775
776
777
778
779static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
780{
781 unsigned i;
782 unsigned nblocks;
783
784 nblocks = map_bh->b_size >> dio->inode->i_blkbits;
785
786 for (i = 0; i < nblocks; i++) {
787 unmap_underlying_metadata(map_bh->b_bdev,
788 map_bh->b_blocknr + i);
789 }
790}
791
792
793
794
795
796
797
798
799
800
801static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
802 int end, struct buffer_head *map_bh)
803{
804 unsigned dio_blocks_per_fs_block;
805 unsigned this_chunk_blocks;
806 unsigned this_chunk_bytes;
807 struct page *page;
808
809 sdio->start_zero_done = 1;
810 if (!sdio->blkfactor || !buffer_new(map_bh))
811 return;
812
813 dio_blocks_per_fs_block = 1 << sdio->blkfactor;
814 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
815
816 if (!this_chunk_blocks)
817 return;
818
819
820
821
822
823 if (end)
824 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
825
826 this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
827
828 page = ZERO_PAGE(0);
829 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
830 sdio->next_block_for_io, map_bh))
831 return;
832
833 sdio->next_block_for_io += this_chunk_blocks;
834}
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
/*
 * Walk the user pages of the current iovec segment and queue I/O for them.
 *
 * The outer loop takes one user page at a time from dio_get_page(); the
 * inner loop walks the blocks of that page.  Whenever the current mapping is
 * used up, get_more_blocks() is called for a new one.  Mapped runs are
 * passed to submit_page_section(); unmapped blocks (holes) are zero-filled
 * for reads and rejected for writes.
 *
 * Progress is kept in @sdio (block_in_file, blocks_available,
 * next_block_for_io), so the caller can tell how far the segment got when an
 * error is returned.
 *
 * Returns 0 on success (including a read that stopped at a hole at or past
 * i_size), -ENOTBLK when a write hits an unmapped block, or an error from
 * dio_get_page(), get_more_blocks() or submit_page_section().
 */
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
			struct buffer_head *map_bh)
{
	const unsigned blkbits = sdio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	int ret = 0;

	/* The first page may be entered part-way; later pages start at 0. */
	block_in_page = sdio->first_block_in_page;

	while (sdio->block_in_file < sdio->final_block_in_request) {
		page = dio_get_page(dio, sdio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		while (block_in_page < blocks_per_page) {
			unsigned offset_in_page = block_in_page << blkbits;
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (sdio->blocks_available == 0) {
				/*
				 * The previous mapping is exhausted: ask the
				 * filesystem for the next one.
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio, sdio, map_bh);
				if (ret) {
					page_cache_release(page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				/* Convert the mapping to I/O-block units. */
				sdio->blocks_available =
						map_bh->b_size >> sdio->blkbits;
				sdio->next_block_for_io =
					map_bh->b_blocknr << sdio->blkfactor;
				if (buffer_new(map_bh))
					clean_blockdev_aliases(dio, map_bh);

				if (!sdio->blkfactor)
					goto do_holes;

				blkmask = (1 << sdio->blkfactor) - 1;
				dio_remainder = (sdio->block_in_file & blkmask);

				/*
				 * The mapping starts at a filesystem block
				 * boundary but the request may start part-way
				 * into that block.  For an existing block the
				 * device position skips the leading part here.
				 * For a new block it is left at the start of
				 * the block: dio_zero_block() below pads the
				 * leading part and advances next_block_for_io
				 * itself.  Either way the leading blocks are
				 * not available to the request.
				 */
				if (!buffer_new(map_bh))
					sdio->next_block_for_io += dio_remainder;
				sdio->blocks_available -= dio_remainder;
			}
do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* A write cannot be done into a hole here. */
				if (dio->rw & WRITE) {
					page_cache_release(page);
					return -ENOTBLK;
				}

				/*
				 * A read from a hole: stop without error once
				 * the block lies at or beyond i_size rounded up
				 * to the I/O block size; otherwise return
				 * zeroes for this one block.
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (sdio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					page_cache_release(page);
					goto out;
				}
				zero_user(page, block_in_page << blkbits,
						1 << blkbits);
				sdio->block_in_file++;
				block_in_page++;
				goto next_block;
			}

			/*
			 * On the first mapped block of a request using
			 * sub-filesystem-block I/O, pad the start of a newly
			 * allocated block with zeroes (done at most once, see
			 * start_zero_done).
			 */
			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
				dio_zero_block(dio, sdio, 0, map_bh);

			/*
			 * The chunk is limited by the mapping, by the end of
			 * the page and by the end of the segment.
			 */
			this_chunk_blocks = sdio->blocks_available;
			u = (PAGE_SIZE - offset_in_page) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = sdio->final_block_in_request - sdio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			sdio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, sdio, page,
						  offset_in_page,
						  this_chunk_bytes,
						  sdio->next_block_for_io,
						  map_bh);
			if (ret) {
				page_cache_release(page);
				goto out;
			}
			sdio->next_block_for_io += this_chunk_blocks;

			sdio->block_in_file += this_chunk_blocks;
			block_in_page += this_chunk_blocks;
			sdio->blocks_available -= this_chunk_blocks;
next_block:
			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
			if (sdio->block_in_file == sdio->final_block_in_request)
				break;
		}

		/*
		 * Drop this function's reference on the page; sections that
		 * were queued hold references of their own (taken in
		 * submit_page_section() and dio_bio_add_page()).
		 */
		page_cache_release(page);
		block_in_page = 0;
	}
out:
	return ret;
}
1000
1001static inline int drop_refcount(struct dio *dio)
1002{
1003 int ret2;
1004 unsigned long flags;
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017 spin_lock_irqsave(&dio->bio_lock, flags);
1018 ret2 = --dio->refcount;
1019 spin_unlock_irqrestore(&dio->bio_lock, flags);
1020 return ret2;
1021}
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
/*
 * Core of the direct-I/O path: validate alignment, set up the dio, map and
 * submit every iovec segment, then either wait for completion or hand the
 * request over to asynchronous completion.
 *
 * @rw:        READ or a value with WRITE set (normalised to WRITE_ODIRECT)
 * @iocb:      kiocb of the request
 * @inode:     inode the I/O targets
 * @bdev:      backing block device; may be NULL, in which case only the
 *             inode's block size is accepted for alignment
 * @iov:       user buffers
 * @offset:    starting file offset
 * @nr_segs:   number of entries in @iov
 * @get_block: filesystem block-mapping callback
 * @end_io:    optional completion callback
 * @submit_io: optional replacement for submit_bio()
 * @flags:     DIO_* flags
 *
 * Returns the number of bytes transferred, 0, -EIOCBQUEUED when the request
 * completes asynchronously, or a negative error (-EINVAL for misaligned
 * offset/buffer/length, -ENOMEM if the dio cannot be allocated, or an error
 * from the I/O path).
 */
static inline ssize_t
do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset,
	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
	dio_submit_t submit_io, int flags)
{
	int seg;
	size_t size;
	unsigned long addr;
	/* Sample i_blkbits once so all later computations agree on it. */
	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
	unsigned blkbits = i_blkbits;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;
	struct dio *dio;
	struct dio_submit sdio = { 0, };
	unsigned long user_addr;
	size_t bytes;
	struct buffer_head map_bh = { 0, };
	struct blk_plug plug;

	if (rw & WRITE)
		rw = WRITE_ODIRECT;

	/*
	 * The file offset must be aligned to the filesystem block size or,
	 * failing that, to the device's logical block size when a bdev was
	 * supplied.  bdev is only looked at when the first test fails.
	 */
	if (offset & blocksize_mask) {
		if (bdev)
			blkbits = blksize_bits(bdev_logical_block_size(bdev));
		blocksize_mask = (1 << blkbits) - 1;
		if (offset & blocksize_mask)
			goto out;
	}

	/*
	 * Apply the same alignment rule to every buffer address and length,
	 * and compute the end offset of the request on the way.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if (unlikely((addr & blocksize_mask) ||
			     (size & blocksize_mask))) {
			if (bdev)
				blkbits = blksize_bits(
					 bdev_logical_block_size(bdev));
			blocksize_mask = (1 << blkbits) - 1;
			if ((addr & blocksize_mask) || (size & blocksize_mask))
				goto out;
		}
	}

	/* A zero-length read needs no work at all. */
	if (rw == READ && end == offset)
		return 0;

	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
	retval = -ENOMEM;
	if (!dio)
		goto out;

	/*
	 * Only the members in front of pages[] are cleared; the page array
	 * itself is left uninitialised.
	 */
	memset(dio, 0, offsetof(struct dio, pages));

	dio->flags = flags;
	if (dio->flags & DIO_LOCKING) {
		if (rw == READ) {
			struct address_space *mapping =
					iocb->ki_filp->f_mapping;

			/*
			 * With DIO_LOCKING a read takes i_mutex here; it is
			 * released further down once submission is finished.
			 */
			mutex_lock(&inode->i_mutex);

			/* Write back cached data covering the read range. */
			retval = filemap_write_and_wait_range(mapping, offset,
							      end - 1);
			if (retval) {
				mutex_unlock(&inode->i_mutex);
				kmem_cache_free(dio_cache, dio);
				goto out;
			}
		}
	}

	/*
	 * Mark a direct I/O as in flight on the inode.
	 * NOTE(review): presumably balanced by inode_dio_done() in
	 * dio_complete() or by the end_io callback — confirm.
	 */
	atomic_inc(&inode->i_dio_count);

	/*
	 * The request is asynchronous only for a non-sync kiocb, and never
	 * for a write that extends beyond the current i_size.
	 */
	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
		(end > i_size_read(inode)));

	retval = 0;

	dio->inode = inode;
	dio->rw = rw;
	sdio.blkbits = blkbits;
	sdio.blkfactor = i_blkbits - blkbits;
	sdio.block_in_file = offset >> blkbits;

	sdio.get_block = get_block;
	dio->end_io = end_io;
	sdio.submit_io = submit_io;
	/* No bio built and no mapping obtained yet. */
	sdio.final_block_in_bio = -1;
	sdio.next_block_for_io = -1;

	dio->iocb = iocb;
	dio->i_size = i_size_read(inode);

	spin_lock_init(&dio->bio_lock);
	/* The submitter's own reference; each submitted bio adds one. */
	dio->refcount = 1;

	/*
	 * Estimate the number of pages the I/O will touch.  With
	 * sub-filesystem-block I/O two extra pages are budgeted
	 * (presumably for the zero padding queued by dio_zero_block() at the
	 * start and end — confirm).
	 */
	if (unlikely(sdio.blkfactor))
		sdio.pages_in_io = 2;

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		sdio.pages_in_io +=
			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
				PAGE_SIZE - user_addr / PAGE_SIZE);
	}

	blk_start_plug(&plug);

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		sdio.size += bytes = iov[seg].iov_len;

		/* Index of the current iovec segment's first block */
		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
		sdio.final_block_in_request = sdio.block_in_file +
						(bytes >> blkbits);
		/* Page fetching state */
		sdio.head = 0;
		sdio.tail = 0;
		sdio.curr_page = 0;

		/*
		 * Count the pages this segment spans: a leading partial page
		 * (if the buffer is not page aligned) plus the rest rounded
		 * up to whole pages.
		 */
		sdio.total_pages = 0;
		if (user_addr & (PAGE_SIZE-1)) {
			sdio.total_pages++;
			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
		}
		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
		sdio.curr_user_address = user_addr;

		retval = do_direct_IO(dio, &sdio, &map_bh);

		/*
		 * Account the part of the segment that was processed: its
		 * length minus the blocks do_direct_IO() did not reach.
		 */
		dio->result += iov[seg].iov_len -
			((sdio.final_block_in_request - sdio.block_in_file) <<
					blkbits);

		if (retval) {
			/* Release pages that were pinned but not consumed. */
			dio_cleanup(dio, &sdio);
			break;
		}
	}

	if (retval == -ENOTBLK) {
		/*
		 * do_direct_IO() returns -ENOTBLK when a write reaches an
		 * unmapped block.  It is not reported as an error; the
		 * caller sees the (possibly short) byte count instead.
		 */
		retval = 0;
	}

	/*
	 * Pad the tail of a newly allocated filesystem block with zeroes if
	 * the request ended part-way into it.
	 */
	dio_zero_block(dio, &sdio, 1, &map_bh);

	/* Flush the deferred page section and the bio still being built. */
	if (sdio.cur_page) {
		ssize_t ret2;

		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
		if (retval == 0)
			retval = ret2;
		page_cache_release(sdio.cur_page);
		sdio.cur_page = NULL;
	}
	if (sdio.bio)
		dio_bio_submit(dio, &sdio);

	blk_finish_plug(&plug);

	/* Release any pinned pages that were left unused. */
	dio_cleanup(dio, &sdio);

	/* Submission is finished: release i_mutex taken above for reads. */
	if (rw == READ && (dio->flags & DIO_LOCKING))
		mutex_unlock(&dio->inode->i_mutex);

	/*
	 * Return -EIOCBQUEUED only for an asynchronous request that hit no
	 * error and set up some I/O, and, for writes, only if the whole
	 * request was set up.  In every other case the bios are waited for
	 * here.
	 */
	BUG_ON(retval == -EIOCBQUEUED);
	if (dio->is_async && retval == 0 && dio->result &&
	    ((rw == READ) || (dio->result == sdio.size)))
		retval = -EIOCBQUEUED;

	if (retval != -EIOCBQUEUED)
		dio_await_completion(dio);

	/*
	 * Drop the submitter's reference.  If it was the last one the request
	 * is completed and freed here; otherwise bios are still in flight and
	 * dio_bio_end_aio() will do so, which is only legal for a queued
	 * request.
	 */
	if (drop_refcount(dio) == 0) {
		retval = dio_complete(dio, offset, retval, false);
		kmem_cache_free(dio_cache, dio);
	} else
		BUG_ON(retval != -EIOCBQUEUED);

out:
	return retval;
}
1283
1284ssize_t
1285__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1286 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1287 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1288 dio_submit_t submit_io, int flags)
1289{
1290
1291
1292
1293
1294
1295
1296
1297
1298 prefetch(&bdev->bd_disk->part_tbl);
1299 prefetch(bdev->bd_queue);
1300 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1301
1302 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1303 nr_segs, get_block, end_io,
1304 submit_io, flags);
1305}
1306
1307EXPORT_SYMBOL(__blockdev_direct_IO);
1308
/*
 * Initialisation hook, registered with module_init(): create the slab cache
 * that struct dio objects are allocated from.  Always returns 0; the cache
 * is created with SLAB_PANIC.
 */
static __init int dio_init(void)
{
	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
	return 0;
}
module_init(dio_init)
1315