1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/fs.h>
26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/pagemap.h>
30#include <linux/task_io_accounting_ops.h>
31#include <linux/bio.h>
32#include <linux/wait.h>
33#include <linux/err.h>
34#include <linux/blkdev.h>
35#include <linux/buffer_head.h>
36#include <linux/rwsem.h>
37#include <linux/uio.h>
38#include <linux/atomic.h>
39
40
41
42
43
44#define DIO_PAGES 64
45
46
47
48
49
50
51
52
53
54
55
56
57
58struct dio {
59
60 struct bio *bio;
61 struct inode *inode;
62 int rw;
63 loff_t i_size;
64 int flags;
65 unsigned blkbits;
66 unsigned blkfactor;
67
68
69
70
71 unsigned start_zero_done;
72
73
74 int pages_in_io;
75 size_t size;
76 sector_t block_in_file;
77
78 unsigned blocks_available;
79 sector_t final_block_in_request;
80 unsigned first_block_in_page;
81 int boundary;
82 int reap_counter;
83 get_block_t *get_block;
84 dio_iodone_t *end_io;
85 dio_submit_t *submit_io;
86 loff_t logical_offset_in_bio;
87 sector_t final_block_in_bio;
88 sector_t next_block_for_io;
89
90 struct buffer_head map_bh;
91
92
93
94
95
96
97 struct page *cur_page;
98 unsigned cur_page_offset;
99 unsigned cur_page_len;
100 sector_t cur_page_block;
101 loff_t cur_page_fs_offset;
102
103
104 spinlock_t bio_lock;
105 unsigned long refcount;
106 struct bio *bio_list;
107 struct task_struct *waiter;
108
109
110 struct kiocb *iocb;
111 int is_async;
112 int io_error;
113 ssize_t result;
114
115
116
117
118 int curr_page;
119 int total_pages;
120 unsigned long curr_user_address;
121
122
123
124
125
126 unsigned head;
127 unsigned tail;
128 int page_errors;
129
130
131
132
133
134
135 struct page *pages[DIO_PAGES];
136};
137
138static void __inode_dio_wait(struct inode *inode)
139{
140 wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
141 DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
142
143 do {
144 prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
145 if (atomic_read(&inode->i_dio_count))
146 schedule();
147 } while (atomic_read(&inode->i_dio_count));
148 finish_wait(wq, &q.wait);
149}
150
151
152
153
154
155
156
157
158
159
160
161void inode_dio_wait(struct inode *inode)
162{
163 if (atomic_read(&inode->i_dio_count))
164 __inode_dio_wait(inode);
165}
166EXPORT_SYMBOL_GPL(inode_dio_wait);
167
168
169
170
171
172
173
174
175void inode_dio_done(struct inode *inode)
176{
177 if (atomic_dec_and_test(&inode->i_dio_count))
178 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
179}
180EXPORT_SYMBOL_GPL(inode_dio_done);
181
182
183
184
185static inline unsigned dio_pages_present(struct dio *dio)
186{
187 return dio->tail - dio->head;
188}
189
190
191
192
193static int dio_refill_pages(struct dio *dio)
194{
195 int ret;
196 int nr_pages;
197
198 nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
199 ret = get_user_pages_fast(
200 dio->curr_user_address,
201 nr_pages,
202 dio->rw == READ,
203 &dio->pages[0]);
204
205 if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
206 struct page *page = ZERO_PAGE(0);
207
208
209
210
211
212 if (dio->page_errors == 0)
213 dio->page_errors = ret;
214 page_cache_get(page);
215 dio->pages[0] = page;
216 dio->head = 0;
217 dio->tail = 1;
218 ret = 0;
219 goto out;
220 }
221
222 if (ret >= 0) {
223 dio->curr_user_address += ret * PAGE_SIZE;
224 dio->curr_page += ret;
225 dio->head = 0;
226 dio->tail = ret;
227 ret = 0;
228 }
229out:
230 return ret;
231}
232
233
234
235
236
237
238
239static struct page *dio_get_page(struct dio *dio)
240{
241 if (dio_pages_present(dio) == 0) {
242 int ret;
243
244 ret = dio_refill_pages(dio);
245 if (ret)
246 return ERR_PTR(ret);
247 BUG_ON(dio_pages_present(dio) == 0);
248 }
249 return dio->pages[dio->head++];
250}
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
266{
267 ssize_t transferred = 0;
268
269
270
271
272
273
274
275 if (ret == -EIOCBQUEUED)
276 ret = 0;
277
278 if (dio->result) {
279 transferred = dio->result;
280
281
282 if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
283 transferred = dio->i_size - offset;
284 }
285
286 if (ret == 0)
287 ret = dio->page_errors;
288 if (ret == 0)
289 ret = dio->io_error;
290 if (ret == 0)
291 ret = transferred;
292
293 if (dio->end_io && dio->result) {
294 dio->end_io(dio->iocb, offset, transferred,
295 dio->map_bh.b_private, ret, is_async);
296 } else {
297 if (is_async)
298 aio_complete(dio->iocb, ret, 0);
299 inode_dio_done(dio->inode);
300 }
301
302 return ret;
303}
304
305static int dio_bio_complete(struct dio *dio, struct bio *bio);
306
307
308
309static void dio_bio_end_aio(struct bio *bio, int error)
310{
311 struct dio *dio = bio->bi_private;
312 unsigned long remaining;
313 unsigned long flags;
314
315
316 dio_bio_complete(dio, bio);
317
318 spin_lock_irqsave(&dio->bio_lock, flags);
319 remaining = --dio->refcount;
320 if (remaining == 1 && dio->waiter)
321 wake_up_process(dio->waiter);
322 spin_unlock_irqrestore(&dio->bio_lock, flags);
323
324 if (remaining == 0) {
325 dio_complete(dio, dio->iocb->ki_pos, 0, true);
326 kfree(dio);
327 }
328}
329
330
331
332
333
334
335
336
337static void dio_bio_end_io(struct bio *bio, int error)
338{
339 struct dio *dio = bio->bi_private;
340 unsigned long flags;
341
342 spin_lock_irqsave(&dio->bio_lock, flags);
343 bio->bi_private = dio->bio_list;
344 dio->bio_list = bio;
345 if (--dio->refcount == 1 && dio->waiter)
346 wake_up_process(dio->waiter);
347 spin_unlock_irqrestore(&dio->bio_lock, flags);
348}
349
350
351
352
353
354
355
356
357
358
359void dio_end_io(struct bio *bio, int error)
360{
361 struct dio *dio = bio->bi_private;
362
363 if (dio->is_async)
364 dio_bio_end_aio(bio, error);
365 else
366 dio_bio_end_io(bio, error);
367}
368EXPORT_SYMBOL_GPL(dio_end_io);
369
370static void
371dio_bio_alloc(struct dio *dio, struct block_device *bdev,
372 sector_t first_sector, int nr_vecs)
373{
374 struct bio *bio;
375
376
377
378
379
380 bio = bio_alloc(GFP_KERNEL, nr_vecs);
381
382 bio->bi_bdev = bdev;
383 bio->bi_sector = first_sector;
384 if (dio->is_async)
385 bio->bi_end_io = dio_bio_end_aio;
386 else
387 bio->bi_end_io = dio_bio_end_io;
388
389 dio->bio = bio;
390 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
391}
392
393
394
395
396
397
398
399
400static void dio_bio_submit(struct dio *dio)
401{
402 struct bio *bio = dio->bio;
403 unsigned long flags;
404
405 bio->bi_private = dio;
406
407 spin_lock_irqsave(&dio->bio_lock, flags);
408 dio->refcount++;
409 spin_unlock_irqrestore(&dio->bio_lock, flags);
410
411 if (dio->is_async && dio->rw == READ)
412 bio_set_pages_dirty(bio);
413
414 if (dio->submit_io)
415 dio->submit_io(dio->rw, bio, dio->inode,
416 dio->logical_offset_in_bio);
417 else
418 submit_bio(dio->rw, bio);
419
420 dio->bio = NULL;
421 dio->boundary = 0;
422 dio->logical_offset_in_bio = 0;
423}
424
425
426
427
/*
 * Drop the references on every page still sitting unused in dio->pages[].
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio)) {
		struct page *unused = dio_get_page(dio);

		page_cache_release(unused);
	}
}
433
434
435
436
437
438
439
440static struct bio *dio_await_one(struct dio *dio)
441{
442 unsigned long flags;
443 struct bio *bio = NULL;
444
445 spin_lock_irqsave(&dio->bio_lock, flags);
446
447
448
449
450
451
452
453 while (dio->refcount > 1 && dio->bio_list == NULL) {
454 __set_current_state(TASK_UNINTERRUPTIBLE);
455 dio->waiter = current;
456 spin_unlock_irqrestore(&dio->bio_lock, flags);
457 io_schedule();
458
459 spin_lock_irqsave(&dio->bio_lock, flags);
460 dio->waiter = NULL;
461 }
462 if (dio->bio_list) {
463 bio = dio->bio_list;
464 dio->bio_list = bio->bi_private;
465 }
466 spin_unlock_irqrestore(&dio->bio_lock, flags);
467 return bio;
468}
469
470
471
472
473static int dio_bio_complete(struct dio *dio, struct bio *bio)
474{
475 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
476 struct bio_vec *bvec = bio->bi_io_vec;
477 int page_no;
478
479 if (!uptodate)
480 dio->io_error = -EIO;
481
482 if (dio->is_async && dio->rw == READ) {
483 bio_check_pages_dirty(bio);
484 } else {
485 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
486 struct page *page = bvec[page_no].bv_page;
487
488 if (dio->rw == READ && !PageCompound(page))
489 set_page_dirty_lock(page);
490 page_cache_release(page);
491 }
492 bio_put(bio);
493 }
494 return uptodate ? 0 : -EIO;
495}
496
497
498
499
500
501
502
503
504static void dio_await_completion(struct dio *dio)
505{
506 struct bio *bio;
507 do {
508 bio = dio_await_one(dio);
509 if (bio)
510 dio_bio_complete(dio, bio);
511 } while (bio);
512}
513
514
515
516
517
518
519
520
521static int dio_bio_reap(struct dio *dio)
522{
523 int ret = 0;
524
525 if (dio->reap_counter++ >= 64) {
526 while (dio->bio_list) {
527 unsigned long flags;
528 struct bio *bio;
529 int ret2;
530
531 spin_lock_irqsave(&dio->bio_lock, flags);
532 bio = dio->bio_list;
533 dio->bio_list = bio->bi_private;
534 spin_unlock_irqrestore(&dio->bio_lock, flags);
535 ret2 = dio_bio_complete(dio, bio);
536 if (ret == 0)
537 ret = ret2;
538 }
539 dio->reap_counter = 0;
540 }
541 return ret;
542}
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567static int get_more_blocks(struct dio *dio)
568{
569 int ret;
570 struct buffer_head *map_bh = &dio->map_bh;
571 sector_t fs_startblk;
572 unsigned long fs_count;
573 unsigned long dio_count;
574 unsigned long blkmask;
575 int create;
576
577
578
579
580
581 ret = dio->page_errors;
582 if (ret == 0) {
583 BUG_ON(dio->block_in_file >= dio->final_block_in_request);
584 fs_startblk = dio->block_in_file >> dio->blkfactor;
585 dio_count = dio->final_block_in_request - dio->block_in_file;
586 fs_count = dio_count >> dio->blkfactor;
587 blkmask = (1 << dio->blkfactor) - 1;
588 if (dio_count & blkmask)
589 fs_count++;
590
591 map_bh->b_state = 0;
592 map_bh->b_size = fs_count << dio->inode->i_blkbits;
593
594
595
596
597
598
599
600
601
602
603
604
605 create = dio->rw & WRITE;
606 if (dio->flags & DIO_SKIP_HOLES) {
607 if (dio->block_in_file < (i_size_read(dio->inode) >>
608 dio->blkbits))
609 create = 0;
610 }
611
612 ret = (*dio->get_block)(dio->inode, fs_startblk,
613 map_bh, create);
614 }
615 return ret;
616}
617
618
619
620
621static int dio_new_bio(struct dio *dio, sector_t start_sector)
622{
623 sector_t sector;
624 int ret, nr_pages;
625
626 ret = dio_bio_reap(dio);
627 if (ret)
628 goto out;
629 sector = start_sector << (dio->blkbits - 9);
630 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
631 nr_pages = min(nr_pages, BIO_MAX_PAGES);
632 BUG_ON(nr_pages <= 0);
633 dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
634 dio->boundary = 0;
635out:
636 return ret;
637}
638
639
640
641
642
643
644
645
646static int dio_bio_add_page(struct dio *dio)
647{
648 int ret;
649
650 ret = bio_add_page(dio->bio, dio->cur_page,
651 dio->cur_page_len, dio->cur_page_offset);
652 if (ret == dio->cur_page_len) {
653
654
655
656 if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
657 dio->pages_in_io--;
658 page_cache_get(dio->cur_page);
659 dio->final_block_in_bio = dio->cur_page_block +
660 (dio->cur_page_len >> dio->blkbits);
661 ret = 0;
662 } else {
663 ret = 1;
664 }
665 return ret;
666}
667
668
669
670
671
672
673
674
675
676
677
678static int dio_send_cur_page(struct dio *dio)
679{
680 int ret = 0;
681
682 if (dio->bio) {
683 loff_t cur_offset = dio->cur_page_fs_offset;
684 loff_t bio_next_offset = dio->logical_offset_in_bio +
685 dio->bio->bi_size;
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701 if (dio->final_block_in_bio != dio->cur_page_block ||
702 cur_offset != bio_next_offset)
703 dio_bio_submit(dio);
704
705
706
707
708 else if (dio->boundary)
709 dio_bio_submit(dio);
710 }
711
712 if (dio->bio == NULL) {
713 ret = dio_new_bio(dio, dio->cur_page_block);
714 if (ret)
715 goto out;
716 }
717
718 if (dio_bio_add_page(dio) != 0) {
719 dio_bio_submit(dio);
720 ret = dio_new_bio(dio, dio->cur_page_block);
721 if (ret == 0) {
722 ret = dio_bio_add_page(dio);
723 BUG_ON(ret != 0);
724 }
725 }
726out:
727 return ret;
728}
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747static int
748submit_page_section(struct dio *dio, struct page *page,
749 unsigned offset, unsigned len, sector_t blocknr)
750{
751 int ret = 0;
752
753 if (dio->rw & WRITE) {
754
755
756
757 task_io_account_write(len);
758 }
759
760
761
762
763 if ( (dio->cur_page == page) &&
764 (dio->cur_page_offset + dio->cur_page_len == offset) &&
765 (dio->cur_page_block +
766 (dio->cur_page_len >> dio->blkbits) == blocknr)) {
767 dio->cur_page_len += len;
768
769
770
771
772
773 if (dio->boundary) {
774 ret = dio_send_cur_page(dio);
775 page_cache_release(dio->cur_page);
776 dio->cur_page = NULL;
777 }
778 goto out;
779 }
780
781
782
783
784 if (dio->cur_page) {
785 ret = dio_send_cur_page(dio);
786 page_cache_release(dio->cur_page);
787 dio->cur_page = NULL;
788 if (ret)
789 goto out;
790 }
791
792 page_cache_get(page);
793 dio->cur_page = page;
794 dio->cur_page_offset = offset;
795 dio->cur_page_len = len;
796 dio->cur_page_block = blocknr;
797 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
798out:
799 return ret;
800}
801
802
803
804
805
806
807static void clean_blockdev_aliases(struct dio *dio)
808{
809 unsigned i;
810 unsigned nblocks;
811
812 nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;
813
814 for (i = 0; i < nblocks; i++) {
815 unmap_underlying_metadata(dio->map_bh.b_bdev,
816 dio->map_bh.b_blocknr + i);
817 }
818}
819
820
821
822
823
824
825
826
827
828
829static void dio_zero_block(struct dio *dio, int end)
830{
831 unsigned dio_blocks_per_fs_block;
832 unsigned this_chunk_blocks;
833 unsigned this_chunk_bytes;
834 struct page *page;
835
836 dio->start_zero_done = 1;
837 if (!dio->blkfactor || !buffer_new(&dio->map_bh))
838 return;
839
840 dio_blocks_per_fs_block = 1 << dio->blkfactor;
841 this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
842
843 if (!this_chunk_blocks)
844 return;
845
846
847
848
849
850 if (end)
851 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
852
853 this_chunk_bytes = this_chunk_blocks << dio->blkbits;
854
855 page = ZERO_PAGE(0);
856 if (submit_page_section(dio, page, 0, this_chunk_bytes,
857 dio->next_block_for_io))
858 return;
859
860 dio->next_block_for_io += this_chunk_blocks;
861}
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879static int do_direct_IO(struct dio *dio)
880{
881 const unsigned blkbits = dio->blkbits;
882 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
883 struct page *page;
884 unsigned block_in_page;
885 struct buffer_head *map_bh = &dio->map_bh;
886 int ret = 0;
887
888
889 block_in_page = dio->first_block_in_page;
890
891 while (dio->block_in_file < dio->final_block_in_request) {
892 page = dio_get_page(dio);
893 if (IS_ERR(page)) {
894 ret = PTR_ERR(page);
895 goto out;
896 }
897
898 while (block_in_page < blocks_per_page) {
899 unsigned offset_in_page = block_in_page << blkbits;
900 unsigned this_chunk_bytes;
901 unsigned this_chunk_blocks;
902 unsigned u;
903
904 if (dio->blocks_available == 0) {
905
906
907
908 unsigned long blkmask;
909 unsigned long dio_remainder;
910
911 ret = get_more_blocks(dio);
912 if (ret) {
913 page_cache_release(page);
914 goto out;
915 }
916 if (!buffer_mapped(map_bh))
917 goto do_holes;
918
919 dio->blocks_available =
920 map_bh->b_size >> dio->blkbits;
921 dio->next_block_for_io =
922 map_bh->b_blocknr << dio->blkfactor;
923 if (buffer_new(map_bh))
924 clean_blockdev_aliases(dio);
925
926 if (!dio->blkfactor)
927 goto do_holes;
928
929 blkmask = (1 << dio->blkfactor) - 1;
930 dio_remainder = (dio->block_in_file & blkmask);
931
932
933
934
935
936
937
938
939
940
941
942
943 if (!buffer_new(map_bh))
944 dio->next_block_for_io += dio_remainder;
945 dio->blocks_available -= dio_remainder;
946 }
947do_holes:
948
949 if (!buffer_mapped(map_bh)) {
950 loff_t i_size_aligned;
951
952
953 if (dio->rw & WRITE) {
954 page_cache_release(page);
955 return -ENOTBLK;
956 }
957
958
959
960
961
962 i_size_aligned = ALIGN(i_size_read(dio->inode),
963 1 << blkbits);
964 if (dio->block_in_file >=
965 i_size_aligned >> blkbits) {
966
967 page_cache_release(page);
968 goto out;
969 }
970 zero_user(page, block_in_page << blkbits,
971 1 << blkbits);
972 dio->block_in_file++;
973 block_in_page++;
974 goto next_block;
975 }
976
977
978
979
980
981
982 if (unlikely(dio->blkfactor && !dio->start_zero_done))
983 dio_zero_block(dio, 0);
984
985
986
987
988
989 this_chunk_blocks = dio->blocks_available;
990 u = (PAGE_SIZE - offset_in_page) >> blkbits;
991 if (this_chunk_blocks > u)
992 this_chunk_blocks = u;
993 u = dio->final_block_in_request - dio->block_in_file;
994 if (this_chunk_blocks > u)
995 this_chunk_blocks = u;
996 this_chunk_bytes = this_chunk_blocks << blkbits;
997 BUG_ON(this_chunk_bytes == 0);
998
999 dio->boundary = buffer_boundary(map_bh);
1000 ret = submit_page_section(dio, page, offset_in_page,
1001 this_chunk_bytes, dio->next_block_for_io);
1002 if (ret) {
1003 page_cache_release(page);
1004 goto out;
1005 }
1006 dio->next_block_for_io += this_chunk_blocks;
1007
1008 dio->block_in_file += this_chunk_blocks;
1009 block_in_page += this_chunk_blocks;
1010 dio->blocks_available -= this_chunk_blocks;
1011next_block:
1012 BUG_ON(dio->block_in_file > dio->final_block_in_request);
1013 if (dio->block_in_file == dio->final_block_in_request)
1014 break;
1015 }
1016
1017
1018 page_cache_release(page);
1019 block_in_page = 0;
1020 }
1021out:
1022 return ret;
1023}
1024
1025static ssize_t
1026direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1027 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
1028 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
1029 dio_submit_t submit_io, struct dio *dio)
1030{
1031 unsigned long user_addr;
1032 unsigned long flags;
1033 int seg;
1034 ssize_t ret = 0;
1035 ssize_t ret2;
1036 size_t bytes;
1037
1038 dio->inode = inode;
1039 dio->rw = rw;
1040 dio->blkbits = blkbits;
1041 dio->blkfactor = inode->i_blkbits - blkbits;
1042 dio->block_in_file = offset >> blkbits;
1043
1044 dio->get_block = get_block;
1045 dio->end_io = end_io;
1046 dio->submit_io = submit_io;
1047 dio->final_block_in_bio = -1;
1048 dio->next_block_for_io = -1;
1049
1050 dio->iocb = iocb;
1051 dio->i_size = i_size_read(inode);
1052
1053 spin_lock_init(&dio->bio_lock);
1054 dio->refcount = 1;
1055
1056
1057
1058
1059
1060 if (unlikely(dio->blkfactor))
1061 dio->pages_in_io = 2;
1062
1063 for (seg = 0; seg < nr_segs; seg++) {
1064 user_addr = (unsigned long)iov[seg].iov_base;
1065 dio->pages_in_io +=
1066 ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
1067 - user_addr/PAGE_SIZE);
1068 }
1069
1070 for (seg = 0; seg < nr_segs; seg++) {
1071 user_addr = (unsigned long)iov[seg].iov_base;
1072 dio->size += bytes = iov[seg].iov_len;
1073
1074
1075 dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1076 dio->final_block_in_request = dio->block_in_file +
1077 (bytes >> blkbits);
1078
1079 dio->head = 0;
1080 dio->tail = 0;
1081 dio->curr_page = 0;
1082
1083 dio->total_pages = 0;
1084 if (user_addr & (PAGE_SIZE-1)) {
1085 dio->total_pages++;
1086 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1087 }
1088 dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1089 dio->curr_user_address = user_addr;
1090
1091 ret = do_direct_IO(dio);
1092
1093 dio->result += iov[seg].iov_len -
1094 ((dio->final_block_in_request - dio->block_in_file) <<
1095 blkbits);
1096
1097 if (ret) {
1098 dio_cleanup(dio);
1099 break;
1100 }
1101 }
1102
1103 if (ret == -ENOTBLK) {
1104
1105
1106
1107
1108 ret = 0;
1109 }
1110
1111
1112
1113
1114 dio_zero_block(dio, 1);
1115
1116 if (dio->cur_page) {
1117 ret2 = dio_send_cur_page(dio);
1118 if (ret == 0)
1119 ret = ret2;
1120 page_cache_release(dio->cur_page);
1121 dio->cur_page = NULL;
1122 }
1123 if (dio->bio)
1124 dio_bio_submit(dio);
1125
1126
1127
1128
1129
1130 dio_cleanup(dio);
1131
1132
1133
1134
1135
1136
1137 if (rw == READ && (dio->flags & DIO_LOCKING))
1138 mutex_unlock(&dio->inode->i_mutex);
1139
1140
1141
1142
1143
1144
1145
1146
1147 BUG_ON(ret == -EIOCBQUEUED);
1148 if (dio->is_async && ret == 0 && dio->result &&
1149 ((rw & READ) || (dio->result == dio->size)))
1150 ret = -EIOCBQUEUED;
1151
1152 if (ret != -EIOCBQUEUED)
1153 dio_await_completion(dio);
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166 spin_lock_irqsave(&dio->bio_lock, flags);
1167 ret2 = --dio->refcount;
1168 spin_unlock_irqrestore(&dio->bio_lock, flags);
1169
1170 if (ret2 == 0) {
1171 ret = dio_complete(dio, offset, ret, false);
1172 kfree(dio);
1173 } else
1174 BUG_ON(ret != -EIOCBQUEUED);
1175
1176 return ret;
1177}
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199ssize_t
1200__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1201 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1202 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1203 dio_submit_t submit_io, int flags)
1204{
1205 int seg;
1206 size_t size;
1207 unsigned long addr;
1208 unsigned blkbits = inode->i_blkbits;
1209 unsigned bdev_blkbits = 0;
1210 unsigned blocksize_mask = (1 << blkbits) - 1;
1211 ssize_t retval = -EINVAL;
1212 loff_t end = offset;
1213 struct dio *dio;
1214
1215 if (rw & WRITE)
1216 rw = WRITE_ODIRECT;
1217
1218 if (bdev)
1219 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
1220
1221 if (offset & blocksize_mask) {
1222 if (bdev)
1223 blkbits = bdev_blkbits;
1224 blocksize_mask = (1 << blkbits) - 1;
1225 if (offset & blocksize_mask)
1226 goto out;
1227 }
1228
1229
1230 for (seg = 0; seg < nr_segs; seg++) {
1231 addr = (unsigned long)iov[seg].iov_base;
1232 size = iov[seg].iov_len;
1233 end += size;
1234 if ((addr & blocksize_mask) || (size & blocksize_mask)) {
1235 if (bdev)
1236 blkbits = bdev_blkbits;
1237 blocksize_mask = (1 << blkbits) - 1;
1238 if ((addr & blocksize_mask) || (size & blocksize_mask))
1239 goto out;
1240 }
1241 }
1242
1243
1244 if (rw == READ && end == offset)
1245 return 0;
1246
1247 dio = kmalloc(sizeof(*dio), GFP_KERNEL);
1248 retval = -ENOMEM;
1249 if (!dio)
1250 goto out;
1251
1252
1253
1254
1255
1256 memset(dio, 0, offsetof(struct dio, pages));
1257
1258 dio->flags = flags;
1259 if (dio->flags & DIO_LOCKING) {
1260 if (rw == READ) {
1261 struct address_space *mapping =
1262 iocb->ki_filp->f_mapping;
1263
1264
1265 mutex_lock(&inode->i_mutex);
1266
1267 retval = filemap_write_and_wait_range(mapping, offset,
1268 end - 1);
1269 if (retval) {
1270 mutex_unlock(&inode->i_mutex);
1271 kfree(dio);
1272 goto out;
1273 }
1274 }
1275 }
1276
1277
1278
1279
1280 atomic_inc(&inode->i_dio_count);
1281
1282
1283
1284
1285
1286
1287
1288 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
1289 (end > i_size_read(inode)));
1290
1291 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1292 nr_segs, blkbits, get_block, end_io,
1293 submit_io, dio);
1294
1295out:
1296 return retval;
1297}
1298EXPORT_SYMBOL(__blockdev_direct_IO);
1299