1
2
3
4
5
6
7
8
9
10
11
12#include <linux/malloc.h>
13#include <linux/shm.h>
14#include <linux/mman.h>
15#include <linux/locks.h>
16#include <linux/pagemap.h>
17#include <linux/swap.h>
18#include <linux/smp_lock.h>
19#include <linux/blkdev.h>
20#include <linux/file.h>
21#include <linux/swapctl.h>
22#include <linux/init.h>
23
24#include <asm/pgtable.h>
25#include <asm/uaccess.h>
26
27
28
29
30
31
32
33
/* Page cache page counter (presumably maintained by helpers outside this file -- TODO confirm). */
unsigned long page_cache_size = 0;

/* Hash table geometry; both values are computed in page_cache_init(). */
unsigned int page_hash_bits;
unsigned int page_hash_mask;

/* The page hash table itself; allocated and zeroed in page_cache_init(). */
struct page **page_hash_table;
37
38static inline int sync_page(struct page *page)
39{
40 struct inode *inode = page->inode;
41
42 if (inode && inode->i_op && inode->i_op->sync_page)
43 return inode->i_op->sync_page(page);
44 run_task_queue(&tq_disk);
45 return 0;
46}
47
48
49
50
51
52
53void invalidate_inode_pages(struct inode * inode)
54{
55 struct page ** p;
56 struct page * page;
57
58 p = &inode->i_pages;
59 while ((page = *p) != NULL) {
60 if (PageLocked(page)) {
61 p = &page->next;
62 continue;
63 }
64 inode->i_nrpages--;
65 if ((*p = page->next) != NULL)
66 (*p)->prev = page->prev;
67 page->next = NULL;
68 page->prev = NULL;
69 remove_page_from_hash_queue(page);
70 page->inode = NULL;
71 page_cache_release(page);
72 continue;
73 }
74}
75
76
77
78
79
80void truncate_inode_pages(struct inode * inode, unsigned long start)
81{
82 struct page ** p;
83 struct page * page;
84
85repeat:
86 p = &inode->i_pages;
87 while ((page = *p) != NULL) {
88 unsigned long offset = page->offset;
89
90
91 if (offset >= start) {
92 if (PageLocked(page)) {
93 wait_on_page(page);
94 goto repeat;
95 }
96 inode->i_nrpages--;
97 if ((*p = page->next) != NULL)
98 (*p)->prev = page->prev;
99 page->next = NULL;
100 page->prev = NULL;
101 remove_page_from_hash_queue(page);
102 page->inode = NULL;
103 page_cache_release(page);
104 continue;
105 }
106 p = &page->next;
107 offset = start - offset;
108
109 if (offset < PAGE_CACHE_SIZE) {
110 unsigned long address = page_address(page);
111 memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
112 flush_page_to_ram(address);
113 }
114 }
115}
116
117
118
119
/*
 * Take a page out of the page cache: unhash it, unlink it from its
 * inode's page queue, and drop one reference via page_cache_release().
 */
void remove_inode_page(struct page *page)
{
	remove_page_from_hash_queue(page);
	remove_page_from_inode_queue(page);
	page_cache_release(page);
}
126
/*
 * Scan mem_map with a persistent clock hand looking for one page that can
 * be released: a swap-cache page, a page with buffers, or a page-cache page.
 *
 * priority: divides the number of pages examined per call.
 * gfp_mask: when __GFP_DMA is set, only pages for which PageDMA() is true
 *           are considered.
 *
 * Returns 1 as soon as one page has been freed, 0 if the scan budget ran
 * out without freeing anything.
 *
 * NOTE(review): "limit / priority" divides by zero if priority == 0;
 * confirm that callers never pass 0.
 */
int shrink_mmap(int priority, int gfp_mask)
{
	/* Clock hand; static, so it persists across calls. */
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;
	struct page * page;
	int count;

	/* Scan budget for this call. */
	count = limit / priority;

 refresh_clock:
	/* (Re)derive the page pointer from the clock hand. */
	page = mem_map + clock;
	do {
		int referenced;

		/* Yield if asked to, then re-read the clock hand. */
		if (current->need_resched) {
			current->state = TASK_RUNNING;
			schedule();
			goto refresh_clock;
		}

		/* Advance the hand, wrapping at the end of mem_map. */
		page++;
		clock++;
		if (clock >= max_mapnr) {
			clock = 0;
			page = mem_map;
		}
		if (PageSkip(page)) {
			/* For a "skip" page, next_hash is used as the next page to examine. */
			page = page->next_hash;
			clock = page - mem_map;
		}

		count--;

		/* Only pages with a reference count of exactly 1 are candidates. */
		if (atomic_read(&page->count) != 1)
			continue;

		/* Sample and clear the referenced bit in one step. */
		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if (PageLocked(page))
			continue;

		/* Wrong memory type for a DMA request: does not count against the budget. */
		if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) {
			count++;
			continue;
		}

		/* Swap-cache page: kept only if referenced and swap_count() is not 1. */
		if (PageSwapCache(page)) {
			if (referenced && swap_count(page->offset) != 1)
				continue;
			delete_from_swap_cache(page);
			return 1;
		}

		/* Recently referenced pages get another pass. */
		if (referenced)
			continue;

		/* Page carrying buffers. */
		if (page->buffers) {
			if (buffer_under_min())
				continue;

			/* On failure, restart from the clock hand instead of continuing. */
			if (!try_to_free_buffers(page, gfp_mask))
				goto refresh_clock;
			return 1;
		}

		/* Ordinary page-cache page. */
		if (page->inode) {
			if (pgcache_under_min())
				continue;
			remove_inode_page(page);
			return 1;
		}
	} while (count > 0);
	return 0;
}
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241void update_vm_cache_conditional(struct inode * inode, unsigned long pos, const char * buf, int count, unsigned long source_address)
242{
243 unsigned long offset, len;
244
245 offset = (pos & ~PAGE_CACHE_MASK);
246 pos = pos & PAGE_CACHE_MASK;
247 len = PAGE_CACHE_SIZE - offset;
248 do {
249 struct page * page;
250
251 if (len > count)
252 len = count;
253 page = find_page(inode, pos);
254 if (page) {
255 char *dest = (char*) (offset + page_address(page));
256
257 if ((unsigned long)dest != source_address
258 || !segment_eq(get_fs(), KERNEL_DS)) {
259 wait_on_page(page);
260 memcpy(dest, buf, len);
261 flush_dcache_page(page_address(page));
262 }
263 page_cache_release(page);
264 }
265 count -= len;
266 buf += len;
267 len = PAGE_CACHE_SIZE;
268 offset = 0;
269 pos += PAGE_CACHE_SIZE;
270 } while (count);
271}
272
/*
 * Unconditional variant of update_vm_cache_conditional(): passes 0 as
 * the source address.
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
	update_vm_cache_conditional(inode, pos, buf, count, 0);
}
277
278
279static inline void add_to_page_cache(struct page * page,
280 struct inode * inode, unsigned long offset,
281 struct page **hash)
282{
283 atomic_inc(&page->count);
284 page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
285 page->offset = offset;
286 add_page_to_inode_queue(inode, page);
287 __add_page_to_hash_queue(page, hash);
288}
289
290
291
292
293
294
295static unsigned long try_to_read_ahead(struct file * file,
296 unsigned long offset, unsigned long page_cache)
297{
298 struct inode *inode = file->f_dentry->d_inode;
299 struct page * page;
300 struct page ** hash;
301
302 offset &= PAGE_CACHE_MASK;
303 switch (page_cache) {
304 case 0:
305 page_cache = page_cache_alloc();
306 if (!page_cache)
307 break;
308 default:
309 if (offset >= inode->i_size)
310 break;
311 hash = page_hash(inode, offset);
312 page = __find_page(inode, offset, *hash);
313 if (!page) {
314
315
316
317 page = page_cache_entry(page_cache);
318 add_to_page_cache(page, inode, offset, hash);
319 inode->i_op->readpage(file, page);
320 page_cache = 0;
321 }
322 page_cache_release(page);
323 }
324 return page_cache;
325}
326
327
328
329
330
331
332
333
334void __wait_on_page(struct page *page)
335{
336 struct task_struct *tsk = current;
337 struct wait_queue wait;
338
339 wait.task = tsk;
340 add_wait_queue(&page->wait, &wait);
341repeat:
342 tsk->state = TASK_UNINTERRUPTIBLE;
343 sync_page(page);
344 if (PageLocked(page)) {
345 schedule();
346 goto repeat;
347 }
348 tsk->state = TASK_RUNNING;
349 remove_wait_queue(&page->wait, &wait);
350}
351
/*
 * Read-ahead instrumentation switches; disabled by the "#if 0".
 * PROFILE_READAHEAD compiles in profile_readahead() and its call site,
 * DEBUG_READAHEAD adds a per-file snapshot line to its output.
 */
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif
356
357
358
359
360
361
362
363
364
365
366
367
#ifdef PROFILE_READAHEAD

/* Number of samples accumulated before the averages are printed and reset. */
#define PROFILE_MAXREADCOUNT 1000

/* Running totals since the last report. */
static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

/*
 * Account one read-ahead event for "filp" ("async" non-zero marks it as
 * asynchronous).  Once more than PROFILE_MAXREADCOUNT events have been
 * collected, print the averages and reset the totals with interrupts
 * disabled.
 */
static void profile_readahead(int async, struct file *filp)
{
	unsigned long irq_state;

	total_reada++;
	if (async)
		total_async++;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada <= PROFILE_MAXREADCOUNT)
		return;

	save_flags(irq_state);
	cli();

	/* Re-check now that interrupts are off. */
	if (total_reada <= PROFILE_MAXREADCOUNT) {
		restore_flags(irq_state);
		return;
	}

	printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
		total_ramax/total_reada,
		total_ralen/total_reada,
		total_rawin/total_reada,
		(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
	printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
		filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

	total_reada = 0;
	total_async = 0;
	total_ramax = 0;
	total_ralen = 0;
	total_rawin = 0;

	restore_flags(irq_state);
}
#endif
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475static inline int get_max_readahead(struct inode * inode)
476{
477 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
478 return MAX_READAHEAD;
479 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
480}
481
/*
 * Decide whether to read ahead past the page at "ppos" and issue the
 * reads through try_to_read_ahead(), updating the per-file read-ahead
 * state (f_raend, f_ralen, f_rawin, f_ramax).
 *
 * reada_ok:   non-zero if the caller considers read-ahead acceptable.
 * page:       the page currently being read; its lock state selects the mode.
 * page_cache: spare free page (or 0), threaded through try_to_read_ahead().
 *
 * Returns the spare page that is left over (or 0).
 */
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
{
	unsigned long max_ahead, ahead;
	unsigned long raend;
	/* Local cap; note that it shadows the global max_readahead[] table. */
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;
	max_ahead = 0;

	/*
	 * The current page is locked.  If there is no previous read-ahead,
	 * or the current position lies outside the last read-ahead range,
	 * start a new read-ahead context at ppos.
	 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_CACHE_SIZE;
			/* Nothing to read ahead: just record the one-page window. */
			if (!max_ahead) {
				filp->f_raend  = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
	/*
	 * The current page is not locked.  If read-ahead is allowed and the
	 * position falls inside the previous read-ahead range, extend it;
	 * reada_ok == 2 marks this path for the code below.
	 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
		/* Step back one page; the loop below adds a page before each read. */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok      = 2;
		}
	}

	/* Issue the read-ahead requests, one page at a time. */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
						page_cache);
	}

	/*
	 * If anything was requested, record the new read-ahead range,
	 * double f_ramax (capped at max_readahead) and, on the
	 * reada_ok == 2 path, run the tq_disk task queue.
	 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return page_cache;
}
583
584
585
586
587
588
589
590
591
592
/*
 * State shared between do_generic_file_read() and its actor callback.
 */
typedef struct {
	size_t written;		/* bytes handed over so far */
	size_t count;		/* bytes still wanted */
	char * buf;		/* actor-specific cursor: a user buffer for file_read_actor,
				   a struct file pointer in disguise for file_send_actor */
	int error;		/* error code recorded by the actor or the read loop; 0 if none */
} read_descriptor_t;

/*
 * Actor callback: consumes up to "size" bytes starting at the given
 * kernel address and returns how many bytes it actually took.
 */
typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
601
602
603
604
605
606
607
608
609
/*
 * Generic page-cache read loop.
 *
 * Starting at *ppos, locates (or creates and reads in) each page of the
 * file and hands its data to "actor".  The loop ends at end of file,
 * when the actor takes no bytes, when desc->count reaches zero, or when
 * an error is stored in desc->error.  On exit *ppos is updated,
 * filp->f_reada is set, any unused spare page is freed and the inode's
 * access time is updated.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	/* Spare free page carried across iterations; 0 when there is none. */
	unsigned long page_cache;
	size_t pos, pgpos;
	int reada_ok;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_CACHE_MASK;

	/*
	 * If the current page position is outside the previous read-ahead
	 * window, reset the read-ahead context and disallow read-ahead;
	 * otherwise allow it.
	 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}

	/*
	 * Requests ending within the first half page get no read-ahead.
	 * Otherwise f_ramax is raised to cover the request (and to
	 * MIN_READAHEAD when read-ahead is allowed), capped at max_readahead.
	 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/* Look the page up in the page cache. */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);
		page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
		if (!page)
			goto no_cached_page;

found_page:
		/*
		 * Run the read-ahead logic if the page is uptodate or locked.
		 * A page that is neither clamps f_ramax down to MIN_READAHEAD
		 * when read-ahead is allowed.
		 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;

		wait_on_page(page);

		if (!PageUptodate(page))
			goto page_read_error;

success:
		/* The page is uptodate: pass its data to the actor. */
	{
		unsigned long offset, nr;

		/* Flush the data cache if the inode has shared mappings. */
		if (inode->i_mmap_shared != NULL)
			flush_dcache_page(page_address(page));

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		/* Never go past end of file. */
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor returns how many bytes it took; it is also
		 * responsible for updating desc->count / desc->written.
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		pos += nr;
		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;
	}

no_cached_page:
		/*
		 * The page is not cached.  Get a spare page first, then go
		 * around the loop again so the lookup is repeated.
		 */
		if (!page_cache) {
			page_cache = page_cache_alloc();
			/* "continue" redoes the cache lookup at the top of the loop. */
			if (page_cache)
				continue;
			desc->error = -ENOMEM;
			break;
		}

		/* Install the spare page in the cache for this position. */
		page = page_cache_entry(page_cache);
		page_cache = 0;
		add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);

		/* A cache miss clamps f_ramax to MIN_READAHEAD when read-ahead is allowed. */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		{
			int error = inode->i_op->readpage(filp, page);
			if (!error)
				goto found_page;
			desc->error = error;
			page_cache_release(page);
			break;
		}

page_read_error:
		/*
		 * The page was found but is not uptodate after waiting:
		 * retry the read once and wait for it.
		 */
		{
			int error = inode->i_op->readpage(filp, page);
			if (!error) {
				wait_on_page(page);
				if (PageUptodate(page) && !PageError(page))
					goto success;
				error = -EIO;
			}
			desc->error = error;
			page_cache_release(page);
			break;
		}
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		page_cache_free(page_cache);
	UPDATE_ATIME(inode);
}
806
807static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
808{
809 unsigned long left;
810 unsigned long count = desc->count;
811
812 if (size > count)
813 size = count;
814 left = __copy_to_user(desc->buf, area, size);
815 if (left) {
816 size -= left;
817 desc->error = -EFAULT;
818 }
819 desc->count = count - size;
820 desc->written += size;
821 desc->buf += size;
822 return size;
823}
824
825
826
827
828
829ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
830{
831 ssize_t retval;
832
833 retval = -EFAULT;
834 if (access_ok(VERIFY_WRITE, buf, count)) {
835 retval = 0;
836 if (count) {
837 read_descriptor_t desc;
838
839 desc.written = 0;
840 desc.count = count;
841 desc.buf = buf;
842 desc.error = 0;
843 do_generic_file_read(filp, ppos, &desc, file_read_actor);
844
845 retval = desc.written;
846 if (!retval)
847 retval = desc.error;
848 }
849 }
850 return retval;
851}
852
853static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
854{
855 ssize_t written;
856 unsigned long count = desc->count;
857 struct file *file = (struct file *) desc->buf;
858 struct inode *inode = file->f_dentry->d_inode;
859 mm_segment_t old_fs;
860
861 if (size > count)
862 size = count;
863 fs_down(&inode->i_sem);
864 old_fs = get_fs();
865 set_fs(KERNEL_DS);
866 written = file->f_op->write(file, area, size, &file->f_pos);
867 set_fs(old_fs);
868 fs_up(&inode->i_sem);
869 if (written < 0) {
870 desc->error = written;
871 written = 0;
872 }
873 desc->count = count - written;
874 desc->written += written;
875 return written;
876}
877
878asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
879{
880 ssize_t retval;
881 struct file * in_file, * out_file;
882 struct inode * in_inode, * out_inode;
883
884 lock_kernel();
885
886
887
888
889 retval = -EBADF;
890 in_file = fget(in_fd);
891 if (!in_file)
892 goto out;
893 if (!(in_file->f_mode & FMODE_READ))
894 goto fput_in;
895 retval = -EINVAL;
896 in_inode = in_file->f_dentry->d_inode;
897 if (!in_inode)
898 goto fput_in;
899 if (!in_inode->i_op || !in_inode->i_op->readpage)
900 goto fput_in;
901 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
902 if (retval)
903 goto fput_in;
904
905
906
907
908 retval = -EBADF;
909 out_file = fget(out_fd);
910 if (!out_file)
911 goto fput_in;
912 if (!(out_file->f_mode & FMODE_WRITE))
913 goto fput_out;
914 retval = -EINVAL;
915 if (!out_file->f_op || !out_file->f_op->write)
916 goto fput_out;
917 out_inode = out_file->f_dentry->d_inode;
918 if (!out_inode)
919 goto fput_out;
920 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
921 if (retval)
922 goto fput_out;
923
924 retval = 0;
925 if (count) {
926 read_descriptor_t desc;
927 loff_t pos = 0, *ppos;
928
929 retval = -EFAULT;
930 ppos = &in_file->f_pos;
931 if (offset) {
932 if (get_user(pos, offset))
933 goto fput_out;
934 ppos = &pos;
935 }
936
937 desc.written = 0;
938 desc.count = count;
939 desc.buf = (char *) out_file;
940 desc.error = 0;
941 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
942
943 retval = desc.written;
944 if (!retval)
945 retval = desc.error;
946 if (offset)
947 put_user(pos, offset);
948 }
949
950
951fput_out:
952 fput(out_file);
953fput_in:
954 fput(in_file);
955out:
956 unlock_kernel();
957 return retval;
958}
959
960
961
962
963
964
965
966
967
968
969
970
971
/*
 * nopage handler for file mappings: produce the page backing "address"
 * in the mapping "area".
 *
 * no_share: when non-zero the caller gets a private copy of the cached
 *           page instead of the cached page itself.
 *
 * Returns the kernel address of the page to map, 0 if no page can be
 * provided (beyond end of file for a shared mapping of the current mm,
 * or a read failure), or -1 if memory could not be allocated.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	/* new_page: spare free page, 0 when there is none. */
	new_page = 0;
	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page;

	/*
	 * Look for the page in the page cache.
	 */
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page)
		goto no_cached_page;

found_page:
	/*
	 * A private copy will be needed: make sure the destination page
	 * exists before going any further.
	 */
	if (no_share && !new_page) {
		new_page = page_cache_alloc();
		if (!new_page)
			goto release_and_oom;
	}

	if (PageLocked(page))
		goto page_locked_wait;
	if (!PageUptodate(page))
		goto page_read_error;

success:
	/*
	 * The cached page is uptodate: either hand it out directly or
	 * copy it for the caller.
	 */
	old_page = page_address(page);
	if (!no_share) {
		/*
		 * Sharing the cached page; any spare page is not needed.
		 * The reference obtained from the lookup is kept and goes
		 * with the returned page.
		 */
		if (new_page)
			page_cache_free(new_page);

		flush_page_to_ram(old_page);
		return old_page;
	}

	/*
	 * Private copy: duplicate the data and drop the cached page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);
	return new_page;

no_cached_page:
	/*
	 * Cache miss: read ahead a cluster of (1 << page_cluster) pages
	 * aligned on a cluster boundary that contains "offset".
	 */
	reada = offset;
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

	if (!new_page)
		new_page = page_cache_alloc();
	if (!new_page)
		goto oom;

	/*
	 * Look again: the page may have entered the cache during the
	 * read-ahead or the allocation above.
	 */
	page = find_page(inode, offset);
	if (page)
		goto found_page;

	/*
	 * Still not there: add the spare page to the cache and read it.
	 */
	page = page_cache_entry(new_page);
	new_page = 0;
	add_to_page_cache(page, inode, offset, hash);

	if (inode->i_op->readpage(file, page) != 0)
		goto failure;

	goto found_page;

page_locked_wait:
	__wait_on_page(page);
	if (PageUptodate(page))
		goto success;

	/* Not uptodate after the wait: fall through and retry the read. */
page_read_error:
	/*
	 * The page is in the cache but not uptodate: issue the read again
	 * and wait for it.
	 */
	if (inode->i_op->readpage(file, page) != 0)
		goto failure;
	wait_on_page(page);
	if (PageError(page))
		goto failure;
	if (PageUptodate(page))
		goto success;

	/*
	 * The retry did not help either; fall through to the failure path.
	 */
failure:
	page_cache_release(page);
	if (new_page)
		page_cache_free(new_page);
no_page:
	return 0;

release_and_oom:
	page_cache_release(page);
oom:
	return -1;
}
1111
1112
1113
1114
1115
1116static inline int do_write_page(struct inode * inode, struct file * file,
1117 const char * page, unsigned long offset)
1118{
1119 int retval;
1120 unsigned long size;
1121 loff_t loff = offset;
1122 mm_segment_t old_fs;
1123
1124 size = offset + PAGE_SIZE;
1125
1126 if (S_ISREG(inode->i_mode)) {
1127 if (size > inode->i_size)
1128 size = inode->i_size;
1129
1130 if (size < offset)
1131 return -EIO;
1132 }
1133 size -= offset;
1134 old_fs = get_fs();
1135 set_fs(KERNEL_DS);
1136 retval = -EIO;
1137 if (size == file->f_op->write(file, (const char *) page, size, &loff))
1138 retval = 0;
1139 set_fs(old_fs);
1140 return retval;
1141}
1142
1143static int filemap_write_page(struct vm_area_struct * vma,
1144 unsigned long offset,
1145 unsigned long page)
1146{
1147 int result;
1148 struct file * file;
1149 struct dentry * dentry;
1150 struct inode * inode;
1151
1152 file = vma->vm_file;
1153 dentry = file->f_dentry;
1154 inode = dentry->d_inode;
1155 if (!file->f_op->write)
1156 return -EIO;
1157
1158
1159
1160
1161
1162 file->f_count++;
1163 fs_down(&inode->i_sem);
1164 result = do_write_page(inode, file, (const char *) page, offset);
1165 fs_up(&inode->i_sem);
1166 fput(file);
1167 return result;
1168}
1169
1170
1171
1172
1173
1174
1175
1176int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1177{
1178 return filemap_write_page(vma, page->offset, page_address(page));
1179}
1180
1181static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1182 unsigned long address, unsigned int flags)
1183{
1184 pte_t pte = *ptep;
1185 unsigned long page;
1186 int error;
1187
1188 if (!(flags & MS_INVALIDATE)) {
1189 if (!pte_present(pte))
1190 return 0;
1191 if (!pte_dirty(pte))
1192 return 0;
1193 flush_page_to_ram(pte_page(pte));
1194 flush_cache_page(vma, address);
1195 set_pte(ptep, pte_mkclean(pte));
1196 flush_tlb_page(vma, address);
1197 page = pte_page(pte);
1198 atomic_inc(&page_cache_entry(page)->count);
1199 } else {
1200 if (pte_none(pte))
1201 return 0;
1202 flush_cache_page(vma, address);
1203 pte_clear(ptep);
1204 flush_tlb_page(vma, address);
1205 if (!pte_present(pte)) {
1206 swap_free(pte_val(pte));
1207 return 0;
1208 }
1209 page = pte_page(pte);
1210 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1211 page_cache_free(page);
1212 return 0;
1213 }
1214 }
1215 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1216 page_cache_free(page);
1217 return error;
1218}
1219
1220static inline int filemap_sync_pte_range(pmd_t * pmd,
1221 unsigned long address, unsigned long size,
1222 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1223{
1224 pte_t * pte;
1225 unsigned long end;
1226 int error;
1227
1228 if (pmd_none(*pmd))
1229 return 0;
1230 if (pmd_bad(*pmd)) {
1231 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1232 pmd_clear(pmd);
1233 return 0;
1234 }
1235 pte = pte_offset(pmd, address);
1236 offset += address & PMD_MASK;
1237 address &= ~PMD_MASK;
1238 end = address + size;
1239 if (end > PMD_SIZE)
1240 end = PMD_SIZE;
1241 error = 0;
1242 do {
1243 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1244 address += PAGE_SIZE;
1245 pte++;
1246 } while (address < end);
1247 return error;
1248}
1249
1250static inline int filemap_sync_pmd_range(pgd_t * pgd,
1251 unsigned long address, unsigned long size,
1252 struct vm_area_struct *vma, unsigned int flags)
1253{
1254 pmd_t * pmd;
1255 unsigned long offset, end;
1256 int error;
1257
1258 if (pgd_none(*pgd))
1259 return 0;
1260 if (pgd_bad(*pgd)) {
1261 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1262 pgd_clear(pgd);
1263 return 0;
1264 }
1265 pmd = pmd_offset(pgd, address);
1266 offset = address & PGDIR_MASK;
1267 address &= ~PGDIR_MASK;
1268 end = address + size;
1269 if (end > PGDIR_SIZE)
1270 end = PGDIR_SIZE;
1271 error = 0;
1272 do {
1273 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1274 address = (address + PMD_SIZE) & PMD_MASK;
1275 pmd++;
1276 } while (address < end);
1277 return error;
1278}
1279
1280static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1281 size_t size, unsigned int flags)
1282{
1283 pgd_t * dir;
1284 unsigned long end = address + size;
1285 int error = 0;
1286
1287 dir = pgd_offset(vma->vm_mm, address);
1288 flush_cache_range(vma->vm_mm, end - size, end);
1289 while (address < end) {
1290 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1291 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1292 dir++;
1293 }
1294 flush_tlb_range(vma->vm_mm, end - size, end);
1295 return error;
1296}
1297
1298
1299
1300
/*
 * unmap operation for shared file mappings: sync the range being
 * unmapped with MS_ASYNC.  The result of filemap_sync() is discarded.
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}
1305
1306
1307
1308
1309
1310
/*
 * Operations used by generic_file_mmap() for shared, possibly writable
 * mappings: unmap, sync, nopage and swapout handlers are provided.
 * NOTE(review): the initializer is positional; which slot is which is
 * determined by struct vm_operations_struct, which is declared outside
 * this file -- confirm against that declaration before reordering.
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,
	NULL,
	filemap_unmap,
	NULL,
	filemap_sync,
	NULL,
	filemap_nopage,
	NULL,
	filemap_swapout,
	NULL,
};
1323
1324
1325
1326
1327
1328
1329
/*
 * Operations used by generic_file_mmap() for all other mappings: only
 * the nopage handler is provided.
 * NOTE(review): the initializer is positional; slot meanings come from
 * struct vm_operations_struct, declared outside this file.
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	filemap_nopage,
	NULL,
	NULL,
	NULL,
};
1342
1343
1344
1345int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1346{
1347 struct vm_operations_struct * ops;
1348 struct inode *inode = file->f_dentry->d_inode;
1349
1350 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1351 ops = &file_shared_mmap;
1352
1353
1354 if (vma->vm_offset & (PAGE_SIZE - 1))
1355 return -EINVAL;
1356 } else {
1357 ops = &file_private_mmap;
1358 if (inode->i_op && inode->i_op->bmap &&
1359 (vma->vm_offset & (inode->i_sb->s_blocksize - 1)))
1360 return -EINVAL;
1361 }
1362 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1363 return -EACCES;
1364 if (!inode->i_op || !inode->i_op->readpage)
1365 return -ENOEXEC;
1366 UPDATE_ATIME(inode);
1367 vma->vm_ops = ops;
1368 return 0;
1369}
1370
1371
1372
1373
1374
1375
1376static int msync_interval(struct vm_area_struct * vma,
1377 unsigned long start, unsigned long end, int flags)
1378{
1379 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1380 int error;
1381 error = vma->vm_ops->sync(vma, start, end-start, flags);
1382 if (!error && (flags & MS_SYNC)) {
1383 struct file * file = vma->vm_file;
1384 if (file) {
1385 struct dentry * dentry = file->f_dentry;
1386 struct inode * inode = dentry->d_inode;
1387 fs_down(&inode->i_sem);
1388 error = file_fsync(file, dentry);
1389 fs_up(&inode->i_sem);
1390 }
1391 }
1392 return error;
1393 }
1394 return 0;
1395}
1396
1397asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1398{
1399 unsigned long end;
1400 struct vm_area_struct * vma;
1401 int unmapped_error, error = -EINVAL;
1402
1403 down(¤t->mm->mmap_sem);
1404 lock_kernel();
1405 if (start & ~PAGE_MASK)
1406 goto out;
1407 len = (len + ~PAGE_MASK) & PAGE_MASK;
1408 end = start + len;
1409 if (end < start)
1410 goto out;
1411 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1412 goto out;
1413 error = 0;
1414 if (end == start)
1415 goto out;
1416
1417
1418
1419
1420 vma = find_vma(current->mm, start);
1421 unmapped_error = 0;
1422 for (;;) {
1423
1424 error = -EFAULT;
1425 if (!vma)
1426 goto out;
1427
1428 if (start < vma->vm_start) {
1429 unmapped_error = -EFAULT;
1430 start = vma->vm_start;
1431 }
1432
1433 if (end <= vma->vm_end) {
1434 if (start < end) {
1435 error = msync_interval(vma, start, end, flags);
1436 if (error)
1437 goto out;
1438 }
1439 error = unmapped_error;
1440 goto out;
1441 }
1442
1443 error = msync_interval(vma, start, vma->vm_end, flags);
1444 if (error)
1445 goto out;
1446 start = vma->vm_end;
1447 vma = vma->vm_next;
1448 }
1449out:
1450 unlock_kernel();
1451 up(¤t->mm->mmap_sem);
1452 return error;
1453}
1454
1455static inline
1456struct page *__read_cache_page(struct inode *inode,
1457 unsigned long offset,
1458 int (*filler)(void *,struct page*),
1459 void *data)
1460{
1461 struct page **hash = page_hash(inode, offset);
1462 struct page *page;
1463 unsigned long cached_page = 0;
1464 int err;
1465
1466 offset &= PAGE_CACHE_MASK;
1467repeat:
1468 page = __find_page(inode, offset, *hash);
1469 if (!page) {
1470 if (!cached_page) {
1471 cached_page = page_cache_alloc();
1472 if (!cached_page)
1473 return ERR_PTR(-ENOMEM);
1474 goto repeat;
1475 }
1476 page = page_cache_entry(cached_page);
1477 cached_page = 0;
1478 add_to_page_cache(page, inode, offset, hash);
1479 set_bit(PG_locked, &page->flags);
1480 err = filler(data, page);
1481 if (err < 0) {
1482 page_cache_release(page);
1483 page = ERR_PTR(err);
1484 }
1485 }
1486 if (cached_page)
1487 page_cache_free(cached_page);
1488 return page;
1489}
1490
1491
1492
1493
1494
1495struct page *read_cache_page(struct inode *inode,
1496 unsigned long offset,
1497 int (*filler)(void *,struct page*),
1498 void *data)
1499{
1500 struct page *page = __read_cache_page(inode, offset, filler, data);
1501 int err;
1502
1503 if (IS_ERR(page) || PageUptodate(page))
1504 goto out;
1505
1506 wait_on_page(page);
1507 if (PageUptodate(page))
1508 goto out;
1509
1510 set_bit(PG_locked, &page->flags);
1511 err = filler(data, page);
1512 if (err < 0) {
1513 page_cache_release(page);
1514 page = ERR_PTR(err);
1515 }
1516 out:
1517 return page;
1518}
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
/*
 * Generic write() implementation through the page cache.
 *
 * Copies the user data page by page into (possibly newly added) cache
 * pages and passes each page to the inode's updatepage operation, with
 * an optional prepare_write call beforehand.
 *
 * Returns the number of bytes written if any were written; otherwise a
 * negative error code (or 0).  -EIO is returned if the inode has no
 * updatepage operation, and a pending file->f_error is returned and
 * cleared before anything else is attempted.  *ppos and, if the file
 * grew, inode->i_size are updated.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	unsigned long	pos = *ppos;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page	*page, **hash;
	/* Spare free page carried across iterations; 0 when there is none. */
	unsigned long	page_cache = 0;
	unsigned long	written;
	long		status, sync;

	if (!inode->i_op || !inode->i_op->updatepage)
		return -EIO;

	/* Report (and clear) an error left pending on the file. */
	if (file->f_error) {
		int error = file->f_error;
		file->f_error = 0;
		return error;
	}

	sync    = file->f_flags & O_SYNC;
	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether the starting position is already at or past the
	 * file size limit.
	 */
	status = -EFBIG;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}

	status  = 0;
	/*
	 * Check whether to truncate the write, and send the signal if
	 * less than the full count will be written.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
	}

	while (count) {
		unsigned long bytes, pgpos, offset;
		char * dest;

		/*
		 * Work out the page, the offset within it and how many
		 * bytes of this write fall into it.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		hash = page_hash(inode, pgpos);
		page = __find_page(inode, pgpos, *hash);
		if (!page) {
			if (!page_cache) {
				page_cache = page_cache_alloc();
				/* "continue" repeats the lookup with the spare page in hand. */
				if (page_cache)
					continue;
				status = -ENOMEM;
				break;
			}
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, pgpos, hash);
			page_cache = 0;
		}

		/* Get exclusive IO access to the page. */
		wait_on_page(page);
		set_bit(PG_locked, &page->flags);

		/*
		 * Without a prepare_write operation, status keeps its value
		 * from before (0, or the previous iteration's byte count).
		 */
		if (inode->i_op->prepare_write)
			status = inode->i_op->prepare_write(file, page, offset, bytes);
		if (status < 0)
			goto unlock;

		/*
		 * Copy the user data into the page.  The copy is skipped
		 * when the source already is the destination.
		 */
		dest = (char *) page_address(page) + offset;
		if (dest != buf) { /* See comment in update_vm_cache_conditional. */
			/* copy_from_user() returns the number of bytes NOT copied. */
			bytes -= copy_from_user(dest, buf, bytes);
			flush_dcache_page(page_address(page));
		}
		status = -EFAULT;
		if (bytes)
			status = inode->i_op->updatepage(file, page, offset, bytes, sync);

	unlock:
		/* Mark it unlocked again and drop the page. */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		page_cache_release(page);

		if (status < 0)
			break;

		/* status is the number of bytes accepted for this page. */
		written += status;
		count -= status;
		pos += status;
		buf += status;
	}
	*ppos = pos;
	if (pos > inode->i_size)
		inode->i_size = pos;

	if (page_cache)
		page_cache_free(page_cache);
out:
	return written ? written : status;
}
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670unsigned long get_cached_page(struct inode * inode, unsigned long offset,
1671 int new)
1672{
1673 struct page * page;
1674 struct page ** hash;
1675 unsigned long page_cache = 0;
1676
1677 hash = page_hash(inode, offset);
1678 page = __find_page(inode, offset, *hash);
1679 if (!page) {
1680 if (!new)
1681 goto out;
1682 page_cache = page_cache_alloc();
1683 if (!page_cache)
1684 goto out;
1685 clear_page(page_cache);
1686 page = page_cache_entry(page_cache);
1687 add_to_page_cache(page, inode, offset, hash);
1688 }
1689 if (atomic_read(&page->count) != 2)
1690 printk(KERN_ERR "get_cached_page: page count=%d\n",
1691 atomic_read(&page->count));
1692 if (test_bit(PG_locked, &page->flags))
1693 printk(KERN_ERR "get_cached_page: page already locked!\n");
1694 set_bit(PG_locked, &page->flags);
1695 page_cache = page_address(page);
1696
1697out:
1698 return page_cache;
1699}
1700
1701
1702
1703
1704void put_cached_page(unsigned long addr)
1705{
1706 struct page * page = page_cache_entry(addr);
1707
1708 if (!test_bit(PG_locked, &page->flags))
1709 printk("put_cached_page: page not locked!\n");
1710 if (atomic_read(&page->count) != 2)
1711 printk("put_cached_page: page count=%d\n",
1712 atomic_read(&page->count));
1713 clear_bit(PG_locked, &page->flags);
1714 wake_up(&page->wait);
1715 page_cache_release(page);
1716}
1717
1718void __init page_cache_init(unsigned long memory_size)
1719{
1720 unsigned long htable_size;
1721 long order;
1722
1723 htable_size = memory_size >> PAGE_SHIFT;
1724 htable_size *= sizeof(struct page *);
1725 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1726 ;
1727
1728 do {
1729 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1730
1731 page_hash_mask = (tmp - 1UL);
1732
1733 page_hash_bits = 0;
1734 while((tmp >>= 1UL) != 0UL)
1735 page_hash_bits++;
1736
1737 page_hash_table = (struct page **)
1738 __get_free_pages(GFP_ATOMIC, order);
1739 } while(page_hash_table == NULL && --order >= 0L);
1740
1741 printk("Page cache hash table entries: %d (order %ld, %ldk)\n",
1742 (1 << page_hash_bits), order, (1UL << order) * PAGE_SIZE / 1024);
1743 if (!page_hash_table)
1744 panic("Failed to allocate page hash table\n");
1745 memset(page_hash_table, 0,
1746 (PAGE_HASH_MASK + 1UL) * sizeof(struct page *));
1747}
1748