1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/rmap.h>
49#include <linux/module.h>
50#include <linux/delayacct.h>
51#include <linux/init.h>
52#include <linux/writeback.h>
53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
55
56#include <asm/pgalloc.h>
57#include <asm/uaccess.h>
58#include <asm/tlb.h>
59#include <asm/tlbflush.h>
60#include <asm/pgtable.h>
61
62#include <linux/swapops.h>
63#include <linux/elf.h>
64
65#include "internal.h"
66
#ifndef CONFIG_NEED_MULTIPLE_NODES

/*
 * These two globals are only defined here when
 * CONFIG_NEED_MULTIPLE_NODES is not set.  Neither has an initializer in
 * this file; both are exported to modules.
 */
struct page *mem_map;
unsigned long max_mapnr;

EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(max_mapnr);
#endif
75
/*
 * Module-visible globals.  No initial value is assigned in this file;
 * NOTE(review): they are presumably filled in by architecture setup
 * code - confirm against the arch init paths.
 */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

void *high_memory;
EXPORT_SYMBOL(high_memory);
88
89
90
91
92
93
94
95int randomize_va_space __read_mostly =
96#ifdef CONFIG_COMPAT_BRK
97 1;
98#else
99 2;
100#endif
101
102static int __init disable_randmaps(char *s)
103{
104 randomize_va_space = 0;
105 return 1;
106}
107__setup("norandmaps", disable_randmaps);
108
109
110
111
112
113
114
115
116void pgd_clear_bad(pgd_t *pgd)
117{
118 pgd_ERROR(*pgd);
119 pgd_clear(pgd);
120}
121
122void pud_clear_bad(pud_t *pud)
123{
124 pud_ERROR(*pud);
125 pud_clear(pud);
126}
127
128void pmd_clear_bad(pmd_t *pmd)
129{
130 pmd_ERROR(*pmd);
131 pmd_clear(pmd);
132}
133
134
135
136
137
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
139{
140 pgtable_t token = pmd_pgtable(*pmd);
141 pmd_clear(pmd);
142 pte_free_tlb(tlb, token);
143 tlb->mm->nr_ptes--;
144}
145
146static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
147 unsigned long addr, unsigned long end,
148 unsigned long floor, unsigned long ceiling)
149{
150 pmd_t *pmd;
151 unsigned long next;
152 unsigned long start;
153
154 start = addr;
155 pmd = pmd_offset(pud, addr);
156 do {
157 next = pmd_addr_end(addr, end);
158 if (pmd_none_or_clear_bad(pmd))
159 continue;
160 free_pte_range(tlb, pmd);
161 } while (pmd++, addr = next, addr != end);
162
163 start &= PUD_MASK;
164 if (start < floor)
165 return;
166 if (ceiling) {
167 ceiling &= PUD_MASK;
168 if (!ceiling)
169 return;
170 }
171 if (end - 1 > ceiling - 1)
172 return;
173
174 pmd = pmd_offset(pud, start);
175 pud_clear(pud);
176 pmd_free_tlb(tlb, pmd);
177}
178
179static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
180 unsigned long addr, unsigned long end,
181 unsigned long floor, unsigned long ceiling)
182{
183 pud_t *pud;
184 unsigned long next;
185 unsigned long start;
186
187 start = addr;
188 pud = pud_offset(pgd, addr);
189 do {
190 next = pud_addr_end(addr, end);
191 if (pud_none_or_clear_bad(pud))
192 continue;
193 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
194 } while (pud++, addr = next, addr != end);
195
196 start &= PGDIR_MASK;
197 if (start < floor)
198 return;
199 if (ceiling) {
200 ceiling &= PGDIR_MASK;
201 if (!ceiling)
202 return;
203 }
204 if (end - 1 > ceiling - 1)
205 return;
206
207 pud = pud_offset(pgd, start);
208 pgd_clear(pgd);
209 pud_free_tlb(tlb, pud);
210}
211
212
213
214
215
216
217void free_pgd_range(struct mmu_gather *tlb,
218 unsigned long addr, unsigned long end,
219 unsigned long floor, unsigned long ceiling)
220{
221 pgd_t *pgd;
222 unsigned long next;
223 unsigned long start;
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251 addr &= PMD_MASK;
252 if (addr < floor) {
253 addr += PMD_SIZE;
254 if (!addr)
255 return;
256 }
257 if (ceiling) {
258 ceiling &= PMD_MASK;
259 if (!ceiling)
260 return;
261 }
262 if (end - 1 > ceiling - 1)
263 end -= PMD_SIZE;
264 if (addr > end - 1)
265 return;
266
267 start = addr;
268 pgd = pgd_offset(tlb->mm, addr);
269 do {
270 next = pgd_addr_end(addr, end);
271 if (pgd_none_or_clear_bad(pgd))
272 continue;
273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
274 } while (pgd++, addr = next, addr != end);
275}
276
277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
278 unsigned long floor, unsigned long ceiling)
279{
280 while (vma) {
281 struct vm_area_struct *next = vma->vm_next;
282 unsigned long addr = vma->vm_start;
283
284
285
286
287 anon_vma_unlink(vma);
288 unlink_file_vma(vma);
289
290 if (is_vm_hugetlb_page(vma)) {
291 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
292 floor, next? next->vm_start: ceiling);
293 } else {
294
295
296
297 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
298 && !is_vm_hugetlb_page(next)) {
299 vma = next;
300 next = vma->vm_next;
301 anon_vma_unlink(vma);
302 unlink_file_vma(vma);
303 }
304 free_pgd_range(tlb, addr, vma->vm_end,
305 floor, next? next->vm_start: ceiling);
306 }
307 vma = next;
308 }
309}
310
311int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
312{
313 pgtable_t new = pte_alloc_one(mm, address);
314 if (!new)
315 return -ENOMEM;
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330 smp_wmb();
331
332 spin_lock(&mm->page_table_lock);
333 if (!pmd_present(*pmd)) {
334 mm->nr_ptes++;
335 pmd_populate(mm, pmd, new);
336 new = NULL;
337 }
338 spin_unlock(&mm->page_table_lock);
339 if (new)
340 pte_free(mm, new);
341 return 0;
342}
343
344int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
345{
346 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
347 if (!new)
348 return -ENOMEM;
349
350 smp_wmb();
351
352 spin_lock(&init_mm.page_table_lock);
353 if (!pmd_present(*pmd)) {
354 pmd_populate_kernel(&init_mm, pmd, new);
355 new = NULL;
356 }
357 spin_unlock(&init_mm.page_table_lock);
358 if (new)
359 pte_free_kernel(&init_mm, new);
360 return 0;
361}
362
/*
 * Apply accumulated rss deltas to @mm.  A delta of zero skips the
 * corresponding counter update entirely.
 *
 * The parameter names are also passed to add_mm_counter() as the
 * counter name, so they must not be renamed.
 */
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss != 0)
		add_mm_counter(mm, file_rss, file_rss);

	if (anon_rss != 0)
		add_mm_counter(mm, anon_rss, anon_rss);
}
370
371
372
373
374
375
376
377
378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
380{
381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
382 "vm_flags = %lx, vaddr = %lx\n",
383 (long long)pte_val(pte),
384 (vma->vm_mm == current->mm ? current->comm : "???"),
385 vma->vm_flags, vaddr);
386 dump_stack();
387}
388
389static inline int is_cow_mapping(unsigned int flags)
390{
391 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
392}
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
/*
 * HAVE_PTE_SPECIAL is always defined (as 1 or 0) so that it can be
 * tested with a plain C "if" instead of the preprocessor.
 */
#if defined(__HAVE_ARCH_PTE_SPECIAL)
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
441struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
442 pte_t pte)
443{
444 unsigned long pfn;
445
446 if (HAVE_PTE_SPECIAL) {
447 if (likely(!pte_special(pte))) {
448 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
449 return pte_page(pte);
450 }
451 VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
452 return NULL;
453 }
454
455
456
457 pfn = pte_pfn(pte);
458
459 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
460 if (vma->vm_flags & VM_MIXEDMAP) {
461 if (!pfn_valid(pfn))
462 return NULL;
463 goto out;
464 } else {
465 unsigned long off;
466 off = (addr - vma->vm_start) >> PAGE_SHIFT;
467 if (pfn == vma->vm_pgoff + off)
468 return NULL;
469 if (!is_cow_mapping(vma->vm_flags))
470 return NULL;
471 }
472 }
473
474 VM_BUG_ON(!pfn_valid(pfn));
475
476
477
478
479
480
481out:
482 return pfn_to_page(pfn);
483}
484
485
486
487
488
489
490
491static inline void
492copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
493 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
494 unsigned long addr, int *rss)
495{
496 unsigned long vm_flags = vma->vm_flags;
497 pte_t pte = *src_pte;
498 struct page *page;
499
500
501 if (unlikely(!pte_present(pte))) {
502 if (!pte_file(pte)) {
503 swp_entry_t entry = pte_to_swp_entry(pte);
504
505 swap_duplicate(entry);
506
507 if (unlikely(list_empty(&dst_mm->mmlist))) {
508 spin_lock(&mmlist_lock);
509 if (list_empty(&dst_mm->mmlist))
510 list_add(&dst_mm->mmlist,
511 &src_mm->mmlist);
512 spin_unlock(&mmlist_lock);
513 }
514 if (is_write_migration_entry(entry) &&
515 is_cow_mapping(vm_flags)) {
516
517
518
519
520 make_migration_entry_read(&entry);
521 pte = swp_entry_to_pte(entry);
522 set_pte_at(src_mm, addr, src_pte, pte);
523 }
524 }
525 goto out_set_pte;
526 }
527
528
529
530
531
532 if (is_cow_mapping(vm_flags)) {
533 ptep_set_wrprotect(src_mm, addr, src_pte);
534 pte = pte_wrprotect(pte);
535 }
536
537
538
539
540
541 if (vm_flags & VM_SHARED)
542 pte = pte_mkclean(pte);
543 pte = pte_mkold(pte);
544
545 page = vm_normal_page(vma, addr, pte);
546 if (page) {
547 get_page(page);
548 page_dup_rmap(page, vma, addr);
549 rss[!!PageAnon(page)]++;
550 }
551
552out_set_pte:
553 set_pte_at(dst_mm, addr, dst_pte, pte);
554}
555
556static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
557 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
558 unsigned long addr, unsigned long end)
559{
560 pte_t *src_pte, *dst_pte;
561 spinlock_t *src_ptl, *dst_ptl;
562 int progress = 0;
563 int rss[2];
564
565again:
566 rss[1] = rss[0] = 0;
567 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
568 if (!dst_pte)
569 return -ENOMEM;
570 src_pte = pte_offset_map_nested(src_pmd, addr);
571 src_ptl = pte_lockptr(src_mm, src_pmd);
572 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
573 arch_enter_lazy_mmu_mode();
574
575 do {
576
577
578
579
580 if (progress >= 32) {
581 progress = 0;
582 if (need_resched() ||
583 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
584 break;
585 }
586 if (pte_none(*src_pte)) {
587 progress++;
588 continue;
589 }
590 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
591 progress += 8;
592 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
593
594 arch_leave_lazy_mmu_mode();
595 spin_unlock(src_ptl);
596 pte_unmap_nested(src_pte - 1);
597 add_mm_rss(dst_mm, rss[0], rss[1]);
598 pte_unmap_unlock(dst_pte - 1, dst_ptl);
599 cond_resched();
600 if (addr != end)
601 goto again;
602 return 0;
603}
604
605static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
606 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
607 unsigned long addr, unsigned long end)
608{
609 pmd_t *src_pmd, *dst_pmd;
610 unsigned long next;
611
612 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
613 if (!dst_pmd)
614 return -ENOMEM;
615 src_pmd = pmd_offset(src_pud, addr);
616 do {
617 next = pmd_addr_end(addr, end);
618 if (pmd_none_or_clear_bad(src_pmd))
619 continue;
620 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
621 vma, addr, next))
622 return -ENOMEM;
623 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
624 return 0;
625}
626
627static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
628 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
629 unsigned long addr, unsigned long end)
630{
631 pud_t *src_pud, *dst_pud;
632 unsigned long next;
633
634 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
635 if (!dst_pud)
636 return -ENOMEM;
637 src_pud = pud_offset(src_pgd, addr);
638 do {
639 next = pud_addr_end(addr, end);
640 if (pud_none_or_clear_bad(src_pud))
641 continue;
642 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
643 vma, addr, next))
644 return -ENOMEM;
645 } while (dst_pud++, src_pud++, addr = next, addr != end);
646 return 0;
647}
648
649int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
650 struct vm_area_struct *vma)
651{
652 pgd_t *src_pgd, *dst_pgd;
653 unsigned long next;
654 unsigned long addr = vma->vm_start;
655 unsigned long end = vma->vm_end;
656 int ret;
657
658
659
660
661
662
663
664 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
665 if (!vma->anon_vma)
666 return 0;
667 }
668
669 if (is_vm_hugetlb_page(vma))
670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
671
672
673
674
675
676
677
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
682 dst_pgd = pgd_offset(dst_mm, addr);
683 src_pgd = pgd_offset(src_mm, addr);
684 do {
685 next = pgd_addr_end(addr, end);
686 if (pgd_none_or_clear_bad(src_pgd))
687 continue;
688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
689 vma, addr, next))) {
690 ret = -ENOMEM;
691 break;
692 }
693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
699}
700
701static unsigned long zap_pte_range(struct mmu_gather *tlb,
702 struct vm_area_struct *vma, pmd_t *pmd,
703 unsigned long addr, unsigned long end,
704 long *zap_work, struct zap_details *details)
705{
706 struct mm_struct *mm = tlb->mm;
707 pte_t *pte;
708 spinlock_t *ptl;
709 int file_rss = 0;
710 int anon_rss = 0;
711
712 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
713 arch_enter_lazy_mmu_mode();
714 do {
715 pte_t ptent = *pte;
716 if (pte_none(ptent)) {
717 (*zap_work)--;
718 continue;
719 }
720
721 (*zap_work) -= PAGE_SIZE;
722
723 if (pte_present(ptent)) {
724 struct page *page;
725
726 page = vm_normal_page(vma, addr, ptent);
727 if (unlikely(details) && page) {
728
729
730
731
732
733 if (details->check_mapping &&
734 details->check_mapping != page->mapping)
735 continue;
736
737
738
739
740 if (details->nonlinear_vma &&
741 (page->index < details->first_index ||
742 page->index > details->last_index))
743 continue;
744 }
745 ptent = ptep_get_and_clear_full(mm, addr, pte,
746 tlb->fullmm);
747 tlb_remove_tlb_entry(tlb, pte, addr);
748 if (unlikely(!page))
749 continue;
750 if (unlikely(details) && details->nonlinear_vma
751 && linear_page_index(details->nonlinear_vma,
752 addr) != page->index)
753 set_pte_at(mm, addr, pte,
754 pgoff_to_pte(page->index));
755 if (PageAnon(page))
756 anon_rss--;
757 else {
758 if (pte_dirty(ptent))
759 set_page_dirty(page);
760 if (pte_young(ptent))
761 SetPageReferenced(page);
762 file_rss--;
763 }
764 page_remove_rmap(page, vma);
765 tlb_remove_page(tlb, page);
766 continue;
767 }
768
769
770
771
772 if (unlikely(details))
773 continue;
774 if (!pte_file(ptent))
775 free_swap_and_cache(pte_to_swp_entry(ptent));
776 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
777 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
778
779 add_mm_rss(mm, file_rss, anon_rss);
780 arch_leave_lazy_mmu_mode();
781 pte_unmap_unlock(pte - 1, ptl);
782
783 return addr;
784}
785
786static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
787 struct vm_area_struct *vma, pud_t *pud,
788 unsigned long addr, unsigned long end,
789 long *zap_work, struct zap_details *details)
790{
791 pmd_t *pmd;
792 unsigned long next;
793
794 pmd = pmd_offset(pud, addr);
795 do {
796 next = pmd_addr_end(addr, end);
797 if (pmd_none_or_clear_bad(pmd)) {
798 (*zap_work)--;
799 continue;
800 }
801 next = zap_pte_range(tlb, vma, pmd, addr, next,
802 zap_work, details);
803 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
804
805 return addr;
806}
807
808static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
809 struct vm_area_struct *vma, pgd_t *pgd,
810 unsigned long addr, unsigned long end,
811 long *zap_work, struct zap_details *details)
812{
813 pud_t *pud;
814 unsigned long next;
815
816 pud = pud_offset(pgd, addr);
817 do {
818 next = pud_addr_end(addr, end);
819 if (pud_none_or_clear_bad(pud)) {
820 (*zap_work)--;
821 continue;
822 }
823 next = zap_pmd_range(tlb, vma, pud, addr, next,
824 zap_work, details);
825 } while (pud++, addr = next, (addr != end && *zap_work > 0));
826
827 return addr;
828}
829
830static unsigned long unmap_page_range(struct mmu_gather *tlb,
831 struct vm_area_struct *vma,
832 unsigned long addr, unsigned long end,
833 long *zap_work, struct zap_details *details)
834{
835 pgd_t *pgd;
836 unsigned long next;
837
838 if (details && !details->check_mapping && !details->nonlinear_vma)
839 details = NULL;
840
841 BUG_ON(addr >= end);
842 tlb_start_vma(tlb, vma);
843 pgd = pgd_offset(vma->vm_mm, addr);
844 do {
845 next = pgd_addr_end(addr, end);
846 if (pgd_none_or_clear_bad(pgd)) {
847 (*zap_work)--;
848 continue;
849 }
850 next = zap_pud_range(tlb, vma, pgd, addr, next,
851 zap_work, details);
852 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
853 tlb_end_vma(tlb, vma);
854
855 return addr;
856}
857
/*
 * Zap budget handed to each unmap batch: 8 pages' worth with
 * CONFIG_PREEMPT, 1024 pages' worth without it.
 */
#if defined(CONFIG_PREEMPT)
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#else
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#endif
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891unsigned long unmap_vmas(struct mmu_gather **tlbp,
892 struct vm_area_struct *vma, unsigned long start_addr,
893 unsigned long end_addr, unsigned long *nr_accounted,
894 struct zap_details *details)
895{
896 long zap_work = ZAP_BLOCK_SIZE;
897 unsigned long tlb_start = 0;
898 int tlb_start_valid = 0;
899 unsigned long start = start_addr;
900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
906 unsigned long end;
907
908 start = max(vma->vm_start, start_addr);
909 if (start >= vma->vm_end)
910 continue;
911 end = min(vma->vm_end, end_addr);
912 if (end <= vma->vm_start)
913 continue;
914
915 if (vma->vm_flags & VM_ACCOUNT)
916 *nr_accounted += (end - start) >> PAGE_SHIFT;
917
918 while (start != end) {
919 if (!tlb_start_valid) {
920 tlb_start = start;
921 tlb_start_valid = 1;
922 }
923
924 if (unlikely(is_vm_hugetlb_page(vma))) {
925
926
927
928
929
930
931
932
933
934
935
936 if (vma->vm_file) {
937 unmap_hugepage_range(vma, start, end, NULL);
938 zap_work -= (end - start) /
939 pages_per_huge_page(hstate_vma(vma));
940 }
941
942 start = end;
943 } else
944 start = unmap_page_range(*tlbp, vma,
945 start, end, &zap_work, details);
946
947 if (zap_work > 0) {
948 BUG_ON(start != end);
949 break;
950 }
951
952 tlb_finish_mmu(*tlbp, tlb_start, start);
953
954 if (need_resched() ||
955 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
956 if (i_mmap_lock) {
957 *tlbp = NULL;
958 goto out;
959 }
960 cond_resched();
961 }
962
963 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
964 tlb_start_valid = 0;
965 zap_work = ZAP_BLOCK_SIZE;
966 }
967 }
968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
970 return start;
971}
972
973
974
975
976
977
978
979
980unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
981 unsigned long size, struct zap_details *details)
982{
983 struct mm_struct *mm = vma->vm_mm;
984 struct mmu_gather *tlb;
985 unsigned long end = address + size;
986 unsigned long nr_accounted = 0;
987
988 lru_add_drain();
989 tlb = tlb_gather_mmu(mm, 0);
990 update_hiwater_rss(mm);
991 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
992 if (tlb)
993 tlb_finish_mmu(tlb, address, end);
994 return end;
995}
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size)
1011{
1012 if (address < vma->vm_start || address + size > vma->vm_end ||
1013 !(vma->vm_flags & VM_PFNMAP))
1014 return -1;
1015 zap_page_range(vma, address, size, NULL);
1016 return 0;
1017}
1018EXPORT_SYMBOL_GPL(zap_vma_ptes);
1019
1020
1021
1022
1023struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1024 unsigned int flags)
1025{
1026 pgd_t *pgd;
1027 pud_t *pud;
1028 pmd_t *pmd;
1029 pte_t *ptep, pte;
1030 spinlock_t *ptl;
1031 struct page *page;
1032 struct mm_struct *mm = vma->vm_mm;
1033
1034 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1035 if (!IS_ERR(page)) {
1036 BUG_ON(flags & FOLL_GET);
1037 goto out;
1038 }
1039
1040 page = NULL;
1041 pgd = pgd_offset(mm, address);
1042 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1043 goto no_page_table;
1044
1045 pud = pud_offset(pgd, address);
1046 if (pud_none(*pud))
1047 goto no_page_table;
1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
1054 goto no_page_table;
1055
1056 pmd = pmd_offset(pud, address);
1057 if (pmd_none(*pmd))
1058 goto no_page_table;
1059 if (pmd_huge(*pmd)) {
1060 BUG_ON(flags & FOLL_GET);
1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1062 goto out;
1063 }
1064 if (unlikely(pmd_bad(*pmd)))
1065 goto no_page_table;
1066
1067 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1068
1069 pte = *ptep;
1070 if (!pte_present(pte))
1071 goto no_page;
1072 if ((flags & FOLL_WRITE) && !pte_write(pte))
1073 goto unlock;
1074 page = vm_normal_page(vma, address, pte);
1075 if (unlikely(!page))
1076 goto bad_page;
1077
1078 if (flags & FOLL_GET)
1079 get_page(page);
1080 if (flags & FOLL_TOUCH) {
1081 if ((flags & FOLL_WRITE) &&
1082 !pte_dirty(pte) && !PageDirty(page))
1083 set_page_dirty(page);
1084 mark_page_accessed(page);
1085 }
1086unlock:
1087 pte_unmap_unlock(ptep, ptl);
1088out:
1089 return page;
1090
1091bad_page:
1092 pte_unmap_unlock(ptep, ptl);
1093 return ERR_PTR(-EFAULT);
1094
1095no_page:
1096 pte_unmap_unlock(ptep, ptl);
1097 if (!pte_none(pte))
1098 return page;
1099
1100no_page_table:
1101
1102
1103
1104
1105 if (flags & FOLL_ANON) {
1106 page = ZERO_PAGE(0);
1107 if (flags & FOLL_GET)
1108 get_page(page);
1109 BUG_ON(flags & FOLL_WRITE);
1110 }
1111 return page;
1112}
1113
1114
1115static inline int use_zero_page(struct vm_area_struct *vma)
1116{
1117
1118
1119
1120
1121
1122
1123
1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1125 return 0;
1126
1127
1128
1129 return !vma->vm_ops || !vma->vm_ops->fault;
1130}
1131
1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1133 unsigned long start, int len, int write, int force,
1134 struct page **pages, struct vm_area_struct **vmas)
1135{
1136 int i;
1137 unsigned int vm_flags;
1138
1139 if (len <= 0)
1140 return 0;
1141
1142
1143
1144
1145 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1146 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1147 i = 0;
1148
1149 do {
1150 struct vm_area_struct *vma;
1151 unsigned int foll_flags;
1152
1153 vma = find_extend_vma(mm, start);
1154 if (!vma && in_gate_area(tsk, start)) {
1155 unsigned long pg = start & PAGE_MASK;
1156 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1157 pgd_t *pgd;
1158 pud_t *pud;
1159 pmd_t *pmd;
1160 pte_t *pte;
1161 if (write)
1162 return i ? : -EFAULT;
1163 if (pg > TASK_SIZE)
1164 pgd = pgd_offset_k(pg);
1165 else
1166 pgd = pgd_offset_gate(mm, pg);
1167 BUG_ON(pgd_none(*pgd));
1168 pud = pud_offset(pgd, pg);
1169 BUG_ON(pud_none(*pud));
1170 pmd = pmd_offset(pud, pg);
1171 if (pmd_none(*pmd))
1172 return i ? : -EFAULT;
1173 pte = pte_offset_map(pmd, pg);
1174 if (pte_none(*pte)) {
1175 pte_unmap(pte);
1176 return i ? : -EFAULT;
1177 }
1178 if (pages) {
1179 struct page *page = vm_normal_page(gate_vma, start, *pte);
1180 pages[i] = page;
1181 if (page)
1182 get_page(page);
1183 }
1184 pte_unmap(pte);
1185 if (vmas)
1186 vmas[i] = gate_vma;
1187 i++;
1188 start += PAGE_SIZE;
1189 len--;
1190 continue;
1191 }
1192
1193 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
1194 || !(vm_flags & vma->vm_flags))
1195 return i ? : -EFAULT;
1196
1197 if (is_vm_hugetlb_page(vma)) {
1198 i = follow_hugetlb_page(mm, vma, pages, vmas,
1199 &start, &len, i, write);
1200 continue;
1201 }
1202
1203 foll_flags = FOLL_TOUCH;
1204 if (pages)
1205 foll_flags |= FOLL_GET;
1206 if (!write && use_zero_page(vma))
1207 foll_flags |= FOLL_ANON;
1208
1209 do {
1210 struct page *page;
1211
1212
1213
1214
1215
1216
1217 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
1218 return i ? i : -ENOMEM;
1219
1220 if (write)
1221 foll_flags |= FOLL_WRITE;
1222
1223 cond_resched();
1224 while (!(page = follow_page(vma, start, foll_flags))) {
1225 int ret;
1226 ret = handle_mm_fault(mm, vma, start,
1227 foll_flags & FOLL_WRITE);
1228 if (ret & VM_FAULT_ERROR) {
1229 if (ret & VM_FAULT_OOM)
1230 return i ? i : -ENOMEM;
1231 else if (ret & VM_FAULT_SIGBUS)
1232 return i ? i : -EFAULT;
1233 BUG();
1234 }
1235 if (ret & VM_FAULT_MAJOR)
1236 tsk->maj_flt++;
1237 else
1238 tsk->min_flt++;
1239
1240
1241
1242
1243
1244
1245
1246
1247 if (ret & VM_FAULT_WRITE)
1248 foll_flags &= ~FOLL_WRITE;
1249
1250 cond_resched();
1251 }
1252 if (IS_ERR(page))
1253 return i ? i : PTR_ERR(page);
1254 if (pages) {
1255 pages[i] = page;
1256
1257 flush_anon_page(vma, page, start);
1258 flush_dcache_page(page);
1259 }
1260 if (vmas)
1261 vmas[i] = vma;
1262 i++;
1263 start += PAGE_SIZE;
1264 len--;
1265 } while (len && start < vma->vm_end);
1266 } while (len);
1267 return i;
1268}
1269EXPORT_SYMBOL(get_user_pages);
1270
1271pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1272 spinlock_t **ptl)
1273{
1274 pgd_t * pgd = pgd_offset(mm, addr);
1275 pud_t * pud = pud_alloc(mm, pgd, addr);
1276 if (pud) {
1277 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1278 if (pmd)
1279 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1280 }
1281 return NULL;
1282}
1283
1284
1285
1286
1287
1288
1289
1290
1291static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1292 struct page *page, pgprot_t prot)
1293{
1294 struct mm_struct *mm = vma->vm_mm;
1295 int retval;
1296 pte_t *pte;
1297 spinlock_t *ptl;
1298
1299 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1300 if (retval)
1301 goto out;
1302
1303 retval = -EINVAL;
1304 if (PageAnon(page))
1305 goto out_uncharge;
1306 retval = -ENOMEM;
1307 flush_dcache_page(page);
1308 pte = get_locked_pte(mm, addr, &ptl);
1309 if (!pte)
1310 goto out_uncharge;
1311 retval = -EBUSY;
1312 if (!pte_none(*pte))
1313 goto out_unlock;
1314
1315
1316 get_page(page);
1317 inc_mm_counter(mm, file_rss);
1318 page_add_file_rmap(page);
1319 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1320
1321 retval = 0;
1322 pte_unmap_unlock(pte, ptl);
1323 return retval;
1324out_unlock:
1325 pte_unmap_unlock(pte, ptl);
1326out_uncharge:
1327 mem_cgroup_uncharge_page(page);
1328out:
1329 return retval;
1330}
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1355 struct page *page)
1356{
1357 if (addr < vma->vm_start || addr >= vma->vm_end)
1358 return -EFAULT;
1359 if (!page_count(page))
1360 return -EINVAL;
1361 vma->vm_flags |= VM_INSERTPAGE;
1362 return insert_page(vma, addr, page, vma->vm_page_prot);
1363}
1364EXPORT_SYMBOL(vm_insert_page);
1365
1366static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1367 unsigned long pfn, pgprot_t prot)
1368{
1369 struct mm_struct *mm = vma->vm_mm;
1370 int retval;
1371 pte_t *pte, entry;
1372 spinlock_t *ptl;
1373
1374 retval = -ENOMEM;
1375 pte = get_locked_pte(mm, addr, &ptl);
1376 if (!pte)
1377 goto out;
1378 retval = -EBUSY;
1379 if (!pte_none(*pte))
1380 goto out_unlock;
1381
1382
1383 entry = pte_mkspecial(pfn_pte(pfn, prot));
1384 set_pte_at(mm, addr, pte, entry);
1385 update_mmu_cache(vma, addr, entry);
1386
1387 retval = 0;
1388out_unlock:
1389 pte_unmap_unlock(pte, ptl);
1390out:
1391 return retval;
1392}
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1412 unsigned long pfn)
1413{
1414
1415
1416
1417
1418
1419
1420 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1421 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1422 (VM_PFNMAP|VM_MIXEDMAP));
1423 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1424 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1425
1426 if (addr < vma->vm_start || addr >= vma->vm_end)
1427 return -EFAULT;
1428 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1429}
1430EXPORT_SYMBOL(vm_insert_pfn);
1431
1432int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1433 unsigned long pfn)
1434{
1435 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1436
1437 if (addr < vma->vm_start || addr >= vma->vm_end)
1438 return -EFAULT;
1439
1440
1441
1442
1443
1444
1445
1446 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1447 struct page *page;
1448
1449 page = pfn_to_page(pfn);
1450 return insert_page(vma, addr, page, vma->vm_page_prot);
1451 }
1452 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1453}
1454EXPORT_SYMBOL(vm_insert_mixed);
1455
1456
1457
1458
1459
1460
1461static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1462 unsigned long addr, unsigned long end,
1463 unsigned long pfn, pgprot_t prot)
1464{
1465 pte_t *pte;
1466 spinlock_t *ptl;
1467
1468 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1469 if (!pte)
1470 return -ENOMEM;
1471 arch_enter_lazy_mmu_mode();
1472 do {
1473 BUG_ON(!pte_none(*pte));
1474 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1475 pfn++;
1476 } while (pte++, addr += PAGE_SIZE, addr != end);
1477 arch_leave_lazy_mmu_mode();
1478 pte_unmap_unlock(pte - 1, ptl);
1479 return 0;
1480}
1481
1482static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1483 unsigned long addr, unsigned long end,
1484 unsigned long pfn, pgprot_t prot)
1485{
1486 pmd_t *pmd;
1487 unsigned long next;
1488
1489 pfn -= addr >> PAGE_SHIFT;
1490 pmd = pmd_alloc(mm, pud, addr);
1491 if (!pmd)
1492 return -ENOMEM;
1493 do {
1494 next = pmd_addr_end(addr, end);
1495 if (remap_pte_range(mm, pmd, addr, next,
1496 pfn + (addr >> PAGE_SHIFT), prot))
1497 return -ENOMEM;
1498 } while (pmd++, addr = next, addr != end);
1499 return 0;
1500}
1501
1502static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1503 unsigned long addr, unsigned long end,
1504 unsigned long pfn, pgprot_t prot)
1505{
1506 pud_t *pud;
1507 unsigned long next;
1508
1509 pfn -= addr >> PAGE_SHIFT;
1510 pud = pud_alloc(mm, pgd, addr);
1511 if (!pud)
1512 return -ENOMEM;
1513 do {
1514 next = pud_addr_end(addr, end);
1515 if (remap_pmd_range(mm, pud, addr, next,
1516 pfn + (addr >> PAGE_SHIFT), prot))
1517 return -ENOMEM;
1518 } while (pud++, addr = next, addr != end);
1519 return 0;
1520}
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1533 unsigned long pfn, unsigned long size, pgprot_t prot)
1534{
1535 pgd_t *pgd;
1536 unsigned long next;
1537 unsigned long end = addr + PAGE_ALIGN(size);
1538 struct mm_struct *mm = vma->vm_mm;
1539 int err;
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559 if (is_cow_mapping(vma->vm_flags)) {
1560 if (addr != vma->vm_start || end != vma->vm_end)
1561 return -EINVAL;
1562 vma->vm_pgoff = pfn;
1563 }
1564
1565 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1566
1567 BUG_ON(addr >= end);
1568 pfn -= addr >> PAGE_SHIFT;
1569 pgd = pgd_offset(mm, addr);
1570 flush_cache_range(vma, addr, end);
1571 do {
1572 next = pgd_addr_end(addr, end);
1573 err = remap_pud_range(mm, pgd, addr, next,
1574 pfn + (addr >> PAGE_SHIFT), prot);
1575 if (err)
1576 break;
1577 } while (pgd++, addr = next, addr != end);
1578 return err;
1579}
1580EXPORT_SYMBOL(remap_pfn_range);
1581
1582static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1583 unsigned long addr, unsigned long end,
1584 pte_fn_t fn, void *data)
1585{
1586 pte_t *pte;
1587 int err;
1588 pgtable_t token;
1589 spinlock_t *uninitialized_var(ptl);
1590
1591 pte = (mm == &init_mm) ?
1592 pte_alloc_kernel(pmd, addr) :
1593 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1594 if (!pte)
1595 return -ENOMEM;
1596
1597 BUG_ON(pmd_huge(*pmd));
1598
1599 token = pmd_pgtable(*pmd);
1600
1601 do {
1602 err = fn(pte, token, addr, data);
1603 if (err)
1604 break;
1605 } while (pte++, addr += PAGE_SIZE, addr != end);
1606
1607 if (mm != &init_mm)
1608 pte_unmap_unlock(pte-1, ptl);
1609 return err;
1610}
1611
1612static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1613 unsigned long addr, unsigned long end,
1614 pte_fn_t fn, void *data)
1615{
1616 pmd_t *pmd;
1617 unsigned long next;
1618 int err;
1619
1620 BUG_ON(pud_huge(*pud));
1621
1622 pmd = pmd_alloc(mm, pud, addr);
1623 if (!pmd)
1624 return -ENOMEM;
1625 do {
1626 next = pmd_addr_end(addr, end);
1627 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1628 if (err)
1629 break;
1630 } while (pmd++, addr = next, addr != end);
1631 return err;
1632}
1633
1634static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1635 unsigned long addr, unsigned long end,
1636 pte_fn_t fn, void *data)
1637{
1638 pud_t *pud;
1639 unsigned long next;
1640 int err;
1641
1642 pud = pud_alloc(mm, pgd, addr);
1643 if (!pud)
1644 return -ENOMEM;
1645 do {
1646 next = pud_addr_end(addr, end);
1647 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1648 if (err)
1649 break;
1650 } while (pud++, addr = next, addr != end);
1651 return err;
1652}
1653
1654
1655
1656
1657
1658int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1659 unsigned long size, pte_fn_t fn, void *data)
1660{
1661 pgd_t *pgd;
1662 unsigned long next;
1663 unsigned long start = addr, end = addr + size;
1664 int err;
1665
1666 BUG_ON(addr >= end);
1667 mmu_notifier_invalidate_range_start(mm, start, end);
1668 pgd = pgd_offset(mm, addr);
1669 do {
1670 next = pgd_addr_end(addr, end);
1671 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1672 if (err)
1673 break;
1674 } while (pgd++, addr = next, addr != end);
1675 mmu_notifier_invalidate_range_end(mm, start, end);
1676 return err;
1677}
1678EXPORT_SYMBOL_GPL(apply_to_page_range);
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1690 pte_t *page_table, pte_t orig_pte)
1691{
1692 int same = 1;
1693#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1694 if (sizeof(pte_t) > sizeof(unsigned long)) {
1695 spinlock_t *ptl = pte_lockptr(mm, pmd);
1696 spin_lock(ptl);
1697 same = pte_same(*page_table, orig_pte);
1698 spin_unlock(ptl);
1699 }
1700#endif
1701 pte_unmap(page_table);
1702 return same;
1703}
1704
1705
1706
1707
1708
1709
1710
1711static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1712{
1713 if (likely(vma->vm_flags & VM_WRITE))
1714 pte = pte_mkwrite(pte);
1715 return pte;
1716}
1717
1718static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1719{
1720
1721
1722
1723
1724
1725
1726 if (unlikely(!src)) {
1727 void *kaddr = kmap_atomic(dst, KM_USER0);
1728 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1729
1730
1731
1732
1733
1734
1735
1736 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1737 memset(kaddr, 0, PAGE_SIZE);
1738 kunmap_atomic(kaddr, KM_USER0);
1739 flush_dcache_page(dst);
1740 } else
1741 copy_user_highpage(dst, src, va, vma);
1742}
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1763 unsigned long address, pte_t *page_table, pmd_t *pmd,
1764 spinlock_t *ptl, pte_t orig_pte)
1765{
1766 struct page *old_page, *new_page;
1767 pte_t entry;
1768 int reuse = 0, ret = 0;
1769 int page_mkwrite = 0;
1770 struct page *dirty_page = NULL;
1771
1772 old_page = vm_normal_page(vma, address, orig_pte);
1773 if (!old_page) {
1774
1775
1776
1777
1778
1779
1780
1781 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1782 (VM_WRITE|VM_SHARED))
1783 goto reuse;
1784 goto gotten;
1785 }
1786
1787
1788
1789
1790
1791 if (PageAnon(old_page)) {
1792 if (trylock_page(old_page)) {
1793 reuse = can_share_swap_page(old_page);
1794 unlock_page(old_page);
1795 }
1796 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1797 (VM_WRITE|VM_SHARED))) {
1798
1799
1800
1801
1802
1803 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1804 struct vm_fault vmf;
1805 int tmp;
1806
1807 vmf.virtual_address = (void __user *)(address &
1808 PAGE_MASK);
1809 vmf.pgoff = old_page->index;
1810 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
1811 vmf.page = old_page;
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821 page_cache_get(old_page);
1822 pte_unmap_unlock(page_table, ptl);
1823
1824 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
1825 if (unlikely(tmp &
1826 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
1827 ret = tmp;
1828 goto unwritable_page;
1829 }
1830 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
1831 lock_page(old_page);
1832 if (!old_page->mapping) {
1833 ret = 0;
1834 unlock_page(old_page);
1835 goto unwritable_page;
1836 }
1837 } else
1838 VM_BUG_ON(!PageLocked(old_page));
1839
1840
1841
1842
1843
1844
1845
1846 page_table = pte_offset_map_lock(mm, pmd, address,
1847 &ptl);
1848 if (!pte_same(*page_table, orig_pte)) {
1849 unlock_page(old_page);
1850 page_cache_release(old_page);
1851 goto unlock;
1852 }
1853
1854 page_mkwrite = 1;
1855 }
1856 dirty_page = old_page;
1857 get_page(dirty_page);
1858 reuse = 1;
1859 }
1860
1861 if (reuse) {
1862reuse:
1863 flush_cache_page(vma, address, pte_pfn(orig_pte));
1864 entry = pte_mkyoung(orig_pte);
1865 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1866 if (ptep_set_access_flags(vma, address, page_table, entry,1))
1867 update_mmu_cache(vma, address, entry);
1868 ret |= VM_FAULT_WRITE;
1869 goto unlock;
1870 }
1871
1872
1873
1874
1875 page_cache_get(old_page);
1876gotten:
1877 pte_unmap_unlock(page_table, ptl);
1878
1879 if (unlikely(anon_vma_prepare(vma)))
1880 goto oom;
1881 VM_BUG_ON(old_page == ZERO_PAGE(0));
1882 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1883 if (!new_page)
1884 goto oom;
1885 cow_user_page(new_page, old_page, address, vma);
1886 __SetPageUptodate(new_page);
1887
1888 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
1889 goto oom_free_new;
1890
1891
1892
1893
1894 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1895 if (likely(pte_same(*page_table, orig_pte))) {
1896 if (old_page) {
1897 if (!PageAnon(old_page)) {
1898 dec_mm_counter(mm, file_rss);
1899 inc_mm_counter(mm, anon_rss);
1900 }
1901 } else
1902 inc_mm_counter(mm, anon_rss);
1903 flush_cache_page(vma, address, pte_pfn(orig_pte));
1904 entry = mk_pte(new_page, vma->vm_page_prot);
1905 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1906
1907
1908
1909
1910
1911
1912 ptep_clear_flush_notify(vma, address, page_table);
1913 set_pte_at(mm, address, page_table, entry);
1914 update_mmu_cache(vma, address, entry);
1915 lru_cache_add_active(new_page);
1916 page_add_new_anon_rmap(new_page, vma, address);
1917
1918 if (old_page) {
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941 page_remove_rmap(old_page, vma);
1942 }
1943
1944
1945 new_page = old_page;
1946 ret |= VM_FAULT_WRITE;
1947 } else
1948 mem_cgroup_uncharge_page(new_page);
1949
1950 if (new_page)
1951 page_cache_release(new_page);
1952 if (old_page)
1953 page_cache_release(old_page);
1954unlock:
1955 pte_unmap_unlock(page_table, ptl);
1956 if (dirty_page) {
1957
1958
1959
1960
1961
1962
1963
1964
1965 if (!page_mkwrite) {
1966 wait_on_page_locked(dirty_page);
1967 set_page_dirty_balance(dirty_page, page_mkwrite);
1968 }
1969 put_page(dirty_page);
1970 if (page_mkwrite) {
1971 struct address_space *mapping = dirty_page->mapping;
1972
1973 set_page_dirty(dirty_page);
1974 unlock_page(dirty_page);
1975 page_cache_release(dirty_page);
1976 if (mapping) {
1977
1978
1979
1980
1981 balance_dirty_pages_ratelimited(mapping);
1982 }
1983 }
1984
1985
1986 if (vma->vm_file)
1987 file_update_time(vma->vm_file);
1988 }
1989 return ret;
1990oom_free_new:
1991 page_cache_release(new_page);
1992oom:
1993 if (old_page) {
1994 if (page_mkwrite) {
1995 unlock_page(old_page);
1996 page_cache_release(old_page);
1997 }
1998 page_cache_release(old_page);
1999 }
2000 return VM_FAULT_OOM;
2001
2002unwritable_page:
2003 page_cache_release(old_page);
2004 return ret;
2005}
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
/*
 * A truncate_count value is treated as a restart address when all of its
 * sub-page bits are clear.
 */
#define is_restart_addr(truncate_count) \
	((((truncate_count)) & ~PAGE_MASK) == 0)
2040
2041static void reset_vma_truncate_counts(struct address_space *mapping)
2042{
2043 struct vm_area_struct *vma;
2044 struct prio_tree_iter iter;
2045
2046 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2047 vma->vm_truncate_count = 0;
2048 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2049 vma->vm_truncate_count = 0;
2050}
2051
2052static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2053 unsigned long start_addr, unsigned long end_addr,
2054 struct zap_details *details)
2055{
2056 unsigned long restart_addr;
2057 int need_break;
2058
2059
2060
2061
2062
2063
2064
2065
2066again:
2067 restart_addr = vma->vm_truncate_count;
2068 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2069 start_addr = restart_addr;
2070 if (start_addr >= end_addr) {
2071
2072 vma->vm_truncate_count = details->truncate_count;
2073 return 0;
2074 }
2075 }
2076
2077 restart_addr = zap_page_range(vma, start_addr,
2078 end_addr - start_addr, details);
2079 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2080
2081 if (restart_addr >= end_addr) {
2082
2083 vma->vm_truncate_count = details->truncate_count;
2084 if (!need_break)
2085 return 0;
2086 } else {
2087
2088 vma->vm_truncate_count = restart_addr;
2089 if (!need_break)
2090 goto again;
2091 }
2092
2093 spin_unlock(details->i_mmap_lock);
2094 cond_resched();
2095 spin_lock(details->i_mmap_lock);
2096 return -EINTR;
2097}
2098
2099static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2100 struct zap_details *details)
2101{
2102 struct vm_area_struct *vma;
2103 struct prio_tree_iter iter;
2104 pgoff_t vba, vea, zba, zea;
2105
2106restart:
2107 vma_prio_tree_foreach(vma, &iter, root,
2108 details->first_index, details->last_index) {
2109
2110 if (vma->vm_truncate_count == details->truncate_count)
2111 continue;
2112
2113 vba = vma->vm_pgoff;
2114 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2115
2116 zba = details->first_index;
2117 if (zba < vba)
2118 zba = vba;
2119 zea = details->last_index;
2120 if (zea > vea)
2121 zea = vea;
2122
2123 if (unmap_mapping_range_vma(vma,
2124 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2125 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2126 details) < 0)
2127 goto restart;
2128 }
2129}
2130
2131static inline void unmap_mapping_range_list(struct list_head *head,
2132 struct zap_details *details)
2133{
2134 struct vm_area_struct *vma;
2135
2136
2137
2138
2139
2140
2141
2142restart:
2143 list_for_each_entry(vma, head, shared.vm_set.list) {
2144
2145 if (vma->vm_truncate_count == details->truncate_count)
2146 continue;
2147 details->nonlinear_vma = vma;
2148 if (unmap_mapping_range_vma(vma, vma->vm_start,
2149 vma->vm_end, details) < 0)
2150 goto restart;
2151 }
2152}
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168void unmap_mapping_range(struct address_space *mapping,
2169 loff_t const holebegin, loff_t const holelen, int even_cows)
2170{
2171 struct zap_details details;
2172 pgoff_t hba = holebegin >> PAGE_SHIFT;
2173 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2174
2175
2176 if (sizeof(holelen) > sizeof(hlen)) {
2177 long long holeend =
2178 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2179 if (holeend & ~(long long)ULONG_MAX)
2180 hlen = ULONG_MAX - hba + 1;
2181 }
2182
2183 details.check_mapping = even_cows? NULL: mapping;
2184 details.nonlinear_vma = NULL;
2185 details.first_index = hba;
2186 details.last_index = hba + hlen - 1;
2187 if (details.last_index < details.first_index)
2188 details.last_index = ULONG_MAX;
2189 details.i_mmap_lock = &mapping->i_mmap_lock;
2190
2191 spin_lock(&mapping->i_mmap_lock);
2192
2193
2194 mapping->truncate_count++;
2195 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2196 if (mapping->truncate_count == 0)
2197 reset_vma_truncate_counts(mapping);
2198 mapping->truncate_count++;
2199 }
2200 details.truncate_count = mapping->truncate_count;
2201
2202 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2203 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2204 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2205 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2206 spin_unlock(&mapping->i_mmap_lock);
2207}
2208EXPORT_SYMBOL(unmap_mapping_range);
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219int vmtruncate(struct inode * inode, loff_t offset)
2220{
2221 if (inode->i_size < offset) {
2222 unsigned long limit;
2223
2224 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2225 if (limit != RLIM_INFINITY && offset > limit)
2226 goto out_sig;
2227 if (offset > inode->i_sb->s_maxbytes)
2228 goto out_big;
2229 i_size_write(inode, offset);
2230 } else {
2231 struct address_space *mapping = inode->i_mapping;
2232
2233
2234
2235
2236
2237
2238 if (IS_SWAPFILE(inode))
2239 return -ETXTBSY;
2240 i_size_write(inode, offset);
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2252 truncate_inode_pages(mapping, offset);
2253 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2254 }
2255
2256 if (inode->i_op && inode->i_op->truncate)
2257 inode->i_op->truncate(inode);
2258 return 0;
2259
2260out_sig:
2261 send_sig(SIGXFSZ, current, 0);
2262out_big:
2263 return -EFBIG;
2264}
2265EXPORT_SYMBOL(vmtruncate);
2266
2267int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2268{
2269 struct address_space *mapping = inode->i_mapping;
2270
2271
2272
2273
2274
2275
2276 if (!inode->i_op || !inode->i_op->truncate_range)
2277 return -ENOSYS;
2278
2279 mutex_lock(&inode->i_mutex);
2280 down_write(&inode->i_alloc_sem);
2281 unmap_mapping_range(mapping, offset, (end - offset), 1);
2282 truncate_inode_pages_range(mapping, offset, end);
2283 unmap_mapping_range(mapping, offset, (end - offset), 1);
2284 inode->i_op->truncate_range(inode, offset, end);
2285 up_write(&inode->i_alloc_sem);
2286 mutex_unlock(&inode->i_mutex);
2287
2288 return 0;
2289}
2290
2291
2292
2293
2294
2295
2296static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2297 unsigned long address, pte_t *page_table, pmd_t *pmd,
2298 int write_access, pte_t orig_pte)
2299{
2300 spinlock_t *ptl;
2301 struct page *page;
2302 swp_entry_t entry;
2303 pte_t pte;
2304 int ret = 0;
2305
2306 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2307 goto out;
2308
2309 entry = pte_to_swp_entry(orig_pte);
2310 if (is_migration_entry(entry)) {
2311 migration_entry_wait(mm, pmd, address);
2312 goto out;
2313 }
2314 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2315 page = lookup_swap_cache(entry);
2316 if (!page) {
2317 grab_swap_token();
2318 page = swapin_readahead(entry,
2319 GFP_HIGHUSER_MOVABLE, vma, address);
2320 if (!page) {
2321
2322
2323
2324
2325 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2326 if (likely(pte_same(*page_table, orig_pte)))
2327 ret = VM_FAULT_OOM;
2328 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2329 goto unlock;
2330 }
2331
2332
2333 ret = VM_FAULT_MAJOR;
2334 count_vm_event(PGMAJFAULT);
2335 }
2336
2337 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2338 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2339 ret = VM_FAULT_OOM;
2340 goto out;
2341 }
2342
2343 mark_page_accessed(page);
2344 lock_page(page);
2345 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2346
2347
2348
2349
2350 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2351 if (unlikely(!pte_same(*page_table, orig_pte)))
2352 goto out_nomap;
2353
2354 if (unlikely(!PageUptodate(page))) {
2355 ret = VM_FAULT_SIGBUS;
2356 goto out_nomap;
2357 }
2358
2359
2360
2361 inc_mm_counter(mm, anon_rss);
2362 pte = mk_pte(page, vma->vm_page_prot);
2363 if (write_access && can_share_swap_page(page)) {
2364 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2365 write_access = 0;
2366 }
2367
2368 flush_icache_page(vma, page);
2369 set_pte_at(mm, address, page_table, pte);
2370 page_add_anon_rmap(page, vma, address);
2371
2372 swap_free(entry);
2373 if (vm_swap_full())
2374 remove_exclusive_swap_page(page);
2375 unlock_page(page);
2376
2377 if (write_access) {
2378 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2379 if (ret & VM_FAULT_ERROR)
2380 ret &= VM_FAULT_ERROR;
2381 goto out;
2382 }
2383
2384
2385 update_mmu_cache(vma, address, pte);
2386unlock:
2387 pte_unmap_unlock(page_table, ptl);
2388out:
2389 return ret;
2390out_nomap:
2391 mem_cgroup_uncharge_page(page);
2392 pte_unmap_unlock(page_table, ptl);
2393 unlock_page(page);
2394 page_cache_release(page);
2395 return ret;
2396}
2397
2398
2399
2400
2401
2402
2403static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2404 unsigned long address, pte_t *page_table, pmd_t *pmd,
2405 int write_access)
2406{
2407 struct page *page;
2408 spinlock_t *ptl;
2409 pte_t entry;
2410
2411
2412 pte_unmap(page_table);
2413
2414 if (unlikely(anon_vma_prepare(vma)))
2415 goto oom;
2416 page = alloc_zeroed_user_highpage_movable(vma, address);
2417 if (!page)
2418 goto oom;
2419 __SetPageUptodate(page);
2420
2421 if (mem_cgroup_charge(page, mm, GFP_KERNEL))
2422 goto oom_free_page;
2423
2424 entry = mk_pte(page, vma->vm_page_prot);
2425 if (vma->vm_flags & VM_WRITE)
2426 entry = pte_mkwrite(pte_mkdirty(entry));
2427
2428 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2429 if (!pte_none(*page_table))
2430 goto release;
2431 inc_mm_counter(mm, anon_rss);
2432 lru_cache_add_active(page);
2433 page_add_new_anon_rmap(page, vma, address);
2434 set_pte_at(mm, address, page_table, entry);
2435
2436
2437 update_mmu_cache(vma, address, entry);
2438unlock:
2439 pte_unmap_unlock(page_table, ptl);
2440 return 0;
2441release:
2442 mem_cgroup_uncharge_page(page);
2443 page_cache_release(page);
2444 goto unlock;
2445oom_free_page:
2446 page_cache_release(page);
2447oom:
2448 return VM_FAULT_OOM;
2449}
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2465 unsigned long address, pmd_t *pmd,
2466 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2467{
2468 pte_t *page_table;
2469 spinlock_t *ptl;
2470 struct page *page;
2471 pte_t entry;
2472 int anon = 0;
2473 struct page *dirty_page = NULL;
2474 struct vm_fault vmf;
2475 int ret;
2476 int page_mkwrite = 0;
2477
2478 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2479 vmf.pgoff = pgoff;
2480 vmf.flags = flags;
2481 vmf.page = NULL;
2482
2483 ret = vma->vm_ops->fault(vma, &vmf);
2484 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2485 return ret;
2486
2487
2488
2489
2490
2491 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2492 lock_page(vmf.page);
2493 else
2494 VM_BUG_ON(!PageLocked(vmf.page));
2495
2496
2497
2498
2499 page = vmf.page;
2500 if (flags & FAULT_FLAG_WRITE) {
2501 if (!(vma->vm_flags & VM_SHARED)) {
2502 anon = 1;
2503 if (unlikely(anon_vma_prepare(vma))) {
2504 ret = VM_FAULT_OOM;
2505 goto out;
2506 }
2507 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
2508 vma, address);
2509 if (!page) {
2510 ret = VM_FAULT_OOM;
2511 goto out;
2512 }
2513 copy_user_highpage(page, vmf.page, address, vma);
2514 __SetPageUptodate(page);
2515 } else {
2516
2517
2518
2519
2520
2521 if (vma->vm_ops->page_mkwrite) {
2522 int tmp;
2523
2524 unlock_page(page);
2525 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2526 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2527 if (unlikely(tmp &
2528 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2529 ret = tmp;
2530 goto unwritable_page;
2531 }
2532 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2533 lock_page(page);
2534 if (!page->mapping) {
2535 ret = 0;
2536 unlock_page(page);
2537 goto unwritable_page;
2538 }
2539 } else
2540 VM_BUG_ON(!PageLocked(page));
2541 page_mkwrite = 1;
2542 }
2543 }
2544
2545 }
2546
2547 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2548 ret = VM_FAULT_OOM;
2549 goto out;
2550 }
2551
2552 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565 if (likely(pte_same(*page_table, orig_pte))) {
2566 flush_icache_page(vma, page);
2567 entry = mk_pte(page, vma->vm_page_prot);
2568 if (flags & FAULT_FLAG_WRITE)
2569 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2570 set_pte_at(mm, address, page_table, entry);
2571 if (anon) {
2572 inc_mm_counter(mm, anon_rss);
2573 lru_cache_add_active(page);
2574 page_add_new_anon_rmap(page, vma, address);
2575 } else {
2576 inc_mm_counter(mm, file_rss);
2577 page_add_file_rmap(page);
2578 if (flags & FAULT_FLAG_WRITE) {
2579 dirty_page = page;
2580 get_page(dirty_page);
2581 }
2582 }
2583
2584
2585 update_mmu_cache(vma, address, entry);
2586 } else {
2587 mem_cgroup_uncharge_page(page);
2588 if (anon)
2589 page_cache_release(page);
2590 else
2591 anon = 1;
2592 }
2593
2594 pte_unmap_unlock(page_table, ptl);
2595
2596out:
2597 if (dirty_page) {
2598 struct address_space *mapping = page->mapping;
2599
2600 if (set_page_dirty(dirty_page))
2601 page_mkwrite = 1;
2602 unlock_page(dirty_page);
2603 put_page(dirty_page);
2604 if (page_mkwrite && mapping) {
2605
2606
2607
2608
2609 balance_dirty_pages_ratelimited(mapping);
2610 }
2611
2612
2613 if (vma->vm_file)
2614 file_update_time(vma->vm_file);
2615 } else {
2616 unlock_page(vmf.page);
2617 if (anon)
2618 page_cache_release(vmf.page);
2619 }
2620
2621 return ret;
2622
2623unwritable_page:
2624 page_cache_release(page);
2625 return ret;
2626}
2627
2628static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2629 unsigned long address, pte_t *page_table, pmd_t *pmd,
2630 int write_access, pte_t orig_pte)
2631{
2632 pgoff_t pgoff = (((address & PAGE_MASK)
2633 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2634 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2635
2636 pte_unmap(page_table);
2637 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2638}
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2650 unsigned long address, pte_t *page_table, pmd_t *pmd,
2651 int write_access, pte_t orig_pte)
2652{
2653 unsigned int flags = FAULT_FLAG_NONLINEAR |
2654 (write_access ? FAULT_FLAG_WRITE : 0);
2655 pgoff_t pgoff;
2656
2657 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2658 return 0;
2659
2660 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
2661 !(vma->vm_flags & VM_CAN_NONLINEAR))) {
2662
2663
2664
2665 print_bad_pte(vma, orig_pte, address);
2666 return VM_FAULT_OOM;
2667 }
2668
2669 pgoff = pte_to_pgoff(orig_pte);
2670 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2671}
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686static inline int handle_pte_fault(struct mm_struct *mm,
2687 struct vm_area_struct *vma, unsigned long address,
2688 pte_t *pte, pmd_t *pmd, int write_access)
2689{
2690 pte_t entry;
2691 spinlock_t *ptl;
2692
2693 entry = *pte;
2694 if (!pte_present(entry)) {
2695 if (pte_none(entry)) {
2696 if (vma->vm_ops) {
2697 if (likely(vma->vm_ops->fault))
2698 return do_linear_fault(mm, vma, address,
2699 pte, pmd, write_access, entry);
2700 }
2701 return do_anonymous_page(mm, vma, address,
2702 pte, pmd, write_access);
2703 }
2704 if (pte_file(entry))
2705 return do_nonlinear_fault(mm, vma, address,
2706 pte, pmd, write_access, entry);
2707 return do_swap_page(mm, vma, address,
2708 pte, pmd, write_access, entry);
2709 }
2710
2711 ptl = pte_lockptr(mm, pmd);
2712 spin_lock(ptl);
2713 if (unlikely(!pte_same(*pte, entry)))
2714 goto unlock;
2715 if (write_access) {
2716 if (!pte_write(entry))
2717 return do_wp_page(mm, vma, address,
2718 pte, pmd, ptl, entry);
2719 entry = pte_mkdirty(entry);
2720 }
2721 entry = pte_mkyoung(entry);
2722 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2723 update_mmu_cache(vma, address, entry);
2724 } else {
2725
2726
2727
2728
2729
2730
2731 if (write_access)
2732 flush_tlb_page(vma, address);
2733 }
2734unlock:
2735 pte_unmap_unlock(pte, ptl);
2736 return 0;
2737}
2738
2739
2740
2741
2742int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2743 unsigned long address, int write_access)
2744{
2745 pgd_t *pgd;
2746 pud_t *pud;
2747 pmd_t *pmd;
2748 pte_t *pte;
2749
2750 __set_current_state(TASK_RUNNING);
2751
2752 count_vm_event(PGFAULT);
2753
2754 if (unlikely(is_vm_hugetlb_page(vma)))
2755 return hugetlb_fault(mm, vma, address, write_access);
2756
2757 pgd = pgd_offset(mm, address);
2758 pud = pud_alloc(mm, pgd, address);
2759 if (!pud)
2760 return VM_FAULT_OOM;
2761 pmd = pmd_alloc(mm, pud, address);
2762 if (!pmd)
2763 return VM_FAULT_OOM;
2764 pte = pte_alloc_map(mm, pmd, address);
2765 if (!pte)
2766 return VM_FAULT_OOM;
2767
2768 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2769}
2770
2771#ifndef __PAGETABLE_PUD_FOLDED
2772
2773
2774
2775
2776int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2777{
2778 pud_t *new = pud_alloc_one(mm, address);
2779 if (!new)
2780 return -ENOMEM;
2781
2782 smp_wmb();
2783
2784 spin_lock(&mm->page_table_lock);
2785 if (pgd_present(*pgd))
2786 pud_free(mm, new);
2787 else
2788 pgd_populate(mm, pgd, new);
2789 spin_unlock(&mm->page_table_lock);
2790 return 0;
2791}
2792#endif
2793
2794#ifndef __PAGETABLE_PMD_FOLDED
2795
2796
2797
2798
2799int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2800{
2801 pmd_t *new = pmd_alloc_one(mm, address);
2802 if (!new)
2803 return -ENOMEM;
2804
2805 smp_wmb();
2806
2807 spin_lock(&mm->page_table_lock);
2808#ifndef __ARCH_HAS_4LEVEL_HACK
2809 if (pud_present(*pud))
2810 pmd_free(mm, new);
2811 else
2812 pud_populate(mm, pud, new);
2813#else
2814 if (pgd_present(*pud))
2815 pmd_free(mm, new);
2816 else
2817 pgd_populate(mm, pud, new);
2818#endif
2819 spin_unlock(&mm->page_table_lock);
2820 return 0;
2821}
2822#endif
2823
2824int make_pages_present(unsigned long addr, unsigned long end)
2825{
2826 int ret, len, write;
2827 struct vm_area_struct * vma;
2828
2829 vma = find_vma(current->mm, addr);
2830 if (!vma)
2831 return -ENOMEM;
2832 write = (vma->vm_flags & VM_WRITE) != 0;
2833 BUG_ON(addr >= end);
2834 BUG_ON(end > vma->vm_end);
2835 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2836 ret = get_user_pages(current, current->mm, addr,
2837 len, write, 0, NULL, NULL);
2838 if (ret < 0) {
2839
2840
2841
2842
2843
2844 if (ret == -EFAULT)
2845 ret = -ENOMEM;
2846 else if (ret == -ENOMEM)
2847 ret = -EAGAIN;
2848 return ret;
2849 }
2850 return ret == len ? 0 : -ENOMEM;
2851}
2852
2853#if !defined(__HAVE_ARCH_GATE_AREA)
2854
2855#if defined(AT_SYSINFO_EHDR)
2856static struct vm_area_struct gate_vma;
2857
2858static int __init gate_vma_init(void)
2859{
2860 gate_vma.vm_mm = NULL;
2861 gate_vma.vm_start = FIXADDR_USER_START;
2862 gate_vma.vm_end = FIXADDR_USER_END;
2863 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
2864 gate_vma.vm_page_prot = __P101;
2865
2866
2867
2868
2869
2870
2871 gate_vma.vm_flags |= VM_ALWAYSDUMP;
2872 return 0;
2873}
2874__initcall(gate_vma_init);
2875#endif
2876
2877struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2878{
2879#ifdef AT_SYSINFO_EHDR
2880 return &gate_vma;
2881#else
2882 return NULL;
2883#endif
2884}
2885
/*
 * Return 1 if @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END), else 0.
 * Always 0 when the build has no AT_SYSINFO_EHDR.
 */
int in_gate_area_no_task(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
		inside = 1;
#endif
	return inside;
}
2894
2895#endif
2896
2897#ifdef CONFIG_HAVE_IOREMAP_PROT
2898static resource_size_t follow_phys(struct vm_area_struct *vma,
2899 unsigned long address, unsigned int flags,
2900 unsigned long *prot)
2901{
2902 pgd_t *pgd;
2903 pud_t *pud;
2904 pmd_t *pmd;
2905 pte_t *ptep, pte;
2906 spinlock_t *ptl;
2907 resource_size_t phys_addr = 0;
2908 struct mm_struct *mm = vma->vm_mm;
2909
2910 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2911
2912 pgd = pgd_offset(mm, address);
2913 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2914 goto no_page_table;
2915
2916 pud = pud_offset(pgd, address);
2917 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2918 goto no_page_table;
2919
2920 pmd = pmd_offset(pud, address);
2921 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2922 goto no_page_table;
2923
2924
2925 if (pmd_huge(*pmd))
2926 goto no_page_table;
2927
2928 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2929 if (!ptep)
2930 goto out;
2931
2932 pte = *ptep;
2933 if (!pte_present(pte))
2934 goto unlock;
2935 if ((flags & FOLL_WRITE) && !pte_write(pte))
2936 goto unlock;
2937 phys_addr = pte_pfn(pte);
2938 phys_addr <<= PAGE_SHIFT;
2939
2940 *prot = pgprot_val(pte_pgprot(pte));
2941
2942unlock:
2943 pte_unmap_unlock(ptep, ptl);
2944out:
2945 return phys_addr;
2946no_page_table:
2947 return 0;
2948}
2949
2950int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2951 void *buf, int len, int write)
2952{
2953 resource_size_t phys_addr;
2954 unsigned long prot = 0;
2955 void *maddr;
2956 int offset = addr & (PAGE_SIZE-1);
2957
2958 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2959 return -EINVAL;
2960
2961 phys_addr = follow_phys(vma, addr, write, &prot);
2962
2963 if (!phys_addr)
2964 return -EINVAL;
2965
2966 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2967 if (write)
2968 memcpy_toio(maddr + offset, buf, len);
2969 else
2970 memcpy_fromio(buf, maddr + offset, len);
2971 iounmap(maddr);
2972
2973 return len;
2974}
2975#endif
2976
2977
2978
2979
2980
2981
2982int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2983{
2984 struct mm_struct *mm;
2985 struct vm_area_struct *vma;
2986 void *old_buf = buf;
2987
2988 mm = get_task_mm(tsk);
2989 if (!mm)
2990 return 0;
2991
2992 down_read(&mm->mmap_sem);
2993
2994 while (len) {
2995 int bytes, ret, offset;
2996 void *maddr;
2997 struct page *page = NULL;
2998
2999 ret = get_user_pages(tsk, mm, addr, 1,
3000 write, 1, &page, &vma);
3001 if (ret <= 0) {
3002
3003
3004
3005
3006#ifdef CONFIG_HAVE_IOREMAP_PROT
3007 vma = find_vma(mm, addr);
3008 if (!vma)
3009 break;
3010 if (vma->vm_ops && vma->vm_ops->access)
3011 ret = vma->vm_ops->access(vma, addr, buf,
3012 len, write);
3013 if (ret <= 0)
3014#endif
3015 break;
3016 bytes = ret;
3017 } else {
3018 bytes = len;
3019 offset = addr & (PAGE_SIZE-1);
3020 if (bytes > PAGE_SIZE-offset)
3021 bytes = PAGE_SIZE-offset;
3022
3023 maddr = kmap(page);
3024 if (write) {
3025 copy_to_user_page(vma, page, addr,
3026 maddr + offset, buf, bytes);
3027 set_page_dirty_lock(page);
3028 } else {
3029 copy_from_user_page(vma, page, addr,
3030 buf, maddr + offset, bytes);
3031 }
3032 kunmap(page);
3033 page_cache_release(page);
3034 }
3035 len -= bytes;
3036 buf += bytes;
3037 addr += bytes;
3038 }
3039 up_read(&mm->mmap_sem);
3040 mmput(mm);
3041
3042 return buf - old_buf;
3043}
3044
3045
3046
3047
3048void print_vma_addr(char *prefix, unsigned long ip)
3049{
3050 struct mm_struct *mm = current->mm;
3051 struct vm_area_struct *vma;
3052
3053
3054
3055
3056
3057 if (preempt_count())
3058 return;
3059
3060 down_read(&mm->mmap_sem);
3061 vma = find_vma(mm, ip);
3062 if (vma && vma->vm_file) {
3063 struct file *f = vma->vm_file;
3064 char *buf = (char *)__get_free_page(GFP_KERNEL);
3065 if (buf) {
3066 char *p, *s;
3067
3068 p = d_path(&f->f_path, buf, PAGE_SIZE);
3069 if (IS_ERR(p))
3070 p = "?";
3071 s = strrchr(p, '/');
3072 if (s)
3073 p = s+1;
3074 printk("%s%s[%lx+%lx]", prefix, p,
3075 vma->vm_start,
3076 vma->vm_end - vma->vm_start);
3077 free_page((unsigned long)buf);
3078 }
3079 }
3080 up_read(¤t->mm->mmap_sem);
3081}
3082