1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/rmap.h>
49#include <linux/module.h>
50#include <linux/delayacct.h>
51#include <linux/init.h>
52#include <linux/writeback.h>
53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
55#include <linux/kallsyms.h>
56#include <linux/swapops.h>
57#include <linux/elf.h>
58
59#include <asm/pgalloc.h>
60#include <asm/uaccess.h>
61#include <asm/tlb.h>
62#include <asm/tlbflush.h>
63#include <asm/pgtable.h>
64
65#include "internal.h"
66
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * These two globals are defined and exported from this file only when
 * CONFIG_NEED_MULTIPLE_NODES is not set.
 */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
75
/*
 * Exported global counter.
 * NOTE(review): presumably the number of physical pages - confirm against
 * the code that writes it (not visible in this file section).
 */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

/*
 * Exported global address.
 * NOTE(review): presumably the upper bound of directly mapped memory -
 * confirm against the architecture code that sets it.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);
88
89
90
91
92
93
94
95int randomize_va_space __read_mostly =
96#ifdef CONFIG_COMPAT_BRK
97 1;
98#else
99 2;
100#endif
101
102static int __init disable_randmaps(char *s)
103{
104 randomize_va_space = 0;
105 return 1;
106}
107__setup("norandmaps", disable_randmaps);
108
109
110
111
112
113
114
115
116void pgd_clear_bad(pgd_t *pgd)
117{
118 pgd_ERROR(*pgd);
119 pgd_clear(pgd);
120}
121
122void pud_clear_bad(pud_t *pud)
123{
124 pud_ERROR(*pud);
125 pud_clear(pud);
126}
127
128void pmd_clear_bad(pmd_t *pmd)
129{
130 pmd_ERROR(*pmd);
131 pmd_clear(pmd);
132}
133
134
135
136
137
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
139 unsigned long addr)
140{
141 pgtable_t token = pmd_pgtable(*pmd);
142 pmd_clear(pmd);
143 pte_free_tlb(tlb, token, addr);
144 tlb->mm->nr_ptes--;
145}
146
147static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
148 unsigned long addr, unsigned long end,
149 unsigned long floor, unsigned long ceiling)
150{
151 pmd_t *pmd;
152 unsigned long next;
153 unsigned long start;
154
155 start = addr;
156 pmd = pmd_offset(pud, addr);
157 do {
158 next = pmd_addr_end(addr, end);
159 if (pmd_none_or_clear_bad(pmd))
160 continue;
161 free_pte_range(tlb, pmd, addr);
162 } while (pmd++, addr = next, addr != end);
163
164 start &= PUD_MASK;
165 if (start < floor)
166 return;
167 if (ceiling) {
168 ceiling &= PUD_MASK;
169 if (!ceiling)
170 return;
171 }
172 if (end - 1 > ceiling - 1)
173 return;
174
175 pmd = pmd_offset(pud, start);
176 pud_clear(pud);
177 pmd_free_tlb(tlb, pmd, start);
178}
179
180static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
181 unsigned long addr, unsigned long end,
182 unsigned long floor, unsigned long ceiling)
183{
184 pud_t *pud;
185 unsigned long next;
186 unsigned long start;
187
188 start = addr;
189 pud = pud_offset(pgd, addr);
190 do {
191 next = pud_addr_end(addr, end);
192 if (pud_none_or_clear_bad(pud))
193 continue;
194 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
195 } while (pud++, addr = next, addr != end);
196
197 start &= PGDIR_MASK;
198 if (start < floor)
199 return;
200 if (ceiling) {
201 ceiling &= PGDIR_MASK;
202 if (!ceiling)
203 return;
204 }
205 if (end - 1 > ceiling - 1)
206 return;
207
208 pud = pud_offset(pgd, start);
209 pgd_clear(pgd);
210 pud_free_tlb(tlb, pud, start);
211}
212
213
214
215
216
217
218void free_pgd_range(struct mmu_gather *tlb,
219 unsigned long addr, unsigned long end,
220 unsigned long floor, unsigned long ceiling)
221{
222 pgd_t *pgd;
223 unsigned long next;
224 unsigned long start;
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252 addr &= PMD_MASK;
253 if (addr < floor) {
254 addr += PMD_SIZE;
255 if (!addr)
256 return;
257 }
258 if (ceiling) {
259 ceiling &= PMD_MASK;
260 if (!ceiling)
261 return;
262 }
263 if (end - 1 > ceiling - 1)
264 end -= PMD_SIZE;
265 if (addr > end - 1)
266 return;
267
268 start = addr;
269 pgd = pgd_offset(tlb->mm, addr);
270 do {
271 next = pgd_addr_end(addr, end);
272 if (pgd_none_or_clear_bad(pgd))
273 continue;
274 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
275 } while (pgd++, addr = next, addr != end);
276}
277
278void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
279 unsigned long floor, unsigned long ceiling)
280{
281 while (vma) {
282 struct vm_area_struct *next = vma->vm_next;
283 unsigned long addr = vma->vm_start;
284
285
286
287
288 anon_vma_unlink(vma);
289 unlink_file_vma(vma);
290
291 if (is_vm_hugetlb_page(vma)) {
292 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
293 floor, next? next->vm_start: ceiling);
294 } else {
295
296
297
298 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
299 && !is_vm_hugetlb_page(next)) {
300 vma = next;
301 next = vma->vm_next;
302 anon_vma_unlink(vma);
303 unlink_file_vma(vma);
304 }
305 free_pgd_range(tlb, addr, vma->vm_end,
306 floor, next? next->vm_start: ceiling);
307 }
308 vma = next;
309 }
310}
311
312int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
313{
314 pgtable_t new = pte_alloc_one(mm, address);
315 if (!new)
316 return -ENOMEM;
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331 smp_wmb();
332
333 spin_lock(&mm->page_table_lock);
334 if (!pmd_present(*pmd)) {
335 mm->nr_ptes++;
336 pmd_populate(mm, pmd, new);
337 new = NULL;
338 }
339 spin_unlock(&mm->page_table_lock);
340 if (new)
341 pte_free(mm, new);
342 return 0;
343}
344
345int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
346{
347 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
348 if (!new)
349 return -ENOMEM;
350
351 smp_wmb();
352
353 spin_lock(&init_mm.page_table_lock);
354 if (!pmd_present(*pmd)) {
355 pmd_populate_kernel(&init_mm, pmd, new);
356 new = NULL;
357 }
358 spin_unlock(&init_mm.page_table_lock);
359 if (new)
360 pte_free_kernel(&init_mm, new);
361 return 0;
362}
363
/*
 * Apply the (possibly negative) deltas @file_rss and @anon_rss to @mm's
 * counters, skipping the update for a delta of zero.
 *
 * NOTE(review): add_mm_counter() is defined elsewhere; its second argument
 * presumably names the counter, which is why the parameter names here must
 * stay "file_rss" and "anon_rss" - confirm before renaming.
 */
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss != 0)
		add_mm_counter(mm, file_rss, file_rss);

	if (anon_rss != 0)
		add_mm_counter(mm, anon_rss, anon_rss);
}
371
372
373
374
375
376
377
378
379static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
380 pte_t pte, struct page *page)
381{
382 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
383 pud_t *pud = pud_offset(pgd, addr);
384 pmd_t *pmd = pmd_offset(pud, addr);
385 struct address_space *mapping;
386 pgoff_t index;
387 static unsigned long resume;
388 static unsigned long nr_shown;
389 static unsigned long nr_unshown;
390
391
392
393
394
395 if (nr_shown == 60) {
396 if (time_before(jiffies, resume)) {
397 nr_unshown++;
398 return;
399 }
400 if (nr_unshown) {
401 printk(KERN_ALERT
402 "BUG: Bad page map: %lu messages suppressed\n",
403 nr_unshown);
404 nr_unshown = 0;
405 }
406 nr_shown = 0;
407 }
408 if (nr_shown++ == 0)
409 resume = jiffies + 60 * HZ;
410
411 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
412 index = linear_page_index(vma, addr);
413
414 printk(KERN_ALERT
415 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
416 current->comm,
417 (long long)pte_val(pte), (long long)pmd_val(*pmd));
418 if (page) {
419 printk(KERN_ALERT
420 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
421 page, (void *)page->flags, page_count(page),
422 page_mapcount(page), page->mapping, page->index);
423 }
424 printk(KERN_ALERT
425 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
426 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
427
428
429
430 if (vma->vm_ops)
431 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
432 (unsigned long)vma->vm_ops->fault);
433 if (vma->vm_file && vma->vm_file->f_op)
434 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
435 (unsigned long)vma->vm_file->f_op->mmap);
436 dump_stack();
437 add_taint(TAINT_BAD_PAGE);
438}
439
440static inline int is_cow_mapping(unsigned int flags)
441{
442 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
443}
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
/*
 * HAVE_PTE_SPECIAL is 1 when the architecture defines
 * __HAVE_ARCH_PTE_SPECIAL and 0 otherwise, so it can be tested in ordinary
 * C conditions.
 */
#ifndef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 0
#else
# define HAVE_PTE_SPECIAL 1
#endif
492struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
493 pte_t pte)
494{
495 unsigned long pfn = pte_pfn(pte);
496
497 if (HAVE_PTE_SPECIAL) {
498 if (likely(!pte_special(pte)))
499 goto check_pfn;
500 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
501 print_bad_pte(vma, addr, pte, NULL);
502 return NULL;
503 }
504
505
506
507 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
508 if (vma->vm_flags & VM_MIXEDMAP) {
509 if (!pfn_valid(pfn))
510 return NULL;
511 goto out;
512 } else {
513 unsigned long off;
514 off = (addr - vma->vm_start) >> PAGE_SHIFT;
515 if (pfn == vma->vm_pgoff + off)
516 return NULL;
517 if (!is_cow_mapping(vma->vm_flags))
518 return NULL;
519 }
520 }
521
522check_pfn:
523 if (unlikely(pfn > highest_memmap_pfn)) {
524 print_bad_pte(vma, addr, pte, NULL);
525 return NULL;
526 }
527
528
529
530
531
532out:
533 return pfn_to_page(pfn);
534}
535
536
537
538
539
540
541
542static inline void
543copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
544 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
545 unsigned long addr, int *rss)
546{
547 unsigned long vm_flags = vma->vm_flags;
548 pte_t pte = *src_pte;
549 struct page *page;
550
551
552 if (unlikely(!pte_present(pte))) {
553 if (!pte_file(pte)) {
554 swp_entry_t entry = pte_to_swp_entry(pte);
555
556 swap_duplicate(entry);
557
558 if (unlikely(list_empty(&dst_mm->mmlist))) {
559 spin_lock(&mmlist_lock);
560 if (list_empty(&dst_mm->mmlist))
561 list_add(&dst_mm->mmlist,
562 &src_mm->mmlist);
563 spin_unlock(&mmlist_lock);
564 }
565 if (is_write_migration_entry(entry) &&
566 is_cow_mapping(vm_flags)) {
567
568
569
570
571 make_migration_entry_read(&entry);
572 pte = swp_entry_to_pte(entry);
573 set_pte_at(src_mm, addr, src_pte, pte);
574 }
575 }
576 goto out_set_pte;
577 }
578
579
580
581
582
583 if (is_cow_mapping(vm_flags)) {
584 ptep_set_wrprotect(src_mm, addr, src_pte);
585 pte = pte_wrprotect(pte);
586 }
587
588
589
590
591
592 if (vm_flags & VM_SHARED)
593 pte = pte_mkclean(pte);
594 pte = pte_mkold(pte);
595
596 page = vm_normal_page(vma, addr, pte);
597 if (page) {
598 get_page(page);
599 page_dup_rmap(page, vma, addr);
600 rss[!!PageAnon(page)]++;
601 }
602
603out_set_pte:
604 set_pte_at(dst_mm, addr, dst_pte, pte);
605}
606
607static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
608 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
609 unsigned long addr, unsigned long end)
610{
611 pte_t *src_pte, *dst_pte;
612 spinlock_t *src_ptl, *dst_ptl;
613 int progress = 0;
614 int rss[2];
615
616again:
617 rss[1] = rss[0] = 0;
618 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
619 if (!dst_pte)
620 return -ENOMEM;
621 src_pte = pte_offset_map_nested(src_pmd, addr);
622 src_ptl = pte_lockptr(src_mm, src_pmd);
623 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
624 arch_enter_lazy_mmu_mode();
625
626 do {
627
628
629
630
631 if (progress >= 32) {
632 progress = 0;
633 if (need_resched() ||
634 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
635 break;
636 }
637 if (pte_none(*src_pte)) {
638 progress++;
639 continue;
640 }
641 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
642 progress += 8;
643 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
644
645 arch_leave_lazy_mmu_mode();
646 spin_unlock(src_ptl);
647 pte_unmap_nested(src_pte - 1);
648 add_mm_rss(dst_mm, rss[0], rss[1]);
649 pte_unmap_unlock(dst_pte - 1, dst_ptl);
650 cond_resched();
651 if (addr != end)
652 goto again;
653 return 0;
654}
655
656static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
657 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
658 unsigned long addr, unsigned long end)
659{
660 pmd_t *src_pmd, *dst_pmd;
661 unsigned long next;
662
663 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
664 if (!dst_pmd)
665 return -ENOMEM;
666 src_pmd = pmd_offset(src_pud, addr);
667 do {
668 next = pmd_addr_end(addr, end);
669 if (pmd_none_or_clear_bad(src_pmd))
670 continue;
671 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
672 vma, addr, next))
673 return -ENOMEM;
674 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
675 return 0;
676}
677
678static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
679 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
680 unsigned long addr, unsigned long end)
681{
682 pud_t *src_pud, *dst_pud;
683 unsigned long next;
684
685 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
686 if (!dst_pud)
687 return -ENOMEM;
688 src_pud = pud_offset(src_pgd, addr);
689 do {
690 next = pud_addr_end(addr, end);
691 if (pud_none_or_clear_bad(src_pud))
692 continue;
693 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
694 vma, addr, next))
695 return -ENOMEM;
696 } while (dst_pud++, src_pud++, addr = next, addr != end);
697 return 0;
698}
699
700int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
701 struct vm_area_struct *vma)
702{
703 pgd_t *src_pgd, *dst_pgd;
704 unsigned long next;
705 unsigned long addr = vma->vm_start;
706 unsigned long end = vma->vm_end;
707 int ret;
708
709
710
711
712
713
714
715 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
716 if (!vma->anon_vma)
717 return 0;
718 }
719
720 if (is_vm_hugetlb_page(vma))
721 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
722
723 if (unlikely(is_pfn_mapping(vma))) {
724
725
726
727
728 ret = track_pfn_vma_copy(vma);
729 if (ret)
730 return ret;
731 }
732
733
734
735
736
737
738
739 if (is_cow_mapping(vma->vm_flags))
740 mmu_notifier_invalidate_range_start(src_mm, addr, end);
741
742 ret = 0;
743 dst_pgd = pgd_offset(dst_mm, addr);
744 src_pgd = pgd_offset(src_mm, addr);
745 do {
746 next = pgd_addr_end(addr, end);
747 if (pgd_none_or_clear_bad(src_pgd))
748 continue;
749 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
750 vma, addr, next))) {
751 ret = -ENOMEM;
752 break;
753 }
754 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
755
756 if (is_cow_mapping(vma->vm_flags))
757 mmu_notifier_invalidate_range_end(src_mm,
758 vma->vm_start, end);
759 return ret;
760}
761
762static unsigned long zap_pte_range(struct mmu_gather *tlb,
763 struct vm_area_struct *vma, pmd_t *pmd,
764 unsigned long addr, unsigned long end,
765 long *zap_work, struct zap_details *details)
766{
767 struct mm_struct *mm = tlb->mm;
768 pte_t *pte;
769 spinlock_t *ptl;
770 int file_rss = 0;
771 int anon_rss = 0;
772
773 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
774 arch_enter_lazy_mmu_mode();
775 do {
776 pte_t ptent = *pte;
777 if (pte_none(ptent)) {
778 (*zap_work)--;
779 continue;
780 }
781
782 (*zap_work) -= PAGE_SIZE;
783
784 if (pte_present(ptent)) {
785 struct page *page;
786
787 page = vm_normal_page(vma, addr, ptent);
788 if (unlikely(details) && page) {
789
790
791
792
793
794 if (details->check_mapping &&
795 details->check_mapping != page->mapping)
796 continue;
797
798
799
800
801 if (details->nonlinear_vma &&
802 (page->index < details->first_index ||
803 page->index > details->last_index))
804 continue;
805 }
806 ptent = ptep_get_and_clear_full(mm, addr, pte,
807 tlb->fullmm);
808 tlb_remove_tlb_entry(tlb, pte, addr);
809 if (unlikely(!page))
810 continue;
811 if (unlikely(details) && details->nonlinear_vma
812 && linear_page_index(details->nonlinear_vma,
813 addr) != page->index)
814 set_pte_at(mm, addr, pte,
815 pgoff_to_pte(page->index));
816 if (PageAnon(page))
817 anon_rss--;
818 else {
819 if (pte_dirty(ptent))
820 set_page_dirty(page);
821 if (pte_young(ptent) &&
822 likely(!VM_SequentialReadHint(vma)))
823 mark_page_accessed(page);
824 file_rss--;
825 }
826 page_remove_rmap(page);
827 if (unlikely(page_mapcount(page) < 0))
828 print_bad_pte(vma, addr, ptent, page);
829 tlb_remove_page(tlb, page);
830 continue;
831 }
832
833
834
835
836 if (unlikely(details))
837 continue;
838 if (pte_file(ptent)) {
839 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
840 print_bad_pte(vma, addr, ptent, NULL);
841 } else if
842 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
843 print_bad_pte(vma, addr, ptent, NULL);
844 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
845 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
846
847 add_mm_rss(mm, file_rss, anon_rss);
848 arch_leave_lazy_mmu_mode();
849 pte_unmap_unlock(pte - 1, ptl);
850
851 return addr;
852}
853
854static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
855 struct vm_area_struct *vma, pud_t *pud,
856 unsigned long addr, unsigned long end,
857 long *zap_work, struct zap_details *details)
858{
859 pmd_t *pmd;
860 unsigned long next;
861
862 pmd = pmd_offset(pud, addr);
863 do {
864 next = pmd_addr_end(addr, end);
865 if (pmd_none_or_clear_bad(pmd)) {
866 (*zap_work)--;
867 continue;
868 }
869 next = zap_pte_range(tlb, vma, pmd, addr, next,
870 zap_work, details);
871 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
872
873 return addr;
874}
875
876static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
877 struct vm_area_struct *vma, pgd_t *pgd,
878 unsigned long addr, unsigned long end,
879 long *zap_work, struct zap_details *details)
880{
881 pud_t *pud;
882 unsigned long next;
883
884 pud = pud_offset(pgd, addr);
885 do {
886 next = pud_addr_end(addr, end);
887 if (pud_none_or_clear_bad(pud)) {
888 (*zap_work)--;
889 continue;
890 }
891 next = zap_pmd_range(tlb, vma, pud, addr, next,
892 zap_work, details);
893 } while (pud++, addr = next, (addr != end && *zap_work > 0));
894
895 return addr;
896}
897
898static unsigned long unmap_page_range(struct mmu_gather *tlb,
899 struct vm_area_struct *vma,
900 unsigned long addr, unsigned long end,
901 long *zap_work, struct zap_details *details)
902{
903 pgd_t *pgd;
904 unsigned long next;
905
906 if (details && !details->check_mapping && !details->nonlinear_vma)
907 details = NULL;
908
909 BUG_ON(addr >= end);
910 tlb_start_vma(tlb, vma);
911 pgd = pgd_offset(vma->vm_mm, addr);
912 do {
913 next = pgd_addr_end(addr, end);
914 if (pgd_none_or_clear_bad(pgd)) {
915 (*zap_work)--;
916 continue;
917 }
918 next = zap_pud_range(tlb, vma, pgd, addr, next,
919 zap_work, details);
920 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
921 tlb_end_vma(tlb, vma);
922
923 return addr;
924}
925
/*
 * Work budget handed to each zap batch in unmap_vmas(): 8 pages' worth
 * with CONFIG_PREEMPT, 1024 pages' worth without.
 */
#ifndef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#else
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#endif
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959unsigned long unmap_vmas(struct mmu_gather **tlbp,
960 struct vm_area_struct *vma, unsigned long start_addr,
961 unsigned long end_addr, unsigned long *nr_accounted,
962 struct zap_details *details)
963{
964 long zap_work = ZAP_BLOCK_SIZE;
965 unsigned long tlb_start = 0;
966 int tlb_start_valid = 0;
967 unsigned long start = start_addr;
968 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
969 int fullmm = (*tlbp)->fullmm;
970 struct mm_struct *mm = vma->vm_mm;
971
972 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
973 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
974 unsigned long end;
975
976 start = max(vma->vm_start, start_addr);
977 if (start >= vma->vm_end)
978 continue;
979 end = min(vma->vm_end, end_addr);
980 if (end <= vma->vm_start)
981 continue;
982
983 if (vma->vm_flags & VM_ACCOUNT)
984 *nr_accounted += (end - start) >> PAGE_SHIFT;
985
986 if (unlikely(is_pfn_mapping(vma)))
987 untrack_pfn_vma(vma, 0, 0);
988
989 while (start != end) {
990 if (!tlb_start_valid) {
991 tlb_start = start;
992 tlb_start_valid = 1;
993 }
994
995 if (unlikely(is_vm_hugetlb_page(vma))) {
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007 if (vma->vm_file) {
1008 unmap_hugepage_range(vma, start, end, NULL);
1009 zap_work -= (end - start) /
1010 pages_per_huge_page(hstate_vma(vma));
1011 }
1012
1013 start = end;
1014 } else
1015 start = unmap_page_range(*tlbp, vma,
1016 start, end, &zap_work, details);
1017
1018 if (zap_work > 0) {
1019 BUG_ON(start != end);
1020 break;
1021 }
1022
1023 tlb_finish_mmu(*tlbp, tlb_start, start);
1024
1025 if (need_resched() ||
1026 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1027 if (i_mmap_lock) {
1028 *tlbp = NULL;
1029 goto out;
1030 }
1031 cond_resched();
1032 }
1033
1034 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1035 tlb_start_valid = 0;
1036 zap_work = ZAP_BLOCK_SIZE;
1037 }
1038 }
1039out:
1040 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1041 return start;
1042}
1043
1044
1045
1046
1047
1048
1049
1050
1051unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1052 unsigned long size, struct zap_details *details)
1053{
1054 struct mm_struct *mm = vma->vm_mm;
1055 struct mmu_gather *tlb;
1056 unsigned long end = address + size;
1057 unsigned long nr_accounted = 0;
1058
1059 lru_add_drain();
1060 tlb = tlb_gather_mmu(mm, 0);
1061 update_hiwater_rss(mm);
1062 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1063 if (tlb)
1064 tlb_finish_mmu(tlb, address, end);
1065 return end;
1066}
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1081 unsigned long size)
1082{
1083 if (address < vma->vm_start || address + size > vma->vm_end ||
1084 !(vma->vm_flags & VM_PFNMAP))
1085 return -1;
1086 zap_page_range(vma, address, size, NULL);
1087 return 0;
1088}
1089EXPORT_SYMBOL_GPL(zap_vma_ptes);
1090
1091
1092
1093
1094struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1095 unsigned int flags)
1096{
1097 pgd_t *pgd;
1098 pud_t *pud;
1099 pmd_t *pmd;
1100 pte_t *ptep, pte;
1101 spinlock_t *ptl;
1102 struct page *page;
1103 struct mm_struct *mm = vma->vm_mm;
1104
1105 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1106 if (!IS_ERR(page)) {
1107 BUG_ON(flags & FOLL_GET);
1108 goto out;
1109 }
1110
1111 page = NULL;
1112 pgd = pgd_offset(mm, address);
1113 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1114 goto no_page_table;
1115
1116 pud = pud_offset(pgd, address);
1117 if (pud_none(*pud))
1118 goto no_page_table;
1119 if (pud_huge(*pud)) {
1120 BUG_ON(flags & FOLL_GET);
1121 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1122 goto out;
1123 }
1124 if (unlikely(pud_bad(*pud)))
1125 goto no_page_table;
1126
1127 pmd = pmd_offset(pud, address);
1128 if (pmd_none(*pmd))
1129 goto no_page_table;
1130 if (pmd_huge(*pmd)) {
1131 BUG_ON(flags & FOLL_GET);
1132 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1133 goto out;
1134 }
1135 if (unlikely(pmd_bad(*pmd)))
1136 goto no_page_table;
1137
1138 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1139
1140 pte = *ptep;
1141 if (!pte_present(pte))
1142 goto no_page;
1143 if ((flags & FOLL_WRITE) && !pte_write(pte))
1144 goto unlock;
1145 page = vm_normal_page(vma, address, pte);
1146 if (unlikely(!page))
1147 goto bad_page;
1148
1149 if (flags & FOLL_GET)
1150 get_page(page);
1151 if (flags & FOLL_TOUCH) {
1152 if ((flags & FOLL_WRITE) &&
1153 !pte_dirty(pte) && !PageDirty(page))
1154 set_page_dirty(page);
1155
1156
1157
1158
1159
1160 mark_page_accessed(page);
1161 }
1162unlock:
1163 pte_unmap_unlock(ptep, ptl);
1164out:
1165 return page;
1166
1167bad_page:
1168 pte_unmap_unlock(ptep, ptl);
1169 return ERR_PTR(-EFAULT);
1170
1171no_page:
1172 pte_unmap_unlock(ptep, ptl);
1173 if (!pte_none(pte))
1174 return page;
1175
1176no_page_table:
1177
1178
1179
1180
1181 if (flags & FOLL_ANON) {
1182 page = ZERO_PAGE(0);
1183 if (flags & FOLL_GET)
1184 get_page(page);
1185 BUG_ON(flags & FOLL_WRITE);
1186 }
1187 return page;
1188}
1189
1190
1191static inline int use_zero_page(struct vm_area_struct *vma)
1192{
1193
1194
1195
1196
1197
1198
1199
1200 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1201 return 0;
1202
1203
1204
1205 return !vma->vm_ops || !vma->vm_ops->fault;
1206}
1207
1208
1209
1210int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1211 unsigned long start, int nr_pages, int flags,
1212 struct page **pages, struct vm_area_struct **vmas)
1213{
1214 int i;
1215 unsigned int vm_flags = 0;
1216 int write = !!(flags & GUP_FLAGS_WRITE);
1217 int force = !!(flags & GUP_FLAGS_FORCE);
1218 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1219 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1220
1221 if (nr_pages <= 0)
1222 return 0;
1223
1224
1225
1226
1227 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1228 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1229 i = 0;
1230
1231 do {
1232 struct vm_area_struct *vma;
1233 unsigned int foll_flags;
1234
1235 vma = find_extend_vma(mm, start);
1236 if (!vma && in_gate_area(tsk, start)) {
1237 unsigned long pg = start & PAGE_MASK;
1238 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1239 pgd_t *pgd;
1240 pud_t *pud;
1241 pmd_t *pmd;
1242 pte_t *pte;
1243
1244
1245 if (!ignore && write)
1246 return i ? : -EFAULT;
1247 if (pg > TASK_SIZE)
1248 pgd = pgd_offset_k(pg);
1249 else
1250 pgd = pgd_offset_gate(mm, pg);
1251 BUG_ON(pgd_none(*pgd));
1252 pud = pud_offset(pgd, pg);
1253 BUG_ON(pud_none(*pud));
1254 pmd = pmd_offset(pud, pg);
1255 if (pmd_none(*pmd))
1256 return i ? : -EFAULT;
1257 pte = pte_offset_map(pmd, pg);
1258 if (pte_none(*pte)) {
1259 pte_unmap(pte);
1260 return i ? : -EFAULT;
1261 }
1262 if (pages) {
1263 struct page *page = vm_normal_page(gate_vma, start, *pte);
1264 pages[i] = page;
1265 if (page)
1266 get_page(page);
1267 }
1268 pte_unmap(pte);
1269 if (vmas)
1270 vmas[i] = gate_vma;
1271 i++;
1272 start += PAGE_SIZE;
1273 nr_pages--;
1274 continue;
1275 }
1276
1277 if (!vma ||
1278 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1279 (!ignore && !(vm_flags & vma->vm_flags)))
1280 return i ? : -EFAULT;
1281
1282 if (is_vm_hugetlb_page(vma)) {
1283 i = follow_hugetlb_page(mm, vma, pages, vmas,
1284 &start, &nr_pages, i, write);
1285 continue;
1286 }
1287
1288 foll_flags = FOLL_TOUCH;
1289 if (pages)
1290 foll_flags |= FOLL_GET;
1291 if (!write && use_zero_page(vma))
1292 foll_flags |= FOLL_ANON;
1293
1294 do {
1295 struct page *page;
1296
1297
1298
1299
1300
1301
1302
1303
1304 if (unlikely(!ignore_sigkill &&
1305 fatal_signal_pending(current)))
1306 return i ? i : -ERESTARTSYS;
1307
1308 if (write)
1309 foll_flags |= FOLL_WRITE;
1310
1311 cond_resched();
1312 while (!(page = follow_page(vma, start, foll_flags))) {
1313 int ret;
1314
1315 ret = handle_mm_fault(mm, vma, start,
1316 (foll_flags & FOLL_WRITE) ?
1317 FAULT_FLAG_WRITE : 0);
1318
1319 if (ret & VM_FAULT_ERROR) {
1320 if (ret & VM_FAULT_OOM)
1321 return i ? i : -ENOMEM;
1322 else if (ret & VM_FAULT_SIGBUS)
1323 return i ? i : -EFAULT;
1324 BUG();
1325 }
1326 if (ret & VM_FAULT_MAJOR)
1327 tsk->maj_flt++;
1328 else
1329 tsk->min_flt++;
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343 if ((ret & VM_FAULT_WRITE) &&
1344 !(vma->vm_flags & VM_WRITE))
1345 foll_flags &= ~FOLL_WRITE;
1346
1347 cond_resched();
1348 }
1349 if (IS_ERR(page))
1350 return i ? i : PTR_ERR(page);
1351 if (pages) {
1352 pages[i] = page;
1353
1354 flush_anon_page(vma, page, start);
1355 flush_dcache_page(page);
1356 }
1357 if (vmas)
1358 vmas[i] = vma;
1359 i++;
1360 start += PAGE_SIZE;
1361 nr_pages--;
1362 } while (nr_pages && start < vma->vm_end);
1363 } while (nr_pages);
1364 return i;
1365}
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1418 unsigned long start, int nr_pages, int write, int force,
1419 struct page **pages, struct vm_area_struct **vmas)
1420{
1421 int flags = 0;
1422
1423 if (write)
1424 flags |= GUP_FLAGS_WRITE;
1425 if (force)
1426 flags |= GUP_FLAGS_FORCE;
1427
1428 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1429}
1430
1431EXPORT_SYMBOL(get_user_pages);
1432
1433pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1434 spinlock_t **ptl)
1435{
1436 pgd_t * pgd = pgd_offset(mm, addr);
1437 pud_t * pud = pud_alloc(mm, pgd, addr);
1438 if (pud) {
1439 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1440 if (pmd)
1441 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1442 }
1443 return NULL;
1444}
1445
1446
1447
1448
1449
1450
1451
1452
1453static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1454 struct page *page, pgprot_t prot)
1455{
1456 struct mm_struct *mm = vma->vm_mm;
1457 int retval;
1458 pte_t *pte;
1459 spinlock_t *ptl;
1460
1461 retval = -EINVAL;
1462 if (PageAnon(page))
1463 goto out;
1464 retval = -ENOMEM;
1465 flush_dcache_page(page);
1466 pte = get_locked_pte(mm, addr, &ptl);
1467 if (!pte)
1468 goto out;
1469 retval = -EBUSY;
1470 if (!pte_none(*pte))
1471 goto out_unlock;
1472
1473
1474 get_page(page);
1475 inc_mm_counter(mm, file_rss);
1476 page_add_file_rmap(page);
1477 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1478
1479 retval = 0;
1480 pte_unmap_unlock(pte, ptl);
1481 return retval;
1482out_unlock:
1483 pte_unmap_unlock(pte, ptl);
1484out:
1485 return retval;
1486}
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1511 struct page *page)
1512{
1513 if (addr < vma->vm_start || addr >= vma->vm_end)
1514 return -EFAULT;
1515 if (!page_count(page))
1516 return -EINVAL;
1517 vma->vm_flags |= VM_INSERTPAGE;
1518 return insert_page(vma, addr, page, vma->vm_page_prot);
1519}
1520EXPORT_SYMBOL(vm_insert_page);
1521
1522static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1523 unsigned long pfn, pgprot_t prot)
1524{
1525 struct mm_struct *mm = vma->vm_mm;
1526 int retval;
1527 pte_t *pte, entry;
1528 spinlock_t *ptl;
1529
1530 retval = -ENOMEM;
1531 pte = get_locked_pte(mm, addr, &ptl);
1532 if (!pte)
1533 goto out;
1534 retval = -EBUSY;
1535 if (!pte_none(*pte))
1536 goto out_unlock;
1537
1538
1539 entry = pte_mkspecial(pfn_pte(pfn, prot));
1540 set_pte_at(mm, addr, pte, entry);
1541 update_mmu_cache(vma, addr, entry);
1542
1543 retval = 0;
1544out_unlock:
1545 pte_unmap_unlock(pte, ptl);
1546out:
1547 return retval;
1548}
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1568 unsigned long pfn)
1569{
1570 int ret;
1571 pgprot_t pgprot = vma->vm_page_prot;
1572
1573
1574
1575
1576
1577
1578 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1579 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1580 (VM_PFNMAP|VM_MIXEDMAP));
1581 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1582 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1583
1584 if (addr < vma->vm_start || addr >= vma->vm_end)
1585 return -EFAULT;
1586 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1587 return -EINVAL;
1588
1589 ret = insert_pfn(vma, addr, pfn, pgprot);
1590
1591 if (ret)
1592 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
1593
1594 return ret;
1595}
1596EXPORT_SYMBOL(vm_insert_pfn);
1597
1598int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1599 unsigned long pfn)
1600{
1601 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1602
1603 if (addr < vma->vm_start || addr >= vma->vm_end)
1604 return -EFAULT;
1605
1606
1607
1608
1609
1610
1611
1612 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1613 struct page *page;
1614
1615 page = pfn_to_page(pfn);
1616 return insert_page(vma, addr, page, vma->vm_page_prot);
1617 }
1618 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1619}
1620EXPORT_SYMBOL(vm_insert_mixed);
1621
1622
1623
1624
1625
1626
1627static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1628 unsigned long addr, unsigned long end,
1629 unsigned long pfn, pgprot_t prot)
1630{
1631 pte_t *pte;
1632 spinlock_t *ptl;
1633
1634 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1635 if (!pte)
1636 return -ENOMEM;
1637 arch_enter_lazy_mmu_mode();
1638 do {
1639 BUG_ON(!pte_none(*pte));
1640 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1641 pfn++;
1642 } while (pte++, addr += PAGE_SIZE, addr != end);
1643 arch_leave_lazy_mmu_mode();
1644 pte_unmap_unlock(pte - 1, ptl);
1645 return 0;
1646}
1647
1648static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1649 unsigned long addr, unsigned long end,
1650 unsigned long pfn, pgprot_t prot)
1651{
1652 pmd_t *pmd;
1653 unsigned long next;
1654
1655 pfn -= addr >> PAGE_SHIFT;
1656 pmd = pmd_alloc(mm, pud, addr);
1657 if (!pmd)
1658 return -ENOMEM;
1659 do {
1660 next = pmd_addr_end(addr, end);
1661 if (remap_pte_range(mm, pmd, addr, next,
1662 pfn + (addr >> PAGE_SHIFT), prot))
1663 return -ENOMEM;
1664 } while (pmd++, addr = next, addr != end);
1665 return 0;
1666}
1667
1668static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1669 unsigned long addr, unsigned long end,
1670 unsigned long pfn, pgprot_t prot)
1671{
1672 pud_t *pud;
1673 unsigned long next;
1674
1675 pfn -= addr >> PAGE_SHIFT;
1676 pud = pud_alloc(mm, pgd, addr);
1677 if (!pud)
1678 return -ENOMEM;
1679 do {
1680 next = pud_addr_end(addr, end);
1681 if (remap_pmd_range(mm, pud, addr, next,
1682 pfn + (addr >> PAGE_SHIFT), prot))
1683 return -ENOMEM;
1684 } while (pud++, addr = next, addr != end);
1685 return 0;
1686}
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1699 unsigned long pfn, unsigned long size, pgprot_t prot)
1700{
1701 pgd_t *pgd;
1702 unsigned long next;
1703 unsigned long end = addr + PAGE_ALIGN(size);
1704 struct mm_struct *mm = vma->vm_mm;
1705 int err;
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725 if (addr == vma->vm_start && end == vma->vm_end) {
1726 vma->vm_pgoff = pfn;
1727 vma->vm_flags |= VM_PFN_AT_MMAP;
1728 } else if (is_cow_mapping(vma->vm_flags))
1729 return -EINVAL;
1730
1731 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1732
1733 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
1734 if (err) {
1735
1736
1737
1738
1739 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1740 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1741 return -EINVAL;
1742 }
1743
1744 BUG_ON(addr >= end);
1745 pfn -= addr >> PAGE_SHIFT;
1746 pgd = pgd_offset(mm, addr);
1747 flush_cache_range(vma, addr, end);
1748 do {
1749 next = pgd_addr_end(addr, end);
1750 err = remap_pud_range(mm, pgd, addr, next,
1751 pfn + (addr >> PAGE_SHIFT), prot);
1752 if (err)
1753 break;
1754 } while (pgd++, addr = next, addr != end);
1755
1756 if (err)
1757 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
1758
1759 return err;
1760}
1761EXPORT_SYMBOL(remap_pfn_range);
1762
1763static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1764 unsigned long addr, unsigned long end,
1765 pte_fn_t fn, void *data)
1766{
1767 pte_t *pte;
1768 int err;
1769 pgtable_t token;
1770 spinlock_t *uninitialized_var(ptl);
1771
1772 pte = (mm == &init_mm) ?
1773 pte_alloc_kernel(pmd, addr) :
1774 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1775 if (!pte)
1776 return -ENOMEM;
1777
1778 BUG_ON(pmd_huge(*pmd));
1779
1780 arch_enter_lazy_mmu_mode();
1781
1782 token = pmd_pgtable(*pmd);
1783
1784 do {
1785 err = fn(pte, token, addr, data);
1786 if (err)
1787 break;
1788 } while (pte++, addr += PAGE_SIZE, addr != end);
1789
1790 arch_leave_lazy_mmu_mode();
1791
1792 if (mm != &init_mm)
1793 pte_unmap_unlock(pte-1, ptl);
1794 return err;
1795}
1796
1797static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1798 unsigned long addr, unsigned long end,
1799 pte_fn_t fn, void *data)
1800{
1801 pmd_t *pmd;
1802 unsigned long next;
1803 int err;
1804
1805 BUG_ON(pud_huge(*pud));
1806
1807 pmd = pmd_alloc(mm, pud, addr);
1808 if (!pmd)
1809 return -ENOMEM;
1810 do {
1811 next = pmd_addr_end(addr, end);
1812 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1813 if (err)
1814 break;
1815 } while (pmd++, addr = next, addr != end);
1816 return err;
1817}
1818
1819static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1820 unsigned long addr, unsigned long end,
1821 pte_fn_t fn, void *data)
1822{
1823 pud_t *pud;
1824 unsigned long next;
1825 int err;
1826
1827 pud = pud_alloc(mm, pgd, addr);
1828 if (!pud)
1829 return -ENOMEM;
1830 do {
1831 next = pud_addr_end(addr, end);
1832 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1833 if (err)
1834 break;
1835 } while (pud++, addr = next, addr != end);
1836 return err;
1837}
1838
1839
1840
1841
1842
1843int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1844 unsigned long size, pte_fn_t fn, void *data)
1845{
1846 pgd_t *pgd;
1847 unsigned long next;
1848 unsigned long start = addr, end = addr + size;
1849 int err;
1850
1851 BUG_ON(addr >= end);
1852 mmu_notifier_invalidate_range_start(mm, start, end);
1853 pgd = pgd_offset(mm, addr);
1854 do {
1855 next = pgd_addr_end(addr, end);
1856 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1857 if (err)
1858 break;
1859 } while (pgd++, addr = next, addr != end);
1860 mmu_notifier_invalidate_range_end(mm, start, end);
1861 return err;
1862}
1863EXPORT_SYMBOL_GPL(apply_to_page_range);
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1875 pte_t *page_table, pte_t orig_pte)
1876{
1877 int same = 1;
1878#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1879 if (sizeof(pte_t) > sizeof(unsigned long)) {
1880 spinlock_t *ptl = pte_lockptr(mm, pmd);
1881 spin_lock(ptl);
1882 same = pte_same(*page_table, orig_pte);
1883 spin_unlock(ptl);
1884 }
1885#endif
1886 pte_unmap(page_table);
1887 return same;
1888}
1889
1890
1891
1892
1893
1894
1895
1896static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1897{
1898 if (likely(vma->vm_flags & VM_WRITE))
1899 pte = pte_mkwrite(pte);
1900 return pte;
1901}
1902
1903static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1904{
1905
1906
1907
1908
1909
1910
1911 if (unlikely(!src)) {
1912 void *kaddr = kmap_atomic(dst, KM_USER0);
1913 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1914
1915
1916
1917
1918
1919
1920
1921 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1922 memset(kaddr, 0, PAGE_SIZE);
1923 kunmap_atomic(kaddr, KM_USER0);
1924 flush_dcache_page(dst);
1925 } else
1926 copy_user_highpage(dst, src, va, vma);
1927}
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
/*
 * Handle a write fault on a present but write-protected pte.
 *
 * Entered with @page_table mapped and @ptl held; every return path has
 * released both via pte_unmap_unlock() (possibly after dropping and
 * re-taking them in between).
 *
 * Depending on the page behind @orig_pte the fault is resolved either by
 * making the existing pte writable ("reuse") or by allocating a new
 * anonymous page, copying the old contents into it and installing it in
 * place of the old one ("gotten").
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE is set once a dirty pte has
 * been installed (or re-installed), VM_FAULT_OOM on allocation/charge
 * failure, or whatever error/NOPAGE bits ->page_mkwrite() reported.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int reuse = 0, ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No "normal" struct page behind this pte.  A shared and
		 * writable vma simply gets its pte made writable; anything
		 * else falls through to the copy path with old_page == NULL
		 * (cow_user_page() then copies through the user address).
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous page: reuse_swap_page() decides, under the page lock,
	 * whether the page can be written in place instead of copied.
	 */
	if (PageAnon(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Could not get the page lock without sleeping:
			 * pin the page, drop the pte lock, sleep for the
			 * page lock, then re-take the pte lock and make
			 * sure the pte did not change meanwhile.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		reuse = reuse_swap_page(old_page);
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Non-anonymous page in a shared writable vma: the page is
		 * always written in place, but the vma's owner is given the
		 * chance to veto or prepare via ->page_mkwrite() first.
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * ->page_mkwrite() is called without the pte lock,
			 * so pin the page across the call and re-validate
			 * the pte afterwards.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* Handler did not lock the page: do it here. */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* Page lost its mapping: return 0. */
					ret = 0;
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * Re-take the pte lock and check that the pte is
			 * still the one we faulted on.  The page stays
			 * locked and pinned until the "unlock" tail below.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);
		reuse = 1;
	}

	if (reuse) {
reuse:
		/* Write in place: mark the pte young, dirty and writable. */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
		goto unlock;
	}

	/*
	 * Copy path.  Pin the old page so it stays around while the pte
	 * lock is dropped for the allocation and the copy.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	VM_BUG_ON(old_page == ZERO_PAGE(0));
	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!new_page)
		goto oom;
	/*
	 * In a VM_LOCKED vma the old page's mlock state is cleared, under
	 * the page lock, before it is replaced.
	 */
	if ((vma->vm_flags & VM_LOCKED) && old_page) {
		lock_page(old_page);
		clear_page_mlock(old_page);
		unlock_page(old_page);
	}
	cow_user_page(new_page, old_page, address, vma);
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/*
	 * Re-take the pte lock and only install the copy if the pte is
	 * still the one we started from.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		/* rss accounting: the slot now holds an anonymous page. */
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter(mm, file_rss);
				inc_mm_counter(mm, anon_rss);
			}
		} else
			inc_mm_counter(mm, anon_rss);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * The old pte is cleared and flushed (with mmu notifier)
		 * before the new one is set, rather than overwritten in
		 * place.
		 */
		ptep_clear_flush_notify(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		set_pte_at(mm, address, page_table, entry);
		update_mmu_cache(vma, address, entry);
		if (old_page) {
			/*
			 * The old page's rmap is only removed after the new
			 * pte has been installed above.
			 */
			page_remove_rmap(old_page);
		}

		/*
		 * new_page is now owned by the page tables.  Point it at
		 * old_page so that the release below drops the reference
		 * the old mapping held on the old page.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	/*
	 * On success: drops the old mapping's reference and our pin on
	 * old_page.  On a lost race: frees the unused copy and our pin.
	 */
	if (new_page)
		page_cache_release(new_page);
	if (old_page)
		page_cache_release(old_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (dirty_page) {
		/*
		 * Shared-writable in-place write: mark the page dirty now
		 * that the pte lock is dropped.  Without ->page_mkwrite()
		 * the page is not locked here, so wait for any holder of
		 * the page lock first.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
			/*
			 * Still locked and pinned from the ->page_mkwrite()
			 * path above; sample ->mapping before unlocking.
			 */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				/* Throttle only after the page is unlocked. */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* Update the file time outside the page lock. */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		/*
		 * With page_mkwrite set the page is additionally locked and
		 * holds one more reference; drop those as well as the pin
		 * taken before "gotten".
		 */
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drop the pin taken before calling ->page_mkwrite(). */
	page_cache_release(old_page);
	return ret;
}
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
/* True when the value is page aligned, i.e. has no bits below PAGE_MASK. */
#define is_restart_addr(truncate_count) \
	(((truncate_count) & ~PAGE_MASK) == 0)
2243
2244static void reset_vma_truncate_counts(struct address_space *mapping)
2245{
2246 struct vm_area_struct *vma;
2247 struct prio_tree_iter iter;
2248
2249 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2250 vma->vm_truncate_count = 0;
2251 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2252 vma->vm_truncate_count = 0;
2253}
2254
2255static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2256 unsigned long start_addr, unsigned long end_addr,
2257 struct zap_details *details)
2258{
2259 unsigned long restart_addr;
2260 int need_break;
2261
2262
2263
2264
2265
2266
2267
2268
2269again:
2270 restart_addr = vma->vm_truncate_count;
2271 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2272 start_addr = restart_addr;
2273 if (start_addr >= end_addr) {
2274
2275 vma->vm_truncate_count = details->truncate_count;
2276 return 0;
2277 }
2278 }
2279
2280 restart_addr = zap_page_range(vma, start_addr,
2281 end_addr - start_addr, details);
2282 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2283
2284 if (restart_addr >= end_addr) {
2285
2286 vma->vm_truncate_count = details->truncate_count;
2287 if (!need_break)
2288 return 0;
2289 } else {
2290
2291 vma->vm_truncate_count = restart_addr;
2292 if (!need_break)
2293 goto again;
2294 }
2295
2296 spin_unlock(details->i_mmap_lock);
2297 cond_resched();
2298 spin_lock(details->i_mmap_lock);
2299 return -EINTR;
2300}
2301
2302static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2303 struct zap_details *details)
2304{
2305 struct vm_area_struct *vma;
2306 struct prio_tree_iter iter;
2307 pgoff_t vba, vea, zba, zea;
2308
2309restart:
2310 vma_prio_tree_foreach(vma, &iter, root,
2311 details->first_index, details->last_index) {
2312
2313 if (vma->vm_truncate_count == details->truncate_count)
2314 continue;
2315
2316 vba = vma->vm_pgoff;
2317 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2318
2319 zba = details->first_index;
2320 if (zba < vba)
2321 zba = vba;
2322 zea = details->last_index;
2323 if (zea > vea)
2324 zea = vea;
2325
2326 if (unmap_mapping_range_vma(vma,
2327 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2328 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2329 details) < 0)
2330 goto restart;
2331 }
2332}
2333
2334static inline void unmap_mapping_range_list(struct list_head *head,
2335 struct zap_details *details)
2336{
2337 struct vm_area_struct *vma;
2338
2339
2340
2341
2342
2343
2344
2345restart:
2346 list_for_each_entry(vma, head, shared.vm_set.list) {
2347
2348 if (vma->vm_truncate_count == details->truncate_count)
2349 continue;
2350 details->nonlinear_vma = vma;
2351 if (unmap_mapping_range_vma(vma, vma->vm_start,
2352 vma->vm_end, details) < 0)
2353 goto restart;
2354 }
2355}
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371void unmap_mapping_range(struct address_space *mapping,
2372 loff_t const holebegin, loff_t const holelen, int even_cows)
2373{
2374 struct zap_details details;
2375 pgoff_t hba = holebegin >> PAGE_SHIFT;
2376 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2377
2378
2379 if (sizeof(holelen) > sizeof(hlen)) {
2380 long long holeend =
2381 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2382 if (holeend & ~(long long)ULONG_MAX)
2383 hlen = ULONG_MAX - hba + 1;
2384 }
2385
2386 details.check_mapping = even_cows? NULL: mapping;
2387 details.nonlinear_vma = NULL;
2388 details.first_index = hba;
2389 details.last_index = hba + hlen - 1;
2390 if (details.last_index < details.first_index)
2391 details.last_index = ULONG_MAX;
2392 details.i_mmap_lock = &mapping->i_mmap_lock;
2393
2394 spin_lock(&mapping->i_mmap_lock);
2395
2396
2397 mapping->truncate_count++;
2398 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2399 if (mapping->truncate_count == 0)
2400 reset_vma_truncate_counts(mapping);
2401 mapping->truncate_count++;
2402 }
2403 details.truncate_count = mapping->truncate_count;
2404
2405 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2406 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2407 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2408 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2409 spin_unlock(&mapping->i_mmap_lock);
2410}
2411EXPORT_SYMBOL(unmap_mapping_range);
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422int vmtruncate(struct inode * inode, loff_t offset)
2423{
2424 if (inode->i_size < offset) {
2425 unsigned long limit;
2426
2427 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2428 if (limit != RLIM_INFINITY && offset > limit)
2429 goto out_sig;
2430 if (offset > inode->i_sb->s_maxbytes)
2431 goto out_big;
2432 i_size_write(inode, offset);
2433 } else {
2434 struct address_space *mapping = inode->i_mapping;
2435
2436
2437
2438
2439
2440
2441 if (IS_SWAPFILE(inode))
2442 return -ETXTBSY;
2443 i_size_write(inode, offset);
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2455 truncate_inode_pages(mapping, offset);
2456 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2457 }
2458
2459 if (inode->i_op->truncate)
2460 inode->i_op->truncate(inode);
2461 return 0;
2462
2463out_sig:
2464 send_sig(SIGXFSZ, current, 0);
2465out_big:
2466 return -EFBIG;
2467}
2468EXPORT_SYMBOL(vmtruncate);
2469
2470int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2471{
2472 struct address_space *mapping = inode->i_mapping;
2473
2474
2475
2476
2477
2478
2479 if (!inode->i_op->truncate_range)
2480 return -ENOSYS;
2481
2482 mutex_lock(&inode->i_mutex);
2483 down_write(&inode->i_alloc_sem);
2484 unmap_mapping_range(mapping, offset, (end - offset), 1);
2485 truncate_inode_pages_range(mapping, offset, end);
2486 unmap_mapping_range(mapping, offset, (end - offset), 1);
2487 inode->i_op->truncate_range(inode, offset, end);
2488 up_write(&inode->i_alloc_sem);
2489 mutex_unlock(&inode->i_mutex);
2490
2491 return 0;
2492}
2493
2494
2495
2496
2497
2498
/*
 * Handle a fault on a non-present pte that encodes a swap entry.
 *
 * Entered with @page_table mapped but the pte lock not held (the pte is
 * unmapped right away by pte_unmap_same()).  The page is looked up in the
 * swap cache or read in, charged, and mapped at @address if the pte still
 * equals @orig_pte by then.
 *
 * Returns a VM_FAULT_* mask: 0 for a minor fault or a lost race,
 * VM_FAULT_MAJOR when the page had to be read in, VM_FAULT_OOM,
 * VM_FAULT_SIGBUS when the page is not uptodate, plus whatever
 * do_wp_page() contributes for a write fault.
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page;
	swp_entry_t entry;
	pte_t pte;
	struct mem_cgroup *ptr = NULL;
	int ret = 0;

	/* pte changed since it was sampled: nothing to do, just return. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry)) {
		/* Page is under migration: wait for it and return 0. */
		migration_entry_wait(mm, pmd, address);
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		/* Not in the swap cache: take the swap token, then read in. */
		grab_swap_token(mm);
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Read-in produced no page.  Only report OOM if the
			 * pte is still the swap entry we started with; if it
			 * changed, return 0.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page in: this counts as a major fault. */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
	}

	lock_page(page);
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Re-take the pte lock and back out if the pte changed while the
	 * page was being looked up, read in or locked.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Map the page.  The order below is: account rss, build and set the
	 * pte, add the rmap, commit the memcg charge, and only then free the
	 * swap entry.
	 * NOTE(review): the memcg commit is presumably placed before
	 * swap_free() on purpose - confirm before reordering.
	 */
	inc_mm_counter(mm, anon_rss);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		/*
		 * Write fault on a page that can be reused: map it writable
		 * straight away and drop the write flag so that do_wp_page()
		 * is not called below.
		 */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	page_add_anon_rmap(page, vma, address);

	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	/* Give up the swap slot early when swap is full or the page is mlocked. */
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);

	if (flags & FAULT_FLAG_WRITE) {
		/*
		 * Still a write fault: let do_wp_page() break COW.  It
		 * releases the pte lock itself, hence "goto out" rather than
		 * "unlock".  On error only the error bits are reported.
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No write fault: just update the MMU cache for the new pte. */
	update_mmu_cache(vma, address, pte);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	/* Page was charged but never mapped: cancel the charge. */
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
	page_cache_release(page);
	return ret;
}
2613
2614
2615
2616
2617
2618
2619static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2620 unsigned long address, pte_t *page_table, pmd_t *pmd,
2621 unsigned int flags)
2622{
2623 struct page *page;
2624 spinlock_t *ptl;
2625 pte_t entry;
2626
2627
2628 pte_unmap(page_table);
2629
2630 if (unlikely(anon_vma_prepare(vma)))
2631 goto oom;
2632 page = alloc_zeroed_user_highpage_movable(vma, address);
2633 if (!page)
2634 goto oom;
2635 __SetPageUptodate(page);
2636
2637 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2638 goto oom_free_page;
2639
2640 entry = mk_pte(page, vma->vm_page_prot);
2641 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2642
2643 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2644 if (!pte_none(*page_table))
2645 goto release;
2646 inc_mm_counter(mm, anon_rss);
2647 page_add_new_anon_rmap(page, vma, address);
2648 set_pte_at(mm, address, page_table, entry);
2649
2650
2651 update_mmu_cache(vma, address, entry);
2652unlock:
2653 pte_unmap_unlock(page_table, ptl);
2654 return 0;
2655release:
2656 mem_cgroup_uncharge_page(page);
2657 page_cache_release(page);
2658 goto unlock;
2659oom_free_page:
2660 page_cache_release(page);
2661oom:
2662 return VM_FAULT_OOM;
2663}
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
/*
 * Common fault path for vmas with a ->fault() handler.
 *
 * Asks vma->vm_ops->fault() for the page at file offset @pgoff and maps
 * it at @address, provided the pte still equals @orig_pte once the pte
 * lock is held.  A write fault on a private (non-VM_SHARED) vma maps a
 * freshly allocated anonymous copy instead of the returned page; a write
 * fault on a shared vma calls ->page_mkwrite() first if one exists.
 *
 * Entered without the pte mapped or locked.
 *
 * Returns the VM_FAULT_* mask from ->fault() (or ->page_mkwrite() on its
 * error/NOPAGE paths), or VM_FAULT_OOM on allocation/charge failure.
 */
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	pte_t *page_table;
	spinlock_t *ptl;
	struct page *page;
	pte_t entry;
	int anon = 0;
	int charged = 0;
	struct page *dirty_page = NULL;
	struct vm_fault vmf;
	int ret;
	int page_mkwrite = 0;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;

	ret = vma->vm_ops->fault(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;

	/*
	 * From here on vmf.page must be locked: take the lock unless the
	 * handler says (VM_FAULT_LOCKED) that it already did.
	 */
	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON(!PageLocked(vmf.page));

	/*
	 * "page" is what ends up in the pte.  It starts out as the page the
	 * handler returned and is replaced by a private copy below for a
	 * write fault on a non-shared vma.
	 */
	page = vmf.page;
	if (flags & FAULT_FLAG_WRITE) {
		if (!(vma->vm_flags & VM_SHARED)) {
			/* Private write: break COW right away. */
			anon = 1;
			if (unlikely(anon_vma_prepare(vma))) {
				ret = VM_FAULT_OOM;
				goto out;
			}
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
						vma, address);
			if (!page) {
				ret = VM_FAULT_OOM;
				goto out;
			}
			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
				ret = VM_FAULT_OOM;
				page_cache_release(page);
				goto out;
			}
			charged = 1;
			/*
			 * In a VM_LOCKED vma the mlock state of the source
			 * page is cleared before it is copied from.
			 */
			if (vma->vm_flags & VM_LOCKED)
				clear_page_mlock(vmf.page);
			copy_user_highpage(page, vmf.page, address, vma);
			__SetPageUptodate(page);
		} else {
			/*
			 * Shared write: the page is about to become
			 * writable, so let ->page_mkwrite() know.  The page
			 * lock is dropped around the call and must be held
			 * again afterwards.
			 */
			if (vma->vm_ops->page_mkwrite) {
				int tmp;

				unlock_page(page);
				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
				if (unlikely(tmp &
					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
					ret = tmp;
					goto unwritable_page;
				}
				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
					lock_page(page);
					if (!page->mapping) {
						/* Page lost its mapping: return 0. */
						ret = 0;
						unlock_page(page);
						goto unwritable_page;
					}
				} else
					VM_BUG_ON(!PageLocked(page));
				page_mkwrite = 1;
			}
		}

	}

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

	/*
	 * Only install the page if the pte is still what it was when the
	 * fault was taken.  Otherwise the page is dropped again below and 0
	 * (or the handler's flags) is returned without mapping anything.
	 */
	if (likely(pte_same(*page_table, orig_pte))) {
		flush_icache_page(vma, page);
		entry = mk_pte(page, vma->vm_page_prot);
		if (flags & FAULT_FLAG_WRITE)
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (anon) {
			inc_mm_counter(mm, anon_rss);
			page_add_new_anon_rmap(page, vma, address);
		} else {
			inc_mm_counter(mm, file_rss);
			page_add_file_rmap(page);
			if (flags & FAULT_FLAG_WRITE) {
				/* Pin for the dirtying done after unlock. */
				dirty_page = page;
				get_page(dirty_page);
			}
		}
		set_pte_at(mm, address, page_table, entry);

		/* The new pte is in place: update the MMU cache for it. */
		update_mmu_cache(vma, address, entry);
	} else {
		if (charged)
			mem_cgroup_uncharge_page(page);
		if (anon)
			page_cache_release(page);
		else
			/*
			 * Not an anonymous copy, but setting anon makes the
			 * "out" path below release the reference on vmf.page
			 * that was never mapped.
			 */
			anon = 1;
	}

	pte_unmap_unlock(page_table, ptl);

out:
	if (dirty_page) {
		/* Sample ->mapping while the page is still locked. */
		struct address_space *mapping = page->mapping;

		if (set_page_dirty(dirty_page))
			page_mkwrite = 1;
		unlock_page(dirty_page);
		put_page(dirty_page);
		if (page_mkwrite && mapping) {
			/* Throttle only after the page is unlocked. */
			balance_dirty_pages_ratelimited(mapping);
		}

		/* Update the file time outside the page lock. */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	} else {
		unlock_page(vmf.page);
		if (anon)
			page_cache_release(vmf.page);
	}

	return ret;

unwritable_page:
	/* Drop the reference on the page returned by ->fault(). */
	page_cache_release(page);
	return ret;
}
2849
2850static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2851 unsigned long address, pte_t *page_table, pmd_t *pmd,
2852 unsigned int flags, pte_t orig_pte)
2853{
2854 pgoff_t pgoff = (((address & PAGE_MASK)
2855 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2856
2857 pte_unmap(page_table);
2858 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2859}
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2871 unsigned long address, pte_t *page_table, pmd_t *pmd,
2872 unsigned int flags, pte_t orig_pte)
2873{
2874 pgoff_t pgoff;
2875
2876 flags |= FAULT_FLAG_NONLINEAR;
2877
2878 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2879 return 0;
2880
2881 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2882
2883
2884
2885 print_bad_pte(vma, address, orig_pte, NULL);
2886 return VM_FAULT_OOM;
2887 }
2888
2889 pgoff = pte_to_pgoff(orig_pte);
2890 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2891}
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906static inline int handle_pte_fault(struct mm_struct *mm,
2907 struct vm_area_struct *vma, unsigned long address,
2908 pte_t *pte, pmd_t *pmd, unsigned int flags)
2909{
2910 pte_t entry;
2911 spinlock_t *ptl;
2912
2913 entry = *pte;
2914 if (!pte_present(entry)) {
2915 if (pte_none(entry)) {
2916 if (vma->vm_ops) {
2917 if (likely(vma->vm_ops->fault))
2918 return do_linear_fault(mm, vma, address,
2919 pte, pmd, flags, entry);
2920 }
2921 return do_anonymous_page(mm, vma, address,
2922 pte, pmd, flags);
2923 }
2924 if (pte_file(entry))
2925 return do_nonlinear_fault(mm, vma, address,
2926 pte, pmd, flags, entry);
2927 return do_swap_page(mm, vma, address,
2928 pte, pmd, flags, entry);
2929 }
2930
2931 ptl = pte_lockptr(mm, pmd);
2932 spin_lock(ptl);
2933 if (unlikely(!pte_same(*pte, entry)))
2934 goto unlock;
2935 if (flags & FAULT_FLAG_WRITE) {
2936 if (!pte_write(entry))
2937 return do_wp_page(mm, vma, address,
2938 pte, pmd, ptl, entry);
2939 entry = pte_mkdirty(entry);
2940 }
2941 entry = pte_mkyoung(entry);
2942 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2943 update_mmu_cache(vma, address, entry);
2944 } else {
2945
2946
2947
2948
2949
2950
2951 if (flags & FAULT_FLAG_WRITE)
2952 flush_tlb_page(vma, address);
2953 }
2954unlock:
2955 pte_unmap_unlock(pte, ptl);
2956 return 0;
2957}
2958
2959
2960
2961
2962int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2963 unsigned long address, unsigned int flags)
2964{
2965 pgd_t *pgd;
2966 pud_t *pud;
2967 pmd_t *pmd;
2968 pte_t *pte;
2969
2970 __set_current_state(TASK_RUNNING);
2971
2972 count_vm_event(PGFAULT);
2973
2974 if (unlikely(is_vm_hugetlb_page(vma)))
2975 return hugetlb_fault(mm, vma, address, flags);
2976
2977 pgd = pgd_offset(mm, address);
2978 pud = pud_alloc(mm, pgd, address);
2979 if (!pud)
2980 return VM_FAULT_OOM;
2981 pmd = pmd_alloc(mm, pud, address);
2982 if (!pmd)
2983 return VM_FAULT_OOM;
2984 pte = pte_alloc_map(mm, pmd, address);
2985 if (!pte)
2986 return VM_FAULT_OOM;
2987
2988 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
2989}
2990
2991#ifndef __PAGETABLE_PUD_FOLDED
2992
2993
2994
2995
2996int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2997{
2998 pud_t *new = pud_alloc_one(mm, address);
2999 if (!new)
3000 return -ENOMEM;
3001
3002 smp_wmb();
3003
3004 spin_lock(&mm->page_table_lock);
3005 if (pgd_present(*pgd))
3006 pud_free(mm, new);
3007 else
3008 pgd_populate(mm, pgd, new);
3009 spin_unlock(&mm->page_table_lock);
3010 return 0;
3011}
3012#endif
3013
3014#ifndef __PAGETABLE_PMD_FOLDED
3015
3016
3017
3018
3019int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3020{
3021 pmd_t *new = pmd_alloc_one(mm, address);
3022 if (!new)
3023 return -ENOMEM;
3024
3025 smp_wmb();
3026
3027 spin_lock(&mm->page_table_lock);
3028#ifndef __ARCH_HAS_4LEVEL_HACK
3029 if (pud_present(*pud))
3030 pmd_free(mm, new);
3031 else
3032 pud_populate(mm, pud, new);
3033#else
3034 if (pgd_present(*pud))
3035 pmd_free(mm, new);
3036 else
3037 pgd_populate(mm, pud, new);
3038#endif
3039 spin_unlock(&mm->page_table_lock);
3040 return 0;
3041}
3042#endif
3043
3044int make_pages_present(unsigned long addr, unsigned long end)
3045{
3046 int ret, len, write;
3047 struct vm_area_struct * vma;
3048
3049 vma = find_vma(current->mm, addr);
3050 if (!vma)
3051 return -ENOMEM;
3052 write = (vma->vm_flags & VM_WRITE) != 0;
3053 BUG_ON(addr >= end);
3054 BUG_ON(end > vma->vm_end);
3055 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3056 ret = get_user_pages(current, current->mm, addr,
3057 len, write, 0, NULL, NULL);
3058 if (ret < 0)
3059 return ret;
3060 return ret == len ? 0 : -EFAULT;
3061}
3062
3063#if !defined(__HAVE_ARCH_GATE_AREA)
3064
#if defined(AT_SYSINFO_EHDR)
/* The single, statically allocated vma describing the gate area. */
static struct vm_area_struct gate_vma;

/*
 * Fill in gate_vma at boot: it belongs to no mm, spans
 * [FIXADDR_USER_START, FIXADDR_USER_END), is readable and executable, and
 * carries VM_ALWAYSDUMP.
 */
static int __init gate_vma_init(void)
{
	gate_vma.vm_mm = NULL;
	gate_vma.vm_start = FIXADDR_USER_START;
	gate_vma.vm_end = FIXADDR_USER_END;
	gate_vma.vm_page_prot = __P101;
	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC |
			    VM_ALWAYSDUMP;
	return 0;
}
__initcall(gate_vma_init);
#endif
3086
/*
 * Return the gate vma, or NULL when AT_SYSINFO_EHDR is not defined and
 * therefore no gate vma exists.  @tsk is not consulted.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
	struct vm_area_struct *gate = NULL;

#ifdef AT_SYSINFO_EHDR
	gate = &gate_vma;
#endif
	return gate;
}
3095
/*
 * Return 1 when @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END) and
 * AT_SYSINFO_EHDR is defined for this build; return 0 otherwise.
 */
int in_gate_area_no_task(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
	return addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END;
#else
	return 0;
#endif
}
3104
3105#endif
3106
3107static int follow_pte(struct mm_struct *mm, unsigned long address,
3108 pte_t **ptepp, spinlock_t **ptlp)
3109{
3110 pgd_t *pgd;
3111 pud_t *pud;
3112 pmd_t *pmd;
3113 pte_t *ptep;
3114
3115 pgd = pgd_offset(mm, address);
3116 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3117 goto out;
3118
3119 pud = pud_offset(pgd, address);
3120 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3121 goto out;
3122
3123 pmd = pmd_offset(pud, address);
3124 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3125 goto out;
3126
3127
3128 if (pmd_huge(*pmd))
3129 goto out;
3130
3131 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3132 if (!ptep)
3133 goto out;
3134 if (!pte_present(*ptep))
3135 goto unlock;
3136 *ptepp = ptep;
3137 return 0;
3138unlock:
3139 pte_unmap_unlock(ptep, *ptlp);
3140out:
3141 return -EINVAL;
3142}
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3155 unsigned long *pfn)
3156{
3157 int ret = -EINVAL;
3158 spinlock_t *ptl;
3159 pte_t *ptep;
3160
3161 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3162 return ret;
3163
3164 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3165 if (ret)
3166 return ret;
3167 *pfn = pte_pfn(*ptep);
3168 pte_unmap_unlock(ptep, ptl);
3169 return 0;
3170}
3171EXPORT_SYMBOL(follow_pfn);
3172
3173#ifdef CONFIG_HAVE_IOREMAP_PROT
3174int follow_phys(struct vm_area_struct *vma,
3175 unsigned long address, unsigned int flags,
3176 unsigned long *prot, resource_size_t *phys)
3177{
3178 int ret = -EINVAL;
3179 pte_t *ptep, pte;
3180 spinlock_t *ptl;
3181
3182 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3183 goto out;
3184
3185 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3186 goto out;
3187 pte = *ptep;
3188
3189 if ((flags & FOLL_WRITE) && !pte_write(pte))
3190 goto unlock;
3191
3192 *prot = pgprot_val(pte_pgprot(pte));
3193 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3194
3195 ret = 0;
3196unlock:
3197 pte_unmap_unlock(ptep, ptl);
3198out:
3199 return ret;
3200}
3201
3202int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3203 void *buf, int len, int write)
3204{
3205 resource_size_t phys_addr;
3206 unsigned long prot = 0;
3207 void __iomem *maddr;
3208 int offset = addr & (PAGE_SIZE-1);
3209
3210 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3211 return -EINVAL;
3212
3213 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3214 if (write)
3215 memcpy_toio(maddr + offset, buf, len);
3216 else
3217 memcpy_fromio(buf, maddr + offset, len);
3218 iounmap(maddr);
3219
3220 return len;
3221}
3222#endif
3223
3224
3225
3226
3227
3228
3229int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
3230{
3231 struct mm_struct *mm;
3232 struct vm_area_struct *vma;
3233 void *old_buf = buf;
3234
3235 mm = get_task_mm(tsk);
3236 if (!mm)
3237 return 0;
3238
3239 down_read(&mm->mmap_sem);
3240
3241 while (len) {
3242 int bytes, ret, offset;
3243 void *maddr;
3244 struct page *page = NULL;
3245
3246 ret = get_user_pages(tsk, mm, addr, 1,
3247 write, 1, &page, &vma);
3248 if (ret <= 0) {
3249
3250
3251
3252
3253#ifdef CONFIG_HAVE_IOREMAP_PROT
3254 vma = find_vma(mm, addr);
3255 if (!vma)
3256 break;
3257 if (vma->vm_ops && vma->vm_ops->access)
3258 ret = vma->vm_ops->access(vma, addr, buf,
3259 len, write);
3260 if (ret <= 0)
3261#endif
3262 break;
3263 bytes = ret;
3264 } else {
3265 bytes = len;
3266 offset = addr & (PAGE_SIZE-1);
3267 if (bytes > PAGE_SIZE-offset)
3268 bytes = PAGE_SIZE-offset;
3269
3270 maddr = kmap(page);
3271 if (write) {
3272 copy_to_user_page(vma, page, addr,
3273 maddr + offset, buf, bytes);
3274 set_page_dirty_lock(page);
3275 } else {
3276 copy_from_user_page(vma, page, addr,
3277 buf, maddr + offset, bytes);
3278 }
3279 kunmap(page);
3280 page_cache_release(page);
3281 }
3282 len -= bytes;
3283 buf += bytes;
3284 addr += bytes;
3285 }
3286 up_read(&mm->mmap_sem);
3287 mmput(mm);
3288
3289 return buf - old_buf;
3290}
3291
3292
3293
3294
3295void print_vma_addr(char *prefix, unsigned long ip)
3296{
3297 struct mm_struct *mm = current->mm;
3298 struct vm_area_struct *vma;
3299
3300
3301
3302
3303
3304 if (preempt_count())
3305 return;
3306
3307 down_read(&mm->mmap_sem);
3308 vma = find_vma(mm, ip);
3309 if (vma && vma->vm_file) {
3310 struct file *f = vma->vm_file;
3311 char *buf = (char *)__get_free_page(GFP_KERNEL);
3312 if (buf) {
3313 char *p, *s;
3314
3315 p = d_path(&f->f_path, buf, PAGE_SIZE);
3316 if (IS_ERR(p))
3317 p = "?";
3318 s = strrchr(p, '/');
3319 if (s)
3320 p = s+1;
3321 printk("%s%s[%lx+%lx]", prefix, p,
3322 vma->vm_start,
3323 vma->vm_end - vma->vm_start);
3324 free_page((unsigned long)buf);
3325 }
3326 }
3327 up_read(¤t->mm->mmap_sem);
3328}
3329
3330#ifdef CONFIG_PROVE_LOCKING
3331void might_fault(void)
3332{
3333
3334
3335
3336
3337
3338
3339 if (segment_eq(get_fs(), KERNEL_DS))
3340 return;
3341
3342 might_sleep();
3343
3344
3345
3346
3347
3348 if (!in_atomic() && current->mm)
3349 might_lock_read(¤t->mm->mmap_sem);
3350}
3351EXPORT_SYMBOL(might_fault);
3352#endif
3353