1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * Globals that are only defined when CONFIG_NEED_MULTIPLE_NODES is not set.
 * Each one is exported so that modules can reference it.
 */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
78
/*
 * Page-count global, exported to modules.
 * NOTE(review): presumably the number of physical pages; it is assigned
 * outside this file section - confirm against the setup code.
 */
unsigned long num_physpages;
EXPORT_SYMBOL(num_physpages);

/*
 * Address global, exported to modules.
 * NOTE(review): presumably the upper bound of directly mapped memory;
 * it is assigned outside this file section - confirm.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);
91
92
93
94
95
96
97
98int randomize_va_space __read_mostly =
99#ifdef CONFIG_COMPAT_BRK
100 1;
101#else
102 2;
103#endif
104
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
112unsigned long zero_pfn __read_mostly;
113unsigned long highest_memmap_pfn __read_mostly;
114
115
116
117
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167
168
169
170
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172
173
174
175
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
/*
 * Public entry point that flushes the RSS deltas cached in @task into @mm.
 * Thin wrapper around __sync_task_rss_stat().
 */
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
	__sync_task_rss_stat(task, mm);
}
185#else
186
/*
 * Without SPLIT_RSS_COUNTING there is no task-local cache: the "fast"
 * helpers map directly onto the plain counter helpers.
 */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

/* Nothing to synchronise in this configuration; intentionally empty. */
static void check_sync_rss_stat(struct task_struct *task)
{
}
193
194#endif
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}
221
222
223
224
225
226
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
254#endif
255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266
267
268
269
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
285
286
287
288
289
290
291
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 VM_BUG_ON(!tlb->need_flush);
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1;
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 batch = tlb->active;
309 }
310 VM_BUG_ON(batch->nr > batch->max);
311
312 return batch->max - batch->nr;
313}
314
315#endif
316
317#ifdef CONFIG_HAVE_RCU_TABLE_FREE
318
319
320
321
322
/*
 * Callback passed to smp_call_function() by tlb_remove_table_one().
 * Intentionally empty: only the cross-CPU call itself matters.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
}
327
328static void tlb_remove_table_one(void *table)
329{
330
331
332
333
334
335
336
337 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
338 __tlb_remove_table(table);
339}
340
341static void tlb_remove_table_rcu(struct rcu_head *head)
342{
343 struct mmu_table_batch *batch;
344 int i;
345
346 batch = container_of(head, struct mmu_table_batch, rcu);
347
348 for (i = 0; i < batch->nr; i++)
349 __tlb_remove_table(batch->tables[i]);
350
351 free_page((unsigned long)batch);
352}
353
354void tlb_table_flush(struct mmu_gather *tlb)
355{
356 struct mmu_table_batch **batch = &tlb->batch;
357
358 if (*batch) {
359 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
360 *batch = NULL;
361 }
362}
363
364void tlb_remove_table(struct mmu_gather *tlb, void *table)
365{
366 struct mmu_table_batch **batch = &tlb->batch;
367
368 tlb->need_flush = 1;
369
370
371
372
373
374 if (atomic_read(&tlb->mm->mm_users) < 2) {
375 __tlb_remove_table(table);
376 return;
377 }
378
379 if (*batch == NULL) {
380 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
381 if (*batch == NULL) {
382 tlb_remove_table_one(table);
383 return;
384 }
385 (*batch)->nr = 0;
386 }
387 (*batch)->tables[(*batch)->nr++] = table;
388 if ((*batch)->nr == MAX_TABLE_BATCH)
389 tlb_table_flush(tlb);
390}
391
392#endif
393
394
395
396
397
398
399
400void pgd_clear_bad(pgd_t *pgd)
401{
402 pgd_ERROR(*pgd);
403 pgd_clear(pgd);
404}
405
406void pud_clear_bad(pud_t *pud)
407{
408 pud_ERROR(*pud);
409 pud_clear(pud);
410}
411
412void pmd_clear_bad(pmd_t *pmd)
413{
414 pmd_ERROR(*pmd);
415 pmd_clear(pmd);
416}
417
418
419
420
421
422static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
423 unsigned long addr)
424{
425 pgtable_t token = pmd_pgtable(*pmd);
426 pmd_clear(pmd);
427 pte_free_tlb(tlb, token, addr);
428 tlb->mm->nr_ptes--;
429}
430
431static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
432 unsigned long addr, unsigned long end,
433 unsigned long floor, unsigned long ceiling)
434{
435 pmd_t *pmd;
436 unsigned long next;
437 unsigned long start;
438
439 start = addr;
440 pmd = pmd_offset(pud, addr);
441 do {
442 next = pmd_addr_end(addr, end);
443 if (pmd_none_or_clear_bad(pmd))
444 continue;
445 free_pte_range(tlb, pmd, addr);
446 } while (pmd++, addr = next, addr != end);
447
448 start &= PUD_MASK;
449 if (start < floor)
450 return;
451 if (ceiling) {
452 ceiling &= PUD_MASK;
453 if (!ceiling)
454 return;
455 }
456 if (end - 1 > ceiling - 1)
457 return;
458
459 pmd = pmd_offset(pud, start);
460 pud_clear(pud);
461 pmd_free_tlb(tlb, pmd, start);
462}
463
464static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
465 unsigned long addr, unsigned long end,
466 unsigned long floor, unsigned long ceiling)
467{
468 pud_t *pud;
469 unsigned long next;
470 unsigned long start;
471
472 start = addr;
473 pud = pud_offset(pgd, addr);
474 do {
475 next = pud_addr_end(addr, end);
476 if (pud_none_or_clear_bad(pud))
477 continue;
478 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
479 } while (pud++, addr = next, addr != end);
480
481 start &= PGDIR_MASK;
482 if (start < floor)
483 return;
484 if (ceiling) {
485 ceiling &= PGDIR_MASK;
486 if (!ceiling)
487 return;
488 }
489 if (end - 1 > ceiling - 1)
490 return;
491
492 pud = pud_offset(pgd, start);
493 pgd_clear(pgd);
494 pud_free_tlb(tlb, pud, start);
495}
496
497
498
499
500
501
502void free_pgd_range(struct mmu_gather *tlb,
503 unsigned long addr, unsigned long end,
504 unsigned long floor, unsigned long ceiling)
505{
506 pgd_t *pgd;
507 unsigned long next;
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535 addr &= PMD_MASK;
536 if (addr < floor) {
537 addr += PMD_SIZE;
538 if (!addr)
539 return;
540 }
541 if (ceiling) {
542 ceiling &= PMD_MASK;
543 if (!ceiling)
544 return;
545 }
546 if (end - 1 > ceiling - 1)
547 end -= PMD_SIZE;
548 if (addr > end - 1)
549 return;
550
551 pgd = pgd_offset(tlb->mm, addr);
552 do {
553 next = pgd_addr_end(addr, end);
554 if (pgd_none_or_clear_bad(pgd))
555 continue;
556 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
557 } while (pgd++, addr = next, addr != end);
558}
559
560void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
561 unsigned long floor, unsigned long ceiling)
562{
563 while (vma) {
564 struct vm_area_struct *next = vma->vm_next;
565 unsigned long addr = vma->vm_start;
566
567
568
569
570
571 unlink_anon_vmas(vma);
572 unlink_file_vma(vma);
573
574 if (is_vm_hugetlb_page(vma)) {
575 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
576 floor, next? next->vm_start: ceiling);
577 } else {
578
579
580
581 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
582 && !is_vm_hugetlb_page(next)) {
583 vma = next;
584 next = vma->vm_next;
585 unlink_anon_vmas(vma);
586 unlink_file_vma(vma);
587 }
588 free_pgd_range(tlb, addr, vma->vm_end,
589 floor, next? next->vm_start: ceiling);
590 }
591 vma = next;
592 }
593}
594
595int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
596 pmd_t *pmd, unsigned long address)
597{
598 pgtable_t new = pte_alloc_one(mm, address);
599 int wait_split_huge_page;
600 if (!new)
601 return -ENOMEM;
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616 smp_wmb();
617
618 spin_lock(&mm->page_table_lock);
619 wait_split_huge_page = 0;
620 if (likely(pmd_none(*pmd))) {
621 mm->nr_ptes++;
622 pmd_populate(mm, pmd, new);
623 new = NULL;
624 } else if (unlikely(pmd_trans_splitting(*pmd)))
625 wait_split_huge_page = 1;
626 spin_unlock(&mm->page_table_lock);
627 if (new)
628 pte_free(mm, new);
629 if (wait_split_huge_page)
630 wait_split_huge_page(vma->anon_vma, pmd);
631 return 0;
632}
633
634int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
635{
636 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
637 if (!new)
638 return -ENOMEM;
639
640 smp_wmb();
641
642 spin_lock(&init_mm.page_table_lock);
643 if (likely(pmd_none(*pmd))) {
644 pmd_populate_kernel(&init_mm, pmd, new);
645 new = NULL;
646 } else
647 VM_BUG_ON(pmd_trans_splitting(*pmd));
648 spin_unlock(&init_mm.page_table_lock);
649 if (new)
650 pte_free_kernel(&init_mm, new);
651 return 0;
652}
653
654static inline void init_rss_vec(int *rss)
655{
656 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
657}
658
659static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
660{
661 int i;
662
663 if (current->mm == mm)
664 sync_mm_rss(current, mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i])
667 add_mm_counter(mm, i, rss[i]);
668}
669
670
671
672
673
674
675
676
677static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
678 pte_t pte, struct page *page)
679{
680 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
681 pud_t *pud = pud_offset(pgd, addr);
682 pmd_t *pmd = pmd_offset(pud, addr);
683 struct address_space *mapping;
684 pgoff_t index;
685 static unsigned long resume;
686 static unsigned long nr_shown;
687 static unsigned long nr_unshown;
688
689
690
691
692
693 if (nr_shown == 60) {
694 if (time_before(jiffies, resume)) {
695 nr_unshown++;
696 return;
697 }
698 if (nr_unshown) {
699 printk(KERN_ALERT
700 "BUG: Bad page map: %lu messages suppressed\n",
701 nr_unshown);
702 nr_unshown = 0;
703 }
704 nr_shown = 0;
705 }
706 if (nr_shown++ == 0)
707 resume = jiffies + 60 * HZ;
708
709 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
710 index = linear_page_index(vma, addr);
711
712 printk(KERN_ALERT
713 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
714 current->comm,
715 (long long)pte_val(pte), (long long)pmd_val(*pmd));
716 if (page)
717 dump_page(page);
718 printk(KERN_ALERT
719 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
720 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
721
722
723
724 if (vma->vm_ops)
725 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
726 (unsigned long)vma->vm_ops->fault);
727 if (vma->vm_file && vma->vm_file->f_op)
728 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
729 (unsigned long)vma->vm_file->f_op->mmap);
730 dump_stack();
731 add_taint(TAINT_BAD_PAGE);
732}
733
734static inline int is_cow_mapping(vm_flags_t flags)
735{
736 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
737}
738
739#ifndef is_zero_pfn
740static inline int is_zero_pfn(unsigned long pfn)
741{
742 return pfn == zero_pfn;
743}
744#endif
745
746#ifndef my_zero_pfn
747static inline unsigned long my_zero_pfn(unsigned long addr)
748{
749 return zero_pfn;
750}
751#endif
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795#ifdef __HAVE_ARCH_PTE_SPECIAL
796# define HAVE_PTE_SPECIAL 1
797#else
798# define HAVE_PTE_SPECIAL 0
799#endif
800struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
801 pte_t pte)
802{
803 unsigned long pfn = pte_pfn(pte);
804
805 if (HAVE_PTE_SPECIAL) {
806 if (likely(!pte_special(pte)))
807 goto check_pfn;
808 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
809 return NULL;
810 if (!is_zero_pfn(pfn))
811 print_bad_pte(vma, addr, pte, NULL);
812 return NULL;
813 }
814
815
816
817 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
818 if (vma->vm_flags & VM_MIXEDMAP) {
819 if (!pfn_valid(pfn))
820 return NULL;
821 goto out;
822 } else {
823 unsigned long off;
824 off = (addr - vma->vm_start) >> PAGE_SHIFT;
825 if (pfn == vma->vm_pgoff + off)
826 return NULL;
827 if (!is_cow_mapping(vma->vm_flags))
828 return NULL;
829 }
830 }
831
832 if (is_zero_pfn(pfn))
833 return NULL;
834check_pfn:
835 if (unlikely(pfn > highest_memmap_pfn)) {
836 print_bad_pte(vma, addr, pte, NULL);
837 return NULL;
838 }
839
840
841
842
843
844out:
845 return pfn_to_page(pfn);
846}
847
848
849
850
851
852
853
854static inline unsigned long
855copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
856 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
857 unsigned long addr, int *rss)
858{
859 unsigned long vm_flags = vma->vm_flags;
860 pte_t pte = *src_pte;
861 struct page *page;
862
863
864 if (unlikely(!pte_present(pte))) {
865 if (!pte_file(pte)) {
866 swp_entry_t entry = pte_to_swp_entry(pte);
867
868 if (swap_duplicate(entry) < 0)
869 return entry.val;
870
871
872 if (unlikely(list_empty(&dst_mm->mmlist))) {
873 spin_lock(&mmlist_lock);
874 if (list_empty(&dst_mm->mmlist))
875 list_add(&dst_mm->mmlist,
876 &src_mm->mmlist);
877 spin_unlock(&mmlist_lock);
878 }
879 if (likely(!non_swap_entry(entry)))
880 rss[MM_SWAPENTS]++;
881 else if (is_migration_entry(entry)) {
882 page = migration_entry_to_page(entry);
883
884 if (PageAnon(page))
885 rss[MM_ANONPAGES]++;
886 else
887 rss[MM_FILEPAGES]++;
888
889 if (is_write_migration_entry(entry) &&
890 is_cow_mapping(vm_flags)) {
891
892
893
894
895 make_migration_entry_read(&entry);
896 pte = swp_entry_to_pte(entry);
897 set_pte_at(src_mm, addr, src_pte, pte);
898 }
899 }
900 }
901 goto out_set_pte;
902 }
903
904
905
906
907
908 if (is_cow_mapping(vm_flags)) {
909 ptep_set_wrprotect(src_mm, addr, src_pte);
910 pte = pte_wrprotect(pte);
911 }
912
913
914
915
916
917 if (vm_flags & VM_SHARED)
918 pte = pte_mkclean(pte);
919 pte = pte_mkold(pte);
920
921 page = vm_normal_page(vma, addr, pte);
922 if (page) {
923 get_page(page);
924 page_dup_rmap(page);
925 if (PageAnon(page))
926 rss[MM_ANONPAGES]++;
927 else
928 rss[MM_FILEPAGES]++;
929 }
930
931out_set_pte:
932 set_pte_at(dst_mm, addr, dst_pte, pte);
933 return 0;
934}
935
936int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
937 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
938 unsigned long addr, unsigned long end)
939{
940 pte_t *orig_src_pte, *orig_dst_pte;
941 pte_t *src_pte, *dst_pte;
942 spinlock_t *src_ptl, *dst_ptl;
943 int progress = 0;
944 int rss[NR_MM_COUNTERS];
945 swp_entry_t entry = (swp_entry_t){0};
946
947again:
948 init_rss_vec(rss);
949
950 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
951 if (!dst_pte)
952 return -ENOMEM;
953 src_pte = pte_offset_map(src_pmd, addr);
954 src_ptl = pte_lockptr(src_mm, src_pmd);
955 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
956 orig_src_pte = src_pte;
957 orig_dst_pte = dst_pte;
958 arch_enter_lazy_mmu_mode();
959
960 do {
961
962
963
964
965 if (progress >= 32) {
966 progress = 0;
967 if (need_resched() ||
968 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
969 break;
970 }
971 if (pte_none(*src_pte)) {
972 progress++;
973 continue;
974 }
975 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
976 vma, addr, rss);
977 if (entry.val)
978 break;
979 progress += 8;
980 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
981
982 arch_leave_lazy_mmu_mode();
983 spin_unlock(src_ptl);
984 pte_unmap(orig_src_pte);
985 add_mm_rss_vec(dst_mm, rss);
986 pte_unmap_unlock(orig_dst_pte, dst_ptl);
987 cond_resched();
988
989 if (entry.val) {
990 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
991 return -ENOMEM;
992 progress = 0;
993 }
994 if (addr != end)
995 goto again;
996 return 0;
997}
998
999static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1000 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
1001 unsigned long addr, unsigned long end)
1002{
1003 pmd_t *src_pmd, *dst_pmd;
1004 unsigned long next;
1005
1006 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1007 if (!dst_pmd)
1008 return -ENOMEM;
1009 src_pmd = pmd_offset(src_pud, addr);
1010 do {
1011 next = pmd_addr_end(addr, end);
1012 if (pmd_trans_huge(*src_pmd)) {
1013 int err;
1014 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1015 err = copy_huge_pmd(dst_mm, src_mm,
1016 dst_pmd, src_pmd, addr, vma);
1017 if (err == -ENOMEM)
1018 return -ENOMEM;
1019 if (!err)
1020 continue;
1021
1022 }
1023 if (pmd_none_or_clear_bad(src_pmd))
1024 continue;
1025 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1026 vma, addr, next))
1027 return -ENOMEM;
1028 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1029 return 0;
1030}
1031
1032static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1033 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1034 unsigned long addr, unsigned long end)
1035{
1036 pud_t *src_pud, *dst_pud;
1037 unsigned long next;
1038
1039 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1040 if (!dst_pud)
1041 return -ENOMEM;
1042 src_pud = pud_offset(src_pgd, addr);
1043 do {
1044 next = pud_addr_end(addr, end);
1045 if (pud_none_or_clear_bad(src_pud))
1046 continue;
1047 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1048 vma, addr, next))
1049 return -ENOMEM;
1050 } while (dst_pud++, src_pud++, addr = next, addr != end);
1051 return 0;
1052}
1053
1054int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1055 struct vm_area_struct *vma)
1056{
1057 pgd_t *src_pgd, *dst_pgd;
1058 unsigned long next;
1059 unsigned long addr = vma->vm_start;
1060 unsigned long end = vma->vm_end;
1061 int ret;
1062
1063
1064
1065
1066
1067
1068
1069 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
1070 if (!vma->anon_vma)
1071 return 0;
1072 }
1073
1074 if (is_vm_hugetlb_page(vma))
1075 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1076
1077 if (unlikely(is_pfn_mapping(vma))) {
1078
1079
1080
1081
1082 ret = track_pfn_vma_copy(vma);
1083 if (ret)
1084 return ret;
1085 }
1086
1087
1088
1089
1090
1091
1092
1093 if (is_cow_mapping(vma->vm_flags))
1094 mmu_notifier_invalidate_range_start(src_mm, addr, end);
1095
1096 ret = 0;
1097 dst_pgd = pgd_offset(dst_mm, addr);
1098 src_pgd = pgd_offset(src_mm, addr);
1099 do {
1100 next = pgd_addr_end(addr, end);
1101 if (pgd_none_or_clear_bad(src_pgd))
1102 continue;
1103 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1104 vma, addr, next))) {
1105 ret = -ENOMEM;
1106 break;
1107 }
1108 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1109
1110 if (is_cow_mapping(vma->vm_flags))
1111 mmu_notifier_invalidate_range_end(src_mm,
1112 vma->vm_start, end);
1113 return ret;
1114}
1115
1116static unsigned long zap_pte_range(struct mmu_gather *tlb,
1117 struct vm_area_struct *vma, pmd_t *pmd,
1118 unsigned long addr, unsigned long end,
1119 struct zap_details *details)
1120{
1121 struct mm_struct *mm = tlb->mm;
1122 int force_flush = 0;
1123 int rss[NR_MM_COUNTERS];
1124 spinlock_t *ptl;
1125 pte_t *start_pte;
1126 pte_t *pte;
1127
1128again:
1129 init_rss_vec(rss);
1130 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1131 pte = start_pte;
1132 arch_enter_lazy_mmu_mode();
1133 do {
1134 pte_t ptent = *pte;
1135 if (pte_none(ptent)) {
1136 continue;
1137 }
1138
1139 if (pte_present(ptent)) {
1140 struct page *page;
1141
1142 page = vm_normal_page(vma, addr, ptent);
1143 if (unlikely(details) && page) {
1144
1145
1146
1147
1148
1149 if (details->check_mapping &&
1150 details->check_mapping != page->mapping)
1151 continue;
1152
1153
1154
1155
1156 if (details->nonlinear_vma &&
1157 (page->index < details->first_index ||
1158 page->index > details->last_index))
1159 continue;
1160 }
1161 ptent = ptep_get_and_clear_full(mm, addr, pte,
1162 tlb->fullmm);
1163 tlb_remove_tlb_entry(tlb, pte, addr);
1164 if (unlikely(!page))
1165 continue;
1166 if (unlikely(details) && details->nonlinear_vma
1167 && linear_page_index(details->nonlinear_vma,
1168 addr) != page->index)
1169 set_pte_at(mm, addr, pte,
1170 pgoff_to_pte(page->index));
1171 if (PageAnon(page))
1172 rss[MM_ANONPAGES]--;
1173 else {
1174 if (pte_dirty(ptent))
1175 set_page_dirty(page);
1176 if (pte_young(ptent) &&
1177 likely(!VM_SequentialReadHint(vma)))
1178 mark_page_accessed(page);
1179 rss[MM_FILEPAGES]--;
1180 }
1181 page_remove_rmap(page);
1182 if (unlikely(page_mapcount(page) < 0))
1183 print_bad_pte(vma, addr, ptent, page);
1184 force_flush = !__tlb_remove_page(tlb, page);
1185 if (force_flush)
1186 break;
1187 continue;
1188 }
1189
1190
1191
1192
1193 if (unlikely(details))
1194 continue;
1195 if (pte_file(ptent)) {
1196 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1197 print_bad_pte(vma, addr, ptent, NULL);
1198 } else {
1199 swp_entry_t entry = pte_to_swp_entry(ptent);
1200
1201 if (!non_swap_entry(entry))
1202 rss[MM_SWAPENTS]--;
1203 else if (is_migration_entry(entry)) {
1204 struct page *page;
1205
1206 page = migration_entry_to_page(entry);
1207
1208 if (PageAnon(page))
1209 rss[MM_ANONPAGES]--;
1210 else
1211 rss[MM_FILEPAGES]--;
1212 }
1213 if (unlikely(!free_swap_and_cache(entry)))
1214 print_bad_pte(vma, addr, ptent, NULL);
1215 }
1216 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1217 } while (pte++, addr += PAGE_SIZE, addr != end);
1218
1219 add_mm_rss_vec(mm, rss);
1220 arch_leave_lazy_mmu_mode();
1221 pte_unmap_unlock(start_pte, ptl);
1222
1223
1224
1225
1226
1227
1228 if (force_flush) {
1229 force_flush = 0;
1230 tlb_flush_mmu(tlb);
1231 if (addr != end)
1232 goto again;
1233 }
1234
1235 return addr;
1236}
1237
1238static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1239 struct vm_area_struct *vma, pud_t *pud,
1240 unsigned long addr, unsigned long end,
1241 struct zap_details *details)
1242{
1243 pmd_t *pmd;
1244 unsigned long next;
1245
1246 pmd = pmd_offset(pud, addr);
1247 do {
1248 next = pmd_addr_end(addr, end);
1249 if (pmd_trans_huge(*pmd)) {
1250 if (next - addr != HPAGE_PMD_SIZE) {
1251 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1252 split_huge_page_pmd(vma->vm_mm, pmd);
1253 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1254 goto next;
1255
1256 }
1257
1258
1259
1260
1261
1262
1263
1264 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1265 goto next;
1266 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1267next:
1268 cond_resched();
1269 } while (pmd++, addr = next, addr != end);
1270
1271 return addr;
1272}
1273
1274static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1275 struct vm_area_struct *vma, pgd_t *pgd,
1276 unsigned long addr, unsigned long end,
1277 struct zap_details *details)
1278{
1279 pud_t *pud;
1280 unsigned long next;
1281
1282 pud = pud_offset(pgd, addr);
1283 do {
1284 next = pud_addr_end(addr, end);
1285 if (pud_none_or_clear_bad(pud))
1286 continue;
1287 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1288 } while (pud++, addr = next, addr != end);
1289
1290 return addr;
1291}
1292
1293static unsigned long unmap_page_range(struct mmu_gather *tlb,
1294 struct vm_area_struct *vma,
1295 unsigned long addr, unsigned long end,
1296 struct zap_details *details)
1297{
1298 pgd_t *pgd;
1299 unsigned long next;
1300
1301 if (details && !details->check_mapping && !details->nonlinear_vma)
1302 details = NULL;
1303
1304 BUG_ON(addr >= end);
1305 mem_cgroup_uncharge_start();
1306 tlb_start_vma(tlb, vma);
1307 pgd = pgd_offset(vma->vm_mm, addr);
1308 do {
1309 next = pgd_addr_end(addr, end);
1310 if (pgd_none_or_clear_bad(pgd))
1311 continue;
1312 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1313 } while (pgd++, addr = next, addr != end);
1314 tlb_end_vma(tlb, vma);
1315 mem_cgroup_uncharge_end();
1316
1317 return addr;
1318}
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342unsigned long unmap_vmas(struct mmu_gather *tlb,
1343 struct vm_area_struct *vma, unsigned long start_addr,
1344 unsigned long end_addr, unsigned long *nr_accounted,
1345 struct zap_details *details)
1346{
1347 unsigned long start = start_addr;
1348 struct mm_struct *mm = vma->vm_mm;
1349
1350 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1351 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
1352 unsigned long end;
1353
1354 start = max(vma->vm_start, start_addr);
1355 if (start >= vma->vm_end)
1356 continue;
1357 end = min(vma->vm_end, end_addr);
1358 if (end <= vma->vm_start)
1359 continue;
1360
1361 if (vma->vm_flags & VM_ACCOUNT)
1362 *nr_accounted += (end - start) >> PAGE_SHIFT;
1363
1364 if (unlikely(is_pfn_mapping(vma)))
1365 untrack_pfn_vma(vma, 0, 0);
1366
1367 while (start != end) {
1368 if (unlikely(is_vm_hugetlb_page(vma))) {
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380 if (vma->vm_file)
1381 unmap_hugepage_range(vma, start, end, NULL);
1382
1383 start = end;
1384 } else
1385 start = unmap_page_range(tlb, vma, start, end, details);
1386 }
1387 }
1388
1389 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1390 return start;
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1401 unsigned long size, struct zap_details *details)
1402{
1403 struct mm_struct *mm = vma->vm_mm;
1404 struct mmu_gather tlb;
1405 unsigned long end = address + size;
1406 unsigned long nr_accounted = 0;
1407
1408 lru_add_drain();
1409 tlb_gather_mmu(&tlb, mm, 0);
1410 update_hiwater_rss(mm);
1411 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1412 tlb_finish_mmu(&tlb, address, end);
1413 return end;
1414}
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1429 unsigned long size)
1430{
1431 if (address < vma->vm_start || address + size > vma->vm_end ||
1432 !(vma->vm_flags & VM_PFNMAP))
1433 return -1;
1434 zap_page_range(vma, address, size, NULL);
1435 return 0;
1436}
1437EXPORT_SYMBOL_GPL(zap_vma_ptes);
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1452 unsigned int flags)
1453{
1454 pgd_t *pgd;
1455 pud_t *pud;
1456 pmd_t *pmd;
1457 pte_t *ptep, pte;
1458 spinlock_t *ptl;
1459 struct page *page;
1460 struct mm_struct *mm = vma->vm_mm;
1461
1462 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1463 if (!IS_ERR(page)) {
1464 BUG_ON(flags & FOLL_GET);
1465 goto out;
1466 }
1467
1468 page = NULL;
1469 pgd = pgd_offset(mm, address);
1470 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1471 goto no_page_table;
1472
1473 pud = pud_offset(pgd, address);
1474 if (pud_none(*pud))
1475 goto no_page_table;
1476 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1477 BUG_ON(flags & FOLL_GET);
1478 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1479 goto out;
1480 }
1481 if (unlikely(pud_bad(*pud)))
1482 goto no_page_table;
1483
1484 pmd = pmd_offset(pud, address);
1485 if (pmd_none(*pmd))
1486 goto no_page_table;
1487 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1488 BUG_ON(flags & FOLL_GET);
1489 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1490 goto out;
1491 }
1492 if (pmd_trans_huge(*pmd)) {
1493 if (flags & FOLL_SPLIT) {
1494 split_huge_page_pmd(mm, pmd);
1495 goto split_fallthrough;
1496 }
1497 spin_lock(&mm->page_table_lock);
1498 if (likely(pmd_trans_huge(*pmd))) {
1499 if (unlikely(pmd_trans_splitting(*pmd))) {
1500 spin_unlock(&mm->page_table_lock);
1501 wait_split_huge_page(vma->anon_vma, pmd);
1502 } else {
1503 page = follow_trans_huge_pmd(mm, address,
1504 pmd, flags);
1505 spin_unlock(&mm->page_table_lock);
1506 goto out;
1507 }
1508 } else
1509 spin_unlock(&mm->page_table_lock);
1510
1511 }
1512split_fallthrough:
1513 if (unlikely(pmd_bad(*pmd)))
1514 goto no_page_table;
1515
1516 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1517
1518 pte = *ptep;
1519 if (!pte_present(pte))
1520 goto no_page;
1521 if ((flags & FOLL_WRITE) && !pte_write(pte))
1522 goto unlock;
1523
1524 page = vm_normal_page(vma, address, pte);
1525 if (unlikely(!page)) {
1526 if ((flags & FOLL_DUMP) ||
1527 !is_zero_pfn(pte_pfn(pte)))
1528 goto bad_page;
1529 page = pte_page(pte);
1530 }
1531
1532 if (flags & FOLL_GET)
1533 get_page_foll(page);
1534 if (flags & FOLL_TOUCH) {
1535 if ((flags & FOLL_WRITE) &&
1536 !pte_dirty(pte) && !PageDirty(page))
1537 set_page_dirty(page);
1538
1539
1540
1541
1542
1543 mark_page_accessed(page);
1544 }
1545 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555 if (page->mapping && trylock_page(page)) {
1556 lru_add_drain();
1557
1558
1559
1560
1561
1562 if (page->mapping)
1563 mlock_vma_page(page);
1564 unlock_page(page);
1565 }
1566 }
1567unlock:
1568 pte_unmap_unlock(ptep, ptl);
1569out:
1570 return page;
1571
1572bad_page:
1573 pte_unmap_unlock(ptep, ptl);
1574 return ERR_PTR(-EFAULT);
1575
1576no_page:
1577 pte_unmap_unlock(ptep, ptl);
1578 if (!pte_none(pte))
1579 return page;
1580
1581no_page_table:
1582
1583
1584
1585
1586
1587
1588
1589
1590 if ((flags & FOLL_DUMP) &&
1591 (!vma->vm_ops || !vma->vm_ops->fault))
1592 return ERR_PTR(-EFAULT);
1593 return page;
1594}
1595
1596static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1597{
1598 return stack_guard_page_start(vma, addr) ||
1599 stack_guard_page_end(vma, addr+PAGE_SIZE);
1600}
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
/*
 * __get_user_pages() - look up, faulting in when necessary, the pages that
 * back a range of user addresses.
 * @tsk:	task whose maj_flt/min_flt counters are bumped for every fault
 *		taken here; may be NULL, in which case no accounting is done
 * @mm:		address space the lookup is performed in
 * @start:	first user address of the range
 * @nr_pages:	number of pages to process starting at @start
 * @gup_flags:	FOLL_* flags controlling the lookup
 * @pages:	array that receives the page pointers; must be non-NULL exactly
 *		when FOLL_GET is set in @gup_flags (enforced by VM_BUG_ON)
 * @vmas:	optional array that receives the vma used for each page
 * @nonblocking: optional; when non-NULL faults are issued with
 *		FAULT_FLAG_ALLOW_RETRY and *@nonblocking is cleared to 0 if the
 *		fault handler answers VM_FAULT_RETRY
 *
 * Returns the number of pages processed, which may be smaller than @nr_pages
 * if an error or a retry stopped the walk.  If nothing was processed, a
 * negative errno is returned for errors (-EFAULT, -ENOMEM, -EHWPOISON,
 * -ERESTARTSYS or the error encoded by follow_page()), and 0 for
 * @nr_pages <= 0 or a VM_FAULT_RETRY on the very first page.
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
		     struct page **pages, struct vm_area_struct **vmas,
		     int *nonblocking)
{
	int i;
	unsigned long vm_flags;

	if (nr_pages <= 0)
		return 0;

	/* A page array must be supplied if and only if FOLL_GET is requested. */
	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Build the mask of vma flags that permits this access: write or read
	 * bits depending on FOLL_WRITE, then narrowed to the VM_MAY* bits when
	 * FOLL_FORCE is set, or to the plain VM_READ/VM_WRITE bits otherwise.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			/*
			 * No vma, but the address is in the gate area: walk
			 * the page tables by hand for this single page.
			 */
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* Writes to the gate area are refused. */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					/*
					 * Only the zero pfn is accepted as a
					 * non-normal page, and not at all when
					 * FOLL_DUMP is set.
					 */
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			goto next_page;
		}

		/*
		 * Give up if there is no vma, if it is an IO or raw-PFN
		 * mapping, or if it grants none of the access bits computed
		 * above.
		 */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			/*
			 * follow_hugetlb_page() is handed pointers to start
			 * and nr_pages and returns the new value of i.
			 */
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;

			/* Stop early if the current task has a fatal signal pending. */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			/* Fault the page in until follow_page() yields something. */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				unsigned int fault_flags = 0;

				/* With FOLL_MLOCK, skip stack guard pages instead of faulting them in. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					/* Any other error bit is unexpected here. */
					BUG();
				}

				/* Fault accounting is optional: only done when a task was given. */
				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					/* Report the retry to the caller and return what we have so far. */
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The fault handler reported a completed write
				 * fault (VM_FAULT_WRITE) on a vma without
				 * VM_WRITE: drop FOLL_WRITE so the next
				 * follow_page() no longer insists on a
				 * writable pte.
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
next_page:
			/* Record the vma if asked to, then advance to the next page. */
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1863 unsigned long address, unsigned int fault_flags)
1864{
1865 struct vm_area_struct *vma;
1866 int ret;
1867
1868 vma = find_extend_vma(mm, address);
1869 if (!vma || address < vma->vm_start)
1870 return -EFAULT;
1871
1872 ret = handle_mm_fault(mm, vma, address, fault_flags);
1873 if (ret & VM_FAULT_ERROR) {
1874 if (ret & VM_FAULT_OOM)
1875 return -ENOMEM;
1876 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1877 return -EHWPOISON;
1878 if (ret & VM_FAULT_SIGBUS)
1879 return -EFAULT;
1880 BUG();
1881 }
1882 if (tsk) {
1883 if (ret & VM_FAULT_MAJOR)
1884 tsk->maj_flt++;
1885 else
1886 tsk->min_flt++;
1887 }
1888 return 0;
1889}
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1943 unsigned long start, int nr_pages, int write, int force,
1944 struct page **pages, struct vm_area_struct **vmas)
1945{
1946 int flags = FOLL_TOUCH;
1947
1948 if (pages)
1949 flags |= FOLL_GET;
1950 if (write)
1951 flags |= FOLL_WRITE;
1952 if (force)
1953 flags |= FOLL_FORCE;
1954
1955 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1956 NULL);
1957}
1958EXPORT_SYMBOL(get_user_pages);
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974#ifdef CONFIG_ELF_CORE
1975struct page *get_dump_page(unsigned long addr)
1976{
1977 struct vm_area_struct *vma;
1978 struct page *page;
1979
1980 if (__get_user_pages(current, current->mm, addr, 1,
1981 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1982 NULL) < 1)
1983 return NULL;
1984 flush_cache_page(vma, addr, page_to_pfn(page));
1985 return page;
1986}
1987#endif
1988
1989pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1990 spinlock_t **ptl)
1991{
1992 pgd_t * pgd = pgd_offset(mm, addr);
1993 pud_t * pud = pud_alloc(mm, pgd, addr);
1994 if (pud) {
1995 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1996 if (pmd) {
1997 VM_BUG_ON(pmd_trans_huge(*pmd));
1998 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1999 }
2000 }
2001 return NULL;
2002}
2003
2004
2005
2006
2007
2008
2009
2010
2011static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2012 struct page *page, pgprot_t prot)
2013{
2014 struct mm_struct *mm = vma->vm_mm;
2015 int retval;
2016 pte_t *pte;
2017 spinlock_t *ptl;
2018
2019 retval = -EINVAL;
2020 if (PageAnon(page))
2021 goto out;
2022 retval = -ENOMEM;
2023 flush_dcache_page(page);
2024 pte = get_locked_pte(mm, addr, &ptl);
2025 if (!pte)
2026 goto out;
2027 retval = -EBUSY;
2028 if (!pte_none(*pte))
2029 goto out_unlock;
2030
2031
2032 get_page(page);
2033 inc_mm_counter_fast(mm, MM_FILEPAGES);
2034 page_add_file_rmap(page);
2035 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2036
2037 retval = 0;
2038 pte_unmap_unlock(pte, ptl);
2039 return retval;
2040out_unlock:
2041 pte_unmap_unlock(pte, ptl);
2042out:
2043 return retval;
2044}
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2069 struct page *page)
2070{
2071 if (addr < vma->vm_start || addr >= vma->vm_end)
2072 return -EFAULT;
2073 if (!page_count(page))
2074 return -EINVAL;
2075 vma->vm_flags |= VM_INSERTPAGE;
2076 return insert_page(vma, addr, page, vma->vm_page_prot);
2077}
2078EXPORT_SYMBOL(vm_insert_page);
2079
2080static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2081 unsigned long pfn, pgprot_t prot)
2082{
2083 struct mm_struct *mm = vma->vm_mm;
2084 int retval;
2085 pte_t *pte, entry;
2086 spinlock_t *ptl;
2087
2088 retval = -ENOMEM;
2089 pte = get_locked_pte(mm, addr, &ptl);
2090 if (!pte)
2091 goto out;
2092 retval = -EBUSY;
2093 if (!pte_none(*pte))
2094 goto out_unlock;
2095
2096
2097 entry = pte_mkspecial(pfn_pte(pfn, prot));
2098 set_pte_at(mm, addr, pte, entry);
2099 update_mmu_cache(vma, addr, pte);
2100
2101 retval = 0;
2102out_unlock:
2103 pte_unmap_unlock(pte, ptl);
2104out:
2105 return retval;
2106}
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2126 unsigned long pfn)
2127{
2128 int ret;
2129 pgprot_t pgprot = vma->vm_page_prot;
2130
2131
2132
2133
2134
2135
2136 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2137 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2138 (VM_PFNMAP|VM_MIXEDMAP));
2139 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2140 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2141
2142 if (addr < vma->vm_start || addr >= vma->vm_end)
2143 return -EFAULT;
2144 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
2145 return -EINVAL;
2146
2147 ret = insert_pfn(vma, addr, pfn, pgprot);
2148
2149 if (ret)
2150 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2151
2152 return ret;
2153}
2154EXPORT_SYMBOL(vm_insert_pfn);
2155
2156int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2157 unsigned long pfn)
2158{
2159 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2160
2161 if (addr < vma->vm_start || addr >= vma->vm_end)
2162 return -EFAULT;
2163
2164
2165
2166
2167
2168
2169
2170
2171 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2172 struct page *page;
2173
2174 page = pfn_to_page(pfn);
2175 return insert_page(vma, addr, page, vma->vm_page_prot);
2176 }
2177 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2178}
2179EXPORT_SYMBOL(vm_insert_mixed);
2180
2181
2182
2183
2184
2185
2186static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2187 unsigned long addr, unsigned long end,
2188 unsigned long pfn, pgprot_t prot)
2189{
2190 pte_t *pte;
2191 spinlock_t *ptl;
2192
2193 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2194 if (!pte)
2195 return -ENOMEM;
2196 arch_enter_lazy_mmu_mode();
2197 do {
2198 BUG_ON(!pte_none(*pte));
2199 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2200 pfn++;
2201 } while (pte++, addr += PAGE_SIZE, addr != end);
2202 arch_leave_lazy_mmu_mode();
2203 pte_unmap_unlock(pte - 1, ptl);
2204 return 0;
2205}
2206
2207static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2208 unsigned long addr, unsigned long end,
2209 unsigned long pfn, pgprot_t prot)
2210{
2211 pmd_t *pmd;
2212 unsigned long next;
2213
2214 pfn -= addr >> PAGE_SHIFT;
2215 pmd = pmd_alloc(mm, pud, addr);
2216 if (!pmd)
2217 return -ENOMEM;
2218 VM_BUG_ON(pmd_trans_huge(*pmd));
2219 do {
2220 next = pmd_addr_end(addr, end);
2221 if (remap_pte_range(mm, pmd, addr, next,
2222 pfn + (addr >> PAGE_SHIFT), prot))
2223 return -ENOMEM;
2224 } while (pmd++, addr = next, addr != end);
2225 return 0;
2226}
2227
2228static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2229 unsigned long addr, unsigned long end,
2230 unsigned long pfn, pgprot_t prot)
2231{
2232 pud_t *pud;
2233 unsigned long next;
2234
2235 pfn -= addr >> PAGE_SHIFT;
2236 pud = pud_alloc(mm, pgd, addr);
2237 if (!pud)
2238 return -ENOMEM;
2239 do {
2240 next = pud_addr_end(addr, end);
2241 if (remap_pmd_range(mm, pud, addr, next,
2242 pfn + (addr >> PAGE_SHIFT), prot))
2243 return -ENOMEM;
2244 } while (pud++, addr = next, addr != end);
2245 return 0;
2246}
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2259 unsigned long pfn, unsigned long size, pgprot_t prot)
2260{
2261 pgd_t *pgd;
2262 unsigned long next;
2263 unsigned long end = addr + PAGE_ALIGN(size);
2264 struct mm_struct *mm = vma->vm_mm;
2265 int err;
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285 if (addr == vma->vm_start && end == vma->vm_end) {
2286 vma->vm_pgoff = pfn;
2287 vma->vm_flags |= VM_PFN_AT_MMAP;
2288 } else if (is_cow_mapping(vma->vm_flags))
2289 return -EINVAL;
2290
2291 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2292
2293 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
2294 if (err) {
2295
2296
2297
2298
2299 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2300 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2301 return -EINVAL;
2302 }
2303
2304 BUG_ON(addr >= end);
2305 pfn -= addr >> PAGE_SHIFT;
2306 pgd = pgd_offset(mm, addr);
2307 flush_cache_range(vma, addr, end);
2308 do {
2309 next = pgd_addr_end(addr, end);
2310 err = remap_pud_range(mm, pgd, addr, next,
2311 pfn + (addr >> PAGE_SHIFT), prot);
2312 if (err)
2313 break;
2314 } while (pgd++, addr = next, addr != end);
2315
2316 if (err)
2317 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2318
2319 return err;
2320}
2321EXPORT_SYMBOL(remap_pfn_range);
2322
2323static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2324 unsigned long addr, unsigned long end,
2325 pte_fn_t fn, void *data)
2326{
2327 pte_t *pte;
2328 int err;
2329 pgtable_t token;
2330 spinlock_t *uninitialized_var(ptl);
2331
2332 pte = (mm == &init_mm) ?
2333 pte_alloc_kernel(pmd, addr) :
2334 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2335 if (!pte)
2336 return -ENOMEM;
2337
2338 BUG_ON(pmd_huge(*pmd));
2339
2340 arch_enter_lazy_mmu_mode();
2341
2342 token = pmd_pgtable(*pmd);
2343
2344 do {
2345 err = fn(pte++, token, addr, data);
2346 if (err)
2347 break;
2348 } while (addr += PAGE_SIZE, addr != end);
2349
2350 arch_leave_lazy_mmu_mode();
2351
2352 if (mm != &init_mm)
2353 pte_unmap_unlock(pte-1, ptl);
2354 return err;
2355}
2356
2357static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2358 unsigned long addr, unsigned long end,
2359 pte_fn_t fn, void *data)
2360{
2361 pmd_t *pmd;
2362 unsigned long next;
2363 int err;
2364
2365 BUG_ON(pud_huge(*pud));
2366
2367 pmd = pmd_alloc(mm, pud, addr);
2368 if (!pmd)
2369 return -ENOMEM;
2370 do {
2371 next = pmd_addr_end(addr, end);
2372 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2373 if (err)
2374 break;
2375 } while (pmd++, addr = next, addr != end);
2376 return err;
2377}
2378
2379static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2380 unsigned long addr, unsigned long end,
2381 pte_fn_t fn, void *data)
2382{
2383 pud_t *pud;
2384 unsigned long next;
2385 int err;
2386
2387 pud = pud_alloc(mm, pgd, addr);
2388 if (!pud)
2389 return -ENOMEM;
2390 do {
2391 next = pud_addr_end(addr, end);
2392 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2393 if (err)
2394 break;
2395 } while (pud++, addr = next, addr != end);
2396 return err;
2397}
2398
2399
2400
2401
2402
2403int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2404 unsigned long size, pte_fn_t fn, void *data)
2405{
2406 pgd_t *pgd;
2407 unsigned long next;
2408 unsigned long end = addr + size;
2409 int err;
2410
2411 BUG_ON(addr >= end);
2412 pgd = pgd_offset(mm, addr);
2413 do {
2414 next = pgd_addr_end(addr, end);
2415 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2416 if (err)
2417 break;
2418 } while (pgd++, addr = next, addr != end);
2419
2420 return err;
2421}
2422EXPORT_SYMBOL_GPL(apply_to_page_range);
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2434 pte_t *page_table, pte_t orig_pte)
2435{
2436 int same = 1;
2437#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2438 if (sizeof(pte_t) > sizeof(unsigned long)) {
2439 spinlock_t *ptl = pte_lockptr(mm, pmd);
2440 spin_lock(ptl);
2441 same = pte_same(*page_table, orig_pte);
2442 spin_unlock(ptl);
2443 }
2444#endif
2445 pte_unmap(page_table);
2446 return same;
2447}
2448
2449static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2450{
2451
2452
2453
2454
2455
2456
2457 if (unlikely(!src)) {
2458 void *kaddr = kmap_atomic(dst, KM_USER0);
2459 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2460
2461
2462
2463
2464
2465
2466
2467 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2468 clear_page(kaddr);
2469 kunmap_atomic(kaddr, KM_USER0);
2470 flush_dcache_page(dst);
2471 } else
2472 copy_user_highpage(dst, src, va, vma);
2473}
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
/*
 * do_wp_page() - handle a write fault on a present pte.
 * @mm:		address space of the fault
 * @vma:	vma covering @address
 * @address:	faulting user address
 * @page_table:	mapped pte for @address
 * @pmd:	pmd the pte belongs to
 * @ptl:	page-table lock, held on entry
 * @orig_pte:	pte value the caller observed
 *
 * Entered with @page_table mapped and @ptl held; every exit path leaves the
 * pte unmapped and the lock released.  The page is either made writable in
 * place (the "reuse" path) or replaced by a freshly allocated private copy
 * (the "gotten" path).
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE is set when the write was
 * resolved here, VM_FAULT_OOM on allocation or charge failure, or whatever
 * error/NOPAGE bits ->page_mkwrite() reported.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * No normal page behind the pte.  A shared writable mapping
		 * just gets its pte made writable; anything else gets a new
		 * private page.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Anonymous (non-KSM) page: try to reuse it in place.  The page lock
	 * is needed for reuse_swap_page().
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			/*
			 * Could not get the page lock without sleeping: pin
			 * the page, drop the pte lock, sleep on the page lock
			 * and then recheck that the pte did not change.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/* The page can be reused: rebind its rmap to this vma and make the pte writable. */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Shared writable mapping of a normal page: the page stays in
		 * place; give the vma's ->page_mkwrite() hook, if any, a
		 * chance to prepare or veto the write.
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * The hook is called without the pte lock, so pin the
			 * page first; the pte is rechecked afterwards.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				/* The hook did not return the page locked: lock it here. */
				lock_page(old_page);
				if (!old_page->mapping) {
					/* Page lost its mapping meanwhile: return 0 without resolving the fault. */
					ret = 0;
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/* Retake the pte lock and make sure the pte is still the one we started with. */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		/* Make the existing pte young, dirty and (if the vma allows) writable. */
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		/* Only the shared-writable path above sets dirty_page. */
		if (!dirty_page)
			return ret;

		/*
		 * Without ->page_mkwrite() the page is not locked by us: wait
		 * for any holder of the page lock, then dirty the page via
		 * set_page_dirty_balance().
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
			/*
			 * The page is still locked and pinned from the
			 * ->page_mkwrite() step.  Sample ->mapping before
			 * unlocking, dirty the page, then drop lock and pin.
			 */
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* Update the file times of a file-backed vma. */
		if (vma->vm_file)
			file_update_time(vma->vm_file);

		return ret;
	}

	/*
	 * The page cannot be reused: pin the old page and replace it with a
	 * private copy.
	 */
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		/* Source is the zero page: a zeroed page needs no copy. */
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/* Retake the pte lock; install the new page only if the pte is unchanged. */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			/* A file page is being replaced by an anonymous one: move the counter. */
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear and flush the old pte before the new page is added to
		 * the rmap and the new pte is set.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/* set_pte_at_notify() rather than set_pte_at() is used to install the new pte. */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/* The old page is no longer mapped by this pte. */
			page_remove_rmap(old_page);
		}

		/*
		 * From here on new_page refers to the old page (or NULL), so
		 * that the release just below drops a reference on the old
		 * page instead of on the page that was just installed.
		 */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (old_page) {
		/*
		 * If the write was resolved in a VM_LOCKED vma, munlock the
		 * old page (under its page lock).
		 */
		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		/* Drop the pin taken before leaving the pte lock. */
		page_cache_release(old_page);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	/* Drop the pin taken before calling ->page_mkwrite(). */
	page_cache_release(old_page);
	return ret;
}
2776
/*
 * Zap the pages of @vma in [@start_addr, @end_addr) by handing the range,
 * expressed as start plus length, to zap_page_range().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long len = end_addr - start_addr;

	zap_page_range(vma, start_addr, len, details);
}
2783
2784static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2785 struct zap_details *details)
2786{
2787 struct vm_area_struct *vma;
2788 struct prio_tree_iter iter;
2789 pgoff_t vba, vea, zba, zea;
2790
2791 vma_prio_tree_foreach(vma, &iter, root,
2792 details->first_index, details->last_index) {
2793
2794 vba = vma->vm_pgoff;
2795 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2796
2797 zba = details->first_index;
2798 if (zba < vba)
2799 zba = vba;
2800 zea = details->last_index;
2801 if (zea > vea)
2802 zea = vea;
2803
2804 unmap_mapping_range_vma(vma,
2805 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2806 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2807 details);
2808 }
2809}
2810
2811static inline void unmap_mapping_range_list(struct list_head *head,
2812 struct zap_details *details)
2813{
2814 struct vm_area_struct *vma;
2815
2816
2817
2818
2819
2820
2821
2822 list_for_each_entry(vma, head, shared.vm_set.list) {
2823 details->nonlinear_vma = vma;
2824 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2825 }
2826}
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842void unmap_mapping_range(struct address_space *mapping,
2843 loff_t const holebegin, loff_t const holelen, int even_cows)
2844{
2845 struct zap_details details;
2846 pgoff_t hba = holebegin >> PAGE_SHIFT;
2847 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2848
2849
2850 if (sizeof(holelen) > sizeof(hlen)) {
2851 long long holeend =
2852 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2853 if (holeend & ~(long long)ULONG_MAX)
2854 hlen = ULONG_MAX - hba + 1;
2855 }
2856
2857 details.check_mapping = even_cows? NULL: mapping;
2858 details.nonlinear_vma = NULL;
2859 details.first_index = hba;
2860 details.last_index = hba + hlen - 1;
2861 if (details.last_index < details.first_index)
2862 details.last_index = ULONG_MAX;
2863
2864
2865 mutex_lock(&mapping->i_mmap_mutex);
2866 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2867 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2868 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2869 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2870 mutex_unlock(&mapping->i_mmap_mutex);
2871}
2872EXPORT_SYMBOL(unmap_mapping_range);
2873
2874
2875
2876
2877
2878
/*
 * do_swap_page() - handle a fault on a pte that holds a swap entry.
 * @mm:		address space of the fault
 * @vma:	vma covering @address
 * @address:	faulting user address
 * @page_table:	mapped (but not locked) pte for @address
 * @pmd:	pmd the pte belongs to
 * @flags:	FAULT_FLAG_* flags of the fault
 * @orig_pte:	pte value the caller observed
 *
 * The pte is unmapped right away by pte_unmap_same(); the pte lock is only
 * taken later, once the page has been brought in and locked.
 *
 * Returns a VM_FAULT_* mask: 0 or VM_FAULT_MAJOR (possibly with
 * VM_FAULT_WRITE) on success, VM_FAULT_RETRY if the page lock could not be
 * taken, or VM_FAULT_OOM / VM_FAULT_HWPOISON / VM_FAULT_SIGBUS on failure.
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache = NULL;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	struct mem_cgroup *ptr;
	int exclusive = 0;
	int ret = 0;

	/* The pte changed under us: nothing to do, the fault will be retaken if needed. */
	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		/*
		 * Not a real swap entry: wait for a migration to finish,
		 * report hardware poison, or flag a corrupt pte.
		 */
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		/* Not in the swap cache: read it in. */
		grab_swap_token(mm);
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Read-in produced no page.  Only report OOM if the
			 * pte still holds the same entry; if it changed, the
			 * fault is returned with ret unchanged.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* The page had to be read in: account this as a major fault. */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/* The cached page is hardware-poisoned: refuse to map it. */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	locked = lock_page_or_retry(page, mm, flags);
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * With the page locked, verify that it is still in the swap cache
	 * and still belongs to this swap entry; otherwise back out.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	if (ksm_might_need_to_copy(page, vma, address)) {
		/*
		 * KSM may require a copy: keep the swap-cache page in
		 * "swapcache" and continue with the page returned by
		 * ksm_does_need_to_copy().
		 */
		swapcache = page;
		page = ksm_does_need_to_copy(page, vma, address);

		if (unlikely(!page)) {
			/* Copy failed: restore "page" so the unwind path unlocks and releases it once. */
			ret = VM_FAULT_OOM;
			page = swapcache;
			swapcache = NULL;
			goto out_page;
		}
	}

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/* Take the pte lock and make sure the pte still holds our swap entry. */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Commit point: convert the swap entry into a present pte.  The
	 * counters move one unit from swap entries to anonymous pages.
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		/*
		 * Write fault on a page that can be reused: map it writable
		 * and dirty now, and clear FAULT_FLAG_WRITE so that
		 * do_wp_page() is not called below.
		 */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	set_pte_at(mm, address, page_table, pte);
	do_page_add_anon_rmap(page, vma, address, exclusive);
	/* The charge is committed only after the page has been added to the rmap. */
	mem_cgroup_commit_charge_swapin(page, ptr);

	swap_free(entry);
	/* Try to release the swap slot when swap is full or the page is (to be) mlocked. */
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (swapcache) {
		/*
		 * A KSM copy was mapped instead of the swap-cache page; the
		 * swap-cache page was held locked until now and is unlocked
		 * and released here.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		/*
		 * Still a write fault (the page was not reused above): let
		 * do_wp_page() finish it.  It releases the pte lock itself.
		 */
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No write involved: just refresh the MMU cache for the new pte. */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	/* Failure after the charge attempt and with the pte lock held. */
	mem_cgroup_cancel_charge_swapin(ptr);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
3056
3057
3058
3059
3060
3061
3062static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3063{
3064 address &= PAGE_MASK;
3065 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3066 struct vm_area_struct *prev = vma->vm_prev;
3067
3068
3069
3070
3071
3072
3073
3074 if (prev && prev->vm_end == address)
3075 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3076
3077 expand_downwards(vma, address - PAGE_SIZE);
3078 }
3079 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3080 struct vm_area_struct *next = vma->vm_next;
3081
3082
3083 if (next && next->vm_start == address + PAGE_SIZE)
3084 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3085
3086 expand_upwards(vma, address + PAGE_SIZE);
3087 }
3088 return 0;
3089}
3090
3091
3092
3093
3094
3095
3096static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3097 unsigned long address, pte_t *page_table, pmd_t *pmd,
3098 unsigned int flags)
3099{
3100 struct page *page;
3101 spinlock_t *ptl;
3102 pte_t entry;
3103
3104 pte_unmap(page_table);
3105
3106
3107 if (check_stack_guard_page(vma, address) < 0)
3108 return VM_FAULT_SIGBUS;
3109
3110
3111 if (!(flags & FAULT_FLAG_WRITE)) {
3112 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3113 vma->vm_page_prot));
3114 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3115 if (!pte_none(*page_table))
3116 goto unlock;
3117 goto setpte;
3118 }
3119
3120
3121 if (unlikely(anon_vma_prepare(vma)))
3122 goto oom;
3123 page = alloc_zeroed_user_highpage_movable(vma, address);
3124 if (!page)
3125 goto oom;
3126 __SetPageUptodate(page);
3127
3128 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3129 goto oom_free_page;
3130
3131 entry = mk_pte(page, vma->vm_page_prot);
3132 if (vma->vm_flags & VM_WRITE)
3133 entry = pte_mkwrite(pte_mkdirty(entry));
3134
3135 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3136 if (!pte_none(*page_table))
3137 goto release;
3138
3139 inc_mm_counter_fast(mm, MM_ANONPAGES);
3140 page_add_new_anon_rmap(page, vma, address);
3141setpte:
3142 set_pte_at(mm, address, page_table, entry);
3143
3144
3145 update_mmu_cache(vma, address, page_table);
3146unlock:
3147 pte_unmap_unlock(page_table, ptl);
3148 return 0;
3149release:
3150 mem_cgroup_uncharge_page(page);
3151 page_cache_release(page);
3152 goto unlock;
3153oom_free_page:
3154 page_cache_release(page);
3155oom:
3156 return VM_FAULT_OOM;
3157}
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3173 unsigned long address, pmd_t *pmd,
3174 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3175{
3176 pte_t *page_table;
3177 spinlock_t *ptl;
3178 struct page *page;
3179 struct page *cow_page;
3180 pte_t entry;
3181 int anon = 0;
3182 struct page *dirty_page = NULL;
3183 struct vm_fault vmf;
3184 int ret;
3185 int page_mkwrite = 0;
3186
3187
3188
3189
3190
3191 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3192
3193 if (unlikely(anon_vma_prepare(vma)))
3194 return VM_FAULT_OOM;
3195
3196 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3197 if (!cow_page)
3198 return VM_FAULT_OOM;
3199
3200 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3201 page_cache_release(cow_page);
3202 return VM_FAULT_OOM;
3203 }
3204 } else
3205 cow_page = NULL;
3206
3207 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3208 vmf.pgoff = pgoff;
3209 vmf.flags = flags;
3210 vmf.page = NULL;
3211
3212 ret = vma->vm_ops->fault(vma, &vmf);
3213 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3214 VM_FAULT_RETRY)))
3215 goto uncharge_out;
3216
3217 if (unlikely(PageHWPoison(vmf.page))) {
3218 if (ret & VM_FAULT_LOCKED)
3219 unlock_page(vmf.page);
3220 ret = VM_FAULT_HWPOISON;
3221 goto uncharge_out;
3222 }
3223
3224
3225
3226
3227
3228 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3229 lock_page(vmf.page);
3230 else
3231 VM_BUG_ON(!PageLocked(vmf.page));
3232
3233
3234
3235
3236 page = vmf.page;
3237 if (flags & FAULT_FLAG_WRITE) {
3238 if (!(vma->vm_flags & VM_SHARED)) {
3239 page = cow_page;
3240 anon = 1;
3241 copy_user_highpage(page, vmf.page, address, vma);
3242 __SetPageUptodate(page);
3243 } else {
3244
3245
3246
3247
3248
3249 if (vma->vm_ops->page_mkwrite) {
3250 int tmp;
3251
3252 unlock_page(page);
3253 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3254 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3255 if (unlikely(tmp &
3256 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3257 ret = tmp;
3258 goto unwritable_page;
3259 }
3260 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3261 lock_page(page);
3262 if (!page->mapping) {
3263 ret = 0;
3264 unlock_page(page);
3265 goto unwritable_page;
3266 }
3267 } else
3268 VM_BUG_ON(!PageLocked(page));
3269 page_mkwrite = 1;
3270 }
3271 }
3272
3273 }
3274
3275 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288 if (likely(pte_same(*page_table, orig_pte))) {
3289 flush_icache_page(vma, page);
3290 entry = mk_pte(page, vma->vm_page_prot);
3291 if (flags & FAULT_FLAG_WRITE)
3292 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3293 if (anon) {
3294 inc_mm_counter_fast(mm, MM_ANONPAGES);
3295 page_add_new_anon_rmap(page, vma, address);
3296 } else {
3297 inc_mm_counter_fast(mm, MM_FILEPAGES);
3298 page_add_file_rmap(page);
3299 if (flags & FAULT_FLAG_WRITE) {
3300 dirty_page = page;
3301 get_page(dirty_page);
3302 }
3303 }
3304 set_pte_at(mm, address, page_table, entry);
3305
3306
3307 update_mmu_cache(vma, address, page_table);
3308 } else {
3309 if (cow_page)
3310 mem_cgroup_uncharge_page(cow_page);
3311 if (anon)
3312 page_cache_release(page);
3313 else
3314 anon = 1;
3315 }
3316
3317 pte_unmap_unlock(page_table, ptl);
3318
3319 if (dirty_page) {
3320 struct address_space *mapping = page->mapping;
3321
3322 if (set_page_dirty(dirty_page))
3323 page_mkwrite = 1;
3324 unlock_page(dirty_page);
3325 put_page(dirty_page);
3326 if (page_mkwrite && mapping) {
3327
3328
3329
3330
3331 balance_dirty_pages_ratelimited(mapping);
3332 }
3333
3334
3335 if (vma->vm_file)
3336 file_update_time(vma->vm_file);
3337 } else {
3338 unlock_page(vmf.page);
3339 if (anon)
3340 page_cache_release(vmf.page);
3341 }
3342
3343 return ret;
3344
3345unwritable_page:
3346 page_cache_release(page);
3347 return ret;
3348uncharge_out:
3349
3350 if (cow_page) {
3351 mem_cgroup_uncharge_page(cow_page);
3352 page_cache_release(cow_page);
3353 }
3354 return ret;
3355}
3356
3357static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3358 unsigned long address, pte_t *page_table, pmd_t *pmd,
3359 unsigned int flags, pte_t orig_pte)
3360{
3361 pgoff_t pgoff = (((address & PAGE_MASK)
3362 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3363
3364 pte_unmap(page_table);
3365 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3366}
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3378 unsigned long address, pte_t *page_table, pmd_t *pmd,
3379 unsigned int flags, pte_t orig_pte)
3380{
3381 pgoff_t pgoff;
3382
3383 flags |= FAULT_FLAG_NONLINEAR;
3384
3385 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3386 return 0;
3387
3388 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3389
3390
3391
3392 print_bad_pte(vma, address, orig_pte, NULL);
3393 return VM_FAULT_SIGBUS;
3394 }
3395
3396 pgoff = pte_to_pgoff(orig_pte);
3397 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3398}
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413int handle_pte_fault(struct mm_struct *mm,
3414 struct vm_area_struct *vma, unsigned long address,
3415 pte_t *pte, pmd_t *pmd, unsigned int flags)
3416{
3417 pte_t entry;
3418 spinlock_t *ptl;
3419
3420 entry = *pte;
3421 if (!pte_present(entry)) {
3422 if (pte_none(entry)) {
3423 if (vma->vm_ops) {
3424 if (likely(vma->vm_ops->fault))
3425 return do_linear_fault(mm, vma, address,
3426 pte, pmd, flags, entry);
3427 }
3428 return do_anonymous_page(mm, vma, address,
3429 pte, pmd, flags);
3430 }
3431 if (pte_file(entry))
3432 return do_nonlinear_fault(mm, vma, address,
3433 pte, pmd, flags, entry);
3434 return do_swap_page(mm, vma, address,
3435 pte, pmd, flags, entry);
3436 }
3437
3438 ptl = pte_lockptr(mm, pmd);
3439 spin_lock(ptl);
3440 if (unlikely(!pte_same(*pte, entry)))
3441 goto unlock;
3442 if (flags & FAULT_FLAG_WRITE) {
3443 if (!pte_write(entry))
3444 return do_wp_page(mm, vma, address,
3445 pte, pmd, ptl, entry);
3446 entry = pte_mkdirty(entry);
3447 }
3448 entry = pte_mkyoung(entry);
3449 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3450 update_mmu_cache(vma, address, pte);
3451 } else {
3452
3453
3454
3455
3456
3457
3458 if (flags & FAULT_FLAG_WRITE)
3459 flush_tlb_fix_spurious_fault(vma, address);
3460 }
3461unlock:
3462 pte_unmap_unlock(pte, ptl);
3463 return 0;
3464}
3465
3466
3467
3468
3469int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3470 unsigned long address, unsigned int flags)
3471{
3472 pgd_t *pgd;
3473 pud_t *pud;
3474 pmd_t *pmd;
3475 pte_t *pte;
3476
3477 __set_current_state(TASK_RUNNING);
3478
3479 count_vm_event(PGFAULT);
3480 mem_cgroup_count_vm_event(mm, PGFAULT);
3481
3482
3483 check_sync_rss_stat(current);
3484
3485 if (unlikely(is_vm_hugetlb_page(vma)))
3486 return hugetlb_fault(mm, vma, address, flags);
3487
3488 pgd = pgd_offset(mm, address);
3489 pud = pud_alloc(mm, pgd, address);
3490 if (!pud)
3491 return VM_FAULT_OOM;
3492 pmd = pmd_alloc(mm, pud, address);
3493 if (!pmd)
3494 return VM_FAULT_OOM;
3495 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3496 if (!vma->vm_ops)
3497 return do_huge_pmd_anonymous_page(mm, vma, address,
3498 pmd, flags);
3499 } else {
3500 pmd_t orig_pmd = *pmd;
3501 barrier();
3502 if (pmd_trans_huge(orig_pmd)) {
3503 if (flags & FAULT_FLAG_WRITE &&
3504 !pmd_write(orig_pmd) &&
3505 !pmd_trans_splitting(orig_pmd))
3506 return do_huge_pmd_wp_page(mm, vma, address,
3507 pmd, orig_pmd);
3508 return 0;
3509 }
3510 }
3511
3512
3513
3514
3515
3516
3517 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3518 return VM_FAULT_OOM;
3519
3520 if (unlikely(pmd_trans_huge(*pmd)))
3521 return 0;
3522
3523
3524
3525
3526
3527
3528 pte = pte_offset_map(pmd, address);
3529
3530 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3531}
3532
3533#ifndef __PAGETABLE_PUD_FOLDED
3534
3535
3536
3537
3538int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3539{
3540 pud_t *new = pud_alloc_one(mm, address);
3541 if (!new)
3542 return -ENOMEM;
3543
3544 smp_wmb();
3545
3546 spin_lock(&mm->page_table_lock);
3547 if (pgd_present(*pgd))
3548 pud_free(mm, new);
3549 else
3550 pgd_populate(mm, pgd, new);
3551 spin_unlock(&mm->page_table_lock);
3552 return 0;
3553}
3554#endif
3555
3556#ifndef __PAGETABLE_PMD_FOLDED
3557
3558
3559
3560
3561int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3562{
3563 pmd_t *new = pmd_alloc_one(mm, address);
3564 if (!new)
3565 return -ENOMEM;
3566
3567 smp_wmb();
3568
3569 spin_lock(&mm->page_table_lock);
3570#ifndef __ARCH_HAS_4LEVEL_HACK
3571 if (pud_present(*pud))
3572 pmd_free(mm, new);
3573 else
3574 pud_populate(mm, pud, new);
3575#else
3576 if (pgd_present(*pud))
3577 pmd_free(mm, new);
3578 else
3579 pgd_populate(mm, pud, new);
3580#endif
3581 spin_unlock(&mm->page_table_lock);
3582 return 0;
3583}
3584#endif
3585
3586int make_pages_present(unsigned long addr, unsigned long end)
3587{
3588 int ret, len, write;
3589 struct vm_area_struct * vma;
3590
3591 vma = find_vma(current->mm, addr);
3592 if (!vma)
3593 return -ENOMEM;
3594
3595
3596
3597
3598
3599 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3600 BUG_ON(addr >= end);
3601 BUG_ON(end > vma->vm_end);
3602 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3603 ret = get_user_pages(current, current->mm, addr,
3604 len, write, 0, NULL, NULL);
3605 if (ret < 0)
3606 return ret;
3607 return ret == len ? 0 : -EFAULT;
3608}
3609
3610#if !defined(__HAVE_ARCH_GATE_AREA)
3611
3612#if defined(AT_SYSINFO_EHDR)
3613static struct vm_area_struct gate_vma;
3614
3615static int __init gate_vma_init(void)
3616{
3617 gate_vma.vm_mm = NULL;
3618 gate_vma.vm_start = FIXADDR_USER_START;
3619 gate_vma.vm_end = FIXADDR_USER_END;
3620 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3621 gate_vma.vm_page_prot = __P101;
3622
3623
3624
3625
3626
3627
3628 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3629 return 0;
3630}
3631__initcall(gate_vma_init);
3632#endif
3633
/*
 * get_gate_vma - return the gate vma, or NULL when AT_SYSINFO_EHDR is not
 * defined (no gate vma exists in that configuration).  @mm is unused.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
	struct vm_area_struct *gate = NULL;

#ifdef AT_SYSINFO_EHDR
	gate = &gate_vma;
#endif
	return gate;
}
3642
/*
 * in_gate_area_no_mm - report whether @addr lies inside the gate range.
 *
 * Returns 1 when AT_SYSINFO_EHDR is defined and @addr falls within
 * [FIXADDR_USER_START, FIXADDR_USER_END); returns 0 otherwise.
 */
int in_gate_area_no_mm(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
	return (addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END);
#else
	return 0;
#endif
}
3651
3652#endif
3653
3654static int __follow_pte(struct mm_struct *mm, unsigned long address,
3655 pte_t **ptepp, spinlock_t **ptlp)
3656{
3657 pgd_t *pgd;
3658 pud_t *pud;
3659 pmd_t *pmd;
3660 pte_t *ptep;
3661
3662 pgd = pgd_offset(mm, address);
3663 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3664 goto out;
3665
3666 pud = pud_offset(pgd, address);
3667 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3668 goto out;
3669
3670 pmd = pmd_offset(pud, address);
3671 VM_BUG_ON(pmd_trans_huge(*pmd));
3672 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3673 goto out;
3674
3675
3676 if (pmd_huge(*pmd))
3677 goto out;
3678
3679 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3680 if (!ptep)
3681 goto out;
3682 if (!pte_present(*ptep))
3683 goto unlock;
3684 *ptepp = ptep;
3685 return 0;
3686unlock:
3687 pte_unmap_unlock(ptep, *ptlp);
3688out:
3689 return -EINVAL;
3690}
3691
3692static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3693 pte_t **ptepp, spinlock_t **ptlp)
3694{
3695 int res;
3696
3697
3698 (void) __cond_lock(*ptlp,
3699 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3700 return res;
3701}
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3714 unsigned long *pfn)
3715{
3716 int ret = -EINVAL;
3717 spinlock_t *ptl;
3718 pte_t *ptep;
3719
3720 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3721 return ret;
3722
3723 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3724 if (ret)
3725 return ret;
3726 *pfn = pte_pfn(*ptep);
3727 pte_unmap_unlock(ptep, ptl);
3728 return 0;
3729}
3730EXPORT_SYMBOL(follow_pfn);
3731
3732#ifdef CONFIG_HAVE_IOREMAP_PROT
3733int follow_phys(struct vm_area_struct *vma,
3734 unsigned long address, unsigned int flags,
3735 unsigned long *prot, resource_size_t *phys)
3736{
3737 int ret = -EINVAL;
3738 pte_t *ptep, pte;
3739 spinlock_t *ptl;
3740
3741 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3742 goto out;
3743
3744 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3745 goto out;
3746 pte = *ptep;
3747
3748 if ((flags & FOLL_WRITE) && !pte_write(pte))
3749 goto unlock;
3750
3751 *prot = pgprot_val(pte_pgprot(pte));
3752 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3753
3754 ret = 0;
3755unlock:
3756 pte_unmap_unlock(ptep, ptl);
3757out:
3758 return ret;
3759}
3760
3761int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3762 void *buf, int len, int write)
3763{
3764 resource_size_t phys_addr;
3765 unsigned long prot = 0;
3766 void __iomem *maddr;
3767 int offset = addr & (PAGE_SIZE-1);
3768
3769 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3770 return -EINVAL;
3771
3772 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3773 if (write)
3774 memcpy_toio(maddr + offset, buf, len);
3775 else
3776 memcpy_fromio(buf, maddr + offset, len);
3777 iounmap(maddr);
3778
3779 return len;
3780}
3781#endif
3782
3783
3784
3785
3786
3787static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3788 unsigned long addr, void *buf, int len, int write)
3789{
3790 struct vm_area_struct *vma;
3791 void *old_buf = buf;
3792
3793 down_read(&mm->mmap_sem);
3794
3795 while (len) {
3796 int bytes, ret, offset;
3797 void *maddr;
3798 struct page *page = NULL;
3799
3800 ret = get_user_pages(tsk, mm, addr, 1,
3801 write, 1, &page, &vma);
3802 if (ret <= 0) {
3803
3804
3805
3806
3807#ifdef CONFIG_HAVE_IOREMAP_PROT
3808 vma = find_vma(mm, addr);
3809 if (!vma || vma->vm_start > addr)
3810 break;
3811 if (vma->vm_ops && vma->vm_ops->access)
3812 ret = vma->vm_ops->access(vma, addr, buf,
3813 len, write);
3814 if (ret <= 0)
3815#endif
3816 break;
3817 bytes = ret;
3818 } else {
3819 bytes = len;
3820 offset = addr & (PAGE_SIZE-1);
3821 if (bytes > PAGE_SIZE-offset)
3822 bytes = PAGE_SIZE-offset;
3823
3824 maddr = kmap(page);
3825 if (write) {
3826 copy_to_user_page(vma, page, addr,
3827 maddr + offset, buf, bytes);
3828 set_page_dirty_lock(page);
3829 } else {
3830 copy_from_user_page(vma, page, addr,
3831 buf, maddr + offset, bytes);
3832 }
3833 kunmap(page);
3834 page_cache_release(page);
3835 }
3836 len -= bytes;
3837 buf += bytes;
3838 addr += bytes;
3839 }
3840 up_read(&mm->mmap_sem);
3841
3842 return buf - old_buf;
3843}
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
/*
 * access_remote_vm - copy @len bytes between @buf and address @addr of @mm.
 * @write: non-zero to write into the address space, zero to read from it
 *
 * Thin wrapper around __access_remote_vm() with no task supplied.
 * Returns the number of bytes transferred.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	int copied;

	copied = __access_remote_vm(NULL, mm, addr, buf, len, write);
	return copied;
}
3860
3861
3862
3863
3864
3865
/*
 * access_process_vm - copy @len bytes between @buf and address @addr in the
 * address space of task @tsk.
 *
 * Takes a reference on the task's mm for the duration of the copy.
 * Returns the number of bytes transferred, or 0 if the task has no mm.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len, write);
		mmput(mm);
	}

	return copied;
}
3881
3882
3883
3884
3885void print_vma_addr(char *prefix, unsigned long ip)
3886{
3887 struct mm_struct *mm = current->mm;
3888 struct vm_area_struct *vma;
3889
3890
3891
3892
3893
3894 if (preempt_count())
3895 return;
3896
3897 down_read(&mm->mmap_sem);
3898 vma = find_vma(mm, ip);
3899 if (vma && vma->vm_file) {
3900 struct file *f = vma->vm_file;
3901 char *buf = (char *)__get_free_page(GFP_KERNEL);
3902 if (buf) {
3903 char *p, *s;
3904
3905 p = d_path(&f->f_path, buf, PAGE_SIZE);
3906 if (IS_ERR(p))
3907 p = "?";
3908 s = strrchr(p, '/');
3909 if (s)
3910 p = s+1;
3911 printk("%s%s[%lx+%lx]", prefix, p,
3912 vma->vm_start,
3913 vma->vm_end - vma->vm_start);
3914 free_page((unsigned long)buf);
3915 }
3916 }
3917 up_read(¤t->mm->mmap_sem);
3918}
3919
3920#ifdef CONFIG_PROVE_LOCKING
3921void might_fault(void)
3922{
3923
3924
3925
3926
3927
3928
3929 if (segment_eq(get_fs(), KERNEL_DS))
3930 return;
3931
3932 might_sleep();
3933
3934
3935
3936
3937
3938 if (!in_atomic() && current->mm)
3939 might_lock_read(¤t->mm->mmap_sem);
3940}
3941EXPORT_SYMBOL(might_fault);
3942#endif
3943
3944#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3945static void clear_gigantic_page(struct page *page,
3946 unsigned long addr,
3947 unsigned int pages_per_huge_page)
3948{
3949 int i;
3950 struct page *p = page;
3951
3952 might_sleep();
3953 for (i = 0; i < pages_per_huge_page;
3954 i++, p = mem_map_next(p, page, i)) {
3955 cond_resched();
3956 clear_user_highpage(p, addr + i * PAGE_SIZE);
3957 }
3958}
3959void clear_huge_page(struct page *page,
3960 unsigned long addr, unsigned int pages_per_huge_page)
3961{
3962 int i;
3963
3964 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3965 clear_gigantic_page(page, addr, pages_per_huge_page);
3966 return;
3967 }
3968
3969 might_sleep();
3970 for (i = 0; i < pages_per_huge_page; i++) {
3971 cond_resched();
3972 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3973 }
3974}
3975
3976static void copy_user_gigantic_page(struct page *dst, struct page *src,
3977 unsigned long addr,
3978 struct vm_area_struct *vma,
3979 unsigned int pages_per_huge_page)
3980{
3981 int i;
3982 struct page *dst_base = dst;
3983 struct page *src_base = src;
3984
3985 for (i = 0; i < pages_per_huge_page; ) {
3986 cond_resched();
3987 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3988
3989 i++;
3990 dst = mem_map_next(dst, dst_base, i);
3991 src = mem_map_next(src, src_base, i);
3992 }
3993}
3994
3995void copy_user_huge_page(struct page *dst, struct page *src,
3996 unsigned long addr, struct vm_area_struct *vma,
3997 unsigned int pages_per_huge_page)
3998{
3999 int i;
4000
4001 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4002 copy_user_gigantic_page(dst, src, addr, vma,
4003 pages_per_huge_page);
4004 return;
4005 }
4006
4007 might_sleep();
4008 for (i = 0; i < pages_per_huge_page; i++) {
4009 cond_resched();
4010 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4011 }
4012}
4013#endif
4014