// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C)  2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/kdebug.h>
#include <linux/extable.h>
#include <linux/memblock.h>
#include <linux/kfence.h>
#include <linux/kprobes.h>
#include <linux/mmiotrace.h>
#include <linux/perf_event.h>
#include <linux/hugetlb.h>
#include <linux/prefetch.h>
#include <linux/context_tracking.h>
#include <linux/uaccess.h>
#include <linux/efi.h>
#include <linux/mm_types.h>

#include <asm/cpufeature.h>
#include <asm/traps.h>
#include <asm/fixmap.h>
#include <asm/vsyscall.h>
#include <asm/vm86.h>
#include <asm/mmu_context.h>
#include <asm/efi.h>
#include <asm/desc.h>
#include <asm/cpu_entry_area.h>
#include <asm/pgtable_areas.h>
#include <asm/kvm_para.h>
#include <asm/vdso.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on
 *   prefetch instructions.  Check for that here and ignore the fault.
 *   This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 segment
		 * prefixes.  In 64-bit long mode the CPU signals an
		 * invalid opcode if some of these prefixes are present,
		 * so x86-64 never gets here for those anyway.
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In 64-bit mode 0x40..0x4F are valid REX prefixes
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (get_kernel_nofault(opcode, instr))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

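/*
 * AMD erratum #91 only affects K8 (family 0xf) parts that predate the
 * NPT-capable revisions; the model < 0x40 cutoff below corresponds to
 * that revision split.
 */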
static bool is_amd_k8_pre_npt(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
			c->x86_vendor == X86_VENDOR_AMD &&
			c->x86 == 0xf && c->x86_model < 0x40);
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/* Erratum #91 affects AMD K8, pre-NPT CPUs only */
	if (!is_amd_k8_pre_npt())
		return 0;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page,
	 * then do not ignore the fault:
	 */
	if (error_code & X86_PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	/*
	 * The opcode probe below can itself fault (the instruction
	 * bytes may be paged out or unmapped), and it may run in a
	 * context where taking a page fault is not allowed.  Disable
	 * page faults so the accessors below fail instead of recursing.
	 */
	pagefault_disable();

	while (instr < max_instr) {
		unsigned char opcode;

		if (user_mode(regs)) {
			if (get_user(opcode, instr))
				break;
		} else {
			if (get_kernel_nofault(opcode, instr))
				break;
		}

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}

	pagefault_enable();
	return prefetch;
}

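/*
 * pgd_list links the page-global directories that need kernel-mapping
 * changes propagated to them (maintained by the pgd allocation code);
 * together with pgd_lock it lets vmalloc/ioremap updates to the kernel
 * part of the address space reach every page table.
 */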
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);

	if (pmd_present(*pmd) != pmd_present(*pmd_k))
		set_pmd(pmd, *pmd_k);

	if (!pmd_present(*pmd_k))
		return NULL;
	else
		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));

	return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This is needed because there is a race condition between the time
 * when the vmalloc mapping code updates the PMD to the point in time
 * where it synchronizes this update with the other page-tables in the
 * system.
 *
 * In this race window another thread/CPU can map an area on the same
 * PMD, finds it already present and does not synchronize it with the
 * rest of the system yet. As a result v[mz]alloc might return areas
 * which are not mapped in every page-table in the system, causing an
 * unhandled page-fault when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_large(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

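/*
 * Propagate a change to the kernel mappings (vmalloc/ioremap) in the
 * range [start, end] from the reference page table (init_mm.pgd) into
 * every registered pgd, one PMD at a time.
 */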
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start & PMD_MASK;
	     addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
	     addr += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			vmalloc_sync_one(page_address(page), addr);
			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

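/*
 * Print the page-table walk for @address as seen from the active CR3
 * (32-bit version).  Only low (direct-mapped) PFNs are dereferenced, so
 * the dump itself cannot fault on page tables that live in highmem.
 */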
static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Returns non-zero if the page-table entry at @p cannot be read safely. */
static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (user_mode(regs))
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Otherwise we
 * would hang on a SIGSEGV loop.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
	    idt_is_f00f_address(address)) {
		handle_invalid_op(regs);
		return 1;
	}
#endif
	return 0;
}

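/*
 * Dump the GDT entry that a segment selector (LDTR or TR) points at.
 * Used when an implicit supervisor access faulted while the CPU was in
 * user mode, which often indicates a bad descriptor table.
 */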
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
	u32 offset = (index >> 3) * sizeof(struct desc_struct);
	unsigned long addr;
	struct ldttss_desc desc;

	if (index == 0) {
		pr_alert("%s: NULL\n", name);
		return;
	}

	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
		return;
	}

	if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
				     sizeof(struct ldttss_desc))) {
		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
			 name, index);
		return;
	}

	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
	addr |= ((u64)desc.base3 << 32);
#endif
	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
		    (pgd_flags(*pgd) & _PAGE_USER) &&
		    (__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			 (void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			 (void *)address);

	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
					       "read access",
		 user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
		 (error_code & X86_PF_PK)    ? "protection keys violation" :
					       "permissions violation");

	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
		struct desc_ptr idt, gdt;
		u16 ldtr, tr;

		/*
		 * This can happen for quite a few reasons.  The more
		 * obvious ones are faults accessing the GDT, or LDT.
		 * Perhaps surprisingly, if the CPU tries to deliver a
		 * benign or contributory exception from user code and
		 * gets a page fault during delivery, the page fault can
		 * be delivered as though it originated directly from
		 * user code.  This could happen due to wrong permissions
		 * on the IDT, GDT, LDT, TSS, or kernel or IST stack.
		 */
		store_idt(&idt);

		/* Usable even on Xen PV -- it's just slow. */
		native_store_gdt(&gdt);

		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
	}

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static void sanitize_error_code(unsigned long address,
				unsigned long *error_code)
{
	/*
	 * To avoid leaking information about the kernel page
	 * table layout, pretend that user-mode accesses to
	 * kernel addresses are always protection faults.
	 *
	 * NB: This means that failed vsyscalls with vsyscall=none
	 * will have the PROT bit.  This doesn't leak any
	 * information and does not appear to cause any problems.
	 */
	if (address >= TASK_SIZE_MAX)
		*error_code |= X86_PF_PROT;
}

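/*
 * Record the fault details that a subsequent signal delivery will want
 * to see.  X86_PF_USER is set unconditionally: the fault is reported to
 * the task as a user-visible fault even when it was triggered by a
 * kernel-mode access to a user address.
 */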
static void set_signal_archinfo(unsigned long address,
				unsigned long error_code)
{
	struct task_struct *tsk = current;

	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code | X86_PF_USER;
	tsk->thread.cr2 = address;
}

static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	unsigned long flags;
	int sig;

	if (user_mode(regs)) {
		/*
		 * Implicit kernel access from user mode?  Skip the stack
		 * overflow and EFI special cases.
		 */
		goto oops;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);

		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : ASM_CALL_CONSTRAINT
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

	/*
	 * Buggy firmware could access regions which might page fault.  If
	 * this happens, EFI has a special OOPS path that will try to
	 * avoid hanging the system.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_crash_gracefully_on_page_fault(address);

	/* Only not-present faults should be handled by KFENCE. */
	if (!(error_code & X86_PF_PROT) &&
	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
		return;

oops:
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address, int signal, int si_code)
{
	WARN_ON_ONCE(user_mode(regs));

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to a faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), meaning we're running
		 * in task context.  If the faulting uaccess asked for a
		 * signal (currently only the vsyscall emulation code sets
		 * sig_on_uaccess_err), deliver it now instead of silently
		 * applying the fixup.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			sanitize_error_code(address, &error_code);

			set_signal_archinfo(address, error_code);

			/* XXX: hwpoison faults will set the wrong code. */
			force_sig_fault(signal, si_code, (void __user *)address);
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

	/*
	 * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
	 * instruction.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
	       loglvl, tsk->comm, task_pid_nr(tsk), address,
	       (void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");

	show_opcodes(regs, loglvl);
}

/*
 * The (legacy) vsyscall page is the long page in the kernel portion
 * of the address space that has user-accessible permissions.
 */
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}

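/*
 * Deliver SIGSEGV (or a pkey/si_code variant of it) for a fault that
 * cannot be resolved, without the mmap lock held.  Kernel-mode faults
 * are routed to the fixup/oops paths instead.
 */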
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, u32 pkey, int si_code)
{
	struct task_struct *tsk = current;

	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
		return;
	}

	if (!(error_code & X86_PF_USER)) {
		/* Implicit user access to kernel memory -- just oops */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * User mode accesses just cause a SIGSEGV.
	 * It's possible to have interrupts off here:
	 */
	local_irq_enable();

	/*
	 * Valid to do another page fault here because this one came
	 * from user space:
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata100(regs, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	if (likely(show_unhandled_signals))
		show_signal_msg(regs, error_code, address, tsk);

	set_signal_archinfo(address, error_code);

	if (si_code == SEGV_PKUERR)
		force_sig_pkuerr((void __user *)address, pkey);
	else
		force_sig_fault(SIGSEGV, si_code, (void __user *)address);

	local_irq_disable();
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, u32 pkey, int si_code)
{
	struct mm_struct *mm = current->mm;
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	mmap_read_unlock(mm);

	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
}

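/*
 * Was this fault caused by protection keys?  True either when the
 * hardware already told us so (X86_PF_PK), or when the VMA is
 * pkey-protected in a way that forbids this access.
 */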
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
		struct vm_area_struct *vma)
{
	/* This code is always called on the current mm */
	bool foreign = false;

	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return false;
	if (error_code & X86_PF_PK)
		return true;
	/* this checks permission keys on the VMA: */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return true;
	return false;
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address, struct vm_area_struct *vma)
{
	/*
	 * This OSPKE check is not strictly necessary at runtime.
	 * But, doing it this way allows compiler optimizations
	 * if pkeys are compiled out.
	 */
	if (bad_area_access_from_pkeys(error_code, vma)) {
		/*
		 * A protection key fault means that the PKRU value did not
		 * allow access to some PTE.  Userspace can figure out what
		 * PKRU was from the XSAVE state.  This function fills out
		 * the pkey in the siginfo, so userspace can discover which
		 * protection key was set on the PTE.
		 *
		 * If we get here, we know that the hardware signaled an
		 * X86_PF_PK fault and that there was a VMA once we got into
		 * the fault handler.  It does *not* guarantee that the pkey
		 * reported here matches the one that actually faulted: the
		 * page tables may have been changed (e.g. by mprotect_pkey())
		 * at any point after the fault, so the pkey taken from the
		 * VMA is a best effort.
		 */
		u32 pkey = vma_pkey(vma);

		__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
	} else {
		__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
	}
}

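/*
 * Deliver SIGBUS for a fault the VM layer flagged as such, including
 * the BUS_MCEERR_AR case for hardware memory poisoning, where the lsb
 * argument tells userspace how large the affected region around the
 * address is.
 */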
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  vm_fault_t fault)
{
	/* Kernel mode? Handle exceptions or die: */
	if (!user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
		return;

	set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		struct task_struct *tsk = current;
		unsigned lsb = 0;

		pr_err(
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
		if (fault & VM_FAULT_HWPOISON)
			lsb = PAGE_SHIFT;
		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
		return;
	}
#endif
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}

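/*
 * Does the faulting access contradict the PTE's permissions?  If not
 * (write to a writable page, fetch from an executable page), the fault
 * was spurious: the TLB simply held a stale, more restrictive entry.
 */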
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/*
	 * Only writes to RO or instruction fetches from NX may cause
	 * spurious faults.
	 *
	 * These could be from user or supervisor accesses but the TLB
	 * is only lazily flushed after a kernel mapping protection
	 * change, so user accesses are not expected to cause spurious
	 * faults.
	 */
	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
	    error_code != (X86_PF_INSTR | X86_PF_PROT))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_large(*p4d))
		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_kernel_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_kernel_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

int show_unhandled_signals = 1;

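/*
 * Check whether the fault is an access the VMA's permissions do not
 * allow.  Returns non-zero for a genuine permission error; zero means
 * the fault may still be resolvable (e.g. by COW or demand paging).
 */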
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
	/* This is only called for the current mm, so: */
	bool foreign = false;

	/*
	 * Read or write was blocked by protection keys.  This is
	 * always an unconditional error and can never result in
	 * a follow-up action to resolve the fault, like a COW.
	 */
	if (error_code & X86_PF_PK)
		return 1;

	/*
	 * SGX hardware blocked the access.  This usually happens
	 * when the enclave memory contents have been destroyed, like
	 * after a hibernation cycle.  The kernel cannot resolve such
	 * a fault, so treat it as an unconditional error and let the
	 * enclave's userspace runtime deal with the resulting signal.
	 */
	if (unlikely(error_code & X86_PF_SGX))
		return 1;

	/*
	 * Make sure to check the VMA so that we do not perform
	 * faults just to hit a X86_PF_PK as soon as we fill in a
	 * page.
	 */
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
		return 1;

	if (error_code & X86_PF_WRITE) {
		/* write, present and write, not present: */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & X86_PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!vma_is_accessible(vma)))
		return 1;

	return 0;
}

bool fault_in_kernel_space(unsigned long address)
{
	/*
	 * On 64-bit systems, the vsyscall page is at an address above
	 * TASK_SIZE_MAX, but is not considered part of the kernel
	 * address space.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
		return false;

	return address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
		   unsigned long address)
{
	/*
	 * Protection keys exceptions only happen on user pages.  We
	 * have no user pages in the kernel portion of the address
	 * space, so we do not expect any such faults.
	 */
	WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
	/*
	 * We can fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * Before doing this on-demand faulting, ensure that the
	 * fault is not any of the following:
	 * 1. A fault on a PTE with a reserved bit set.
	 * 2. A fault caused by a user-mode access.  (Do not demand-
	 *    fault kernel memory due to user-mode accesses).
	 * 3. A fault caused by a page-level protection violation.
	 *    (A demand fault would be on a non-present page which
	 *     would have X86_PF_PROT==0).
	 *
	 * This is only needed to close a race condition on x86-32 in
	 * the vmalloc mapping/unmapping code. See the comment above
	 * vmalloc_fault() for details. On x86-64 the race does not
	 * exist as the vmalloc mappings don't need to be synchronized
	 * there.
	 */
	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
		if (vmalloc_fault(address) >= 0)
			return;
	}
#endif

	if (is_f00f_bug(regs, hw_error_code, address))
		return;

	/* Was the fault spurious, caused by lazy TLB invalidation? */
	if (spurious_kernel_fault(hw_error_code, address))
		return;

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Note, despite being a "bad area", there are quite a few
	 * acceptable reasons to get here, such as erratum fixups
	 * and handling kernel code that can fault, like get_user().
	 *
	 * Don't take the mm semaphore here. If we fixup a prefetch
	 * fault we could otherwise deadlock:
	 */
	bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.  Nothing in here
 * should check X86_PF_USER without a specific justification: for almost
 * all purposes, we should treat a normal kernel access to user memory
 * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
 * The one exception is AC flag handling, which is, per the x86
 * architecture, special for WRUSS.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
			unsigned long error_code,
			unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	vm_fault_t fault;
	unsigned int flags = FAULT_FLAG_DEFAULT;

	tsk = current;
	mm = tsk->mm;

	if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
		/*
		 * Whoops, this is kernel mode code trying to execute from
		 * user memory.  Unless this is AMD erratum #93, which
		 * corrupts RIP such that it looks like a user address,
		 * this should not happen.
		 *
		 * Either way it is a kernel bug: oops.
		 */
		if (is_errata93(regs, address))
			return;

		page_fault_oops(regs, error_code, address);
		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Reserved bits are never expected to be set on
	 * entries in the user portion of the page tables.
	 */
	if (unlikely(error_code & X86_PF_RSVD))
		pgtable_bad(regs, error_code, address);

	/*
	 * If SMAP is on, check for invalid kernel (supervisor) access to user
	 * pages in the user address space.  The odd case here is WRUSS,
	 * which, according to the preliminary documentation, does not respect
	 * SMAP and will have the USER bit set so, in all cases, SMAP
	 * enforcement appears to be consistent with the USER bit.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC))) {
		/*
		 * No extable entry here.  This was a kernel access to an
		 * invalid pointer.  get_kernel_nofault() will not get here.
		 */
		page_fault_oops(regs, error_code, address);
		return;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode(regs)) {
		local_irq_enable();
		flags |= FAULT_FLAG_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (error_code & X86_PF_WRITE)
		flags |= FAULT_FLAG_WRITE;
	if (error_code & X86_PF_INSTR)
		flags |= FAULT_FLAG_INSTRUCTION;

#ifdef CONFIG_X86_64
	/*
	 * Faults in the vsyscall page might need emulation.  The
	 * vsyscall page is at a high address (>PAGE_OFFSET), but is
	 * considered to be part of the user address space.
	 *
	 * The vsyscall page does not have a "real" VMA, so do this
	 * emulation before we go searching for VMAs.
	 *
	 * PKRU never rejects instruction fetches, so we don't need
	 * to consider the PF_PK bit.
	 */
	if (is_vsyscall_vaddr(address)) {
		if (emulate_vsyscall(error_code, regs, address))
			return;
	}
#endif

	/*
	 * Kernel-mode access to the user address space should only occur
	 * on well-defined single instructions listed in the exception
	 * tables.  But, an erroneous kernel fault occurring outside one of
	 * those areas which also holds mmap_lock might deadlock attempting
	 * to validate the fault against the address space.
	 *
	 * Only do the expensive exception table search when we might be at
	 * risk of a deadlock.  This happens if we
	 * 1. Failed to acquire mmap_lock, and
	 * 2. The access did not originate in userspace.
	 */
	if (unlikely(!mmap_read_trylock(mm))) {
		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
			/*
			 * Fault from code in kernel from
			 * which we do not expect faults.
			 */
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
retry:
		mmap_read_lock(mm);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address, vma);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
	 * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
	 *
	 * Note that handle_userfault() may also release and reacquire
	 * mmap_lock (and not return with VM_FAULT_RETRY), when returning
	 * to userland to repeat the page fault later with a VM_FAULT_NOPAGE
	 * retval (potentially after handling any pending signal during the
	 * return to userland).  The return to userland is identified
	 * whenever FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in
	 * flags.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);

	if (fault_signal_pending(fault, regs)) {
		/*
		 * Quick path to respond to signals.  The core mm code
		 * has unlocked the mm for us if we get here.
		 */
		if (!user_mode(regs))
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGBUS, BUS_ADRERR);
		return;
	}

	/*
	 * If we need to retry the mmap_lock has already been released,
	 * and if there is a fatal signal pending there is no guarantee
	 * that we made any progress. Handle this case first.
	 */
	if (unlikely((fault & VM_FAULT_RETRY) &&
		     (flags & FAULT_FLAG_ALLOW_RETRY))) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(mm);
	if (likely(!(fault & VM_FAULT_ERROR)))
		return;

	if (fatal_signal_pending(current) && !user_mode(regs)) {
		kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
		return;
	}

	if (fault & VM_FAULT_OOM) {
		/* Kernel mode? Handle exceptions or die: */
		if (!user_mode(regs)) {
			kernelmode_fixup_or_oops(regs, error_code, address,
						 SIGSEGV, SEGV_MAPERR);
			return;
		}

		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we
		 * got oom-killed):
		 */
		pagefault_out_of_memory();
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
			     VM_FAULT_HWPOISON_LARGE))
			do_sigbus(regs, error_code, address, fault);
		else if (fault & VM_FAULT_SIGSEGV)
			bad_area_nosemaphore(regs, error_code, address);
		else
			BUG();
	}
}
NOKPROBE_SYMBOL(do_user_addr_fault);

static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address)
{
	if (!trace_pagefault_enabled())
		return;

	if (user_mode(regs))
		trace_page_fault_user(address, regs, error_code);
	else
		trace_page_fault_kernel(address, regs, error_code);
}
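
/*
 * Common #PF dispatch: fire the tracepoint, give mmiotrace a chance to
 * claim the fault, then route by the faulting *address* (not the mode
 * bits in the error code) to the kernel- or user-address handler.
 */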
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
		  unsigned long address)
{
	trace_page_fault_entries(regs, error_code, address);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/* Was the fault on kernel-controlled part of the address space? */
	if (unlikely(fault_in_kernel_space(address))) {
		do_kern_addr_fault(regs, error_code, address);
	} else {
		do_user_addr_fault(regs, error_code, address);
		/*
		 * User address page fault handling might have reenabled
		 * interrupts. Fixing up all potential exit points of
		 * do_user_addr_fault() and its leaf functions is just not
		 * doable w/o creating an unholy mess or turning the code
		 * upside down.
		 */
		local_irq_disable();
	}
}

DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
	unsigned long address = read_cr2();
	irqentry_state_t state;

	prefetchw(&current->mm->mmap_lock);

	/*
	 * KVM uses the #PF vector to deliver 'page not present' events
	 * to guests (asynchronous page fault mechanism).  The event
	 * happens when a userspace task is trying to access some valid
	 * (from the guest's point of view) memory which is not currently
	 * mapped by the host (e.g. the memory is swapped out).  Note,
	 * the corresponding "page ready" event which is injected when
	 * the memory becomes available, is delivered via an interrupt
	 * mechanism and not a #PF exception.
	 *
	 * We are relying on the interrupted context being sane (valid
	 * RSP, relevant locks not held, etc.), which is fine as long as
	 * the interrupted context had IF=1.  We are also relying on the
	 * KVM async pf type field and CR2 being read consistently
	 * instead of getting values from real and async page faults
	 * mixed up.
	 *
	 * Fingers crossed.
	 *
	 * The async #PF handling code takes care of idtentry handling
	 * itself.
	 */
	if (kvm_handle_async_pf(regs, (u32)address))
		return;

	/*
	 * Entry handling for valid #PF from kernel mode is slightly
	 * different: RCU is already watching and rcu_irq_enter() must
	 * not be invoked because a kernel fault on a user space address
	 * might sleep.
	 *
	 * In case the fault hit an RCU idle region the conditional entry
	 * code reenabled RCU to avoid subsequent wreckage which helps
	 * debuggability.
	 */
	state = irqentry_enter(regs);

	instrumentation_begin();
	handle_page_fault(regs, error_code, address);
	instrumentation_end();

	irqentry_exit(regs, state);
}