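/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * this code detects hard lockups: incidents in which a CPU
 * cannot do anything except process NMI handlers for a
 * considerable length of time (10 seconds or more).
 *
 * Note: Most of this code is borrowed heavily from softlockup.c,
 * so thanks to Ingo for the initial implementation.
 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
 * to those contributors as well.
 */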
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>

#include <asm/irq_regs.h>
#include <linux/perf_event.h>

int watchdog_enabled = 1;
int __read_mostly softlockup_thresh = 60;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
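/* boot commands */
/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */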
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic;

static int __init hardlockup_panic_setup(char *str)
{
	if (!strncmp(str, "panic", 5))
		hardlockup_panic = 1;
	return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
#endif

unsigned int __read_mostly softlockup_panic =
			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;

static int __init softlockup_panic_setup(char *str)
{
	softlockup_panic = simple_strtoul(str, NULL, 0);

	return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

static int __init nowatchdog_setup(char *str)
{
	watchdog_enabled = 0;
	return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/* deprecated: behaves like "nowatchdog" and disables both detectors */
static int __init nosoftlockup_setup(char *str)
{
	watchdog_enabled = 0;
	return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
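/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 */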
static unsigned long get_timestamp(int this_cpu)
{
	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
}

static u64 get_sample_period(void)
{
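	/*
	 * convert softlockup_thresh from seconds to ns
	 * the divide by 5 gives the hrtimer 5 chances to
	 * increment before the hardlockup detector generates
	 * a warning
	 *
	 * computed in 64-bit so the product cannot overflow the
	 * 32-bit long arithmetic used on 32-bit systems
	 */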
	return softlockup_thresh * ((u64)NSEC_PER_SEC / 5);
}

/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
	int this_cpu = smp_processor_id();

	__get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
}

void touch_softlockup_watchdog(void)
{
	__raw_get_cpu_var(watchdog_touch_ts) = 0;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

void touch_all_softlockup_watchdogs(void)
{
	int cpu;
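	/*
	 * this is done lockless
	 * do we care if a 0 races with a timestamp?
	 * all it means is the softlockup check starts one cycle later
	 */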
	for_each_online_cpu(cpu)
		per_cpu(watchdog_touch_ts, cpu) = 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
	if (watchdog_enabled) {
		unsigned cpu;

		for_each_present_cpu(cpu) {
			if (per_cpu(watchdog_nmi_touch, cpu) != true)
				per_cpu(watchdog_nmi_touch, cpu) = true;
		}
	}
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

#endif

void touch_softlockup_watchdog_sync(void)
{
	__raw_get_cpu_var(softlockup_touch_sync) = true;
	__raw_get_cpu_var(watchdog_touch_ts) = 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
static int is_hardlockup(void)
{
	unsigned long hrint = __get_cpu_var(hrtimer_interrupts);

	if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
		return 1;

	__get_cpu_var(hrtimer_interrupts_saved) = hrint;
	return 0;
}
#endif

static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp(smp_processor_id());

	/* Warn about unreasonable delays: */
	if (time_after(now, touch_ts + softlockup_thresh))
		return now - touch_ts;

	return 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
static struct perf_event_attr wd_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event, int nmi,
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

	if (__get_cpu_var(watchdog_nmi_touch) == true) {
		__get_cpu_var(watchdog_nmi_touch) = false;
		return;
	}
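	/* check for a hardlockup
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before we overflow'd.  If it hasn't
	 * then this is a good indication the cpu is stuck
	 */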
	if (is_hardlockup()) {
		int this_cpu = smp_processor_id();

		/* only print hardlockups once */
		if (__get_cpu_var(hard_watchdog_warn) == true)
			return;

		if (hardlockup_panic)
			panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
		else
			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);

		__get_cpu_var(hard_watchdog_warn) = true;
		return;
	}

	__get_cpu_var(hard_watchdog_warn) = false;
	return;
}
static void watchdog_interrupt_count(void)
{
	__get_cpu_var(hrtimer_interrupts)++;
}
#else
static inline void watchdog_interrupt_count(void) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */

/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
	struct pt_regs *regs = get_irq_regs();
	int duration;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
	wake_up_process(__get_cpu_var(softlockup_watchdog));

	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));

	if (touch_ts == 0) {
		if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__get_cpu_var(softlockup_touch_sync) = false;
			sched_clock_tick();
		}
		__touch_watchdog();
		return HRTIMER_RESTART;
	}
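	/* check for a softlockup
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't then
	 * this is a good indication some task is hogging the cpu
	 */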
	duration = is_softlockup(touch_ts);
	if (unlikely(duration)) {
		/* only warn once */
		if (__get_cpu_var(soft_watchdog_warn) == true)
			return HRTIMER_RESTART;

		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
			smp_processor_id(), duration,
			current->comm, task_pid_nr(current));
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();

		if (softlockup_panic)
			panic("softlockup: hung tasks");
		__get_cpu_var(soft_watchdog_warn) = true;
	} else
		__get_cpu_var(soft_watchdog_warn) = false;

	return HRTIMER_RESTART;
}
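/*
 * The watchdog thread - touches the timestamp.
 */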
static int watchdog(void *unused)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

	sched_setscheduler(current, SCHED_FIFO, &param);

	/* initialize timestamp */
	__touch_watchdog();

	/* kick off the timer for the hardlockup detector */
	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
		      HRTIMER_MODE_REL_PINNED);

	set_current_state(TASK_INTERRUPTIBLE);
	/*
	 * Run briefly each time the timer fires to reset the softlockup
	 * timestamp.  If this gets delayed for more than softlockup_thresh
	 * seconds then the debug-printout triggers in watchdog_timer_fn().
	 */
	while (!kthread_should_stop()) {
		__touch_watchdog();
		schedule();

		if (kthread_should_stop())
			break;

		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int watchdog_nmi_enable(int cpu)
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	/* Try to register using hardware perf events */
	wd_attr = &wd_hw_attr;
	wd_attr->sample_period = hw_nmi_get_sample_period();
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
	if (!IS_ERR(event)) {
		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
		goto out_save;
	}

	printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
	       cpu, PTR_ERR(event));
	return PTR_ERR(event);

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}

static void watchdog_nmi_disable(int cpu)
{
	struct perf_event *event = per_cpu(watchdog_ev, cpu);

	if (event) {
		perf_event_disable(event);
		per_cpu(watchdog_ev, cpu) = NULL;

		/* release the event and free up the hw-pmu counter */
		perf_event_release_kernel(event);
	}
	return;
}
#else
static int watchdog_nmi_enable(int cpu) { return 0; }
static void watchdog_nmi_disable(int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */

/* prepare/enable/disable routines */
static int watchdog_prepare_cpu(int cpu)
{
	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);

	WARN_ON(per_cpu(softlockup_watchdog, cpu));
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;

	return 0;
}

static int watchdog_enable(int cpu)
{
	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
	int err;

	/* enable the perf event */
	err = watchdog_nmi_enable(cpu);
	if (err)
		return err;

	/* create the watchdog thread */
	if (!p) {
		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
		if (IS_ERR(p)) {
			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
			return PTR_ERR(p);
		}
		kthread_bind(p, cpu);
		per_cpu(watchdog_touch_ts, cpu) = 0;
		per_cpu(softlockup_watchdog, cpu) = p;
		wake_up_process(p);
	}

	return 0;
}

static void watchdog_disable(int cpu)
{
	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
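	/*
	 * cancel the timer first to stop incrementing the stats
	 * and waking up the kthread
	 */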
	hrtimer_cancel(hrtimer);

	/* disable the perf event */
	watchdog_nmi_disable(cpu);

	/* stop the watchdog thread */
	if (p) {
		per_cpu(softlockup_watchdog, cpu) = NULL;
		kthread_stop(p);
	}
}

static void watchdog_enable_all_cpus(void)
{
	int cpu;

	watchdog_enabled = 0;	/* assume failure until one cpu succeeds */

	for_each_online_cpu(cpu)
		if (!watchdog_enable(cpu))
			/* if any cpu succeeds, watchdog is considered
			   enabled for the system */
			watchdog_enabled = 1;

	if (!watchdog_enabled)
		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
}

static void watchdog_disable_all_cpus(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		watchdog_disable(cpu);

	/* if all watchdogs are disabled, proceed */
	watchdog_enabled = 0;
}

/* sysctl functions */
#ifdef CONFIG_SYSCTL
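
/*
 * proc handlers for the watchdog sysctls (/proc/sys/kernel/watchdog*)
 */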
int proc_dowatchdog_enabled(struct ctl_table *table, int write,
		     void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, buffer, length, ppos);

	if (write) {
		if (watchdog_enabled)
			watchdog_enable_all_cpus();
		else
			watchdog_disable_all_cpus();
	}
	return 0;
}

int proc_dowatchdog_thresh(struct ctl_table *table, int write,
			     void __user *buffer,
			     size_t *lenp, loff_t *ppos)
{
	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
#endif /* CONFIG_SYSCTL */
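
/*
 * Create/destroy watchdog threads as CPUs come and go:
 */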
static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;
	int err = 0;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		err = watchdog_prepare_cpu(hotcpu);
		break;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		if (watchdog_enabled)
			err = watchdog_enable(hotcpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		watchdog_disable(hotcpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		watchdog_disable(hotcpu);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return notifier_from_errno(err);
}

static struct notifier_block __cpuinitdata cpu_nfb = {
	.notifier_call = cpu_callback
};

static int __init spawn_watchdog_task(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
	WARN_ON(notifier_to_errno(err));

	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
	register_cpu_notifier(&cpu_nfb);

	return 0;
}
early_initcall(spawn_watchdog_task);