linux/tools/tracing/latency/latency-collector.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2017, 2018, 2019, 2021 BMW Car IT GmbH
   4 * Author: Viktor Rosendahl (viktor.rosendahl@bmw.de)
   5 */
   6
   7#define _GNU_SOURCE
   8#define _POSIX_C_SOURCE 200809L
   9
  10#include <ctype.h>
  11#include <stdbool.h>
  12#include <stdio.h>
  13#include <stdlib.h>
  14#include <string.h>
  15
  16#include <err.h>
  17#include <errno.h>
  18#include <fcntl.h>
  19#include <getopt.h>
  20#include <sched.h>
  21#include <linux/unistd.h>
  22#include <signal.h>
  23#include <sys/inotify.h>
  24#include <unistd.h>
  25#include <pthread.h>
  26#include <tracefs.h>
  27
/* argv[0] for diagnostics; prg_unknown is the fallback if it is absent. */
static const char *prg_name;
static const char *prg_unknown = "unknown program name";

/* File descriptor of stdout, obtained in open_stdout(). */
static int fd_stdout;

/* Scheduling policy/priority requested on the command line. */
static int sched_policy;
static bool sched_policy_set;

static int sched_pri;
static bool sched_pri_set;

/* Behaviour toggles, presumably set during option parsing — confirm in main(). */
static bool trace_enable = true;
static bool setup_ftrace = true;
static bool use_random_sleep;

/*
 * X-macro list of the ftrace options this tool may toggle. Expanded
 * twice below: once to build the enum traceopt indices (OPTIDX_*) and
 * once to build the matching option-name strings in optstr[].
 */
#define TRACE_OPTS                              \
	C(FUNC_TR, "function-trace"),           \
	C(DISP_GR, "display-graph"),            \
	C(NR,       NULL)

#undef C
#define C(a, b) OPTIDX_##a

enum traceopt {
	TRACE_OPTS
};

#undef C
#define C(a, b)  b

static const char *const optstr[] = {
	TRACE_OPTS
};

/* How read_file() reacts to a read failure. */
enum errhandling {
	ERR_EXIT = 0,	/* exit immediately, no ftrace cleanup */
	ERR_WARN,	/* warn and return NULL */
	ERR_CLEANUP,	/* warn, restore ftrace state, then exit */
};

/* Desired value of each trace option for this run. */
static bool use_options[OPTIDX_NR];

static char inotify_buffer[655360];

#define likely(x)      __builtin_expect(!!(x), 1)
#define unlikely(x)    __builtin_expect(!!(x), 0)
#define bool2str(x)    (x ? "true":"false")

#define DEFAULT_NR_PRINTER_THREADS (3)
static unsigned int nr_threads = DEFAULT_NR_PRINTER_THREADS;

#define DEFAULT_TABLE_SIZE (2)
static unsigned int table_startsize = DEFAULT_TABLE_SIZE;

static int verbosity;

#define verbose_sizechange() (verbosity >= 1)
#define verbose_lostevent()  (verbosity >= 2)
#define verbose_ftrace()     (verbosity >= 1)

#define was_changed(ORIG, CUR) (strcmp(ORIG, CUR) != 0)
#define needs_change(CUR, WANTED) (strcmp(CUR, WANTED) != 0)

/* Names of the tracefs files this tool reads and restores. */
static const char *debug_tracefile;
static const char *debug_tracefile_dflt;
static const char *debug_maxlat;
static const char *debug_maxlat_dflt;
static const char * const DEBUG_NOFILE = "[file not found]";

static const char * const TR_MAXLAT  = "tracing_max_latency";
static const char * const TR_THRESH  = "tracing_thresh";
static const char * const TR_CURRENT = "current_tracer";
static const char * const TR_OPTIONS = "trace_options";

static const char * const NOP_TRACER = "nop";

static const char * const OPT_NO_PREFIX = "no";

#define DFLT_THRESHOLD_US "0"
static const char *threshold = DFLT_THRESHOLD_US;

#define DEV_URANDOM     "/dev/urandom"
#define RT_DEFAULT_PRI (99)
#define DEFAULT_PRI    (0)

/* Time-unit conversion factors. */
#define USEC_PER_MSEC (1000L)
#define NSEC_PER_USEC (1000L)
#define NSEC_PER_MSEC (USEC_PER_MSEC * NSEC_PER_USEC)

#define MSEC_PER_SEC (1000L)
#define USEC_PER_SEC (USEC_PER_MSEC * MSEC_PER_SEC)
#define NSEC_PER_SEC (NSEC_PER_MSEC * MSEC_PER_SEC)

#define SLEEP_TIME_MS_DEFAULT (1000L)
#define TRY_PRINTMUTEX_MS (1000)

/* Sleep time in microseconds used by go_to_sleep(). */
static long sleep_time = (USEC_PER_MSEC * SLEEP_TIME_MS_DEFAULT);

static const char * const queue_full_warning =
"Could not queue trace for printing. It is likely that events happen faster\n"
"than what they can be printed. Probably partly because of random sleeping\n";

static const char * const no_tracer_msg =
"Could not find any tracers! Running this program as root may help!\n";

static const char * const no_latency_tr_msg =
"No latency tracers are supported by your kernel!\n";
 132
 133static const char * const no_latency_tr_msg =
 134"No latency tracers are supported by your kernel!\n";
 135
 136struct policy {
 137        const char *name;
 138        int policy;
 139        int default_pri;
 140};
 141
 142static const struct policy policies[] = {
 143        { "other", SCHED_OTHER, DEFAULT_PRI    },
 144        { "batch", SCHED_BATCH, DEFAULT_PRI    },
 145        { "idle",  SCHED_IDLE,  DEFAULT_PRI    },
 146        { "rr",    SCHED_RR,    RT_DEFAULT_PRI },
 147        { "fifo",  SCHED_FIFO,  RT_DEFAULT_PRI },
 148        { NULL,    0,           DEFAULT_PRI    }
 149};
 150
 151/*
 152 * The default tracer will be the first on this list that is supported by the
 153 * currently running Linux kernel.
 154 */
 155static const char * const relevant_tracers[] = {
 156        "preemptirqsoff",
 157        "preemptoff",
 158        "irqsoff",
 159        "wakeup",
 160        "wakeup_rt",
 161        "wakeup_dl",
 162        NULL
 163};
 164
 165/* This is the list of tracers for which random sleep makes sense */
 166static const char * const random_tracers[] = {
 167        "preemptirqsoff",
 168        "preemptoff",
 169        "irqsoff",
 170        NULL
 171};
 172
/* Tracer selected for this run; force_tracer is true if the user chose it. */
static const char *current_tracer;
static bool force_tracer;

/* ftrace settings saved at startup and restored by restore_ftrace(). */
struct ftrace_state {
	char *tracer;			/* saved current_tracer contents */
	char *thresh;			/* saved tracing_thresh contents */
	bool opt[OPTIDX_NR];		/* saved value of each trace option */
	bool opt_valid[OPTIDX_NR];	/* whether opt[i] was captured */
	pthread_mutex_t mutex;
};

/* One print request, identified by a ticket number. */
struct entry {
	int ticket;
	int ticket_completed_ref;
};

/* Shared ticketing state for the printer threads. */
struct print_state {
	int ticket_counter;	/* last ticket issued */
	int ticket_completed;	/* highest ticket completed */
	pthread_mutex_t mutex;
	pthread_cond_t cond;
	int cnt;		/* counter managed by printstate_cnt_*() */
	pthread_mutex_t cnt_mutex;	/* protects cnt */
};

/* Small fixed-size buffer for deferred status messages. */
struct short_msg {
	char buf[160];
	int len;
};

static struct print_state printstate;
static struct ftrace_state save_state;
volatile sig_atomic_t signal_flag;	/* last signal seen, set by signal_handler() */

#define PROB_TABLE_MAX_SIZE (1000)

int probabilities[PROB_TABLE_MAX_SIZE];

/* Window into the tail of probabilities[]; resized by sleeptable_resize(). */
struct sleep_table {
	int *table;
	int size;
	pthread_mutex_t mutex;
};

static struct sleep_table sleeptable;

#define QUEUE_SIZE (10)

/* Fixed-size circular queue of print requests. */
struct queue {
	struct entry entries[QUEUE_SIZE];
	int next_prod_idx;
	int next_cons_idx;
	pthread_mutex_t mutex;
	pthread_cond_t cond;
};

#define MAX_THREADS (40)

struct queue printqueue;
pthread_t printthread[MAX_THREADS];
pthread_mutex_t print_mtx;
#define PRINT_BUFFER_SIZE (16 * 1024 * 1024)

/* Forward declarations; defined later in this file. */
static void cleanup_exit(int status);
static int set_trace_opt(const char *opt, bool value);
 238
 239static __always_inline void *malloc_or_die(size_t size)
 240{
 241        void *ptr = malloc(size);
 242
 243        if (unlikely(ptr == NULL)) {
 244                warn("malloc() failed");
 245                cleanup_exit(EXIT_FAILURE);
 246        }
 247        return ptr;
 248}
 249
 250static __always_inline void *malloc_or_die_nocleanup(size_t size)
 251{
 252        void *ptr = malloc(size);
 253
 254        if (unlikely(ptr == NULL))
 255                err(0, "malloc() failed");
 256        return ptr;
 257}
 258
 259static __always_inline void write_or_die(int fd, const char *buf, size_t count)
 260{
 261        ssize_t r;
 262
 263        do {
 264                r = write(fd, buf, count);
 265                if (unlikely(r < 0)) {
 266                        if (errno == EINTR)
 267                                continue;
 268                        warn("write() failed");
 269                        cleanup_exit(EXIT_FAILURE);
 270                }
 271                count -= r;
 272                buf += r;
 273        } while (count > 0);
 274}
 275
 276static __always_inline void clock_gettime_or_die(clockid_t clk_id,
 277                                                 struct timespec *tp)
 278{
 279        int r = clock_gettime(clk_id, tp);
 280
 281        if (unlikely(r != 0))
 282                err(EXIT_FAILURE, "clock_gettime() failed");
 283}
 284
 285static __always_inline void sigemptyset_or_die(sigset_t *s)
 286{
 287        if (unlikely(sigemptyset(s) != 0)) {
 288                warn("sigemptyset() failed");
 289                cleanup_exit(EXIT_FAILURE);
 290        }
 291}
 292
 293static __always_inline void sigaddset_or_die(sigset_t *s, int signum)
 294{
 295        if (unlikely(sigaddset(s, signum) != 0)) {
 296                warn("sigemptyset() failed");
 297                cleanup_exit(EXIT_FAILURE);
 298        }
 299}
 300
 301static __always_inline void sigaction_or_die(int signum,
 302                                             const struct sigaction *act,
 303                                             struct sigaction *oldact)
 304{
 305        if (unlikely(sigaction(signum, act, oldact) != 0)) {
 306                warn("sigaction() failed");
 307                cleanup_exit(EXIT_FAILURE);
 308        }
 309}
 310
 311static void open_stdout(void)
 312{
 313        if (setvbuf(stdout, NULL, _IONBF, 0) != 0)
 314                err(EXIT_FAILURE, "setvbuf() failed");
 315        fd_stdout = fileno(stdout);
 316        if (fd_stdout < 0)
 317                err(EXIT_FAILURE, "fileno() failed");
 318}
 319
 320/*
 321 * It's not worth it to call cleanup_exit() from mutex functions because
 322 * cleanup_exit() uses mutexes.
 323 */
 324static __always_inline void mutex_lock(pthread_mutex_t *mtx)
 325{
 326        errno = pthread_mutex_lock(mtx);
 327        if (unlikely(errno))
 328                err(EXIT_FAILURE, "pthread_mutex_lock() failed");
 329}
 330
 331
 332static __always_inline void mutex_unlock(pthread_mutex_t *mtx)
 333{
 334        errno = pthread_mutex_unlock(mtx);
 335        if (unlikely(errno))
 336                err(EXIT_FAILURE, "pthread_mutex_unlock() failed");
 337}
 338
 339static __always_inline void cond_signal(pthread_cond_t *cond)
 340{
 341        errno = pthread_cond_signal(cond);
 342        if (unlikely(errno))
 343                err(EXIT_FAILURE, "pthread_cond_signal() failed");
 344}
 345
 346static __always_inline void cond_wait(pthread_cond_t *restrict cond,
 347                                      pthread_mutex_t *restrict mutex)
 348{
 349        errno = pthread_cond_wait(cond, mutex);
 350        if (unlikely(errno))
 351                err(EXIT_FAILURE, "pthread_cond_wait() failed");
 352}
 353
 354static __always_inline void cond_broadcast(pthread_cond_t *cond)
 355{
 356        errno = pthread_cond_broadcast(cond);
 357        if (unlikely(errno))
 358                err(EXIT_FAILURE, "pthread_cond_broadcast() failed");
 359}
 360
 361static __always_inline void
 362mutex_init(pthread_mutex_t *mutex,
 363           const pthread_mutexattr_t *attr)
 364{
 365        errno = pthread_mutex_init(mutex, attr);
 366        if (errno)
 367                err(EXIT_FAILURE, "pthread_mutex_init() failed");
 368}
 369
 370static __always_inline void mutexattr_init(pthread_mutexattr_t *attr)
 371{
 372        errno = pthread_mutexattr_init(attr);
 373        if (errno)
 374                err(EXIT_FAILURE, "pthread_mutexattr_init() failed");
 375}
 376
 377static __always_inline void mutexattr_destroy(pthread_mutexattr_t *attr)
 378{
 379        errno = pthread_mutexattr_destroy(attr);
 380        if (errno)
 381                err(EXIT_FAILURE, "pthread_mutexattr_destroy() failed");
 382}
 383
 384static __always_inline void mutexattr_settype(pthread_mutexattr_t *attr,
 385                                              int type)
 386{
 387        errno = pthread_mutexattr_settype(attr, type);
 388        if (errno)
 389                err(EXIT_FAILURE, "pthread_mutexattr_settype() failed");
 390}
 391
 392static __always_inline void condattr_init(pthread_condattr_t *attr)
 393{
 394        errno = pthread_condattr_init(attr);
 395        if (errno)
 396                err(EXIT_FAILURE, "pthread_condattr_init() failed");
 397}
 398
 399static __always_inline void condattr_destroy(pthread_condattr_t *attr)
 400{
 401        errno = pthread_condattr_destroy(attr);
 402        if (errno)
 403                err(EXIT_FAILURE, "pthread_condattr_destroy() failed");
 404}
 405
 406static __always_inline void condattr_setclock(pthread_condattr_t *attr,
 407                                              clockid_t clock_id)
 408{
 409        errno = pthread_condattr_setclock(attr, clock_id);
 410        if (unlikely(errno))
 411                err(EXIT_FAILURE, "pthread_condattr_setclock() failed");
 412}
 413
 414static __always_inline void cond_init(pthread_cond_t *cond,
 415                                      const pthread_condattr_t *attr)
 416{
 417        errno = pthread_cond_init(cond, attr);
 418        if (errno)
 419                err(EXIT_FAILURE, "pthread_cond_init() failed");
 420}
 421
 422static __always_inline int
 423cond_timedwait(pthread_cond_t *restrict cond,
 424               pthread_mutex_t *restrict mutex,
 425               const struct timespec *restrict abstime)
 426{
 427        errno = pthread_cond_timedwait(cond, mutex, abstime);
 428        if (errno && errno != ETIMEDOUT)
 429                err(EXIT_FAILURE, "pthread_cond_timedwait() failed");
 430        return errno;
 431}
 432
 433static void init_printstate(void)
 434{
 435        pthread_condattr_t cattr;
 436
 437        printstate.ticket_counter = 0;
 438        printstate.ticket_completed = 0;
 439        printstate.cnt = 0;
 440
 441        mutex_init(&printstate.mutex, NULL);
 442
 443        condattr_init(&cattr);
 444        condattr_setclock(&cattr, CLOCK_MONOTONIC);
 445        cond_init(&printstate.cond, &cattr);
 446        condattr_destroy(&cattr);
 447}
 448
 449static void init_print_mtx(void)
 450{
 451        pthread_mutexattr_t mattr;
 452
 453        mutexattr_init(&mattr);
 454        mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
 455        mutex_init(&print_mtx, &mattr);
 456        mutexattr_destroy(&mattr);
 457
 458}
 459
 460static void signal_blocking(int how)
 461{
 462        sigset_t s;
 463
 464        sigemptyset_or_die(&s);
 465        sigaddset_or_die(&s, SIGHUP);
 466        sigaddset_or_die(&s, SIGTERM);
 467        sigaddset_or_die(&s, SIGINT);
 468
 469        errno = pthread_sigmask(how, &s, NULL);
 470        if (unlikely(errno)) {
 471                warn("pthread_sigmask() failed");
 472                cleanup_exit(EXIT_FAILURE);
 473        }
 474}
 475
 476static void signal_handler(int num)
 477{
 478        signal_flag = num;
 479}
 480
 481static void setup_sig_handler(void)
 482{
 483        struct sigaction sa;
 484
 485        memset(&sa, 0, sizeof(sa));
 486        sa.sa_handler = signal_handler;
 487
 488        sigaction_or_die(SIGHUP, &sa, NULL);
 489        sigaction_or_die(SIGTERM, &sa, NULL);
 490        sigaction_or_die(SIGINT, &sa, NULL);
 491}
 492
 493static void process_signal(int signal)
 494{
 495        char *name;
 496
 497        name = strsignal(signal);
 498        if (name == NULL)
 499                printf("Received signal %d\n", signal);
 500        else
 501                printf("Received signal %d (%s)\n", signal, name);
 502        cleanup_exit(EXIT_SUCCESS);
 503}
 504
 505static __always_inline void check_signals(void)
 506{
 507        int signal = signal_flag;
 508
 509        if (unlikely(signal))
 510                process_signal(signal);
 511}
 512
 513static __always_inline void get_time_in_future(struct timespec *future,
 514                                               long time_us)
 515{
 516        long nsec;
 517
 518        clock_gettime_or_die(CLOCK_MONOTONIC, future);
 519        future->tv_sec += time_us / USEC_PER_SEC;
 520        nsec = future->tv_nsec + (time_us * NSEC_PER_USEC) % NSEC_PER_SEC;
 521        if (nsec >= NSEC_PER_SEC) {
 522                future->tv_nsec = nsec % NSEC_PER_SEC;
 523                future->tv_sec += 1;
 524        }
 525}
 526
 527static __always_inline bool time_has_passed(const struct timespec *time)
 528{
 529        struct timespec now;
 530
 531        clock_gettime_or_die(CLOCK_MONOTONIC, &now);
 532        if (now.tv_sec > time->tv_sec)
 533                return true;
 534        if (now.tv_sec < time->tv_sec)
 535                return false;
 536        return (now.tv_nsec >= time->tv_nsec);
 537}
 538
/*
 * Try to acquire @mutex for at most @time_ms milliseconds by repeatedly
 * calling pthread_mutex_trylock(). Returns true if the mutex was taken.
 * NOTE(review): this busy-polls without sleeping between attempts.
 */
static bool mutex_trylock_limit(pthread_mutex_t *mutex, int time_ms)
{
	long time_us = time_ms * USEC_PER_MSEC;
	struct timespec limit;

	get_time_in_future(&limit, time_us);
	do {
		errno =  pthread_mutex_trylock(mutex);
		if (errno && errno != EBUSY)
			err(EXIT_FAILURE, "pthread_mutex_trylock() failed");
	} while (errno && !time_has_passed(&limit));
	/* errno == 0 here means the trylock finally succeeded */
	return errno == 0;
}
 552
/*
 * Restore every saved trace option in @state whose saved value differs
 * from the current value in @cur. Failures are reported with warnx()
 * but are not fatal, since this runs on the cleanup path.
 */
static void restore_trace_opts(const struct ftrace_state *state,
				const bool *cur)
{
	int i;
	int r;

	for (i = 0; i < OPTIDX_NR; i++)
		if (state->opt_valid[i] && state->opt[i] != cur[i]) {
			r = set_trace_opt(optstr[i], state->opt[i]);
			if (r < 0)
				warnx("Failed to restore the %s option to %s",
				      optstr[i], bool2str(state->opt[i]));
			else if (verbose_ftrace())
				printf("Restored the %s option in %s to %s\n",
				       optstr[i], TR_OPTIONS,
				       bool2str(state->opt[i]));
		}
}
 571
 572static char *read_file(const char *file, enum errhandling h)
 573{
 574        int psize;
 575        char *r;
 576        static const char *emsg = "Failed to read the %s file";
 577
 578        r = tracefs_instance_file_read(NULL, file, &psize);
 579        if (!r) {
 580                if (h) {
 581                        warn(emsg, file);
 582                        if (h == ERR_CLEANUP)
 583                                cleanup_exit(EXIT_FAILURE);
 584                } else
 585                        errx(EXIT_FAILURE, emsg, file);
 586        }
 587
 588        if (r && r[psize - 1] == '\n')
 589                r[psize - 1] = '\0';
 590        return r;
 591}
 592
 593static void restore_file(const char *file, char **saved, const char *cur)
 594{
 595        if (*saved && was_changed(*saved, cur)) {
 596                if (tracefs_instance_file_write(NULL, file, *saved) < 0)
 597                        warnx("Failed to restore %s to %s!", file, *saved);
 598                else if (verbose_ftrace())
 599                        printf("Restored %s to %s\n", file, *saved);
 600                free(*saved);
 601                *saved = NULL;
 602        }
 603}
 604
/*
 * Restore the tracer, the latency threshold and the trace options that
 * were saved at startup. Runs under save_state.mutex so it is safe to
 * call from the cleanup path while other threads are active.
 */
static void restore_ftrace(void)
{
	mutex_lock(&save_state.mutex);

	restore_file(TR_CURRENT, &save_state.tracer, current_tracer);
	restore_file(TR_THRESH, &save_state.thresh, threshold);
	restore_trace_opts(&save_state, use_options);

	mutex_unlock(&save_state.mutex);
}
 615
/*
 * Terminate the program with @status: report the maximum latency that
 * was detected, restore the original ftrace settings, then exit.
 * If ftrace was never configured by us, exit immediately instead.
 */
static void cleanup_exit(int status)
{
	char *maxlat;

	if (!setup_ftrace)
		exit(status);

	/*
	 * We try the print_mtx for 1 sec in order to avoid garbled
	 * output if possible, but if it cannot be obtained we proceed anyway.
	 */
	mutex_trylock_limit(&print_mtx, TRY_PRINTMUTEX_MS);

	/* ERR_WARN: a failed read here must not recurse into cleanup_exit() */
	maxlat = read_file(TR_MAXLAT, ERR_WARN);
	if (maxlat) {
		printf("The maximum detected latency was: %sus\n", maxlat);
		free(maxlat);
	}

	restore_ftrace();
	/*
	 * We do not need to unlock the print_mtx here because we will exit at
	 * the end of this function. Unlocking print_mtx causes problems if a
	 * print thread happens to be waiting for the mutex because we have
	 * just changed the ftrace settings to the original and thus the
	 * print thread would output incorrect data from ftrace.
	 */
	exit(status);
}
 645
 646static void init_save_state(void)
 647{
 648        pthread_mutexattr_t mattr;
 649
 650        mutexattr_init(&mattr);
 651        mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
 652        mutex_init(&save_state.mutex, &mattr);
 653        mutexattr_destroy(&mattr);
 654
 655        save_state.tracer = NULL;
 656        save_state.thresh = NULL;
 657        save_state.opt_valid[OPTIDX_FUNC_TR] = false;
 658        save_state.opt_valid[OPTIDX_DISP_GR] = false;
 659}
 660
/*
 * Issue the next print ticket, record in @req which ticket had already
 * completed at issue time, and wake any sleepers waiting on the ticket
 * state. NOTE(review): callers appear to be expected to hold
 * printstate.mutex — confirm at the call sites.
 */
static int printstate_next_ticket(struct entry *req)
{
	int r;

	r = ++(printstate.ticket_counter);
	req->ticket = r;
	req->ticket_completed_ref = printstate.ticket_completed;
	cond_broadcast(&printstate.cond);
	return r;
}

/* Advance the completed-ticket watermark to @req's ticket if newer. */
static __always_inline
void printstate_mark_req_completed(const struct entry *req)
{
	if (req->ticket > printstate.ticket_completed)
		printstate.ticket_completed = req->ticket;
}

/* True if a ticket newer than @req's has been issued since. */
static __always_inline
bool printstate_has_new_req_arrived(const struct entry *req)
{
	return (printstate.ticket_counter != req->ticket);
}
 684
 685static __always_inline int printstate_cnt_inc(void)
 686{
 687        int value;
 688
 689        mutex_lock(&printstate.cnt_mutex);
 690        value = ++printstate.cnt;
 691        mutex_unlock(&printstate.cnt_mutex);
 692        return value;
 693}
 694
 695static __always_inline int printstate_cnt_dec(void)
 696{
 697        int value;
 698
 699        mutex_lock(&printstate.cnt_mutex);
 700        value = --printstate.cnt;
 701        mutex_unlock(&printstate.cnt_mutex);
 702        return value;
 703}
 704
 705static __always_inline int printstate_cnt_read(void)
 706{
 707        int value;
 708
 709        mutex_lock(&printstate.cnt_mutex);
 710        value = printstate.cnt;
 711        mutex_unlock(&printstate.cnt_mutex);
 712        return value;
 713}
 714
/*
 * True if some other request completed after @req was created, i.e. a
 * racing request got its output finished first.
 */
static __always_inline
bool prev_req_won_race(const struct entry *req)
{
	return (printstate.ticket_completed != req->ticket_completed_ref);
}
 720
 721static void sleeptable_resize(int size, bool printout, struct short_msg *msg)
 722{
 723        int bytes;
 724
 725        if (printout) {
 726                msg->len = 0;
 727                if (unlikely(size > PROB_TABLE_MAX_SIZE))
 728                        bytes = snprintf(msg->buf, sizeof(msg->buf),
 729"Cannot increase probability table to %d (maximum size reached)\n", size);
 730                else
 731                        bytes = snprintf(msg->buf, sizeof(msg->buf),
 732"Increasing probability table to %d\n", size);
 733                if (bytes < 0)
 734                        warn("snprintf() failed");
 735                else
 736                        msg->len = bytes;
 737        }
 738
 739        if (unlikely(size < 0)) {
 740                /* Should never happen */
 741                warnx("Bad program state at %s:%d", __FILE__, __LINE__);
 742                cleanup_exit(EXIT_FAILURE);
 743                return;
 744        }
 745        sleeptable.size = size;
 746        sleeptable.table = &probabilities[PROB_TABLE_MAX_SIZE - size];
 747}
 748
 749static void init_probabilities(void)
 750{
 751        int i;
 752        int j = 1000;
 753
 754        for (i = 0; i < PROB_TABLE_MAX_SIZE; i++) {
 755                probabilities[i] = 1000 / j;
 756                j--;
 757        }
 758        mutex_init(&sleeptable.mutex, NULL);
 759}
 760
/*
 * Look up the sleep probability for @req based on how far its ticket is
 * behind the completed watermark. If the lag reaches the end of the
 * current table window, the window is grown by one and a size-change
 * message may be formatted into @msg (msg->len == 0 means no message).
 */
static int table_get_probability(const struct entry *req,
				 struct short_msg *msg)
{
	int diff = req->ticket - req->ticket_completed_ref;
	int rval = 0;

	msg->len = 0;
	/* a ticket is always at least one ahead of its completed ref */
	diff--;
	/* Should never happen...*/
	if (unlikely(diff < 0)) {
		warnx("Programmer assumption error at %s:%d\n", __FILE__,
		      __LINE__);
		cleanup_exit(EXIT_FAILURE);
	}
	mutex_lock(&sleeptable.mutex);
	if (diff >= (sleeptable.size - 1)) {
		/* lag exceeds the window: use the last entry and grow */
		rval = sleeptable.table[sleeptable.size - 1];
		sleeptable_resize(sleeptable.size + 1, verbose_sizechange(),
				  msg);
	} else {
		rval = sleeptable.table[diff];
	}
	mutex_unlock(&sleeptable.mutex);
	return rval;
}
 786
 787static void init_queue(struct queue *q)
 788{
 789        q->next_prod_idx = 0;
 790        q->next_cons_idx = 0;
 791        mutex_init(&q->mutex, NULL);
 792        errno = pthread_cond_init(&q->cond, NULL);
 793        if (errno)
 794                err(EXIT_FAILURE, "pthread_cond_init() failed");
 795}
 796
 797static __always_inline int queue_len(const struct queue *q)
 798{
 799        if (q->next_prod_idx >= q->next_cons_idx)
 800                return q->next_prod_idx - q->next_cons_idx;
 801        else
 802                return QUEUE_SIZE - q->next_cons_idx + q->next_prod_idx;
 803}
 804
 805static __always_inline int queue_nr_free(const struct queue *q)
 806{
 807        int nr_free = QUEUE_SIZE - queue_len(q);
 808
 809        /*
 810         * If there is only one slot left we will anyway lie and claim that the
 811         * queue is full because adding an element will make it appear empty
 812         */
 813        if (nr_free == 1)
 814                nr_free = 0;
 815        return nr_free;
 816}
 817
 818static __always_inline void queue_idx_inc(int *idx)
 819{
 820        *idx = (*idx + 1) % QUEUE_SIZE;
 821}
 822
 823static __always_inline void queue_push_to_back(struct queue *q,
 824                                              const struct entry *e)
 825{
 826        q->entries[q->next_prod_idx] = *e;
 827        queue_idx_inc(&q->next_prod_idx);
 828}
 829
 830static __always_inline struct entry queue_pop_from_front(struct queue *q)
 831{
 832        struct entry e = q->entries[q->next_cons_idx];
 833
 834        queue_idx_inc(&q->next_cons_idx);
 835        return e;
 836}
 837
 838static __always_inline void queue_cond_signal(struct queue *q)
 839{
 840        cond_signal(&q->cond);
 841}
 842
 843static __always_inline void queue_cond_wait(struct queue *q)
 844{
 845        cond_wait(&q->cond, &q->mutex);
 846}
 847
 848static __always_inline int queue_try_to_add_entry(struct queue *q,
 849                                                  const struct entry *e)
 850{
 851        int r = 0;
 852
 853        mutex_lock(&q->mutex);
 854        if (queue_nr_free(q) > 0) {
 855                queue_push_to_back(q, e);
 856                cond_signal(&q->cond);
 857        } else
 858                r = -1;
 859        mutex_unlock(&q->mutex);
 860        return r;
 861}
 862
 863static struct entry queue_wait_for_entry(struct queue *q)
 864{
 865        struct entry e;
 866
 867        mutex_lock(&q->mutex);
 868        while (true) {
 869                if (queue_len(&printqueue) > 0) {
 870                        e = queue_pop_from_front(q);
 871                        break;
 872                }
 873                queue_cond_wait(q);
 874        }
 875        mutex_unlock(&q->mutex);
 876
 877        return e;
 878}
 879
 880static const struct policy *policy_from_name(const char *name)
 881{
 882        const struct policy *p = &policies[0];
 883
 884        while (p->name != NULL) {
 885                if (!strcmp(name, p->name))
 886                        return p;
 887                p++;
 888        }
 889        return NULL;
 890}
 891
 892static const char *policy_name(int policy)
 893{
 894        const struct policy *p = &policies[0];
 895        static const char *rval = "unknown";
 896
 897        while (p->name != NULL) {
 898                if (p->policy == policy)
 899                        return p->name;
 900                p++;
 901        }
 902        return rval;
 903}
 904
 905static bool is_relevant_tracer(const char *name)
 906{
 907        unsigned int i;
 908
 909        for (i = 0; relevant_tracers[i]; i++)
 910                if (!strcmp(name, relevant_tracers[i]))
 911                        return true;
 912        return false;
 913}
 914
 915static bool random_makes_sense(const char *name)
 916{
 917        unsigned int i;
 918
 919        for (i = 0; random_tracers[i]; i++)
 920                if (!strcmp(name, random_tracers[i]))
 921                        return true;
 922        return false;
 923}
 924
 925static void show_available(void)
 926{
 927        char **tracers;
 928        int found = 0;
 929        int i;
 930
 931        tracers = tracefs_tracers(NULL);
 932        for (i = 0; tracers && tracers[i]; i++) {
 933                if (is_relevant_tracer(tracers[i]))
 934                        found++;
 935        }
 936
 937        if (!tracers) {
 938                warnx(no_tracer_msg);
 939                return;
 940        }
 941
 942        if (!found) {
 943                warnx(no_latency_tr_msg);
 944                tracefs_list_free(tracers);
 945                return;
 946        }
 947
 948        printf("The following latency tracers are available on your system:\n");
 949        for (i = 0; tracers[i]; i++) {
 950                if (is_relevant_tracer(tracers[i]))
 951                        printf("%s\n", tracers[i]);
 952        }
 953        tracefs_list_free(tracers);
 954}
 955
 956static bool tracer_valid(const char *name, bool *notracer)
 957{
 958        char **tracers;
 959        int i;
 960        bool rval = false;
 961
 962        *notracer = false;
 963        tracers = tracefs_tracers(NULL);
 964        if (!tracers) {
 965                *notracer = true;
 966                return false;
 967        }
 968        for (i = 0; tracers[i]; i++)
 969                if (!strcmp(tracers[i], name)) {
 970                        rval = true;
 971                        break;
 972                }
 973        tracefs_list_free(tracers);
 974        return rval;
 975}
 976
 977static const char *find_default_tracer(void)
 978{
 979        int i;
 980        bool notracer;
 981        bool valid;
 982
 983        for (i = 0; relevant_tracers[i]; i++) {
 984                valid = tracer_valid(relevant_tracers[i], &notracer);
 985                if (notracer)
 986                        errx(EXIT_FAILURE, no_tracer_msg);
 987                if (valid)
 988                        return relevant_tracers[i];
 989        }
 990        return NULL;
 991}
 992
 993static bool toss_coin(struct drand48_data *buffer, unsigned int prob)
 994{
 995        long r;
 996
 997        if (unlikely(lrand48_r(buffer, &r))) {
 998                warnx("lrand48_r() failed");
 999                cleanup_exit(EXIT_FAILURE);
1000        }
1001        r = r % 1000L;
1002        if (r < prob)
1003                return true;
1004        else
1005                return false;
1006}
1007
1008
/*
 * Sleep for up to sleep_time microseconds, waking early if a new print
 * request is issued meanwhile. Returns the full delay, or -1 if the
 * sleep was cut short by a new request.
 */
static long go_to_sleep(const struct entry *req)
{
	struct timespec future;
	long delay = sleep_time;

	get_time_in_future(&future, delay);

	mutex_lock(&printstate.mutex);
	while (!printstate_has_new_req_arrived(req)) {
		/* wakes on broadcast from printstate_next_ticket() or timeout */
		cond_timedwait(&printstate.cond, &printstate.mutex, &future);
		if (time_has_passed(&future))
			break;
	}

	if (printstate_has_new_req_arrived(req))
		delay = -1;
	mutex_unlock(&printstate.mutex);

	return delay;
}
1029
1030
1031static void set_priority(void)
1032{
1033        int r;
1034        pid_t pid;
1035        struct sched_param param;
1036
1037        memset(&param, 0, sizeof(param));
1038        param.sched_priority = sched_pri;
1039
1040        pid = getpid();
1041        r = sched_setscheduler(pid, sched_policy, &param);
1042
1043        if (r != 0)
1044                err(EXIT_FAILURE, "sched_setscheduler() failed");
1045}
1046
/* Return the kernel thread id (TID) of the calling thread. */
pid_t latency_collector_gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t) tid;
}
1051
1052static void print_priority(void)
1053{
1054        pid_t tid;
1055        int policy;
1056        int r;
1057        struct sched_param param;
1058
1059        tid = latency_collector_gettid();
1060        r = pthread_getschedparam(pthread_self(), &policy, &param);
1061        if (r != 0) {
1062                warn("pthread_getschedparam() failed");
1063                cleanup_exit(EXIT_FAILURE);
1064        }
1065        mutex_lock(&print_mtx);
1066        printf("Thread %d runs with scheduling policy %s and priority %d\n",
1067               tid, policy_name(policy), param.sched_priority);
1068        mutex_unlock(&print_mtx);
1069}
1070
1071static __always_inline
1072void __print_skipmessage(const struct short_msg *resize_msg,
1073                         const struct timespec *timestamp, char *buffer,
1074                         size_t bufspace, const struct entry *req, bool excuse,
1075                         const char *str)
1076{
1077        ssize_t bytes = 0;
1078        char *p = &buffer[0];
1079        long us, sec;
1080        int r;
1081
1082        sec = timestamp->tv_sec;
1083        us = timestamp->tv_nsec / 1000;
1084
1085        if (resize_msg != NULL && resize_msg->len > 0) {
1086                strncpy(p, resize_msg->buf, resize_msg->len);
1087                bytes += resize_msg->len;
1088                p += resize_msg->len;
1089                bufspace -= resize_msg->len;
1090        }
1091
1092        if (excuse)
1093                r = snprintf(p, bufspace,
1094"%ld.%06ld Latency %d printout skipped due to %s\n",
1095                             sec, us, req->ticket, str);
1096        else
1097                r = snprintf(p, bufspace, "%ld.%06ld Latency %d detected\n",
1098                            sec, us, req->ticket);
1099
1100        if (r < 0)
1101                warn("snprintf() failed");
1102        else
1103                bytes += r;
1104
1105        /* These prints could happen concurrently */
1106        mutex_lock(&print_mtx);
1107        write_or_die(fd_stdout, buffer, bytes);
1108        mutex_unlock(&print_mtx);
1109}
1110
/*
 * Report that the printout of latency @req was skipped because of the
 * random-delay mechanism.
 */
static void print_skipmessage(const struct short_msg *resize_msg,
			      const struct timespec *timestamp, char *buffer,
			      size_t bufspace, const struct entry *req,
			      bool excuse)
{
	__print_skipmessage(resize_msg, timestamp, buffer, bufspace, req,
			    excuse, "random delay");
}
1119
1120static void print_lostmessage(const struct timespec *timestamp, char *buffer,
1121                              size_t bufspace, const struct entry *req,
1122                              const char *reason)
1123{
1124        __print_skipmessage(NULL, timestamp, buffer, bufspace, req, true,
1125                            reason);
1126}
1127
1128static void print_tracefile(const struct short_msg *resize_msg,
1129                            const struct timespec *timestamp, char *buffer,
1130                            size_t bufspace, long slept,
1131                            const struct entry *req)
1132{
1133        static const int reserve = 256;
1134        char *p = &buffer[0];
1135        ssize_t bytes = 0;
1136        ssize_t bytes_tot = 0;
1137        long us, sec;
1138        long slept_ms;
1139        int trace_fd;
1140
1141        /* Save some space for the final string and final null char */
1142        bufspace = bufspace - reserve - 1;
1143
1144        if (resize_msg != NULL && resize_msg->len > 0) {
1145                bytes = resize_msg->len;
1146                strncpy(p, resize_msg->buf, bytes);
1147                bytes_tot += bytes;
1148                p += bytes;
1149                bufspace -= bytes;
1150        }
1151
1152        trace_fd = open(debug_tracefile, O_RDONLY);
1153
1154        if (trace_fd < 0) {
1155                warn("open() failed on %s", debug_tracefile);
1156                return;
1157        }
1158
1159        sec = timestamp->tv_sec;
1160        us = timestamp->tv_nsec / 1000;
1161
1162        if (slept != 0) {
1163                slept_ms = slept / 1000;
1164                bytes = snprintf(p, bufspace,
1165"%ld.%06ld Latency %d randomly sleep for %ld ms before print\n",
1166                                 sec, us, req->ticket, slept_ms);
1167        } else {
1168                bytes = snprintf(p, bufspace,
1169                                 "%ld.%06ld Latency %d immediate print\n", sec,
1170                                 us, req->ticket);
1171        }
1172
1173        if (bytes < 0) {
1174                warn("snprintf() failed");
1175                return;
1176        }
1177        p += bytes;
1178        bufspace -= bytes;
1179        bytes_tot += bytes;
1180
1181        bytes = snprintf(p, bufspace,
1182">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BEGIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"
1183                );
1184
1185        if (bytes < 0) {
1186                warn("snprintf() failed");
1187                return;
1188        }
1189
1190        p += bytes;
1191        bufspace -= bytes;
1192        bytes_tot += bytes;
1193
1194        do {
1195                bytes = read(trace_fd, p, bufspace);
1196                if (bytes < 0) {
1197                        if (errno == EINTR)
1198                                continue;
1199                        warn("read() failed on %s", debug_tracefile);
1200                        if (unlikely(close(trace_fd) != 0))
1201                                warn("close() failed on %s", debug_tracefile);
1202                        return;
1203                }
1204                if (bytes == 0)
1205                        break;
1206                p += bytes;
1207                bufspace -= bytes;
1208                bytes_tot += bytes;
1209        } while (true);
1210
1211        if (unlikely(close(trace_fd) != 0))
1212                warn("close() failed on %s", debug_tracefile);
1213
1214        printstate_cnt_dec();
1215        /* Add the reserve space back to the budget for the final string */
1216        bufspace += reserve;
1217
1218        bytes = snprintf(p, bufspace,
1219                         ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n");
1220
1221        if (bytes < 0) {
1222                warn("snprintf() failed");
1223                return;
1224        }
1225
1226        bytes_tot += bytes;
1227
1228        /* These prints could happen concurrently */
1229        mutex_lock(&print_mtx);
1230        write_or_die(fd_stdout, buffer, bytes_tot);
1231        mutex_unlock(&print_mtx);
1232}
1233
1234static char *get_no_opt(const char *opt)
1235{
1236        char *no_opt;
1237        int s;
1238
1239        s = strlen(opt) + strlen(OPT_NO_PREFIX) + 1;
1240        /* We may be called from cleanup_exit() via set_trace_opt() */
1241        no_opt = malloc_or_die_nocleanup(s);
1242        strcpy(no_opt, OPT_NO_PREFIX);
1243        strcat(no_opt, opt);
1244        return no_opt;
1245}
1246
/*
 * Extract the next whitespace-delimited token from @allopt.
 *
 * Returns a newly allocated copy of the token (caller frees), or NULL
 * when no token remains. *next is advanced past the returned token so
 * the function can be called repeatedly to walk the string.
 */
static char *find_next_optstr(const char *allopt, const char **next)
{
	const char *begin;
	const char *end;
	char *r;
	int s = 0;

	if (allopt == NULL)
		return NULL;

	/*
	 * Cast to unsigned char: passing a possibly negative plain char
	 * to the <ctype.h> functions is undefined behavior.
	 */
	for (begin = allopt; *begin != '\0'; begin++) {
		if (isgraph((unsigned char)*begin))
			break;
	}

	if (*begin == '\0')
		return NULL;

	for (end = begin; *end != '\0' && isgraph((unsigned char)*end); end++)
		s++;

	r = malloc_or_die_nocleanup(s + 1);
	/* Length is known exactly, so memcpy is the right tool here */
	memcpy(r, begin, s);
	r[s] = '\0';
	*next = begin + s;
	return r;
}
1274
/*
 * Scan the space separated option list @allopt for @opt.
 *
 * *found tells whether @opt or its negated ("no"-prefixed) form was
 * present; the return value is the state of the option: true if the
 * positive form was seen, false otherwise.
 */
static bool get_trace_opt(const char *allopt, const char *opt, bool *found)
{
	const char *cursor = allopt;
	bool enabled = false;
	char *negated;
	char *token;

	*found = false;
	negated = get_no_opt(opt);

	while ((token = find_next_optstr(cursor, &cursor)) != NULL) {
		bool is_pos = strcmp(token, opt) == 0;
		bool is_neg = strcmp(token, negated) == 0;

		free(token);
		if (is_pos) {
			*found = true;
			enabled = true;
			break;
		}
		if (is_neg) {
			*found = true;
			enabled = false;
			break;
		}
	}
	free(negated);

	return enabled;
}
1307
1308static int set_trace_opt(const char *opt, bool value)
1309{
1310        char *str;
1311        int r;
1312
1313        if (value)
1314                str = strdup(opt);
1315        else
1316                str = get_no_opt(opt);
1317
1318        r = tracefs_instance_file_write(NULL, TR_OPTIONS, str);
1319        free(str);
1320        return r;
1321}
1322
1323void save_trace_opts(struct ftrace_state *state)
1324{
1325        char *allopt;
1326        int psize;
1327        int i;
1328
1329        allopt = tracefs_instance_file_read(NULL, TR_OPTIONS, &psize);
1330        if (!allopt)
1331                errx(EXIT_FAILURE, "Failed to read the %s file\n", TR_OPTIONS);
1332
1333        for (i = 0; i < OPTIDX_NR; i++)
1334                state->opt[i] = get_trace_opt(allopt, optstr[i],
1335                                              &state->opt_valid[i]);
1336
1337        free(allopt);
1338}
1339
1340static void write_file(const char *file, const char *cur, const char *new,
1341                       enum errhandling h)
1342{
1343        int r;
1344        static const char *emsg = "Failed to write to the %s file!";
1345
1346        /* Do nothing if we now that the current and new value are equal */
1347        if (cur && !needs_change(cur, new))
1348                return;
1349
1350        r = tracefs_instance_file_write(NULL, file, new);
1351        if (r < 0) {
1352                if (h) {
1353                        warnx(emsg, file);
1354                        if (h == ERR_CLEANUP)
1355                                cleanup_exit(EXIT_FAILURE);
1356                } else
1357                        errx(EXIT_FAILURE, emsg, file);
1358        }
1359        if (verbose_ftrace()) {
1360                mutex_lock(&print_mtx);
1361                printf("%s was set to %s\n", file, new);
1362                mutex_unlock(&print_mtx);
1363        }
1364}
1365
1366static void reset_max_latency(void)
1367{
1368        write_file(TR_MAXLAT, NULL, "0", ERR_CLEANUP);
1369}
1370
/*
 * Save the current tracer, latency threshold, and trace options into
 * save_state so they can be restored on exit, and switch to the nop
 * tracer if another tracer is currently active.
 *
 * Unless -F/--force was given, bail out when a tracer other than nop
 * is selected, since that suggests another program is already tracing.
 */
static void save_and_disable_tracer(void)
{
	char *orig_th;
	char *tracer;
	bool need_nop = false;

	mutex_lock(&save_state.mutex);

	save_trace_opts(&save_state);
	tracer = read_file(TR_CURRENT, ERR_EXIT);
	orig_th = read_file(TR_THRESH, ERR_EXIT);

	if (needs_change(tracer, NOP_TRACER)) {
		mutex_lock(&print_mtx);
		if (force_tracer) {
			printf(
				"The %s tracer is already in use but proceeding anyway!\n",
				tracer);
		} else {
			printf(
				"The %s tracer is already in use, cowardly bailing out!\n"
				"This could indicate that another program or instance is tracing.\n"
				"Use the -F [--force] option to disregard the current tracer.\n", tracer);
			/*
			 * NOTE(review): exits while holding print_mtx;
			 * harmless here since the process terminates.
			 */
			exit(0);
		}
		mutex_unlock(&print_mtx);
		need_nop = true;
	}

	/* Ownership of both strings moves to save_state (freed on restore) */
	save_state.tracer =  tracer;
	save_state.thresh = orig_th;

	/* Stop the old tracer before the caller resets max_latency */
	if (need_nop)
		write_file(TR_CURRENT, NULL, NOP_TRACER, ERR_EXIT);

	mutex_unlock(&save_state.mutex);
}
1408
1409void set_trace_opts(struct ftrace_state *state, bool *new)
1410{
1411        int i;
1412        int r;
1413
1414        /*
1415         * We only set options if we earlier detected that the option exists in
1416         * the trace_options file and that the wanted setting is different from
1417         * the one we saw in save_and_disable_tracer()
1418         */
1419        for (i = 0; i < OPTIDX_NR; i++)
1420                if (state->opt_valid[i] &&
1421                    state->opt[i] != new[i]) {
1422                        r = set_trace_opt(optstr[i], new[i]);
1423                        if (r < 0) {
1424                                warnx("Failed to set the %s option to %s",
1425                                      optstr[i], bool2str(new[i]));
1426                                cleanup_exit(EXIT_FAILURE);
1427                        }
1428                        if (verbose_ftrace()) {
1429                                mutex_lock(&print_mtx);
1430                                printf("%s in %s was set to %s\n", optstr[i],
1431                                       TR_OPTIONS, bool2str(new[i]));
1432                                mutex_unlock(&print_mtx);
1433                        }
1434                }
1435}
1436
1437static void enable_tracer(void)
1438{
1439        mutex_lock(&save_state.mutex);
1440        set_trace_opts(&save_state, use_options);
1441
1442        write_file(TR_THRESH, save_state.thresh, threshold, ERR_CLEANUP);
1443        write_file(TR_CURRENT, NOP_TRACER, current_tracer, ERR_CLEANUP);
1444
1445        mutex_unlock(&save_state.mutex);
1446}
1447
/*
 * Main loop: watch tracing_max_latency with inotify and hand every
 * detected latency to the printer threads via the print queue.
 *
 * Never returns; exits via err()/cleanup_exit() on fatal errors or
 * from check_signals() when a termination signal is delivered.
 */
static void tracing_loop(void)
{
	int ifd = inotify_init();
	int wd;
	const ssize_t bufsize = sizeof(inotify_buffer);
	const ssize_t istructsize = sizeof(struct inotify_event);
	char *buf = &inotify_buffer[0];
	ssize_t nr_read;
	char *p;
	int modified;
	struct inotify_event *event;
	struct entry req;
	char *buffer;
	const size_t bufspace = PRINT_BUFFER_SIZE;
	struct timespec timestamp;

	print_priority();

	buffer = malloc_or_die(bufspace);

	if (ifd < 0)
		err(EXIT_FAILURE, "inotify_init() failed!");


	if (setup_ftrace) {
		/*
		 * We must disable the tracer before resetting the max_latency
		 */
		save_and_disable_tracer();
		/*
		 * We must reset the max_latency before the inotify_add_watch()
		 * call.
		 */
		reset_max_latency();
	}

	wd = inotify_add_watch(ifd, debug_maxlat, IN_MODIFY);
	if (wd < 0)
		err(EXIT_FAILURE, "inotify_add_watch() failed!");

	if (setup_ftrace)
		enable_tracer();

	/* Only unblock signals once everything is set up for cleanup */
	signal_blocking(SIG_UNBLOCK);

	while (true) {
		modified = 0;
		check_signals();
		nr_read = read(ifd, buf, bufsize);
		check_signals();
		if (nr_read < 0) {
			if (errno == EINTR)
				continue;
			warn("read() failed on inotify fd!");
			cleanup_exit(EXIT_FAILURE);
		}
		if (nr_read == bufsize)
			warnx("inotify() buffer filled, skipping events");
		if (nr_read < istructsize) {
			warnx("read() returned too few bytes on inotify fd");
			cleanup_exit(EXIT_FAILURE);
		}

		/* Count the IN_MODIFY events in this batch */
		for (p = buf; p < buf + nr_read;) {
			event = (struct inotify_event *) p;
			if ((event->mask & IN_MODIFY) != 0)
				modified++;
			p += istructsize + event->len;
		}
		/* Create one print request per modification event */
		while (modified > 0) {
			check_signals();
			mutex_lock(&printstate.mutex);
			check_signals();
			printstate_next_ticket(&req);
			/*
			 * If a printout is already in flight, this latency
			 * cannot be captured: mark it completed and,
			 * depending on verbosity, report it as lost.
			 */
			if (printstate_cnt_read() > 0) {
				printstate_mark_req_completed(&req);
				mutex_unlock(&printstate.mutex);
				if (verbose_lostevent()) {
					clock_gettime_or_die(CLOCK_MONOTONIC,
							     &timestamp);
					print_lostmessage(&timestamp, buffer,
							  bufspace, &req,
							  "inotify loop");
				}
				break;
			}
			mutex_unlock(&printstate.mutex);
			if (queue_try_to_add_entry(&printqueue, &req) != 0) {
				/* These prints could happen concurrently */
				check_signals();
				mutex_lock(&print_mtx);
				check_signals();
				write_or_die(fd_stdout, queue_full_warning,
					     sizeof(queue_full_warning));
				mutex_unlock(&print_mtx);
			}
			modified--;
		}
	}
}
1548
/*
 * Entry point of a printer thread: wait for latency notifications on
 * the print queue and, depending on configuration, randomly sleep
 * before printing the content of the trace file.
 *
 * @arg is a heap-allocated seed for this thread's PRNG; runs forever.
 */
static void *do_printloop(void *arg)
{
	const size_t bufspace = PRINT_BUFFER_SIZE;
	char *buffer;
	long *rseed = (long *) arg;
	struct drand48_data drandbuf;
	long slept = 0;
	struct entry req;
	int prob = 0;
	struct timespec timestamp;
	struct short_msg resize_msg;

	print_priority();

	/* Per-thread PRNG state so the random sleeps are independent */
	if (srand48_r(*rseed, &drandbuf) != 0) {
		warn("srand48_r() failed!\n");
		cleanup_exit(EXIT_FAILURE);
	}

	buffer = malloc_or_die(bufspace);

	while (true) {
		req = queue_wait_for_entry(&printqueue);
		clock_gettime_or_die(CLOCK_MONOTONIC, &timestamp);
		mutex_lock(&printstate.mutex);
		/* A newer request may have superseded this one already */
		if (prev_req_won_race(&req)) {
			printstate_mark_req_completed(&req);
			mutex_unlock(&printstate.mutex);
			if (verbose_lostevent())
				print_lostmessage(&timestamp, buffer, bufspace,
						  &req, "print loop");
			continue;
		}
		mutex_unlock(&printstate.mutex);

		/*
		 * Toss a coin to decide if we want to sleep before printing
		 * out the backtrace. The reason for this is that opening
		 * /sys/kernel/debug/tracing/trace will cause a blackout of
		 * hundreds of ms, where no latencies will be noted by the
		 * latency tracer. Thus by randomly sleeping we try to avoid
		 * missing traces systematically due to this. With this option
		 * we will sometimes get the first latency, some other times
		 * some of the later ones, in case of closely spaced traces.
		 */
		if (trace_enable && use_random_sleep) {
			slept = 0;
			prob = table_get_probability(&req, &resize_msg);
			if (!toss_coin(&drandbuf, prob))
				slept = go_to_sleep(&req);
			if (slept >= 0) {
				/* A print is ongoing */
				printstate_cnt_inc();
				/*
				 * We will do the printout below so we have to
				 * mark it as completed while we still have the
				 * mutex.
				 */
				mutex_lock(&printstate.mutex);
				printstate_mark_req_completed(&req);
				mutex_unlock(&printstate.mutex);
			}
		}
		if (trace_enable) {
			/*
			 * slept < 0  means that we detected another
			 * notification in go_to_sleep() above
			 */
			if (slept >= 0)
				/*
				 * N.B. printstate_cnt_dec(); will be called
				 * inside print_tracefile()
				 */
				print_tracefile(&resize_msg, &timestamp, buffer,
						bufspace, slept, &req);
			else
				print_skipmessage(&resize_msg, &timestamp,
						  buffer, bufspace, &req, true);
		} else {
			print_skipmessage(&resize_msg, &timestamp, buffer,
					  bufspace, &req, false);
		}
	}
	return NULL;
}
1634
1635static void start_printthread(void)
1636{
1637        unsigned int i;
1638        long *seed;
1639        int ufd;
1640
1641        ufd = open(DEV_URANDOM, O_RDONLY);
1642        if (nr_threads > MAX_THREADS) {
1643                warnx(
1644"Number of requested print threads was %d, max number is %d\n",
1645                      nr_threads, MAX_THREADS);
1646                nr_threads = MAX_THREADS;
1647        }
1648        for (i = 0; i < nr_threads; i++) {
1649                seed = malloc_or_die(sizeof(*seed));
1650                if (ufd <  0 ||
1651                    read(ufd, seed, sizeof(*seed)) != sizeof(*seed)) {
1652                        printf(
1653"Warning! Using trivial random number seed, since %s not available\n",
1654                        DEV_URANDOM);
1655                        fflush(stdout);
1656                        *seed = i;
1657                }
1658                errno = pthread_create(&printthread[i], NULL, do_printloop,
1659                                       seed);
1660                if (errno)
1661                        err(EXIT_FAILURE, "pthread_create()");
1662        }
1663        if (ufd > 0 && close(ufd) != 0)
1664                warn("close() failed");
1665}
1666
/*
 * Print the help text describing all command line options to stdout.
 * The format arguments at the bottom fill in the program name, the
 * default file paths, and the default numeric settings.
 */
static void show_usage(void)
{
	printf(
"Usage: %s [OPTION]...\n\n"
"Collect closely occurring latencies from %s\n"
"with any of the following tracers: preemptirqsoff, preemptoff, irqsoff, "
"wakeup,\nwakeup_dl, or wakeup_rt.\n\n"

"The occurrence of a latency is detected by monitoring the file\n"
"%s with inotify.\n\n"

"The following options are supported:\n\n"

"-l, --list\t\tList the latency tracers that are supported by the\n"
"\t\t\tcurrently running Linux kernel. If you don't see the\n"
"\t\t\ttracer that you want, you will probably need to\n"
"\t\t\tchange your kernel config and build a new kernel.\n\n"

"-t, --tracer TR\t\tUse the tracer TR. The default is to use the first\n"
"\t\t\ttracer that is supported by the kernel in the following\n"
"\t\t\torder of precedence:\n\n"
"\t\t\tpreemptirqsoff\n"
"\t\t\tpreemptoff\n"
"\t\t\tirqsoff\n"
"\t\t\twakeup\n"
"\t\t\twakeup_rt\n"
"\t\t\twakeup_dl\n"
"\n"
"\t\t\tIf TR is not on the list above, then a warning will be\n"
"\t\t\tprinted.\n\n"

"-F, --force\t\tProceed even if another ftrace tracer is active. Without\n"
"\t\t\tthis option, the program will refuse to start tracing if\n"
"\t\t\tany other tracer than the nop tracer is active.\n\n"

"-s, --threshold TH\tConfigure ftrace to use a threshold of TH microseconds\n"
"\t\t\tfor the tracer. The default is 0, which means that\n"
"\t\t\ttracing_max_latency will be used. tracing_max_latency is\n"
"\t\t\tset to 0 when the program is started and contains the\n"
"\t\t\tmaximum of the latencies that have been encountered.\n\n"

"-f, --function\t\tEnable the function-trace option in trace_options. With\n"
"\t\t\tthis option, ftrace will trace the functions that are\n"
"\t\t\texecuted during a latency, without it we only get the\n"
"\t\t\tbeginning, end, and backtrace.\n\n"

"-g, --graph\t\tEnable the display-graph option in trace_option. This\n"
"\t\t\toption causes ftrace to show the graph of how functions\n"
"\t\t\tare calling other functions.\n\n"

"-c, --policy POL\tRun the program with scheduling policy POL. POL can be\n"
"\t\t\tother, batch, idle, rr or fifo. The default is rr. When\n"
"\t\t\tusing rr or fifo, remember that these policies may cause\n"
"\t\t\tother tasks to experience latencies.\n\n"

"-p, --priority PRI\tRun the program with priority PRI. The acceptable range\n"
"\t\t\tof PRI depends on the scheduling policy.\n\n"

"-n, --notrace\t\tIf latency is detected, do not print out the content of\n"
"\t\t\tthe trace file to standard output\n\n"

"-t, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n"

"-r, --random\t\tArbitrarily sleep a certain amount of time, default\n"
"\t\t\t%ld ms, before reading the trace file. The\n"
"\t\t\tprobabilities for sleep are chosen so that the\n"
"\t\t\tprobability of obtaining any of a cluster of closely\n"
"\t\t\toccurring latencies are equal, i.e. we will randomly\n"
"\t\t\tchoose which one we collect from the trace file.\n\n"
"\t\t\tThis option is probably only useful with the irqsoff,\n"
"\t\t\tpreemptoff, and preemptirqsoff tracers.\n\n"

"-a, --nrlat NRLAT\tFor the purpose of arbitrary delay, assume that there\n"
"\t\t\tare no more than NRLAT clustered latencies. If NRLAT\n"
"\t\t\tlatencies are detected during a run, this value will\n"
"\t\t\tautomatically be increased to NRLAT + 1 and then to\n"
"\t\t\tNRLAT + 2 and so on. The default is %d. This option\n"
"\t\t\timplies -r. We need to know this number in order to\n"
"\t\t\tbe able to calculate the probabilities of sleeping.\n"
"\t\t\tSpecifically, the probabilities of not sleeping, i.e. to\n"
"\t\t\tdo an immediate printout will be:\n\n"
"\t\t\t1/NRLAT  1/(NRLAT - 1) ... 1/3  1/2  1\n\n"
"\t\t\tThe probability of sleeping will be:\n\n"
"\t\t\t1 - P, where P is from the series above\n\n"
"\t\t\tThis descending probability will cause us to choose\n"
"\t\t\tan occurrence at random. Observe that the final\n"
"\t\t\tprobability is 0, it is when we reach this probability\n"
"\t\t\tthat we increase NRLAT automatically. As an example,\n"
"\t\t\twith the default value of 2, the probabilities will be:\n\n"
"\t\t\t1/2  0\n\n"
"\t\t\tThis means, when a latency is detected we will sleep\n"
"\t\t\twith 50%% probability. If we ever detect another latency\n"
"\t\t\tduring the sleep period, then the probability of sleep\n"
"\t\t\twill be 0%% and the table will be expanded to:\n\n"
"\t\t\t1/3  1/2  0\n\n"

"-v, --verbose\t\tIncrease the verbosity. If this option is given once,\n"
"\t\t\tthen print a message every time that the NRLAT value\n"
"\t\t\tis automatically increased. It also causes a message to\n"
"\t\t\tbe printed when the ftrace settings are changed. If this\n"
"\t\t\toption is given at least twice, then also print a\n"
"\t\t\twarning for lost events.\n\n"

"-u, --time TIME\t\tArbitrarily sleep for a specified time TIME ms before\n"
"\t\t\tprinting out the trace from the trace file. The default\n"
"\t\t\tis %ld ms. This option implies -r.\n\n"

"-x, --no-ftrace\t\tDo not configure ftrace. This assume that the user\n"
"\t\t\tconfigures the ftrace files in sysfs such as\n"
"\t\t\t/sys/kernel/tracing/current_tracer or equivalent.\n\n"

"-i, --tracefile FILE\tUse FILE as trace file. The default is\n"
"\t\t\t%s.\n"
"\t\t\tThis options implies -x\n\n"

"-m, --max-lat FILE\tUse FILE as tracing_max_latency file. The default is\n"
"\t\t\t%s.\n"
"\t\t\tThis options implies -x\n\n"
,
prg_name, debug_tracefile_dflt, debug_maxlat_dflt, DEFAULT_NR_PRINTER_THREADS,
SLEEP_TIME_MS_DEFAULT, DEFAULT_TABLE_SIZE, SLEEP_TIME_MS_DEFAULT,
debug_tracefile_dflt, debug_maxlat_dflt);
}
1790
1791static void find_tracefiles(void)
1792{
1793        debug_tracefile_dflt = tracefs_get_tracing_file("trace");
1794        if (debug_tracefile_dflt == NULL) {
1795                /* This is needed in show_usage() */
1796                debug_tracefile_dflt = DEBUG_NOFILE;
1797        }
1798
1799        debug_maxlat_dflt = tracefs_get_tracing_file("tracing_max_latency");
1800        if (debug_maxlat_dflt == NULL) {
1801                /* This is needed in show_usage() */
1802                debug_maxlat_dflt = DEBUG_NOFILE;
1803        }
1804
1805        debug_tracefile = debug_tracefile_dflt;
1806        debug_maxlat = debug_maxlat_dflt;
1807}
1808
/*
 * Return true if every character in @s is a decimal digit.
 *
 * Note that the empty string yields true; callers treat it like any
 * other string when converting the value later.
 */
bool alldigits(const char *s)
{
	for (; *s != '\0'; s++)
		/* Cast: plain char may be negative, which is UB for isdigit() */
		if (!isdigit((unsigned char)*s))
			return false;
	return true;
}
1816
/*
 * Exit with an error message unless @arg consists solely of decimal
 * digits; @argname names the offending option in the message.
 *
 * The parameter was renamed from "optarg" because it shadowed the
 * global optarg from <getopt.h>, which this file also uses. The
 * trailing "\n" was dropped from the format: errx() appends its own
 * newline, so the old message printed a spurious blank line.
 */
void check_alldigits(const char *arg, const char *argname)
{
	if (!alldigits(arg))
		errx(EXIT_FAILURE,
		     "The %s parameter expects a decimal argument", argname);
}
1823
1824static void scan_arguments(int argc, char *argv[])
1825{
1826        int c;
1827        int i;
1828        int option_idx = 0;
1829
1830        static struct option long_options[] = {
1831                { "list",       no_argument,            0, 'l' },
1832                { "tracer",     required_argument,      0, 't' },
1833                { "force",      no_argument,            0, 'F' },
1834                { "threshold",  required_argument,      0, 's' },
1835                { "function",   no_argument,            0, 'f' },
1836                { "graph",      no_argument,            0, 'g' },
1837                { "policy",     required_argument,      0, 'c' },
1838                { "priority",   required_argument,      0, 'p' },
1839                { "help",       no_argument,            0, 'h' },
1840                { "notrace",    no_argument,            0, 'n' },
1841                { "random",     no_argument,            0, 'r' },
1842                { "nrlat",      required_argument,      0, 'a' },
1843                { "threads",    required_argument,      0, 'e' },
1844                { "time",       required_argument,      0, 'u' },
1845                { "verbose",    no_argument,            0, 'v' },
1846                { "no-ftrace",  no_argument,            0, 'x' },
1847                { "tracefile",  required_argument,      0, 'i' },
1848                { "max-lat",    required_argument,      0, 'm' },
1849                { 0,            0,                      0,  0  }
1850        };
1851        const struct policy *p;
1852        int max, min;
1853        int value;
1854        bool notracer, valid;
1855
1856        /*
1857         * We must do this before parsing the arguments because show_usage()
1858         * needs to display these.
1859         */
1860        find_tracefiles();
1861
1862        while (true) {
1863                c = getopt_long(argc, argv, "lt:Fs:fgc:p:hnra:e:u:vxi:m:",
1864                                long_options, &option_idx);
1865                if (c == -1)
1866                        break;
1867
1868                switch (c) {
1869                case 'l':
1870                        show_available();
1871                        exit(0);
1872                        break;
1873                case 't':
1874                        current_tracer = strdup(optarg);
1875                        if (!is_relevant_tracer(current_tracer)) {
1876                                warnx("%s is not a known latency tracer!\n",
1877                                      current_tracer);
1878                        }
1879                        valid = tracer_valid(current_tracer, &notracer);
1880                        if (notracer)
1881                                errx(EXIT_FAILURE, no_tracer_msg);
1882                        if (!valid)
1883                                errx(EXIT_FAILURE,
1884"The tracer %s is not supported by your kernel!\n", current_tracer);
1885                        break;
1886                case 'F':
1887                        force_tracer = true;
1888                        break;
1889                case 's':
1890                        check_alldigits(optarg, "-s [--threshold]");
1891                        threshold = strdup(optarg);
1892                        break;
1893                case 'f':
1894                        use_options[OPTIDX_FUNC_TR] = true;
1895                        break;
1896                case 'g':
1897                        use_options[OPTIDX_DISP_GR] = true;
1898                        break;
1899                case 'c':
1900                        p = policy_from_name(optarg);
1901                        if (p != NULL) {
1902                                sched_policy = p->policy;
1903                                sched_policy_set = true;
1904                                if (!sched_pri_set) {
1905                                        sched_pri = p->default_pri;
1906                                        sched_pri_set = true;
1907                                }
1908                        } else {
1909                                warnx("Unknown scheduling %s\n", optarg);
1910                                show_usage();
1911                                exit(0);
1912                        }
1913                        break;
1914                case 'p':
1915                        check_alldigits(optarg, "-p [--priority]");
1916                        sched_pri = atoi(optarg);
1917                        sched_pri_set = true;
1918                        break;
1919                case 'h':
1920                        show_usage();
1921                        exit(0);
1922                        break;
1923                case 'n':
1924                        trace_enable = false;
1925                        use_random_sleep = false;
1926                        break;
1927                case 'e':
1928                        check_alldigits(optarg, "-e [--threads]");
1929                        value = atoi(optarg);
1930                        if (value > 0)
1931                                nr_threads = value;
1932                        else {
1933                                warnx("NRTHR must be > 0\n");
1934                                show_usage();
1935                                exit(0);
1936                        }
1937                        break;
1938                case 'u':
1939                        check_alldigits(optarg, "-u [--time]");
1940                        value = atoi(optarg);
1941                        if (value < 0) {
1942                                warnx("TIME must be >= 0\n");
1943                                show_usage();
1944                                exit(0);
1945                        }
1946                        trace_enable = true;
1947                        use_random_sleep = true;
1948                        sleep_time = value * USEC_PER_MSEC;
1949                        break;
1950                case 'v':
1951                        verbosity++;
1952                        break;
1953                case 'r':
1954                        trace_enable = true;
1955                        use_random_sleep = true;
1956                        break;
1957                case 'a':
1958                        check_alldigits(optarg, "-a [--nrlat]");
1959                        value = atoi(optarg);
1960                        if (value <= 0) {
1961                                warnx("NRLAT must be > 0\n");
1962                                show_usage();
1963                                exit(0);
1964                        }
1965                        trace_enable = true;
1966                        use_random_sleep = true;
1967                        table_startsize = value;
1968                        break;
1969                case 'x':
1970                        setup_ftrace = false;
1971                        break;
1972                case 'i':
1973                        setup_ftrace = false;
1974                        debug_tracefile = strdup(optarg);
1975                        break;
1976                case 'm':
1977                        setup_ftrace = false;
1978                        debug_maxlat = strdup(optarg);
1979                        break;
1980                default:
1981                        show_usage();
1982                        exit(0);
1983                        break;
1984                }
1985        }
1986
1987        if (setup_ftrace) {
1988                if (!current_tracer) {
1989                        current_tracer = find_default_tracer();
1990                        if (!current_tracer)
1991                                errx(EXIT_FAILURE,
1992"No default tracer found and tracer not specified\n");
1993                }
1994
1995                if (use_random_sleep && !random_makes_sense(current_tracer)) {
1996                        warnx("WARNING: The tracer is %s and random sleep has",
1997                              current_tracer);
1998                        fprintf(stderr,
1999"been enabled. Random sleep is intended for the following tracers:\n");
2000                        for (i = 0; random_tracers[i]; i++)
2001                                fprintf(stderr, "%s\n", random_tracers[i]);
2002                        fprintf(stderr, "\n");
2003                }
2004        }
2005
2006        if (debug_tracefile == DEBUG_NOFILE ||
2007            debug_maxlat == DEBUG_NOFILE)
2008                errx(EXIT_FAILURE,
2009"Could not find tracing directory e.g. /sys/kernel/tracing\n");
2010
2011        if (!sched_policy_set) {
2012                sched_policy = SCHED_RR;
2013                sched_policy_set = true;
2014                if (!sched_pri_set) {
2015                        sched_pri = RT_DEFAULT_PRI;
2016                        sched_pri_set = true;
2017                }
2018        }
2019
2020        max = sched_get_priority_max(sched_policy);
2021        min = sched_get_priority_min(sched_policy);
2022
2023        if (sched_pri < min) {
2024                printf(
2025"ATTENTION: Increasing priority to minimum, which is %d\n", min);
2026                sched_pri = min;
2027        }
2028        if (sched_pri > max) {
2029                printf(
2030"ATTENTION: Reducing priority to maximum, which is %d\n", max);
2031                sched_pri = max;
2032        }
2033}
2034
2035static void show_params(void)
2036{
2037        printf(
2038"\n"
2039"Running with scheduling policy %s and priority %d. Using %d print threads.\n",
2040                policy_name(sched_policy), sched_pri, nr_threads);
2041        if (trace_enable) {
2042                if (use_random_sleep) {
2043                        printf(
2044"%s will be printed with random delay\n"
2045"Start size of the probability table:\t\t\t%d\n"
2046"Print a message when the prob. table changes size:\t%s\n"
2047"Print a warning when an event has been lost:\t\t%s\n"
2048"Sleep time is:\t\t\t\t\t\t%ld ms\n",
2049debug_tracefile,
2050table_startsize,
2051bool2str(verbose_sizechange()),
2052bool2str(verbose_lostevent()),
2053sleep_time / USEC_PER_MSEC);
2054                } else {
2055                        printf("%s will be printed immediately\n",
2056                               debug_tracefile);
2057                }
2058        } else {
2059                printf("%s will not be printed\n",
2060                       debug_tracefile);
2061        }
2062        if (setup_ftrace) {
2063                printf("Tracer:\t\t\t\t\t\t\t%s\n"
2064                       "%s option:\t\t\t\t\t%s\n"
2065                       "%s option:\t\t\t\t\t%s\n",
2066                       current_tracer,
2067                       optstr[OPTIDX_FUNC_TR],
2068                       bool2str(use_options[OPTIDX_FUNC_TR]),
2069                       optstr[OPTIDX_DISP_GR],
2070                       bool2str(use_options[OPTIDX_DISP_GR]));
2071                if (!strcmp(threshold, "0"))
2072                        printf("Threshold:\t\t\t\t\t\ttracing_max_latency\n");
2073                else
2074                        printf("Threshold:\t\t\t\t\t\t%s\n", threshold);
2075        }
2076        printf("\n");
2077}
2078
/*
 * Entry point: set up state and signal handling, parse options, print
 * the configuration, initialize the printer machinery, and enter the
 * tracing loop.
 */
int main(int argc, char *argv[])
{
	init_save_state();
	/* Block signals early — presumably so threads created later inherit
	 * the mask; confirm against signal_blocking()/setup_sig_handler(). */
	signal_blocking(SIG_BLOCK);
	setup_sig_handler();
	open_stdout();

	/* argv[0] can be absent if exec'd with an empty argv. */
	if (argc >= 1)
		prg_name = argv[0];
	else
		prg_name = prg_unknown;

	scan_arguments(argc, argv);
	show_params();

	init_printstate();
	init_print_mtx();
	if (use_random_sleep) {
		init_probabilities();
		if (verbose_sizechange())
			printf("Initializing probability table to %d\n",
			       table_startsize);
		/* Pre-size the sleep table to the -a/--nrlat start size. */
		sleeptable_resize(table_startsize, false, NULL);
	}
	set_priority();
	init_queue(&printqueue);
	start_printthread();
	tracing_loop();
	return 0;
}
2109