linux/tools/perf/builtin-sched.c
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

static const char               *input_name;

static char                     default_sort_order[] = "avg, max, switch, runtime";
static const char               *sort_order = default_sort_order;

static int                      profile_cpu = -1;

#define PR_SET_NAME             15               /* Set process name */
#define MAX_CPUS                4096

static u64                      run_measurement_overhead;
static u64                      sleep_measurement_overhead;

#define COMM_LEN                20
#define SYM_LEN                 129

#define MAX_PID                 65536

static unsigned long            nr_tasks;

struct perf_sched {
        struct perf_tool    tool;
        struct perf_session *session;
};

struct sched_atom;

struct task_desc {
        unsigned long           nr;
        unsigned long           pid;
        char                    comm[COMM_LEN];

        unsigned long           nr_events;
        unsigned long           curr_event;
        struct sched_atom       **atoms;

        pthread_t               thread;
        sem_t                   sleep_sem;

        sem_t                   ready_for_work;
        sem_t                   work_done_sem;

        u64                     cpu_usage;
};

enum sched_event_type {
        SCHED_EVENT_RUN,
        SCHED_EVENT_SLEEP,
        SCHED_EVENT_WAKEUP,
        SCHED_EVENT_MIGRATION,
};

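/*
 * A single replayable event in a task's timeline. SLEEP atoms may own
 * a semaphore that the matching WAKEUP atom posts during replay.
 */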
struct sched_atom {
        enum sched_event_type   type;
        int                     specific_wait;
        u64                     timestamp;
        u64                     duration;
        unsigned long           nr;
        sem_t                   *wait_sem;
        struct task_desc        *wakee;
};

static struct task_desc         *pid_to_task[MAX_PID];

static struct task_desc         **tasks;

static pthread_mutex_t          start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
static u64                      start_time;

static pthread_mutex_t          work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;

static unsigned long            nr_run_events;
static unsigned long            nr_sleep_events;
static unsigned long            nr_wakeup_events;

static unsigned long            nr_sleep_corrections;
static unsigned long            nr_run_events_optimized;

static unsigned long            targetless_wakeups;
static unsigned long            multitarget_wakeups;

static u64                      cpu_usage;
static u64                      runavg_cpu_usage;
static u64                      parent_cpu_usage;
static u64                      runavg_parent_cpu_usage;

static unsigned long            nr_runs;
static u64                      sum_runtime;
static u64                      sum_fluct;
static u64                      run_avg;

static unsigned int             replay_repeat = 10;
static unsigned long            nr_timestamps;
static unsigned long            nr_unordered_timestamps;
static unsigned long            nr_state_machine_bugs;
static unsigned long            nr_context_switch_bugs;
static unsigned long            nr_events;
static unsigned long            nr_lost_chunks;
static unsigned long            nr_lost_events;

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
        THREAD_SLEEPING = 0,
        THREAD_WAIT_CPU,
        THREAD_SCHED_IN,
        THREAD_IGNORE
};

struct work_atom {
        struct list_head        list;
        enum thread_state       state;
        u64                     sched_out_time;
        u64                     wake_up_time;
        u64                     sched_in_time;
        u64                     runtime;
};

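/*
 * Per-thread list of work atoms plus latency statistics, kept in an
 * rbtree that is ordered by the active sort keys.
 */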
struct work_atoms {
        struct list_head        work_list;
        struct thread           *thread;
        struct rb_node          node;
        u64                     max_lat;
        u64                     max_lat_at;
        u64                     total_lat;
        u64                     nb_atoms;
        u64                     total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

static struct rb_root           atom_root, sorted_atom_root;

static u64                      all_runtime;
static u64                      all_count;


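/*
 * All time accounting below is in nanoseconds, read from
 * CLOCK_MONOTONIC so wall-clock adjustments cannot skew the deltas.
 */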
static u64 get_nsecs(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);

        return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

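/*
 * Burn CPU for 'nsecs': spin on the clock, crediting the calibrated
 * per-measurement overhead against the target.
 */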
static void burn_nsecs(u64 nsecs)
{
        u64 T0 = get_nsecs(), T1;

        do {
                T1 = get_nsecs();
        } while (T1 + run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
        struct timespec ts;

        ts.tv_nsec = nsecs % 1000000000ULL;
        ts.tv_sec = nsecs / 1000000000ULL;

        nanosleep(&ts, NULL);
}

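/*
 * Calibration: sample ten empty run/sleep cycles and keep the minimum
 * delta as the fixed cost of the measurement itself.
 */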
static void calibrate_run_measurement_overhead(void)
{
        u64 T0, T1, delta, min_delta = 1000000000ULL;
        int i;

        for (i = 0; i < 10; i++) {
                T0 = get_nsecs();
                burn_nsecs(0);
                T1 = get_nsecs();
                delta = T1-T0;
                min_delta = min(min_delta, delta);
        }
        run_measurement_overhead = min_delta;

        printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(void)
{
        u64 T0, T1, delta, min_delta = 1000000000ULL;
        int i;

        for (i = 0; i < 10; i++) {
                T0 = get_nsecs();
                sleep_nsecs(10000);
                T1 = get_nsecs();
                delta = T1-T0;
                min_delta = min(min_delta, delta);
        }
        min_delta -= 10000;
        sleep_measurement_overhead = min_delta;

        printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

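/*
 * Append a new atom to the task's event array, growing it with
 * realloc; the caller fills in the type-specific fields.
 */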
static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
        struct sched_atom *event = zalloc(sizeof(*event));
        unsigned long idx = task->nr_events;
        size_t size;

        event->timestamp = timestamp;
        event->nr = idx;

        task->nr_events++;
        size = sizeof(struct sched_atom *) * task->nr_events;
        task->atoms = realloc(task->atoms, size);
        BUG_ON(!task->atoms);

        task->atoms[idx] = event;

        return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
        if (!task->nr_events)
                return NULL;

        return task->atoms[task->nr_events - 1];
}

static void
add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
{
        struct sched_atom *event, *curr_event = last_event(task);

        /*
         * optimize an existing RUN event by merging this one
         * into it:
         */
        if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
                nr_run_events_optimized++;
                curr_event->duration += duration;
                return;
        }

        event = get_new_event(task, timestamp);

        event->type = SCHED_EVENT_RUN;
        event->duration = duration;

        nr_run_events++;
}

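/*
 * Record a wakeup: if the wakee's last atom is a SLEEP with no
 * semaphore yet, pair the two through a fresh semaphore so replay can
 * post/wait on it; otherwise just count the anomaly.
 */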
static void
add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
                       struct task_desc *wakee)
{
        struct sched_atom *event, *wakee_event;

        event = get_new_event(task, timestamp);
        event->type = SCHED_EVENT_WAKEUP;
        event->wakee = wakee;

        wakee_event = last_event(wakee);
        if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
                targetless_wakeups++;
                return;
        }
        if (wakee_event->wait_sem) {
                multitarget_wakeups++;
                return;
        }

        wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
        sem_init(wakee_event->wait_sem, 0, 0);
        wakee_event->specific_wait = 1;
        event->wait_sem = wakee_event->wait_sem;

        nr_wakeup_events++;
}

static void
add_sched_event_sleep(struct task_desc *task, u64 timestamp,
                      u64 task_state __used)
{
        struct sched_atom *event = get_new_event(task, timestamp);

        event->type = SCHED_EVENT_SLEEP;

        nr_sleep_events++;
}

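/*
 * Look up the replay task descriptor for a PID, creating it (with an
 * initial sleep atom) on first sight.
 */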
static struct task_desc *register_pid(unsigned long pid, const char *comm)
{
        struct task_desc *task;

        BUG_ON(pid >= MAX_PID);

        task = pid_to_task[pid];

        if (task)
                return task;

        task = zalloc(sizeof(*task));
        task->pid = pid;
        task->nr = nr_tasks;
        strcpy(task->comm, comm);
        /*
         * every task starts in sleeping state - this gets ignored
         * if there's no wakeup pointing to this sleep state:
         */
        add_sched_event_sleep(task, 0, 0);

        pid_to_task[pid] = task;
        nr_tasks++;
        tasks = realloc(tasks, nr_tasks*sizeof(struct task_desc *));
        BUG_ON(!tasks);
        tasks[task->nr] = task;

        if (verbose)
                printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);

        return task;
}


static void print_task_traces(void)
{
        struct task_desc *task;
        unsigned long i;

        for (i = 0; i < nr_tasks; i++) {
                task = tasks[i];
                printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
                        task->nr, task->comm, task->pid, task->nr_events);
        }
}

static void add_cross_task_wakeups(void)
{
        struct task_desc *task1, *task2;
        unsigned long i, j;

        for (i = 0; i < nr_tasks; i++) {
                task1 = tasks[i];
                j = i + 1;
                if (j == nr_tasks)
                        j = 0;
                task2 = tasks[j];
                add_sched_event_wakeup(task1, 0, task2);
        }
}

static void
process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
{
        int ret = 0;

        switch (atom->type) {
                case SCHED_EVENT_RUN:
                        burn_nsecs(atom->duration);
                        break;
                case SCHED_EVENT_SLEEP:
                        if (atom->wait_sem)
                                ret = sem_wait(atom->wait_sem);
                        BUG_ON(ret);
                        break;
                case SCHED_EVENT_WAKEUP:
                        if (atom->wait_sem)
                                ret = sem_post(atom->wait_sem);
                        BUG_ON(ret);
                        break;
                case SCHED_EVENT_MIGRATION:
                        break;
                default:
                        BUG_ON(1);
        }
}

static u64 get_cpu_usage_nsec_parent(void)
{
        struct rusage ru;
        u64 sum;
        int err;

        err = getrusage(RUSAGE_SELF, &ru);
        BUG_ON(err);

        sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
        sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

        return sum;
}

static int self_open_counters(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));

        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;

        fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

        if (fd < 0)
                die("Error: sys_perf_event_open() syscall returned "
                    "with %d (%s)\n", fd, strerror(errno));
        return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
        u64 runtime;
        int ret;

        ret = read(fd, &runtime, sizeof(runtime));
        BUG_ON(ret != sizeof(runtime));

        return runtime;
}

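/*
 * Per-task replay thread: announce readiness, wait for the parent to
 * open the start_work_mutex gate, replay all atoms while measuring own
 * CPU time with a task-clock counter, report completion, then loop.
 */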
static void *thread_func(void *ctx)
{
        struct task_desc *this_task = ctx;
        u64 cpu_usage_0, cpu_usage_1;
        unsigned long i, ret;
        char comm2[22];
        int fd;

        sprintf(comm2, ":%s", this_task->comm);
        prctl(PR_SET_NAME, comm2);
        fd = self_open_counters();

again:
        ret = sem_post(&this_task->ready_for_work);
        BUG_ON(ret);
        ret = pthread_mutex_lock(&start_work_mutex);
        BUG_ON(ret);
        ret = pthread_mutex_unlock(&start_work_mutex);
        BUG_ON(ret);

        cpu_usage_0 = get_cpu_usage_nsec_self(fd);

        for (i = 0; i < this_task->nr_events; i++) {
                this_task->curr_event = i;
                process_sched_event(this_task, this_task->atoms[i]);
        }

        cpu_usage_1 = get_cpu_usage_nsec_self(fd);
        this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
        ret = sem_post(&this_task->work_done_sem);
        BUG_ON(ret);

        ret = pthread_mutex_lock(&work_done_wait_mutex);
        BUG_ON(ret);
        ret = pthread_mutex_unlock(&work_done_wait_mutex);
        BUG_ON(ret);

        goto again;
}

static void create_tasks(void)
{
        struct task_desc *task;
        pthread_attr_t attr;
        unsigned long i;
        int err;

        err = pthread_attr_init(&attr);
        BUG_ON(err);
        err = pthread_attr_setstacksize(&attr,
                        (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
        BUG_ON(err);
        err = pthread_mutex_lock(&start_work_mutex);
        BUG_ON(err);
        err = pthread_mutex_lock(&work_done_wait_mutex);
        BUG_ON(err);
        for (i = 0; i < nr_tasks; i++) {
                task = tasks[i];
                sem_init(&task->sleep_sem, 0, 0);
                sem_init(&task->ready_for_work, 0, 0);
                sem_init(&task->work_done_sem, 0, 0);
                task->curr_event = 0;
                err = pthread_create(&task->thread, &attr, thread_func, task);
                BUG_ON(err);
        }
}

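/*
 * Parent side of one replay iteration: collect ready_for_work posts,
 * open the start gate, wait on every work_done_sem and fold the
 * per-task CPU usage into decayed running averages
 * (runavg = (runavg*9 + new)/10).
 */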
static void wait_for_tasks(void)
{
        u64 cpu_usage_0, cpu_usage_1;
        struct task_desc *task;
        unsigned long i, ret;

        start_time = get_nsecs();
        cpu_usage = 0;
        pthread_mutex_unlock(&work_done_wait_mutex);

        for (i = 0; i < nr_tasks; i++) {
                task = tasks[i];
                ret = sem_wait(&task->ready_for_work);
                BUG_ON(ret);
                sem_init(&task->ready_for_work, 0, 0);
        }
        ret = pthread_mutex_lock(&work_done_wait_mutex);
        BUG_ON(ret);

        cpu_usage_0 = get_cpu_usage_nsec_parent();

        pthread_mutex_unlock(&start_work_mutex);

        for (i = 0; i < nr_tasks; i++) {
                task = tasks[i];
                ret = sem_wait(&task->work_done_sem);
                BUG_ON(ret);
                sem_init(&task->work_done_sem, 0, 0);
                cpu_usage += task->cpu_usage;
                task->cpu_usage = 0;
        }

        cpu_usage_1 = get_cpu_usage_nsec_parent();
        if (!runavg_cpu_usage)
                runavg_cpu_usage = cpu_usage;
        runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;

        parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
        if (!runavg_parent_cpu_usage)
                runavg_parent_cpu_usage = parent_cpu_usage;
        runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
                                   parent_cpu_usage)/10;

        ret = pthread_mutex_lock(&start_work_mutex);
        BUG_ON(ret);

        for (i = 0; i < nr_tasks; i++) {
                task = tasks[i];
                sem_init(&task->sleep_sem, 0, 0);
                task->curr_event = 0;
        }
}

static void run_one_test(void)
{
        u64 T0, T1, delta, avg_delta, fluct;

        T0 = get_nsecs();
        wait_for_tasks();
        T1 = get_nsecs();

        delta = T1 - T0;
        sum_runtime += delta;
        nr_runs++;

        avg_delta = sum_runtime / nr_runs;
        if (delta < avg_delta)
                fluct = avg_delta - delta;
        else
                fluct = delta - avg_delta;
        sum_fluct += fluct;
        if (!run_avg)
                run_avg = delta;
        run_avg = (run_avg*9 + delta)/10;

        printf("#%-3ld: %0.3f, ",
                nr_runs, (double)delta/1000000.0);

        printf("ravg: %0.2f, ",
                (double)run_avg/1e6);

        printf("cpu: %0.2f / %0.2f",
                (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);

#if 0
        /*
         * rusage statistics done by the parent, these are less
         * accurate than the sum_exec_runtime based statistics:
         */
        printf(" [%0.2f / %0.2f]",
                (double)parent_cpu_usage/1e6,
                (double)runavg_parent_cpu_usage/1e6);
#endif

        printf("\n");

        if (nr_sleep_corrections)
                printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
        nr_sleep_corrections = 0;
}

static void test_calibrations(void)
{
        u64 T0, T1;

        T0 = get_nsecs();
        burn_nsecs(1e6);
        T1 = get_nsecs();

        printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

        T0 = get_nsecs();
        sleep_nsecs(1e6);
        T1 = get_nsecs();

        printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

#define FILL_FIELD(ptr, field, event, data)     \
        ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)

#define FILL_ARRAY(ptr, array, event, data)                     \
do {                                                            \
        void *__array = raw_field_ptr(event, #array, data);     \
        memcpy(ptr.array, __array, sizeof(ptr.array));          \
} while (0)

#define FILL_COMMON_FIELDS(ptr, event, data)                    \
do {                                                            \
        FILL_FIELD(ptr, common_type, event, data);              \
        FILL_FIELD(ptr, common_flags, event, data);             \
        FILL_FIELD(ptr, common_preempt_count, event, data);     \
        FILL_FIELD(ptr, common_pid, event, data);               \
        FILL_FIELD(ptr, common_tgid, event, data);              \
} while (0)



struct trace_switch_event {
        u32 size;

        u16 common_type;
        u8 common_flags;
        u8 common_preempt_count;
        u32 common_pid;
        u32 common_tgid;

        char prev_comm[16];
        u32 prev_pid;
        u32 prev_prio;
        u64 prev_state;
        char next_comm[16];
        u32 next_pid;
        u32 next_prio;
};

struct trace_runtime_event {
        u32 size;

        u16 common_type;
        u8 common_flags;
        u8 common_preempt_count;
        u32 common_pid;
        u32 common_tgid;

        char comm[16];
        u32 pid;
        u64 runtime;
        u64 vruntime;
};

struct trace_wakeup_event {
        u32 size;

        u16 common_type;
        u8 common_flags;
        u8 common_preempt_count;
        u32 common_pid;
        u32 common_tgid;

        char comm[16];
        u32 pid;

        u32 prio;
        u32 success;
        u32 cpu;
};

struct trace_fork_event {
        u32 size;

        u16 common_type;
        u8 common_flags;
        u8 common_preempt_count;
        u32 common_pid;
        u32 common_tgid;

        char parent_comm[16];
        u32 parent_pid;
        char child_comm[16];
        u32 child_pid;
};

struct trace_migrate_task_event {
        u32 size;

        u16 common_type;
        u8 common_flags;
        u8 common_preempt_count;
        u32 common_pid;
        u32 common_tgid;

        char comm[16];
        u32 pid;

        u32 prio;
        u32 cpu;
};

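/*
 * Dispatch table of tracepoint callbacks; the active set (replay_ops
 * or lat_ops below) is selected through the trace_handler pointer.
 */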
struct trace_sched_handler {
        void (*switch_event)(struct trace_switch_event *,
                             struct machine *,
                             struct event_format *,
                             int cpu,
                             u64 timestamp,
                             struct thread *thread);

        void (*runtime_event)(struct trace_runtime_event *,
                              struct machine *,
                              struct event_format *,
                              int cpu,
                              u64 timestamp,
                              struct thread *thread);

        void (*wakeup_event)(struct trace_wakeup_event *,
                             struct machine *,
                             struct event_format *,
                             int cpu,
                             u64 timestamp,
                             struct thread *thread);

        void (*fork_event)(struct trace_fork_event *,
                           struct event_format *,
                           int cpu,
                           u64 timestamp,
                           struct thread *thread);

        void (*migrate_task_event)(struct trace_migrate_task_event *,
                           struct machine *machine,
                           struct event_format *,
                           int cpu,
                           u64 timestamp,
                           struct thread *thread);
};


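/*
 * Replay handlers: rebuild every task's timeline of run/sleep/wakeup
 * atoms from the recorded trace so that 'perf sched replay' can
 * re-execute it with real threads.
 */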
static void
replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
                    struct machine *machine __used,
                    struct event_format *event,
                    int cpu __used,
                    u64 timestamp __used,
                    struct thread *thread __used)
{
        struct task_desc *waker, *wakee;

        if (verbose) {
                printf("sched_wakeup event %p\n", event);

                printf(" ... pid %d woke up %s/%d\n",
                        wakeup_event->common_pid,
                        wakeup_event->comm,
                        wakeup_event->pid);
        }

        waker = register_pid(wakeup_event->common_pid, "<unknown>");
        wakee = register_pid(wakeup_event->pid, wakeup_event->comm);

        add_sched_event_wakeup(waker, timestamp, wakee);
}

static u64 cpu_last_switched[MAX_CPUS];

static void
replay_switch_event(struct trace_switch_event *switch_event,
                    struct machine *machine __used,
                    struct event_format *event,
                    int cpu,
                    u64 timestamp,
                    struct thread *thread __used)
{
        struct task_desc *prev, __used *next;
        u64 timestamp0;
        s64 delta;

        if (verbose)
                printf("sched_switch event %p\n", event);

        if (cpu >= MAX_CPUS || cpu < 0)
                return;

        timestamp0 = cpu_last_switched[cpu];
        if (timestamp0)
                delta = timestamp - timestamp0;
        else
                delta = 0;

        if (delta < 0)
                die("hm, delta: %" PRId64 " < 0 ?\n", delta);

        if (verbose) {
                printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
                        switch_event->prev_comm, switch_event->prev_pid,
                        switch_event->next_comm, switch_event->next_pid,
                        delta);
        }

        prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
        next = register_pid(switch_event->next_pid, switch_event->next_comm);

        cpu_last_switched[cpu] = timestamp;

        add_sched_event_run(prev, timestamp, delta);
        add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
}


static void
replay_fork_event(struct trace_fork_event *fork_event,
                  struct event_format *event,
                  int cpu __used,
                  u64 timestamp __used,
                  struct thread *thread __used)
{
        if (verbose) {
                printf("sched_fork event %p\n", event);
                printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
                printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
        }
        register_pid(fork_event->parent_pid, fork_event->parent_comm);
        register_pid(fork_event->child_pid, fork_event->child_comm);
}

static struct trace_sched_handler replay_ops  = {
        .wakeup_event           = replay_wakeup_event,
        .switch_event           = replay_switch_event,
        .fork_event             = replay_fork_event,
};

struct sort_dimension {
        const char              *name;
        sort_fn_t               cmp;
        struct list_head        list;
};

static LIST_HEAD(cmp_pid);

static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
        struct sort_dimension *sort;
        int ret = 0;

        BUG_ON(list_empty(list));

        list_for_each_entry(sort, list, list) {
                ret = sort->cmp(l, r);
                if (ret)
                        return ret;
        }

        return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
                         struct list_head *sort_list)
{
        struct rb_node *node = root->rb_node;
        struct work_atoms key = { .thread = thread };

        while (node) {
                struct work_atoms *atoms;
                int cmp;

                atoms = container_of(node, struct work_atoms, node);

                cmp = thread_lat_cmp(sort_list, &key, atoms);
                if (cmp > 0)
                        node = node->rb_left;
                else if (cmp < 0)
                        node = node->rb_right;
                else {
                        BUG_ON(thread != atoms->thread);
                        return atoms;
                }
        }
        return NULL;
}

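/*
 * Standard rbtree insertion, ordered by the comparators chained in
 * 'sort_list'.
 */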
static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
                         struct list_head *sort_list)
{
        struct rb_node **new = &(root->rb_node), *parent = NULL;

        while (*new) {
                struct work_atoms *this;
                int cmp;

                this = container_of(*new, struct work_atoms, node);
                parent = *new;

                cmp = thread_lat_cmp(sort_list, data, this);

                if (cmp > 0)
                        new = &((*new)->rb_left);
                else
                        new = &((*new)->rb_right);
        }

        rb_link_node(&data->node, parent, new);
        rb_insert_color(&data->node, root);
}

static void thread_atoms_insert(struct thread *thread)
{
        struct work_atoms *atoms = zalloc(sizeof(*atoms));
        if (!atoms)
                die("No memory");

        atoms->thread = thread;
        INIT_LIST_HEAD(&atoms->work_list);
        __thread_latency_insert(&atom_root, atoms, &cmp_pid);
}

static void
latency_fork_event(struct trace_fork_event *fork_event __used,
                   struct event_format *event __used,
                   int cpu __used,
                   u64 timestamp __used,
                   struct thread *thread __used)
{
        /* should insert the newcomer */
}

__used
static char sched_out_state(struct trace_switch_event *switch_event)
{
        const char *str = TASK_STATE_TO_CHAR_STR;

        return str[switch_event->prev_state];
}

static void
add_sched_out_event(struct work_atoms *atoms,
                    char run_state,
                    u64 timestamp)
{
        struct work_atom *atom = zalloc(sizeof(*atom));
        if (!atom)
                die("No memory");

        atom->sched_out_time = timestamp;

        if (run_state == 'R') {
                atom->state = THREAD_WAIT_CPU;
                atom->wake_up_time = atom->sched_out_time;
        }

        list_add_tail(&atom->list, &atoms->work_list);
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
{
        struct work_atom *atom;

        BUG_ON(list_empty(&atoms->work_list));

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);

        atom->runtime += delta;
        atoms->total_runtime += delta;
}

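/*
 * Close the pending wait atom: the scheduling latency is the distance
 * from wakeup to switch-in, accumulated into the thread's totals and
 * tracked maximum.
 */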
static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
        struct work_atom *atom;
        u64 delta;

        if (list_empty(&atoms->work_list))
                return;

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);

        if (atom->state != THREAD_WAIT_CPU)
                return;

        if (timestamp < atom->wake_up_time) {
                atom->state = THREAD_IGNORE;
                return;
        }

        atom->state = THREAD_SCHED_IN;
        atom->sched_in_time = timestamp;

        delta = atom->sched_in_time - atom->wake_up_time;
        atoms->total_lat += delta;
        if (delta > atoms->max_lat) {
                atoms->max_lat = delta;
                atoms->max_lat_at = timestamp;
        }
        atoms->nb_atoms++;
}

static void
latency_switch_event(struct trace_switch_event *switch_event,
                     struct machine *machine,
                     struct event_format *event __used,
                     int cpu,
                     u64 timestamp,
                     struct thread *thread __used)
{
        struct work_atoms *out_events, *in_events;
        struct thread *sched_out, *sched_in;
        u64 timestamp0;
        s64 delta;

        BUG_ON(cpu >= MAX_CPUS || cpu < 0);

        timestamp0 = cpu_last_switched[cpu];
        cpu_last_switched[cpu] = timestamp;
        if (timestamp0)
                delta = timestamp - timestamp0;
        else
                delta = 0;

        if (delta < 0)
                die("hm, delta: %" PRId64 " < 0 ?\n", delta);


        sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
        sched_in = machine__findnew_thread(machine, switch_event->next_pid);

        out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
        if (!out_events) {
                thread_atoms_insert(sched_out);
                out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
                if (!out_events)
                        die("out-event: Internal tree error");
        }
        add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);

        in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
        if (!in_events) {
                thread_atoms_insert(sched_in);
                in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
                if (!in_events)
                        die("in-event: Internal tree error");
                /*
                 * Task came in that we have not heard about yet,
                 * add an initial atom in runnable state:
                 */
                add_sched_out_event(in_events, 'R', timestamp);
        }
        add_sched_in_event(in_events, timestamp);
}

static void
latency_runtime_event(struct trace_runtime_event *runtime_event,
                     struct machine *machine,
                     struct event_format *event __used,
                     int cpu,
                     u64 timestamp,
                     struct thread *this_thread __used)
{
        struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
        struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);

        BUG_ON(cpu >= MAX_CPUS || cpu < 0);
        if (!atoms) {
                thread_atoms_insert(thread);
                atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
                if (!atoms)
                        die("in-event: Internal tree error");
                add_sched_out_event(atoms, 'R', timestamp);
        }

        add_runtime_event(atoms, runtime_event->runtime, timestamp);
}

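/*
 * A successful wakeup moves the thread's last atom to THREAD_WAIT_CPU
 * and stamps wake_up_time; the next switch-in turns that into a
 * latency sample.
 */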
static void
latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
                     struct machine *machine,
                     struct event_format *__event __used,
                     int cpu __used,
                     u64 timestamp,
                     struct thread *thread __used)
{
        struct work_atoms *atoms;
        struct work_atom *atom;
        struct thread *wakee;

        /* Note for later, it may be interesting to observe the failing cases */
        if (!wakeup_event->success)
                return;

        wakee = machine__findnew_thread(machine, wakeup_event->pid);
        atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
        if (!atoms) {
                thread_atoms_insert(wakee);
                atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
                if (!atoms)
                        die("wakeup-event: Internal tree error");
                add_sched_out_event(atoms, 'S', timestamp);
        }

        BUG_ON(list_empty(&atoms->work_list));

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);

        /*
         * You WILL be missing events if you've recorded only
         * one CPU, or are only looking at one, so don't
         * make useless noise.
         */
        if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
                nr_state_machine_bugs++;

        nr_timestamps++;
        if (atom->sched_out_time > timestamp) {
                nr_unordered_timestamps++;
                return;
        }

        atom->state = THREAD_WAIT_CPU;
        atom->wake_up_time = timestamp;
}

static void
latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
                     struct machine *machine,
                     struct event_format *__event __used,
                     int cpu __used,
                     u64 timestamp,
                     struct thread *thread __used)
{
        struct work_atoms *atoms;
        struct work_atom *atom;
        struct thread *migrant;

        /*
         * Only need to worry about migration when profiling one CPU.
         */
        if (profile_cpu == -1)
                return;

        migrant = machine__findnew_thread(machine, migrate_task_event->pid);
        atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
        if (!atoms) {
                thread_atoms_insert(migrant);
                register_pid(migrant->pid, migrant->comm);
                atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
                if (!atoms)
                        die("migration-event: Internal tree error");
                add_sched_out_event(atoms, 'R', timestamp);
        }

        BUG_ON(list_empty(&atoms->work_list));

        atom = list_entry(atoms->work_list.prev, struct work_atom, list);
        atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

        nr_timestamps++;

        if (atom->sched_out_time > timestamp)
                nr_unordered_timestamps++;
}

static struct trace_sched_handler lat_ops  = {
        .wakeup_event           = latency_wakeup_event,
        .switch_event           = latency_switch_event,
        .runtime_event          = latency_runtime_event,
        .fork_event             = latency_fork_event,
        .migrate_task_event     = latency_migrate_task_event,
};

static void output_lat_thread(struct work_atoms *work_list)
{
        int i;
        int ret;
        u64 avg;

        if (!work_list->nb_atoms)
                return;
        /*
         * Ignore idle threads:
         */
        if (!strcmp(work_list->thread->comm, "swapper"))
                return;

        all_runtime += work_list->total_runtime;
        all_count += work_list->nb_atoms;

        ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);

        for (i = 0; i < 24 - ret; i++)
                printf(" ");

        avg = work_list->total_lat / work_list->nb_atoms;

        printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
                (double)work_list->total_runtime / 1e6,
                work_list->nb_atoms, (double)avg / 1e6,
                (double)work_list->max_lat / 1e6,
                (double)work_list->max_lat_at / 1e9);
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->thread->pid < r->thread->pid)
                return -1;
        if (l->thread->pid > r->thread->pid)
                return 1;

        return 0;
}

static struct sort_dimension pid_sort_dimension = {
        .name                   = "pid",
        .cmp                    = pid_cmp,
};

static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
        u64 avgl, avgr;

        if (!l->nb_atoms)
                return -1;

        if (!r->nb_atoms)
                return 1;

        avgl = l->total_lat / l->nb_atoms;
        avgr = r->total_lat / r->nb_atoms;

        if (avgl < avgr)
                return -1;
        if (avgl > avgr)
                return 1;

        return 0;
}

static struct sort_dimension avg_sort_dimension = {
        .name                   = "avg",
        .cmp                    = avg_cmp,
};

static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->max_lat < r->max_lat)
                return -1;
        if (l->max_lat > r->max_lat)
                return 1;

        return 0;
}

static struct sort_dimension max_sort_dimension = {
        .name                   = "max",
        .cmp                    = max_cmp,
};

static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->nb_atoms < r->nb_atoms)
                return -1;
        if (l->nb_atoms > r->nb_atoms)
                return 1;

        return 0;
}

static struct sort_dimension switch_sort_dimension = {
        .name                   = "switch",
        .cmp                    = switch_cmp,
};

static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
        if (l->total_runtime < r->total_runtime)
                return -1;
        if (l->total_runtime > r->total_runtime)
                return 1;

        return 0;
}

static struct sort_dimension runtime_sort_dimension = {
        .name                   = "runtime",
        .cmp                    = runtime_cmp,
};

static struct sort_dimension *available_sorts[] = {
        &pid_sort_dimension,
        &avg_sort_dimension,
        &max_sort_dimension,
        &switch_sort_dimension,
        &runtime_sort_dimension,
};

#define NB_AVAILABLE_SORTS      (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))

static LIST_HEAD(sort_list);

static int sort_dimension__add(const char *tok, struct list_head *list)
{
        int i;

        for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
                if (!strcmp(available_sorts[i]->name, tok)) {
                        list_add_tail(&available_sorts[i]->list, list);

                        return 0;
                }
        }

        return -1;
}

static void setup_sorting(void);

static void sort_lat(void)
{
        struct rb_node *node;

        for (;;) {
                struct work_atoms *data;
                node = rb_first(&atom_root);
                if (!node)
                        break;

                rb_erase(node, &atom_root);
                data = rb_entry(node, struct work_atoms, node);
                __thread_latency_insert(&sorted_atom_root, data, &sort_list);
        }
}

static struct trace_sched_handler *trace_handler;

static void
process_sched_wakeup_event(struct perf_tool *tool __used,
                           struct event_format *event,
                           struct perf_sample *sample,
                           struct machine *machine,
                           struct thread *thread)
{
        void *data = sample->raw_data;
        struct trace_wakeup_event wakeup_event;

        FILL_COMMON_FIELDS(wakeup_event, event, data);

        FILL_ARRAY(wakeup_event, comm, event, data);
        FILL_FIELD(wakeup_event, pid, event, data);
        FILL_FIELD(wakeup_event, prio, event, data);
        FILL_FIELD(wakeup_event, success, event, data);
        FILL_FIELD(wakeup_event, cpu, event, data);

        if (trace_handler->wakeup_event)
                trace_handler->wakeup_event(&wakeup_event, machine, event,
                                            sample->cpu, sample->time, thread);
}

/*
 * Track the current task - that way we can know whether there are any
 * weird events, such as a task being switched away that is not current.
 */
static int max_cpu;

static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };

static struct thread *curr_thread[MAX_CPUS];

static char next_shortname1 = 'A';
static char next_shortname2 = '0';

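/*
 * 'perf sched map' output: every thread gets a two-character shortname
 * (A0..Z9, wrapping) and each switch prints one column per known CPU,
 * with '*' marking the CPU that switched. Roughly (illustrative only):
 *
 *    *A0  B1      0.123456 secs A0 => foo:1234
 */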
static void
map_switch_event(struct trace_switch_event *switch_event,
                 struct machine *machine,
                 struct event_format *event __used,
                 int this_cpu,
                 u64 timestamp,
                 struct thread *thread __used)
{
        struct thread *sched_out __used, *sched_in;
        int new_shortname;
        u64 timestamp0;
        s64 delta;
        int cpu;

        BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

        if (this_cpu > max_cpu)
                max_cpu = this_cpu;

        timestamp0 = cpu_last_switched[this_cpu];
        cpu_last_switched[this_cpu] = timestamp;
        if (timestamp0)
                delta = timestamp - timestamp0;
        else
                delta = 0;

        if (delta < 0)
                die("hm, delta: %" PRId64 " < 0 ?\n", delta);


        sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
        sched_in = machine__findnew_thread(machine, switch_event->next_pid);

        curr_thread[this_cpu] = sched_in;

        printf("  ");

        new_shortname = 0;
        if (!sched_in->shortname[0]) {
                sched_in->shortname[0] = next_shortname1;
                sched_in->shortname[1] = next_shortname2;

                if (next_shortname1 < 'Z') {
                        next_shortname1++;
                } else {
                        next_shortname1 = 'A';
                        if (next_shortname2 < '9') {
                                next_shortname2++;
                        } else {
                                next_shortname2 = '0';
                        }
                }
                new_shortname = 1;
        }

        for (cpu = 0; cpu <= max_cpu; cpu++) {
                if (cpu != this_cpu)
                        printf(" ");
                else
                        printf("*");

                if (curr_thread[cpu]) {
                        if (curr_thread[cpu]->pid)
                                printf("%2s ", curr_thread[cpu]->shortname);
                        else
                                printf(".  ");
                } else
                        printf("   ");
        }

        printf("  %12.6f secs ", (double)timestamp/1e9);
        if (new_shortname) {
                printf("%s => %s:%d\n",
                        sched_in->shortname, sched_in->comm, sched_in->pid);
        } else {
                printf("\n");
        }
}

static void
process_sched_switch_event(struct perf_tool *tool __used,
                           struct event_format *event,
                           struct perf_sample *sample,
                           struct machine *machine,
                           struct thread *thread)
{
        int this_cpu = sample->cpu;
        void *data = sample->raw_data;
        struct trace_switch_event switch_event;

        FILL_COMMON_FIELDS(switch_event, event, data);

        FILL_ARRAY(switch_event, prev_comm, event, data);
        FILL_FIELD(switch_event, prev_pid, event, data);
        FILL_FIELD(switch_event, prev_prio, event, data);
        FILL_FIELD(switch_event, prev_state, event, data);
        FILL_ARRAY(switch_event, next_comm, event, data);
        FILL_FIELD(switch_event, next_pid, event, data);
        FILL_FIELD(switch_event, next_prio, event, data);

        if (curr_pid[this_cpu] != (u32)-1) {
                /*
                 * Are we trying to switch away a PID that is
                 * not current?
                 */
                if (curr_pid[this_cpu] != switch_event.prev_pid)
                        nr_context_switch_bugs++;
        }
        if (trace_handler->switch_event)
                trace_handler->switch_event(&switch_event, machine, event,
                                            this_cpu, sample->time, thread);

        curr_pid[this_cpu] = switch_event.next_pid;
}

static void
process_sched_runtime_event(struct perf_tool *tool __used,
                            struct event_format *event,
                            struct perf_sample *sample,
                            struct machine *machine,
                            struct thread *thread)
{
        void *data = sample->raw_data;
        struct trace_runtime_event runtime_event;

        FILL_ARRAY(runtime_event, comm, event, data);
        FILL_FIELD(runtime_event, pid, event, data);
        FILL_FIELD(runtime_event, runtime, event, data);
        FILL_FIELD(runtime_event, vruntime, event, data);

        if (trace_handler->runtime_event)
                trace_handler->runtime_event(&runtime_event, machine, event,
                                             sample->cpu, sample->time, thread);
}

static void
process_sched_fork_event(struct perf_tool *tool __used,
                         struct event_format *event,
                         struct perf_sample *sample,
                         struct machine *machine __used,
                         struct thread *thread)
{
        void *data = sample->raw_data;
        struct trace_fork_event fork_event;

        FILL_COMMON_FIELDS(fork_event, event, data);

        FILL_ARRAY(fork_event, parent_comm, event, data);
        FILL_FIELD(fork_event, parent_pid, event, data);
        FILL_ARRAY(fork_event, child_comm, event, data);
        FILL_FIELD(fork_event, child_pid, event, data);

        if (trace_handler->fork_event)
                trace_handler->fork_event(&fork_event, event,
                                          sample->cpu, sample->time, thread);
}

static void
process_sched_exit_event(struct perf_tool *tool __used,
                         struct event_format *event,
                         struct perf_sample *sample __used,
                         struct machine *machine __used,
                         struct thread *thread __used)
{
        if (verbose)
                printf("sched_exit event %p\n", event);
}

static void
process_sched_migrate_task_event(struct perf_tool *tool __used,
                                 struct event_format *event,
                                 struct perf_sample *sample,
                                 struct machine *machine,
                                 struct thread *thread)
{
        void *data = sample->raw_data;
        struct trace_migrate_task_event migrate_task_event;

        FILL_COMMON_FIELDS(migrate_task_event, event, data);

        FILL_ARRAY(migrate_task_event, comm, event, data);
        FILL_FIELD(migrate_task_event, pid, event, data);
        FILL_FIELD(migrate_task_event, prio, event, data);
        FILL_FIELD(migrate_task_event, cpu, event, data);

        if (trace_handler->migrate_task_event)
                trace_handler->migrate_task_event(&migrate_task_event, machine,
                                                  event, sample->cpu,
                                                  sample->time, thread);
}

typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event_format *event,
                                   struct perf_sample *sample,
                                   struct machine *machine,
                                   struct thread *thread);

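/*
 * Demultiplex one tracepoint sample to the handler stored in
 * evsel->handler.func, resolving the event format lazily via pevent.
 */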
static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
                                                 union perf_event *event __used,
                                                 struct perf_sample *sample,
                                                 struct perf_evsel *evsel,
                                                 struct machine *machine)
{
        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
        struct pevent *pevent = sched->session->pevent;
        struct thread *thread = machine__findnew_thread(machine, sample->pid);

        if (thread == NULL) {
                pr_debug("problem processing %s event, skipping it.\n",
                         perf_evsel__name(evsel));
                return -1;
        }

        evsel->hists.stats.total_period += sample->period;
        hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);

        if (evsel->handler.func != NULL) {
                tracepoint_handler f = evsel->handler.func;

                if (evsel->handler.data == NULL)
                        evsel->handler.data = pevent_find_event(pevent,
                                                          evsel->attr.config);

                f(tool, evsel->handler.data, sample, machine, thread);
        }

        return 0;
}

static struct perf_sched perf_sched = {
        .tool = {
                .sample          = perf_sched__process_tracepoint_sample,
                .comm            = perf_event__process_comm,
                .lost            = perf_event__process_lost,
                .fork            = perf_event__process_task,
                .ordered_samples = true,
        },
};

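/*
 * Wire each sched tracepoint to its process_sched_* handler and stream
 * the recorded session through them, capturing lost-event statistics
 * for print_bad_events().
 */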
1641static void read_events(bool destroy, struct perf_session **psession)
1642{
1643        int err = -EINVAL;
1644        const struct perf_evsel_str_handler handlers[] = {
1645                { "sched:sched_switch",       process_sched_switch_event, },
1646                { "sched:sched_stat_runtime", process_sched_runtime_event, },
1647                { "sched:sched_wakeup",       process_sched_wakeup_event, },
1648                { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1649                { "sched:sched_process_fork", process_sched_fork_event, },
1650                { "sched:sched_process_exit", process_sched_exit_event, },
1651                { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1652        };
1653        struct perf_session *session;
1654
1655        session = perf_session__new(input_name, O_RDONLY, 0, false,
1656                                    &perf_sched.tool);
1657        if (session == NULL)
1658                die("No Memory");
1659
1660        perf_sched.session = session;
1661
1662        err = perf_session__set_tracepoints_handlers(session, handlers);
1663        assert(err == 0);
1664
1665        if (perf_session__has_traces(session, "record -R")) {
1666                err = perf_session__process_events(session, &perf_sched.tool);
1667                if (err)
1668                        die("Failed to process events, error %d", err);
1669
1670                nr_events      = session->hists.stats.nr_events[0];
1671                nr_lost_events = session->hists.stats.total_lost;
1672                nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
1673        }
1674
1675        if (destroy)
1676                perf_session__delete(session);
1677
1678        if (psession)
1679                *psession = session;
1680}
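/*
 * Editor's note: read_events() is used in two modes by the callers
 * below: __cmd_lat() passes (false, &session) so it can delete the
 * session itself after printing, while __cmd_map() and __cmd_replay()
 * pass (true, NULL) to have it torn down here.  Passing destroy=true
 * together with a non-NULL psession would hand back an already-freed
 * pointer, so no caller combines the two.
 */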
1681
1682static void print_bad_events(void)
1683{
1684        if (nr_unordered_timestamps && nr_timestamps) {
1685                printf("  INFO: %.3f%% unordered timestamps (%lu out of %lu)\n",
1686                        (double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
1687                        nr_unordered_timestamps, nr_timestamps);
1688        }
1689        if (nr_lost_events && nr_events) {
1690                printf("  INFO: %.3f%% lost events (%lu out of %lu, in %lu chunks)\n",
1691                        (double)nr_lost_events/(double)nr_events*100.0,
1692                        nr_lost_events, nr_events, nr_lost_chunks);
1693        }
1694        if (nr_state_machine_bugs && nr_timestamps) {
1695                printf("  INFO: %.3f%% state machine bugs (%lu out of %lu)",
1696                        (double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
1697                        nr_state_machine_bugs, nr_timestamps);
1698                if (nr_lost_events)
1699                        printf(" (due to lost events?)");
1700                printf("\n");
1701        }
1702        if (nr_context_switch_bugs && nr_timestamps) {
1703                printf("  INFO: %.3f%% context switch bugs (%lu out of %lu)",
1704                        (double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
1705                        nr_context_switch_bugs, nr_timestamps);
1706                if (nr_lost_events)
1707                        printf(" (due to lost events?)");
1708                printf("\n");
1709        }
1710}
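/*
 * Editor's note: each INFO line is a plain ratio scaled to a
 * percentage; e.g. 12 unordered timestamps out of 30000 would print:
 *
 *   INFO: 0.040% unordered timestamps (12 out of 30000)
 *
 * Lines whose numerator or denominator is zero are skipped, so a clean
 * trace prints nothing here.
 */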
1711
1712static void __cmd_lat(void)
1713{
1714        struct rb_node *next;
1715        struct perf_session *session;
1716
1717        setup_pager();
1718        read_events(false, &session);
1719        sort_lat();
1720
1721        printf("\n ---------------------------------------------------------------------------------------------------------------\n");
1722        printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
1723        printf(" ---------------------------------------------------------------------------------------------------------------\n");
1724
1725        next = rb_first(&sorted_atom_root);
1726
1727        while (next) {
1728                struct work_atoms *work_list;
1729
1730                work_list = rb_entry(next, struct work_atoms, node);
1731                output_lat_thread(work_list);
1732                next = rb_next(next);
1733        }
1734
1735        printf(" -----------------------------------------------------------------------------------------\n");
1736        printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1737                (double)all_runtime/1e6, all_count);
1738
1739        printf(" ---------------------------------------------------\n");
1740
1741        print_bad_events();
1742        printf("\n");
1743
1744        perf_session__delete(session);
1745}
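/*
 * Editor's note: output_lat_thread() (defined earlier in this file)
 * emits one row per task; an illustrative, made-up row would look
 * roughly like:
 *
 *   firefox:2345  |  124.480 ms |  912 | avg: 0.324 ms | max: 12.551 ms | max at: 9123.456789 s
 *
 * The TOTAL line divides all_runtime by 1e6 because runtimes are
 * accumulated in nanoseconds and printed in milliseconds.
 */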
1746
1747static struct trace_sched_handler map_ops  = {
1748        .wakeup_event           = NULL,
1749        .switch_event           = map_switch_event,
1750        .runtime_event          = NULL,
1751        .fork_event             = NULL,
1752};
1753
1754static void __cmd_map(void)
1755{
1756        max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1757
1758        setup_pager();
1759        read_events(true, NULL);
1760        print_bad_events();
1761}
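/*
 * Editor's note: 'map' reuses the same event pipeline with only
 * map_switch_event() wired up (see map_ops above) and prints one
 * column per CPU as tasks are switched in.  _SC_NPROCESSORS_CONF
 * counts configured (including currently offline) CPUs, presumably so
 * every CPU that can appear in the trace gets a column.
 */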
1762
1763static void __cmd_replay(void)
1764{
1765        unsigned long i;
1766
1767        calibrate_run_measurement_overhead();
1768        calibrate_sleep_measurement_overhead();
1769
1770        test_calibrations();
1771
1772        read_events(true, NULL);
1773
1774        printf("nr_run_events:        %lu\n", nr_run_events);
1775        printf("nr_sleep_events:      %lu\n", nr_sleep_events);
1776        printf("nr_wakeup_events:     %lu\n", nr_wakeup_events);
1777
1778        if (targetless_wakeups)
1779                printf("target-less wakeups:  %lu\n", targetless_wakeups);
1780        if (multitarget_wakeups)
1781                printf("multi-target wakeups: %lu\n", multitarget_wakeups);
1782        if (nr_run_events_optimized)
1783                printf("run atoms optimized: %lu\n",
1784                        nr_run_events_optimized);
1785
1786        print_task_traces();
1787        add_cross_task_wakeups();
1788
1789        create_tasks();
1790        printf("------------------------------------------------------------\n");
1791        for (i = 0; i < replay_repeat; i++)
1792                run_one_test();
1793}
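/*
 * Editor's note: replay first measures its own run/sleep overheads so
 * simulated durations can be corrected, parses the trace into per-task
 * sched_atom sequences, spawns one pthread per recorded task via
 * create_tasks(), and then re-executes the recorded scheduling pattern
 * replay_repeat times, e.g.:
 *
 *   perf sched replay -r 10
 */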
1794
1795
1796static const char * const sched_usage[] = {
1797        "perf sched [<options>] {record|latency|map|replay|script}",
1798        NULL
1799};
1800
1801static const struct option sched_options[] = {
1802        OPT_STRING('i', "input", &input_name, "file",
1803                    "input file name"),
1804        OPT_INCR('v', "verbose", &verbose,
1805                    "be more verbose (show symbol address, etc)"),
1806        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1807                    "dump raw trace in ASCII"),
1808        OPT_END()
1809};
1810
1811static const char * const latency_usage[] = {
1812        "perf sched latency [<options>]",
1813        NULL
1814};
1815
1816static const struct option latency_options[] = {
1817        OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1818                   "sort by key(s): runtime, switch, avg, max"),
1819        OPT_INCR('v', "verbose", &verbose,
1820                    "be more verbose (show symbol address, etc)"),
1821        OPT_INTEGER('C', "CPU", &profile_cpu,
1822                    "CPU to profile on"),
1823        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1824                    "dump raw trace in ASCII"),
1825        OPT_END()
1826};
1827
1828static const char * const replay_usage[] = {
1829        "perf sched replay [<options>]",
1830        NULL
1831};
1832
1833static const struct option replay_options[] = {
1834        OPT_UINTEGER('r', "repeat", &replay_repeat,
1835                     "repeat the workload replay N times (-1: infinite)"),
1836        OPT_INCR('v', "verbose", &verbose,
1837                    "be more verbose (show symbol address, etc)"),
1838        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1839                    "dump raw trace in ASCII"),
1840        OPT_END()
1841};
1842
1843static void setup_sorting(void)
1844{
1845        char *tmp, *tok, *str = strdup(sort_order);
1846
1847        for (tok = strtok_r(str, ", ", &tmp);
1848                        tok; tok = strtok_r(NULL, ", ", &tmp)) {
1849                if (sort_dimension__add(tok, &sort_list) < 0) {
1850                        error("Unknown --sort key: `%s'", tok);
1851                        usage_with_options(latency_usage, latency_options);
1852                }
1853        }
1854
1855        free(str);
1856
1857        sort_dimension__add("pid", &cmp_pid);
1858}
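/*
 * Editor's note: keys may be separated by commas or spaces, so e.g.
 *
 *   perf sched latency -s max,avg
 *
 * sorts by maximum delay first, then average delay.  "pid" is always
 * added to the separate cmp_pid list, which the latency code uses
 * earlier in this file to key its per-task tree of work atoms rather
 * than to order the output.
 */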
1859
1860static const char *record_args[] = {
1861        "record",
1862        "-a",
1863        "-R",
1864        "-f",
1865        "-m", "1024",
1866        "-c", "1",
1867        "-e", "sched:sched_switch",
1868        "-e", "sched:sched_stat_wait",
1869        "-e", "sched:sched_stat_sleep",
1870        "-e", "sched:sched_stat_iowait",
1871        "-e", "sched:sched_stat_runtime",
1872        "-e", "sched:sched_process_exit",
1873        "-e", "sched:sched_process_fork",
1874        "-e", "sched:sched_wakeup",
1875        "-e", "sched:sched_migrate_task",
1876};
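/*
 * Editor's note: the fixed arguments record system-wide (-a), keep raw
 * sample payloads (-R, required to parse the tracepoint data), force
 * overwrite of an existing output file (-f), use 1024 mmap data pages
 * (-m) to limit lost chunks, and sample every event (-c 1).  Note the
 * list records sched_stat_wait/sleep/iowait, which no handler in
 * read_events() consumes, and omits sched:sched_wakeup_new, which does
 * have a handler.
 */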
1877
1878static int __cmd_record(int argc, const char **argv)
1879{
1880        unsigned int rec_argc, i, j;
1881        const char **rec_argv;
1882
1883        rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1884        rec_argv = calloc(rec_argc + 1, sizeof(char *));
1885
1886        if (rec_argv == NULL)
1887                return -ENOMEM;
1888
1889        for (i = 0; i < ARRAY_SIZE(record_args); i++)
1890                rec_argv[i] = strdup(record_args[i]);
1891
1892        for (j = 1; j < (unsigned int)argc; j++, i++)
1893                rec_argv[i] = argv[j];
1894
1895        BUG_ON(i != rec_argc);
1896
1897        return cmd_record(i, rec_argv, NULL);
1898}
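/*
 * Editor's note, a worked example of the splice above: for
 *
 *   perf sched record sleep 1
 *
 * argc is 3 ({"record", "sleep", "1"}) and ARRAY_SIZE(record_args) is
 * 26, so rec_argc = 26 + 3 - 1 = 28: the 26 fixed arguments followed
 * by "sleep" and "1", with calloc() leaving the terminating
 * rec_argv[28] slot NULL.
 */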
1899
1900int cmd_sched(int argc, const char **argv, const char *prefix __used)
1901{
1902        argc = parse_options(argc, argv, sched_options, sched_usage,
1903                             PARSE_OPT_STOP_AT_NON_OPTION);
1904        if (!argc)
1905                usage_with_options(sched_usage, sched_options);
1906
1907        /*
1908         * Aliased to 'perf script' for now:
1909         */
1910        if (!strcmp(argv[0], "script"))
1911                return cmd_script(argc, argv, prefix);
1912
1913        symbol__init();
1914        if (!strncmp(argv[0], "rec", 3)) {
1915                return __cmd_record(argc, argv);
1916        } else if (!strncmp(argv[0], "lat", 3)) {
1917                trace_handler = &lat_ops;
1918                if (argc > 1) {
1919                        argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1920                        if (argc)
1921                                usage_with_options(latency_usage, latency_options);
1922                }
1923                setup_sorting();
1924                __cmd_lat();
1925        } else if (!strcmp(argv[0], "map")) {
1926                trace_handler = &map_ops;
1927                setup_sorting();
1928                __cmd_map();
1929        } else if (!strncmp(argv[0], "rep", 3)) {
1930                trace_handler = &replay_ops;
1931                if (argc) {
1932                        argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1933                        if (argc)
1934                                usage_with_options(replay_usage, replay_options);
1935                }
1936                __cmd_replay();
1937        } else {
1938                usage_with_options(sched_usage, sched_options);
1939        }
1940
1941        return 0;
1942}
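/*
 * Editor's note: subcommands are matched by 3-character prefix for
 * record/latency/replay and exactly for map/script, so "perf sched
 * lat" and "perf sched latency" are equivalent.  Typical usage:
 *
 *   perf sched record -- sleep 1     # capture sched tracepoints
 *   perf sched latency --sort max    # per-task wakeup latencies
 *   perf sched map                   # per-CPU context-switch map
 *   perf sched replay -r 10          # re-execute the recorded pattern
 */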
1943