linux/drivers/oprofile/buffer_sync.c
/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_t marked_cpus = CPU_MASK_NONE;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);

/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int
task_free_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long flags;
        struct task_struct *task = data;
        spin_lock_irqsave(&task_mortuary, flags);
        list_add(&task->tasks, &dying_tasks);
        spin_unlock_irqrestore(&task_mortuary, flags);
        return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int
task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
{
        /* To avoid latency problems, we only process the current CPU,
         * hoping that most samples for the task are on this CPU
         */
        sync_buffer(raw_smp_processor_id());
        return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int
munmap_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long addr = (unsigned long)data;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *mpnt;

        down_read(&mm->mmap_sem);

        mpnt = find_vma(mm, addr);
        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
                up_read(&mm->mmap_sem);
                /* To avoid latency problems, we only process the current CPU,
                 * hoping that most samples for the task are on this CPU
                 */
                sync_buffer(raw_smp_processor_id());
                return 0;
        }

        up_read(&mm->mmap_sem);
        return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int
module_load_notify(struct notifier_block *self, unsigned long val, void *data)
{
#ifdef CONFIG_MODULES
        if (val != MODULE_STATE_COMING)
                return 0;

        /* FIXME: should we process all CPU buffers ? */
        mutex_lock(&buffer_mutex);
        add_event_entry(ESCAPE_CODE);
        add_event_entry(MODULE_LOADED_CODE);
        mutex_unlock(&buffer_mutex);
#endif
        return 0;
}


static struct notifier_block task_free_nb = {
        .notifier_call  = task_free_notify,
};

static struct notifier_block task_exit_nb = {
        .notifier_call  = task_exit_notify,
};

static struct notifier_block munmap_nb = {
        .notifier_call  = munmap_notify,
};

static struct notifier_block module_load_nb = {
        .notifier_call = module_load_notify,
};

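/* Tear down: stop the per-CPU buffer flushing work, then run the
 * task mortuary twice so that every task handed over by
 * task_free_notify() has passed through both lists and is freed.
 */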
static void end_sync(void)
{
        end_cpu_work();
        /* make sure we don't leak task structs */
        process_task_mortuary();
        process_task_mortuary();
}

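/* Start the per-CPU buffer flushing work and register the notifiers
 * (task handoff, task exit, munmap, module load) that feed
 * sync_buffer(). If any registration fails, everything set up so far
 * is unwound again.
 */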
int sync_start(void)
{
        int err;

        start_cpu_work();

        err = task_handoff_register(&task_free_nb);
        if (err)
                goto out1;
        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
        if (err)
                goto out2;
        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
        if (err)
                goto out3;
        err = register_module_notifier(&module_load_nb);
        if (err)
                goto out4;

out:
        return err;
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
        task_handoff_unregister(&task_free_nb);
out1:
        end_sync();
        goto out;
}

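/* Unregister the notifiers in the reverse order of sync_start() and
 * tear down the per-CPU work and the task mortuary.
 */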
void sync_stop(void)
{
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
        end_sync();
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct path *path)
{
        unsigned long cookie;

        if (path->dentry->d_cookie)
                return (unsigned long)path->dentry;
        get_dcookie(path, &cookie);
        return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct *mm)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        if (!mm)
                goto out;

        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (!vma->vm_file)
                        continue;
                if (!(vma->vm_flags & VM_EXECUTABLE))
                        continue;
                cookie = fast_get_dcookie(&vma->vm_file->f_path);
                break;
        }

out:
        return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long
lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

                if (addr < vma->vm_start || addr >= vma->vm_end)
                        continue;

                if (vma->vm_file) {
                        cookie = fast_get_dcookie(&vma->vm_file->f_path);
                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
                                vma->vm_start;
                } else {
                        /* must be an anonymous map */
                        *offset = addr;
                }

                break;
        }

        if (!vma)
                cookie = INVALID_COOKIE;

        return cookie;
}

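/* Advance the CPU buffer's tail position by one slot, wrapping around
 * at the end of the ring buffer.
 */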
static void increment_tail(struct oprofile_cpu_buffer *b)
{
        unsigned long new_tail = b->tail_pos + 1;

        rmb();  /* be sure FIFO pointers are synchronized */

        if (new_tail < b->buffer_size)
                b->tail_pos = new_tail;
        else
                b->tail_pos = 0;
}

static unsigned long last_cookie = INVALID_COOKIE;

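/* Tell the event buffer that the samples which follow were collected
 * on CPU i, and forget the cached cookie so the next sample re-emits
 * a cookie switch.
 */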
static void add_cpu_switch(int i)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CPU_SWITCH_CODE);
        add_event_entry(i);
        last_cookie = INVALID_COOKIE;
}

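/* Record a kernel enter/exit transition in the event buffer. */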
static void add_kernel_ctx_switch(unsigned int in_kernel)
{
        add_event_entry(ESCAPE_CODE);
        if (in_kernel)
                add_event_entry(KERNEL_ENTER_SWITCH_CODE);
        else
                add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

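/* Record a switch to a new user task: its pid, its application
 * cookie, and (as a separate escape sequence) its tgid.
 */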
static void
add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_SWITCH_CODE);
        add_event_entry(task->pid);
        add_event_entry(cookie);
        /* Another code for daemon back-compat */
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_TGID_CODE);
        add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(COOKIE_SWITCH_CODE);
        add_event_entry(cookie);
}


static void add_trace_begin(void)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(TRACE_BEGIN_CODE);
}

#ifdef CONFIG_OPROFILE_IBS

#define IBS_FETCH_CODE_SIZE     2
#define IBS_OP_CODE_SIZE        5
#define IBS_EIP(offset)                         \
        (((struct op_sample *)&cpu_buf->buffer[(offset)])->eip)
#define IBS_EVENT(offset)                               \
        (((struct op_sample *)&cpu_buf->buffer[(offset)])->event)

/*
 * Add IBS fetch and op entries to event buffer
 */
static void add_ibs_begin(struct oprofile_cpu_buffer *cpu_buf, int code,
                          struct mm_struct *mm)
{
        unsigned long rip;
        int i, count;
        unsigned long ibs_cookie = 0;
        off_t offset;

        increment_tail(cpu_buf);        /* move to RIP entry */

        rip = IBS_EIP(cpu_buf->tail_pos);

#ifdef __LP64__
        rip += IBS_EVENT(cpu_buf->tail_pos) << 32;
#endif

        if (mm) {
                ibs_cookie = lookup_dcookie(mm, rip, &offset);

                if (ibs_cookie == NO_COOKIE)
                        offset = rip;
                if (ibs_cookie == INVALID_COOKIE) {
                        atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                        offset = rip;
                }
                if (ibs_cookie != last_cookie) {
                        add_cookie_switch(ibs_cookie);
                        last_cookie = ibs_cookie;
                }
        } else
                offset = rip;

        add_event_entry(ESCAPE_CODE);
        add_event_entry(code);
        add_event_entry(offset);        /* Offset from Dcookie */

        /* we send the dcookie offset, but also send the raw linear address */
        add_event_entry(IBS_EIP(cpu_buf->tail_pos));
        add_event_entry(IBS_EVENT(cpu_buf->tail_pos));

        if (code == IBS_FETCH_CODE)
                count = IBS_FETCH_CODE_SIZE;    /* IBS FETCH is 2 int64s */
        else
                count = IBS_OP_CODE_SIZE;       /* IBS OP is 5 int64s */

        for (i = 0; i < count; i++) {
                increment_tail(cpu_buf);
                add_event_entry(IBS_EIP(cpu_buf->tail_pos));
                add_event_entry(IBS_EVENT(cpu_buf->tail_pos));
        }
}

#endif

static void add_sample_entry(unsigned long offset, unsigned long event)
{
        add_event_entry(offset);
        add_event_entry(event);
}

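/* Convert a userspace sample's EIP into a cookie/offset pair and add
 * it to the event buffer. Returns 0 if no mapping could be found for
 * the address.
 */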
static int add_us_sample(struct mm_struct *mm, struct op_sample *s)
{
        unsigned long cookie;
        off_t offset;

        cookie = lookup_dcookie(mm, s->eip, &offset);

        if (cookie == INVALID_COOKIE) {
                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                return 0;
        }

        if (cookie != last_cookie) {
                add_cookie_switch(cookie);
                last_cookie = cookie;
        }

        add_sample_entry(offset, s->event);

        return 1;
}


/* Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace.
 */
static int
add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
{
        if (in_kernel) {
                add_sample_entry(s->eip, s->event);
                return 1;
        } else if (mm) {
                return add_us_sample(mm, s);
        } else {
                atomic_inc(&oprofile_stats.sample_lost_no_mm);
        }
        return 0;
}

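/* Drop the mmap_sem and the mm reference taken by take_tasks_mm();
 * safe to call with a NULL mm.
 */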
static void release_mm(struct mm_struct *mm)
{
        if (!mm)
                return;
        up_read(&mm->mmap_sem);
        mmput(mm);
}

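/* Pin the task's mm and take its mmap_sem for the dcookie lookups
 * that follow; the counterpart of release_mm().
 */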
static struct mm_struct *take_tasks_mm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        if (mm)
                down_read(&mm->mmap_sem);
        return mm;
}

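/* An EIP value of ESCAPE_CODE marks a control entry (context switch,
 * trace begin, ...) rather than a real sample.
 */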
static inline int is_code(unsigned long val)
{
        return val == ESCAPE_CODE;
}


/* "acquire" as many cpu buffer slots as we can */
static unsigned long get_slots(struct oprofile_cpu_buffer *b)
{
        unsigned long head = b->head_pos;
        unsigned long tail = b->tail_pos;

        /*
         * Subtle. This resets the persistent last_task
         * and in_kernel values used for switching notes.
         * BUT, there is a small window between reading
         * head_pos, and this call, that means samples
         * can appear at the new head position, but not
         * be prefixed with the notes for switching
         * kernel mode or a task switch. This small hole
         * can lead to mis-attribution or samples where
         * we don't know if it's in the kernel or not,
         * at the start of an event buffer.
         */
        cpu_buffer_reset(b);

        if (head >= tail)
                return head - tail;

        return head + (b->buffer_size - tail);
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
        unsigned long flags;
        LIST_HEAD(local_dead_tasks);
        struct task_struct *task;
        struct task_struct *ttask;

        spin_lock_irqsave(&task_mortuary, flags);

        list_splice_init(&dead_tasks, &local_dead_tasks);
        list_splice_init(&dying_tasks, &dead_tasks);

        spin_unlock_irqrestore(&task_mortuary, flags);

        list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
                list_del(&task->tasks);
                free_task(task);
        }
}

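/* Record that this CPU's buffer has been synced. Once every online
 * CPU has been marked, process the task mortuary and clear the marks
 * for the next round.
 */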
static void mark_done(int cpu)
{
        int i;

        cpu_set(cpu, marked_cpus);

        for_each_online_cpu(i) {
                if (!cpu_isset(i, marked_cpus))
                        return;
        }

        /* All CPUs have been processed at least once,
         * we can process the mortuary once
         */
        process_task_mortuary();

        cpus_clear(marked_cpus);
}

/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal; the code switches to sb_sample_start at the first kernel
 * enter/exit switch, so we need a fifth state and some special handling
 * in sync_buffer()
 */
typedef enum {
        sb_bt_ignore = -2,
        sb_buffer_start,
        sb_bt_start,
        sb_sample_start,
} sync_buffer_state;

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * a lookup in task->mm->mmap to convert each EIP into a dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
        struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu);
        struct mm_struct *mm = NULL;
        struct task_struct *new;
        unsigned long cookie = 0;
        int in_kernel = 1;
        sync_buffer_state state = sb_buffer_start;
#ifndef CONFIG_OPROFILE_IBS
        unsigned int i;
        unsigned long available;
#endif

        mutex_lock(&buffer_mutex);

        add_cpu_switch(cpu);

        /* Remember, only we can modify tail_pos */

#ifndef CONFIG_OPROFILE_IBS
        available = get_slots(cpu_buf);

        for (i = 0; i < available; ++i) {
#else
        while (get_slots(cpu_buf)) {
#endif
                struct op_sample *s = &cpu_buf->buffer[cpu_buf->tail_pos];

                if (is_code(s->eip)) {
                        if (s->event <= CPU_IS_KERNEL) {
                                /* kernel/userspace switch */
                                in_kernel = s->event;
                                if (state == sb_buffer_start)
                                        state = sb_sample_start;
                                add_kernel_ctx_switch(s->event);
                        } else if (s->event == CPU_TRACE_BEGIN) {
                                state = sb_bt_start;
                                add_trace_begin();
#ifdef CONFIG_OPROFILE_IBS
                        } else if (s->event == IBS_FETCH_BEGIN) {
                                state = sb_bt_start;
                                add_ibs_begin(cpu_buf, IBS_FETCH_CODE, mm);
                        } else if (s->event == IBS_OP_BEGIN) {
                                state = sb_bt_start;
                                add_ibs_begin(cpu_buf, IBS_OP_CODE, mm);
#endif
                        } else {
                                struct mm_struct *oldmm = mm;

                                /* userspace context switch */
                                new = (struct task_struct *)s->event;

                                release_mm(oldmm);
                                mm = take_tasks_mm(new);
                                if (mm != oldmm)
                                        cookie = get_exec_dcookie(mm);
                                add_user_ctx_switch(new, cookie);
                        }
                } else if (state >= sb_bt_start &&
                           !add_sample(mm, s, in_kernel)) {
                        if (state == sb_bt_start) {
                                state = sb_bt_ignore;
                                atomic_inc(&oprofile_stats.bt_lost_no_mapping);
                        }
                }

                increment_tail(cpu_buf);
        }
        release_mm(mm);

        mark_done(cpu);

        mutex_unlock(&buffer_mutex);
}

/* This function can be used to add a buffer's worth of data directly to
 * the kernel event buffer. The buffer is assumed to be circular: take
 * the entries from index start up to (but not including) index stop,
 * wrapping at max.
 */
void oprofile_put_buff(unsigned long *buf, unsigned int start,
                       unsigned int stop, unsigned int max)
{
        int i;

        i = start;

        mutex_lock(&buffer_mutex);
        while (i != stop) {
                add_event_entry(buf[i++]);

                if (i >= max)
                        i = 0;
        }

        mutex_unlock(&buffer_mutex);
}