linux/drivers/oprofile/buffer_sync.c
<<
>>
Prefs
   1/**
   2 * @file buffer_sync.c
   3 *
   4 * @remark Copyright 2002 OProfile authors
   5 * @remark Read the file COPYING
   6 *
   7 * @author John Levon <levon@movementarian.org>
   8 *
   9 * This is the core of the buffer management. Each
  10 * CPU buffer is processed and entered into the
  11 * global event buffer. Such processing is necessary
  12 * in several circumstances, mentioned below.
  13 *
  14 * The processing does the job of converting the
  15 * transitory EIP value into a persistent dentry/offset
  16 * value that the profiler can record at its leisure.
  17 *
  18 * See fs/dcookies.c for a description of the dentry/offset
  19 * objects.
  20 */
  21
  22#include <linux/mm.h>
  23#include <linux/workqueue.h>
  24#include <linux/notifier.h>
  25#include <linux/dcookies.h>
  26#include <linux/profile.h>
  27#include <linux/module.h>
  28#include <linux/fs.h>
  29#include <linux/oprofile.h>
  30#include <linux/sched.h>
  31
  32#include "oprofile_stats.h"
  33#include "event_buffer.h"
  34#include "cpu_buffer.h"
  35#include "buffer_sync.h"
  36 
  37static LIST_HEAD(dying_tasks);
  38static LIST_HEAD(dead_tasks);
  39static cpumask_t marked_cpus = CPU_MASK_NONE;
  40static DEFINE_SPINLOCK(task_mortuary);
  41static void process_task_mortuary(void);
  42
  43
  44/* Take ownership of the task struct and place it on the
  45 * list for processing. Only after two full buffer syncs
  46 * does the task eventually get freed, because by then
  47 * we are sure we will not reference it again.
  48 * Can be invoked from softirq via RCU callback due to
  49 * call_rcu() of the task struct, hence the _irqsave.
  50 */
  51static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
  52{
  53        unsigned long flags;
  54        struct task_struct * task = data;
  55        spin_lock_irqsave(&task_mortuary, flags);
  56        list_add(&task->tasks, &dying_tasks);
  57        spin_unlock_irqrestore(&task_mortuary, flags);
  58        return NOTIFY_OK;
  59}
  60
  61
  62/* The task is on its way out. A sync of the buffer means we can catch
  63 * any remaining samples for this task.
  64 */
  65static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
  66{
  67        /* To avoid latency problems, we only process the current CPU,
  68         * hoping that most samples for the task are on this CPU
  69         */
  70        sync_buffer(raw_smp_processor_id());
  71        return 0;
  72}
  73
  74
  75/* The task is about to try a do_munmap(). We peek at what it's going to
  76 * do, and if it's an executable region, process the samples first, so
  77 * we don't lose any. This does not have to be exact, it's a QoI issue
  78 * only.
  79 */
  80static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
  81{
  82        unsigned long addr = (unsigned long)data;
  83        struct mm_struct * mm = current->mm;
  84        struct vm_area_struct * mpnt;
  85
  86        down_read(&mm->mmap_sem);
  87
  88        mpnt = find_vma(mm, addr);
  89        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
  90                up_read(&mm->mmap_sem);
  91                /* To avoid latency problems, we only process the current CPU,
  92                 * hoping that most samples for the task are on this CPU
  93                 */
  94                sync_buffer(raw_smp_processor_id());
  95                return 0;
  96        }
  97
  98        up_read(&mm->mmap_sem);
  99        return 0;
 100}
 101
 102 
 103/* We need to be told about new modules so we don't attribute to a previously
 104 * loaded module, or drop the samples on the floor.
 105 */
 106static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
 107{
 108#ifdef CONFIG_MODULES
 109        if (val != MODULE_STATE_COMING)
 110                return 0;
 111
 112        /* FIXME: should we process all CPU buffers ? */
 113        mutex_lock(&buffer_mutex);
 114        add_event_entry(ESCAPE_CODE);
 115        add_event_entry(MODULE_LOADED_CODE);
 116        mutex_unlock(&buffer_mutex);
 117#endif
 118        return 0;
 119}
 120
 121 
 122static struct notifier_block task_free_nb = {
 123        .notifier_call  = task_free_notify,
 124};
 125
 126static struct notifier_block task_exit_nb = {
 127        .notifier_call  = task_exit_notify,
 128};
 129
 130static struct notifier_block munmap_nb = {
 131        .notifier_call  = munmap_notify,
 132};
 133
 134static struct notifier_block module_load_nb = {
 135        .notifier_call = module_load_notify,
 136};
 137
 138 
 139static void end_sync(void)
 140{
 141        end_cpu_work();
 142        /* make sure we don't leak task structs */
 143        process_task_mortuary();
 144        process_task_mortuary();
 145}
 146
 147
 148int sync_start(void)
 149{
 150        int err;
 151
 152        start_cpu_work();
 153
 154        err = task_handoff_register(&task_free_nb);
 155        if (err)
 156                goto out1;
 157        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
 158        if (err)
 159                goto out2;
 160        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
 161        if (err)
 162                goto out3;
 163        err = register_module_notifier(&module_load_nb);
 164        if (err)
 165                goto out4;
 166
 167out:
 168        return err;
 169out4:
 170        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
 171out3:
 172        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
 173out2:
 174        task_handoff_unregister(&task_free_nb);
 175out1:
 176        end_sync();
 177        goto out;
 178}
 179
 180
 181void sync_stop(void)
 182{
 183        unregister_module_notifier(&module_load_nb);
 184        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
 185        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
 186        task_handoff_unregister(&task_free_nb);
 187        end_sync();
 188}
 189
 190
 191/* Optimisation. We can manage without taking the dcookie sem
 192 * because we cannot reach this code without at least one
 193 * dcookie user still being registered (namely, the reader
 194 * of the event buffer). */
 195static inline unsigned long fast_get_dcookie(struct path *path)
 196{
 197        unsigned long cookie;
 198
 199        if (path->dentry->d_cookie)
 200                return (unsigned long)path->dentry;
 201        get_dcookie(path, &cookie);
 202        return cookie;
 203}
 204
 205
 206/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 207 * which corresponds loosely to "application name". This is
 208 * not strictly necessary but allows oprofile to associate
 209 * shared-library samples with particular applications
 210 */
 211static unsigned long get_exec_dcookie(struct mm_struct * mm)
 212{
 213        unsigned long cookie = NO_COOKIE;
 214        struct vm_area_struct * vma;
 215 
 216        if (!mm)
 217                goto out;
 218 
 219        for (vma = mm->mmap; vma; vma = vma->vm_next) {
 220                if (!vma->vm_file)
 221                        continue;
 222                if (!(vma->vm_flags & VM_EXECUTABLE))
 223                        continue;
 224                cookie = fast_get_dcookie(&vma->vm_file->f_path);
 225                break;
 226        }
 227
 228out:
 229        return cookie;
 230}
 231
 232
 233/* Convert the EIP value of a sample into a persistent dentry/offset
 234 * pair that can then be added to the global event buffer. We make
 235 * sure to do this lookup before a mm->mmap modification happens so
 236 * we don't lose track.
 237 */
 238static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
 239{
 240        unsigned long cookie = NO_COOKIE;
 241        struct vm_area_struct * vma;
 242
 243        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
 244 
 245                if (addr < vma->vm_start || addr >= vma->vm_end)
 246                        continue;
 247
 248                if (vma->vm_file) {
 249                        cookie = fast_get_dcookie(&vma->vm_file->f_path);
 250                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
 251                                vma->vm_start;
 252                } else {
 253                        /* must be an anonymous map */
 254                        *offset = addr;
 255                }
 256
 257                break;
 258        }
 259
 260        if (!vma)
 261                cookie = INVALID_COOKIE;
 262
 263        return cookie;
 264}
 265
 266
 267static unsigned long last_cookie = INVALID_COOKIE;
 268 
 269static void add_cpu_switch(int i)
 270{
 271        add_event_entry(ESCAPE_CODE);
 272        add_event_entry(CPU_SWITCH_CODE);
 273        add_event_entry(i);
 274        last_cookie = INVALID_COOKIE;
 275}
 276
 277static void add_kernel_ctx_switch(unsigned int in_kernel)
 278{
 279        add_event_entry(ESCAPE_CODE);
 280        if (in_kernel)
 281                add_event_entry(KERNEL_ENTER_SWITCH_CODE); 
 282        else
 283                add_event_entry(KERNEL_EXIT_SWITCH_CODE); 
 284}
 285 
 286static void
 287add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
 288{
 289        add_event_entry(ESCAPE_CODE);
 290        add_event_entry(CTX_SWITCH_CODE); 
 291        add_event_entry(task->pid);
 292        add_event_entry(cookie);
 293        /* Another code for daemon back-compat */
 294        add_event_entry(ESCAPE_CODE);
 295        add_event_entry(CTX_TGID_CODE);
 296        add_event_entry(task->tgid);
 297}
 298
 299 
 300static void add_cookie_switch(unsigned long cookie)
 301{
 302        add_event_entry(ESCAPE_CODE);
 303        add_event_entry(COOKIE_SWITCH_CODE);
 304        add_event_entry(cookie);
 305}
 306
 307 
 308static void add_trace_begin(void)
 309{
 310        add_event_entry(ESCAPE_CODE);
 311        add_event_entry(TRACE_BEGIN_CODE);
 312}
 313
 314
 315static void add_sample_entry(unsigned long offset, unsigned long event)
 316{
 317        add_event_entry(offset);
 318        add_event_entry(event);
 319}
 320
 321
 322static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
 323{
 324        unsigned long cookie;
 325        off_t offset;
 326 
 327        cookie = lookup_dcookie(mm, s->eip, &offset);
 328 
 329        if (cookie == INVALID_COOKIE) {
 330                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
 331                return 0;
 332        }
 333
 334        if (cookie != last_cookie) {
 335                add_cookie_switch(cookie);
 336                last_cookie = cookie;
 337        }
 338
 339        add_sample_entry(offset, s->event);
 340
 341        return 1;
 342}
 343
 344 
 345/* Add a sample to the global event buffer. If possible the
 346 * sample is converted into a persistent dentry/offset pair
 347 * for later lookup from userspace.
 348 */
 349static int
 350add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
 351{
 352        if (in_kernel) {
 353                add_sample_entry(s->eip, s->event);
 354                return 1;
 355        } else if (mm) {
 356                return add_us_sample(mm, s);
 357        } else {
 358                atomic_inc(&oprofile_stats.sample_lost_no_mm);
 359        }
 360        return 0;
 361}
 362 
 363
 364static void release_mm(struct mm_struct * mm)
 365{
 366        if (!mm)
 367                return;
 368        up_read(&mm->mmap_sem);
 369        mmput(mm);
 370}
 371
 372
 373static struct mm_struct * take_tasks_mm(struct task_struct * task)
 374{
 375        struct mm_struct * mm = get_task_mm(task);
 376        if (mm)
 377                down_read(&mm->mmap_sem);
 378        return mm;
 379}
 380
 381
 382static inline int is_code(unsigned long val)
 383{
 384        return val == ESCAPE_CODE;
 385}
 386 
 387
 388/* "acquire" as many cpu buffer slots as we can */
 389static unsigned long get_slots(struct oprofile_cpu_buffer * b)
 390{
 391        unsigned long head = b->head_pos;
 392        unsigned long tail = b->tail_pos;
 393
 394        /*
 395         * Subtle. This resets the persistent last_task
 396         * and in_kernel values used for switching notes.
 397         * BUT, there is a small window between reading
 398         * head_pos, and this call, that means samples
 399         * can appear at the new head position, but not
 400         * be prefixed with the notes for switching
 401         * kernel mode or a task switch. This small hole
 402         * can lead to mis-attribution or samples where
 403         * we don't know if it's in the kernel or not,
 404         * at the start of an event buffer.
 405         */
 406        cpu_buffer_reset(b);
 407
 408        if (head >= tail)
 409                return head - tail;
 410
 411        return head + (b->buffer_size - tail);
 412}
 413
 414
 415static void increment_tail(struct oprofile_cpu_buffer * b)
 416{
 417        unsigned long new_tail = b->tail_pos + 1;
 418
 419        rmb();
 420
 421        if (new_tail < b->buffer_size)
 422                b->tail_pos = new_tail;
 423        else
 424                b->tail_pos = 0;
 425}
 426
 427
 428/* Move tasks along towards death. Any tasks on dead_tasks
 429 * will definitely have no remaining references in any
 430 * CPU buffers at this point, because we use two lists,
 431 * and to have reached the list, it must have gone through
 432 * one full sync already.
 433 */
 434static void process_task_mortuary(void)
 435{
 436        unsigned long flags;
 437        LIST_HEAD(local_dead_tasks);
 438        struct task_struct * task;
 439        struct task_struct * ttask;
 440
 441        spin_lock_irqsave(&task_mortuary, flags);
 442
 443        list_splice_init(&dead_tasks, &local_dead_tasks);
 444        list_splice_init(&dying_tasks, &dead_tasks);
 445
 446        spin_unlock_irqrestore(&task_mortuary, flags);
 447
 448        list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
 449                list_del(&task->tasks);
 450                free_task(task);
 451        }
 452}
 453
 454
 455static void mark_done(int cpu)
 456{
 457        int i;
 458
 459        cpu_set(cpu, marked_cpus);
 460
 461        for_each_online_cpu(i) {
 462                if (!cpu_isset(i, marked_cpus))
 463                        return;
 464        }
 465
 466        /* All CPUs have been processed at least once,
 467         * we can process the mortuary once
 468         */
 469        process_task_mortuary();
 470
 471        cpus_clear(marked_cpus);
 472}
 473
 474
 475/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 476 * traversal, the code switch to sb_sample_start at first kernel enter/exit
 477 * switch so we need a fifth state and some special handling in sync_buffer()
 478 */
 479typedef enum {
 480        sb_bt_ignore = -2,
 481        sb_buffer_start,
 482        sb_bt_start,
 483        sb_sample_start,
 484} sync_buffer_state;
 485
 486/* Sync one of the CPU's buffers into the global event buffer.
 487 * Here we need to go through each batch of samples punctuated
 488 * by context switch notes, taking the task's mmap_sem and doing
 489 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 490 * value.
 491 */
 492void sync_buffer(int cpu)
 493{
 494        struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu);
 495        struct mm_struct *mm = NULL;
 496        struct task_struct * new;
 497        unsigned long cookie = 0;
 498        int in_kernel = 1;
 499        unsigned int i;
 500        sync_buffer_state state = sb_buffer_start;
 501        unsigned long available;
 502
 503        mutex_lock(&buffer_mutex);
 504 
 505        add_cpu_switch(cpu);
 506
 507        /* Remember, only we can modify tail_pos */
 508
 509        available = get_slots(cpu_buf);
 510
 511        for (i = 0; i < available; ++i) {
 512                struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
 513 
 514                if (is_code(s->eip)) {
 515                        if (s->event <= CPU_IS_KERNEL) {
 516                                /* kernel/userspace switch */
 517                                in_kernel = s->event;
 518                                if (state == sb_buffer_start)
 519                                        state = sb_sample_start;
 520                                add_kernel_ctx_switch(s->event);
 521                        } else if (s->event == CPU_TRACE_BEGIN) {
 522                                state = sb_bt_start;
 523                                add_trace_begin();
 524                        } else {
 525                                struct mm_struct * oldmm = mm;
 526
 527                                /* userspace context switch */
 528                                new = (struct task_struct *)s->event;
 529
 530                                release_mm(oldmm);
 531                                mm = take_tasks_mm(new);
 532                                if (mm != oldmm)
 533                                        cookie = get_exec_dcookie(mm);
 534                                add_user_ctx_switch(new, cookie);
 535                        }
 536                } else {
 537                        if (state >= sb_bt_start &&
 538                            !add_sample(mm, s, in_kernel)) {
 539                                if (state == sb_bt_start) {
 540                                        state = sb_bt_ignore;
 541                                        atomic_inc(&oprofile_stats.bt_lost_no_mapping);
 542                                }
 543                        }
 544                }
 545
 546                increment_tail(cpu_buf);
 547        }
 548        release_mm(mm);
 549
 550        mark_done(cpu);
 551
 552        mutex_unlock(&buffer_mutex);
 553}
 554
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.