linux/kernel/events/core.c
   1/*
   2 * Performance events core code:
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8 *
   9 * For licensing details see kernel-base/COPYING
  10 */
  11
  12#include <linux/fs.h>
  13#include <linux/mm.h>
  14#include <linux/cpu.h>
  15#include <linux/smp.h>
  16#include <linux/idr.h>
  17#include <linux/file.h>
  18#include <linux/poll.h>
  19#include <linux/slab.h>
  20#include <linux/hash.h>
  21#include <linux/sysfs.h>
  22#include <linux/dcache.h>
  23#include <linux/percpu.h>
  24#include <linux/ptrace.h>
  25#include <linux/reboot.h>
  26#include <linux/vmstat.h>
  27#include <linux/device.h>
  28#include <linux/export.h>
  29#include <linux/vmalloc.h>
  30#include <linux/hardirq.h>
  31#include <linux/rculist.h>
  32#include <linux/uaccess.h>
  33#include <linux/syscalls.h>
  34#include <linux/anon_inodes.h>
  35#include <linux/kernel_stat.h>
  36#include <linux/perf_event.h>
  37#include <linux/ftrace_event.h>
  38#include <linux/hw_breakpoint.h>
  39
  40#include "internal.h"
  41
  42#include <asm/irq_regs.h>
  43
  44struct remote_function_call {
  45        struct task_struct      *p;
  46        int                     (*func)(void *info);
  47        void                    *info;
  48        int                     ret;
  49};
  50
  51static void remote_function(void *data)
  52{
  53        struct remote_function_call *tfc = data;
  54        struct task_struct *p = tfc->p;
  55
  56        if (p) {
  57                tfc->ret = -EAGAIN;
  58                if (task_cpu(p) != smp_processor_id() || !task_curr(p))
  59                        return;
  60        }
  61
  62        tfc->ret = tfc->func(tfc->info);
  63}
  64
  65/**
  66 * task_function_call - call a function on the cpu on which a task runs
  67 * @p:          the task to evaluate
  68 * @func:       the function to be called
  69 * @info:       the function call argument
  70 *
  71 * Calls the function @func when the task is currently running. This might
  72 * be on the current CPU, in which case the function is called directly.
  73 *
  74 * returns: @func return value, or
  75 *          -ESRCH  - when the process isn't running
  76 *          -EAGAIN - when the process moved away
  77 */
  78static int
  79task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  80{
  81        struct remote_function_call data = {
  82                .p      = p,
  83                .func   = func,
  84                .info   = info,
  85                .ret    = -ESRCH, /* No such (running) process */
  86        };
  87
  88        if (task_curr(p))
  89                smp_call_function_single(task_cpu(p), remote_function, &data, 1);
  90
  91        return data.ret;
  92}
  93
  94/**
  95 * cpu_function_call - call a function on the cpu
  96 * @func:       the function to be called
  97 * @info:       the function call argument
  98 *
  99 * Calls the function @func on the remote cpu.
 100 *
 101 * returns: @func return value or -ENXIO when the cpu is offline
 102 */
 103static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 104{
 105        struct remote_function_call data = {
 106                .p      = NULL,
 107                .func   = func,
 108                .info   = info,
 109                .ret    = -ENXIO, /* No such CPU */
 110        };
 111
 112        smp_call_function_single(cpu, remote_function, &data, 1);
 113
 114        return data.ret;
 115}
 116
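/*
 * Editor's illustration (not part of the original file): a minimal
 * sketch of how the two helpers above are used.  The callback runs
 * from IPI context on the target CPU, so it must not sleep; callers
 * of task_function_call() typically retry on -EAGAIN because the task
 * can migrate between the task_curr() check and the IPI.  The names
 * below are hypothetical.
 */
#if 0
static int example_remote_op(void *info)
{
        int *cpu_seen = info;

        *cpu_seen = smp_processor_id(); /* runs on the task's current CPU */
        return 0;
}

static void example_usage(struct task_struct *p)
{
        int cpu_seen = -1;
        int ret;

        do {
                ret = task_function_call(p, example_remote_op, &cpu_seen);
        } while (ret == -EAGAIN);       /* task moved away, try again */
}
#endif
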
 117#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 118                       PERF_FLAG_FD_OUTPUT  |\
 119                       PERF_FLAG_PID_CGROUP)
 120
 121/*
 122 * branch priv levels that need permission checks
 123 */
 124#define PERF_SAMPLE_BRANCH_PERM_PLM \
 125        (PERF_SAMPLE_BRANCH_KERNEL |\
 126         PERF_SAMPLE_BRANCH_HV)
 127
 128enum event_type_t {
 129        EVENT_FLEXIBLE = 0x1,
 130        EVENT_PINNED = 0x2,
 131        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 132};
 133
 134/*
 135 * perf_sched_events : >0 events exist
 136 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 137 */
 138struct static_key_deferred perf_sched_events __read_mostly;
 139static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 140static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 141
 142static atomic_t nr_mmap_events __read_mostly;
 143static atomic_t nr_comm_events __read_mostly;
 144static atomic_t nr_task_events __read_mostly;
 145
 146static LIST_HEAD(pmus);
 147static DEFINE_MUTEX(pmus_lock);
 148static struct srcu_struct pmus_srcu;
 149
 150/*
 151 * perf event paranoia level:
 152 *  -1 - not paranoid at all
 153 *   0 - disallow raw tracepoint access for unpriv
 154 *   1 - disallow cpu events for unpriv
 155 *   2 - disallow kernel profiling for unpriv
 156 */
 157int sysctl_perf_event_paranoid __read_mostly = 1;
 158
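/*
 * Editor's note: the checks against this sysctl live in
 * include/linux/perf_event.h as small helpers, roughly of the form
 * shown below (reproduced here for reference only):
 *
 *	static inline bool perf_paranoid_cpu(void)
 *	{
 *		return sysctl_perf_event_paranoid > 0;
 *	}
 *	static inline bool perf_paranoid_kernel(void)
 *	{
 *		return sysctl_perf_event_paranoid > 1;
 *	}
 */
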
 159/* Minimum for 512 kiB + 1 user control page */
 160int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 161
 162/*
 163 * max perf event sample rate
 164 */
 165#define DEFAULT_MAX_SAMPLE_RATE 100000
 166int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 167static int max_samples_per_tick __read_mostly =
 168        DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 169
 170int perf_proc_update_handler(struct ctl_table *table, int write,
 171                void __user *buffer, size_t *lenp,
 172                loff_t *ppos)
 173{
 174        int ret = proc_dointvec(table, write, buffer, lenp, ppos);
 175
 176        if (ret || !write)
 177                return ret;
 178
 179        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 180
 181        return 0;
 182}
 183
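/*
 * Editor's note, worked example: with the default 100000 samples/sec
 * and HZ=1000, max_samples_per_tick = DIV_ROUND_UP(100000, 1000) = 100;
 * with HZ=250 it would be 400.  Writing to
 * /proc/sys/kernel/perf_event_max_sample_rate goes through the handler
 * above and recomputes the per-tick limit.
 */
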
 184static atomic64_t perf_event_id;
 185
 186static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 187                              enum event_type_t event_type);
 188
 189static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 190                             enum event_type_t event_type,
 191                             struct task_struct *task);
 192
 193static void update_context_time(struct perf_event_context *ctx);
 194static u64 perf_event_time(struct perf_event *event);
 195
 196static void ring_buffer_attach(struct perf_event *event,
 197                               struct ring_buffer *rb);
 198
 199void __weak perf_event_print_debug(void)        { }
 200
 201extern __weak const char *perf_pmu_name(void)
 202{
 203        return "pmu";
 204}
 205
 206static inline u64 perf_clock(void)
 207{
 208        return local_clock();
 209}
 210
 211static inline struct perf_cpu_context *
 212__get_cpu_context(struct perf_event_context *ctx)
 213{
 214        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 215}
 216
 217static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 218                          struct perf_event_context *ctx)
 219{
 220        raw_spin_lock(&cpuctx->ctx.lock);
 221        if (ctx)
 222                raw_spin_lock(&ctx->lock);
 223}
 224
 225static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 226                            struct perf_event_context *ctx)
 227{
 228        if (ctx)
 229                raw_spin_unlock(&ctx->lock);
 230        raw_spin_unlock(&cpuctx->ctx.lock);
 231}
 232
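/*
 * Editor's note: the implied lock ordering is cpuctx->ctx.lock first,
 * then the task context's ctx->lock; perf_ctx_unlock() drops them in
 * the reverse order.  Callers such as perf_cgroup_switch() and
 * __perf_install_in_context() below rely on this pairing.
 */
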
 233#ifdef CONFIG_CGROUP_PERF
 234
 235/*
 236 * Must ensure cgroup is pinned (css_get) before calling
 237 * this function. In other words, we cannot call this function
 238 * if there is no cgroup event for the current CPU context.
 239 */
 240static inline struct perf_cgroup *
 241perf_cgroup_from_task(struct task_struct *task)
 242{
 243        return container_of(task_subsys_state(task, perf_subsys_id),
 244                        struct perf_cgroup, css);
 245}
 246
 247static inline bool
 248perf_cgroup_match(struct perf_event *event)
 249{
 250        struct perf_event_context *ctx = event->ctx;
 251        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 252
 253        return !event->cgrp || event->cgrp == cpuctx->cgrp;
 254}
 255
 256static inline bool perf_tryget_cgroup(struct perf_event *event)
 257{
 258        return css_tryget(&event->cgrp->css);
 259}
 260
 261static inline void perf_put_cgroup(struct perf_event *event)
 262{
 263        css_put(&event->cgrp->css);
 264}
 265
 266static inline void perf_detach_cgroup(struct perf_event *event)
 267{
 268        perf_put_cgroup(event);
 269        event->cgrp = NULL;
 270}
 271
 272static inline int is_cgroup_event(struct perf_event *event)
 273{
 274        return event->cgrp != NULL;
 275}
 276
 277static inline u64 perf_cgroup_event_time(struct perf_event *event)
 278{
 279        struct perf_cgroup_info *t;
 280
 281        t = per_cpu_ptr(event->cgrp->info, event->cpu);
 282        return t->time;
 283}
 284
 285static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 286{
 287        struct perf_cgroup_info *info;
 288        u64 now;
 289
 290        now = perf_clock();
 291
 292        info = this_cpu_ptr(cgrp->info);
 293
 294        info->time += now - info->timestamp;
 295        info->timestamp = now;
 296}
 297
 298static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 299{
 300        struct perf_cgroup *cgrp_out = cpuctx->cgrp;
 301        if (cgrp_out)
 302                __update_cgrp_time(cgrp_out);
 303}
 304
 305static inline void update_cgrp_time_from_event(struct perf_event *event)
 306{
 307        struct perf_cgroup *cgrp;
 308
 309        /*
 310         * ensure we access cgroup data only when needed and
 311         * when we know the cgroup is pinned (css_get)
 312         */
 313        if (!is_cgroup_event(event))
 314                return;
 315
 316        cgrp = perf_cgroup_from_task(current);
 317        /*
 318         * Do not update time when cgroup is not active
 319         */
 320        if (cgrp == event->cgrp)
 321                __update_cgrp_time(event->cgrp);
 322}
 323
 324static inline void
 325perf_cgroup_set_timestamp(struct task_struct *task,
 326                          struct perf_event_context *ctx)
 327{
 328        struct perf_cgroup *cgrp;
 329        struct perf_cgroup_info *info;
 330
 331        /*
 332         * ctx->lock held by caller
 333         * ensure we do not access cgroup data
 334         * unless we have the cgroup pinned (css_get)
 335         */
 336        if (!task || !ctx->nr_cgroups)
 337                return;
 338
 339        cgrp = perf_cgroup_from_task(task);
 340        info = this_cpu_ptr(cgrp->info);
 341        info->timestamp = ctx->timestamp;
 342}
 343
 344#define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 345#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 346
 347/*
 348 * reschedule events based on the cgroup constraint of task.
 349 *
 350 * mode SWOUT : schedule out everything
 351 * mode SWIN : schedule in based on cgroup for next
 352 */
 353void perf_cgroup_switch(struct task_struct *task, int mode)
 354{
 355        struct perf_cpu_context *cpuctx;
 356        struct pmu *pmu;
 357        unsigned long flags;
 358
 359        /*
  360         * disable interrupts to avoid getting nr_cgroups
 361         * changes via __perf_event_disable(). Also
 362         * avoids preemption.
 363         */
 364        local_irq_save(flags);
 365
 366        /*
 367         * we reschedule only in the presence of cgroup
 368         * constrained events.
 369         */
 370        rcu_read_lock();
 371
 372        list_for_each_entry_rcu(pmu, &pmus, entry) {
 373                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 374
 375                /*
 376                 * perf_cgroup_events says at least one
 377                 * context on this CPU has cgroup events.
 378                 *
 379                 * ctx->nr_cgroups reports the number of cgroup
 380                 * events for a context.
 381                 */
 382                if (cpuctx->ctx.nr_cgroups > 0) {
 383                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 384                        perf_pmu_disable(cpuctx->ctx.pmu);
 385
 386                        if (mode & PERF_CGROUP_SWOUT) {
 387                                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 388                                /*
 389                                 * must not be done before ctxswout due
 390                                 * to event_filter_match() in event_sched_out()
 391                                 */
 392                                cpuctx->cgrp = NULL;
 393                        }
 394
 395                        if (mode & PERF_CGROUP_SWIN) {
 396                                WARN_ON_ONCE(cpuctx->cgrp);
 397                                /* set cgrp before ctxsw in to
 398                                 * allow event_filter_match() to not
 399                                 * have to pass task around
 400                                 */
 401                                cpuctx->cgrp = perf_cgroup_from_task(task);
 402                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 403                        }
 404                        perf_pmu_enable(cpuctx->ctx.pmu);
 405                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 406                }
 407        }
 408
 409        rcu_read_unlock();
 410
 411        local_irq_restore(flags);
 412}
 413
 414static inline void perf_cgroup_sched_out(struct task_struct *task,
 415                                         struct task_struct *next)
 416{
 417        struct perf_cgroup *cgrp1;
 418        struct perf_cgroup *cgrp2 = NULL;
 419
 420        /*
 421         * we come here when we know perf_cgroup_events > 0
 422         */
 423        cgrp1 = perf_cgroup_from_task(task);
 424
 425        /*
 426         * next is NULL when called from perf_event_enable_on_exec()
 427         * that will systematically cause a cgroup_switch()
 428         */
 429        if (next)
 430                cgrp2 = perf_cgroup_from_task(next);
 431
 432        /*
 433         * only schedule out current cgroup events if we know
 434         * that we are switching to a different cgroup. Otherwise,
 435         * do not touch the cgroup events.
 436         */
 437        if (cgrp1 != cgrp2)
 438                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 439}
 440
 441static inline void perf_cgroup_sched_in(struct task_struct *prev,
 442                                        struct task_struct *task)
 443{
 444        struct perf_cgroup *cgrp1;
 445        struct perf_cgroup *cgrp2 = NULL;
 446
 447        /*
 448         * we come here when we know perf_cgroup_events > 0
 449         */
 450        cgrp1 = perf_cgroup_from_task(task);
 451
 452        /* prev can never be NULL */
 453        cgrp2 = perf_cgroup_from_task(prev);
 454
 455        /*
 456         * only need to schedule in cgroup events if we are changing
 457         * cgroup during ctxsw. Cgroup events were not scheduled
 458         * out during ctxsw-out if that was not the case.
 459         */
 460        if (cgrp1 != cgrp2)
 461                perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 462}
 463
 464static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 465                                      struct perf_event_attr *attr,
 466                                      struct perf_event *group_leader)
 467{
 468        struct perf_cgroup *cgrp;
 469        struct cgroup_subsys_state *css;
 470        struct file *file;
 471        int ret = 0, fput_needed;
 472
 473        file = fget_light(fd, &fput_needed);
 474        if (!file)
 475                return -EBADF;
 476
 477        css = cgroup_css_from_dir(file, perf_subsys_id);
 478        if (IS_ERR(css)) {
 479                ret = PTR_ERR(css);
 480                goto out;
 481        }
 482
 483        cgrp = container_of(css, struct perf_cgroup, css);
 484        event->cgrp = cgrp;
 485
 486        /* must be done before we fput() the file */
 487        if (!perf_tryget_cgroup(event)) {
 488                event->cgrp = NULL;
 489                ret = -ENOENT;
 490                goto out;
 491        }
 492
 493        /*
 494         * all events in a group must monitor
 495         * the same cgroup because a task belongs
 496         * to only one perf cgroup at a time
 497         */
 498        if (group_leader && group_leader->cgrp != cgrp) {
 499                perf_detach_cgroup(event);
 500                ret = -EINVAL;
 501        }
 502out:
 503        fput_light(file, fput_needed);
 504        return ret;
 505}
 506
 507static inline void
 508perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 509{
 510        struct perf_cgroup_info *t;
 511        t = per_cpu_ptr(event->cgrp->info, event->cpu);
 512        event->shadow_ctx_time = now - t->timestamp;
 513}
 514
 515static inline void
 516perf_cgroup_defer_enabled(struct perf_event *event)
 517{
 518        /*
 519         * when the current task's perf cgroup does not match
 520         * the event's, we need to remember to call the
 521         * perf_mark_enable() function the first time a task with
 522         * a matching perf cgroup is scheduled in.
 523         */
 524        if (is_cgroup_event(event) && !perf_cgroup_match(event))
 525                event->cgrp_defer_enabled = 1;
 526}
 527
 528static inline void
 529perf_cgroup_mark_enabled(struct perf_event *event,
 530                         struct perf_event_context *ctx)
 531{
 532        struct perf_event *sub;
 533        u64 tstamp = perf_event_time(event);
 534
 535        if (!event->cgrp_defer_enabled)
 536                return;
 537
 538        event->cgrp_defer_enabled = 0;
 539
 540        event->tstamp_enabled = tstamp - event->total_time_enabled;
 541        list_for_each_entry(sub, &event->sibling_list, group_entry) {
 542                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 543                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 544                        sub->cgrp_defer_enabled = 0;
 545                }
 546        }
 547}
 548#else /* !CONFIG_CGROUP_PERF */
 549
 550static inline bool
 551perf_cgroup_match(struct perf_event *event)
 552{
 553        return true;
 554}
 555
 556static inline void perf_detach_cgroup(struct perf_event *event)
 557{}
 558
 559static inline int is_cgroup_event(struct perf_event *event)
 560{
 561        return 0;
 562}
 563
 564static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
 565{
 566        return 0;
 567}
 568
 569static inline void update_cgrp_time_from_event(struct perf_event *event)
 570{
 571}
 572
 573static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 574{
 575}
 576
 577static inline void perf_cgroup_sched_out(struct task_struct *task,
 578                                         struct task_struct *next)
 579{
 580}
 581
 582static inline void perf_cgroup_sched_in(struct task_struct *prev,
 583                                        struct task_struct *task)
 584{
 585}
 586
 587static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 588                                      struct perf_event_attr *attr,
 589                                      struct perf_event *group_leader)
 590{
 591        return -EINVAL;
 592}
 593
 594static inline void
 595perf_cgroup_set_timestamp(struct task_struct *task,
 596                          struct perf_event_context *ctx)
 597{
 598}
 599
 600void
 601perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
 602{
 603}
 604
 605static inline void
 606perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 607{
 608}
 609
 610static inline u64 perf_cgroup_event_time(struct perf_event *event)
 611{
 612        return 0;
 613}
 614
 615static inline void
 616perf_cgroup_defer_enabled(struct perf_event *event)
 617{
 618}
 619
 620static inline void
 621perf_cgroup_mark_enabled(struct perf_event *event,
 622                         struct perf_event_context *ctx)
 623{
 624}
 625#endif
 626
 627void perf_pmu_disable(struct pmu *pmu)
 628{
 629        int *count = this_cpu_ptr(pmu->pmu_disable_count);
 630        if (!(*count)++)
 631                pmu->pmu_disable(pmu);
 632}
 633
 634void perf_pmu_enable(struct pmu *pmu)
 635{
 636        int *count = this_cpu_ptr(pmu->pmu_disable_count);
 637        if (!--(*count))
 638                pmu->pmu_enable(pmu);
 639}
 640
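/*
 * Editor's illustration: perf_pmu_disable()/perf_pmu_enable() nest via
 * the per-cpu pmu_disable_count, so only the outermost pair touches the
 * hardware:
 *
 *	perf_pmu_disable(pmu);		count 0 -> 1, pmu->pmu_disable()
 *	  perf_pmu_disable(pmu);	count 1 -> 2, no-op
 *	  ...
 *	  perf_pmu_enable(pmu);		count 2 -> 1, no-op
 *	perf_pmu_enable(pmu);		count 1 -> 0, pmu->pmu_enable()
 */
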
 641static DEFINE_PER_CPU(struct list_head, rotation_list);
 642
 643/*
 644 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 645 * because they're strictly cpu affine and rotate_start is called with IRQs
 646 * disabled, while rotate_context is called from IRQ context.
 647 */
 648static void perf_pmu_rotate_start(struct pmu *pmu)
 649{
 650        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 651        struct list_head *head = &__get_cpu_var(rotation_list);
 652
 653        WARN_ON(!irqs_disabled());
 654
 655        if (list_empty(&cpuctx->rotation_list))
 656                list_add(&cpuctx->rotation_list, head);
 657}
 658
 659static void get_ctx(struct perf_event_context *ctx)
 660{
 661        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 662}
 663
 664static void put_ctx(struct perf_event_context *ctx)
 665{
 666        if (atomic_dec_and_test(&ctx->refcount)) {
 667                if (ctx->parent_ctx)
 668                        put_ctx(ctx->parent_ctx);
 669                if (ctx->task)
 670                        put_task_struct(ctx->task);
 671                kfree_rcu(ctx, rcu_head);
 672        }
 673}
 674
 675static void unclone_ctx(struct perf_event_context *ctx)
 676{
 677        if (ctx->parent_ctx) {
 678                put_ctx(ctx->parent_ctx);
 679                ctx->parent_ctx = NULL;
 680        }
 681}
 682
 683static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 684{
 685        /*
 686         * only top level events have the pid namespace they were created in
 687         */
 688        if (event->parent)
 689                event = event->parent;
 690
 691        return task_tgid_nr_ns(p, event->ns);
 692}
 693
 694static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 695{
 696        /*
 697         * only top level events have the pid namespace they were created in
 698         */
 699        if (event->parent)
 700                event = event->parent;
 701
 702        return task_pid_nr_ns(p, event->ns);
 703}
 704
 705/*
 706 * If we inherit events we want to return the parent event id
 707 * to userspace.
 708 */
 709static u64 primary_event_id(struct perf_event *event)
 710{
 711        u64 id = event->id;
 712
 713        if (event->parent)
 714                id = event->parent->id;
 715
 716        return id;
 717}
 718
 719/*
 720 * Get the perf_event_context for a task and lock it.
 721 * This has to cope with the fact that until it is locked,
 722 * the context could get moved to another task.
 723 */
 724static struct perf_event_context *
 725perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 726{
 727        struct perf_event_context *ctx;
 728
 729        rcu_read_lock();
 730retry:
 731        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 732        if (ctx) {
 733                /*
 734                 * If this context is a clone of another, it might
 735                 * get swapped for another underneath us by
 736                 * perf_event_task_sched_out, though the
 737                 * rcu_read_lock() protects us from any context
 738                 * getting freed.  Lock the context and check if it
 739                 * got swapped before we could get the lock, and retry
 740                 * if so.  If we locked the right context, then it
 741                 * can't get swapped on us any more.
 742                 */
 743                raw_spin_lock_irqsave(&ctx->lock, *flags);
 744                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 745                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 746                        goto retry;
 747                }
 748
 749                if (!atomic_inc_not_zero(&ctx->refcount)) {
 750                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 751                        ctx = NULL;
 752                }
 753        }
 754        rcu_read_unlock();
 755        return ctx;
 756}
 757
 758/*
 759 * Get the context for a task and increment its pin_count so it
 760 * can't get swapped to another task.  This also increments its
 761 * reference count so that the context can't get freed.
 762 */
 763static struct perf_event_context *
 764perf_pin_task_context(struct task_struct *task, int ctxn)
 765{
 766        struct perf_event_context *ctx;
 767        unsigned long flags;
 768
 769        ctx = perf_lock_task_context(task, ctxn, &flags);
 770        if (ctx) {
 771                ++ctx->pin_count;
 772                raw_spin_unlock_irqrestore(&ctx->lock, flags);
 773        }
 774        return ctx;
 775}
 776
 777static void perf_unpin_context(struct perf_event_context *ctx)
 778{
 779        unsigned long flags;
 780
 781        raw_spin_lock_irqsave(&ctx->lock, flags);
 782        --ctx->pin_count;
 783        raw_spin_unlock_irqrestore(&ctx->lock, flags);
 784}
 785
 786/*
 787 * Update the record of the current time in a context.
 788 */
 789static void update_context_time(struct perf_event_context *ctx)
 790{
 791        u64 now = perf_clock();
 792
 793        ctx->time += now - ctx->timestamp;
 794        ctx->timestamp = now;
 795}
 796
 797static u64 perf_event_time(struct perf_event *event)
 798{
 799        struct perf_event_context *ctx = event->ctx;
 800
 801        if (is_cgroup_event(event))
 802                return perf_cgroup_event_time(event);
 803
 804        return ctx ? ctx->time : 0;
 805}
 806
 807/*
 808 * Update the total_time_enabled and total_time_running fields for an event.
 809 * The caller of this function needs to hold the ctx->lock.
 810 */
 811static void update_event_times(struct perf_event *event)
 812{
 813        struct perf_event_context *ctx = event->ctx;
 814        u64 run_end;
 815
 816        if (event->state < PERF_EVENT_STATE_INACTIVE ||
 817            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 818                return;
 819        /*
 820         * in cgroup mode, time_enabled represents
 821         * the time the event was enabled AND active
 822         * tasks were in the monitored cgroup. This is
 823         * independent of the activity of the context as
 824         * there may be a mix of cgroup and non-cgroup events.
 825         *
 826         * That is why we treat cgroup events differently
 827         * here.
 828         */
 829        if (is_cgroup_event(event))
 830                run_end = perf_cgroup_event_time(event);
 831        else if (ctx->is_active)
 832                run_end = ctx->time;
 833        else
 834                run_end = event->tstamp_stopped;
 835
 836        event->total_time_enabled = run_end - event->tstamp_enabled;
 837
 838        if (event->state == PERF_EVENT_STATE_INACTIVE)
 839                run_end = event->tstamp_stopped;
 840        else
 841                run_end = perf_event_time(event);
 842
 843        event->total_time_running = run_end - event->tstamp_running;
 844
 845}
 846
 847/*
 848 * Update total_time_enabled and total_time_running for all events in a group.
 849 */
 850static void update_group_times(struct perf_event *leader)
 851{
 852        struct perf_event *event;
 853
 854        update_event_times(leader);
 855        list_for_each_entry(event, &leader->sibling_list, group_entry)
 856                update_event_times(event);
 857}
 858
 859static struct list_head *
 860ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 861{
 862        if (event->attr.pinned)
 863                return &ctx->pinned_groups;
 864        else
 865                return &ctx->flexible_groups;
 866}
 867
 868/*
 869 * Add an event to the lists for its context.
 870 * Must be called with ctx->mutex and ctx->lock held.
 871 */
 872static void
 873list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 874{
 875        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 876        event->attach_state |= PERF_ATTACH_CONTEXT;
 877
 878        /*
 879         * If we're a stand-alone event or group leader, we go to the context
 880         * list; group events are kept attached to the group so that
 881         * perf_group_detach can, at all times, locate all siblings.
 882         */
 883        if (event->group_leader == event) {
 884                struct list_head *list;
 885
 886                if (is_software_event(event))
 887                        event->group_flags |= PERF_GROUP_SOFTWARE;
 888
 889                list = ctx_group_list(event, ctx);
 890                list_add_tail(&event->group_entry, list);
 891        }
 892
 893        if (is_cgroup_event(event))
 894                ctx->nr_cgroups++;
 895
 896        if (has_branch_stack(event))
 897                ctx->nr_branch_stack++;
 898
 899        list_add_rcu(&event->event_entry, &ctx->event_list);
 900        if (!ctx->nr_events)
 901                perf_pmu_rotate_start(ctx->pmu);
 902        ctx->nr_events++;
 903        if (event->attr.inherit_stat)
 904                ctx->nr_stat++;
 905}
 906
 907/*
 908 * Called at perf_event creation and when events are attached/detached from a
 909 * group.
 910 */
 911static void perf_event__read_size(struct perf_event *event)
 912{
 913        int entry = sizeof(u64); /* value */
 914        int size = 0;
 915        int nr = 1;
 916
 917        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 918                size += sizeof(u64);
 919
 920        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 921                size += sizeof(u64);
 922
 923        if (event->attr.read_format & PERF_FORMAT_ID)
 924                entry += sizeof(u64);
 925
 926        if (event->attr.read_format & PERF_FORMAT_GROUP) {
 927                nr += event->group_leader->nr_siblings;
 928                size += sizeof(u64);
 929        }
 930
 931        size += entry * nr;
 932        event->read_size = size;
 933}
 934
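/*
 * Editor's note, worked example: for a group leader with two siblings
 * and read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING,
 * the function above yields entry = 16 (value + id), nr = 3 and
 * size = 8 (nr) + 8 + 8 (times), so read_size = 24 + 3 * 16 = 72 bytes,
 * matching the buffer a read() of the group leader fills in.
 */
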
 935static void perf_event__header_size(struct perf_event *event)
 936{
 937        struct perf_sample_data *data;
 938        u64 sample_type = event->attr.sample_type;
 939        u16 size = 0;
 940
 941        perf_event__read_size(event);
 942
 943        if (sample_type & PERF_SAMPLE_IP)
 944                size += sizeof(data->ip);
 945
 946        if (sample_type & PERF_SAMPLE_ADDR)
 947                size += sizeof(data->addr);
 948
 949        if (sample_type & PERF_SAMPLE_PERIOD)
 950                size += sizeof(data->period);
 951
 952        if (sample_type & PERF_SAMPLE_READ)
 953                size += event->read_size;
 954
 955        event->header_size = size;
 956}
 957
 958static void perf_event__id_header_size(struct perf_event *event)
 959{
 960        struct perf_sample_data *data;
 961        u64 sample_type = event->attr.sample_type;
 962        u16 size = 0;
 963
 964        if (sample_type & PERF_SAMPLE_TID)
 965                size += sizeof(data->tid_entry);
 966
 967        if (sample_type & PERF_SAMPLE_TIME)
 968                size += sizeof(data->time);
 969
 970        if (sample_type & PERF_SAMPLE_ID)
 971                size += sizeof(data->id);
 972
 973        if (sample_type & PERF_SAMPLE_STREAM_ID)
 974                size += sizeof(data->stream_id);
 975
 976        if (sample_type & PERF_SAMPLE_CPU)
 977                size += sizeof(data->cpu_entry);
 978
 979        event->id_header_size = size;
 980}
 981
 982static void perf_group_attach(struct perf_event *event)
 983{
 984        struct perf_event *group_leader = event->group_leader, *pos;
 985
 986        /*
 987         * We can have double attach due to group movement in perf_event_open.
 988         */
 989        if (event->attach_state & PERF_ATTACH_GROUP)
 990                return;
 991
 992        event->attach_state |= PERF_ATTACH_GROUP;
 993
 994        if (group_leader == event)
 995                return;
 996
 997        if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
 998                        !is_software_event(event))
 999                group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1000
1001        list_add_tail(&event->group_entry, &group_leader->sibling_list);
1002        group_leader->nr_siblings++;
1003
1004        perf_event__header_size(group_leader);
1005
1006        list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1007                perf_event__header_size(pos);
1008}
1009
1010/*
1011 * Remove an event from the lists for its context.
1012 * Must be called with ctx->mutex and ctx->lock held.
1013 */
1014static void
1015list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1016{
1017        struct perf_cpu_context *cpuctx;
1018        /*
1019         * We can have double detach due to exit/hot-unplug + close.
1020         */
1021        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1022                return;
1023
1024        event->attach_state &= ~PERF_ATTACH_CONTEXT;
1025
1026        if (is_cgroup_event(event)) {
1027                ctx->nr_cgroups--;
1028                cpuctx = __get_cpu_context(ctx);
1029                /*
1030                 * if there are no more cgroup events
1031                 * then clear cgrp to avoid stale pointer
1032                 * in update_cgrp_time_from_cpuctx()
1033                 */
1034                if (!ctx->nr_cgroups)
1035                        cpuctx->cgrp = NULL;
1036        }
1037
1038        if (has_branch_stack(event))
1039                ctx->nr_branch_stack--;
1040
1041        ctx->nr_events--;
1042        if (event->attr.inherit_stat)
1043                ctx->nr_stat--;
1044
1045        list_del_rcu(&event->event_entry);
1046
1047        if (event->group_leader == event)
1048                list_del_init(&event->group_entry);
1049
1050        update_group_times(event);
1051
1052        /*
1053         * If event was in error state, then keep it
1054         * that way, otherwise bogus counts will be
1055         * returned on read(). The only way to get out
1056         * of error state is by explicit re-enabling
1057         * of the event
1058         */
1059        if (event->state > PERF_EVENT_STATE_OFF)
1060                event->state = PERF_EVENT_STATE_OFF;
1061}
1062
1063static void perf_group_detach(struct perf_event *event)
1064{
1065        struct perf_event *sibling, *tmp;
1066        struct list_head *list = NULL;
1067
1068        /*
1069         * We can have double detach due to exit/hot-unplug + close.
1070         */
1071        if (!(event->attach_state & PERF_ATTACH_GROUP))
1072                return;
1073
1074        event->attach_state &= ~PERF_ATTACH_GROUP;
1075
1076        /*
1077         * If this is a sibling, remove it from its group.
1078         */
1079        if (event->group_leader != event) {
1080                list_del_init(&event->group_entry);
1081                event->group_leader->nr_siblings--;
1082                goto out;
1083        }
1084
1085        if (!list_empty(&event->group_entry))
1086                list = &event->group_entry;
1087
1088        /*
1089         * If this was a group event with sibling events then
1090         * upgrade the siblings to singleton events by adding them
1091         * to whatever list we are on.
1092         */
1093        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1094                if (list)
1095                        list_move_tail(&sibling->group_entry, list);
1096                sibling->group_leader = sibling;
1097
1098                /* Inherit group flags from the previous leader */
1099                sibling->group_flags = event->group_flags;
1100        }
1101
1102out:
1103        perf_event__header_size(event->group_leader);
1104
1105        list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1106                perf_event__header_size(tmp);
1107}
1108
1109static inline int
1110event_filter_match(struct perf_event *event)
1111{
1112        return (event->cpu == -1 || event->cpu == smp_processor_id())
1113            && perf_cgroup_match(event);
1114}
1115
1116static void
1117event_sched_out(struct perf_event *event,
1118                  struct perf_cpu_context *cpuctx,
1119                  struct perf_event_context *ctx)
1120{
1121        u64 tstamp = perf_event_time(event);
1122        u64 delta;
1123        /*
1124         * An event which could not be activated because of
1125         * filter mismatch still needs to have its timings
1126         * maintained, otherwise bogus information is returned
1127         * via read() for time_enabled, time_running:
1128         */
1129        if (event->state == PERF_EVENT_STATE_INACTIVE
1130            && !event_filter_match(event)) {
1131                delta = tstamp - event->tstamp_stopped;
1132                event->tstamp_running += delta;
1133                event->tstamp_stopped = tstamp;
1134        }
1135
1136        if (event->state != PERF_EVENT_STATE_ACTIVE)
1137                return;
1138
1139        event->state = PERF_EVENT_STATE_INACTIVE;
1140        if (event->pending_disable) {
1141                event->pending_disable = 0;
1142                event->state = PERF_EVENT_STATE_OFF;
1143        }
1144        event->tstamp_stopped = tstamp;
1145        event->pmu->del(event, 0);
1146        event->oncpu = -1;
1147
1148        if (!is_software_event(event))
1149                cpuctx->active_oncpu--;
1150        ctx->nr_active--;
1151        if (event->attr.freq && event->attr.sample_freq)
1152                ctx->nr_freq--;
1153        if (event->attr.exclusive || !cpuctx->active_oncpu)
1154                cpuctx->exclusive = 0;
1155}
1156
1157static void
1158group_sched_out(struct perf_event *group_event,
1159                struct perf_cpu_context *cpuctx,
1160                struct perf_event_context *ctx)
1161{
1162        struct perf_event *event;
1163        int state = group_event->state;
1164
1165        event_sched_out(group_event, cpuctx, ctx);
1166
1167        /*
1168         * Schedule out siblings (if any):
1169         */
1170        list_for_each_entry(event, &group_event->sibling_list, group_entry)
1171                event_sched_out(event, cpuctx, ctx);
1172
1173        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1174                cpuctx->exclusive = 0;
1175}
1176
1177/*
1178 * Cross CPU call to remove a performance event
1179 *
1180 * We disable the event on the hardware level first. After that we
1181 * remove it from the context list.
1182 */
1183static int __perf_remove_from_context(void *info)
1184{
1185        struct perf_event *event = info;
1186        struct perf_event_context *ctx = event->ctx;
1187        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1188
1189        raw_spin_lock(&ctx->lock);
1190        event_sched_out(event, cpuctx, ctx);
1191        list_del_event(event, ctx);
1192        if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1193                ctx->is_active = 0;
1194                cpuctx->task_ctx = NULL;
1195        }
1196        raw_spin_unlock(&ctx->lock);
1197
1198        return 0;
1199}
1200
1201
1202/*
1203 * Remove the event from a task's (or a CPU's) list of events.
1204 *
1205 * CPU events are removed with an smp call. For task events we only
1206 * call when the task is on a CPU.
1207 *
1208 * If event->ctx is a cloned context, callers must make sure that
1209 * every task struct that event->ctx->task could possibly point to
1210 * remains valid.  This is OK when called from perf_release since
1211 * that only calls us on the top-level context, which can't be a clone.
1212 * When called from perf_event_exit_task, it's OK because the
1213 * context has been detached from its task.
1214 */
1215static void perf_remove_from_context(struct perf_event *event)
1216{
1217        struct perf_event_context *ctx = event->ctx;
1218        struct task_struct *task = ctx->task;
1219
1220        lockdep_assert_held(&ctx->mutex);
1221
1222        if (!task) {
1223                /*
1224                 * Per cpu events are removed via an smp call and
1225                 * the removal is always successful.
1226                 */
1227                cpu_function_call(event->cpu, __perf_remove_from_context, event);
1228                return;
1229        }
1230
1231retry:
1232        if (!task_function_call(task, __perf_remove_from_context, event))
1233                return;
1234
1235        raw_spin_lock_irq(&ctx->lock);
1236        /*
1237         * If we failed to find a running task, but find the context active now
1238         * that we've acquired the ctx->lock, retry.
1239         */
1240        if (ctx->is_active) {
1241                raw_spin_unlock_irq(&ctx->lock);
1242                goto retry;
1243        }
1244
1245        /*
1246         * Since the task isn't running, it's safe to remove the event; our
1247         * holding the ctx->lock ensures the task won't get scheduled in.
1248         */
1249        list_del_event(event, ctx);
1250        raw_spin_unlock_irq(&ctx->lock);
1251}
1252
1253/*
1254 * Cross CPU call to disable a performance event
1255 */
1256int __perf_event_disable(void *info)
1257{
1258        struct perf_event *event = info;
1259        struct perf_event_context *ctx = event->ctx;
1260        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1261
1262        /*
1263         * If this is a per-task event, need to check whether this
1264         * event's task is the current task on this cpu.
1265         *
1266         * Can trigger due to concurrent perf_event_context_sched_out()
1267         * flipping contexts around.
1268         */
1269        if (ctx->task && cpuctx->task_ctx != ctx)
1270                return -EINVAL;
1271
1272        raw_spin_lock(&ctx->lock);
1273
1274        /*
1275         * If the event is on, turn it off.
1276         * If it is in error state, leave it in error state.
1277         */
1278        if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1279                update_context_time(ctx);
1280                update_cgrp_time_from_event(event);
1281                update_group_times(event);
1282                if (event == event->group_leader)
1283                        group_sched_out(event, cpuctx, ctx);
1284                else
1285                        event_sched_out(event, cpuctx, ctx);
1286                event->state = PERF_EVENT_STATE_OFF;
1287        }
1288
1289        raw_spin_unlock(&ctx->lock);
1290
1291        return 0;
1292}
1293
1294/*
1295 * Disable an event.
1296 *
1297 * If event->ctx is a cloned context, callers must make sure that
1298 * every task struct that event->ctx->task could possibly point to
1299 * remains valid.  This condition is satisfied when called through
1300 * perf_event_for_each_child or perf_event_for_each because they
1301 * hold the top-level event's child_mutex, so any descendant that
1302 * goes to exit will block in sync_child_event.
1303 * When called from perf_pending_event it's OK because event->ctx
1304 * is the current context on this CPU and preemption is disabled,
1305 * hence we can't get into perf_event_task_sched_out for this context.
1306 */
1307void perf_event_disable(struct perf_event *event)
1308{
1309        struct perf_event_context *ctx = event->ctx;
1310        struct task_struct *task = ctx->task;
1311
1312        if (!task) {
1313                /*
1314                 * Disable the event on the cpu that it's on
1315                 */
1316                cpu_function_call(event->cpu, __perf_event_disable, event);
1317                return;
1318        }
1319
1320retry:
1321        if (!task_function_call(task, __perf_event_disable, event))
1322                return;
1323
1324        raw_spin_lock_irq(&ctx->lock);
1325        /*
1326         * If the event is still active, we need to retry the cross-call.
1327         */
1328        if (event->state == PERF_EVENT_STATE_ACTIVE) {
1329                raw_spin_unlock_irq(&ctx->lock);
1330                /*
1331                 * Reload the task pointer, it might have been changed by
1332                 * a concurrent perf_event_context_sched_out().
1333                 */
1334                task = ctx->task;
1335                goto retry;
1336        }
1337
1338        /*
1339         * Since we have the lock this context can't be scheduled
1340         * in, so we can change the state safely.
1341         */
1342        if (event->state == PERF_EVENT_STATE_INACTIVE) {
1343                update_group_times(event);
1344                event->state = PERF_EVENT_STATE_OFF;
1345        }
1346        raw_spin_unlock_irq(&ctx->lock);
1347}
1348EXPORT_SYMBOL_GPL(perf_event_disable);
1349
1350static void perf_set_shadow_time(struct perf_event *event,
1351                                 struct perf_event_context *ctx,
1352                                 u64 tstamp)
1353{
1354        /*
1355         * use the correct time source for the time snapshot
1356         *
1357         * We could get by without this by leveraging the
1358         * fact that to get to this function, the caller
1359         * has most likely already called update_context_time()
1360         * and update_cgrp_time_xx() and thus both timestamp
1361         * and update_cgrp_time_xx() and thus both timestamps
1362         * are identical (or very close). Given that tstamp is
1363         * already adjusted for cgroup, we could say that:
1364         * is equivalent to
1365         *    tstamp - cgrp->timestamp.
1366         *
1367         * Then, in perf_output_read(), the calculation would
1368         * work with no changes because:
1369         * - event is guaranteed scheduled in
1370         * - no scheduled out in between
1371         * - thus the timestamp would be the same
1372         *
1373         * But this is a bit hairy.
1374         *
1375         * So instead, we have an explicit cgroup call to remain
1376         * within the time source all along. We believe it
1377         * is cleaner and simpler to understand.
1378         */
1379        if (is_cgroup_event(event))
1380                perf_cgroup_set_shadow_time(event, tstamp);
1381        else
1382                event->shadow_ctx_time = tstamp - ctx->timestamp;
1383}
1384
1385#define MAX_INTERRUPTS (~0ULL)
1386
1387static void perf_log_throttle(struct perf_event *event, int enable);
1388
1389static int
1390event_sched_in(struct perf_event *event,
1391                 struct perf_cpu_context *cpuctx,
1392                 struct perf_event_context *ctx)
1393{
1394        u64 tstamp = perf_event_time(event);
1395
1396        if (event->state <= PERF_EVENT_STATE_OFF)
1397                return 0;
1398
1399        event->state = PERF_EVENT_STATE_ACTIVE;
1400        event->oncpu = smp_processor_id();
1401
1402        /*
1403         * Unthrottle events: since we just scheduled, we might have missed
1404         * several ticks already, and for a heavily scheduling task there is
1405         * little guarantee it'll get a tick in a timely manner.
1406         */
1407        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1408                perf_log_throttle(event, 1);
1409                event->hw.interrupts = 0;
1410        }
1411
1412        /*
1413         * The new state must be visible before we turn it on in the hardware:
1414         */
1415        smp_wmb();
1416
1417        if (event->pmu->add(event, PERF_EF_START)) {
1418                event->state = PERF_EVENT_STATE_INACTIVE;
1419                event->oncpu = -1;
1420                return -EAGAIN;
1421        }
1422
1423        event->tstamp_running += tstamp - event->tstamp_stopped;
1424
1425        perf_set_shadow_time(event, ctx, tstamp);
1426
1427        if (!is_software_event(event))
1428                cpuctx->active_oncpu++;
1429        ctx->nr_active++;
1430        if (event->attr.freq && event->attr.sample_freq)
1431                ctx->nr_freq++;
1432
1433        if (event->attr.exclusive)
1434                cpuctx->exclusive = 1;
1435
1436        return 0;
1437}
1438
1439static int
1440group_sched_in(struct perf_event *group_event,
1441               struct perf_cpu_context *cpuctx,
1442               struct perf_event_context *ctx)
1443{
1444        struct perf_event *event, *partial_group = NULL;
1445        struct pmu *pmu = group_event->pmu;
1446        u64 now = ctx->time;
1447        bool simulate = false;
1448
1449        if (group_event->state == PERF_EVENT_STATE_OFF)
1450                return 0;
1451
1452        pmu->start_txn(pmu);
1453
1454        if (event_sched_in(group_event, cpuctx, ctx)) {
1455                pmu->cancel_txn(pmu);
1456                return -EAGAIN;
1457        }
1458
1459        /*
1460         * Schedule in siblings as one group (if any):
1461         */
1462        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1463                if (event_sched_in(event, cpuctx, ctx)) {
1464                        partial_group = event;
1465                        goto group_error;
1466                }
1467        }
1468
1469        if (!pmu->commit_txn(pmu))
1470                return 0;
1471
1472group_error:
1473        /*
1474         * Groups can be scheduled in as one unit only, so undo any
1475         * partial group before returning:
1476         * The events up to the failed event are scheduled out normally,
1477         * tstamp_stopped will be updated.
1478         *
1479         * The failed events and the remaining siblings need to have
1480         * their timings updated as if they had gone through event_sched_in()
1481         * and event_sched_out(). This is required to get consistent timings
1482         * across the group. This also takes care of the case where the group
1483         * could never be scheduled by ensuring tstamp_stopped is set to mark
1484         * the time the event was actually stopped, such that time delta
1485         * calculation in update_event_times() is correct.
1486         */
1487        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1488                if (event == partial_group)
1489                        simulate = true;
1490
1491                if (simulate) {
1492                        event->tstamp_running += now - event->tstamp_stopped;
1493                        event->tstamp_stopped = now;
1494                } else {
1495                        event_sched_out(event, cpuctx, ctx);
1496                }
1497        }
1498        event_sched_out(group_event, cpuctx, ctx);
1499
1500        pmu->cancel_txn(pmu);
1501
1502        return -EAGAIN;
1503}
1504
1505/*
1506 * Work out whether we can put this event group on the CPU now.
1507 */
1508static int group_can_go_on(struct perf_event *event,
1509                           struct perf_cpu_context *cpuctx,
1510                           int can_add_hw)
1511{
1512        /*
1513         * Groups consisting entirely of software events can always go on.
1514         */
1515        if (event->group_flags & PERF_GROUP_SOFTWARE)
1516                return 1;
1517        /*
1518         * If an exclusive group is already on, no other hardware
1519         * events can go on.
1520         */
1521        if (cpuctx->exclusive)
1522                return 0;
1523        /*
1524         * If this group is exclusive and there are already
1525         * events on the CPU, it can't go on.
1526         */
1527        if (event->attr.exclusive && cpuctx->active_oncpu)
1528                return 0;
1529        /*
1530         * Otherwise, try to add it if all previous groups were able
1531         * to go on.
1532         */
1533        return can_add_hw;
1534}
1535
1536static void add_event_to_ctx(struct perf_event *event,
1537                               struct perf_event_context *ctx)
1538{
1539        u64 tstamp = perf_event_time(event);
1540
1541        list_add_event(event, ctx);
1542        perf_group_attach(event);
1543        event->tstamp_enabled = tstamp;
1544        event->tstamp_running = tstamp;
1545        event->tstamp_stopped = tstamp;
1546}
1547
1548static void task_ctx_sched_out(struct perf_event_context *ctx);
1549static void
1550ctx_sched_in(struct perf_event_context *ctx,
1551             struct perf_cpu_context *cpuctx,
1552             enum event_type_t event_type,
1553             struct task_struct *task);
1554
1555static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1556                                struct perf_event_context *ctx,
1557                                struct task_struct *task)
1558{
1559        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1560        if (ctx)
1561                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1562        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1563        if (ctx)
1564                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1565}
1566
1567/*
1568 * Cross CPU call to install and enable a performance event
1569 *
1570 * Must be called with ctx->mutex held
1571 */
1572static int  __perf_install_in_context(void *info)
1573{
1574        struct perf_event *event = info;
1575        struct perf_event_context *ctx = event->ctx;
1576        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1577        struct perf_event_context *task_ctx = cpuctx->task_ctx;
1578        struct task_struct *task = current;
1579
1580        perf_ctx_lock(cpuctx, task_ctx);
1581        perf_pmu_disable(cpuctx->ctx.pmu);
1582
1583        /*
1584         * If there was an active task_ctx schedule it out.
1585         */
1586        if (task_ctx)
1587                task_ctx_sched_out(task_ctx);
1588
1589        /*
1590         * If the context we're installing events in is not the
1591         * active task_ctx, flip them.
1592         */
1593        if (ctx->task && task_ctx != ctx) {
1594                if (task_ctx)
1595                        raw_spin_unlock(&task_ctx->lock);
1596                raw_spin_lock(&ctx->lock);
1597                task_ctx = ctx;
1598        }
1599
1600        if (task_ctx) {
1601                cpuctx->task_ctx = task_ctx;
1602                task = task_ctx->task;
1603        }
1604
1605        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1606
1607        update_context_time(ctx);
1608        /*
1609         * update cgrp time only if current cgrp
1610         * matches event->cgrp. Must be done before
1611         * calling add_event_to_ctx()
1612         */
1613        update_cgrp_time_from_event(event);
1614
1615        add_event_to_ctx(event, ctx);
1616
1617        /*
1618         * Schedule everything back in
1619         */
1620        perf_event_sched_in(cpuctx, task_ctx, task);
1621
1622        perf_pmu_enable(cpuctx->ctx.pmu);
1623        perf_ctx_unlock(cpuctx, task_ctx);
1624
1625        return 0;
1626}
1627
1628/*
1629 * Attach a performance event to a context
1630 *
1631 * First we add the event to the list with the hardware enable bit
1632 * in event->hw_config cleared.
1633 *
1634 * If the event is attached to a task which is on a CPU we use an smp
1635 * call to enable it in the task context. The task might have been
1636 * scheduled away, but we check this in the smp call again.
1637 */
1638static void
1639perf_install_in_context(struct perf_event_context *ctx,
1640                        struct perf_event *event,
1641                        int cpu)
1642{
1643        struct task_struct *task = ctx->task;
1644
1645        lockdep_assert_held(&ctx->mutex);
1646
1647        event->ctx = ctx;
1648        if (event->cpu != -1)
1649                event->cpu = cpu;
1650
1651        if (!task) {
1652                /*
1653                 * Per cpu events are installed via an smp call and
1654                 * the install is always successful.
1655                 */
1656                cpu_function_call(cpu, __perf_install_in_context, event);
1657                return;
1658        }
1659
1660retry:
1661        if (!task_function_call(task, __perf_install_in_context, event))
1662                return;
1663
1664        raw_spin_lock_irq(&ctx->lock);
1665        /*
1666         * If we failed to find a running task, but find the context active now
1667         * that we've acquired the ctx->lock, retry.
1668         */
1669        if (ctx->is_active) {
1670                raw_spin_unlock_irq(&ctx->lock);
1671                goto retry;
1672        }
1673
1674        /*
1675         * Since the task isn't running, it's safe to add the event; holding
1676         * ctx->lock ensures the task won't get scheduled in.
1677         */
1678        add_event_to_ctx(event, ctx);
1679        raw_spin_unlock_irq(&ctx->lock);
1680}
1681
1682/*
1683 * Put an event into inactive state and update time fields.
1684 * Enabling the leader of a group effectively enables all
1685 * the group members that aren't explicitly disabled, so we
1686 * have to update their ->tstamp_enabled also.
1687 * Note: this works for group members as well as group leaders
1688 * since the non-leader members' sibling_lists will be empty.
1689 */
1690static void __perf_event_mark_enabled(struct perf_event *event)
1691{
1692        struct perf_event *sub;
1693        u64 tstamp = perf_event_time(event);
1694
1695        event->state = PERF_EVENT_STATE_INACTIVE;
1696        event->tstamp_enabled = tstamp - event->total_time_enabled;
1697        list_for_each_entry(sub, &event->sibling_list, group_entry) {
1698                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1699                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1700        }
1701}
1702
1703/*
1704 * Cross CPU call to enable a performance event
1705 */
1706static int __perf_event_enable(void *info)
1707{
1708        struct perf_event *event = info;
1709        struct perf_event_context *ctx = event->ctx;
1710        struct perf_event *leader = event->group_leader;
1711        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1712        int err;
1713
1714        if (WARN_ON_ONCE(!ctx->is_active))
1715                return -EINVAL;
1716
1717        raw_spin_lock(&ctx->lock);
1718        update_context_time(ctx);
1719
1720        if (event->state >= PERF_EVENT_STATE_INACTIVE)
1721                goto unlock;
1722
1723        /*
1724         * set current task's cgroup time reference point
1725         */
1726        perf_cgroup_set_timestamp(current, ctx);
1727
1728        __perf_event_mark_enabled(event);
1729
1730        if (!event_filter_match(event)) {
1731                if (is_cgroup_event(event))
1732                        perf_cgroup_defer_enabled(event);
1733                goto unlock;
1734        }
1735
1736        /*
1737         * If the event is in a group and isn't the group leader,
1738         * then don't put it on unless the group is on.
1739         */
1740        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1741                goto unlock;
1742
1743        if (!group_can_go_on(event, cpuctx, 1)) {
1744                err = -EEXIST;
1745        } else {
1746                if (event == leader)
1747                        err = group_sched_in(event, cpuctx, ctx);
1748                else
1749                        err = event_sched_in(event, cpuctx, ctx);
1750        }
1751
1752        if (err) {
1753                /*
1754                 * If this event can't go on and it's part of a
1755                 * group, then the whole group has to come off.
1756                 */
1757                if (leader != event)
1758                        group_sched_out(leader, cpuctx, ctx);
1759                if (leader->attr.pinned) {
1760                        update_group_times(leader);
1761                        leader->state = PERF_EVENT_STATE_ERROR;
1762                }
1763        }
1764
1765unlock:
1766        raw_spin_unlock(&ctx->lock);
1767
1768        return 0;
1769}
1770
1771/*
1772 * Enable an event.
1773 *
1774 * If event->ctx is a cloned context, callers must make sure that
1775 * every task struct that event->ctx->task could possibly point to
1776 * remains valid.  This condition is satisfied when called through
1777 * perf_event_for_each_child or perf_event_for_each as described
1778 * for perf_event_disable.
1779 */
1780void perf_event_enable(struct perf_event *event)
1781{
1782        struct perf_event_context *ctx = event->ctx;
1783        struct task_struct *task = ctx->task;
1784
1785        if (!task) {
1786                /*
1787                 * Enable the event on the cpu that it's on
1788                 */
1789                cpu_function_call(event->cpu, __perf_event_enable, event);
1790                return;
1791        }
1792
1793        raw_spin_lock_irq(&ctx->lock);
1794        if (event->state >= PERF_EVENT_STATE_INACTIVE)
1795                goto out;
1796
1797        /*
1798         * If the event is in error state, clear that first.
1799         * That way, if we see the event in error state below, we
1800         * know that it has gone back into error state, as distinct
1801         * from the task having been scheduled away before the
1802         * cross-call arrived.
1803         */
1804        if (event->state == PERF_EVENT_STATE_ERROR)
1805                event->state = PERF_EVENT_STATE_OFF;
1806
1807retry:
1808        if (!ctx->is_active) {
1809                __perf_event_mark_enabled(event);
1810                goto out;
1811        }
1812
1813        raw_spin_unlock_irq(&ctx->lock);
1814
1815        if (!task_function_call(task, __perf_event_enable, event))
1816                return;
1817
1818        raw_spin_lock_irq(&ctx->lock);
1819
1820        /*
1821         * If the context is active and the event is still off,
1822         * we need to retry the cross-call.
1823         */
1824        if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1825                /*
1826                 * task could have been flipped by a concurrent
1827                 * perf_event_context_sched_out()
1828                 */
1829                task = ctx->task;
1830                goto retry;
1831        }
1832
1833out:
1834        raw_spin_unlock_irq(&ctx->lock);
1835}
1836EXPORT_SYMBOL_GPL(perf_event_enable);
1837
1838int perf_event_refresh(struct perf_event *event, int refresh)
1839{
1840        /*
1841         * not supported on inherited events
1842         */
1843        if (event->attr.inherit || !is_sampling_event(event))
1844                return -EINVAL;
1845
1846        atomic_add(refresh, &event->event_limit);
1847        perf_event_enable(event);
1848
1849        return 0;
1850}
1851EXPORT_SYMBOL_GPL(perf_event_refresh);
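
/*
 * Illustrative userspace sketch (not part of this file): perf_event_refresh()
 * is reached through the PERF_EVENT_IOC_REFRESH ioctl handled in perf_ioctl()
 * further down. It assumes "fd" was returned by perf_event_open(2) for a
 * sampling event created disabled; the helper name is made up.
 */
#if 0 /* usage sketch, not kernel code */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int arm_for_n_overflows(int fd, int n)
{
        /*
         * Adds n to event->event_limit and enables the event; once the
         * added overflow budget is used up the kernel disables it again.
         */
        return ioctl(fd, PERF_EVENT_IOC_REFRESH, n);
}
#endif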
1852
1853static void ctx_sched_out(struct perf_event_context *ctx,
1854                          struct perf_cpu_context *cpuctx,
1855                          enum event_type_t event_type)
1856{
1857        struct perf_event *event;
1858        int is_active = ctx->is_active;
1859
1860        ctx->is_active &= ~event_type;
1861        if (likely(!ctx->nr_events))
1862                return;
1863
1864        update_context_time(ctx);
1865        update_cgrp_time_from_cpuctx(cpuctx);
1866        if (!ctx->nr_active)
1867                return;
1868
1869        perf_pmu_disable(ctx->pmu);
1870        if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1871                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1872                        group_sched_out(event, cpuctx, ctx);
1873        }
1874
1875        if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1876                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1877                        group_sched_out(event, cpuctx, ctx);
1878        }
1879        perf_pmu_enable(ctx->pmu);
1880}
1881
1882/*
1883 * Test whether two contexts are equivalent, i.e. whether they
1884 * have both been cloned from the same version of the same context
1885 * and they both have the same number of enabled events.
1886 * If the number of enabled events is the same, then the set
1887 * of enabled events should be the same, because these are both
1888 * inherited contexts, therefore we can't access individual events
1889 * in them directly with an fd; we can only enable/disable all
1890 * events via prctl, or enable/disable all events in a family
1891 * via ioctl, which will have the same effect on both contexts.
1892 */
1893static int context_equiv(struct perf_event_context *ctx1,
1894                         struct perf_event_context *ctx2)
1895{
1896        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1897                && ctx1->parent_gen == ctx2->parent_gen
1898                && !ctx1->pin_count && !ctx2->pin_count;
1899}
1900
1901static void __perf_event_sync_stat(struct perf_event *event,
1902                                     struct perf_event *next_event)
1903{
1904        u64 value;
1905
1906        if (!event->attr.inherit_stat)
1907                return;
1908
1909        /*
1910         * Update the event value. We cannot use perf_event_read()
1911         * because we're in the middle of a context switch and have IRQs
1912         * disabled, which upsets smp_call_function_single(). However,
1913         * we know the event must be on the current CPU, so we don't
1914         * need the cross-call here.
1915         */
1916        switch (event->state) {
1917        case PERF_EVENT_STATE_ACTIVE:
1918                event->pmu->read(event);
1919                /* fall-through */
1920
1921        case PERF_EVENT_STATE_INACTIVE:
1922                update_event_times(event);
1923                break;
1924
1925        default:
1926                break;
1927        }
1928
1929        /*
1930         * In order to keep per-task stats reliable we need to flip the event
1931         * values when we flip the contexts.
1932         */
1933        value = local64_read(&next_event->count);
1934        value = local64_xchg(&event->count, value);
1935        local64_set(&next_event->count, value);
1936
1937        swap(event->total_time_enabled, next_event->total_time_enabled);
1938        swap(event->total_time_running, next_event->total_time_running);
1939
1940        /*
1941         * Since we swizzled the values, update the user visible data too.
1942         */
1943        perf_event_update_userpage(event);
1944        perf_event_update_userpage(next_event);
1945}
1946
1947#define list_next_entry(pos, member) \
1948        list_entry(pos->member.next, typeof(*pos), member)
1949
1950static void perf_event_sync_stat(struct perf_event_context *ctx,
1951                                   struct perf_event_context *next_ctx)
1952{
1953        struct perf_event *event, *next_event;
1954
1955        if (!ctx->nr_stat)
1956                return;
1957
1958        update_context_time(ctx);
1959
1960        event = list_first_entry(&ctx->event_list,
1961                                   struct perf_event, event_entry);
1962
1963        next_event = list_first_entry(&next_ctx->event_list,
1964                                        struct perf_event, event_entry);
1965
1966        while (&event->event_entry != &ctx->event_list &&
1967               &next_event->event_entry != &next_ctx->event_list) {
1968
1969                __perf_event_sync_stat(event, next_event);
1970
1971                event = list_next_entry(event, event_entry);
1972                next_event = list_next_entry(next_event, event_entry);
1973        }
1974}
1975
1976static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1977                                         struct task_struct *next)
1978{
1979        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1980        struct perf_event_context *next_ctx;
1981        struct perf_event_context *parent;
1982        struct perf_cpu_context *cpuctx;
1983        int do_switch = 1;
1984
1985        if (likely(!ctx))
1986                return;
1987
1988        cpuctx = __get_cpu_context(ctx);
1989        if (!cpuctx->task_ctx)
1990                return;
1991
1992        rcu_read_lock();
1993        parent = rcu_dereference(ctx->parent_ctx);
1994        next_ctx = next->perf_event_ctxp[ctxn];
1995        if (parent && next_ctx &&
1996            rcu_dereference(next_ctx->parent_ctx) == parent) {
1997                /*
1998                 * Looks like the two contexts are clones, so we might be
1999                 * able to optimize the context switch.  We lock both
2000                 * contexts and check that they are clones under the
2001                 * lock (including re-checking that neither has been
2002                 * uncloned in the meantime).  It doesn't matter which
2003                 * order we take the locks because no other cpu could
2004                 * be trying to lock both of these tasks.
2005                 */
2006                raw_spin_lock(&ctx->lock);
2007                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2008                if (context_equiv(ctx, next_ctx)) {
2009                        /*
2010                         * XXX do we need a memory barrier of sorts
2011                         * wrt to rcu_dereference() of perf_event_ctxp
2012                         */
2013                        task->perf_event_ctxp[ctxn] = next_ctx;
2014                        next->perf_event_ctxp[ctxn] = ctx;
2015                        ctx->task = next;
2016                        next_ctx->task = task;
2017                        do_switch = 0;
2018
2019                        perf_event_sync_stat(ctx, next_ctx);
2020                }
2021                raw_spin_unlock(&next_ctx->lock);
2022                raw_spin_unlock(&ctx->lock);
2023        }
2024        rcu_read_unlock();
2025
2026        if (do_switch) {
2027                raw_spin_lock(&ctx->lock);
2028                ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2029                cpuctx->task_ctx = NULL;
2030                raw_spin_unlock(&ctx->lock);
2031        }
2032}
2033
2034#define for_each_task_context_nr(ctxn)                                  \
2035        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2036
2037/*
2038 * Called from scheduler to remove the events of the current task,
2039 * with interrupts disabled.
2040 *
2041 * We stop each event and update the event value in event->count.
2042 *
2043 * This does not protect us against NMI, but disable()
2044 * sets the disabled bit in the control field of event _before_
2045 * accessing the event control register. If an NMI hits, then it will
2046 * not restart the event.
2047 */
2048void __perf_event_task_sched_out(struct task_struct *task,
2049                                 struct task_struct *next)
2050{
2051        int ctxn;
2052
2053        for_each_task_context_nr(ctxn)
2054                perf_event_context_sched_out(task, ctxn, next);
2055
2056        /*
2057         * if cgroup events exist on this CPU, then we need
2058         * to check if we have to switch out PMU state.
2059         * cgroup events are system-wide mode only
2060         */
2061        if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2062                perf_cgroup_sched_out(task, next);
2063}
2064
2065static void task_ctx_sched_out(struct perf_event_context *ctx)
2066{
2067        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2068
2069        if (!cpuctx->task_ctx)
2070                return;
2071
2072        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2073                return;
2074
2075        ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2076        cpuctx->task_ctx = NULL;
2077}
2078
2079/*
2080 * Called with IRQs disabled
2081 */
2082static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2083                              enum event_type_t event_type)
2084{
2085        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2086}
2087
2088static void
2089ctx_pinned_sched_in(struct perf_event_context *ctx,
2090                    struct perf_cpu_context *cpuctx)
2091{
2092        struct perf_event *event;
2093
2094        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2095                if (event->state <= PERF_EVENT_STATE_OFF)
2096                        continue;
2097                if (!event_filter_match(event))
2098                        continue;
2099
2100                /* may need to reset tstamp_enabled */
2101                if (is_cgroup_event(event))
2102                        perf_cgroup_mark_enabled(event, ctx);
2103
2104                if (group_can_go_on(event, cpuctx, 1))
2105                        group_sched_in(event, cpuctx, ctx);
2106
2107                /*
2108                 * If this pinned group hasn't been scheduled,
2109                 * put it in error state.
2110                 */
2111                if (event->state == PERF_EVENT_STATE_INACTIVE) {
2112                        update_group_times(event);
2113                        event->state = PERF_EVENT_STATE_ERROR;
2114                }
2115        }
2116}
2117
2118static void
2119ctx_flexible_sched_in(struct perf_event_context *ctx,
2120                      struct perf_cpu_context *cpuctx)
2121{
2122        struct perf_event *event;
2123        int can_add_hw = 1;
2124
2125        list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2126                /* Ignore events in OFF or ERROR state */
2127                if (event->state <= PERF_EVENT_STATE_OFF)
2128                        continue;
2129                /*
2130                 * Listen to the 'cpu' scheduling filter constraint
2131                 * of events:
2132                 */
2133                if (!event_filter_match(event))
2134                        continue;
2135
2136                /* may need to reset tstamp_enabled */
2137                if (is_cgroup_event(event))
2138                        perf_cgroup_mark_enabled(event, ctx);
2139
2140                if (group_can_go_on(event, cpuctx, can_add_hw)) {
2141                        if (group_sched_in(event, cpuctx, ctx))
2142                                can_add_hw = 0;
2143                }
2144        }
2145}
2146
2147static void
2148ctx_sched_in(struct perf_event_context *ctx,
2149             struct perf_cpu_context *cpuctx,
2150             enum event_type_t event_type,
2151             struct task_struct *task)
2152{
2153        u64 now;
2154        int is_active = ctx->is_active;
2155
2156        ctx->is_active |= event_type;
2157        if (likely(!ctx->nr_events))
2158                return;
2159
2160        now = perf_clock();
2161        ctx->timestamp = now;
2162        perf_cgroup_set_timestamp(task, ctx);
2163        /*
2164         * First go through the list and put on any pinned groups
2165         * in order to give them the best chance of going on.
2166         */
2167        if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2168                ctx_pinned_sched_in(ctx, cpuctx);
2169
2170        /* Then walk through the lower prio flexible groups */
2171        if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2172                ctx_flexible_sched_in(ctx, cpuctx);
2173}
2174
2175static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2176                             enum event_type_t event_type,
2177                             struct task_struct *task)
2178{
2179        struct perf_event_context *ctx = &cpuctx->ctx;
2180
2181        ctx_sched_in(ctx, cpuctx, event_type, task);
2182}
2183
2184static void perf_event_context_sched_in(struct perf_event_context *ctx,
2185                                        struct task_struct *task)
2186{
2187        struct perf_cpu_context *cpuctx;
2188
2189        cpuctx = __get_cpu_context(ctx);
2190        if (cpuctx->task_ctx == ctx)
2191                return;
2192
2193        perf_ctx_lock(cpuctx, ctx);
2194        perf_pmu_disable(ctx->pmu);
2195        /*
2196         * We want to keep the following priority order:
2197         * cpu pinned (that don't need to move), task pinned,
2198         * cpu flexible, task flexible.
2199         */
2200        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2201
2202        if (ctx->nr_events)
2203                cpuctx->task_ctx = ctx;
2204
2205        perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2206
2207        perf_pmu_enable(ctx->pmu);
2208        perf_ctx_unlock(cpuctx, ctx);
2209
2210        /*
2211         * Since these rotations are per-cpu, we need to ensure the
2212         * cpu-context we got scheduled on is actually rotating.
2213         */
2214        perf_pmu_rotate_start(ctx->pmu);
2215}
2216
2217/*
2218 * When sampling the branch stack in system-wide mode, it may be necessary
2219 * to flush the stack on context switch. This happens when the branch
2220 * stack does not tag its entries with the pid of the current task.
2221 * Otherwise it becomes impossible to associate a branch entry with a
2222 * task. This ambiguity is more likely to appear when the branch stack
2223 * supports priv level filtering and the user sets it to monitor only
2224 * at the user level (which could be a useful measurement in system-wide
2225 * mode). In that case, the risk is high of having a branch stack with
2226 * branch from multiple tasks. Flushing may mean dropping the existing
2227 * entries or stashing them somewhere in the PMU specific code layer.
2228 *
2229 * This function provides the context switch callback to the lower code
2230 * layer. It is invoked ONLY when there is at least one system-wide context
2231 * with at least one active event using taken branch sampling.
2232 */
2233static void perf_branch_stack_sched_in(struct task_struct *prev,
2234                                       struct task_struct *task)
2235{
2236        struct perf_cpu_context *cpuctx;
2237        struct pmu *pmu;
2238        unsigned long flags;
2239
2240        /* no need to flush branch stack if not changing task */
2241        if (prev == task)
2242                return;
2243
2244        local_irq_save(flags);
2245
2246        rcu_read_lock();
2247
2248        list_for_each_entry_rcu(pmu, &pmus, entry) {
2249                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2250
2251                /*
2252                 * check if the context has at least one
2253                 * event using PERF_SAMPLE_BRANCH_STACK
2254                 */
2255                if (cpuctx->ctx.nr_branch_stack > 0
2256                    && pmu->flush_branch_stack) {
2257
2258                        pmu = cpuctx->ctx.pmu;
2259
2260                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2261
2262                        perf_pmu_disable(pmu);
2263
2264                        pmu->flush_branch_stack();
2265
2266                        perf_pmu_enable(pmu);
2267
2268                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2269                }
2270        }
2271
2272        rcu_read_unlock();
2273
2274        local_irq_restore(flags);
2275}
2276
2277/*
2278 * Called from scheduler to add the events of the current task
2279 * with interrupts disabled.
2280 *
2281 * We restore the event value and then enable it.
2282 *
2283 * This does not protect us against NMI, but enable()
2284 * sets the enabled bit in the control field of event _before_
2285 * accessing the event control register. If an NMI hits, then it will
2286 * keep the event running.
2287 */
2288void __perf_event_task_sched_in(struct task_struct *prev,
2289                                struct task_struct *task)
2290{
2291        struct perf_event_context *ctx;
2292        int ctxn;
2293
2294        for_each_task_context_nr(ctxn) {
2295                ctx = task->perf_event_ctxp[ctxn];
2296                if (likely(!ctx))
2297                        continue;
2298
2299                perf_event_context_sched_in(ctx, task);
2300        }
2301        /*
2302         * if cgroup events exist on this CPU, then we need
2303         * to check if we have to switch in PMU state.
2304         * cgroup events are system-wide mode only
2305         */
2306        if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2307                perf_cgroup_sched_in(prev, task);
2308
2309        /* check for system-wide branch_stack events */
2310        if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2311                perf_branch_stack_sched_in(prev, task);
2312}
2313
2314static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2315{
2316        u64 frequency = event->attr.sample_freq;
2317        u64 sec = NSEC_PER_SEC;
2318        u64 divisor, dividend;
2319
2320        int count_fls, nsec_fls, frequency_fls, sec_fls;
2321
2322        count_fls = fls64(count);
2323        nsec_fls = fls64(nsec);
2324        frequency_fls = fls64(frequency);
2325        sec_fls = 30;
2326
2327        /*
2328         * We got @count in @nsec, with a target of sample_freq HZ
2329         * the target period becomes:
2330         *
2331         *             @count * 10^9
2332         * period = -------------------
2333         *          @nsec * sample_freq
2334         *
2335         */
2336
2337        /*
2338         * Reduce accuracy by one bit such that @a and @b converge
2339         * to a similar magnitude.
2340         */
2341#define REDUCE_FLS(a, b)                \
2342do {                                    \
2343        if (a##_fls > b##_fls) {        \
2344                a >>= 1;                \
2345                a##_fls--;              \
2346        } else {                        \
2347                b >>= 1;                \
2348                b##_fls--;              \
2349        }                               \
2350} while (0)
2351
2352        /*
2353         * Reduce accuracy until either term fits in a u64, then proceed with
2354         * the other, so that finally we can do a u64/u64 division.
2355         */
2356        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2357                REDUCE_FLS(nsec, frequency);
2358                REDUCE_FLS(sec, count);
2359        }
2360
2361        if (count_fls + sec_fls > 64) {
2362                divisor = nsec * frequency;
2363
2364                while (count_fls + sec_fls > 64) {
2365                        REDUCE_FLS(count, sec);
2366                        divisor >>= 1;
2367                }
2368
2369                dividend = count * sec;
2370        } else {
2371                dividend = count * sec;
2372
2373                while (nsec_fls + frequency_fls > 64) {
2374                        REDUCE_FLS(nsec, frequency);
2375                        dividend >>= 1;
2376                }
2377
2378                divisor = nsec * frequency;
2379        }
2380
2381        if (!divisor)
2382                return dividend;
2383
2384        return div64_u64(dividend, divisor);
2385}
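
/*
 * Host-side sketch (not kernel code) of the target-period formula in the
 * comment above, using a 128-bit intermediate instead of the REDUCE_FLS()
 * bit-shedding. Worked example: count = 1,000,000 events in nsec =
 * 10,000,000 ns with sample_freq = 1000 Hz gives
 * (1e6 * 1e9) / (1e7 * 1e3) = 100,000 events per sample.
 * unsigned __int128 is a GCC/Clang extension, which is why the in-kernel
 * version above sticks to u64 arithmetic.
 */
#if 0 /* usage sketch, not kernel code */
static unsigned long long naive_period(unsigned long long count,
                                       unsigned long long nsec,
                                       unsigned long long freq)
{
        unsigned __int128 dividend = (unsigned __int128)count * 1000000000ULL;
        unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

        return divisor ? (unsigned long long)(dividend / divisor)
                       : (unsigned long long)dividend;
}
#endif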
2386
2387static DEFINE_PER_CPU(int, perf_throttled_count);
2388static DEFINE_PER_CPU(u64, perf_throttled_seq);
2389
2390static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2391{
2392        struct hw_perf_event *hwc = &event->hw;
2393        s64 period, sample_period;
2394        s64 delta;
2395
2396        period = perf_calculate_period(event, nsec, count);
2397
2398        delta = (s64)(period - hwc->sample_period);
2399        delta = (delta + 7) / 8; /* low pass filter */
2400
2401        sample_period = hwc->sample_period + delta;
2402
2403        if (!sample_period)
2404                sample_period = 1;
2405
2406        hwc->sample_period = sample_period;
2407
2408        if (local64_read(&hwc->period_left) > 8*sample_period) {
2409                if (disable)
2410                        event->pmu->stop(event, PERF_EF_UPDATE);
2411
2412                local64_set(&hwc->period_left, 0);
2413
2414                if (disable)
2415                        event->pmu->start(event, PERF_EF_RELOAD);
2416        }
2417}
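
/*
 * Sketch of the low-pass step above (illustration only): each adjustment
 * moves sample_period one eighth of the way towards the newly computed
 * target, e.g. sample_period = 100000 with a target of 180000 becomes
 * 100000 + (80000 + 7) / 8 = 110000, so a single noisy tick cannot yank
 * the period around. The real code above additionally forces a minimum
 * period of 1 and may reset period_left.
 */
#if 0 /* usage sketch, not kernel code */
static s64 lowpass_step(s64 sample_period, s64 target)
{
        return sample_period + (target - sample_period + 7) / 8;
}
#endif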
2418
2419/*
2420 * combine freq adjustment with unthrottling to avoid two passes over the
2421 * events. At the same time, make sure, having freq events does not change
2422 * the rate of unthrottling as that would introduce bias.
2423 */
2424static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2425                                           int needs_unthr)
2426{
2427        struct perf_event *event;
2428        struct hw_perf_event *hwc;
2429        u64 now, period = TICK_NSEC;
2430        s64 delta;
2431
2432        /*
2433         * We only need to iterate over all events if:
2434         * - the context has events in frequency mode (needs freq adjust), or
2435         * - there are events to unthrottle on this cpu
2436         */
2437        if (!(ctx->nr_freq || needs_unthr))
2438                return;
2439
2440        raw_spin_lock(&ctx->lock);
2441        perf_pmu_disable(ctx->pmu);
2442
2443        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2444                if (event->state != PERF_EVENT_STATE_ACTIVE)
2445                        continue;
2446
2447                if (!event_filter_match(event))
2448                        continue;
2449
2450                hwc = &event->hw;
2451
2452                if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
2453                        hwc->interrupts = 0;
2454                        perf_log_throttle(event, 1);
2455                        event->pmu->start(event, 0);
2456                }
2457
2458                if (!event->attr.freq || !event->attr.sample_freq)
2459                        continue;
2460
2461                /*
2462                 * stop the event and update event->count
2463                 */
2464                event->pmu->stop(event, PERF_EF_UPDATE);
2465
2466                now = local64_read(&event->count);
2467                delta = now - hwc->freq_count_stamp;
2468                hwc->freq_count_stamp = now;
2469
2470                /*
2471                 * Restart the event; reload the value only if it
2472                 * has changed.
2473                 * We have already stopped the event, so tell
2474                 * perf_adjust_period() not to stop it a
2475                 * second time.
2476                 */
2477                if (delta > 0)
2478                        perf_adjust_period(event, period, delta, false);
2479
2480                event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2481        }
2482
2483        perf_pmu_enable(ctx->pmu);
2484        raw_spin_unlock(&ctx->lock);
2485}
2486
2487/*
2488 * Round-robin a context's events:
2489 */
2490static void rotate_ctx(struct perf_event_context *ctx)
2491{
2492        /*
2493         * Rotate the first entry of the non-pinned groups to the end of the
2494         * list. Rotation might be disabled by the inheritance code.
2495         */
2496        if (!ctx->rotate_disable)
2497                list_rotate_left(&ctx->flexible_groups);
2498}
2499
2500/*
2501 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2502 * because they're strictly cpu affine and rotate_start is called with IRQs
2503 * disabled, while rotate_context is called from IRQ context.
2504 */
2505static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2506{
2507        struct perf_event_context *ctx = NULL;
2508        int rotate = 0, remove = 1;
2509
2510        if (cpuctx->ctx.nr_events) {
2511                remove = 0;
2512                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2513                        rotate = 1;
2514        }
2515
2516        ctx = cpuctx->task_ctx;
2517        if (ctx && ctx->nr_events) {
2518                remove = 0;
2519                if (ctx->nr_events != ctx->nr_active)
2520                        rotate = 1;
2521        }
2522
2523        if (!rotate)
2524                goto done;
2525
2526        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2527        perf_pmu_disable(cpuctx->ctx.pmu);
2528
2529        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2530        if (ctx)
2531                ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2532
2533        rotate_ctx(&cpuctx->ctx);
2534        if (ctx)
2535                rotate_ctx(ctx);
2536
2537        perf_event_sched_in(cpuctx, ctx, current);
2538
2539        perf_pmu_enable(cpuctx->ctx.pmu);
2540        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2541done:
2542        if (remove)
2543                list_del_init(&cpuctx->rotation_list);
2544}
2545
2546void perf_event_task_tick(void)
2547{
2548        struct list_head *head = &__get_cpu_var(rotation_list);
2549        struct perf_cpu_context *cpuctx, *tmp;
2550        struct perf_event_context *ctx;
2551        int throttled;
2552
2553        WARN_ON(!irqs_disabled());
2554
2555        __this_cpu_inc(perf_throttled_seq);
2556        throttled = __this_cpu_xchg(perf_throttled_count, 0);
2557
2558        list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2559                ctx = &cpuctx->ctx;
2560                perf_adjust_freq_unthr_context(ctx, throttled);
2561
2562                ctx = cpuctx->task_ctx;
2563                if (ctx)
2564                        perf_adjust_freq_unthr_context(ctx, throttled);
2565
2566                if (cpuctx->jiffies_interval == 1 ||
2567                                !(jiffies % cpuctx->jiffies_interval))
2568                        perf_rotate_context(cpuctx);
2569        }
2570}
2571
2572static int event_enable_on_exec(struct perf_event *event,
2573                                struct perf_event_context *ctx)
2574{
2575        if (!event->attr.enable_on_exec)
2576                return 0;
2577
2578        event->attr.enable_on_exec = 0;
2579        if (event->state >= PERF_EVENT_STATE_INACTIVE)
2580                return 0;
2581
2582        __perf_event_mark_enabled(event);
2583
2584        return 1;
2585}
2586
2587/*
2588 * Enable all of a task's events that have been marked enable-on-exec.
2589 * This expects task == current.
2590 */
2591static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2592{
2593        struct perf_event *event;
2594        unsigned long flags;
2595        int enabled = 0;
2596        int ret;
2597
2598        local_irq_save(flags);
2599        if (!ctx || !ctx->nr_events)
2600                goto out;
2601
2602        /*
2603         * We must context-switch out cgroup events to avoid a conflict
2604         * when invoking perf_event_context_sched_in() later on
2605         * in this function. Otherwise we end up trying to
2606         * switch in cgroup events which are already scheduled
2607         * in.
2608         */
2609        perf_cgroup_sched_out(current, NULL);
2610
2611        raw_spin_lock(&ctx->lock);
2612        task_ctx_sched_out(ctx);
2613
2614        list_for_each_entry(event, &ctx->event_list, event_entry) {
2615                ret = event_enable_on_exec(event, ctx);
2616                if (ret)
2617                        enabled = 1;
2618        }
2619
2620        /*
2621         * Unclone this context if we enabled any event.
2622         */
2623        if (enabled)
2624                unclone_ctx(ctx);
2625
2626        raw_spin_unlock(&ctx->lock);
2627
2628        /*
2629         * Also switches cgroup events back in, if any:
2630         */
2631        perf_event_context_sched_in(ctx, ctx->task);
2632out:
2633        local_irq_restore(flags);
2634}
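
/*
 * Illustrative userspace sketch (not part of this file): the usual consumer
 * of enable_on_exec is a tool that opens a disabled counter on itself and
 * lets the exec hook above flip it on, so only the exec'd program is
 * measured. Error handling is omitted and a real tool would keep the fd
 * (e.g. in a parent process) to read the count afterwards.
 */
#if 0 /* usage sketch, not kernel code */
#include <unistd.h>
#include <string.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static void count_instructions_of(char **argv)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;              /* stays off ...                */
        attr.enable_on_exec = 1;        /* ... until the execvp() below */

        syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
        execvp(argv[0], argv);
}
#endif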
2635
2636/*
2637 * Cross CPU call to read the hardware event
2638 */
2639static void __perf_event_read(void *info)
2640{
2641        struct perf_event *event = info;
2642        struct perf_event_context *ctx = event->ctx;
2643        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2644
2645        /*
2646         * If this is a task context, we need to check whether it is
2647         * the current task context of this cpu.  If not it has been
2648         * scheduled out before the smp call arrived.  In that case
2649         * event->count would have been updated to a recent sample
2650         * when the event was scheduled out.
2651         */
2652        if (ctx->task && cpuctx->task_ctx != ctx)
2653                return;
2654
2655        raw_spin_lock(&ctx->lock);
2656        if (ctx->is_active) {
2657                update_context_time(ctx);
2658                update_cgrp_time_from_event(event);
2659        }
2660        update_event_times(event);
2661        if (event->state == PERF_EVENT_STATE_ACTIVE)
2662                event->pmu->read(event);
2663        raw_spin_unlock(&ctx->lock);
2664}
2665
2666static inline u64 perf_event_count(struct perf_event *event)
2667{
2668        return local64_read(&event->count) + atomic64_read(&event->child_count);
2669}
2670
2671static u64 perf_event_read(struct perf_event *event)
2672{
2673        /*
2674         * If event is enabled and currently active on a CPU, update the
2675         * value in the event structure:
2676         */
2677        if (event->state == PERF_EVENT_STATE_ACTIVE) {
2678                smp_call_function_single(event->oncpu,
2679                                         __perf_event_read, event, 1);
2680        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2681                struct perf_event_context *ctx = event->ctx;
2682                unsigned long flags;
2683
2684                raw_spin_lock_irqsave(&ctx->lock, flags);
2685                /*
2686                 * may read while context is not active
2687                 * (e.g., thread is blocked), in that case
2688                 * we cannot update context time
2689                 */
2690                if (ctx->is_active) {
2691                        update_context_time(ctx);
2692                        update_cgrp_time_from_event(event);
2693                }
2694                update_event_times(event);
2695                raw_spin_unlock_irqrestore(&ctx->lock, flags);
2696        }
2697
2698        return perf_event_count(event);
2699}
2700
2701/*
2702 * Initialize the perf_event context in a task_struct:
2703 */
2704static void __perf_event_init_context(struct perf_event_context *ctx)
2705{
2706        raw_spin_lock_init(&ctx->lock);
2707        mutex_init(&ctx->mutex);
2708        INIT_LIST_HEAD(&ctx->pinned_groups);
2709        INIT_LIST_HEAD(&ctx->flexible_groups);
2710        INIT_LIST_HEAD(&ctx->event_list);
2711        atomic_set(&ctx->refcount, 1);
2712}
2713
2714static struct perf_event_context *
2715alloc_perf_context(struct pmu *pmu, struct task_struct *task)
2716{
2717        struct perf_event_context *ctx;
2718
2719        ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2720        if (!ctx)
2721                return NULL;
2722
2723        __perf_event_init_context(ctx);
2724        if (task) {
2725                ctx->task = task;
2726                get_task_struct(task);
2727        }
2728        ctx->pmu = pmu;
2729
2730        return ctx;
2731}
2732
2733static struct task_struct *
2734find_lively_task_by_vpid(pid_t vpid)
2735{
2736        struct task_struct *task;
2737        int err;
2738
2739        rcu_read_lock();
2740        if (!vpid)
2741                task = current;
2742        else
2743                task = find_task_by_vpid(vpid);
2744        if (task)
2745                get_task_struct(task);
2746        rcu_read_unlock();
2747
2748        if (!task)
2749                return ERR_PTR(-ESRCH);
2750
2751        /* Reuse ptrace permission checks for now. */
2752        err = -EACCES;
2753        if (!ptrace_may_access(task, PTRACE_MODE_READ))
2754                goto errout;
2755
2756        return task;
2757errout:
2758        put_task_struct(task);
2759        return ERR_PTR(err);
2760
2761}
2762
2763/*
2764 * Returns a matching context with refcount and pincount.
2765 */
2766static struct perf_event_context *
2767find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2768{
2769        struct perf_event_context *ctx;
2770        struct perf_cpu_context *cpuctx;
2771        unsigned long flags;
2772        int ctxn, err;
2773
2774        if (!task) {
2775                /* Must be root to operate on a CPU event: */
2776                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2777                        return ERR_PTR(-EACCES);
2778
2779                /*
2780                 * We could be clever and allow attaching an event to an
2781                 * offline CPU and activate it when the CPU comes up, but
2782                 * that's for later.
2783                 */
2784                if (!cpu_online(cpu))
2785                        return ERR_PTR(-ENODEV);
2786
2787                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2788                ctx = &cpuctx->ctx;
2789                get_ctx(ctx);
2790                ++ctx->pin_count;
2791
2792                return ctx;
2793        }
2794
2795        err = -EINVAL;
2796        ctxn = pmu->task_ctx_nr;
2797        if (ctxn < 0)
2798                goto errout;
2799
2800retry:
2801        ctx = perf_lock_task_context(task, ctxn, &flags);
2802        if (ctx) {
2803                unclone_ctx(ctx);
2804                ++ctx->pin_count;
2805                raw_spin_unlock_irqrestore(&ctx->lock, flags);
2806        } else {
2807                ctx = alloc_perf_context(pmu, task);
2808                err = -ENOMEM;
2809                if (!ctx)
2810                        goto errout;
2811
2812                err = 0;
2813                mutex_lock(&task->perf_event_mutex);
2814                /*
2815                 * If it has already passed perf_event_exit_task(),
2816                 * we must see PF_EXITING; it takes this mutex too.
2817                 */
2818                if (task->flags & PF_EXITING)
2819                        err = -ESRCH;
2820                else if (task->perf_event_ctxp[ctxn])
2821                        err = -EAGAIN;
2822                else {
2823                        get_ctx(ctx);
2824                        ++ctx->pin_count;
2825                        rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2826                }
2827                mutex_unlock(&task->perf_event_mutex);
2828
2829                if (unlikely(err)) {
2830                        put_ctx(ctx);
2831
2832                        if (err == -EAGAIN)
2833                                goto retry;
2834                        goto errout;
2835                }
2836        }
2837
2838        return ctx;
2839
2840errout:
2841        return ERR_PTR(err);
2842}
2843
2844static void perf_event_free_filter(struct perf_event *event);
2845
2846static void free_event_rcu(struct rcu_head *head)
2847{
2848        struct perf_event *event;
2849
2850        event = container_of(head, struct perf_event, rcu_head);
2851        if (event->ns)
2852                put_pid_ns(event->ns);
2853        perf_event_free_filter(event);
2854        kfree(event);
2855}
2856
2857static void ring_buffer_put(struct ring_buffer *rb);
2858
2859static void free_event(struct perf_event *event)
2860{
2861        irq_work_sync(&event->pending);
2862
2863        if (!event->parent) {
2864                if (event->attach_state & PERF_ATTACH_TASK)
2865                        static_key_slow_dec_deferred(&perf_sched_events);
2866                if (event->attr.mmap || event->attr.mmap_data)
2867                        atomic_dec(&nr_mmap_events);
2868                if (event->attr.comm)
2869                        atomic_dec(&nr_comm_events);
2870                if (event->attr.task)
2871                        atomic_dec(&nr_task_events);
2872                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2873                        put_callchain_buffers();
2874                if (is_cgroup_event(event)) {
2875                        atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2876                        static_key_slow_dec_deferred(&perf_sched_events);
2877                }
2878
2879                if (has_branch_stack(event)) {
2880                        static_key_slow_dec_deferred(&perf_sched_events);
2881                        /* is system-wide event */
2882                        if (!(event->attach_state & PERF_ATTACH_TASK))
2883                                atomic_dec(&per_cpu(perf_branch_stack_events,
2884                                                    event->cpu));
2885                }
2886        }
2887
2888        if (event->rb) {
2889                ring_buffer_put(event->rb);
2890                event->rb = NULL;
2891        }
2892
2893        if (is_cgroup_event(event))
2894                perf_detach_cgroup(event);
2895
2896        if (event->destroy)
2897                event->destroy(event);
2898
2899        if (event->ctx)
2900                put_ctx(event->ctx);
2901
2902        call_rcu(&event->rcu_head, free_event_rcu);
2903}
2904
2905int perf_event_release_kernel(struct perf_event *event)
2906{
2907        struct perf_event_context *ctx = event->ctx;
2908
2909        WARN_ON_ONCE(ctx->parent_ctx);
2910        /*
2911         * There are two ways this annotation is useful:
2912         *
2913         *  1) there is a lock recursion from perf_event_exit_task,
2914         *     see the comment there.
2915         *
2916         *  2) there is a lock-inversion with mmap_sem through
2917         *     perf_event_read_group(), which takes faults while
2918         *     holding ctx->mutex, however this is called after
2919         *     the last filedesc died, so there is no possibility
2920         *     to trigger the AB-BA case.
2921         */
2922        mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2923        raw_spin_lock_irq(&ctx->lock);
2924        perf_group_detach(event);
2925        raw_spin_unlock_irq(&ctx->lock);
2926        perf_remove_from_context(event);
2927        mutex_unlock(&ctx->mutex);
2928
2929        free_event(event);
2930
2931        return 0;
2932}
2933EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2934
2935/*
2936 * Called when the last reference to the file is gone.
2937 */
2938static void put_event(struct perf_event *event)
2939{
2940        struct task_struct *owner;
2941
2942        if (!atomic_long_dec_and_test(&event->refcount))
2943                return;
2944
2945        rcu_read_lock();
2946        owner = ACCESS_ONCE(event->owner);
2947        /*
2948         * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2949         * !owner it means the list deletion is complete and we can indeed
2950         * free this event, otherwise we need to serialize on
2951         * owner->perf_event_mutex.
2952         */
2953        smp_read_barrier_depends();
2954        if (owner) {
2955                /*
2956                 * Since delayed_put_task_struct() also drops the last
2957                 * task reference we can safely take a new reference
2958                 * while holding the rcu_read_lock().
2959                 */
2960                get_task_struct(owner);
2961        }
2962        rcu_read_unlock();
2963
2964        if (owner) {
2965                mutex_lock(&owner->perf_event_mutex);
2966                /*
2967                 * We have to re-check the event->owner field; if it is cleared
2968                 * we raced with perf_event_exit_task(). Acquiring the mutex
2969                 * ensures they're done, and we can proceed with freeing the
2970                 * event.
2971                 */
2972                if (event->owner)
2973                        list_del_init(&event->owner_entry);
2974                mutex_unlock(&owner->perf_event_mutex);
2975                put_task_struct(owner);
2976        }
2977
2978        perf_event_release_kernel(event);
2979}
2980
2981static int perf_release(struct inode *inode, struct file *file)
2982{
2983        put_event(file->private_data);
2984        return 0;
2985}
2986
2987u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2988{
2989        struct perf_event *child;
2990        u64 total = 0;
2991
2992        *enabled = 0;
2993        *running = 0;
2994
2995        mutex_lock(&event->child_mutex);
2996        total += perf_event_read(event);
2997        *enabled += event->total_time_enabled +
2998                        atomic64_read(&event->child_total_time_enabled);
2999        *running += event->total_time_running +
3000                        atomic64_read(&event->child_total_time_running);
3001
3002        list_for_each_entry(child, &event->child_list, child_list) {
3003                total += perf_event_read(child);
3004                *enabled += child->total_time_enabled;
3005                *running += child->total_time_running;
3006        }
3007        mutex_unlock(&event->child_mutex);
3008
3009        return total;
3010}
3011EXPORT_SYMBOL_GPL(perf_event_read_value);
3012
3013static int perf_event_read_group(struct perf_event *event,
3014                                   u64 read_format, char __user *buf)
3015{
3016        struct perf_event *leader = event->group_leader, *sub;
3017        int n = 0, size = 0, ret = -EFAULT;
3018        struct perf_event_context *ctx = leader->ctx;
3019        u64 values[5];
3020        u64 count, enabled, running;
3021
3022        mutex_lock(&ctx->mutex);
3023        count = perf_event_read_value(leader, &enabled, &running);
3024
3025        values[n++] = 1 + leader->nr_siblings;
3026        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3027                values[n++] = enabled;
3028        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3029                values[n++] = running;
3030        values[n++] = count;
3031        if (read_format & PERF_FORMAT_ID)
3032                values[n++] = primary_event_id(leader);
3033
3034        size = n * sizeof(u64);
3035
3036        if (copy_to_user(buf, values, size))
3037                goto unlock;
3038
3039        ret = size;
3040
3041        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3042                n = 0;
3043
3044                values[n++] = perf_event_read_value(sub, &enabled, &running);
3045                if (read_format & PERF_FORMAT_ID)
3046                        values[n++] = primary_event_id(sub);
3047
3048                size = n * sizeof(u64);
3049
3050                if (copy_to_user(buf + ret, values, size)) {
3051                        ret = -EFAULT;
3052                        goto unlock;
3053                }
3054
3055                ret += size;
3056        }
3057unlock:
3058        mutex_unlock(&ctx->mutex);
3059
3060        return ret;
3061}
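
/*
 * Userspace view of the buffer filled in above (illustration only): with
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID, a single read() on the
 * group leader returns every group member in one buffer.
 */
#if 0 /* usage sketch, not kernel code */
#include <linux/types.h>

struct group_read {
        __u64 nr;               /* 1 + leader->nr_siblings         */
        __u64 time_enabled;     /* PERF_FORMAT_TOTAL_TIME_ENABLED  */
        __u64 time_running;     /* PERF_FORMAT_TOTAL_TIME_RUNNING  */
        struct {
                __u64 value;
                __u64 id;       /* PERF_FORMAT_ID                  */
        } cntr[];               /* leader first, then each sibling */
};
#endif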
3062
3063static int perf_event_read_one(struct perf_event *event,
3064                                 u64 read_format, char __user *buf)
3065{
3066        u64 enabled, running;
3067        u64 values[4];
3068        int n = 0;
3069
3070        values[n++] = perf_event_read_value(event, &enabled, &running);
3071        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3072                values[n++] = enabled;
3073        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3074                values[n++] = running;
3075        if (read_format & PERF_FORMAT_ID)
3076                values[n++] = primary_event_id(event);
3077
3078        if (copy_to_user(buf, values, n * sizeof(u64)))
3079                return -EFAULT;
3080
3081        return n * sizeof(u64);
3082}
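
/*
 * Matching userspace sketch for the non-group layout above: value, then the
 * optional times and id, in exactly the order perf_event_read_one() writes
 * them. Assumes "fd" is a perf event descriptor opened with
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
 * PERF_FORMAT_ID set.
 */
#if 0 /* usage sketch, not kernel code */
#include <unistd.h>
#include <linux/types.h>

static __u64 read_scaled_value(int fd)
{
        struct {
                __u64 value;
                __u64 time_enabled;
                __u64 time_running;
                __u64 id;
        } rf;

        if (read(fd, &rf, sizeof(rf)) != sizeof(rf))
                return 0;

        /* compensate for multiplexing: value * enabled / running */
        return rf.time_running ? rf.value * rf.time_enabled / rf.time_running
                               : rf.value;
}
#endif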
3083
3084/*
3085 * Read the performance event - simple non-blocking version for now
3086 */
3087static ssize_t
3088perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3089{
3090        u64 read_format = event->attr.read_format;
3091        int ret;
3092
3093        /*
3094         * Return end-of-file for a read on an event that is in
3095         * error state (i.e. because it was pinned but it couldn't be
3096         * scheduled on to the CPU at some point).
3097         */
3098        if (event->state == PERF_EVENT_STATE_ERROR)
3099                return 0;
3100
3101        if (count < event->read_size)
3102                return -ENOSPC;
3103
3104        WARN_ON_ONCE(event->ctx->parent_ctx);
3105        if (read_format & PERF_FORMAT_GROUP)
3106                ret = perf_event_read_group(event, read_format, buf);
3107        else
3108                ret = perf_event_read_one(event, read_format, buf);
3109
3110        return ret;
3111}
3112
3113static ssize_t
3114perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3115{
3116        struct perf_event *event = file->private_data;
3117
3118        return perf_read_hw(event, buf, count);
3119}
3120
3121static unsigned int perf_poll(struct file *file, poll_table *wait)
3122{
3123        struct perf_event *event = file->private_data;
3124        struct ring_buffer *rb;
3125        unsigned int events = POLLHUP;
3126
3127        /*
3128         * Race between perf_event_set_output() and perf_poll(): perf_poll()
3129         * grabs the rb reference but perf_event_set_output() overrides it.
3130         * Here is the timeline for two threads T1, T2:
3131         * t0: T1, rb = rcu_dereference(event->rb)
3132         * t1: T2, old_rb = event->rb
3133         * t2: T2, event->rb = new rb
3134         * t3: T2, ring_buffer_detach(old_rb)
3135         * t4: T1, ring_buffer_attach(rb)
3136         * t5: T1, poll_wait(event->waitq)
3137         *
3138         * To avoid this problem, we grab mmap_mutex in perf_poll()
3139         * thereby ensuring that the assignment of the new ring buffer
3140         * and the detachment of the old buffer appear atomic to perf_poll()
3141         */
3142        mutex_lock(&event->mmap_mutex);
3143
3144        rcu_read_lock();
3145        rb = rcu_dereference(event->rb);
3146        if (rb) {
3147                ring_buffer_attach(event, rb);
3148                events = atomic_xchg(&rb->poll, 0);
3149        }
3150        rcu_read_unlock();
3151
3152        mutex_unlock(&event->mmap_mutex);
3153
3154        poll_wait(file, &event->waitq, wait);
3155
3156        return events;
3157}
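
/*
 * Illustrative userspace counterpart (not part of this file): poll(2) on a
 * perf fd sleeps on event->waitq above and is woken by ring-buffer wakeups
 * (attr.wakeup_events / attr.wakeup_watermark). Assumes "fd" came from
 * perf_event_open(2) for a sampling event with an mmap'ed ring buffer.
 */
#if 0 /* usage sketch, not kernel code */
#include <poll.h>

static int wait_for_samples(int fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        return poll(&pfd, 1, timeout_ms);       /* > 0: samples are ready */
}
#endif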
3158
3159static void perf_event_reset(struct perf_event *event)
3160{
3161        (void)perf_event_read(event);
3162        local64_set(&event->count, 0);
3163        perf_event_update_userpage(event);
3164}
3165
3166/*
3167 * Holding the top-level event's child_mutex means that any
3168 * descendant process that has inherited this event will block
3169 * in sync_child_event if it goes to exit, thus satisfying the
3170 * task existence requirements of perf_event_enable/disable.
3171 */
3172static void perf_event_for_each_child(struct perf_event *event,
3173                                        void (*func)(struct perf_event *))
3174{
3175        struct perf_event *child;
3176
3177        WARN_ON_ONCE(event->ctx->parent_ctx);
3178        mutex_lock(&event->child_mutex);
3179        func(event);
3180        list_for_each_entry(child, &event->child_list, child_list)
3181                func(child);
3182        mutex_unlock(&event->child_mutex);
3183}
3184
3185static void perf_event_for_each(struct perf_event *event,
3186                                  void (*func)(struct perf_event *))
3187{
3188        struct perf_event_context *ctx = event->ctx;
3189        struct perf_event *sibling;
3190
3191        WARN_ON_ONCE(ctx->parent_ctx);
3192        mutex_lock(&ctx->mutex);
3193        event = event->group_leader;
3194
3195        perf_event_for_each_child(event, func);
3196        list_for_each_entry(sibling, &event->sibling_list, group_entry)
3197                perf_event_for_each_child(sibling, func);
3198        mutex_unlock(&ctx->mutex);
3199}
3200
3201static int perf_event_period(struct perf_event *event, u64 __user *arg)
3202{
3203        struct perf_event_context *ctx = event->ctx;
3204        int ret = 0;
3205        u64 value;
3206
3207        if (!is_sampling_event(event))
3208                return -EINVAL;
3209
3210        if (copy_from_user(&value, arg, sizeof(value)))
3211                return -EFAULT;
3212
3213        if (!value)
3214                return -EINVAL;
3215
3216        raw_spin_lock_irq(&ctx->lock);
3217        if (event->attr.freq) {
3218                if (value > sysctl_perf_event_sample_rate) {
3219                        ret = -EINVAL;
3220                        goto unlock;
3221                }
3222
3223                event->attr.sample_freq = value;
3224        } else {
3225                event->attr.sample_period = value;
3226                event->hw.sample_period = value;
3227        }
3228unlock:
3229        raw_spin_unlock_irq(&ctx->lock);
3230
3231        return ret;
3232}
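
/*
 * Userspace sketch for the handler above: PERF_EVENT_IOC_PERIOD passes a
 * pointer to a u64, which replaces sample_period (or sample_freq when the
 * event is in frequency mode). Assumes "fd" is a sampling perf event
 * descriptor.
 */
#if 0 /* usage sketch, not kernel code */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_sample_period(int fd, unsigned long long period)
{
        return ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
}
#endif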
3233
3234static const struct file_operations perf_fops;
3235
3236static struct file *perf_fget_light(int fd, int *fput_needed)
3237{
3238        struct file *file;
3239
3240        file = fget_light(fd, fput_needed);
3241        if (!file)
3242                return ERR_PTR(-EBADF);
3243
3244        if (file->f_op != &perf_fops) {
3245                fput_light(file, *fput_needed);
3246                *fput_needed = 0;
3247                return ERR_PTR(-EBADF);
3248        }
3249
3250        return file;
3251}
3252
3253static int perf_event_set_output(struct perf_event *event,
3254                                 struct perf_event *output_event);
3255static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3256
3257static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3258{
3259        struct perf_event *event = file->private_data;
3260        void (*func)(struct perf_event *);
3261        u32 flags = arg;
3262
3263        switch (cmd) {
3264        case PERF_EVENT_IOC_ENABLE:
3265                func = perf_event_enable;
3266                break;
3267        case PERF_EVENT_IOC_DISABLE:
3268                func = perf_event_disable;
3269                break;
3270        case PERF_EVENT_IOC_RESET:
3271                func = perf_event_reset;
3272                break;
3273
3274        case PERF_EVENT_IOC_REFRESH:
3275                return perf_event_refresh(event, arg);
3276
3277        case PERF_EVENT_IOC_PERIOD:
3278                return perf_event_period(event, (u64 __user *)arg);
3279
3280        case PERF_EVENT_IOC_SET_OUTPUT:
3281        {
3282                struct file *output_file = NULL;
3283                struct perf_event *output_event = NULL;
3284                int fput_needed = 0;
3285                int ret;
3286
3287                if (arg != -1) {
3288                        output_file = perf_fget_light(arg, &fput_needed);
3289                        if (IS_ERR(output_file))
3290                                return PTR_ERR(output_file);
3291                        output_event = output_file->private_data;
3292                }
3293
3294                ret = perf_event_set_output(event, output_event);
3295                if (output_event)
3296                        fput_light(output_file, fput_needed);
3297
3298                return ret;
3299        }
3300
3301        case PERF_EVENT_IOC_SET_FILTER:
3302                return perf_event_set_filter(event, (void __user *)arg);
3303
3304        default:
3305                return -ENOTTY;
3306        }
3307
3308        if (flags & PERF_IOC_FLAG_GROUP)
3309                perf_event_for_each(event, func);
3310        else
3311                perf_event_for_each_child(event, func);
3312
3313        return 0;
3314}
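
/*
 * Illustrative user-space usage (a sketch, not part of this file): because
 * PERF_IOC_FLAG_GROUP routes the operation through perf_event_for_each(),
 * a whole group can be toggled via the leader's fd.  "group_fd" is a
 * hypothetical file descriptor returned by perf_event_open(2):
 *
 *	ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 *	ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 */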
3315
3316int perf_event_task_enable(void)
3317{
3318        struct perf_event *event;
3319
3320        mutex_lock(&current->perf_event_mutex);
3321        list_for_each_entry(event, &current->perf_event_list, owner_entry)
3322                perf_event_for_each_child(event, perf_event_enable);
3323        mutex_unlock(&current->perf_event_mutex);
3324
3325        return 0;
3326}
3327
3328int perf_event_task_disable(void)
3329{
3330        struct perf_event *event;
3331
3332        mutex_lock(&current->perf_event_mutex);
3333        list_for_each_entry(event, &current->perf_event_list, owner_entry)
3334                perf_event_for_each_child(event, perf_event_disable);
3335        mutex_unlock(&current->perf_event_mutex);
3336
3337        return 0;
3338}
3339
3340static int perf_event_index(struct perf_event *event)
3341{
3342        if (event->hw.state & PERF_HES_STOPPED)
3343                return 0;
3344
3345        if (event->state != PERF_EVENT_STATE_ACTIVE)
3346                return 0;
3347
3348        return event->pmu->event_idx(event);
3349}
3350
3351static void calc_timer_values(struct perf_event *event,
3352                                u64 *now,
3353                                u64 *enabled,
3354                                u64 *running)
3355{
3356        u64 ctx_time;
3357
3358        *now = perf_clock();
3359        ctx_time = event->shadow_ctx_time + *now;
3360        *enabled = ctx_time - event->tstamp_enabled;
3361        *running = ctx_time - event->tstamp_running;
3362}
3363
3364void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3365{
3366}
3367
3368/*
3369 * Callers need to ensure there can be no nesting of this function, otherwise
3370 * the seqlock logic goes bad. We cannot serialize this because the arch
3371 * code calls this from NMI context.
3372 */
3373void perf_event_update_userpage(struct perf_event *event)
3374{
3375        struct perf_event_mmap_page *userpg;
3376        struct ring_buffer *rb;
3377        u64 enabled, running, now;
3378
3379        rcu_read_lock();
3380        /*
3381         * compute total_time_enabled, total_time_running
3382         * based on snapshot values taken when the event
3383         * was last scheduled in.
3384         *
3385         * we cannot simply call update_context_time()
3386         * because of locking issues, as we can be called in
3387         * NMI context
3388         */
3389        calc_timer_values(event, &now, &enabled, &running);
3390        rb = rcu_dereference(event->rb);
3391        if (!rb)
3392                goto unlock;
3393
3394        userpg = rb->user_page;
3395
3396        /*
3397         * Disable preemption so as to not let the corresponding user-space
3398         * spin too long if we get preempted.
3399         */
3400        preempt_disable();
3401        ++userpg->lock;
3402        barrier();
3403        userpg->index = perf_event_index(event);
3404        userpg->offset = perf_event_count(event);
3405        if (userpg->index)
3406                userpg->offset -= local64_read(&event->hw.prev_count);
3407
3408        userpg->time_enabled = enabled +
3409                        atomic64_read(&event->child_total_time_enabled);
3410
3411        userpg->time_running = running +
3412                        atomic64_read(&event->child_total_time_running);
3413
3414        arch_perf_update_userpage(userpg, now);
3415
3416        barrier();
3417        ++userpg->lock;
3418        preempt_enable();
3419unlock:
3420        rcu_read_unlock();
3421}
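
/*
 * Illustrative user-space counterpart (a sketch, not part of this file):
 * reading the mmap'ed control page consistently against the seqlock-style
 * protocol above.  Field names follow struct perf_event_mmap_page; "pc" is
 * assumed to point at the first mmap'ed page:
 *
 *	volatile struct perf_event_mmap_page *pc = mmap_base;
 *	u32 seq;
 *	u64 enabled, running;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		barrier();
 *	} while (pc->lock != seq || (seq & 1));
 */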
3422
3423static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3424{
3425        struct perf_event *event = vma->vm_file->private_data;
3426        struct ring_buffer *rb;
3427        int ret = VM_FAULT_SIGBUS;
3428
3429        if (vmf->flags & FAULT_FLAG_MKWRITE) {
3430                if (vmf->pgoff == 0)
3431                        ret = 0;
3432                return ret;
3433        }
3434
3435        rcu_read_lock();
3436        rb = rcu_dereference(event->rb);
3437        if (!rb)
3438                goto unlock;
3439
3440        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3441                goto unlock;
3442
3443        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3444        if (!vmf->page)
3445                goto unlock;
3446
3447        get_page(vmf->page);
3448        vmf->page->mapping = vma->vm_file->f_mapping;
3449        vmf->page->index   = vmf->pgoff;
3450
3451        ret = 0;
3452unlock:
3453        rcu_read_unlock();
3454
3455        return ret;
3456}
3457
3458static void ring_buffer_attach(struct perf_event *event,
3459                               struct ring_buffer *rb)
3460{
3461        unsigned long flags;
3462
3463        if (!list_empty(&event->rb_entry))
3464                return;
3465
3466        spin_lock_irqsave(&rb->event_lock, flags);
3467        if (!list_empty(&event->rb_entry))
3468                goto unlock;
3469
3470        list_add(&event->rb_entry, &rb->event_list);
3471unlock:
3472        spin_unlock_irqrestore(&rb->event_lock, flags);
3473}
3474
3475static void ring_buffer_detach(struct perf_event *event,
3476                               struct ring_buffer *rb)
3477{
3478        unsigned long flags;
3479
3480        if (list_empty(&event->rb_entry))
3481                return;
3482
3483        spin_lock_irqsave(&rb->event_lock, flags);
3484        list_del_init(&event->rb_entry);
3485        wake_up_all(&event->waitq);
3486        spin_unlock_irqrestore(&rb->event_lock, flags);
3487}
3488
3489static void ring_buffer_wakeup(struct perf_event *event)
3490{
3491        struct ring_buffer *rb;
3492
3493        rcu_read_lock();
3494        rb = rcu_dereference(event->rb);
3495        if (!rb)
3496                goto unlock;
3497
3498        list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3499                wake_up_all(&event->waitq);
3500
3501unlock:
3502        rcu_read_unlock();
3503}
3504
3505static void rb_free_rcu(struct rcu_head *rcu_head)
3506{
3507        struct ring_buffer *rb;
3508
3509        rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3510        rb_free(rb);
3511}
3512
3513static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3514{
3515        struct ring_buffer *rb;
3516
3517        rcu_read_lock();
3518        rb = rcu_dereference(event->rb);
3519        if (rb) {
3520                if (!atomic_inc_not_zero(&rb->refcount))
3521                        rb = NULL;
3522        }
3523        rcu_read_unlock();
3524
3525        return rb;
3526}
3527
3528static void ring_buffer_put(struct ring_buffer *rb)
3529{
3530        struct perf_event *event, *n;
3531        unsigned long flags;
3532
3533        if (!atomic_dec_and_test(&rb->refcount))
3534                return;
3535
3536        spin_lock_irqsave(&rb->event_lock, flags);
3537        list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3538                list_del_init(&event->rb_entry);
3539                wake_up_all(&event->waitq);
3540        }
3541        spin_unlock_irqrestore(&rb->event_lock, flags);
3542
3543        call_rcu(&rb->rcu_head, rb_free_rcu);
3544}
3545
3546static void perf_mmap_open(struct vm_area_struct *vma)
3547{
3548        struct perf_event *event = vma->vm_file->private_data;
3549
3550        atomic_inc(&event->mmap_count);
3551}
3552
3553static void perf_mmap_close(struct vm_area_struct *vma)
3554{
3555        struct perf_event *event = vma->vm_file->private_data;
3556
3557        if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3558                unsigned long size = perf_data_size(event->rb);
3559                struct user_struct *user = event->mmap_user;
3560                struct ring_buffer *rb = event->rb;
3561
3562                atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3563                vma->vm_mm->pinned_vm -= event->mmap_locked;
3564                rcu_assign_pointer(event->rb, NULL);
3565                ring_buffer_detach(event, rb);
3566                mutex_unlock(&event->mmap_mutex);
3567
3568                ring_buffer_put(rb);
3569                free_uid(user);
3570        }
3571}
3572
3573static const struct vm_operations_struct perf_mmap_vmops = {
3574        .open           = perf_mmap_open,
3575        .close          = perf_mmap_close,
3576        .fault          = perf_mmap_fault,
3577        .page_mkwrite   = perf_mmap_fault,
3578};
3579
3580static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3581{
3582        struct perf_event *event = file->private_data;
3583        unsigned long user_locked, user_lock_limit;
3584        struct user_struct *user = current_user();
3585        unsigned long locked, lock_limit;
3586        struct ring_buffer *rb;
3587        unsigned long vma_size;
3588        unsigned long nr_pages;
3589        long user_extra, extra;
3590        int ret = 0, flags = 0;
3591
3592        /*
3593         * Don't allow mmap() of inherited per-task counters. This would
3594         * create a performance issue due to all children writing to the
3595         * same rb.
3596         */
3597        if (event->cpu == -1 && event->attr.inherit)
3598                return -EINVAL;
3599
3600        if (!(vma->vm_flags & VM_SHARED))
3601                return -EINVAL;
3602
3603        vma_size = vma->vm_end - vma->vm_start;
3604        nr_pages = (vma_size / PAGE_SIZE) - 1;
3605
3606        /*
3607         * If we have rb pages, ensure they're a power-of-two number, so we
3608         * can do bitmasks instead of modulo.
3609         */
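        /*
         * Example (illustrative): with nr_pages == 8 the mapping spans
         * 1 + 8 pages (one control page plus the data pages), and an offset
         * can be reduced with "off & (nr_pages - 1)" instead of the more
         * expensive "off % nr_pages".
         */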
3610        if (nr_pages != 0 && !is_power_of_2(nr_pages))
3611                return -EINVAL;
3612
3613        if (vma_size != PAGE_SIZE * (1 + nr_pages))
3614                return -EINVAL;
3615
3616        if (vma->vm_pgoff != 0)
3617                return -EINVAL;
3618
3619        WARN_ON_ONCE(event->ctx->parent_ctx);
3620        mutex_lock(&event->mmap_mutex);
3621        if (event->rb) {
3622                if (event->rb->nr_pages == nr_pages)
3623                        atomic_inc(&event->rb->refcount);
3624                else
3625                        ret = -EINVAL;
3626                goto unlock;
3627        }
3628
3629        user_extra = nr_pages + 1;
3630        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3631
3632        /*
3633         * Increase the limit linearly with more CPUs:
3634         */
3635        user_lock_limit *= num_online_cpus();
3636
3637        user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3638
3639        extra = 0;
3640        if (user_locked > user_lock_limit)
3641                extra = user_locked - user_lock_limit;
3642
3643        lock_limit = rlimit(RLIMIT_MEMLOCK);
3644        lock_limit >>= PAGE_SHIFT;
3645        locked = vma->vm_mm->pinned_vm + extra;
3646
3647        if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3648                !capable(CAP_IPC_LOCK)) {
3649                ret = -EPERM;
3650                goto unlock;
3651        }
3652
3653        WARN_ON(event->rb);
3654
3655        if (vma->vm_flags & VM_WRITE)
3656                flags |= RING_BUFFER_WRITABLE;
3657
3658        rb = rb_alloc(nr_pages,
3659                event->attr.watermark ? event->attr.wakeup_watermark : 0,
3660                event->cpu, flags);
3661
3662        if (!rb) {
3663                ret = -ENOMEM;
3664                goto unlock;
3665        }
3666        rcu_assign_pointer(event->rb, rb);
3667
3668        atomic_long_add(user_extra, &user->locked_vm);
3669        event->mmap_locked = extra;
3670        event->mmap_user = get_current_user();
3671        vma->vm_mm->pinned_vm += event->mmap_locked;
3672
3673        perf_event_update_userpage(event);
3674
3675unlock:
3676        if (!ret)
3677                atomic_inc(&event->mmap_count);
3678        mutex_unlock(&event->mmap_mutex);
3679
3680        vma->vm_flags |= VM_RESERVED;
3681        vma->vm_ops = &perf_mmap_vmops;
3682
3683        return ret;
3684}
3685
3686static int perf_fasync(int fd, struct file *filp, int on)
3687{
3688        struct inode *inode = filp->f_path.dentry->d_inode;
3689        struct perf_event *event = filp->private_data;
3690        int retval;
3691
3692        mutex_lock(&inode->i_mutex);
3693        retval = fasync_helper(fd, filp, on, &event->fasync);
3694        mutex_unlock(&inode->i_mutex);
3695
3696        if (retval < 0)
3697                return retval;
3698
3699        return 0;
3700}
3701
3702static const struct file_operations perf_fops = {
3703        .llseek                 = no_llseek,
3704        .release                = perf_release,
3705        .read                   = perf_read,
3706        .poll                   = perf_poll,
3707        .unlocked_ioctl         = perf_ioctl,
3708        .compat_ioctl           = perf_ioctl,
3709        .mmap                   = perf_mmap,
3710        .fasync                 = perf_fasync,
3711};
3712
3713/*
3714 * Perf event wakeup
3715 *
3716 * If there's data, ensure we set the poll() state and publish everything
3717 * to user-space before waking everybody up.
3718 */
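
/*
 * Illustrative user-space counterpart (a sketch, not part of this file):
 * since the poll() state is published before the wakeup, a consumer can
 * block on the event fd and only then walk the ring buffer.  "perf_fd" and
 * consume_ring_buffer() are hypothetical:
 *
 *	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0)
 *		consume_ring_buffer();
 */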
3719
3720void perf_event_wakeup(struct perf_event *event)
3721{
3722        ring_buffer_wakeup(event);
3723
3724        if (event->pending_kill) {
3725                kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3726                event->pending_kill = 0;
3727        }
3728}
3729
3730static void perf_pending_event(struct irq_work *entry)
3731{
3732        struct perf_event *event = container_of(entry,
3733                        struct perf_event, pending);
3734
3735        if (event->pending_disable) {
3736                event->pending_disable = 0;
3737                __perf_event_disable(event);
3738        }
3739
3740        if (event->pending_wakeup) {
3741                event->pending_wakeup = 0;
3742                perf_event_wakeup(event);
3743        }
3744}
3745
3746/*
3747 * We assume there is only KVM supporting the callbacks.
3748 * Later on, we might change it to a list if there is
3749 * another virtualization implementation supporting the callbacks.
3750 */
3751struct perf_guest_info_callbacks *perf_guest_cbs;
3752
3753int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3754{
3755        perf_guest_cbs = cbs;
3756        return 0;
3757}
3758EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3759
3760int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3761{
3762        perf_guest_cbs = NULL;
3763        return 0;
3764}
3765EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
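
/*
 * Illustrative usage (a sketch, not part of this file): a hypervisor module
 * wiring up the guest callbacks.  The my_* helpers are hypothetical; the
 * members are those of struct perf_guest_info_callbacks:
 *
 *	static struct perf_guest_info_callbacks my_guest_cbs = {
 *		.is_in_guest	= my_is_in_guest,
 *		.is_user_mode	= my_is_user_mode,
 *		.get_guest_ip	= my_get_guest_ip,
 *	};
 *
 *	perf_register_guest_info_callbacks(&my_guest_cbs);
 *
 * and on teardown:
 *
 *	perf_unregister_guest_info_callbacks(&my_guest_cbs);
 */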
3766
3767static void __perf_event_header__init_id(struct perf_event_header *header,
3768                                         struct perf_sample_data *data,
3769                                         struct perf_event *event)
3770{
3771        u64 sample_type = event->attr.sample_type;
3772
3773        data->type = sample_type;
3774        header->size += event->id_header_size;
3775
3776        if (sample_type & PERF_SAMPLE_TID) {
3777                /* namespace issues */
3778                data->tid_entry.pid = perf_event_pid(event, current);
3779                data->tid_entry.tid = perf_event_tid(event, current);
3780        }
3781
3782        if (sample_type & PERF_SAMPLE_TIME)
3783                data->time = perf_clock();
3784
3785        if (sample_type & PERF_SAMPLE_ID)
3786                data->id = primary_event_id(event);
3787
3788        if (sample_type & PERF_SAMPLE_STREAM_ID)
3789                data->stream_id = event->id;
3790
3791        if (sample_type & PERF_SAMPLE_CPU) {
3792                data->cpu_entry.cpu      = raw_smp_processor_id();
3793                data->cpu_entry.reserved = 0;
3794        }
3795}
3796
3797void perf_event_header__init_id(struct perf_event_header *header,
3798                                struct perf_sample_data *data,
3799                                struct perf_event *event)
3800{
3801        if (event->attr.sample_id_all)
3802                __perf_event_header__init_id(header, data, event);
3803}
3804
3805static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3806                                           struct perf_sample_data *data)
3807{
3808        u64 sample_type = data->type;
3809
3810        if (sample_type & PERF_SAMPLE_TID)
3811                perf_output_put(handle, data->tid_entry);
3812
3813        if (sample_type & PERF_SAMPLE_TIME)
3814                perf_output_put(handle, data->time);
3815
3816        if (sample_type & PERF_SAMPLE_ID)
3817                perf_output_put(handle, data->id);
3818
3819        if (sample_type & PERF_SAMPLE_STREAM_ID)
3820                perf_output_put(handle, data->stream_id);
3821
3822        if (sample_type & PERF_SAMPLE_CPU)
3823                perf_output_put(handle, data->cpu_entry);
3824}
3825
3826void perf_event__output_id_sample(struct perf_event *event,
3827                                  struct perf_output_handle *handle,
3828                                  struct perf_sample_data *sample)
3829{
3830        if (event->attr.sample_id_all)
3831                __perf_event__output_id_sample(handle, sample);
3832}
3833
3834static void perf_output_read_one(struct perf_output_handle *handle,
3835                                 struct perf_event *event,
3836                                 u64 enabled, u64 running)
3837{
3838        u64 read_format = event->attr.read_format;
3839        u64 values[4];
3840        int n = 0;
3841
3842        values[n++] = perf_event_count(event);
3843        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3844                values[n++] = enabled +
3845                        atomic64_read(&event->child_total_time_enabled);
3846        }
3847        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3848                values[n++] = running +
3849                        atomic64_read(&event->child_total_time_running);
3850        }
3851        if (read_format & PERF_FORMAT_ID)
3852                values[n++] = primary_event_id(event);
3853
3854        __output_copy(handle, values, n * sizeof(u64));
3855}
3856
3857/*
3858 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3859 */
3860static void perf_output_read_group(struct perf_output_handle *handle,
3861                            struct perf_event *event,
3862                            u64 enabled, u64 running)
3863{
3864        struct perf_event *leader = event->group_leader, *sub;
3865        u64 read_format = event->attr.read_format;
3866        u64 values[5];
3867        int n = 0;
3868
3869        values[n++] = 1 + leader->nr_siblings;
3870
3871        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3872                values[n++] = enabled;
3873
3874        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3875                values[n++] = running;
3876
3877        if (leader != event)
3878                leader->pmu->read(leader);
3879
3880        values[n++] = perf_event_count(leader);
3881        if (read_format & PERF_FORMAT_ID)
3882                values[n++] = primary_event_id(leader);
3883
3884        __output_copy(handle, values, n * sizeof(u64));
3885
3886        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3887                n = 0;
3888
3889                if (sub != event)
3890                        sub->pmu->read(sub);
3891
3892                values[n++] = perf_event_count(sub);
3893                if (read_format & PERF_FORMAT_ID)
3894                        values[n++] = primary_event_id(sub);
3895
3896                __output_copy(handle, values, n * sizeof(u64));
3897        }
3898}
3899
3900#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3901                                 PERF_FORMAT_TOTAL_TIME_RUNNING)
3902
3903static void perf_output_read(struct perf_output_handle *handle,
3904                             struct perf_event *event)
3905{
3906        u64 enabled = 0, running = 0, now;
3907        u64 read_format = event->attr.read_format;
3908
3909        /*
3910         * compute total_time_enabled, total_time_running
3911         * based on snapshot values taken when the event
3912         * was last scheduled in.
3913         *
3914         * we cannot simply call update_context_time()
3915         * because of locking issues, as we are called in
3916         * NMI context
3917         */
3918        if (read_format & PERF_FORMAT_TOTAL_TIMES)
3919                calc_timer_values(event, &now, &enabled, &running);
3920
3921        if (event->attr.read_format & PERF_FORMAT_GROUP)
3922                perf_output_read_group(handle, event, enabled, running);
3923        else
3924                perf_output_read_one(handle, event, enabled, running);
3925}
3926
3927void perf_output_sample(struct perf_output_handle *handle,
3928                        struct perf_event_header *header,
3929                        struct perf_sample_data *data,
3930                        struct perf_event *event)
3931{
3932        u64 sample_type = data->type;
3933
3934        perf_output_put(handle, *header);
3935
3936        if (sample_type & PERF_SAMPLE_IP)
3937                perf_output_put(handle, data->ip);
3938
3939        if (sample_type & PERF_SAMPLE_TID)
3940                perf_output_put(handle, data->tid_entry);
3941
3942        if (sample_type & PERF_SAMPLE_TIME)
3943                perf_output_put(handle, data->time);
3944
3945        if (sample_type & PERF_SAMPLE_ADDR)
3946                perf_output_put(handle, data->addr);
3947
3948        if (sample_type & PERF_SAMPLE_ID)
3949                perf_output_put(handle, data->id);
3950
3951        if (sample_type & PERF_SAMPLE_STREAM_ID)
3952                perf_output_put(handle, data->stream_id);
3953
3954        if (sample_type & PERF_SAMPLE_CPU)
3955                perf_output_put(handle, data->cpu_entry);
3956
3957        if (sample_type & PERF_SAMPLE_PERIOD)
3958                perf_output_put(handle, data->period);
3959
3960        if (sample_type & PERF_SAMPLE_READ)
3961                perf_output_read(handle, event);
3962
3963        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3964                if (data->callchain) {
3965                        int size = 1;
3966
3967                        if (data->callchain)
3968                                size += data->callchain->nr;
3969
3970                        size *= sizeof(u64);
3971
3972                        __output_copy(handle, data->callchain, size);
3973                } else {
3974                        u64 nr = 0;
3975                        perf_output_put(handle, nr);
3976                }
3977        }
3978
3979        if (sample_type & PERF_SAMPLE_RAW) {
3980                if (data->raw) {
3981                        perf_output_put(handle, data->raw->size);
3982                        __output_copy(handle, data->raw->data,
3983                                           data->raw->size);
3984                } else {
3985                        struct {
3986                                u32     size;
3987                                u32     data;
3988                        } raw = {
3989                                .size = sizeof(u32),
3990                                .data = 0,
3991                        };
3992                        perf_output_put(handle, raw);
3993                }
3994        }
3995
3996        if (!event->attr.watermark) {
3997                int wakeup_events = event->attr.wakeup_events;
3998
3999                if (wakeup_events) {
4000                        struct ring_buffer *rb = handle->rb;
4001                        int events = local_inc_return(&rb->events);
4002
4003                        if (events >= wakeup_events) {
4004                                local_sub(wakeup_events, &rb->events);
4005                                local_inc(&rb->wakeup);
4006                        }
4007                }
4008        }
4009
4010        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4011                if (data->br_stack) {
4012                        size_t size;
4013
4014                        size = data->br_stack->nr
4015                             * sizeof(struct perf_branch_entry);
4016
4017                        perf_output_put(handle, data->br_stack->nr);
4018                        perf_output_copy(handle, data->br_stack->entries, size);
4019                } else {
4020                        /*
4021                         * we always store at least the value of nr
4022                         */
4023                        u64 nr = 0;
4024                        perf_output_put(handle, nr);
4025                }
4026        }
4027}
4028
4029void perf_prepare_sample(struct perf_event_header *header,
4030                         struct perf_sample_data *data,
4031                         struct perf_event *event,
4032                         struct pt_regs *regs)
4033{
4034        u64 sample_type = event->attr.sample_type;
4035
4036        header->type = PERF_RECORD_SAMPLE;
4037        header->size = sizeof(*header) + event->header_size;
4038
4039        header->misc = 0;
4040        header->misc |= perf_misc_flags(regs);
4041
4042        __perf_event_header__init_id(header, data, event);
4043
4044        if (sample_type & PERF_SAMPLE_IP)
4045                data->ip = perf_instruction_pointer(regs);
4046
4047        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4048                int size = 1;
4049
4050                data->callchain = perf_callchain(event, regs);
4051
4052                if (data->callchain)
4053                        size += data->callchain->nr;
4054
4055                header->size += size * sizeof(u64);
4056        }
4057
4058        if (sample_type & PERF_SAMPLE_RAW) {
4059                int size = sizeof(u32);
4060
4061                if (data->raw)
4062                        size += data->raw->size;
4063                else
4064                        size += sizeof(u32);
4065
4066                WARN_ON_ONCE(size & (sizeof(u64)-1));
4067                header->size += size;
4068        }
4069
4070        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4071                int size = sizeof(u64); /* nr */
4072                if (data->br_stack) {
4073                        size += data->br_stack->nr
4074                              * sizeof(struct perf_branch_entry);
4075                }
4076                header->size += size;
4077        }
4078}
4079
4080static void perf_event_output(struct perf_event *event,
4081                                struct perf_sample_data *data,
4082                                struct pt_regs *regs)
4083{
4084        struct perf_output_handle handle;
4085        struct perf_event_header header;
4086
4087        /* protect the callchain buffers */
4088        rcu_read_lock();
4089
4090        perf_prepare_sample(&header, data, event, regs);
4091
4092        if (perf_output_begin(&handle, event, header.size))
4093                goto exit;
4094
4095        perf_output_sample(&handle, &header, data, event);
4096
4097        perf_output_end(&handle);
4098
4099exit:
4100        rcu_read_unlock();
4101}
4102
4103/*
4104 * read event_id
4105 */
4106
4107struct perf_read_event {
4108        struct perf_event_header        header;
4109
4110        u32                             pid;
4111        u32                             tid;
4112};
4113
4114static void
4115perf_event_read_event(struct perf_event *event,
4116                        struct task_struct *task)
4117{
4118        struct perf_output_handle handle;
4119        struct perf_sample_data sample;
4120        struct perf_read_event read_event = {
4121                .header = {
4122                        .type = PERF_RECORD_READ,
4123                        .misc = 0,
4124                        .size = sizeof(read_event) + event->read_size,
4125                },
4126                .pid = perf_event_pid(event, task),
4127                .tid = perf_event_tid(event, task),
4128        };
4129        int ret;
4130
4131        perf_event_header__init_id(&read_event.header, &sample, event);
4132        ret = perf_output_begin(&handle, event, read_event.header.size);
4133        if (ret)
4134                return;
4135
4136        perf_output_put(&handle, read_event);
4137        perf_output_read(&handle, event);
4138        perf_event__output_id_sample(event, &handle, &sample);
4139
4140        perf_output_end(&handle);
4141}
4142
4143/*
4144 * task tracking -- fork/exit
4145 *
4146 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
4147 */
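
/*
 * Illustrative user-space attr setup (a sketch, not part of this file) that
 * requests these side-band records; field names are those of
 * struct perf_event_attr:
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_CPU_CLOCK,
 *		.size	= sizeof(attr),
 *		.task	= 1,
 *		.comm	= 1,
 *		.mmap	= 1,
 *	};
 */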
4148
4149struct perf_task_event {
4150        struct task_struct              *task;
4151        struct perf_event_context       *task_ctx;
4152
4153        struct {
4154                struct perf_event_header        header;
4155
4156                u32                             pid;
4157                u32                             ppid;
4158                u32                             tid;
4159                u32                             ptid;
4160                u64                             time;
4161        } event_id;
4162};
4163
4164static void perf_event_task_output(struct perf_event *event,
4165                                     struct perf_task_event *task_event)
4166{
4167        struct perf_output_handle handle;
4168        struct perf_sample_data sample;
4169        struct task_struct *task = task_event->task;
4170        int ret, size = task_event->event_id.header.size;
4171
4172        perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4173
4174        ret = perf_output_begin(&handle, event,
4175                                task_event->event_id.header.size);
4176        if (ret)
4177                goto out;
4178
4179        task_event->event_id.pid = perf_event_pid(event, task);
4180        task_event->event_id.ppid = perf_event_pid(event, current);
4181
4182        task_event->event_id.tid = perf_event_tid(event, task);
4183        task_event->event_id.ptid = perf_event_tid(event, current);
4184
4185        perf_output_put(&handle, task_event->event_id);
4186
4187        perf_event__output_id_sample(event, &handle, &sample);
4188
4189        perf_output_end(&handle);
4190out:
4191        task_event->event_id.header.size = size;
4192}
4193
4194static int perf_event_task_match(struct perf_event *event)
4195{
4196        if (event->state < PERF_EVENT_STATE_INACTIVE)
4197                return 0;
4198
4199        if (!event_filter_match(event))
4200                return 0;
4201
4202        if (event->attr.comm || event->attr.mmap ||
4203            event->attr.mmap_data || event->attr.task)
4204                return 1;
4205
4206        return 0;
4207}
4208
4209static void perf_event_task_ctx(struct perf_event_context *ctx,
4210                                  struct perf_task_event *task_event)
4211{
4212        struct perf_event *event;
4213
4214        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4215                if (perf_event_task_match(event))
4216                        perf_event_task_output(event, task_event);
4217        }
4218}
4219
4220static void perf_event_task_event(struct perf_task_event *task_event)
4221{
4222        struct perf_cpu_context *cpuctx;
4223        struct perf_event_context *ctx;
4224        struct pmu *pmu;
4225        int ctxn;
4226
4227        rcu_read_lock();
4228        list_for_each_entry_rcu(pmu, &pmus, entry) {
4229                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4230                if (cpuctx->active_pmu != pmu)
4231                        goto next;
4232                perf_event_task_ctx(&cpuctx->ctx, task_event);
4233
4234                ctx = task_event->task_ctx;
4235                if (!ctx) {
4236                        ctxn = pmu->task_ctx_nr;
4237                        if (ctxn < 0)
4238                                goto next;
4239                        ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4240                }
4241                if (ctx)
4242                        perf_event_task_ctx(ctx, task_event);
4243next:
4244                put_cpu_ptr(pmu->pmu_cpu_context);
4245        }
4246        rcu_read_unlock();
4247}
4248
4249static void perf_event_task(struct task_struct *task,
4250                              struct perf_event_context *task_ctx,
4251                              int new)
4252{
4253        struct perf_task_event task_event;
4254
4255        if (!atomic_read(&nr_comm_events) &&
4256            !atomic_read(&nr_mmap_events) &&
4257            !atomic_read(&nr_task_events))
4258                return;
4259
4260        task_event = (struct perf_task_event){
4261                .task     = task,
4262                .task_ctx = task_ctx,
4263                .event_id    = {
4264                        .header = {
4265                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4266                                .misc = 0,
4267                                .size = sizeof(task_event.event_id),
4268                        },
4269                        /* .pid  */
4270                        /* .ppid */
4271                        /* .tid  */
4272                        /* .ptid */
4273                        .time = perf_clock(),
4274                },
4275        };
4276
4277        perf_event_task_event(&task_event);
4278}
4279
4280void perf_event_fork(struct task_struct *task)
4281{
4282        perf_event_task(task, NULL, 1);
4283}
4284
4285/*
4286 * comm tracking
4287 */
4288
4289struct perf_comm_event {
4290        struct task_struct      *task;
4291        char                    *comm;
4292        int                     comm_size;
4293
4294        struct {
4295                struct perf_event_header        header;
4296
4297                u32                             pid;
4298                u32                             tid;
4299        } event_id;
4300};
4301
4302static void perf_event_comm_output(struct perf_event *event,
4303                                     struct perf_comm_event *comm_event)
4304{
4305        struct perf_output_handle handle;
4306        struct perf_sample_data sample;
4307        int size = comm_event->event_id.header.size;
4308        int ret;
4309
4310        perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4311        ret = perf_output_begin(&handle, event,
4312                                comm_event->event_id.header.size);
4313
4314        if (ret)
4315                goto out;
4316
4317        comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4318        comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4319
4320        perf_output_put(&handle, comm_event->event_id);
4321        __output_copy(&handle, comm_event->comm,
4322                                   comm_event->comm_size);
4323
4324        perf_event__output_id_sample(event, &handle, &sample);
4325
4326        perf_output_end(&handle);
4327out:
4328        comm_event->event_id.header.size = size;
4329}
4330
4331static int perf_event_comm_match(struct perf_event *event)
4332{
4333        if (event->state < PERF_EVENT_STATE_INACTIVE)
4334                return 0;
4335
4336        if (!event_filter_match(event))
4337                return 0;
4338
4339        if (event->attr.comm)
4340                return 1;
4341
4342        return 0;
4343}
4344
4345static void perf_event_comm_ctx(struct perf_event_context *ctx,
4346                                  struct perf_comm_event *comm_event)
4347{
4348        struct perf_event *event;
4349
4350        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4351                if (perf_event_comm_match(event))
4352                        perf_event_comm_output(event, comm_event);
4353        }
4354}
4355
4356static void perf_event_comm_event(struct perf_comm_event *comm_event)
4357{
4358        struct perf_cpu_context *cpuctx;
4359        struct perf_event_context *ctx;
4360        char comm[TASK_COMM_LEN];
4361        unsigned int size;
4362        struct pmu *pmu;
4363        int ctxn;
4364
4365        memset(comm, 0, sizeof(comm));
4366        strlcpy(comm, comm_event->task->comm, sizeof(comm));
4367        size = ALIGN(strlen(comm)+1, sizeof(u64));
4368
4369        comm_event->comm = comm;
4370        comm_event->comm_size = size;
4371
4372        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4373        rcu_read_lock();
4374        list_for_each_entry_rcu(pmu, &pmus, entry) {
4375                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4376                if (cpuctx->active_pmu != pmu)
4377                        goto next;
4378                perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4379
4380                ctxn = pmu->task_ctx_nr;
4381                if (ctxn < 0)
4382                        goto next;
4383
4384                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4385                if (ctx)
4386                        perf_event_comm_ctx(ctx, comm_event);
4387next:
4388                put_cpu_ptr(pmu->pmu_cpu_context);
4389        }
4390        rcu_read_unlock();
4391}
4392
4393void perf_event_comm(struct task_struct *task)
4394{
4395        struct perf_comm_event comm_event;
4396        struct perf_event_context *ctx;
4397        int ctxn;
4398
4399        for_each_task_context_nr(ctxn) {
4400                ctx = task->perf_event_ctxp[ctxn];
4401                if (!ctx)
4402                        continue;
4403
4404                perf_event_enable_on_exec(ctx);
4405        }
4406
4407        if (!atomic_read(&nr_comm_events))
4408                return;
4409
4410        comm_event = (struct perf_comm_event){
4411                .task   = task,
4412                /* .comm      */
4413                /* .comm_size */
4414                .event_id  = {
4415                        .header = {
4416                                .type = PERF_RECORD_COMM,
4417                                .misc = 0,
4418                                /* .size */
4419                        },
4420                        /* .pid */
4421                        /* .tid */
4422                },
4423        };
4424
4425        perf_event_comm_event(&comm_event);
4426}
4427
4428/*
4429 * mmap tracking
4430 */
4431
4432struct perf_mmap_event {
4433        struct vm_area_struct   *vma;
4434
4435        const char              *file_name;
4436        int                     file_size;
4437
4438        struct {
4439                struct perf_event_header        header;
4440
4441                u32                             pid;
4442                u32                             tid;
4443                u64                             start;
4444                u64                             len;
4445                u64                             pgoff;
4446        } event_id;
4447};
4448
4449static void perf_event_mmap_output(struct perf_event *event,
4450                                     struct perf_mmap_event *mmap_event)
4451{
4452        struct perf_output_handle handle;
4453        struct perf_sample_data sample;
4454        int size = mmap_event->event_id.header.size;
4455        int ret;
4456
4457        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4458        ret = perf_output_begin(&handle, event,
4459                                mmap_event->event_id.header.size);
4460        if (ret)
4461                goto out;
4462
4463        mmap_event->event_id.pid = perf_event_pid(event, current);
4464        mmap_event->event_id.tid = perf_event_tid(event, current);
4465
4466        perf_output_put(&handle, mmap_event->event_id);
4467        __output_copy(&handle, mmap_event->file_name,
4468                                   mmap_event->file_size);
4469
4470        perf_event__output_id_sample(event, &handle, &sample);
4471
4472        perf_output_end(&handle);
4473out:
4474        mmap_event->event_id.header.size = size;
4475}
4476
4477static int perf_event_mmap_match(struct perf_event *event,
4478                                   struct perf_mmap_event *mmap_event,
4479                                   int executable)
4480{
4481        if (event->state < PERF_EVENT_STATE_INACTIVE)
4482                return 0;
4483
4484        if (!event_filter_match(event))
4485                return 0;
4486
4487        if ((!executable && event->attr.mmap_data) ||
4488            (executable && event->attr.mmap))
4489                return 1;
4490
4491        return 0;
4492}
4493
4494static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4495                                  struct perf_mmap_event *mmap_event,
4496                                  int executable)
4497{
4498        struct perf_event *event;
4499
4500        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4501                if (perf_event_mmap_match(event, mmap_event, executable))
4502                        perf_event_mmap_output(event, mmap_event);
4503        }
4504}
4505
4506static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4507{
4508        struct perf_cpu_context *cpuctx;
4509        struct perf_event_context *ctx;
4510        struct vm_area_struct *vma = mmap_event->vma;
4511        struct file *file = vma->vm_file;
4512        unsigned int size;
4513        char tmp[16];
4514        char *buf = NULL;
4515        const char *name;
4516        struct pmu *pmu;
4517        int ctxn;
4518
4519        memset(tmp, 0, sizeof(tmp));
4520
4521        if (file) {
4522                /*
4523                 * d_path works from the end of the buffer backwards, so we
4524                 * need to add enough zero bytes after the string to handle
4525                 * the 64bit alignment we do later.
4526                 */
4527                buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4528                if (!buf) {
4529                        name = strncpy(tmp, "//enomem", sizeof(tmp));
4530                        goto got_name;
4531                }
4532                name = d_path(&file->f_path, buf, PATH_MAX);
4533                if (IS_ERR(name)) {
4534                        name = strncpy(tmp, "//toolong", sizeof(tmp));
4535                        goto got_name;
4536                }
4537        } else {
4538                if (arch_vma_name(mmap_event->vma)) {
4539                        name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4540                                       sizeof(tmp));
4541                        goto got_name;
4542                }
4543
4544                if (!vma->vm_mm) {
4545                        name = strncpy(tmp, "[vdso]", sizeof(tmp));
4546                        goto got_name;
4547                } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4548                                vma->vm_end >= vma->vm_mm->brk) {
4549                        name = strncpy(tmp, "[heap]", sizeof(tmp));
4550                        goto got_name;
4551                } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4552                                vma->vm_end >= vma->vm_mm->start_stack) {
4553                        name = strncpy(tmp, "[stack]", sizeof(tmp));
4554                        goto got_name;
4555                }
4556
4557                name = strncpy(tmp, "//anon", sizeof(tmp));
4558                goto got_name;
4559        }
4560
4561got_name:
4562        size = ALIGN(strlen(name)+1, sizeof(u64));
4563
4564        mmap_event->file_name = name;
4565        mmap_event->file_size = size;
4566
4567        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4568
4569        rcu_read_lock();
4570        list_for_each_entry_rcu(pmu, &pmus, entry) {
4571                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4572                if (cpuctx->active_pmu != pmu)
4573                        goto next;
4574                perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4575                                        vma->vm_flags & VM_EXEC);
4576
4577                ctxn = pmu->task_ctx_nr;
4578                if (ctxn < 0)
4579                        goto next;
4580
4581                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4582                if (ctx) {
4583                        perf_event_mmap_ctx(ctx, mmap_event,
4584                                        vma->vm_flags & VM_EXEC);
4585                }
4586next:
4587                put_cpu_ptr(pmu->pmu_cpu_context);
4588        }
4589        rcu_read_unlock();
4590
4591        kfree(buf);
4592}
4593
4594void perf_event_mmap(struct vm_area_struct *vma)
4595{
4596        struct perf_mmap_event mmap_event;
4597
4598        if (!atomic_read(&nr_mmap_events))
4599                return;
4600
4601        mmap_event = (struct perf_mmap_event){
4602                .vma    = vma,
4603                /* .file_name */
4604                /* .file_size */
4605                .event_id  = {
4606                        .header = {
4607                                .type = PERF_RECORD_MMAP,
4608                                .misc = PERF_RECORD_MISC_USER,
4609                                /* .size */
4610                        },
4611                        /* .pid */
4612                        /* .tid */
4613                        .start  = vma->vm_start,
4614                        .len    = vma->vm_end - vma->vm_start,
4615                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
4616                },
4617        };
4618
4619        perf_event_mmap_event(&mmap_event);
4620}
4621
4622/*
4623 * IRQ throttle logging
4624 */
4625
4626static void perf_log_throttle(struct perf_event *event, int enable)
4627{
4628        struct perf_output_handle handle;
4629        struct perf_sample_data sample;
4630        int ret;
4631
4632        struct {
4633                struct perf_event_header        header;
4634                u64                             time;
4635                u64                             id;
4636                u64                             stream_id;
4637        } throttle_event = {
4638                .header = {
4639                        .type = PERF_RECORD_THROTTLE,
4640                        .misc = 0,
4641                        .size = sizeof(throttle_event),
4642                },
4643                .time           = perf_clock(),
4644                .id             = primary_event_id(event),
4645                .stream_id      = event->id,
4646        };
4647
4648        if (enable)
4649                throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4650
4651        perf_event_header__init_id(&throttle_event.header, &sample, event);
4652
4653        ret = perf_output_begin(&handle, event,
4654                                throttle_event.header.size);
4655        if (ret)
4656                return;
4657
4658        perf_output_put(&handle, throttle_event);
4659        perf_event__output_id_sample(event, &handle, &sample);
4660        perf_output_end(&handle);
4661}
4662
4663/*
4664 * Generic event overflow handling, sampling.
4665 */
4666
4667static int __perf_event_overflow(struct perf_event *event,
4668                                   int throttle, struct perf_sample_data *data,
4669                                   struct pt_regs *regs)
4670{
4671        int events = atomic_read(&event->event_limit);
4672        struct hw_perf_event *hwc = &event->hw;
4673        u64 seq;
4674        int ret = 0;
4675
4676        /*
4677         * Non-sampling counters might still use the PMI to fold short
4678         * hardware counters; ignore those.
4679         */
4680        if (unlikely(!is_sampling_event(event)))
4681                return 0;
4682
4683        seq = __this_cpu_read(perf_throttled_seq);
4684        if (seq != hwc->interrupts_seq) {
4685                hwc->interrupts_seq = seq;
4686                hwc->interrupts = 1;
4687        } else {
4688                hwc->interrupts++;
4689                if (unlikely(throttle
4690                             && hwc->interrupts >= max_samples_per_tick)) {
4691                        __this_cpu_inc(perf_throttled_count);
4692                        hwc->interrupts = MAX_INTERRUPTS;
4693                        perf_log_throttle(event, 0);
4694                        ret = 1;
4695                }
4696        }
4697
4698        if (event->attr.freq) {
4699                u64 now = perf_clock();
4700                s64 delta = now - hwc->freq_time_stamp;
4701
4702                hwc->freq_time_stamp = now;
4703
4704                if (delta > 0 && delta < 2*TICK_NSEC)
4705                        perf_adjust_period(event, delta, hwc->last_period, true);
4706        }
4707
4708        /*
4709         * XXX event_limit might not quite work as expected on inherited
4710         * events
4711         */
4712
4713        event->pending_kill = POLL_IN;
4714        if (events && atomic_dec_and_test(&event->event_limit)) {
4715                ret = 1;
4716                event->pending_kill = POLL_HUP;
4717                event->pending_disable = 1;
4718                irq_work_queue(&event->pending);
4719        }
4720
4721        if (event->overflow_handler)
4722                event->overflow_handler(event, data, regs);
4723        else
4724                perf_event_output(event, data, regs);
4725
4726        if (event->fasync && event->pending_kill) {
4727                event->pending_wakeup = 1;
4728                irq_work_queue(&event->pending);
4729        }
4730
4731        return ret;
4732}
4733
4734int perf_event_overflow(struct perf_event *event,
4735                          struct perf_sample_data *data,
4736                          struct pt_regs *regs)
4737{
4738        return __perf_event_overflow(event, 1, data, regs);
4739}
4740
4741/*
4742 * Generic software event infrastructure
4743 */
4744
4745struct swevent_htable {
4746        struct swevent_hlist            *swevent_hlist;
4747        struct mutex                    hlist_mutex;
4748        int                             hlist_refcount;
4749
4750        /* Recursion avoidance in each context */
4751        int                             recursion[PERF_NR_CONTEXTS];
4752};
4753
4754static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4755
4756/*
4757 * We directly increment event->count and keep a second value in
4758 * event->hw.period_left to count intervals. This period value
4759 * is kept in the range [-sample_period, 0] so that we can use the
4760 * sign as trigger.
4761 */
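
/*
 * Worked example (illustrative only): with sample_period == 4, the adds in
 * perf_swevent_event() drive period_left from -4 towards 0; once an add
 * leaves it at, say, +1, perf_swevent_set_period() computes
 * nr = (4 + 1) / 4 == 1 elapsed period and rewinds period_left to
 * 1 - 1 * 4 == -3.
 */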
4762
4763static u64 perf_swevent_set_period(struct perf_event *event)
4764{
4765        struct hw_perf_event *hwc = &event->hw;
4766        u64 period = hwc->last_period;
4767        u64 nr, offset;
4768        s64 old, val;
4769
4770        hwc->last_period = hwc->sample_period;
4771
4772again:
4773        old = val = local64_read(&hwc->period_left);
4774        if (val < 0)
4775                return 0;
4776
4777        nr = div64_u64(period + val, period);
4778        offset = nr * period;
4779        val -= offset;
4780        if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4781                goto again;
4782
4783        return nr;
4784}
4785
4786static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4787                                    struct perf_sample_data *data,
4788                                    struct pt_regs *regs)
4789{
4790        struct hw_perf_event *hwc = &event->hw;
4791        int throttle = 0;
4792
4793        if (!overflow)
4794                overflow = perf_swevent_set_period(event);
4795
4796        if (hwc->interrupts == MAX_INTERRUPTS)
4797                return;
4798
4799        for (; overflow; overflow--) {
4800                if (__perf_event_overflow(event, throttle,
4801                                            data, regs)) {
4802                        /*
4803                         * We inhibit the overflow from happening when
4804                         * hwc->interrupts == MAX_INTERRUPTS.
4805                         */
4806                        break;
4807                }
4808                throttle = 1;
4809        }
4810}
4811
4812static void perf_swevent_event(struct perf_event *event, u64 nr,
4813                               struct perf_sample_data *data,
4814                               struct pt_regs *regs)
4815{
4816        struct hw_perf_event *hwc = &event->hw;
4817
4818        local64_add(nr, &event->count);
4819
4820        if (!regs)
4821                return;
4822
4823        if (!is_sampling_event(event))
4824                return;
4825
4826        if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4827                data->period = nr;
4828                return perf_swevent_overflow(event, 1, data, regs);
4829        } else
4830                data->period = event->hw.last_period;
4831
4832        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4833                return perf_swevent_overflow(event, 1, data, regs);
4834
4835        if (local64_add_negative(nr, &hwc->period_left))
4836                return;
4837
4838        perf_swevent_overflow(event, 0, data, regs);
4839}
4840
4841static int perf_exclude_event(struct perf_event *event,
4842                              struct pt_regs *regs)
4843{
4844        if (event->hw.state & PERF_HES_STOPPED)
4845                return 1;
4846
4847        if (regs) {
4848                if (event->attr.exclude_user && user_mode(regs))
4849                        return 1;
4850
4851                if (event->attr.exclude_kernel && !user_mode(regs))
4852                        return 1;
4853        }
4854
4855        return 0;
4856}
4857
4858static int perf_swevent_match(struct perf_event *event,
4859                                enum perf_type_id type,
4860                                u32 event_id,
4861                                struct perf_sample_data *data,
4862                                struct pt_regs *regs)
4863{
4864        if (event->attr.type != type)
4865                return 0;
4866
4867        if (event->attr.config != event_id)
4868                return 0;
4869
4870        if (perf_exclude_event(event, regs))
4871                return 0;
4872
4873        return 1;
4874}
4875
4876static inline u64 swevent_hash(u64 type, u32 event_id)
4877{
4878        u64 val = event_id | (type << 32);
4879
4880        return hash_64(val, SWEVENT_HLIST_BITS);
4881}
4882
4883static inline struct hlist_head *
4884__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4885{
4886        u64 hash = swevent_hash(type, event_id);
4887
4888        return &hlist->heads[hash];
4889}
4890
4891/* For the read side: hlist lookup at event trigger time */
4892static inline struct hlist_head *
4893find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4894{
4895        struct swevent_hlist *hlist;
4896
4897        hlist = rcu_dereference(swhash->swevent_hlist);
4898        if (!hlist)
4899                return NULL;
4900
4901        return __find_swevent_head(hlist, type, event_id);
4902}
4903
4904/* For the event head insertion and removal in the hlist */
4905static inline struct hlist_head *
4906find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4907{
4908        struct swevent_hlist *hlist;
4909        u32 event_id = event->attr.config;
4910        u64 type = event->attr.type;
4911
4912        /*
4913         * Event scheduling is always serialized against hlist allocation
4914         * and release, and the context lock guarantees that.  This makes
4915         * the lockdep-protected dereference suitable here.
4916         */
4917        hlist = rcu_dereference_protected(swhash->swevent_hlist,
4918                                          lockdep_is_held(&event->ctx->lock));
4919        if (!hlist)
4920                return NULL;
4921
4922        return __find_swevent_head(hlist, type, event_id);
4923}
4924
4925static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4926                                    u64 nr,
4927                                    struct perf_sample_data *data,
4928                                    struct pt_regs *regs)
4929{
4930        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4931        struct perf_event *event;
4932        struct hlist_node *node;
4933        struct hlist_head *head;
4934
4935        rcu_read_lock();
4936        head = find_swevent_head_rcu(swhash, type, event_id);
4937        if (!head)
4938                goto end;
4939
4940        hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4941                if (perf_swevent_match(event, type, event_id, data, regs))
4942                        perf_swevent_event(event, nr, data, regs);
4943        }
4944end:
4945        rcu_read_unlock();
4946}
4947
4948int perf_swevent_get_recursion_context(void)
4949{
4950        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4951
4952        return get_recursion_context(swhash->recursion);
4953}
4954EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4955
4956inline void perf_swevent_put_recursion_context(int rctx)
4957{
4958        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4959
4960        put_recursion_context(swhash->recursion, rctx);
4961}
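/*
 * The recursion counters used above hold one slot per execution level
 * (task, softirq, hardirq, NMI): a software event that fires while another
 * one is already being processed at the same level on this CPU is dropped,
 * while events from a nested level (e.g. an NMI interrupting task context)
 * still get through.
 */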
4962
4963void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
4964{
4965        struct perf_sample_data data;
4966        int rctx;
4967
4968        preempt_disable_notrace();
4969        rctx = perf_swevent_get_recursion_context();
4970        if (rctx < 0)
4971                return;
4972
4973        perf_sample_data_init(&data, addr, 0);
4974
4975        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
4976
4977        perf_swevent_put_recursion_context(rctx);
4978        preempt_enable_notrace();
4979}
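/*
 * Typical call site (sketch): the perf_sw_event() wrapper in
 * <linux/perf_event.h> tests the perf_swevent_enabled static key for the
 * event id and only then calls __perf_sw_event().  Architecture fault
 * handlers account page faults along these lines:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 */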
4980
4981static void perf_swevent_read(struct perf_event *event)
4982{
4983}
4984
4985static int perf_swevent_add(struct perf_event *event, int flags)
4986{
4987        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4988        struct hw_perf_event *hwc = &event->hw;
4989        struct hlist_head *head;
4990
4991        if (is_sampling_event(event)) {
4992                hwc->last_period = hwc->sample_period;
4993                perf_swevent_set_period(event);
4994        }
4995
4996        hwc->state = !(flags & PERF_EF_START);
4997
4998        head = find_swevent_head(swhash, event);
4999        if (WARN_ON_ONCE(!head))
5000                return -EINVAL;
5001
5002        hlist_add_head_rcu(&event->hlist_entry, head);
5003
5004        return 0;
5005}
5006
5007static void perf_swevent_del(struct perf_event *event, int flags)
5008{
5009        hlist_del_rcu(&event->hlist_entry);
5010}
5011
5012static void perf_swevent_start(struct perf_event *event, int flags)
5013{
5014        event->hw.state = 0;
5015}
5016
5017static void perf_swevent_stop(struct perf_event *event, int flags)
5018{
5019        event->hw.state = PERF_HES_STOPPED;
5020}
5021
5022/* Deref the hlist from the update side */
5023static inline struct swevent_hlist *
5024swevent_hlist_deref(struct swevent_htable *swhash)
5025{
5026        return rcu_dereference_protected(swhash->swevent_hlist,
5027                                         lockdep_is_held(&swhash->hlist_mutex));
5028}
5029
5030static void swevent_hlist_release(struct swevent_htable *swhash)
5031{
5032        struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
5033
5034        if (!hlist)
5035                return;
5036
5037        rcu_assign_pointer(swhash->swevent_hlist, NULL);
5038        kfree_rcu(hlist, rcu_head);
5039}
5040
5041static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5042{
5043        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5044
5045        mutex_lock(&swhash->hlist_mutex);
5046
5047        if (!--swhash->hlist_refcount)
5048                swevent_hlist_release(swhash);
5049
5050        mutex_unlock(&swhash->hlist_mutex);
5051}
5052
5053static void swevent_hlist_put(struct perf_event *event)
5054{
5055        int cpu;
5056
5057        if (event->cpu != -1) {
5058                swevent_hlist_put_cpu(event, event->cpu);
5059                return;
5060        }
5061
5062        for_each_possible_cpu(cpu)
5063                swevent_hlist_put_cpu(event, cpu);
5064}
5065
5066static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5067{
5068        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5069        int err = 0;
5070
5071        mutex_lock(&swhash->hlist_mutex);
5072
5073        if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
5074                struct swevent_hlist *hlist;
5075
5076                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5077                if (!hlist) {
5078                        err = -ENOMEM;
5079                        goto exit;
5080                }
5081                rcu_assign_pointer(swhash->swevent_hlist, hlist);
5082        }
5083        swhash->hlist_refcount++;
5084exit:
5085        mutex_unlock(&swhash->hlist_mutex);
5086
5087        return err;
5088}
5089
5090static int swevent_hlist_get(struct perf_event *event)
5091{
5092        int err;
5093        int cpu, failed_cpu;
5094
5095        if (event->cpu != -1)
5096                return swevent_hlist_get_cpu(event, event->cpu);
5097
5098        get_online_cpus();
5099        for_each_possible_cpu(cpu) {
5100                err = swevent_hlist_get_cpu(event, cpu);
5101                if (err) {
5102                        failed_cpu = cpu;
5103                        goto fail;
5104                }
5105        }
5106        put_online_cpus();
5107
5108        return 0;
5109fail:
5110        for_each_possible_cpu(cpu) {
5111                if (cpu == failed_cpu)
5112                        break;
5113                swevent_hlist_put_cpu(event, cpu);
5114        }
5115
5116        put_online_cpus();
5117        return err;
5118}
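/*
 * Reference counting above: a CPU-bound event (event->cpu != -1) pins only
 * that CPU's hlist, while a task-bound event pins the hlist of every
 * possible CPU because it may fire wherever the task runs; if an
 * allocation fails, the references taken so far are dropped again before
 * the error is returned.
 */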
5119
5120struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5121
5122static void sw_perf_event_destroy(struct perf_event *event)
5123{
5124        u64 event_id = event->attr.config;
5125
5126        WARN_ON(event->parent);
5127
5128        static_key_slow_dec(&perf_swevent_enabled[event_id]);
5129        swevent_hlist_put(event);
5130}
5131
5132static int perf_swevent_init(struct perf_event *event)
5133{
5134        int event_id = event->attr.config;
5135
5136        if (event->attr.type != PERF_TYPE_SOFTWARE)
5137                return -ENOENT;
5138
5139        /*
5140         * no branch sampling for software events
5141         */
5142        if (has_branch_stack(event))
5143                return -EOPNOTSUPP;
5144
5145        switch (event_id) {
5146        case PERF_COUNT_SW_CPU_CLOCK:
5147        case PERF_COUNT_SW_TASK_CLOCK:
5148                return -ENOENT;
5149
5150        default:
5151                break;
5152        }
5153
5154        if (event_id >= PERF_COUNT_SW_MAX)
5155                return -ENOENT;
5156
5157        if (!event->parent) {
5158                int err;
5159
5160                err = swevent_hlist_get(event);
5161                if (err)
5162                        return err;
5163
5164                static_key_slow_inc(&perf_swevent_enabled[event_id]);
5165                event->destroy = sw_perf_event_destroy;
5166        }
5167
5168        return 0;
5169}
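/*
 * PERF_COUNT_SW_CPU_CLOCK and PERF_COUNT_SW_TASK_CLOCK are rejected above
 * because they are not hash-list based; they are implemented by the
 * dedicated hrtimer-backed perf_cpu_clock and perf_task_clock pmus further
 * down in this file.
 */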
5170
5171static int perf_swevent_event_idx(struct perf_event *event)
5172{
5173        return 0;
5174}
5175
5176static struct pmu perf_swevent = {
5177        .task_ctx_nr    = perf_sw_context,
5178
5179        .event_init     = perf_swevent_init,
5180        .add            = perf_swevent_add,
5181        .del            = perf_swevent_del,
5182        .start          = perf_swevent_start,
5183        .stop           = perf_swevent_stop,
5184        .read           = perf_swevent_read,
5185
5186        .event_idx      = perf_swevent_event_idx,
5187};
5188
5189#ifdef CONFIG_EVENT_TRACING
5190
5191static int perf_tp_filter_match(struct perf_event *event,
5192                                struct perf_sample_data *data)
5193{
5194        void *record = data->raw->data;
5195
5196        if (likely(!event->filter) || filter_match_preds(event->filter, record))
5197                return 1;
5198        return 0;
5199}
5200
5201static int perf_tp_event_match(struct perf_event *event,
5202                                struct perf_sample_data *data,
5203                                struct pt_regs *regs)
5204{
5205        if (event->hw.state & PERF_HES_STOPPED)
5206                return 0;
5207        /*
5208         * All tracepoints are from kernel-space.
5209         */
5210        if (event->attr.exclude_kernel)
5211                return 0;
5212
5213        if (!perf_tp_filter_match(event, data))
5214                return 0;
5215
5216        return 1;
5217}
5218
5219void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5220                   struct pt_regs *regs, struct hlist_head *head, int rctx,
5221                   struct task_struct *task)
5222{
5223        struct perf_sample_data data;
5224        struct perf_event *event;
5225        struct hlist_node *node;
5226
5227        struct perf_raw_record raw = {
5228                .size = entry_size,
5229                .data = record,
5230        };
5231
5232        perf_sample_data_init(&data, addr, 0);
5233        data.raw = &raw;
5234
5235        hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5236                if (perf_tp_event_match(event, &data, regs))
5237                        perf_swevent_event(event, count, &data, regs);
5238        }
5239
5240        /*
5241         * If we were given a target task, also iterate its context and
5242         * deliver this event there too.
5243         */
5244        if (task && task != current) {
5245                struct perf_event_context *ctx;
5246                struct trace_entry *entry = record;
5247
5248                rcu_read_lock();
5249                ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5250                if (!ctx)
5251                        goto unlock;
5252
5253                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5254                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
5255                                continue;
5256                        if (event->attr.config != entry->type)
5257                                continue;
5258                        if (perf_tp_event_match(event, &data, regs))
5259                                perf_swevent_event(event, count, &data, regs);
5260                }
5261unlock:
5262                rcu_read_unlock();
5263        }
5264
5265        perf_swevent_put_recursion_context(rctx);
5266}
5267EXPORT_SYMBOL_GPL(perf_tp_event);
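/*
 * perf_tp_event() is called from the tracepoint glue in
 * kernel/trace/trace_event_perf.c, which passes in the per-tracepoint
 * hlist head and a recursion context it acquired earlier; note that this
 * function drops that recursion context on behalf of the caller.
 */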
5268
5269static void tp_perf_event_destroy(struct perf_event *event)
5270{
5271        perf_trace_destroy(event);
5272}
5273
5274static int perf_tp_event_init(struct perf_event *event)
5275{
5276        int err;
5277
5278        if (event->attr.type != PERF_TYPE_TRACEPOINT)
5279                return -ENOENT;
5280
5281        /*
5282         * no branch sampling for tracepoint events
5283         */
5284        if (has_branch_stack(event))
5285                return -EOPNOTSUPP;
5286
5287        err = perf_trace_init(event);
5288        if (err)
5289                return err;
5290
5291        event->destroy = tp_perf_event_destroy;
5292
5293        return 0;
5294}
5295
5296static struct pmu perf_tracepoint = {
5297        .task_ctx_nr    = perf_sw_context,
5298
5299        .event_init     = perf_tp_event_init,
5300        .add            = perf_trace_add,
5301        .del            = perf_trace_del,
5302        .start          = perf_swevent_start,
5303        .stop           = perf_swevent_stop,
5304        .read           = perf_swevent_read,
5305
5306        .event_idx      = perf_swevent_event_idx,
5307};
5308
5309static inline void perf_tp_register(void)
5310{
5311        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
5312}
5313
5314static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5315{
5316        char *filter_str;
5317        int ret;
5318
5319        if (event->attr.type != PERF_TYPE_TRACEPOINT)
5320                return -EINVAL;
5321
5322        filter_str = strndup_user(arg, PAGE_SIZE);
5323        if (IS_ERR(filter_str))
5324                return PTR_ERR(filter_str);
5325
5326        ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5327
5328        kfree(filter_str);
5329        return ret;
5330}
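/*
 * Userspace reaches this via the PERF_EVENT_IOC_SET_FILTER ioctl on a
 * tracepoint event fd; a minimal sketch (the filter expression is made up
 * for illustration):
 *
 *	ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 0");
 *
 * strndup_user() above bounds the string to PAGE_SIZE.
 */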
5331
5332static void perf_event_free_filter(struct perf_event *event)
5333{
5334        ftrace_profile_free_filter(event);
5335}
5336
5337#else
5338
5339static inline void perf_tp_register(void)
5340{
5341}
5342
5343static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5344{
5345        return -ENOENT;
5346}
5347
5348static void perf_event_free_filter(struct perf_event *event)
5349{
5350}
5351
5352#endif /* CONFIG_EVENT_TRACING */
5353
5354#ifdef CONFIG_HAVE_HW_BREAKPOINT
5355void perf_bp_event(struct perf_event *bp, void *data)
5356{
5357        struct perf_sample_data sample;
5358        struct pt_regs *regs = data;
5359
5360        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
5361
5362        if (!bp->hw.state && !perf_exclude_event(bp, regs))
5363                perf_swevent_event(bp, 1, &sample, regs);
5364}
5365#endif
5366
5367/*
5368 * hrtimer based swevent callback
5369 */
5370
5371static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5372{
5373        enum hrtimer_restart ret = HRTIMER_RESTART;
5374        struct perf_sample_data data;
5375        struct pt_regs *regs;
5376        struct perf_event *event;
5377        u64 period;
5378
5379        event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5380
5381        if (event->state != PERF_EVENT_STATE_ACTIVE)
5382                return HRTIMER_NORESTART;
5383
5384        event->pmu->read(event);
5385
5386        perf_sample_data_init(&data, 0, event->hw.last_period);
5387        regs = get_irq_regs();
5388
5389        if (regs && !perf_exclude_event(event, regs)) {
5390                if (!(event->attr.exclude_idle && is_idle_task(current)))
5391                        if (__perf_event_overflow(event, 1, &data, regs))
5392                                ret = HRTIMER_NORESTART;
5393        }
5394
5395        period = max_t(u64, 10000, event->hw.sample_period);
5396        hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5397
5398        return ret;
5399}
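/*
 * The timer is re-armed with a period of at least 10000 ns (10 us), which
 * caps hrtimer-based software sampling at roughly 100 kHz no matter how
 * small a sample_period was requested.
 */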
5400
5401static void perf_swevent_start_hrtimer(struct perf_event *event)
5402{
5403        struct hw_perf_event *hwc = &event->hw;
5404        s64 period;
5405
5406        if (!is_sampling_event(event))
5407                return;
5408
5409        period = local64_read(&hwc->period_left);
5410        if (period) {
5411                if (period < 0)
5412                        period = 10000;
5413
5414                local64_set(&hwc->period_left, 0);
5415        } else {
5416                period = max_t(u64, 10000, hwc->sample_period);
5417        }
5418        __hrtimer_start_range_ns(&hwc->hrtimer,
5419                                ns_to_ktime(period), 0,
5420                                HRTIMER_MODE_REL_PINNED, 0);
5421}
5422
5423static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5424{
5425        struct hw_perf_event *hwc = &event->hw;
5426
5427        if (is_sampling_event(event)) {
5428                ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5429                local64_set(&hwc->period_left, ktime_to_ns(remaining));
5430
5431                hrtimer_cancel(&hwc->hrtimer);
5432        }
5433}
5434
5435static void perf_swevent_init_hrtimer(struct perf_event *event)
5436{
5437        struct hw_perf_event *hwc = &event->hw;
5438
5439        if (!is_sampling_event(event))
5440                return;
5441
5442        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5443        hwc->hrtimer.function = perf_swevent_hrtimer;
5444
5445        /*
5446         * Since hrtimers have a fixed rate, we can do a static freq->period
5447         * mapping and avoid the dynamic period-adjustment feedback machinery.
5448         */
5449        if (event->attr.freq) {
5450                long freq = event->attr.sample_freq;
5451
5452                event->attr.sample_period = NSEC_PER_SEC / freq;
5453                hwc->sample_period = event->attr.sample_period;
5454                local64_set(&hwc->period_left, hwc->sample_period);
5455                event->attr.freq = 0;
5456        }
5457}
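/*
 * Worked example for the static mapping above: attr.sample_freq == 4000 Hz
 * becomes sample_period = NSEC_PER_SEC / 4000 = 250000 ns, i.e. one hrtimer
 * expiry every 250 us, and attr.freq is cleared so the generic frequency
 * adjustment code leaves this event alone.
 */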
5458
5459/*
5460 * Software event: cpu wall time clock
5461 */
5462
5463static void cpu_clock_event_update(struct perf_event *event)
5464{
5465        s64 prev;
5466        u64 now;
5467
5468        now = local_clock();
5469        prev = local64_xchg(&event->hw.prev_count, now);
5470        local64_add(now - prev, &event->count);
5471}
5472
5473static void cpu_clock_event_start(struct perf_event *event, int flags)
5474{
5475        local64_set(&event->hw.prev_count, local_clock());
5476        perf_swevent_start_hrtimer(event);
5477}
5478
5479static void cpu_clock_event_stop(struct perf_event *event, int flags)
5480{
5481        perf_swevent_cancel_hrtimer(event);
5482        cpu_clock_event_update(event);
5483}
5484
5485static int cpu_clock_event_add(struct perf_event *event, int flags)
5486{
5487        if (flags & PERF_EF_START)
5488                cpu_clock_event_start(event, flags);
5489
5490        return 0;
5491}
5492
5493static void cpu_clock_event_del(struct perf_event *event, int flags)
5494{
5495        cpu_clock_event_stop(event, flags);
5496}
5497
5498static void cpu_clock_event_read(struct perf_event *event)
5499{
5500        cpu_clock_event_update(event);
5501}
5502
5503static int cpu_clock_event_init(struct perf_event *event)
5504{
5505        if (event->attr.type != PERF_TYPE_SOFTWARE)
5506                return -ENOENT;
5507
5508        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5509                return -ENOENT;
5510
5511        /*
5512         * no branch sampling for software events
5513         */
5514        if (has_branch_stack(event))
5515                return -EOPNOTSUPP;
5516
5517        perf_swevent_init_hrtimer(event);
5518
5519        return 0;
5520}
5521
5522static struct pmu perf_cpu_clock = {
5523        .task_ctx_nr    = perf_sw_context,
5524
5525        .event_init     = cpu_clock_event_init,
5526        .add            = cpu_clock_event_add,
5527        .del            = cpu_clock_event_del,
5528        .start          = cpu_clock_event_start,
5529        .stop           = cpu_clock_event_stop,
5530        .read           = cpu_clock_event_read,
5531
5532        .event_idx      = perf_swevent_event_idx,
5533};
5534
5535/*
5536 * Software event: task time clock
5537 */
5538
5539static void task_clock_event_update(struct perf_event *event, u64 now)
5540{
5541        u64 prev;
5542        s64 delta;
5543
5544        prev = local64_xchg(&event->hw.prev_count, now);
5545        delta = now - prev;
5546        local64_add(delta, &event->count);
5547}
5548
5549static void task_clock_event_start(struct perf_event *event, int flags)
5550{
5551        local64_set(&event->hw.prev_count, event->ctx->time);
5552        perf_swevent_start_hrtimer(event);
5553}
5554
5555static void task_clock_event_stop(struct perf_event *event, int flags)
5556{
5557        perf_swevent_cancel_hrtimer(event);
5558        task_clock_event_update(event, event->ctx->time);
5559}
5560
5561static int task_clock_event_add(struct perf_event *event, int flags)
5562{
5563        if (flags & PERF_EF_START)
5564                task_clock_event_start(event, flags);
5565
5566        return 0;
5567}
5568
5569static void task_clock_event_del(struct perf_event *event, int flags)
5570{
5571        task_clock_event_stop(event, PERF_EF_UPDATE);
5572}
5573
5574static void task_clock_event_read(struct perf_event *event)
5575{
5576        u64 now = perf_clock();
5577        u64 delta = now - event->ctx->timestamp;
5578        u64 time = event->ctx->time + delta;
5579
5580        task_clock_event_update(event, time);
5581}
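/*
 * A read may happen while the context clock has not just been refreshed,
 * so task_clock_event_read() extrapolates: time = ctx->time +
 * (perf_clock() - ctx->timestamp), i.e. the last snapshot plus whatever
 * has elapsed since that snapshot was taken.
 */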
5582
5583static int task_clock_event_init(struct perf_event *event)
5584{
5585        if (event->attr.type != PERF_TYPE_SOFTWARE)
5586                return -ENOENT;
5587
5588        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5589                return -ENOENT;
5590
5591        /*
5592         * no branch sampling for software events
5593         */
5594        if (has_branch_stack(event))
5595                return -EOPNOTSUPP;
5596
5597        perf_swevent_init_hrtimer(event);
5598
5599        return 0;
5600}
5601
5602static struct pmu perf_task_clock = {
5603        .task_ctx_nr    = perf_sw_context,
5604
5605        .event_init     = task_clock_event_init,
5606        .add            = task_clock_event_add,
5607        .del            = task_clock_event_del,
5608        .start          = task_clock_event_start,
5609        .stop           = task_clock_event_stop,
5610        .read           = task_clock_event_read,
5611
5612        .event_idx      = perf_swevent_event_idx,
5613};
5614
5615static void perf_pmu_nop_void(struct pmu *pmu)