linux/kernel/trace/blktrace.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
   4 *
   5 */
   6
   7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   8
   9#include <linux/kernel.h>
  10#include <linux/blkdev.h>
  11#include <linux/blktrace_api.h>
  12#include <linux/percpu.h>
  13#include <linux/init.h>
  14#include <linux/mutex.h>
  15#include <linux/slab.h>
  16#include <linux/debugfs.h>
  17#include <linux/export.h>
  18#include <linux/time.h>
  19#include <linux/uaccess.h>
  20#include <linux/list.h>
  21#include <linux/blk-cgroup.h>
  22
  23#include "../../block/blk.h"
  24
  25#include <trace/events/block.h>
  26
  27#include "trace_output.h"
  28
  29#ifdef CONFIG_BLK_DEV_IO_TRACE
  30
  31static unsigned int blktrace_seq __read_mostly = 1;
  32
  33static struct trace_array *blk_tr;
  34static bool blk_tracer_enabled __read_mostly;
  35
  36static LIST_HEAD(running_trace_list);
  37static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock);
  38
   39/* Select an alternative, minimalistic output format instead of the original one */
  40#define TRACE_BLK_OPT_CLASSIC   0x1
  41#define TRACE_BLK_OPT_CGROUP    0x2
  42#define TRACE_BLK_OPT_CGNAME    0x4
  43
  44static struct tracer_opt blk_tracer_opts[] = {
   45        /* Disable the minimalistic output by default */
  46        { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
  47#ifdef CONFIG_BLK_CGROUP
  48        { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
  49        { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
  50#endif
  51        { }
  52};
  53
  54static struct tracer_flags blk_tracer_flags = {
  55        .val  = 0,
  56        .opts = blk_tracer_opts,
  57};
  58
  59/* Global reference count of probes */
  60static DEFINE_MUTEX(blk_probe_mutex);
  61static int blk_probes_ref;
  62
  63static void blk_register_tracepoints(void);
  64static void blk_unregister_tracepoints(void);
  65
  66/*
  67 * Send out a notify message.
  68 */
  69static void trace_note(struct blk_trace *bt, pid_t pid, int action,
  70                       const void *data, size_t len, u64 cgid)
  71{
  72        struct blk_io_trace *t;
  73        struct ring_buffer_event *event = NULL;
  74        struct trace_buffer *buffer = NULL;
  75        unsigned int trace_ctx = 0;
  76        int cpu = smp_processor_id();
  77        bool blk_tracer = blk_tracer_enabled;
  78        ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
  79
  80        if (blk_tracer) {
  81                buffer = blk_tr->array_buffer.buffer;
  82                trace_ctx = tracing_gen_ctx_flags(0);
  83                event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
  84                                                  sizeof(*t) + len + cgid_len,
  85                                                  trace_ctx);
  86                if (!event)
  87                        return;
  88                t = ring_buffer_event_data(event);
  89                goto record_it;
  90        }
  91
  92        if (!bt->rchan)
  93                return;
  94
  95        t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
  96        if (t) {
  97                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  98                t->time = ktime_to_ns(ktime_get());
  99record_it:
 100                t->device = bt->dev;
 101                t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
 102                t->pid = pid;
 103                t->cpu = cpu;
 104                t->pdu_len = len + cgid_len;
 105                if (cgid_len)
 106                        memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
 107                memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
 108
 109                if (blk_tracer)
 110                        trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
 111        }
 112}
 113
 114/*
 115 * Send out a notify for this process, if we haven't done so since a trace
 116 * started
 117 */
 118static void trace_note_tsk(struct task_struct *tsk)
 119{
 120        unsigned long flags;
 121        struct blk_trace *bt;
 122
 123        tsk->btrace_seq = blktrace_seq;
 124        raw_spin_lock_irqsave(&running_trace_lock, flags);
 125        list_for_each_entry(bt, &running_trace_list, running_list) {
 126                trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
 127                           sizeof(tsk->comm), 0);
 128        }
 129        raw_spin_unlock_irqrestore(&running_trace_lock, flags);
 130}
 131
 132static void trace_note_time(struct blk_trace *bt)
 133{
 134        struct timespec64 now;
 135        unsigned long flags;
 136        u32 words[2];
 137
 138        /* need to check user space to see if this breaks in y2038 or y2106 */
 139        ktime_get_real_ts64(&now);
 140        words[0] = (u32)now.tv_sec;
 141        words[1] = now.tv_nsec;
 142
 143        local_irq_save(flags);
 144        trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0);
 145        local_irq_restore(flags);
 146}
 147
 148void __blk_trace_note_message(struct blk_trace *bt,
 149                struct cgroup_subsys_state *css, const char *fmt, ...)
 150{
 151        int n;
 152        va_list args;
 153        unsigned long flags;
 154        char *buf;
 155        u64 cgid = 0;
 156
 157        if (unlikely(bt->trace_state != Blktrace_running &&
 158                     !blk_tracer_enabled))
 159                return;
 160
 161        /*
 162         * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
 163         * message to the trace.
 164         */
 165        if (!(bt->act_mask & BLK_TC_NOTIFY))
 166                return;
 167
 168        local_irq_save(flags);
 169        buf = this_cpu_ptr(bt->msg_data);
 170        va_start(args, fmt);
 171        n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 172        va_end(args);
 173
 174#ifdef CONFIG_BLK_CGROUP
 175        if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
 176                cgid = cgroup_id(css->cgroup);
 177        else
 178                cgid = 1;
 179#endif
 180        trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid);
 181        local_irq_restore(flags);
 182}
 183EXPORT_SYMBOL_GPL(__blk_trace_note_message);
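
/*
 * Illustrative sketch (hypothetical driver and variable names): rather
 * than calling __blk_trace_note_message() directly, callers normally go
 * through the blk_add_trace_msg() wrapper from <linux/blktrace_api.h>,
 * which handles the RCU dereference of q->blk_trace:
 *
 *	blk_add_trace_msg(q, "mydrv: requeueing after %d retries", nretries);
 */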
 184
 185static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 186                         pid_t pid)
 187{
 188        if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 189                return 1;
 190        if (sector && (sector < bt->start_lba || sector > bt->end_lba))
 191                return 1;
 192        if (bt->pid && pid != bt->pid)
 193                return 1;
 194
 195        return 0;
 196}
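
/*
 * Worked example of the filter above: with bt->act_mask == BLK_TC_READ,
 * (bt->act_mask << BLK_TC_SHIFT) covers only the BLK_TC_ACT(BLK_TC_READ)
 * bit, so a pure write event returns 1 and is dropped, while a read
 * whose sector lies inside [start_lba, end_lba] and whose pid matches
 * (or bt->pid == 0) returns 0 and is logged.
 */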
 197
 198/*
 199 * Data direction bit lookup
 200 */
 201static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 202                                 BLK_TC_ACT(BLK_TC_WRITE) };
 203
 204#define BLK_TC_RAHEAD           BLK_TC_AHEAD
 205#define BLK_TC_PREFLUSH         BLK_TC_FLUSH
 206
 207/* The ilog2() calls fall out because they're constant */
 208#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) <<  \
 209          (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
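
/*
 * Worked example: since REQ_SYNC == (1 << __REQ_SYNC),
 * MASK_TC_BIT(opf, SYNC) moves that bit straight to position
 * ilog2(BLK_TC_SYNC) + BLK_TC_SHIFT, i.e. onto BLK_TC_ACT(BLK_TC_SYNC)
 * in the action word; the shift is a compile-time constant, so no
 * branches are needed.
 */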
 210
 211/*
 212 * The worker for the various blk_add_trace*() types. Fills out a
 213 * blk_io_trace structure and places it in a per-cpu subbuffer.
 214 */
 215static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 216                            const blk_opf_t opf, u32 what, int error,
 217                            int pdu_len, void *pdu_data, u64 cgid)
 218{
 219        struct task_struct *tsk = current;
 220        struct ring_buffer_event *event = NULL;
 221        struct trace_buffer *buffer = NULL;
 222        struct blk_io_trace *t;
 223        unsigned long flags = 0;
 224        unsigned long *sequence;
 225        unsigned int trace_ctx = 0;
 226        pid_t pid;
 227        int cpu;
 228        bool blk_tracer = blk_tracer_enabled;
 229        ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
 230        const enum req_op op = opf & REQ_OP_MASK;
 231
 232        if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 233                return;
 234
 235        what |= ddir_act[op_is_write(op) ? WRITE : READ];
 236        what |= MASK_TC_BIT(opf, SYNC);
 237        what |= MASK_TC_BIT(opf, RAHEAD);
 238        what |= MASK_TC_BIT(opf, META);
 239        what |= MASK_TC_BIT(opf, PREFLUSH);
 240        what |= MASK_TC_BIT(opf, FUA);
 241        if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 242                what |= BLK_TC_ACT(BLK_TC_DISCARD);
 243        if (op == REQ_OP_FLUSH)
 244                what |= BLK_TC_ACT(BLK_TC_FLUSH);
 245        if (cgid)
 246                what |= __BLK_TA_CGROUP;
 247
 248        pid = tsk->pid;
 249        if (act_log_check(bt, what, sector, pid))
 250                return;
 251        cpu = raw_smp_processor_id();
 252
 253        if (blk_tracer) {
 254                tracing_record_cmdline(current);
 255
 256                buffer = blk_tr->array_buffer.buffer;
 257                trace_ctx = tracing_gen_ctx_flags(0);
 258                event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 259                                                  sizeof(*t) + pdu_len + cgid_len,
 260                                                  trace_ctx);
 261                if (!event)
 262                        return;
 263                t = ring_buffer_event_data(event);
 264                goto record_it;
 265        }
 266
 267        if (unlikely(tsk->btrace_seq != blktrace_seq))
 268                trace_note_tsk(tsk);
 269
 270        /*
 271         * A word about the locking here - we disable interrupts to reserve
 272         * some space in the relay per-cpu buffer, to prevent an irq
 273         * from coming in and stepping on our toes.
 274         */
 275        local_irq_save(flags);
 276        t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
 277        if (t) {
 278                sequence = per_cpu_ptr(bt->sequence, cpu);
 279
 280                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 281                t->sequence = ++(*sequence);
 282                t->time = ktime_to_ns(ktime_get());
 283record_it:
 284                /*
 285                 * These two are not needed in ftrace as they are in the
 286                 * generic trace_entry, filled by tracing_generic_entry_update,
 287                 * but for the trace_event->bin() synthesizer benefit we do it
 288                 * here too.
 289                 */
 290                t->cpu = cpu;
 291                t->pid = pid;
 292
 293                t->sector = sector;
 294                t->bytes = bytes;
 295                t->action = what;
 296                t->device = bt->dev;
 297                t->error = error;
 298                t->pdu_len = pdu_len + cgid_len;
 299
 300                if (cgid_len)
 301                        memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
 302                if (pdu_len)
 303                        memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
 304
 305                if (blk_tracer) {
 306                        trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
 307                        return;
 308                }
 309        }
 310
 311        local_irq_restore(flags);
 312}
 313
 314static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
 315{
 316        relay_close(bt->rchan);
 317
 318        /*
 319         * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
 320         * under 'q->debugfs_dir', thus lookup and remove them.
 321         */
 322        if (!bt->dir) {
 323                debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
 324                debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
 325        } else {
 326                debugfs_remove(bt->dir);
 327        }
 328        free_percpu(bt->sequence);
 329        free_percpu(bt->msg_data);
 330        kfree(bt);
 331}
 332
 333static void get_probe_ref(void)
 334{
 335        mutex_lock(&blk_probe_mutex);
 336        if (++blk_probes_ref == 1)
 337                blk_register_tracepoints();
 338        mutex_unlock(&blk_probe_mutex);
 339}
 340
 341static void put_probe_ref(void)
 342{
 343        mutex_lock(&blk_probe_mutex);
 344        if (!--blk_probes_ref)
 345                blk_unregister_tracepoints();
 346        mutex_unlock(&blk_probe_mutex);
 347}
 348
 349static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
 350{
 351        synchronize_rcu();
 352        blk_trace_free(q, bt);
 353        put_probe_ref();
 354}
 355
 356static int __blk_trace_remove(struct request_queue *q)
 357{
 358        struct blk_trace *bt;
 359
 360        bt = rcu_replace_pointer(q->blk_trace, NULL,
 361                                 lockdep_is_held(&q->debugfs_mutex));
 362        if (!bt)
 363                return -EINVAL;
 364
 365        if (bt->trace_state != Blktrace_running)
 366                blk_trace_cleanup(q, bt);
 367
 368        return 0;
 369}
 370
 371int blk_trace_remove(struct request_queue *q)
 372{
 373        int ret;
 374
 375        mutex_lock(&q->debugfs_mutex);
 376        ret = __blk_trace_remove(q);
 377        mutex_unlock(&q->debugfs_mutex);
 378
 379        return ret;
 380}
 381EXPORT_SYMBOL_GPL(blk_trace_remove);
 382
 383static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
 384                                size_t count, loff_t *ppos)
 385{
 386        struct blk_trace *bt = filp->private_data;
 387        char buf[16];
 388
 389        snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
 390
 391        return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
 392}
 393
 394static const struct file_operations blk_dropped_fops = {
 395        .owner =        THIS_MODULE,
 396        .open =         simple_open,
 397        .read =         blk_dropped_read,
 398        .llseek =       default_llseek,
 399};
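
/*
 * With debugfs mounted at the usual place, this file appears as
 * /sys/kernel/debug/block/<name>/dropped and reports how many events
 * were lost to full relay subbuffers (counted in
 * blk_subbuf_start_callback() below).
 */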
 400
 401static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 402                                size_t count, loff_t *ppos)
 403{
 404        char *msg;
 405        struct blk_trace *bt;
 406
 407        if (count >= BLK_TN_MAX_MSG)
 408                return -EINVAL;
 409
 410        msg = memdup_user_nul(buffer, count);
 411        if (IS_ERR(msg))
 412                return PTR_ERR(msg);
 413
 414        bt = filp->private_data;
 415        __blk_trace_note_message(bt, NULL, "%s", msg);
 416        kfree(msg);
 417
 418        return count;
 419}
 420
 421static const struct file_operations blk_msg_fops = {
 422        .owner =        THIS_MODULE,
 423        .open =         simple_open,
 424        .write =        blk_msg_write,
 425        .llseek =       noop_llseek,
 426};
 427
 428/*
 429 * Keep track of how many times we encountered a full subbuffer, to aid
 430 * the user space app in telling how many lost events there were.
 431 */
 432static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
 433                                     void *prev_subbuf, size_t prev_padding)
 434{
 435        struct blk_trace *bt;
 436
 437        if (!relay_buf_full(buf))
 438                return 1;
 439
 440        bt = buf->chan->private_data;
 441        atomic_inc(&bt->dropped);
 442        return 0;
 443}
 444
 445static int blk_remove_buf_file_callback(struct dentry *dentry)
 446{
 447        debugfs_remove(dentry);
 448
 449        return 0;
 450}
 451
 452static struct dentry *blk_create_buf_file_callback(const char *filename,
 453                                                   struct dentry *parent,
 454                                                   umode_t mode,
 455                                                   struct rchan_buf *buf,
 456                                                   int *is_global)
 457{
 458        return debugfs_create_file(filename, mode, parent, buf,
 459                                        &relay_file_operations);
 460}
 461
 462static const struct rchan_callbacks blk_relay_callbacks = {
 463        .subbuf_start           = blk_subbuf_start_callback,
 464        .create_buf_file        = blk_create_buf_file_callback,
 465        .remove_buf_file        = blk_remove_buf_file_callback,
 466};
 467
 468static void blk_trace_setup_lba(struct blk_trace *bt,
 469                                struct block_device *bdev)
 470{
 471        if (bdev) {
 472                bt->start_lba = bdev->bd_start_sect;
 473                bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
 474        } else {
 475                bt->start_lba = 0;
 476                bt->end_lba = -1ULL;
 477        }
 478}
 479
 480/*
 481 * Setup everything required to start tracing
 482 */
 483static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 484                              struct block_device *bdev,
 485                              struct blk_user_trace_setup *buts)
 486{
 487        struct blk_trace *bt = NULL;
 488        struct dentry *dir = NULL;
 489        int ret;
 490
 491        lockdep_assert_held(&q->debugfs_mutex);
 492
 493        if (!buts->buf_size || !buts->buf_nr)
 494                return -EINVAL;
 495
 496        strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
 497        buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
 498
 499        /*
 500         * some device names have larger paths - convert the slashes
 501         * to underscores for this to work as expected
 502         */
 503        strreplace(buts->name, '/', '_');
 504
  505        /*
  506         * bdev can be NULL, as with scsi-generic; this is as helpful as
  507         * we can be.
  508         */
 509        if (rcu_dereference_protected(q->blk_trace,
 510                                      lockdep_is_held(&q->debugfs_mutex))) {
 511                pr_warn("Concurrent blktraces are not allowed on %s\n",
 512                        buts->name);
 513                return -EBUSY;
 514        }
 515
 516        bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 517        if (!bt)
 518                return -ENOMEM;
 519
 520        ret = -ENOMEM;
 521        bt->sequence = alloc_percpu(unsigned long);
 522        if (!bt->sequence)
 523                goto err;
 524
 525        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
 526        if (!bt->msg_data)
 527                goto err;
 528
  529        /*
  530         * When tracing the whole disk reuse the existing debugfs directory
  531         * created by the block layer on init. For partitions and
  532         * scsi-generic block devices we create a temporary new debugfs
  533         * directory that will be removed once the trace ends.
  534         */
 535        if (bdev && !bdev_is_partition(bdev))
 536                dir = q->debugfs_dir;
 537        else
 538                bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
 539
  540        /*
  541         * As blktrace relies on debugfs for its interface, the debugfs
  542         * directory is required, contrary to the usual mantra of not
  543         * checking for debugfs files or directories.
  544         */
 545        if (IS_ERR_OR_NULL(dir)) {
 546                pr_warn("debugfs_dir not present for %s so skipping\n",
 547                        buts->name);
 548                ret = -ENOENT;
 549                goto err;
 550        }
 551
 552        bt->dev = dev;
 553        atomic_set(&bt->dropped, 0);
 554        INIT_LIST_HEAD(&bt->running_list);
 555
 556        ret = -EIO;
 557        debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
 558        debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
 559
 560        bt->rchan = relay_open("trace", dir, buts->buf_size,
 561                                buts->buf_nr, &blk_relay_callbacks, bt);
 562        if (!bt->rchan)
 563                goto err;
 564
 565        bt->act_mask = buts->act_mask;
 566        if (!bt->act_mask)
 567                bt->act_mask = (u16) -1;
 568
 569        blk_trace_setup_lba(bt, bdev);
 570
 571        /* overwrite with user settings */
 572        if (buts->start_lba)
 573                bt->start_lba = buts->start_lba;
 574        if (buts->end_lba)
 575                bt->end_lba = buts->end_lba;
 576
 577        bt->pid = buts->pid;
 578        bt->trace_state = Blktrace_setup;
 579
 580        rcu_assign_pointer(q->blk_trace, bt);
 581        get_probe_ref();
 582
 583        ret = 0;
 584err:
 585        if (ret)
 586                blk_trace_free(q, bt);
 587        return ret;
 588}
 589
 590static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 591                             struct block_device *bdev, char __user *arg)
 592{
 593        struct blk_user_trace_setup buts;
 594        int ret;
 595
 596        ret = copy_from_user(&buts, arg, sizeof(buts));
 597        if (ret)
 598                return -EFAULT;
 599
 600        ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 601        if (ret)
 602                return ret;
 603
 604        if (copy_to_user(arg, &buts, sizeof(buts))) {
 605                __blk_trace_remove(q);
 606                return -EFAULT;
 607        }
 608        return 0;
 609}
 610
 611int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 612                    struct block_device *bdev,
 613                    char __user *arg)
 614{
 615        int ret;
 616
 617        mutex_lock(&q->debugfs_mutex);
 618        ret = __blk_trace_setup(q, name, dev, bdev, arg);
 619        mutex_unlock(&q->debugfs_mutex);
 620
 621        return ret;
 622}
 623EXPORT_SYMBOL_GPL(blk_trace_setup);
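
/*
 * Minimal userspace sketch of driving this interface (illustrative only;
 * error handling elided, device path hypothetical). The ioctl numbers
 * come from <linux/fs.h>, struct blk_user_trace_setup from
 * <linux/blktrace_api.h>:
 *
 *	struct blk_user_trace_setup buts = {
 *		.buf_size = 512 * 1024,		// bytes per relay subbuffer
 *		.buf_nr   = 4,			// subbuffers per cpu
 *		.act_mask = 0,			// 0 is promoted to "all actions"
 *	};
 *	int fd = open("/dev/sda", O_RDONLY | O_NONBLOCK);
 *
 *	ioctl(fd, BLKTRACESETUP, &buts);	// buts.name is filled in on return
 *	ioctl(fd, BLKTRACESTART, 0);
 *	// ... consume the per-cpu "trace0", "trace1", ... relay files
 *	// from /sys/kernel/debug/block/<buts.name>/ ...
 *	ioctl(fd, BLKTRACESTOP, 0);
 *	ioctl(fd, BLKTRACETEARDOWN, 0);
 */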
 624
 625#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 626static int compat_blk_trace_setup(struct request_queue *q, char *name,
 627                                  dev_t dev, struct block_device *bdev,
 628                                  char __user *arg)
 629{
 630        struct blk_user_trace_setup buts;
 631        struct compat_blk_user_trace_setup cbuts;
 632        int ret;
 633
 634        if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
 635                return -EFAULT;
 636
 637        buts = (struct blk_user_trace_setup) {
 638                .act_mask = cbuts.act_mask,
 639                .buf_size = cbuts.buf_size,
 640                .buf_nr = cbuts.buf_nr,
 641                .start_lba = cbuts.start_lba,
 642                .end_lba = cbuts.end_lba,
 643                .pid = cbuts.pid,
 644        };
 645
 646        ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 647        if (ret)
 648                return ret;
 649
 650        if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
 651                __blk_trace_remove(q);
 652                return -EFAULT;
 653        }
 654
 655        return 0;
 656}
 657#endif
 658
 659static int __blk_trace_startstop(struct request_queue *q, int start)
 660{
 661        int ret;
 662        struct blk_trace *bt;
 663
 664        bt = rcu_dereference_protected(q->blk_trace,
 665                                       lockdep_is_held(&q->debugfs_mutex));
 666        if (bt == NULL)
 667                return -EINVAL;
 668
  669        /*
  670         * For starting a trace, we can transition from a setup or stopped
  671         * trace. For stopping a trace, the state must be running.
  672         */
 673        ret = -EINVAL;
 674        if (start) {
 675                if (bt->trace_state == Blktrace_setup ||
 676                    bt->trace_state == Blktrace_stopped) {
 677                        blktrace_seq++;
 678                        smp_mb();
 679                        bt->trace_state = Blktrace_running;
 680                        raw_spin_lock_irq(&running_trace_lock);
 681                        list_add(&bt->running_list, &running_trace_list);
 682                        raw_spin_unlock_irq(&running_trace_lock);
 683
 684                        trace_note_time(bt);
 685                        ret = 0;
 686                }
 687        } else {
 688                if (bt->trace_state == Blktrace_running) {
 689                        bt->trace_state = Blktrace_stopped;
 690                        raw_spin_lock_irq(&running_trace_lock);
 691                        list_del_init(&bt->running_list);
 692                        raw_spin_unlock_irq(&running_trace_lock);
 693                        relay_flush(bt->rchan);
 694                        ret = 0;
 695                }
 696        }
 697
 698        return ret;
 699}
 700
 701int blk_trace_startstop(struct request_queue *q, int start)
 702{
 703        int ret;
 704
 705        mutex_lock(&q->debugfs_mutex);
 706        ret = __blk_trace_startstop(q, start);
 707        mutex_unlock(&q->debugfs_mutex);
 708
 709        return ret;
 710}
 711EXPORT_SYMBOL_GPL(blk_trace_startstop);
 712
 713/*
 714 * When reading or writing the blktrace sysfs files, the references to the
 715 * opened sysfs or device files should prevent the underlying block device
 716 * from being removed. So no further delete protection is really needed.
 717 */
 718
 719/**
 720 * blk_trace_ioctl: - handle the ioctls associated with tracing
 721 * @bdev:       the block device
 722 * @cmd:        the ioctl cmd
 723 * @arg:        the argument data, if any
 724 *
 725 **/
 726int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 727{
 728        struct request_queue *q;
 729        int ret, start = 0;
 730        char b[BDEVNAME_SIZE];
 731
 732        q = bdev_get_queue(bdev);
 733        if (!q)
 734                return -ENXIO;
 735
 736        mutex_lock(&q->debugfs_mutex);
 737
 738        switch (cmd) {
 739        case BLKTRACESETUP:
 740                snprintf(b, sizeof(b), "%pg", bdev);
 741                ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 742                break;
 743#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 744        case BLKTRACESETUP32:
 745                snprintf(b, sizeof(b), "%pg", bdev);
 746                ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 747                break;
 748#endif
 749        case BLKTRACESTART:
 750                start = 1;
 751                fallthrough;
 752        case BLKTRACESTOP:
 753                ret = __blk_trace_startstop(q, start);
 754                break;
 755        case BLKTRACETEARDOWN:
 756                ret = __blk_trace_remove(q);
 757                break;
 758        default:
 759                ret = -ENOTTY;
 760                break;
 761        }
 762
 763        mutex_unlock(&q->debugfs_mutex);
 764        return ret;
 765}
 766
 767/**
 768 * blk_trace_shutdown: - stop and cleanup trace structures
 769 * @q:    the request queue associated with the device
 770 *
 771 **/
 772void blk_trace_shutdown(struct request_queue *q)
 773{
 774        if (rcu_dereference_protected(q->blk_trace,
 775                                      lockdep_is_held(&q->debugfs_mutex))) {
 776                __blk_trace_startstop(q, 0);
 777                __blk_trace_remove(q);
 778        }
 779}
 780
 781#ifdef CONFIG_BLK_CGROUP
 782static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 783{
 784        struct cgroup_subsys_state *blkcg_css;
 785        struct blk_trace *bt;
 786
 787        /* We don't use the 'bt' value here except as an optimization... */
 788        bt = rcu_dereference_protected(q->blk_trace, 1);
 789        if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
 790                return 0;
 791
 792        blkcg_css = bio_blkcg_css(bio);
 793        if (!blkcg_css)
 794                return 0;
 795        return cgroup_id(blkcg_css->cgroup);
 796}
 797#else
 798static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 799{
 800        return 0;
 801}
 802#endif
 803
 804static u64
 805blk_trace_request_get_cgid(struct request *rq)
 806{
 807        if (!rq->bio)
 808                return 0;
 809        /* Use the first bio */
 810        return blk_trace_bio_get_cgid(rq->q, rq->bio);
 811}
 812
 813/*
 814 * blktrace probes
 815 */
 816
 817/**
 818 * blk_add_trace_rq - Add a trace for a request oriented action
 819 * @rq:         the source request
 820 * @error:      return status to log
 821 * @nr_bytes:   number of completed bytes
 822 * @what:       the action
 823 * @cgid:       the cgroup info
 824 *
 825 * Description:
 826 *     Records an action against a request. Will log the bio offset + size.
 827 *
 828 **/
 829static void blk_add_trace_rq(struct request *rq, blk_status_t error,
 830                             unsigned int nr_bytes, u32 what, u64 cgid)
 831{
 832        struct blk_trace *bt;
 833
 834        rcu_read_lock();
 835        bt = rcu_dereference(rq->q->blk_trace);
 836        if (likely(!bt)) {
 837                rcu_read_unlock();
 838                return;
 839        }
 840
 841        if (blk_rq_is_passthrough(rq))
 842                what |= BLK_TC_ACT(BLK_TC_PC);
 843        else
 844                what |= BLK_TC_ACT(BLK_TC_FS);
 845
 846        __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags,
 847                        what, blk_status_to_errno(error), 0, NULL, cgid);
 848        rcu_read_unlock();
 849}
 850
 851static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
 852{
 853        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
 854                         blk_trace_request_get_cgid(rq));
 855}
 856
 857static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
 858{
 859        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
 860                         blk_trace_request_get_cgid(rq));
 861}
 862
 863static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
 864{
 865        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
 866                         blk_trace_request_get_cgid(rq));
 867}
 868
 869static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
 870{
 871        blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
 872                         blk_trace_request_get_cgid(rq));
 873}
 874
 875static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
 876                        blk_status_t error, unsigned int nr_bytes)
 877{
 878        blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
 879                         blk_trace_request_get_cgid(rq));
 880}
 881
 882/**
 883 * blk_add_trace_bio - Add a trace for a bio oriented action
 884 * @q:          queue the io is for
 885 * @bio:        the source bio
 886 * @what:       the action
 887 * @error:      error, if any
 888 *
 889 * Description:
 890 *     Records an action against a bio. Will log the bio offset + size.
 891 *
 892 **/
 893static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 894                              u32 what, int error)
 895{
 896        struct blk_trace *bt;
 897
 898        rcu_read_lock();
 899        bt = rcu_dereference(q->blk_trace);
 900        if (likely(!bt)) {
 901                rcu_read_unlock();
 902                return;
 903        }
 904
 905        __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
 906                        bio->bi_opf, what, error, 0, NULL,
 907                        blk_trace_bio_get_cgid(q, bio));
 908        rcu_read_unlock();
 909}
 910
 911static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
 912{
 913        blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
 914}
 915
 916static void blk_add_trace_bio_complete(void *ignore,
 917                                       struct request_queue *q, struct bio *bio)
 918{
 919        blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
 920                          blk_status_to_errno(bio->bi_status));
 921}
 922
 923static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
 924{
 925        blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
 926                        0);
 927}
 928
 929static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
 930{
 931        blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
 932                        0);
 933}
 934
 935static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
 936{
 937        blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
 938}
 939
 940static void blk_add_trace_getrq(void *ignore, struct bio *bio)
 941{
 942        blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
 943}
 944
 945static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 946{
 947        struct blk_trace *bt;
 948
 949        rcu_read_lock();
 950        bt = rcu_dereference(q->blk_trace);
 951        if (bt)
 952                __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
 953        rcu_read_unlock();
 954}
 955
 956static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 957                                    unsigned int depth, bool explicit)
 958{
 959        struct blk_trace *bt;
 960
 961        rcu_read_lock();
 962        bt = rcu_dereference(q->blk_trace);
 963        if (bt) {
 964                __be64 rpdu = cpu_to_be64(depth);
 965                u32 what;
 966
 967                if (explicit)
 968                        what = BLK_TA_UNPLUG_IO;
 969                else
 970                        what = BLK_TA_UNPLUG_TIMER;
 971
 972                __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
 973        }
 974        rcu_read_unlock();
 975}
 976
 977static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
 978{
 979        struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 980        struct blk_trace *bt;
 981
 982        rcu_read_lock();
 983        bt = rcu_dereference(q->blk_trace);
 984        if (bt) {
 985                __be64 rpdu = cpu_to_be64(pdu);
 986
 987                __blk_add_trace(bt, bio->bi_iter.bi_sector,
 988                                bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT,
 989                                blk_status_to_errno(bio->bi_status),
 990                                sizeof(rpdu), &rpdu,
 991                                blk_trace_bio_get_cgid(q, bio));
 992        }
 993        rcu_read_unlock();
 994}
 995
 996/**
 997 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
 998 * @ignore:     trace callback data parameter (not used)
 999 * @bio:        the source bio
1000 * @dev:        source device
1001 * @from:       source sector
1002 *
1003 * Called after a bio is remapped to a different device and/or sector.
1004 **/
1005static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
1006                                    sector_t from)
1007{
1008        struct request_queue *q = bio->bi_bdev->bd_disk->queue;
1009        struct blk_trace *bt;
1010        struct blk_io_trace_remap r;
1011
1012        rcu_read_lock();
1013        bt = rcu_dereference(q->blk_trace);
1014        if (likely(!bt)) {
1015                rcu_read_unlock();
1016                return;
1017        }
1018
1019        r.device_from = cpu_to_be32(dev);
1020        r.device_to   = cpu_to_be32(bio_dev(bio));
1021        r.sector_from = cpu_to_be64(from);
1022
1023        __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
1024                        bio->bi_opf, BLK_TA_REMAP,
1025                        blk_status_to_errno(bio->bi_status),
1026                        sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
1027        rcu_read_unlock();
1028}
1029
1030/**
1031 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
1032 * @ignore:     trace callback data parameter (not used)
1033 * @rq:         the source request
1034 * @dev:        target device
1035 * @from:       source sector
1036 *
1037 * Description:
1038 *     Device mapper remaps request to other devices.
1039 *     Add a trace for that action.
1040 *
1041 **/
1042static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
1043                                   sector_t from)
1044{
1045        struct blk_trace *bt;
1046        struct blk_io_trace_remap r;
1047
1048        rcu_read_lock();
1049        bt = rcu_dereference(rq->q->blk_trace);
1050        if (likely(!bt)) {
1051                rcu_read_unlock();
1052                return;
1053        }
1054
1055        r.device_from = cpu_to_be32(dev);
1056        r.device_to   = cpu_to_be32(disk_devt(rq->q->disk));
1057        r.sector_from = cpu_to_be64(from);
1058
1059        __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
1060                        rq->cmd_flags, BLK_TA_REMAP, 0,
1061                        sizeof(r), &r, blk_trace_request_get_cgid(rq));
1062        rcu_read_unlock();
1063}
1064
1065/**
1066 * blk_add_driver_data - Add binary message with driver-specific data
1067 * @rq:         io request
1068 * @data:       driver-specific data
1069 * @len:        length of driver-specific data
1070 *
1071 * Description:
1072 *     Some drivers might want to write driver-specific data per request.
1073 *
1074 **/
1075void blk_add_driver_data(struct request *rq, void *data, size_t len)
1076{
1077        struct blk_trace *bt;
1078
1079        rcu_read_lock();
1080        bt = rcu_dereference(rq->q->blk_trace);
1081        if (likely(!bt)) {
1082                rcu_read_unlock();
1083                return;
1084        }
1085
1086        __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0,
1087                                BLK_TA_DRV_DATA, 0, len, data,
1088                                blk_trace_request_get_cgid(rq));
1089        rcu_read_unlock();
1090}
1091EXPORT_SYMBOL_GPL(blk_add_driver_data);
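
/*
 * Illustrative driver-side sketch (names hypothetical): attach a piece
 * of hardware-specific state to the request's trace stream, where it
 * shows up as a BLK_TA_DRV_DATA event whose payload user space can
 * decode:
 *
 *	struct mydrv_hw_status st = { .phase = 3, .hw_err = 0 };
 *
 *	blk_add_driver_data(rq, &st, sizeof(st));
 */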
1092
1093static void blk_register_tracepoints(void)
1094{
1095        int ret;
1096
1097        ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1098        WARN_ON(ret);
1099        ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1100        WARN_ON(ret);
1101        ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
1102        WARN_ON(ret);
1103        ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1104        WARN_ON(ret);
1105        ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
1106        WARN_ON(ret);
1107        ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
1108        WARN_ON(ret);
1109        ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
1110        WARN_ON(ret);
1111        ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
1112        WARN_ON(ret);
1113        ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
1114        WARN_ON(ret);
1115        ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
1116        WARN_ON(ret);
1117        ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
1118        WARN_ON(ret);
1119        ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1120        WARN_ON(ret);
1121        ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1122        WARN_ON(ret);
1123        ret = register_trace_block_split(blk_add_trace_split, NULL);
1124        WARN_ON(ret);
1125        ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1126        WARN_ON(ret);
1127        ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1128        WARN_ON(ret);
1129}
1130
1131static void blk_unregister_tracepoints(void)
1132{
1133        unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1134        unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1135        unregister_trace_block_split(blk_add_trace_split, NULL);
1136        unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1137        unregister_trace_block_plug(blk_add_trace_plug, NULL);
1138        unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
1139        unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
1140        unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
1141        unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
1142        unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
1143        unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
1144        unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
1145        unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1146        unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
1147        unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1148        unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1149
1150        tracepoint_synchronize_unregister();
1151}
1152
1153/*
1154 * struct blk_io_tracer formatting routines
1155 */
1156
1157static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1158{
1159        int i = 0;
1160        int tc = t->action >> BLK_TC_SHIFT;
1161
1162        if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
1163                rwbs[i++] = 'N';
1164                goto out;
1165        }
1166
1167        if (tc & BLK_TC_FLUSH)
1168                rwbs[i++] = 'F';
1169
1170        if (tc & BLK_TC_DISCARD)
1171                rwbs[i++] = 'D';
1172        else if (tc & BLK_TC_WRITE)
1173                rwbs[i++] = 'W';
1174        else if (t->bytes)
1175                rwbs[i++] = 'R';
1176        else
1177                rwbs[i++] = 'N';
1178
1179        if (tc & BLK_TC_FUA)
1180                rwbs[i++] = 'F';
1181        if (tc & BLK_TC_AHEAD)
1182                rwbs[i++] = 'A';
1183        if (tc & BLK_TC_SYNC)
1184                rwbs[i++] = 'S';
1185        if (tc & BLK_TC_META)
1186                rwbs[i++] = 'M';
1187out:
1188        rwbs[i] = '\0';
1189}
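
/*
 * Example: a sync write with FUA set (BLK_TC_WRITE | BLK_TC_FUA |
 * BLK_TC_SYNC in the tc bits) renders as "WFS"; a discard renders as
 * "D"; a zero-byte action that is neither a write nor a discard falls
 * through to "N".
 */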
1190
1191static inline
1192const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
1193{
1194        return (const struct blk_io_trace *)ent;
1195}
1196
1197static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
1198{
1199        return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0);
1200}
1201
1202static inline u64 t_cgid(const struct trace_entry *ent)
1203{
1204        return *(u64 *)(te_blk_io_trace(ent) + 1);
1205}
1206
1207static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
1208{
1209        return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0);
1210}
1211
1212static inline u32 t_action(const struct trace_entry *ent)
1213{
1214        return te_blk_io_trace(ent)->action;
1215}
1216
1217static inline u32 t_bytes(const struct trace_entry *ent)
1218{
1219        return te_blk_io_trace(ent)->bytes;
1220}
1221
1222static inline u32 t_sec(const struct trace_entry *ent)
1223{
1224        return te_blk_io_trace(ent)->bytes >> 9;
1225}
1226
1227static inline unsigned long long t_sector(const struct trace_entry *ent)
1228{
1229        return te_blk_io_trace(ent)->sector;
1230}
1231
1232static inline __u16 t_error(const struct trace_entry *ent)
1233{
1234        return te_blk_io_trace(ent)->error;
1235}
1236
1237static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
1238{
1239        const __be64 *val = pdu_start(ent, has_cg);
1240        return be64_to_cpu(*val);
1241}
1242
1243typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
1244        bool has_cg);
1245
1246static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
1247        bool has_cg)
1248{
1249        char rwbs[RWBS_LEN];
1250        unsigned long long ts  = iter->ts;
1251        unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1252        unsigned secs          = (unsigned long)ts;
1253        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1254
1255        fill_rwbs(rwbs, t);
1256
1257        trace_seq_printf(&iter->seq,
1258                         "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1259                         MAJOR(t->device), MINOR(t->device), iter->cpu,
1260                         secs, nsec_rem, iter->ent->pid, act, rwbs);
1261}
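
/*
 * The format string above produces the classic blktrace prefix, e.g.
 * (values illustrative):
 *
 *	  8,0    3     0.000123456  4711  Q  WS
 *
 * i.e. major,minor cpu seconds.nanoseconds pid action rwbs; the
 * per-action payload is appended by the blk_log_*() helpers.
 */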
1262
1263static void blk_log_action(struct trace_iterator *iter, const char *act,
1264        bool has_cg)
1265{
1266        char rwbs[RWBS_LEN];
1267        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1268
1269        fill_rwbs(rwbs, t);
1270        if (has_cg) {
1271                u64 id = t_cgid(iter->ent);
1272
1273                if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
1274                        char blkcg_name_buf[NAME_MAX + 1] = "<...>";
1275
1276                        cgroup_path_from_kernfs_id(id, blkcg_name_buf,
1277                                sizeof(blkcg_name_buf));
1278                        trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
1279                                 MAJOR(t->device), MINOR(t->device),
1280                                 blkcg_name_buf, act, rwbs);
1281                } else {
1282                        /*
1283                         * The cgid portion used to be "INO,GEN".  Userland
1284                         * builds a FILEID_INO32_GEN fid out of them and
1285                         * opens the cgroup using open_by_handle_at(2).
1286                         * While 32bit ino setups are still the same, 64bit
1287                         * ones now use the 64bit ino as the whole ID and
1288                         * no longer use generation.
1289                         *
1290                         * Regardless of the content, always output
1291                         * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
1292                         * be mapped back to @id on both 64 and 32bit ino
1293                         * setups.  See __kernfs_fh_to_dentry().
1294                         */
1295                        trace_seq_printf(&iter->seq,
1296                                 "%3d,%-3d %llx,%-llx %2s %3s ",
1297                                 MAJOR(t->device), MINOR(t->device),
1298                                 id & U32_MAX, id >> 32, act, rwbs);
1299                }
1300        } else
1301                trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1302                                 MAJOR(t->device), MINOR(t->device), act, rwbs);
1303}
1304
1305static void blk_log_dump_pdu(struct trace_seq *s,
1306        const struct trace_entry *ent, bool has_cg)
1307{
1308        const unsigned char *pdu_buf;
1309        int pdu_len;
1310        int i, end;
1311
1312        pdu_buf = pdu_start(ent, has_cg);
1313        pdu_len = pdu_real_len(ent, has_cg);
1314
1315        if (!pdu_len)
1316                return;
1317
 1318        /* find the last zero that needs to be printed (one past the last non-zero byte) */
1319        for (end = pdu_len - 1; end >= 0; end--)
1320                if (pdu_buf[end])
1321                        break;
1322        end++;
1323
1324        trace_seq_putc(s, '(');
1325
1326        for (i = 0; i < pdu_len; i++) {
1327
1328                trace_seq_printf(s, "%s%02x",
1329                                 i == 0 ? "" : " ", pdu_buf[i]);
1330
1331                /*
1332                 * stop when the rest is just zeros and indicate so
1333                 * with a ".." appended
1334                 */
1335                if (i == end && end != pdu_len - 1) {
1336                        trace_seq_puts(s, " ..) ");
1337                        return;
1338                }
1339        }
1340
1341        trace_seq_puts(s, ") ");
1342}
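
/*
 * Example: a 16-byte PDU whose bytes are 2a 08 00 00 ... 00 is dumped
 * as "(2a 08 00 ..) "; one trailing zero is printed and the rest are
 * elided behind "..".
 */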
1343
1344static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1345{
1346        char cmd[TASK_COMM_LEN];
1347
1348        trace_find_cmdline(ent->pid, cmd);
1349
1350        if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1351                trace_seq_printf(s, "%u ", t_bytes(ent));
1352                blk_log_dump_pdu(s, ent, has_cg);
1353                trace_seq_printf(s, "[%s]\n", cmd);
1354        } else {
1355                if (t_sec(ent))
1356                        trace_seq_printf(s, "%llu + %u [%s]\n",
1357                                                t_sector(ent), t_sec(ent), cmd);
1358                else
1359                        trace_seq_printf(s, "[%s]\n", cmd);
1360        }
1361}
1362
1363static void blk_log_with_error(struct trace_seq *s,
1364                              const struct trace_entry *ent, bool has_cg)
1365{
1366        if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1367                blk_log_dump_pdu(s, ent, has_cg);
1368                trace_seq_printf(s, "[%d]\n", t_error(ent));
1369        } else {
1370                if (t_sec(ent))
1371                        trace_seq_printf(s, "%llu + %u [%d]\n",
1372                                         t_sector(ent),
1373                                         t_sec(ent), t_error(ent));
1374                else
1375                        trace_seq_printf(s, "%llu [%d]\n",
1376                                         t_sector(ent), t_error(ent));
1377        }
1378}
1379
1380static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1381{
1382        const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
1383
1384        trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1385                         t_sector(ent), t_sec(ent),
1386                         MAJOR(be32_to_cpu(__r->device_from)),
1387                         MINOR(be32_to_cpu(__r->device_from)),
1388                         be64_to_cpu(__r->sector_from));
1389}
1390
1391static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1392{
1393        char cmd[TASK_COMM_LEN];
1394
1395        trace_find_cmdline(ent->pid, cmd);
1396
1397        trace_seq_printf(s, "[%s]\n", cmd);
1398}
1399
1400static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1401{
1402        char cmd[TASK_COMM_LEN];
1403
1404        trace_find_cmdline(ent->pid, cmd);
1405
1406        trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
1407}
1408
1409static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
1410{
1411        char cmd[TASK_COMM_LEN];
1412
1413        trace_find_cmdline(ent->pid, cmd);
1414
1415        trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1416                         get_pdu_int(ent, has_cg), cmd);
1417}
1418
1419static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
1420                        bool has_cg)
1421{
1422
1423        trace_seq_putmem(s, pdu_start(ent, has_cg),
1424                pdu_real_len(ent, has_cg));
1425        trace_seq_putc(s, '\n');
1426}
1427
1428/*
1429 * struct tracer operations
1430 */
1431
1432static void blk_tracer_print_header(struct seq_file *m)
1433{
1434        if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1435                return;
1436        seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
1437                    "#  |     |     |           |   |   |\n");
1438}
1439
1440static void blk_tracer_start(struct trace_array *tr)
1441{
1442        blk_tracer_enabled = true;
1443}
1444
1445static int blk_tracer_init(struct trace_array *tr)
1446{
1447        blk_tr = tr;
1448        blk_tracer_start(tr);
1449        return 0;
1450}
1451
1452static void blk_tracer_stop(struct trace_array *tr)
1453{
1454        blk_tracer_enabled = false;
1455}
1456
1457static void blk_tracer_reset(struct trace_array *tr)
1458{
1459        blk_tracer_stop(tr);
1460}
1461
1462static const struct {
1463        const char *act[2];
1464        void       (*print)(struct trace_seq *s, const struct trace_entry *ent,
1465                            bool has_cg);
1466} what2act[] = {
1467        [__BLK_TA_QUEUE]        = {{  "Q", "queue" },      blk_log_generic },
1468        [__BLK_TA_BACKMERGE]    = {{  "M", "backmerge" },  blk_log_generic },
1469        [__BLK_TA_FRONTMERGE]   = {{  "F", "frontmerge" }, blk_log_generic },
1470        [__BLK_TA_GETRQ]        = {{  "G", "getrq" },      blk_log_generic },
1471        [__BLK_TA_SLEEPRQ]      = {{  "S", "sleeprq" },    blk_log_generic },
1472        [__BLK_TA_REQUEUE]      = {{  "R", "requeue" },    blk_log_with_error },
1473        [__BLK_TA_ISSUE]        = {{  "D", "issue" },      blk_log_generic },
1474        [__BLK_TA_COMPLETE]     = {{  "C", "complete" },   blk_log_with_error },
1475        [__BLK_TA_PLUG]         = {{  "P", "plug" },       blk_log_plug },
1476        [__BLK_TA_UNPLUG_IO]    = {{  "U", "unplug_io" },  blk_log_unplug },
1477        [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1478        [__BLK_TA_INSERT]       = {{  "I", "insert" },     blk_log_generic },
1479        [__BLK_TA_SPLIT]        = {{  "X", "split" },      blk_log_split },
1480        [__BLK_TA_BOUNCE]       = {{  "B", "bounce" },     blk_log_generic },
1481        [__BLK_TA_REMAP]        = {{  "A", "remap" },      blk_log_remap },
1482};
1483
1484static enum print_line_t print_one_line(struct trace_iterator *iter,
1485                                        bool classic)
1486{
1487        struct trace_array *tr = iter->tr;
1488        struct trace_seq *s = &iter->seq;
1489        const struct blk_io_trace *t;
1490        u16 what;
1491        bool long_act;
1492        blk_log_action_t *log_action;
1493        bool has_cg;
1494
1495        t          = te_blk_io_trace(iter->ent);
1496        what       = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
1497        long_act   = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
1498        log_action = classic ? &blk_log_action_classic : &blk_log_action;
1499        has_cg     = t->action & __BLK_TA_CGROUP;
1500
1501        if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
1502                log_action(iter, long_act ? "message" : "m", has_cg);
1503                blk_log_msg(s, iter->ent, has_cg);
1504                return trace_handle_return(s);
1505        }
1506
1507        if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1508                trace_seq_printf(s, "Unknown action %x\n", what);
1509        else {
1510                log_action(iter, what2act[what].act[long_act], has_cg);
1511                what2act[what].print(s, iter->ent, has_cg);
1512        }
1513
1514        return trace_handle_return(s);
1515}
1516
1517static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1518                                               int flags, struct trace_event *event)
1519{
1520        return print_one_line(iter, false);
1521}
1522
1523static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1524{
1525        struct trace_seq *s = &iter->seq;
1526        struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1527        const int offset = offsetof(struct blk_io_trace, sector);
1528        struct blk_io_trace old = {
1529                .magic    = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1530                .time     = iter->ts,
1531        };
1532
1533        trace_seq_putmem(s, &old, offset);
1534        trace_seq_putmem(s, &t->sector,
1535                         sizeof(old) - offset + t->pdu_len);
1536}
1537
1538static enum print_line_t
1539blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1540                             struct trace_event *event)
1541{
1542        blk_trace_synthesize_old_trace(iter);
1543
1544        return trace_handle_return(&iter->seq);
1545}
1546
1547static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1548{
1549        if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1550                return TRACE_TYPE_UNHANDLED;
1551
1552        return print_one_line(iter, true);
1553}
1554
1555static int
1556blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1557{
1558        /* don't output context-info for blk_classic output */
1559        if (bit == TRACE_BLK_OPT_CLASSIC) {
1560                if (set)
1561                        tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1562                else
1563                        tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;
1564        }
1565        return 0;
1566}
1567
static struct tracer blk_tracer __read_mostly = {
        .name           = "blk",
        .init           = blk_tracer_init,
        .reset          = blk_tracer_reset,
        .start          = blk_tracer_start,
        .stop           = blk_tracer_stop,
        .print_header   = blk_tracer_print_header,
        .print_line     = blk_tracer_print_line,
        .flags          = &blk_tracer_flags,
        .set_flag       = blk_tracer_set_flag,
};

static struct trace_event_functions trace_blk_event_funcs = {
        .trace          = blk_trace_event_print,
        .binary         = blk_trace_event_print_binary,
};

static struct trace_event trace_blk_event = {
        .type           = TRACE_BLK,
        .funcs          = &trace_blk_event_funcs,
};

static int __init init_blk_tracer(void)
{
        if (!register_trace_event(&trace_blk_event)) {
                pr_warn("Warning: could not register block events\n");
                return 1;
        }

        if (register_tracer(&blk_tracer) != 0) {
                pr_warn("Warning: could not register the block tracer\n");
                unregister_trace_event(&trace_blk_event);
                return 1;
        }

        return 0;
}

device_initcall(init_blk_tracer);

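/*
 * Tear down tracing that was set up through the sysfs interface.  The
 * caller must hold q->debugfs_mutex (see the lockdep annotation below);
 * a running trace is stopped and flushed, the probe reference dropped,
 * and bt freed once an RCU grace period has passed.
 */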
static int blk_trace_remove_queue(struct request_queue *q)
{
        struct blk_trace *bt;

        bt = rcu_replace_pointer(q->blk_trace, NULL,
                                 lockdep_is_held(&q->debugfs_mutex));
        if (bt == NULL)
                return -EINVAL;

        if (bt->trace_state == Blktrace_running) {
                bt->trace_state = Blktrace_stopped;
                raw_spin_lock_irq(&running_trace_lock);
                list_del_init(&bt->running_list);
                raw_spin_unlock_irq(&running_trace_lock);
                relay_flush(bt->rchan);
        }

        put_probe_ref();
        synchronize_rcu();
        blk_trace_free(q, bt);
        return 0;
}

/*
 * Set up everything required to start tracing
 */
static int blk_trace_setup_queue(struct request_queue *q,
                                 struct block_device *bdev)
{
        struct blk_trace *bt = NULL;
        int ret = -ENOMEM;

        bt = kzalloc(sizeof(*bt), GFP_KERNEL);
        if (!bt)
                return -ENOMEM;

        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
        if (!bt->msg_data)
                goto free_bt;

        bt->dev = bdev->bd_dev;
        bt->act_mask = (u16)-1;

        blk_trace_setup_lba(bt, bdev);

        rcu_assign_pointer(q->blk_trace, bt);
        get_probe_ref();
        return 0;

free_bt:
        blk_trace_free(q, bt);
        return ret;
}

/*
 * sysfs interface to enable and configure tracing
 */
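
/*
 * A typical sequence (illustrative; the device name and mount points are
 * examples only):
 *
 *   # echo blk > /sys/kernel/debug/tracing/current_tracer
 *   # echo 1 > /sys/block/sda/trace/enable
 *   # cat /sys/kernel/debug/tracing/trace_pipe
 */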

static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf);
static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count);
#define BLK_TRACE_DEVICE_ATTR(_name) \
        DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
                    sysfs_blk_trace_attr_show, \
                    sysfs_blk_trace_attr_store)

static BLK_TRACE_DEVICE_ATTR(enable);
static BLK_TRACE_DEVICE_ATTR(act_mask);
static BLK_TRACE_DEVICE_ATTR(pid);
static BLK_TRACE_DEVICE_ATTR(start_lba);
static BLK_TRACE_DEVICE_ATTR(end_lba);

static struct attribute *blk_trace_attrs[] = {
        &dev_attr_enable.attr,
        &dev_attr_act_mask.attr,
        &dev_attr_pid.attr,
        &dev_attr_start_lba.attr,
        &dev_attr_end_lba.attr,
        NULL
};

struct attribute_group blk_trace_attr_group = {
        .name  = "trace",
        .attrs = blk_trace_attrs,
};
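
/*
 * With .name = "trace", the attributes above appear as
 * /sys/block/<disk>/trace/{enable,act_mask,pid,start_lba,end_lba}.
 */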

static const struct {
        int mask;
        const char *str;
} mask_maps[] = {
        { BLK_TC_READ,          "read"          },
        { BLK_TC_WRITE,         "write"         },
        { BLK_TC_FLUSH,         "flush"         },
        { BLK_TC_SYNC,          "sync"          },
        { BLK_TC_QUEUE,         "queue"         },
        { BLK_TC_REQUEUE,       "requeue"       },
        { BLK_TC_ISSUE,         "issue"         },
        { BLK_TC_COMPLETE,      "complete"      },
        { BLK_TC_FS,            "fs"            },
        { BLK_TC_PC,            "pc"            },
        { BLK_TC_NOTIFY,        "notify"        },
        { BLK_TC_AHEAD,         "ahead"         },
        { BLK_TC_META,          "meta"          },
        { BLK_TC_DISCARD,       "discard"       },
        { BLK_TC_DRV_DATA,      "drv_data"      },
        { BLK_TC_FUA,           "fua"           },
};

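/*
 * Parse a comma-separated, case-insensitive list of category names into
 * a BLK_TC_* mask, e.g. "read,write,sync" yields
 * BLK_TC_READ | BLK_TC_WRITE | BLK_TC_SYNC.  Returns -EINVAL on an
 * unknown name and -ENOMEM if the temporary copy cannot be allocated.
 */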
static int blk_trace_str2mask(const char *str)
{
        int i;
        int mask = 0;
        char *buf, *s, *token;

        buf = kstrdup(str, GFP_KERNEL);
        if (buf == NULL)
                return -ENOMEM;
        s = strstrip(buf);

        while (1) {
                token = strsep(&s, ",");
                if (token == NULL)
                        break;

                if (*token == '\0')
                        continue;

                for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
                        if (strcasecmp(token, mask_maps[i].str) == 0) {
                                mask |= mask_maps[i].mask;
                                break;
                        }
                }
                if (i == ARRAY_SIZE(mask_maps)) {
                        mask = -EINVAL;
                        break;
                }
        }
        kfree(buf);

        return mask;
}

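/*
 * The inverse of blk_trace_str2mask(): print the names of all categories
 * set in @mask, in mask_maps[] order, e.g. BLK_TC_READ | BLK_TC_FUA
 * becomes "read,fua\n".  Returns the number of bytes written, including
 * the trailing newline.
 */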
static ssize_t blk_trace_mask2str(char *buf, int mask)
{
        int i;
        char *p = buf;

        for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
                if (mask & mask_maps[i].mask) {
                        p += sprintf(p, "%s%s",
                                    (p == buf) ? "" : ",", mask_maps[i].str);
                }
        }
        *p++ = '\n';

        return p - buf;
}

static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
{
        struct block_device *bdev = dev_to_bdev(dev);
        struct request_queue *q = bdev_get_queue(bdev);
        struct blk_trace *bt;
        ssize_t ret = -ENXIO;

        mutex_lock(&q->debugfs_mutex);

        bt = rcu_dereference_protected(q->blk_trace,
                                       lockdep_is_held(&q->debugfs_mutex));
        if (attr == &dev_attr_enable) {
                ret = sprintf(buf, "%u\n", !!bt);
                goto out_unlock_bdev;
        }

        if (bt == NULL)
                ret = sprintf(buf, "disabled\n");
        else if (attr == &dev_attr_act_mask)
                ret = blk_trace_mask2str(buf, bt->act_mask);
        else if (attr == &dev_attr_pid)
                ret = sprintf(buf, "%u\n", bt->pid);
        else if (attr == &dev_attr_start_lba)
                ret = sprintf(buf, "%llu\n", bt->start_lba);
        else if (attr == &dev_attr_end_lba)
                ret = sprintf(buf, "%llu\n", bt->end_lba);

out_unlock_bdev:
        mutex_unlock(&q->debugfs_mutex);
        return ret;
}

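/*
 * Writing "0"/"1" to 'enable' tears tracing down or sets it up; writing
 * to any other attribute implicitly sets up a default trace first if
 * none exists.  'act_mask' accepts either a number (any base kstrtoull
 * understands) or a comma-separated list of category names as parsed by
 * blk_trace_str2mask() above.
 */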
static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
{
        struct block_device *bdev = dev_to_bdev(dev);
        struct request_queue *q = bdev_get_queue(bdev);
        struct blk_trace *bt;
        u64 value;
        ssize_t ret = -EINVAL;

        if (count == 0)
                goto out;

        if (attr == &dev_attr_act_mask) {
                if (kstrtoull(buf, 0, &value)) {
                        /* Assume it is a list of trace category names */
                        ret = blk_trace_str2mask(buf);
                        if (ret < 0)
                                goto out;
                        value = ret;
                }
        } else {
                if (kstrtoull(buf, 0, &value))
                        goto out;
        }

        mutex_lock(&q->debugfs_mutex);

        bt = rcu_dereference_protected(q->blk_trace,
                                       lockdep_is_held(&q->debugfs_mutex));
        if (attr == &dev_attr_enable) {
                if (!!value == !!bt) {
                        ret = 0;
                        goto out_unlock_bdev;
                }
                if (value)
                        ret = blk_trace_setup_queue(q, bdev);
                else
                        ret = blk_trace_remove_queue(q);
                goto out_unlock_bdev;
        }

        ret = 0;
        if (bt == NULL) {
                ret = blk_trace_setup_queue(q, bdev);
                bt = rcu_dereference_protected(q->blk_trace,
                                lockdep_is_held(&q->debugfs_mutex));
        }

        if (ret == 0) {
                if (attr == &dev_attr_act_mask)
                        bt->act_mask = value;
                else if (attr == &dev_attr_pid)
                        bt->pid = value;
                else if (attr == &dev_attr_start_lba)
                        bt->start_lba = value;
                else if (attr == &dev_attr_end_lba)
                        bt->end_lba = value;
        }

out_unlock_bdev:
        mutex_unlock(&q->debugfs_mutex);
out:
        return ret ? ret : count;
}
#endif /* CONFIG_BLK_DEV_IO_TRACE */

#ifdef CONFIG_EVENT_TRACING

/**
 * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
 * @rwbs:       buffer to be filled
 * @opf:        request operation type (REQ_OP_XXX) and flags for the tracepoint
 *
 * Description:
 *     Maps each request operation and flag to a single character and fills
 *     the buffer provided by the caller with the resulting string.
 *
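 * Example:
 *     REQ_OP_WRITE | REQ_SYNC | REQ_FUA  -> "WFS"
 *     REQ_OP_READ | REQ_RAHEAD           -> "RA"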
 */
void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
{
        int i = 0;

        if (opf & REQ_PREFLUSH)
                rwbs[i++] = 'F';

        switch (opf & REQ_OP_MASK) {
        case REQ_OP_WRITE:
                rwbs[i++] = 'W';
                break;
        case REQ_OP_DISCARD:
                rwbs[i++] = 'D';
                break;
        case REQ_OP_SECURE_ERASE:
                rwbs[i++] = 'D';
                rwbs[i++] = 'E';
                break;
        case REQ_OP_FLUSH:
                rwbs[i++] = 'F';
                break;
        case REQ_OP_READ:
                rwbs[i++] = 'R';
                break;
        default:
                rwbs[i++] = 'N';
        }

        if (opf & REQ_FUA)
                rwbs[i++] = 'F';
        if (opf & REQ_RAHEAD)
                rwbs[i++] = 'A';
        if (opf & REQ_SYNC)
                rwbs[i++] = 'S';
        if (opf & REQ_META)
                rwbs[i++] = 'M';

        rwbs[i] = '\0';
}
EXPORT_SYMBOL_GPL(blk_fill_rwbs);

#endif /* CONFIG_EVENT_TRACING */