/* linux/block/blktrace.c */
   1/*
   2 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License version 2 as
   6 * published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 * GNU General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  16 *
  17 */
  18#include <linux/kernel.h>
  19#include <linux/blkdev.h>
  20#include <linux/blktrace_api.h>
  21#include <linux/percpu.h>
  22#include <linux/init.h>
  23#include <linux/mutex.h>
  24#include <linux/debugfs.h>
  25#include <linux/time.h>
  26#include <asm/uaccess.h>
  27
/*
 * Global trace-run generation counter: bumped in blk_trace_startstop()
 * each time a trace starts, and compared against tsk->btrace_seq in
 * __blk_add_trace() so each task's PROCESS note is emitted once per run.
 */
static unsigned int blktrace_seq __read_mostly = 1;
  29
  30/*
  31 * Send out a notify message.
  32 */
  33static void trace_note(struct blk_trace *bt, pid_t pid, int action,
  34                       const void *data, size_t len)
  35{
  36        struct blk_io_trace *t;
  37
  38        t = relay_reserve(bt->rchan, sizeof(*t) + len);
  39        if (t) {
  40                const int cpu = smp_processor_id();
  41
  42                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  43                t->time = ktime_to_ns(ktime_get());
  44                t->device = bt->dev;
  45                t->action = action;
  46                t->pid = pid;
  47                t->cpu = cpu;
  48                t->pdu_len = len;
  49                memcpy((void *) t + sizeof(*t), data, len);
  50        }
  51}
  52
/*
 * Send out a notify for this process, if we haven't done so since a trace
 * started.
 *
 * Records the current blktrace_seq in the task so the caller's
 * (tsk->btrace_seq != blktrace_seq) check suppresses further PROCESS
 * notes for this task until the next trace run.
 */
static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
	tsk->btrace_seq = blktrace_seq;
	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
}
  62
/*
 * Emit a BLK_TN_TIMESTAMP note carrying the wall-clock time —
 * presumably so user space can correlate the monotonic per-event
 * timestamps with real time (confirm against blkparse).
 */
static void trace_note_time(struct blk_trace *bt)
{
	struct timespec now;
	unsigned long flags;
	u32 words[2];

	getnstimeofday(&now);
	/* tv_sec is truncated to 32 bits by the on-disk note format */
	words[0] = now.tv_sec;
	words[1] = now.tv_nsec;

	/* trace_note() reserves from a per-cpu relay buffer; keep irqs off */
	local_irq_save(flags);
	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
	local_irq_restore(flags);
}
  77
/*
 * Format and emit a free-form BLK_TN_MESSAGE note.
 *
 * The text is rendered into this cpu's staging buffer (bt->msg_data,
 * BLK_TN_MAX_MSG bytes). Interrupts are disabled for the whole
 * format-and-copy sequence so an irq on this cpu cannot reuse the
 * staging buffer between vscnprintf() and trace_note().
 */
void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
{
	int n;
	va_list args;
	unsigned long flags;
	char *buf;

	local_irq_save(flags);
	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
	va_start(args, fmt);
	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
	va_end(args);

	/* n excludes the trailing NUL, so only the message text is copied */
	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
  95
  96static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  97                         pid_t pid)
  98{
  99        if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 100                return 1;
 101        if (sector < bt->start_lba || sector > bt->end_lba)
 102                return 1;
 103        if (bt->pid && pid != bt->pid)
 104                return 1;
 105
 106        return 0;
 107}
 108
/*
 * Data direction bit lookup, indexed by (rw & WRITE): maps the
 * read/write bit to the corresponding BLK_TC_ACT mask.
 */
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };

/*
 * Bio action bits of interest: a sparse table indexed by the values the
 * trace_*_bit() macros produce (1 = barrier, 2 = sync, 4 = ahead,
 * 8 = meta); the remaining slots are intentionally zero.
 */
static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
 118
/*
 * Each macro isolates one BIO_RW_* flag from rw and shifts it into the
 * bio_act[] slot reserved for it above (barrier -> index 1, sync -> 2,
 * ahead -> 4, meta -> 8). The shift distance is the flag's bit position
 * minus the target index's bit position; additions must keep that
 * arithmetic consistent with the table indexing.
 */
#define trace_barrier_bit(rw)   \
	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
#define trace_sync_bit(rw)      \
	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
#define trace_ahead_bit(rw)     \
	(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
#define trace_meta_bit(rw)      \
	(((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
 131
/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 */
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
	struct task_struct *tsk = current;
	struct blk_io_trace *t;
	unsigned long flags;
	unsigned long *sequence;
	pid_t pid;
	int cpu;

	if (unlikely(bt->trace_state != Blktrace_running))
		return;

	/* fold data direction and bio action flags into the event word */
	what |= ddir_act[rw & WRITE];
	what |= bio_act[trace_barrier_bit(rw)];
	what |= bio_act[trace_sync_bit(rw)];
	what |= bio_act[trace_ahead_bit(rw)];
	what |= bio_act[trace_meta_bit(rw)];

	pid = tsk->pid;
	if (unlikely(act_log_check(bt, what, sector, pid)))
		return;

	/*
	 * A word about the locking here - we disable interrupts to reserve
	 * some space in the relay per-cpu buffer, to prevent an irq
	 * from coming in and stepping on our toes.
	 */
	local_irq_save(flags);

	/* first event from this task this trace run: emit its comm first */
	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(bt, tsk);

	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
	if (t) {
		cpu = smp_processor_id();
		sequence = per_cpu_ptr(bt->sequence, cpu);

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->sequence = ++(*sequence);	/* per-cpu ordering counter */
		t->time = ktime_to_ns(ktime_get());
		t->sector = sector;
		t->bytes = bytes;
		t->action = what;
		t->pid = pid;
		t->device = bt->dev;
		t->cpu = cpu;
		t->error = error;
		t->pdu_len = pdu_len;

		/* variable-length payload follows the fixed header */
		if (pdu_len)
			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
	}

	local_irq_restore(flags);
}

EXPORT_SYMBOL_GPL(__blk_add_trace);
 194
 195static struct dentry *blk_tree_root;
 196static DEFINE_MUTEX(blk_tree_mutex);
 197static unsigned int root_users;
 198
 199static inline void blk_remove_root(void)
 200{
 201        if (blk_tree_root) {
 202                debugfs_remove(blk_tree_root);
 203                blk_tree_root = NULL;
 204        }
 205}
 206
/*
 * Remove a per-device debugfs directory and, once the last user goes
 * away, the shared "block" root too. Counterpart to blk_create_tree().
 */
static void blk_remove_tree(struct dentry *dir)
{
	mutex_lock(&blk_tree_mutex);
	debugfs_remove(dir);
	if (--root_users == 0)
		blk_remove_root();
	mutex_unlock(&blk_tree_mutex);
}
 215
 216static struct dentry *blk_create_tree(const char *blk_name)
 217{
 218        struct dentry *dir = NULL;
 219        int created = 0;
 220
 221        mutex_lock(&blk_tree_mutex);
 222
 223        if (!blk_tree_root) {
 224                blk_tree_root = debugfs_create_dir("block", NULL);
 225                if (!blk_tree_root)
 226                        goto err;
 227                created = 1;
 228        }
 229
 230        dir = debugfs_create_dir(blk_name, blk_tree_root);
 231        if (dir)
 232                root_users++;
 233        else {
 234                /* Delete root only if we created it */
 235                if (created)
 236                        blk_remove_root();
 237        }
 238
 239err:
 240        mutex_unlock(&blk_tree_mutex);
 241        return dir;
 242}
 243
/*
 * Free everything a blk_trace owns: relay channel, debugfs files and
 * directory, per-cpu buffers, and the struct itself. The caller must
 * have already detached bt from its queue (see blk_trace_remove()).
 */
static void blk_trace_cleanup(struct blk_trace *bt)
{
	relay_close(bt->rchan);
	debugfs_remove(bt->msg_file);
	debugfs_remove(bt->dropped_file);
	blk_remove_tree(bt->dir);
	free_percpu(bt->sequence);
	free_percpu(bt->msg_data);
	kfree(bt);
}
 254
/*
 * Detach the trace from @q (atomically, via xchg) and free it if it was
 * never started or has been stopped. Returns -EINVAL if no trace was
 * attached.
 *
 * NOTE(review): if the trace is still in Blktrace_running state, bt is
 * detached here but never freed afterwards — confirm this is the
 * intended contract for teardown of a running trace.
 */
int blk_trace_remove(struct request_queue *q)
{
	struct blk_trace *bt;

	bt = xchg(&q->blk_trace, NULL);
	if (!bt)
		return -EINVAL;

	if (bt->trace_state == Blktrace_setup ||
	    bt->trace_state == Blktrace_stopped)
		blk_trace_cleanup(bt);

	return 0;
}
EXPORT_SYMBOL_GPL(blk_trace_remove);
 270
/* Stash the blk_trace (set as i_private at file creation) for read() */
static int blk_dropped_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->i_private;

	return 0;
}
 277
 278static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
 279                                size_t count, loff_t *ppos)
 280{
 281        struct blk_trace *bt = filp->private_data;
 282        char buf[16];
 283
 284        snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
 285
 286        return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
 287}
 288
/* debugfs "dropped" file: read-only count of lost trace events */
static const struct file_operations blk_dropped_fops = {
	.owner =	THIS_MODULE,
	.open =		blk_dropped_open,
	.read =		blk_dropped_read,
};
 294
/* Stash the blk_trace (set as i_private at file creation) for write() */
static int blk_msg_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->i_private;

	return 0;
}
 301
 302static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 303                                size_t count, loff_t *ppos)
 304{
 305        char *msg;
 306        struct blk_trace *bt;
 307
 308        if (count > BLK_TN_MAX_MSG)
 309                return -EINVAL;
 310
 311        msg = kmalloc(count, GFP_KERNEL);
 312        if (msg == NULL)
 313                return -ENOMEM;
 314
 315        if (copy_from_user(msg, buffer, count)) {
 316                kfree(msg);
 317                return -EFAULT;
 318        }
 319
 320        bt = filp->private_data;
 321        __trace_note_message(bt, "%s", msg);
 322        kfree(msg);
 323
 324        return count;
 325}
 326
/* debugfs "msg" file: write-only message injection into the trace */
static const struct file_operations blk_msg_fops = {
	.owner =	THIS_MODULE,
	.open =		blk_msg_open,
	.write =	blk_msg_write,
};
 332
 333/*
 334 * Keep track of how many times we encountered a full subbuffer, to aid
 335 * the user space app in telling how many lost events there were.
 336 */
 337static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
 338                                     void *prev_subbuf, size_t prev_padding)
 339{
 340        struct blk_trace *bt;
 341
 342        if (!relay_buf_full(buf))
 343                return 1;
 344
 345        bt = buf->chan->private_data;
 346        atomic_inc(&bt->dropped);
 347        return 0;
 348}
 349
/* relay callback: delete a per-cpu trace buffer's debugfs file */
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}
 355
/*
 * relay callback: create one debugfs file per cpu buffer, served by the
 * generic relay_file_operations.
 */
static struct dentry *blk_create_buf_file_callback(const char *filename,
						   struct dentry *parent,
						   int mode,
						   struct rchan_buf *buf,
						   int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
					&relay_file_operations);
}
 365
/* relay channel hooks wiring the trace buffers into debugfs */
static struct rchan_callbacks blk_relay_callbacks = {
	.subbuf_start		= blk_subbuf_start_callback,
	.create_buf_file	= blk_create_buf_file_callback,
	.remove_buf_file	= blk_remove_buf_file_callback,
};
 371
 372/*
 373 * Setup everything required to start tracing
 374 */
 375int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 376                        struct blk_user_trace_setup *buts)
 377{
 378        struct blk_trace *old_bt, *bt = NULL;
 379        struct dentry *dir = NULL;
 380        int ret, i;
 381
 382        if (!buts->buf_size || !buts->buf_nr)
 383                return -EINVAL;
 384
 385        strcpy(buts->name, name);
 386
 387        /*
 388         * some device names have larger paths - convert the slashes
 389         * to underscores for this to work as expected
 390         */
 391        for (i = 0; i < strlen(buts->name); i++)
 392                if (buts->name[i] == '/')
 393                        buts->name[i] = '_';
 394
 395        ret = -ENOMEM;
 396        bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 397        if (!bt)
 398                goto err;
 399
 400        bt->sequence = alloc_percpu(unsigned long);
 401        if (!bt->sequence)
 402                goto err;
 403
 404        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
 405        if (!bt->msg_data)
 406                goto err;
 407
 408        ret = -ENOENT;
 409        dir = blk_create_tree(buts->name);
 410        if (!dir)
 411                goto err;
 412
 413        bt->dir = dir;
 414        bt->dev = dev;
 415        atomic_set(&bt->dropped, 0);
 416
 417        ret = -EIO;
 418        bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
 419        if (!bt->dropped_file)
 420                goto err;
 421
 422        bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
 423        if (!bt->msg_file)
 424                goto err;
 425
 426        bt->rchan = relay_open("trace", dir, buts->buf_size,
 427                                buts->buf_nr, &blk_relay_callbacks, bt);
 428        if (!bt->rchan)
 429                goto err;
 430
 431        bt->act_mask = buts->act_mask;
 432        if (!bt->act_mask)
 433                bt->act_mask = (u16) -1;
 434
 435        bt->start_lba = buts->start_lba;
 436        bt->end_lba = buts->end_lba;
 437        if (!bt->end_lba)
 438                bt->end_lba = -1ULL;
 439
 440        bt->pid = buts->pid;
 441        bt->trace_state = Blktrace_setup;
 442
 443        ret = -EBUSY;
 444        old_bt = xchg(&q->blk_trace, bt);
 445        if (old_bt) {
 446                (void) xchg(&q->blk_trace, old_bt);
 447                goto err;
 448        }
 449
 450        return 0;
 451err:
 452        if (dir)
 453                blk_remove_tree(dir);
 454        if (bt) {
 455                if (bt->msg_file)
 456                        debugfs_remove(bt->msg_file);
 457                if (bt->dropped_file)
 458                        debugfs_remove(bt->dropped_file);
 459                free_percpu(bt->sequence);
 460                free_percpu(bt->msg_data);
 461                if (bt->rchan)
 462                        relay_close(bt->rchan);
 463                kfree(bt);
 464        }
 465        return ret;
 466}
 467
 468int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 469                    char __user *arg)
 470{
 471        struct blk_user_trace_setup buts;
 472        int ret;
 473
 474        ret = copy_from_user(&buts, arg, sizeof(buts));
 475        if (ret)
 476                return -EFAULT;
 477
 478        ret = do_blk_trace_setup(q, name, dev, &buts);
 479        if (ret)
 480                return ret;
 481
 482        if (copy_to_user(arg, &buts, sizeof(buts)))
 483                return -EFAULT;
 484
 485        return 0;
 486}
 487EXPORT_SYMBOL_GPL(blk_trace_setup);
 488
/*
 * Start (@start != 0) or stop the trace attached to @q. Returns 0 on a
 * valid state transition, -EINVAL otherwise.
 */
int blk_trace_startstop(struct request_queue *q, int start)
{
	struct blk_trace *bt;
	int ret;

	if ((bt = q->blk_trace) == NULL)
		return -EINVAL;

	/*
	 * For starting a trace, we can transition from a setup or stopped
	 * trace. For stopping a trace, the state must be running
	 */
	ret = -EINVAL;
	if (start) {
		if (bt->trace_state == Blktrace_setup ||
		    bt->trace_state == Blktrace_stopped) {
			/*
			 * New trace run: bump the generation counter so each
			 * task's PROCESS note is re-emitted; the barrier
			 * orders the bump before the state flip that
			 * enables event recording in __blk_add_trace().
			 */
			blktrace_seq++;
			smp_mb();
			bt->trace_state = Blktrace_running;

			/* anchor monotonic event timestamps to wall-clock */
			trace_note_time(bt);
			ret = 0;
		}
	} else {
		if (bt->trace_state == Blktrace_running) {
			bt->trace_state = Blktrace_stopped;
			/* push partially-filled sub-buffers out to readers */
			relay_flush(bt->rchan);
			ret = 0;
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
 523
/**
 * blk_trace_ioctl: - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd:	the ioctl cmd
 * @arg:	the argument data, if any
 *
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	struct request_queue *q;
	int ret, start = 0;
	char b[BDEVNAME_SIZE];

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	/* bd_mutex serializes setup/start/stop/teardown on this device */
	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		bdevname(bdev, b);
		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
		break;
	case BLKTRACESTART:
		start = 1;
		/* fallthrough: START and STOP share blk_trace_startstop() */
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
 564
 565/**
 566 * blk_trace_shutdown: - stop and cleanup trace structures
 567 * @q:    the request queue associated with the device
 568 *
 569 **/
 570void blk_trace_shutdown(struct request_queue *q)
 571{
 572        if (q->blk_trace) {
 573                blk_trace_startstop(q, 0);
 574                blk_trace_remove(q);
 575        }
 576}
 577