linux/block/blktrace.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License version 2 as
   6 * published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 * GNU General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  16 *
  17 */
  18#include <linux/kernel.h>
  19#include <linux/blkdev.h>
  20#include <linux/blktrace_api.h>
  21#include <linux/percpu.h>
  22#include <linux/init.h>
  23#include <linux/mutex.h>
  24#include <linux/debugfs.h>
  25#include <linux/time.h>
  26#include <asm/uaccess.h>
  27
  28static unsigned int blktrace_seq __read_mostly = 1;
  29
  30/*
  31 * Send out a notify message.
  32 */
  33static void trace_note(struct blk_trace *bt, pid_t pid, int action,
  34                       const void *data, size_t len)
  35{
  36        struct blk_io_trace *t;
  37
  38        t = relay_reserve(bt->rchan, sizeof(*t) + len);
  39        if (t) {
  40                const int cpu = smp_processor_id();
  41
  42                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
  43                t->time = ktime_to_ns(ktime_get());
  44                t->device = bt->dev;
  45                t->action = action;
  46                t->pid = pid;
  47                t->cpu = cpu;
  48                t->pdu_len = len;
  49                memcpy((void *) t + sizeof(*t), data, len);
  50        }
  51}
  52
  53/*
  54 * Send out a notify for this process, if we haven't done so since a trace
  55 * started
  56 */
  57static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
  58{
  59        tsk->btrace_seq = blktrace_seq;
  60        trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
  61}
  62
  63static void trace_note_time(struct blk_trace *bt)
  64{
  65        struct timespec now;
  66        unsigned long flags;
  67        u32 words[2];
  68
  69        getnstimeofday(&now);
  70        words[0] = now.tv_sec;
  71        words[1] = now.tv_nsec;
  72
  73        local_irq_save(flags);
  74        trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
  75        local_irq_restore(flags);
  76}
  77
  78static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  79                         pid_t pid)
  80{
  81        if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
  82                return 1;
  83        if (sector < bt->start_lba || sector > bt->end_lba)
  84                return 1;
  85        if (bt->pid && pid != bt->pid)
  86                return 1;
  87
  88        return 0;
  89}
  90
  91/*
  92 * Data direction bit lookup
  93 */
  94static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
  95
  96/*
  97 * Bio action bits of interest
  98 */
  99static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
 100
 101/*
 102 * More could be added as needed, taking care to increment the decrementer
 103 * to get correct indexing
 104 */
 105#define trace_barrier_bit(rw)   \
 106        (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
 107#define trace_sync_bit(rw)      \
 108        (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
 109#define trace_ahead_bit(rw)     \
 110        (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
 111#define trace_meta_bit(rw)      \
 112        (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
 113
 114/*
 115 * The worker for the various blk_add_trace*() types. Fills out a
 116 * blk_io_trace structure and places it in a per-cpu subbuffer.
 117 */
 118void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 119                     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 120{
 121        struct task_struct *tsk = current;
 122        struct blk_io_trace *t;
 123        unsigned long flags;
 124        unsigned long *sequence;
 125        pid_t pid;
 126        int cpu;
 127
 128        if (unlikely(bt->trace_state != Blktrace_running))
 129                return;
 130
 131        what |= ddir_act[rw & WRITE];
 132        what |= bio_act[trace_barrier_bit(rw)];
 133        what |= bio_act[trace_sync_bit(rw)];
 134        what |= bio_act[trace_ahead_bit(rw)];
 135        what |= bio_act[trace_meta_bit(rw)];
 136
 137        pid = tsk->pid;
 138        if (unlikely(act_log_check(bt, what, sector, pid)))
 139                return;
 140
 141        /*
 142         * A word about the locking here - we disable interrupts to reserve
 143         * some space in the relay per-cpu buffer, to prevent an irq
 144         * from coming in and stepping on our toes. Once reserved, it's
 145         * enough to get preemption disabled to prevent read of this data
 146         * before we are through filling it. get_cpu()/put_cpu() does this
 147         * for us
 148         */
 149        local_irq_save(flags);
 150
 151        if (unlikely(tsk->btrace_seq != blktrace_seq))
 152                trace_note_tsk(bt, tsk);
 153
 154        t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
 155        if (t) {
 156                cpu = smp_processor_id();
 157                sequence = per_cpu_ptr(bt->sequence, cpu);
 158
 159                t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 160                t->sequence = ++(*sequence);
 161                t->time = ktime_to_ns(ktime_get());
 162                t->sector = sector;
 163                t->bytes = bytes;
 164                t->action = what;
 165                t->pid = pid;
 166                t->device = bt->dev;
 167                t->cpu = cpu;
 168                t->error = error;
 169                t->pdu_len = pdu_len;
 170
 171                if (pdu_len)
 172                        memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 173        }
 174
 175        local_irq_restore(flags);
 176}
 177
 178EXPORT_SYMBOL_GPL(__blk_add_trace);
 179
 180static struct dentry *blk_tree_root;
 181static DEFINE_MUTEX(blk_tree_mutex);
 182static unsigned int root_users;
 183
 184static inline void blk_remove_root(void)
 185{
 186        if (blk_tree_root) {
 187                debugfs_remove(blk_tree_root);
 188                blk_tree_root = NULL;
 189        }
 190}
 191
 192static void blk_remove_tree(struct dentry *dir)
 193{
 194        mutex_lock(&blk_tree_mutex);
 195        debugfs_remove(dir);
 196        if (--root_users == 0)
 197                blk_remove_root();
 198        mutex_unlock(&blk_tree_mutex);
 199}
 200
 201static struct dentry *blk_create_tree(const char *blk_name)
 202{
 203        struct dentry *dir = NULL;
 204        int created = 0;
 205
 206        mutex_lock(&blk_tree_mutex);
 207
 208        if (!blk_tree_root) {
 209                blk_tree_root = debugfs_create_dir("block", NULL);
 210                if (!blk_tree_root)
 211                        goto err;
 212                created = 1;
 213        }
 214
 215        dir = debugfs_create_dir(blk_name, blk_tree_root);
 216        if (dir)
 217                root_users++;
 218        else {
 219                /* Delete root only if we created it */
 220                if (created)
 221                        blk_remove_root();
 222        }
 223
 224err:
 225        mutex_unlock(&blk_tree_mutex);
 226        return dir;
 227}
 228
 229static void blk_trace_cleanup(struct blk_trace *bt)
 230{
 231        relay_close(bt->rchan);
 232        debugfs_remove(bt->dropped_file);
 233        blk_remove_tree(bt->dir);
 234        free_percpu(bt->sequence);
 235        kfree(bt);
 236}
 237
 238static int blk_trace_remove(struct request_queue *q)
 239{
 240        struct blk_trace *bt;
 241
 242        bt = xchg(&q->blk_trace, NULL);
 243        if (!bt)
 244                return -EINVAL;
 245
 246        if (bt->trace_state == Blktrace_setup ||
 247            bt->trace_state == Blktrace_stopped)
 248                blk_trace_cleanup(bt);
 249
 250        return 0;
 251}
 252
 253static int blk_dropped_open(struct inode *inode, struct file *filp)
 254{
 255        filp->private_data = inode->i_private;
 256
 257        return 0;
 258}
 259
 260static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
 261                                size_t count, loff_t *ppos)
 262{
 263        struct blk_trace *bt = filp->private_data;
 264        char buf[16];
 265
 266        snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
 267
 268        return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
 269}
 270
 271static const struct file_operations blk_dropped_fops = {
 272        .owner =        THIS_MODULE,
 273        .open =         blk_dropped_open,
 274        .read =         blk_dropped_read,
 275};
 276
 277/*
 278 * Keep track of how many times we encountered a full subbuffer, to aid
 279 * the user space app in telling how many lost events there were.
 280 */
 281static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
 282                                     void *prev_subbuf, size_t prev_padding)
 283{
 284        struct blk_trace *bt;
 285
 286        if (!relay_buf_full(buf))
 287                return 1;
 288
 289        bt = buf->chan->private_data;
 290        atomic_inc(&bt->dropped);
 291        return 0;
 292}
 293
 294static int blk_remove_buf_file_callback(struct dentry *dentry)
 295{
 296        debugfs_remove(dentry);
 297        return 0;
 298}
 299
 300static struct dentry *blk_create_buf_file_callback(const char *filename,
 301                                                   struct dentry *parent,
 302                                                   int mode,
 303                                                   struct rchan_buf *buf,
 304                                                   int *is_global)
 305{
 306        return debugfs_create_file(filename, mode, parent, buf,
 307                                        &relay_file_operations);
 308}
 309
 310static struct rchan_callbacks blk_relay_callbacks = {
 311        .subbuf_start           = blk_subbuf_start_callback,
 312        .create_buf_file        = blk_create_buf_file_callback,
 313        .remove_buf_file        = blk_remove_buf_file_callback,
 314};
 315
 316/*
 317 * Setup everything required to start tracing
 318 */
 319int do_blk_trace_setup(struct request_queue *q, struct block_device *bdev,
 320                        struct blk_user_trace_setup *buts)
 321{
 322        struct blk_trace *old_bt, *bt = NULL;
 323        struct dentry *dir = NULL;
 324        char b[BDEVNAME_SIZE];
 325        int ret, i;
 326
 327        if (!buts->buf_size || !buts->buf_nr)
 328                return -EINVAL;
 329
 330        strcpy(buts->name, bdevname(bdev, b));
 331
 332        /*
 333         * some device names have larger paths - convert the slashes
 334         * to underscores for this to work as expected
 335         */
 336        for (i = 0; i < strlen(buts->name); i++)
 337                if (buts->name[i] == '/')
 338                        buts->name[i] = '_';
 339
 340        ret = -ENOMEM;
 341        bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 342        if (!bt)
 343                goto err;
 344
 345        bt->sequence = alloc_percpu(unsigned long);
 346        if (!bt->sequence)
 347                goto err;
 348
 349        ret = -ENOENT;
 350        dir = blk_create_tree(buts->name);
 351        if (!dir)
 352                goto err;
 353
 354        bt->dir = dir;
 355        bt->dev = bdev->bd_dev;
 356        atomic_set(&bt->dropped, 0);
 357
 358        ret = -EIO;
 359        bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
 360        if (!bt->dropped_file)
 361                goto err;
 362
 363        bt->rchan = relay_open("trace", dir, buts->buf_size,
 364                                buts->buf_nr, &blk_relay_callbacks, bt);
 365        if (!bt->rchan)
 366                goto err;
 367
 368        bt->act_mask = buts->act_mask;
 369        if (!bt->act_mask)
 370                bt->act_mask = (u16) -1;
 371
 372        bt->start_lba = buts->start_lba;
 373        bt->end_lba = buts->end_lba;
 374        if (!bt->end_lba)
 375                bt->end_lba = -1ULL;
 376
 377        bt->pid = buts->pid;
 378        bt->trace_state = Blktrace_setup;
 379
 380        ret = -EBUSY;
 381        old_bt = xchg(&q->blk_trace, bt);
 382        if (old_bt) {
 383                (void) xchg(&q->blk_trace, old_bt);
 384                goto err;
 385        }
 386
 387        return 0;
 388err:
 389        if (dir)
 390                blk_remove_tree(dir);
 391        if (bt) {
 392                if (bt->dropped_file)
 393                        debugfs_remove(bt->dropped_file);
 394                free_percpu(bt->sequence);
 395                if (bt->rchan)
 396                        relay_close(bt->rchan);
 397                kfree(bt);
 398        }
 399        return ret;
 400}
 401
 402static int blk_trace_setup(struct request_queue *q, struct block_device *bdev,
 403                           char __user *arg)
 404{
 405        struct blk_user_trace_setup buts;
 406        int ret;
 407
 408        ret = copy_from_user(&buts, arg, sizeof(buts));
 409        if (ret)
 410                return -EFAULT;
 411
 412        ret = do_blk_trace_setup(q, bdev, &buts);
 413        if (ret)
 414                return ret;
 415
 416        if (copy_to_user(arg, &buts, sizeof(buts)))
 417                return -EFAULT;
 418
 419        return 0;
 420}
 421
 422static int blk_trace_startstop(struct request_queue *q, int start)
 423{
 424        struct blk_trace *bt;
 425        int ret;
 426
 427        if ((bt = q->blk_trace) == NULL)
 428                return -EINVAL;
 429
 430        /*
 431         * For starting a trace, we can transition from a setup or stopped
 432         * trace. For stopping a trace, the state must be running
 433         */
 434        ret = -EINVAL;
 435        if (start) {
 436                if (bt->trace_state == Blktrace_setup ||
 437                    bt->trace_state == Blktrace_stopped) {
 438                        blktrace_seq++;
 439                        smp_mb();
 440                        bt->trace_state = Blktrace_running;
 441
 442                        trace_note_time(bt);
 443                        ret = 0;
 444                }
 445        } else {
 446                if (bt->trace_state == Blktrace_running) {
 447                        bt->trace_state = Blktrace_stopped;
 448                        relay_flush(bt->rchan);
 449                        ret = 0;
 450                }
 451        }
 452
 453        return ret;
 454}
 455
 456/**
 457 * blk_trace_ioctl: - handle the ioctls associated with tracing
 458 * @bdev:       the block device
 459 * @cmd:        the ioctl cmd
 460 * @arg:        the argument data, if any
 461 *
 462 **/
 463int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 464{
 465        struct request_queue *q;
 466        int ret, start = 0;
 467
 468        q = bdev_get_queue(bdev);
 469        if (!q)
 470                return -ENXIO;
 471
 472        mutex_lock(&bdev->bd_mutex);
 473
 474        switch (cmd) {
 475        case BLKTRACESETUP:
 476                ret = blk_trace_setup(q, bdev, arg);
 477                break;
 478        case BLKTRACESTART:
 479                start = 1;
 480        case BLKTRACESTOP:
 481                ret = blk_trace_startstop(q, start);
 482                break;
 483        case BLKTRACETEARDOWN:
 484                ret = blk_trace_remove(q);
 485                break;
 486        default:
 487                ret = -ENOTTY;
 488                break;
 489        }
 490
 491        mutex_unlock(&bdev->bd_mutex);
 492        return ret;
 493}
 494
 495/**
 496 * blk_trace_shutdown: - stop and cleanup trace structures
 497 * @q:    the request queue associated with the device
 498 *
 499 **/
 500void blk_trace_shutdown(struct request_queue *q)
 501{
 502        if (q->blk_trace) {
 503                blk_trace_startstop(q, 0);
 504                blk_trace_remove(q);
 505        }
 506}
 507
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.