linux/block/blk-cgroup.c
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkcg_root);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd = blkg->pd[i];

                if (!pd)
                        continue;

                if (pol && pol->pd_exit_fn)
                        pol->pd_exit_fn(blkg);

                kfree(pd);
        }

        blk_exit_rl(&blkg->rl);
        kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                                   gfp_t gfp_mask)
{
        struct blkcg_gq *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
        if (!blkg)
                return NULL;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;

        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
        if (blkcg != &blkcg_root) {
                if (blk_init_rl(&blkg->rl, q, gfp_mask))
                        goto err_free;
                blkg->rl.blkg = blkg;
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
                if (!pd)
                        goto err_free;

                blkg->pd[i] = pd;
                pd->blkg = blkg;

                /* invoke per-policy init */
                if (blkcg_policy_enabled(blkg->q, pol))
                        pol->pd_init_fn(blkg);
        }

        return blkg;

err_free:
        blkg_free(blkg);
        return NULL;
}

static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
                                      struct request_queue *q)
{
        struct blkcg_gq *blkg;

        blkg = rcu_dereference(blkcg->blkg_hint);
        if (blkg && blkg->q == q)
                return blkg;

        /*
         * Hint didn't match.  Look up from the radix tree.  Note that we
         * may not be holding queue_lock and thus are not sure whether
         * @blkg from blkg_tree has already been removed or not, so we
         * can't update hint to the lookup result.  Leave it to the caller.
         */
        blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (blkg && blkg->q == q)
                return blkg;

        return NULL;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 * - see blk_queue_bypass_start() for details.
 */
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (unlikely(blk_queue_bypass(q)))
                return NULL;
        return __blkg_lookup(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup);
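
/*
 * Usage sketch (illustrative, not part of this file): a policy typically
 * looks up its blkg from the IO submission path under RCU.  The names
 * "foo", struct foo_grp, foo_grp_from_blkg() and foo_charge() are
 * hypothetical placeholders for a policy's own type and accessors; real
 * users such as blk-throttle follow the same pattern, obtaining the blkcg
 * from e.g. bio_blkcg().
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(bio_blkcg(bio), q);
 *	if (blkg) {
 *		struct foo_grp *fg = foo_grp_from_blkg(blkg);
 *
 *		foo_charge(fg, bio);
 *	}
 *	rcu_read_unlock();
 */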

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
                                             struct request_queue *q,
                                             struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        int ret;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /* lookup and update hint on success, see __blkg_lookup() for details */
        blkg = __blkg_lookup(blkcg, q);
        if (blkg) {
                rcu_assign_pointer(blkcg->blkg_hint, blkg);
                goto out_free;
        }

        /* blkg holds a reference to blkcg */
        if (!css_tryget(&blkcg->css)) {
                blkg = ERR_PTR(-EINVAL);
                goto out_free;
        }

        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
                if (unlikely(!new_blkg)) {
                        blkg = ERR_PTR(-ENOMEM);
                        goto out_put;
                }
        }
        blkg = new_blkg;

        /* insert */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &q->blkg_list);
        }
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        blkg = ERR_PTR(ret);
out_put:
        css_put(&blkcg->css);
out_free:
        blkg_free(new_blkg);
        return blkg;
}

struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q)
{
        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
        return __blkg_lookup_create(blkcg, q, NULL);
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;

        lockdep_assert_held(blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something is wrong if we are trying to remove the same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
        struct blkcg_gq *blkg, *n;

        lockdep_assert_held(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }

        /*
         * root blkg is destroyed.  Just clear the pointer since
         * root_rl does not take reference on root blkg.
         */
        q->root_blkg = NULL;
        q->root_rl.blkg = NULL;
}

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
        blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
}

void __blkg_release(struct blkcg_gq *blkg)
{
        /* release the extra blkcg reference this blkg has been holding */
        css_put(&blkg->blkcg->css);

        /*
         * A group is freed in an RCU manner.  But holding an RCU read lock
         * does not mean that one can access all the fields of blkg and
         * assume these are valid.  For example, don't try to follow
         * throtl_data and request queue links.
         *
         * Having a reference to blkg under an RCU read lock allows access
         * only to values local to groups, like group stats and group rate
         * limits.
         */
        call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);
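
/*
 * Reference counting sketch (illustrative): blkg_get()/blkg_put() in
 * blk-cgroup.h adjust blkg->refcnt under the queue lock, and the final
 * blkg_put() ends up in __blkg_release() above, which drops the blkcg css
 * reference and defers blkg_free() to an RCU grace period.  A holder that
 * pins a blkg beyond the current queue_lock section therefore looks
 * roughly like:
 *
 *	spin_lock_irq(q->queue_lock);
 *	blkg_get(blkg);			(pin for deferred work)
 *	spin_unlock_irq(q->queue_lock);
 *	...
 *	spin_lock_irq(q->queue_lock);
 *	blkg_put(blkg);			(last put -> __blkg_release())
 *	spin_unlock_irq(q->queue_lock);
 */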

/*
 * The next function used by blk_queue_for_each_rl().  It's a bit tricky
 * because the root blkg uses @q->root_rl instead of its own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
                                         struct request_queue *q)
{
        struct list_head *ent;
        struct blkcg_gq *blkg;

        /*
         * Determine the current blkg list_head.  The first entry is
         * root_rl which is off @q->blkg_list and mapped to the head.
         */
        if (rl == &q->root_rl) {
                ent = &q->blkg_list;
        } else {
                blkg = container_of(rl, struct blkcg_gq, rl);
                ent = &blkg->q_node;
        }

        /* walk to the next list_head, skip root blkcg */
        ent = ent->next;
        if (ent == &q->root_blkg->q_node)
                ent = ent->next;
        if (ent == &q->blkg_list)
                return NULL;

        blkg = container_of(ent, struct blkcg_gq, q_node);
        return &blkg->rl;
}
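
/*
 * For reference, the companion iterator in blk.h is along these lines
 * (sketch, not verbatim):
 *
 *	#define blk_queue_for_each_rl(rl, q)	\
 *		for ((rl) = &(q)->root_rl; (rl); \
 *		     (rl) = __blk_queue_next_rl((rl), (q)))
 *
 * i.e. iteration starts at @q->root_rl and __blk_queue_next_rl() above
 * returns %NULL once the walk wraps back around to @q->blkg_list.
 */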

static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
                             u64 val)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
        struct blkcg_gq *blkg;
        struct hlist_node *n;
        int i;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkcg_policy_enabled(blkg->q, pol) &&
                            pol->pd_reset_stats_fn)
                                pol->pd_reset_stats_fn(blkg);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

static const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: whether to print the sum of the @prfill return values
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        struct hlist_node *n;
        u64 total = 0;

        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkcg_policy_enabled(blkg->q, pol))
                        total += prfill(sf, blkg->pd[pol->plid], data);
        spin_unlock_irq(&blkcg->lock);

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
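
/*
 * Usage sketch (illustrative): a policy wires blkcg_print_blkgs() into a
 * cftype->read_seq_string callback, passing one of the prfill helpers
 * below plus the offset of the counter inside its policy data (whose
 * first member is assumed to be the struct blkg_policy_data).  "foo",
 * struct foo_grp and blkcg_policy_foo are hypothetical placeholders.
 *
 *	static int foo_print_stat(struct cgroup *cgrp, struct cftype *cft,
 *				  struct seq_file *sf)
 *	{
 *		blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), blkg_prfill_stat,
 *				  &blkcg_policy_foo,
 *				  offsetof(struct foo_grp, stat), false);
 *		return 0;
 *	}
 */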

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
        };
        const char *dname = blkg_dev_name(pd->blkg);
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)rwstat->cnt[i]);

        v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
        return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read lock and queue lock held and
 * must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   const char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
{
        struct gendisk *disk;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
        unsigned long long v;
        int part, ret;

        if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
                return -EINVAL;

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk || part)
                return -EINVAL;

        rcu_read_lock();
        spin_lock_irq(disk->queue->queue_lock);

        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
                blkg = ERR_PTR(-EINVAL);

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                rcu_read_unlock();
                spin_unlock_irq(disk->queue->queue_lock);
                put_disk(disk);
                /*
                 * If queue was bypassing, we should retry.  Do so after a
                 * short msleep().  It isn't strictly necessary but queue
                 * can be bypassing for some time and it's always nice to
                 * avoid busy looping.
                 */
                if (ret == -EBUSY) {
                        msleep(10);
                        ret = restart_syscall();
                }
                return ret;
        }

        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->v = v;
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
        put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
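
/*
 * Usage sketch (illustrative): a policy's per-device configuration write
 * callback pairs blkg_conf_prep() and blkg_conf_finish() around the
 * update.  "foo", struct foo_grp, foo_grp_from_blkg() and blkcg_policy_foo
 * are hypothetical placeholders; cfq's weight_device interface has this
 * shape.
 *
 *	static int foo_set_limit(struct cgroup *cgrp, struct cftype *cft,
 *				 const char *buf)
 *	{
 *		struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 *		struct blkg_conf_ctx ctx;
 *		struct foo_grp *fg;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		fg = foo_grp_from_blkg(ctx.blkg);
 *		fg->limit = ctx.v;
 *
 *		blkg_conf_finish(&ctx);
 *		return 0;
 *	}
 */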

struct cftype blkcg_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }     /* terminate */
};

/**
 * blkcg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.  blkgs
 * should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock
 * dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkcg_pre_destroy(struct cgroup *cgroup)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        return 0;
}

static void blkcg_destroy(struct cgroup *cgroup)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

        if (blkcg != &blkcg_root)
                kfree(blkcg);
}

static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
{
        static atomic64_t id_seq = ATOMIC64_INIT(0);
        struct blkcg *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkcg_root;
                goto done;
        }

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
        blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        might_sleep();

        return blk_throtl_init(q);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        spin_lock_irq(q->queue_lock);
        blkg_destroy_all(q);
        spin_unlock_irq(q->queue_lock);

        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, cgrp, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkcg_create,
        .can_attach = blkcg_can_attach,
        .pre_destroy = blkcg_pre_destroy,
        .destroy = blkcg_destroy,
        .subsys_id = blkio_subsys_id,
        .base_cftypes = blkcg_files,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
{
        LIST_HEAD(pds);
        struct blkcg_gq *blkg;
        struct blkg_policy_data *pd, *n;
        int cnt = 0, ret;
        bool preloaded;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        /* preallocations for root blkg */
        blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
        if (!blkg)
                return -ENOMEM;

        preloaded = !radix_tree_preload(GFP_KERNEL);

        blk_queue_bypass_start(q);

        /* make sure the root blkg exists and count the existing blkgs */
        spin_lock_irq(q->queue_lock);

        rcu_read_lock();
        blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
        rcu_read_unlock();

        if (preloaded)
                radix_tree_preload_end();

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto out_unlock;
        }
        q->root_blkg = blkg;
        q->root_rl.blkg = blkg;

        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;

        spin_unlock_irq(q->queue_lock);

        /* allocate policy_data for all existing blkgs */
        while (cnt--) {
                pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
                if (!pd) {
                        ret = -ENOMEM;
                        goto out_free;
                }
                list_add_tail(&pd->alloc_node, &pds);
        }

        /*
         * Install the allocated pds.  With @q bypassing, no new blkg
         * should have been created while the queue lock was dropped.
         */
        spin_lock_irq(q->queue_lock);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (WARN_ON(list_empty(&pds))) {
                        /* umm... this shouldn't happen, just abort */
                        ret = -ENOMEM;
                        goto out_unlock;
                }
                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
                list_del_init(&pd->alloc_node);

                /* grab blkcg lock too while installing @pd on @blkg */
                spin_lock(&blkg->blkcg->lock);

                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pol->pd_init_fn(blkg);

                spin_unlock(&blkg->blkcg->lock);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
out_unlock:
        spin_unlock_irq(q->queue_lock);
out_free:
        blk_queue_bypass_end(q);
        list_for_each_entry_safe(pd, n, &pds, alloc_node)
                kfree(pd);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
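
/*
 * Usage sketch (illustrative): a policy activates itself on a queue from a
 * %GFP_KERNEL context such as its elevator or throttle init path, and
 * deactivates on the corresponding exit path.  blkcg_policy_foo and the
 * foo_*_queue() functions are hypothetical placeholders.
 *
 *	static int foo_init_queue(struct request_queue *q)
 *	{
 *		int ret;
 *
 *		ret = blkcg_activate_policy(q, &blkcg_policy_foo);
 *		if (ret)
 *			return ret;
 *		...
 *		return 0;
 *	}
 *
 *	static void foo_exit_queue(struct request_queue *q)
 *	{
 *		...
 *		blkcg_deactivate_policy(q, &blkcg_policy_foo);
 *	}
 */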

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol)
{
        struct blkcg_gq *blkg;

        if (!blkcg_policy_enabled(q, pol))
                return;

        blk_queue_bypass_start(q);
        spin_lock_irq(q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        /* if no policy is left, no need for blkgs - shoot them down */
        if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
                blkg_destroy_all(q);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);

                if (pol->pd_exit_fn)
                        pol->pd_exit_fn(blkg);

                kfree(blkg->pd[pol->plid]);
                blkg->pd[pol->plid] = NULL;

                spin_unlock(&blkg->blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
        blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        int i, ret;

        if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
                return -EINVAL;

        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS)
                goto out_unlock;

        /* register and update blkgs */
        pol->plid = i;
        blkcg_policy[i] = pol;

        /* everything is in place, add intf files for the new policy */
        if (pol->cftypes)
                WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
        ret = 0;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
        mutex_lock(&blkcg_pol_mutex);

        if (WARN_ON(blkcg_policy[pol->plid] != pol))
                goto out_unlock;

        /* kill the intf files first */
        if (pol->cftypes)
                cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);

        /* unregister and update blkgs */
        blkcg_policy[pol->plid] = NULL;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
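
/*
 * Registration sketch (illustrative): a policy module describes itself
 * with a struct blkcg_policy and registers it on load.  All "foo" names
 * and callbacks here are hypothetical placeholders; see blk-throttle and
 * cfq-iosched for real registrations.
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.pd_size		= sizeof(struct foo_grp),
 *		.cftypes		= foo_files,
 *		.pd_init_fn		= foo_pd_init,
 *		.pd_exit_fn		= foo_pd_exit,
 *		.pd_reset_stats_fn	= foo_pd_reset_stats,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return blkcg_policy_register(&blkcg_policy_foo);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		blkcg_policy_unregister(&blkcg_policy_foo);
 *	}
 *
 *	module_init(foo_init);
 *	module_exit(foo_exit);
 */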