linux/block/blk-cgroup.c
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
                                                  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
                              struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
                           struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
        .can_attach = blkiocg_can_attach,
        .attach = blkiocg_attach,
        .destroy = blkiocg_destroy,
        .populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
        /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
        .subsys_id = blkio_subsys_id,
#endif
        .use_id = 1,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/* Must be called with blkcg->lock held */
static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
                                            struct blkio_policy_node *pn)
{
        list_add(&pn->node, &blkcg->policy_list);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
        list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
{
        struct blkio_policy_node *pn;

        list_for_each_entry(pn, &blkcg->policy_list, node) {
                if (pn->dev == dev)
                        return pn;
        }

        return NULL;
}

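/* Map a cgroup to the blkio_cgroup that embeds its subsystem state. */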
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
                                bool sync)
{
        if (direction)
                stat[BLKIO_STAT_WRITE] += add;
        else
                stat[BLKIO_STAT_READ] += add;
        if (sync)
                stat[BLKIO_STAT_SYNC] += add;
        else
                stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable depending on the request type.
 * BUGs if the value is already zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
        if (direction) {
                BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
                stat[BLKIO_STAT_WRITE]--;
        } else {
                BUG_ON(stat[BLKIO_STAT_READ] == 0);
                stat[BLKIO_STAT_READ]--;
        }
        if (sync) {
                BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
                stat[BLKIO_STAT_SYNC]--;
        } else {
                BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
                stat[BLKIO_STAT_ASYNC]--;
        }
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
                                                struct blkio_group *curr_blkg)
{
        if (blkio_blkg_waiting(&blkg->stats))
                return;
        if (blkg == curr_blkg)
                return;
        blkg->stats.start_group_wait_time = sched_clock();
        blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
        unsigned long long now;

        if (!blkio_blkg_waiting(stats))
                return;

        now = sched_clock();
        if (time_after64(now, stats->start_group_wait_time))
                stats->group_wait_time += now - stats->start_group_wait_time;
        blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
        unsigned long long now;

        if (!blkio_blkg_empty(stats))
                return;

        now = sched_clock();
        if (time_after64(now, stats->start_empty_time))
                stats->empty_time += now - stats->start_empty_time;
        blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        BUG_ON(blkio_blkg_idling(&blkg->stats));
        blkg->stats.start_idle_time = sched_clock();
        blkio_mark_blkg_idling(&blkg->stats);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
        unsigned long flags;
        unsigned long long now;
        struct blkio_group_stats *stats;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        if (blkio_blkg_idling(stats)) {
                now = sched_clock();
                if (time_after64(now, stats->start_idle_time))
                        stats->idle_time += now - stats->start_idle_time;
                blkio_clear_blkg_idling(stats);
        }
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
        unsigned long flags;
        struct blkio_group_stats *stats;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        stats->avg_queue_size_sum +=
                        stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
                        stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
        stats->avg_queue_size_samples++;
        blkio_update_group_wait_time(stats);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
        unsigned long flags;
        struct blkio_group_stats *stats;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;

        if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
                        stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
                spin_unlock_irqrestore(&blkg->stats_lock, flags);
                return;
        }

        /*
         * The group is already marked empty. This can happen if a cfqq
         * received a new request in the parent group and was moved to this
         * group while being added to the service tree. Just ignore the
         * event and move on.
         */
        if (blkio_blkg_empty(stats)) {
                spin_unlock_irqrestore(&blkg->stats_lock, flags);
                return;
        }

        stats->start_empty_time = sched_clock();
        blkio_mark_blkg_empty(stats);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
                        unsigned long dequeue)
{
        blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
                                        struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
                        struct blkio_group *curr_blkg, bool direction,
                        bool sync)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
                        sync);
        blkio_end_empty_time(&blkg->stats);
        blkio_set_start_group_wait_time(blkg, curr_blkg);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
                                                bool direction, bool sync)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
                                        direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkg->stats.time += time;
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
                                uint64_t bytes, bool direction, bool sync)
{
        struct blkio_group_stats *stats;
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        stats->sectors += bytes >> 9;
        blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
                        sync);
        blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
                        direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
        uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
        struct blkio_group_stats *stats;
        unsigned long flags;
        unsigned long long now = sched_clock();

        spin_lock_irqsave(&blkg->stats_lock, flags);
        stats = &blkg->stats;
        if (time_after64(now, io_start_time))
                blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
                                now - io_start_time, direction, sync);
        if (time_after64(io_start_time, start_time))
                blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
                                io_start_time - start_time, direction, sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
                                        bool sync)
{
        unsigned long flags;

        spin_lock_irqsave(&blkg->stats_lock, flags);
        blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
                        sync);
        spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                        struct blkio_group *blkg, void *key, dev_t dev)
{
        unsigned long flags;

        spin_lock_irqsave(&blkcg->lock, flags);
        spin_lock_init(&blkg->stats_lock);
        rcu_assign_pointer(blkg->key, key);
        blkg->blkcg_id = css_id(&blkcg->css);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        spin_unlock_irqrestore(&blkcg->lock, flags);
        /* Need to take css reference? */
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
        blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
        hlist_del_init_rcu(&blkg->blkcg_node);
        blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list, or 1 to
 * indicate that the blkio_group had already been unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
        struct blkio_cgroup *blkcg;
        unsigned long flags;
        struct cgroup_subsys_state *css;
        int ret = 1;

        rcu_read_lock();
        css = css_lookup(&blkio_subsys, blkg->blkcg_id);
        if (css) {
                blkcg = container_of(css, struct blkio_cgroup, css);
                spin_lock_irqsave(&blkcg->lock, flags);
                if (!hlist_unhashed(&blkg->blkcg_node)) {
                        __blkiocg_del_blkio_group(blkg);
                        ret = 0;
                }
                spin_unlock_irqrestore(&blkcg->lock, flags);
        }

        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        void *__key;

        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
                __key = blkg->key;
                if (__key == key)
                        return blkg;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

#define SHOW_FUNCTION(__VAR)                                            \
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,                \
                                       struct cftype *cftype)           \
{                                                                       \
        struct blkio_cgroup *blkcg;                                     \
                                                                        \
        blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
        return (u64)blkcg->__VAR;                                       \
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

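/*
 * Write handler for blkio.weight. Updates the cgroup's default weight and
 * propagates it to every group that has no per-device weight override.
 */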
static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;
        struct hlist_node *n;
        struct blkio_policy_type *blkiop;
        struct blkio_policy_node *pn;

        if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
                return -EINVAL;

        blkcg = cgroup_to_blkio_cgroup(cgroup);
        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);
        blkcg->weight = (unsigned int)val;

        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                pn = blkio_policy_search_node(blkcg, blkg->dev);

                if (pn)
                        continue;

                list_for_each_entry(blkiop, &blkio_list, list)
                        blkiop->ops.blkio_update_group_weight_fn(blkg,
                                        blkcg->weight);
        }
        spin_unlock_irq(&blkcg->lock);
        spin_unlock(&blkio_list_lock);
        return 0;
}

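/*
 * Write handler for blkio.reset_stats. Clears all statistics, but preserves
 * the currently queued counters (and, under CONFIG_DEBUG_BLK_CGROUP, the
 * idling/waiting/empty state) so that in-flight accounting stays consistent.
 */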
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;
        struct blkio_group_stats *stats;
        struct hlist_node *n;
        uint64_t queued[BLKIO_STAT_TOTAL];
        int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
        bool idling, waiting, empty;
        unsigned long long now = sched_clock();
#endif

        blkcg = cgroup_to_blkio_cgroup(cgroup);
        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                spin_lock(&blkg->stats_lock);
                stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
                idling = blkio_blkg_idling(stats);
                waiting = blkio_blkg_waiting(stats);
                empty = blkio_blkg_empty(stats);
#endif
                for (i = 0; i < BLKIO_STAT_TOTAL; i++)
                        queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
                memset(stats, 0, sizeof(struct blkio_group_stats));
                for (i = 0; i < BLKIO_STAT_TOTAL; i++)
                        stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
                if (idling) {
                        blkio_mark_blkg_idling(stats);
                        stats->start_idle_time = now;
                }
                if (waiting) {
                        blkio_mark_blkg_waiting(stats);
                        stats->start_group_wait_time = now;
                }
                if (empty) {
                        blkio_mark_blkg_empty(stats);
                        stats->start_empty_time = now;
                }
#endif
                spin_unlock(&blkg->stats_lock);
        }
        spin_unlock_irq(&blkcg->lock);
        return 0;
}

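/*
 * Build the key string for a stat entry: "<major>:<minor>" plus the stat
 * sub-type name, e.g. "8:16 Read" (the device numbers are illustrative).
 * With diskname_only, only the "<major>:<minor>" part is emitted.
 */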
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
                                int chars_left, bool diskname_only)
{
        snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
        chars_left -= strlen(str);
        if (chars_left <= 0) {
                printk(KERN_WARNING
                        "Possibly incorrect cgroup stat display format\n");
                return;
        }
        if (diskname_only)
                return;
        switch (type) {
        case BLKIO_STAT_READ:
                strlcat(str, " Read", chars_left);
                break;
        case BLKIO_STAT_WRITE:
                strlcat(str, " Write", chars_left);
                break;
        case BLKIO_STAT_SYNC:
                strlcat(str, " Sync", chars_left);
                break;
        case BLKIO_STAT_ASYNC:
                strlcat(str, " Async", chars_left);
                break;
        case BLKIO_STAT_TOTAL:
                strlcat(str, " Total", chars_left);
                break;
        default:
                strlcat(str, " Invalid", chars_left);
        }
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
                                struct cgroup_map_cb *cb, dev_t dev)
{
        blkio_get_key_name(0, dev, str, chars_left, true);
        cb->fill(cb, str, val);
        return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
                struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
        uint64_t disk_total;
        char key_str[MAX_KEY_LEN];
        enum stat_sub_type sub_type;

        if (type == BLKIO_STAT_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.time, cb, dev);
        if (type == BLKIO_STAT_SECTORS)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
        if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
                uint64_t sum = blkg->stats.avg_queue_size_sum;
                uint64_t samples = blkg->stats.avg_queue_size_samples;
                if (samples)
                        do_div(sum, samples);
                else
                        sum = 0;
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
        }
        if (type == BLKIO_STAT_GROUP_WAIT_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.group_wait_time, cb, dev);
        if (type == BLKIO_STAT_IDLE_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.idle_time, cb, dev);
        if (type == BLKIO_STAT_EMPTY_TIME)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.empty_time, cb, dev);
        if (type == BLKIO_STAT_DEQUEUE)
                return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                        blkg->stats.dequeue, cb, dev);
#endif

        for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
                        sub_type++) {
                blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
                cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
        }
        disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
                        blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
        blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
        cb->fill(cb, key_str, disk_total);
        return disk_total;
}

#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)                \
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,                \
                struct cftype *cftype, struct cgroup_map_cb *cb)        \
{                                                                       \
        struct blkio_cgroup *blkcg;                                     \
        struct blkio_group *blkg;                                       \
        struct hlist_node *n;                                           \
        uint64_t cgroup_total = 0;                                      \
                                                                        \
        if (!cgroup_lock_live_group(cgroup))                            \
                return -ENODEV;                                         \
                                                                        \
        blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
        rcu_read_lock();                                                \
        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
                if (blkg->dev) {                                        \
                        spin_lock_irq(&blkg->stats_lock);               \
                        cgroup_total += blkio_get_stat(blkg, cb,        \
                                                blkg->dev, type);       \
                        spin_unlock_irq(&blkg->stats_lock);             \
                }                                                       \
        }                                                               \
        if (show_total)                                                 \
                cb->fill(cb, "Total", cgroup_total);                    \
        rcu_read_unlock();                                              \
        cgroup_unlock();                                                \
        return 0;                                                       \
}

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP

static int blkio_check_dev_num(dev_t dev)
{
        int part = 0;
        struct gendisk *disk;

        disk = get_gendisk(dev, &part);
        if (!disk || part)
                return -ENODEV;

        return 0;
}

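/*
 * Parse a rule of the form "<major>:<minor> <weight>" into @newpn. The
 * device must exist and refer to a whole disk, not a partition.
 */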
static int blkio_policy_parse_and_set(char *buf,
                                      struct blkio_policy_node *newpn)
{
        char *s[4], *p, *major_s = NULL, *minor_s = NULL;
        int ret;
        unsigned long major, minor, temp;
        int i = 0;
        dev_t dev;

        memset(s, 0, sizeof(s));

        while ((p = strsep(&buf, " ")) != NULL) {
                if (!*p)
                        continue;

                s[i++] = p;

                /* Prevent the input of too many fields */
                if (i == 3)
                        break;
        }

        if (i != 2)
                return -EINVAL;

        p = strsep(&s[0], ":");
        if (p != NULL)
                major_s = p;
        else
                return -EINVAL;

        minor_s = s[0];
        if (!minor_s)
                return -EINVAL;

        ret = strict_strtoul(major_s, 10, &major);
        if (ret)
                return -EINVAL;

        ret = strict_strtoul(minor_s, 10, &minor);
        if (ret)
                return -EINVAL;

        dev = MKDEV(major, minor);

        ret = blkio_check_dev_num(dev);
        if (ret)
                return ret;

        newpn->dev = dev;

        if (s[1] == NULL)
                return -EINVAL;

        ret = strict_strtoul(s[1], 10, &temp);
        if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
            temp > BLKIO_WEIGHT_MAX)
                return -EINVAL;

        newpn->weight = temp;

        return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
                              dev_t dev)
{
        struct blkio_policy_node *pn;

        pn = blkio_policy_search_node(blkcg, dev);
        if (pn)
                return pn->weight;
        else
                return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

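/*
 * Write handler for blkio.weight_device. The input format is
 * "<major>:<minor> <weight>", e.g. (a sketch; the device numbers are only
 * illustrative):
 *
 *      echo "8:16 300" > blkio.weight_device
 *
 * A weight of 0 deletes the per-device rule, falling back to the cgroup's
 * default weight.
 */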
static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
                                       const char *buffer)
{
        int ret = 0;
        char *buf;
        struct blkio_policy_node *newpn, *pn;
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;
        int keep_newpn = 0;
        struct hlist_node *n;
        struct blkio_policy_type *blkiop;

        buf = kstrdup(buffer, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
        if (!newpn) {
                ret = -ENOMEM;
                goto free_buf;
        }

        ret = blkio_policy_parse_and_set(buf, newpn);
        if (ret)
                goto free_newpn;

        blkcg = cgroup_to_blkio_cgroup(cgrp);

        spin_lock_irq(&blkcg->lock);

        pn = blkio_policy_search_node(blkcg, newpn->dev);
        if (!pn) {
                if (newpn->weight != 0) {
                        blkio_policy_insert_node(blkcg, newpn);
                        keep_newpn = 1;
                }
                spin_unlock_irq(&blkcg->lock);
                goto update_io_group;
        }

        if (newpn->weight == 0) {
                /* weight == 0 means deleting a specific weight */
                blkio_policy_delete_node(pn);
                spin_unlock_irq(&blkcg->lock);
                goto update_io_group;
        }

        /* Update the existing rule while still under blkcg->lock. */
        pn->weight = newpn->weight;
        spin_unlock_irq(&blkcg->lock);

update_io_group:
        /* update weight for each cfqg */
        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);

        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                if (newpn->dev == blkg->dev) {
                        list_for_each_entry(blkiop, &blkio_list, list)
                                blkiop->ops.blkio_update_group_weight_fn(blkg,
                                                         newpn->weight ?
                                                         newpn->weight :
                                                         blkcg->weight);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        spin_unlock(&blkio_list_lock);

free_newpn:
        if (!keep_newpn)
                kfree(newpn);
free_buf:
        kfree(buf);
        return ret;
}

static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
                                      struct seq_file *m)
{
        struct blkio_cgroup *blkcg;
        struct blkio_policy_node *pn;

        seq_printf(m, "dev\tweight\n");

        blkcg = cgroup_to_blkio_cgroup(cgrp);
        if (!list_empty(&blkcg->policy_list)) {
                spin_lock_irq(&blkcg->lock);
                list_for_each_entry(pn, &blkcg->policy_list, node) {
                        seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
                                   MINOR(pn->dev), pn->weight);
                }
                spin_unlock_irq(&blkcg->lock);
        }

        return 0;
}

struct cftype blkio_files[] = {
        {
                .name = "weight_device",
                .read_seq_string = blkiocg_weight_device_read,
                .write_string = blkiocg_weight_device_write,
                .max_write_len = 256,
        },
        {
                .name = "weight",
                .read_u64 = blkiocg_weight_read,
                .write_u64 = blkiocg_weight_write,
        },
        {
                .name = "time",
                .read_map = blkiocg_time_read,
        },
        {
                .name = "sectors",
                .read_map = blkiocg_sectors_read,
        },
        {
                .name = "io_service_bytes",
                .read_map = blkiocg_io_service_bytes_read,
        },
        {
                .name = "io_serviced",
                .read_map = blkiocg_io_serviced_read,
        },
        {
                .name = "io_service_time",
                .read_map = blkiocg_io_service_time_read,
        },
        {
                .name = "io_wait_time",
                .read_map = blkiocg_io_wait_time_read,
        },
        {
                .name = "io_merged",
                .read_map = blkiocg_io_merged_read,
        },
        {
                .name = "io_queued",
                .read_map = blkiocg_io_queued_read,
        },
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
#ifdef CONFIG_DEBUG_BLK_CGROUP
        {
                .name = "avg_queue_size",
                .read_map = blkiocg_avg_queue_size_read,
        },
        {
                .name = "group_wait_time",
                .read_map = blkiocg_group_wait_time_read,
        },
        {
                .name = "idle_time",
                .read_map = blkiocg_idle_time_read,
        },
        {
                .name = "empty_time",
                .read_map = blkiocg_empty_time_read,
        },
        {
                .name = "dequeue",
                .read_map = blkiocg_dequeue_read,
        },
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        return cgroup_add_files(cgroup, subsys, blkio_files,
                                ARRAY_SIZE(blkio_files));
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        unsigned long flags;
        struct blkio_group *blkg;
        void *key;
        struct blkio_policy_type *blkiop;
        struct blkio_policy_node *pn, *pntmp;

        rcu_read_lock();
        do {
                spin_lock_irqsave(&blkcg->lock, flags);

                if (hlist_empty(&blkcg->blkg_list)) {
                        spin_unlock_irqrestore(&blkcg->lock, flags);
                        break;
                }

                blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
                                        blkcg_node);
                key = rcu_dereference(blkg->key);
                __blkiocg_del_blkio_group(blkg);

                spin_unlock_irqrestore(&blkcg->lock, flags);

                /*
                 * This blkio_group is being unlinked because the associated
                 * cgroup is going away. Let all the IO controlling policies
                 * know about this event. Currently this is a static call to
                 * one IO controlling policy; once more policies are in
                 * place, we will need dynamic registration of callback
                 * functions.
                 */
                spin_lock(&blkio_list_lock);
                list_for_each_entry(blkiop, &blkio_list, list)
                        blkiop->ops.blkio_unlink_group_fn(key, blkg);
                spin_unlock(&blkio_list_lock);
        } while (1);

        list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
                blkio_policy_delete_node(pn);
                kfree(pn);
        }

        free_css_id(&blkio_subsys, &blkcg->css);
        rcu_read_unlock();
        if (blkcg != &blkio_root_cgroup)
                kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkio_root_cgroup;
                goto done;
        }

        /* Currently we do not support hierarchies deeper than two levels (0,1) */
        if (parent != cgroup->top_cgroup)
                return ERR_PTR(-EINVAL);

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        INIT_LIST_HEAD(&blkcg->policy_list);
        return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means of supporting
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
                                struct cgroup *cgroup, struct task_struct *tsk,
                                bool threadgroup)
{
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        task_lock(tsk);
        ioc = tsk->io_context;
        if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                ret = -EINVAL;
        task_unlock(tsk);

        return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
                                struct cgroup *prev, struct task_struct *tsk,
                                bool threadgroup)
{
        struct io_context *ioc;

        task_lock(tsk);
        ioc = tsk->io_context;
        if (ioc)
                ioc->cgroup_changed = 1;
        task_unlock(tsk);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
        spin_lock(&blkio_list_lock);
        list_add_tail(&blkiop->list, &blkio_list);
        spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
        spin_lock(&blkio_list_lock);
        list_del_init(&blkiop->list);
        spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
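
/*
 * Example (a sketch, not part of this file): an IO scheduler registers its
 * callbacks at init time, roughly as CFQ does:
 *
 *      static struct blkio_policy_type blkio_policy_cfq = {
 *              .ops = {
 *                      .blkio_unlink_group_fn = cfq_unlink_blkio_group,
 *                      .blkio_update_group_weight_fn =
 *                                      cfq_update_blkio_group_weight,
 *              },
 *      };
 *
 *      blkio_policy_register(&blkio_policy_cfq);
 */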

static int __init init_cgroup_blkio(void)
{
        return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
        cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");