linux/fs/ext4/ialloc.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/ext4/ialloc.c
   3 *
   4 * Copyright (C) 1992, 1993, 1994, 1995
   5 * Remy Card (card@masi.ibp.fr)
   6 * Laboratoire MASI - Institut Blaise Pascal
   7 * Universite Pierre et Marie Curie (Paris VI)
   8 *
   9 *  BSD ufs-inspired inode and directory allocation by
  10 *  Stephen Tweedie (sct@redhat.com), 1993
  11 *  Big-endian to little-endian byte-swapping/bitmaps by
  12 *        David S. Miller (davem@caip.rutgers.edu), 1995
  13 */
  14
  15#include <linux/time.h>
  16#include <linux/fs.h>
  17#include <linux/jbd2.h>
  18#include <linux/stat.h>
  19#include <linux/string.h>
  20#include <linux/quotaops.h>
  21#include <linux/buffer_head.h>
  22#include <linux/random.h>
  23#include <linux/bitops.h>
  24#include <linux/blkdev.h>
  25#include <asm/byteorder.h>
  26
  27#include "ext4.h"
  28#include "ext4_jbd2.h"
  29#include "xattr.h"
  30#include "acl.h"
  31
  32#include <trace/events/ext4.h>
  33
  34/*
  35 * ialloc.c contains the inodes allocation and deallocation routines
  36 */
  37
  38/*
  39 * The free inodes are managed by bitmaps.  A file system contains several
  40 * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
  41 * block for inodes, N blocks for the inode table and data blocks.
  42 *
  43 * The file system contains group descriptors which are located after the
  44 * super block.  Each descriptor contains the number of the bitmap block and
  45 * the free blocks count in the block.
  46 */
  47
  48/*
  49 * To avoid calling the atomic setbit hundreds or thousands of times, we only
  50 * need to use it within a single byte (to ensure we get endianness right).
  51 * We can use memset for the rest of the bitmap as there are no other users.
  52 */
  53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
  54{
  55        int i;
  56
  57        if (start_bit >= end_bit)
  58                return;
  59
  60        ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
  61        for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
  62                ext4_set_bit(i, bitmap);
  63        if (i < end_bit)
  64                memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
  65}
  66
  67/* Initializes an uninitialized inode bitmap */
  68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
  69                                ext4_group_t block_group,
  70                                struct ext4_group_desc *gdp)
  71{
  72        struct ext4_sb_info *sbi = EXT4_SB(sb);
  73
  74        J_ASSERT_BH(bh, buffer_locked(bh));
  75
  76        /* If checksum is bad mark all blocks and inodes use to prevent
  77         * allocation, essentially implementing a per-group read-only flag. */
  78        if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
  79                ext4_error(sb, "Checksum bad for group %u", block_group);
  80                ext4_free_blks_set(sb, gdp, 0);
  81                ext4_free_inodes_set(sb, gdp, 0);
  82                ext4_itable_unused_set(sb, gdp, 0);
  83                memset(bh->b_data, 0xff, sb->s_blocksize);
  84                return 0;
  85        }
  86
  87        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
  88        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
  89                        bh->b_data);
  90
  91        return EXT4_INODES_PER_GROUP(sb);
  92}
  93
  94/*
  95 * Read the inode allocation bitmap for a given block_group, reading
  96 * into the specified slot in the superblock's bitmap cache.
  97 *
  98 * Return buffer_head of bitmap on success or NULL.
  99 */
 100static struct buffer_head *
 101ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 102{
 103        struct ext4_group_desc *desc;
 104        struct buffer_head *bh = NULL;
 105        ext4_fsblk_t bitmap_blk;
 106
 107        desc = ext4_get_group_desc(sb, block_group, NULL);
 108        if (!desc)
 109                return NULL;
 110        bitmap_blk = ext4_inode_bitmap(sb, desc);
 111        bh = sb_getblk(sb, bitmap_blk);
 112        if (unlikely(!bh)) {
 113                ext4_error(sb, "Cannot read inode bitmap - "
 114                            "block_group = %u, inode_bitmap = %llu",
 115                            block_group, bitmap_blk);
 116                return NULL;
 117        }
 118        if (bitmap_uptodate(bh))
 119                return bh;
 120
 121        lock_buffer(bh);
 122        if (bitmap_uptodate(bh)) {
 123                unlock_buffer(bh);
 124                return bh;
 125        }
 126        ext4_lock_group(sb, block_group);
 127        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 128                ext4_init_inode_bitmap(sb, bh, block_group, desc);
 129                set_bitmap_uptodate(bh);
 130                set_buffer_uptodate(bh);
 131                ext4_unlock_group(sb, block_group);
 132                unlock_buffer(bh);
 133                return bh;
 134        }
 135        ext4_unlock_group(sb, block_group);
 136        if (buffer_uptodate(bh)) {
 137                /*
 138                 * if not uninit if bh is uptodate,
 139                 * bitmap is also uptodate
 140                 */
 141                set_bitmap_uptodate(bh);
 142                unlock_buffer(bh);
 143                return bh;
 144        }
 145        /*
 146         * submit the buffer_head for read. We can
 147         * safely mark the bitmap as uptodate now.
 148         * We do it here so the bitmap uptodate bit
 149         * get set with buffer lock held.
 150         */
 151        set_bitmap_uptodate(bh);
 152        if (bh_submit_read(bh) < 0) {
 153                put_bh(bh);
 154                ext4_error(sb, "Cannot read inode bitmap - "
 155                            "block_group = %u, inode_bitmap = %llu",
 156                            block_group, bitmap_blk);
 157                return NULL;
 158        }
 159        return bh;
 160}
 161
 162/*
 163 * NOTE! When we get the inode, we're the only people
 164 * that have access to it, and as such there are no
 165 * race conditions we have to worry about. The inode
 166 * is not on the hash-lists, and it cannot be reached
 167 * through the filesystem because the directory entry
 168 * has been deleted earlier.
 169 *
 170 * HOWEVER: we must make sure that we get no aliases,
 171 * which means that we have to call "clear_inode()"
 172 * _before_ we mark the inode not in use in the inode
 173 * bitmaps. Otherwise a newly created file might use
 174 * the same inode number (not actually the same pointer
 175 * though), and then we'd have two inodes sharing the
 176 * same inode number and space on the harddisk.
 177 */
 178void ext4_free_inode(handle_t *handle, struct inode *inode)
 179{
 180        struct super_block *sb = inode->i_sb;
 181        int is_directory;
 182        unsigned long ino;
 183        struct buffer_head *bitmap_bh = NULL;
 184        struct buffer_head *bh2;
 185        ext4_group_t block_group;
 186        unsigned long bit;
 187        struct ext4_group_desc *gdp;
 188        struct ext4_super_block *es;
 189        struct ext4_sb_info *sbi;
 190        int fatal = 0, err, count, cleared;
 191
 192        if (atomic_read(&inode->i_count) > 1) {
 193                printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
 194                       atomic_read(&inode->i_count));
 195                return;
 196        }
 197        if (inode->i_nlink) {
 198                printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
 199                       inode->i_nlink);
 200                return;
 201        }
 202        if (!sb) {
 203                printk(KERN_ERR "ext4_free_inode: inode on "
 204                       "nonexistent device\n");
 205                return;
 206        }
 207        sbi = EXT4_SB(sb);
 208
 209        ino = inode->i_ino;
 210        ext4_debug("freeing inode %lu\n", ino);
 211        trace_ext4_free_inode(inode);
 212
 213        /*
 214         * Note: we must free any quota before locking the superblock,
 215         * as writing the quota to disk may need the lock as well.
 216         */
 217        dquot_initialize(inode);
 218        ext4_xattr_delete_inode(handle, inode);
 219        dquot_free_inode(inode);
 220        dquot_drop(inode);
 221
 222        is_directory = S_ISDIR(inode->i_mode);
 223
 224        /* Do this BEFORE marking the inode not in use or returning an error */
 225        clear_inode(inode);
 226
 227        es = EXT4_SB(sb)->s_es;
 228        if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
 229                ext4_error(sb, "reserved or nonexistent inode %lu", ino);
 230                goto error_return;
 231        }
 232        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 233        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 234        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 235        if (!bitmap_bh)
 236                goto error_return;
 237
 238        BUFFER_TRACE(bitmap_bh, "get_write_access");
 239        fatal = ext4_journal_get_write_access(handle, bitmap_bh);
 240        if (fatal)
 241                goto error_return;
 242
 243        fatal = -ESRCH;
 244        gdp = ext4_get_group_desc(sb, block_group, &bh2);
 245        if (gdp) {
 246                BUFFER_TRACE(bh2, "get_write_access");
 247                fatal = ext4_journal_get_write_access(handle, bh2);
 248        }
 249        ext4_lock_group(sb, block_group);
 250        cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
 251        if (fatal || !cleared) {
 252                ext4_unlock_group(sb, block_group);
 253                goto out;
 254        }
 255
 256        count = ext4_free_inodes_count(sb, gdp) + 1;
 257        ext4_free_inodes_set(sb, gdp, count);
 258        if (is_directory) {
 259                count = ext4_used_dirs_count(sb, gdp) - 1;
 260                ext4_used_dirs_set(sb, gdp, count);
 261                percpu_counter_dec(&sbi->s_dirs_counter);
 262        }
 263        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
 264        ext4_unlock_group(sb, block_group);
 265
 266        percpu_counter_inc(&sbi->s_freeinodes_counter);
 267        if (sbi->s_log_groups_per_flex) {
 268                ext4_group_t f = ext4_flex_group(sbi, block_group);
 269
 270                atomic_inc(&sbi->s_flex_groups[f].free_inodes);
 271                if (is_directory)
 272                        atomic_dec(&sbi->s_flex_groups[f].used_dirs);
 273        }
 274        BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
 275        fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
 276out:
 277        if (cleared) {
 278                BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
 279                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 280                if (!fatal)
 281                        fatal = err;
 282                sb->s_dirt = 1;
 283        } else
 284                ext4_error(sb, "bit already cleared for inode %lu", ino);
 285
 286error_return:
 287        brelse(bitmap_bh);
 288        ext4_std_error(sb, fatal);
 289}
 290
 291/*
 292 * There are two policies for allocating an inode.  If the new inode is
 293 * a directory, then a forward search is made for a block group with both
 294 * free space and a low directory-to-inode ratio; if that fails, then of
 295 * the groups with above-average free space, that group with the fewest
 296 * directories already is chosen.
 297 *
 298 * For other inodes, search forward from the parent directory\'s block
 299 * group to find a free inode.
 300 */
 301static int find_group_dir(struct super_block *sb, struct inode *parent,
 302                                ext4_group_t *best_group)
 303{
 304        ext4_group_t ngroups = ext4_get_groups_count(sb);
 305        unsigned int freei, avefreei;
 306        struct ext4_group_desc *desc, *best_desc = NULL;
 307        ext4_group_t group;
 308        int ret = -1;
 309
 310        freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
 311        avefreei = freei / ngroups;
 312
 313        for (group = 0; group < ngroups; group++) {
 314                desc = ext4_get_group_desc(sb, group, NULL);
 315                if (!desc || !ext4_free_inodes_count(sb, desc))
 316                        continue;
 317                if (ext4_free_inodes_count(sb, desc) < avefreei)
 318                        continue;
 319                if (!best_desc ||
 320                    (ext4_free_blks_count(sb, desc) >
 321                     ext4_free_blks_count(sb, best_desc))) {
 322                        *best_group = group;
 323                        best_desc = desc;
 324                        ret = 0;
 325                }
 326        }
 327        return ret;
 328}
 329
 330#define free_block_ratio 10
 331
 332static int find_group_flex(struct super_block *sb, struct inode *parent,
 333                           ext4_group_t *best_group)
 334{
 335        struct ext4_sb_info *sbi = EXT4_SB(sb);
 336        struct ext4_group_desc *desc;
 337        struct flex_groups *flex_group = sbi->s_flex_groups;
 338        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 339        ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
 340        ext4_group_t ngroups = ext4_get_groups_count(sb);
 341        int flex_size = ext4_flex_bg_size(sbi);
 342        ext4_group_t best_flex = parent_fbg_group;
 343        int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
 344        int flexbg_free_blocks;
 345        int flex_freeb_ratio;
 346        ext4_group_t n_fbg_groups;
 347        ext4_group_t i;
 348
 349        n_fbg_groups = (ngroups + flex_size - 1) >>
 350                sbi->s_log_groups_per_flex;
 351
 352find_close_to_parent:
 353        flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
 354        flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
 355        if (atomic_read(&flex_group[best_flex].free_inodes) &&
 356            flex_freeb_ratio > free_block_ratio)
 357                goto found_flexbg;
 358
 359        if (best_flex && best_flex == parent_fbg_group) {
 360                best_flex--;
 361                goto find_close_to_parent;
 362        }
 363
 364        for (i = 0; i < n_fbg_groups; i++) {
 365                if (i == parent_fbg_group || i == parent_fbg_group - 1)
 366                        continue;
 367
 368                flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
 369                flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
 370
 371                if (flex_freeb_ratio > free_block_ratio &&
 372                    (atomic_read(&flex_group[i].free_inodes))) {
 373                        best_flex = i;
 374                        goto found_flexbg;
 375                }
 376
 377                if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
 378                    ((atomic_read(&flex_group[i].free_blocks) >
 379                      atomic_read(&flex_group[best_flex].free_blocks)) &&
 380                     atomic_read(&flex_group[i].free_inodes)))
 381                        best_flex = i;
 382        }
 383
 384        if (!atomic_read(&flex_group[best_flex].free_inodes) ||
 385            !atomic_read(&flex_group[best_flex].free_blocks))
 386                return -1;
 387
 388found_flexbg:
 389        for (i = best_flex * flex_size; i < ngroups &&
 390                     i < (best_flex + 1) * flex_size; i++) {
 391                desc = ext4_get_group_desc(sb, i, NULL);
 392                if (ext4_free_inodes_count(sb, desc)) {
 393                        *best_group = i;
 394                        goto out;
 395                }
 396        }
 397
 398        return -1;
 399out:
 400        return 0;
 401}
 402
 403struct orlov_stats {
 404        __u32 free_inodes;
 405        __u32 free_blocks;
 406        __u32 used_dirs;
 407};
 408
 409/*
 410 * Helper function for Orlov's allocator; returns critical information
 411 * for a particular block group or flex_bg.  If flex_size is 1, then g
 412 * is a block group number; otherwise it is flex_bg number.
 413 */
 414void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 415                       int flex_size, struct orlov_stats *stats)
 416{
 417        struct ext4_group_desc *desc;
 418        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
 419
 420        if (flex_size > 1) {
 421                stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
 422                stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
 423                stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
 424                return;
 425        }
 426
 427        desc = ext4_get_group_desc(sb, g, NULL);
 428        if (desc) {
 429                stats->free_inodes = ext4_free_inodes_count(sb, desc);
 430                stats->free_blocks = ext4_free_blks_count(sb, desc);
 431                stats->used_dirs = ext4_used_dirs_count(sb, desc);
 432        } else {
 433                stats->free_inodes = 0;
 434                stats->free_blocks = 0;
 435                stats->used_dirs = 0;
 436        }
 437}
 438
 439/*
 440 * Orlov's allocator for directories.
 441 *
 442 * We always try to spread first-level directories.
 443 *
 444 * If there are blockgroups with both free inodes and free blocks counts
 445 * not worse than average we return one with smallest directory count.
 446 * Otherwise we simply return a random group.
 447 *
 448 * For the rest rules look so:
 449 *
 450 * It's OK to put directory into a group unless
 451 * it has too many directories already (max_dirs) or
 452 * it has too few free inodes left (min_inodes) or
 453 * it has too few free blocks left (min_blocks) or
 454 * Parent's group is preferred, if it doesn't satisfy these
 455 * conditions we search cyclically through the rest. If none
 456 * of the groups look good we just look for a group with more
 457 * free inodes than average (starting at parent's group).
 458 */
 459
 460static int find_group_orlov(struct super_block *sb, struct inode *parent,
 461                            ext4_group_t *group, int mode,
 462                            const struct qstr *qstr)
 463{
 464        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 465        struct ext4_sb_info *sbi = EXT4_SB(sb);
 466        ext4_group_t real_ngroups = ext4_get_groups_count(sb);
 467        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 468        unsigned int freei, avefreei;
 469        ext4_fsblk_t freeb, avefreeb;
 470        unsigned int ndirs;
 471        int max_dirs, min_inodes;
 472        ext4_grpblk_t min_blocks;
 473        ext4_group_t i, grp, g, ngroups;
 474        struct ext4_group_desc *desc;
 475        struct orlov_stats stats;
 476        int flex_size = ext4_flex_bg_size(sbi);
 477        struct dx_hash_info hinfo;
 478
 479        ngroups = real_ngroups;
 480        if (flex_size > 1) {
 481                ngroups = (real_ngroups + flex_size - 1) >>
 482                        sbi->s_log_groups_per_flex;
 483                parent_group >>= sbi->s_log_groups_per_flex;
 484        }
 485
 486        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 487        avefreei = freei / ngroups;
 488        freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 489        avefreeb = freeb;
 490        do_div(avefreeb, ngroups);
 491        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 492
 493        if (S_ISDIR(mode) &&
 494            ((parent == sb->s_root->d_inode) ||
 495             (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
 496                int best_ndir = inodes_per_group;
 497                int ret = -1;
 498
 499                if (qstr) {
 500                        hinfo.hash_version = DX_HASH_HALF_MD4;
 501                        hinfo.seed = sbi->s_hash_seed;
 502                        ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
 503                        grp = hinfo.hash;
 504                } else
 505                        get_random_bytes(&grp, sizeof(grp));
 506                parent_group = (unsigned)grp % ngroups;
 507                for (i = 0; i < ngroups; i++) {
 508                        g = (parent_group + i) % ngroups;
 509                        get_orlov_stats(sb, g, flex_size, &stats);
 510                        if (!stats.free_inodes)
 511                                continue;
 512                        if (stats.used_dirs >= best_ndir)
 513                                continue;
 514                        if (stats.free_inodes < avefreei)
 515                                continue;
 516                        if (stats.free_blocks < avefreeb)
 517                                continue;
 518                        grp = g;
 519                        ret = 0;
 520                        best_ndir = stats.used_dirs;
 521                }
 522                if (ret)
 523                        goto fallback;
 524        found_flex_bg:
 525                if (flex_size == 1) {
 526                        *group = grp;
 527                        return 0;
 528                }
 529
 530                /*
 531                 * We pack inodes at the beginning of the flexgroup's
 532                 * inode tables.  Block allocation decisions will do
 533                 * something similar, although regular files will
 534                 * start at 2nd block group of the flexgroup.  See
 535                 * ext4_ext_find_goal() and ext4_find_near().
 536                 */
 537                grp *= flex_size;
 538                for (i = 0; i < flex_size; i++) {
 539                        if (grp+i >= real_ngroups)
 540                                break;
 541                        desc = ext4_get_group_desc(sb, grp+i, NULL);
 542                        if (desc && ext4_free_inodes_count(sb, desc)) {
 543                                *group = grp+i;
 544                                return 0;
 545                        }
 546                }
 547                goto fallback;
 548        }
 549
 550        max_dirs = ndirs / ngroups + inodes_per_group / 16;
 551        min_inodes = avefreei - inodes_per_group*flex_size / 4;
 552        if (min_inodes < 1)
 553                min_inodes = 1;
 554        min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
 555
 556        /*
 557         * Start looking in the flex group where we last allocated an
 558         * inode for this parent directory
 559         */
 560        if (EXT4_I(parent)->i_last_alloc_group != ~0) {
 561                parent_group = EXT4_I(parent)->i_last_alloc_group;
 562                if (flex_size > 1)
 563                        parent_group >>= sbi->s_log_groups_per_flex;
 564        }
 565
 566        for (i = 0; i < ngroups; i++) {
 567                grp = (parent_group + i) % ngroups;
 568                get_orlov_stats(sb, grp, flex_size, &stats);
 569                if (stats.used_dirs >= max_dirs)
 570                        continue;
 571                if (stats.free_inodes < min_inodes)
 572                        continue;
 573                if (stats.free_blocks < min_blocks)
 574                        continue;
 575                goto found_flex_bg;
 576        }
 577
 578fallback:
 579        ngroups = real_ngroups;
 580        avefreei = freei / ngroups;
 581fallback_retry:
 582        parent_group = EXT4_I(parent)->i_block_group;
 583        for (i = 0; i < ngroups; i++) {
 584                grp = (parent_group + i) % ngroups;
 585                desc = ext4_get_group_desc(sb, grp, NULL);
 586                if (desc && ext4_free_inodes_count(sb, desc) &&
 587                    ext4_free_inodes_count(sb, desc) >= avefreei) {
 588                        *group = grp;
 589                        return 0;
 590                }
 591        }
 592
 593        if (avefreei) {
 594                /*
 595                 * The free-inodes counter is approximate, and for really small
 596                 * filesystems the above test can fail to find any blockgroups
 597                 */
 598                avefreei = 0;
 599                goto fallback_retry;
 600        }
 601
 602        return -1;
 603}
 604
 605static int find_group_other(struct super_block *sb, struct inode *parent,
 606                            ext4_group_t *group, int mode)
 607{
 608        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 609        ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
 610        struct ext4_group_desc *desc;
 611        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 612
 613        /*
 614         * Try to place the inode is the same flex group as its
 615         * parent.  If we can't find space, use the Orlov algorithm to
 616         * find another flex group, and store that information in the
 617         * parent directory's inode information so that use that flex
 618         * group for future allocations.
 619         */
 620        if (flex_size > 1) {
 621                int retry = 0;
 622
 623        try_again:
 624                parent_group &= ~(flex_size-1);
 625                last = parent_group + flex_size;
 626                if (last > ngroups)
 627                        last = ngroups;
 628                for  (i = parent_group; i < last; i++) {
 629                        desc = ext4_get_group_desc(sb, i, NULL);
 630                        if (desc && ext4_free_inodes_count(sb, desc)) {
 631                                *group = i;
 632                                return 0;
 633                        }
 634                }
 635                if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
 636                        retry = 1;
 637                        parent_group = EXT4_I(parent)->i_last_alloc_group;
 638                        goto try_again;
 639                }
 640                /*
 641                 * If this didn't work, use the Orlov search algorithm
 642                 * to find a new flex group; we pass in the mode to
 643                 * avoid the topdir algorithms.
 644                 */
 645                *group = parent_group + flex_size;
 646                if (*group > ngroups)
 647                        *group = 0;
 648                return find_group_orlov(sb, parent, group, mode, 0);
 649        }
 650
 651        /*
 652         * Try to place the inode in its parent directory
 653         */
 654        *group = parent_group;
 655        desc = ext4_get_group_desc(sb, *group, NULL);
 656        if (desc && ext4_free_inodes_count(sb, desc) &&
 657                        ext4_free_blks_count(sb, desc))
 658                return 0;
 659
 660        /*
 661         * We're going to place this inode in a different blockgroup from its
 662         * parent.  We want to cause files in a common directory to all land in
 663         * the same blockgroup.  But we want files which are in a different
 664         * directory which shares a blockgroup with our parent to land in a
 665         * different blockgroup.
 666         *
 667         * So add our directory's i_ino into the starting point for the hash.
 668         */
 669        *group = (*group + parent->i_ino) % ngroups;
 670
 671        /*
 672         * Use a quadratic hash to find a group with a free inode and some free
 673         * blocks.
 674         */
 675        for (i = 1; i < ngroups; i <<= 1) {
 676                *group += i;
 677                if (*group >= ngroups)
 678                        *group -= ngroups;
 679                desc = ext4_get_group_desc(sb, *group, NULL);
 680                if (desc && ext4_free_inodes_count(sb, desc) &&
 681                                ext4_free_blks_count(sb, desc))
 682                        return 0;
 683        }
 684
 685        /*
 686         * That failed: try linear search for a free inode, even if that group
 687         * has no free blocks.
 688         */
 689        *group = parent_group;
 690        for (i = 0; i < ngroups; i++) {
 691                if (++*group >= ngroups)
 692                        *group = 0;
 693                desc = ext4_get_group_desc(sb, *group, NULL);
 694                if (desc && ext4_free_inodes_count(sb, desc))
 695                        return 0;
 696        }
 697
 698        return -1;
 699}
 700
 701/*
 702 * claim the inode from the inode bitmap. If the group
 703 * is uninit we need to take the groups's ext4_group_lock
 704 * and clear the uninit flag. The inode bitmap update
 705 * and group desc uninit flag clear should be done
 706 * after holding ext4_group_lock so that ext4_read_inode_bitmap
 707 * doesn't race with the ext4_claim_inode
 708 */
 709static int ext4_claim_inode(struct super_block *sb,
 710                        struct buffer_head *inode_bitmap_bh,
 711                        unsigned long ino, ext4_group_t group, int mode)
 712{
 713        int free = 0, retval = 0, count;
 714        struct ext4_sb_info *sbi = EXT4_SB(sb);
 715        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 716
 717        ext4_lock_group(sb, group);
 718        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
 719                /* not a free inode */
 720                retval = 1;
 721                goto err_ret;
 722        }
 723        ino++;
 724        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 725                        ino > EXT4_INODES_PER_GROUP(sb)) {
 726                ext4_unlock_group(sb, group);
 727                ext4_error(sb, "reserved inode or inode > inodes count - "
 728                           "block_group = %u, inode=%lu", group,
 729                           ino + group * EXT4_INODES_PER_GROUP(sb));
 730                return 1;
 731        }
 732        /* If we didn't allocate from within the initialized part of the inode
 733         * table then we need to initialize up to this inode. */
 734        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
 735
 736                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 737                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
 738                        /* When marking the block group with
 739                         * ~EXT4_BG_INODE_UNINIT we don't want to depend
 740                         * on the value of bg_itable_unused even though
 741                         * mke2fs could have initialized the same for us.
 742                         * Instead we calculated the value below
 743                         */
 744
 745                        free = 0;
 746                } else {
 747                        free = EXT4_INODES_PER_GROUP(sb) -
 748                                ext4_itable_unused_count(sb, gdp);
 749                }
 750
 751                /*
 752                 * Check the relative inode number against the last used
 753                 * relative inode number in this group. if it is greater
 754                 * we need to  update the bg_itable_unused count
 755                 *
 756                 */
 757                if (ino > free)
 758                        ext4_itable_unused_set(sb, gdp,
 759                                        (EXT4_INODES_PER_GROUP(sb) - ino));
 760        }
 761        count = ext4_free_inodes_count(sb, gdp) - 1;
 762        ext4_free_inodes_set(sb, gdp, count);
 763        if (S_ISDIR(mode)) {
 764                count = ext4_used_dirs_count(sb, gdp) + 1;
 765                ext4_used_dirs_set(sb, gdp, count);
 766                if (sbi->s_log_groups_per_flex) {
 767                        ext4_group_t f = ext4_flex_group(sbi, group);
 768
 769                        atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 770                }
 771        }
 772        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 773err_ret:
 774        ext4_unlock_group(sb, group);
 775        return retval;
 776}
 777
 778/*
 779 * There are two policies for allocating an inode.  If the new inode is
 780 * a directory, then a forward search is made for a block group with both
 781 * free space and a low directory-to-inode ratio; if that fails, then of
 782 * the groups with above-average free space, that group with the fewest
 783 * directories already is chosen.
 784 *
 785 * For other inodes, search forward from the parent directory's block
 786 * group to find a free inode.
 787 */
 788struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
 789                             const struct qstr *qstr, __u32 goal)
 790{
 791        struct super_block *sb;
 792        struct buffer_head *inode_bitmap_bh = NULL;
 793        struct buffer_head *group_desc_bh;
 794        ext4_group_t ngroups, group = 0;
 795        unsigned long ino = 0;
 796        struct inode *inode;
 797        struct ext4_group_desc *gdp = NULL;
 798        struct ext4_inode_info *ei;
 799        struct ext4_sb_info *sbi;
 800        int ret2, err = 0;
 801        struct inode *ret;
 802        ext4_group_t i;
 803        int free = 0;
 804        static int once = 1;
 805        ext4_group_t flex_group;
 806
 807        /* Cannot create files in a deleted directory */
 808        if (!dir || !dir->i_nlink)
 809                return ERR_PTR(-EPERM);
 810
 811        sb = dir->i_sb;
 812        ngroups = ext4_get_groups_count(sb);
 813        trace_ext4_request_inode(dir, mode);
 814        inode = new_inode(sb);
 815        if (!inode)
 816                return ERR_PTR(-ENOMEM);
 817        ei = EXT4_I(inode);
 818        sbi = EXT4_SB(sb);
 819
 820        if (!goal)
 821                goal = sbi->s_inode_goal;
 822
 823        if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
 824                group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
 825                ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
 826                ret2 = 0;
 827                goto got_group;
 828        }
 829
 830        if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 831                ret2 = find_group_flex(sb, dir, &group);
 832                if (ret2 == -1) {
 833                        ret2 = find_group_other(sb, dir, &group, mode);
 834                        if (ret2 == 0 && once) {
 835                                once = 0;
 836                                printk(KERN_NOTICE "ext4: find_group_flex "
 837                                       "failed, fallback succeeded dir %lu\n",
 838                                       dir->i_ino);
 839                        }
 840                }
 841                goto got_group;
 842        }
 843
 844        if (S_ISDIR(mode)) {
 845                if (test_opt(sb, OLDALLOC))
 846                        ret2 = find_group_dir(sb, dir, &group);
 847                else
 848                        ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
 849        } else
 850                ret2 = find_group_other(sb, dir, &group, mode);
 851
 852got_group:
 853        EXT4_I(dir)->i_last_alloc_group = group;
 854        err = -ENOSPC;
 855        if (ret2 == -1)
 856                goto out;
 857
 858        for (i = 0; i < ngroups; i++, ino = 0) {
 859                err = -EIO;
 860
 861                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 862                if (!gdp)
 863                        goto fail;
 864
 865                brelse(inode_bitmap_bh);
 866                inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
 867                if (!inode_bitmap_bh)
 868                        goto fail;
 869
 870repeat_in_this_group:
 871                ino = ext4_find_next_zero_bit((unsigned long *)
 872                                              inode_bitmap_bh->b_data,
 873                                              EXT4_INODES_PER_GROUP(sb), ino);
 874
 875                if (ino < EXT4_INODES_PER_GROUP(sb)) {
 876
 877                        BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
 878                        err = ext4_journal_get_write_access(handle,
 879                                                            inode_bitmap_bh);
 880                        if (err)
 881                                goto fail;
 882
 883                        BUFFER_TRACE(group_desc_bh, "get_write_access");
 884                        err = ext4_journal_get_write_access(handle,
 885                                                                group_desc_bh);
 886                        if (err)
 887                                goto fail;
 888                        if (!ext4_claim_inode(sb, inode_bitmap_bh,
 889                                                ino, group, mode)) {
 890                                /* we won it */
 891                                BUFFER_TRACE(inode_bitmap_bh,
 892                                        "call ext4_handle_dirty_metadata");
 893                                err = ext4_handle_dirty_metadata(handle,
 894                                                                 NULL,
 895                                                        inode_bitmap_bh);
 896                                if (err)
 897                                        goto fail;
 898                                /* zero bit is inode number 1*/
 899                                ino++;
 900                                goto got;
 901                        }
 902                        /* we lost it */
 903                        ext4_handle_release_buffer(handle, inode_bitmap_bh);
 904                        ext4_handle_release_buffer(handle, group_desc_bh);
 905
 906                        if (++ino < EXT4_INODES_PER_GROUP(sb))
 907                                goto repeat_in_this_group;
 908                }
 909
 910                /*
 911                 * This case is possible in concurrent environment.  It is very
 912                 * rare.  We cannot repeat the find_group_xxx() call because
 913                 * that will simply return the same blockgroup, because the
 914                 * group descriptor metadata has not yet been updated.
 915                 * So we just go onto the next blockgroup.
 916                 */
 917                if (++group == ngroups)
 918                        group = 0;
 919        }
 920        err = -ENOSPC;
 921        goto out;
 922
 923got:
 924        /* We may have to initialize the block bitmap if it isn't already */
 925        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
 926            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 927                struct buffer_head *block_bitmap_bh;
 928
 929                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
 930                BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
 931                err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 932                if (err) {
 933                        brelse(block_bitmap_bh);
 934                        goto fail;
 935                }
 936
 937                free = 0;
 938                ext4_lock_group(sb, group);
 939                /* recheck and clear flag under lock if we still need to */
 940                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 941                        free = ext4_free_blocks_after_init(sb, group, gdp);
 942                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 943                        ext4_free_blks_set(sb, gdp, free);
 944                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
 945                                                                gdp);
 946                }
 947                ext4_unlock_group(sb, group);
 948
 949                /* Don't need to dirty bitmap block if we didn't change it */
 950                if (free) {
 951                        BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
 952                        err = ext4_handle_dirty_metadata(handle,
 953                                                        NULL, block_bitmap_bh);
 954                }
 955
 956                brelse(block_bitmap_bh);
 957                if (err)
 958                        goto fail;
 959        }
 960        BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 961        err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 962        if (err)
 963                goto fail;
 964
 965        percpu_counter_dec(&sbi->s_freeinodes_counter);
 966        if (S_ISDIR(mode))
 967                percpu_counter_inc(&sbi->s_dirs_counter);
 968        sb->s_dirt = 1;
 969
 970        if (sbi->s_log_groups_per_flex) {
 971                flex_group = ext4_flex_group(sbi, group);
 972                atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 973        }
 974
 975        if (test_opt(sb, GRPID)) {
 976                inode->i_mode = mode;
 977                inode->i_uid = current_fsuid();
 978                inode->i_gid = dir->i_gid;
 979        } else
 980                inode_init_owner(inode, dir, mode);
 981
 982        inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 983        /* This is the optimal IO size (for stat), not the fs block size */
 984        inode->i_blocks = 0;
 985        inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
 986                                                       ext4_current_time(inode);
 987
 988        memset(ei->i_data, 0, sizeof(ei->i_data));
 989        ei->i_dir_start_lookup = 0;
 990        ei->i_disksize = 0;
 991
 992        /*
 993         * Don't inherit extent flag from directory, amongst others. We set
 994         * extent flag on newly created directory and file only if -o extent
 995         * mount option is specified
 996         */
 997        ei->i_flags =
 998                ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
 999        ei->i_file_acl = 0;
1000        ei->i_dtime = 0;
1001        ei->i_block_group = group;
1002        ei->i_last_alloc_group = ~0;
1003
1004        ext4_set_inode_flags(inode);
1005        if (IS_DIRSYNC(inode))
1006                ext4_handle_sync(handle);
1007        if (insert_inode_locked(inode) < 0) {
1008                err = -EINVAL;
1009                goto fail_drop;
1010        }
1011        spin_lock(&sbi->s_next_gen_lock);
1012        inode->i_generation = sbi->s_next_generation++;
1013        spin_unlock(&sbi->s_next_gen_lock);
1014
1015        ei->i_state_flags = 0;
1016        ext4_set_inode_state(inode, EXT4_STATE_NEW);
1017
1018        ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1019
1020        ret = inode;
1021        dquot_initialize(inode);
1022        err = dquot_alloc_inode(inode);
1023        if (err)
1024                goto fail_drop;
1025
1026        err = ext4_init_acl(handle, inode, dir);
1027        if (err)
1028                goto fail_free_drop;
1029
1030        err = ext4_init_security(handle, inode, dir);
1031        if (err)
1032                goto fail_free_drop;
1033
1034        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1035                /* set extent flag only for directory, file and normal symlink*/
1036                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1037                        ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1038                        ext4_ext_tree_init(handle, inode);
1039                }
1040        }
1041
1042        err = ext4_mark_inode_dirty(handle, inode);
1043        if (err) {
1044                ext4_std_error(sb, err);
1045                goto fail_free_drop;
1046        }
1047
1048        ext4_debug("allocating inode %lu\n", inode->i_ino);
1049        trace_ext4_allocate_inode(inode, dir, mode);
1050        goto really_out;
1051fail:
1052        ext4_std_error(sb, err);
1053out:
1054        iput(inode);
1055        ret = ERR_PTR(err);
1056really_out:
1057        brelse(inode_bitmap_bh);
1058        return ret;
1059
1060fail_free_drop:
1061        dquot_free_inode(inode);
1062
1063fail_drop:
1064        dquot_drop(inode);
1065        inode->i_flags |= S_NOQUOTA;
1066        inode->i_nlink = 0;
1067        unlock_new_inode(inode);
1068        iput(inode);
1069        brelse(inode_bitmap_bh);
1070        return ERR_PTR(err);
1071}
1072
1073/* Verify that we are loading a valid orphan from disk */
1074struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1075{
1076        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
1077        ext4_group_t block_group;
1078        int bit;
1079        struct buffer_head *bitmap_bh;
1080        struct inode *inode = NULL;
1081        long err = -EIO;
1082
1083        /* Error cases - e2fsck has already cleaned up for us */
1084        if (ino > max_ino) {
1085                ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
1086                goto error;
1087        }
1088
1089        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
1090        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1091        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1092        if (!bitmap_bh) {
1093                ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1094                goto error;
1095        }
1096
1097        /* Having the inode bit set should be a 100% indicator that this
1098         * is a valid orphan (no e2fsck run on fs).  Orphans also include
1099         * inodes that were being truncated, so we can't check i_nlink==0.
1100         */
1101        if (!ext4_test_bit(bit, bitmap_bh->b_data))
1102                goto bad_orphan;
1103
1104        inode = ext4_iget(sb, ino);
1105        if (IS_ERR(inode))
1106                goto iget_failed;
1107
1108        /*
1109         * If the orphans has i_nlinks > 0 then it should be able to be
1110         * truncated, otherwise it won't be removed from the orphan list
1111         * during processing and an infinite loop will result.
1112         */
1113        if (inode->i_nlink && !ext4_can_truncate(inode))
1114                goto bad_orphan;
1115
1116        if (NEXT_ORPHAN(inode) > max_ino)
1117                goto bad_orphan;
1118        brelse(bitmap_bh);
1119        return inode;
1120
1121iget_failed:
1122        err = PTR_ERR(inode);
1123        inode = NULL;
1124bad_orphan:
1125        ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
1126        printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1127               bit, (unsigned long long)bitmap_bh->b_blocknr,
1128               ext4_test_bit(bit, bitmap_bh->b_data));
1129        printk(KERN_NOTICE "inode=%p\n", inode);
1130        if (inode) {
1131                printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
1132                       is_bad_inode(inode));
1133                printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
1134                       NEXT_ORPHAN(inode));
1135                printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
1136                printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
1137                /* Avoid freeing blocks if we got a bad deleted inode */
1138                if (inode->i_nlink == 0)
1139                        inode->i_blocks = 0;
1140                iput(inode);
1141        }
1142        brelse(bitmap_bh);
1143error:
1144        return ERR_PTR(err);
1145}
1146
1147unsigned long ext4_count_free_inodes(struct super_block *sb)
1148{
1149        unsigned long desc_count;
1150        struct ext4_group_desc *gdp;
1151        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1152#ifdef EXT4FS_DEBUG
1153        struct ext4_super_block *es;
1154        unsigned long bitmap_count, x;
1155        struct buffer_head *bitmap_bh = NULL;
1156
1157        es = EXT4_SB(sb)->s_es;
1158        desc_count = 0;
1159        bitmap_count = 0;
1160        gdp = NULL;
1161        for (i = 0; i < ngroups; i++) {
1162                gdp = ext4_get_group_desc(sb, i, NULL);
1163                if (!gdp)
1164                        continue;
1165                desc_count += ext4_free_inodes_count(sb, gdp);
1166                brelse(bitmap_bh);
1167                bitmap_bh = ext4_read_inode_bitmap(sb, i);
1168                if (!bitmap_bh)
1169                        continue;
1170
1171                x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
1172                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1173                        (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1174                bitmap_count += x;
1175        }
1176        brelse(bitmap_bh);
1177        printk(KERN_DEBUG "ext4_count_free_inodes: "
1178               "stored = %u, computed = %lu, %lu\n",
1179               le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
1180        return desc_count;
1181#else
1182        desc_count = 0;
1183        for (i = 0; i < ngroups; i++) {
1184                gdp = ext4_get_group_desc(sb, i, NULL);
1185                if (!gdp)
1186                        continue;
1187                desc_count += ext4_free_inodes_count(sb, gdp);
1188                cond_resched();
1189        }
1190        return desc_count;
1191#endif
1192}
1193
1194/* Called at mount-time, super-block is locked */
1195unsigned long ext4_count_dirs(struct super_block * sb)
1196{
1197        unsigned long count = 0;
1198        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1199
1200        for (i = 0; i < ngroups; i++) {
1201                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1202                if (!gdp)
1203                        continue;
1204                count += ext4_used_dirs_count(sb, gdp);
1205        }
1206        return count;
1207}
1208
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.