linux/fs/ext4/mballoc.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   4 * Written by Alex Tomas <alex@clusterfs.com>
   5 */
   6
   7
   8/*
   9 * mballoc.c contains the multiblocks allocation routines
  10 */
  11
  12#include "ext4_jbd2.h"
  13#include "mballoc.h"
  14#include <linux/log2.h>
  15#include <linux/module.h>
  16#include <linux/slab.h>
  17#include <linux/nospec.h>
  18#include <linux/backing-dev.h>
  19#include <trace/events/ext4.h>
  20
  21/*
  22 * MUSTDO:
  23 *   - test ext4_ext_search_left() and ext4_ext_search_right()
  24 *   - search for metadata in few groups
  25 *
  26 * TODO v4:
  27 *   - normalization should take into account whether file is still open
  28 *   - discard preallocations if no free space left (policy?)
  29 *   - don't normalize tails
  30 *   - quota
  31 *   - reservation for superuser
  32 *
  33 * TODO v3:
  34 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
  35 *   - track min/max extents in each group for better group selection
  36 *   - mb_mark_used() may allocate chunk right after splitting buddy
  37 *   - tree of groups sorted by number of free blocks
  38 *   - error handling
  39 */
  40
  41/*
  42 * The allocation request involve request for multiple number of blocks
  43 * near to the goal(block) value specified.
  44 *
  45 * During initialization phase of the allocator we decide to use the
  46 * group preallocation or inode preallocation depending on the size of
  47 * the file. The size of the file could be the resulting file size we
  48 * would have after allocation, or the current file size, which ever
  49 * is larger. If the size is less than sbi->s_mb_stream_request we
  50 * select to use the group preallocation. The default value of
  51 * s_mb_stream_request is 16 blocks. This can also be tuned via
  52 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  53 * terms of number of blocks.
  54 *
  55 * The main motivation for having small file use group preallocation is to
  56 * ensure that we have small files closer together on the disk.
  57 *
  58 * First stage the allocator looks at the inode prealloc list,
  59 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  60 * spaces for this particular inode. The inode prealloc space is
  61 * represented as:
  62 *
  63 * pa_lstart -> the logical start block for this prealloc space
  64 * pa_pstart -> the physical start block for this prealloc space
  65 * pa_len    -> length for this prealloc space (in clusters)
  66 * pa_free   ->  free space available in this prealloc space (in clusters)
  67 *
  68 * The inode preallocation space is used looking at the _logical_ start
  69 * block. If only the logical file block falls within the range of prealloc
  70 * space we will consume the particular prealloc space. This makes sure that
  71 * we have contiguous physical blocks representing the file blocks
  72 *
  73 * The important thing to be noted in case of inode prealloc space is that
  74 * we don't modify the values associated to inode prealloc space except
  75 * pa_free.
  76 *
  77 * If we are not able to find blocks in the inode prealloc space and if we
  78 * have the group allocation flag set then we look at the locality group
  79 * prealloc space. These are per CPU prealloc list represented as
  80 *
  81 * ext4_sb_info.s_locality_groups[smp_processor_id()]
  82 *
  83 * The reason for having a per cpu locality group is to reduce the contention
  84 * between CPUs. It is possible to get scheduled at this point.
  85 *
  86 * The locality group prealloc space is used looking at whether we have
  87 * enough free space (pa_free) within the prealloc space.
  88 *
  89 * If we can't allocate blocks via inode prealloc or/and locality group
  90 * prealloc then we look at the buddy cache. The buddy cache is represented
  91 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
  92 * mapped to the buddy and bitmap information regarding different
  93 * groups. The buddy information is attached to buddy cache inode so that
  94 * we can access them through the page cache. The information regarding
  95 * each group is loaded via ext4_mb_load_buddy.  The information involve
  96 * block bitmap and buddy information. The information are stored in the
  97 * inode as:
  98 *
  99 *  {                        page                        }
 100 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 101 *
 102 *
 103 * one block each for bitmap and buddy information.  So for each group we
 104 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 105 * blocksize) blocks.  So it can have information regarding groups_per_page
 106 * which is blocks_per_page/2
 107 *
 108 * The buddy cache inode is not stored on disk. The inode is thrown
 109 * away when the filesystem is unmounted.
 110 *
 111 * We look for count number of blocks in the buddy cache. If we were able
 112 * to locate that many free blocks we return with additional information
 113 * regarding rest of the contiguous physical block available
 114 *
 115 * Before allocating blocks via buddy cache we normalize the request
 116 * blocks. This ensure we ask for more blocks that we needed. The extra
 117 * blocks that we get after allocation is added to the respective prealloc
 118 * list. In case of inode preallocation we follow a list of heuristics
 119 * based on file size. This can be found in ext4_mb_normalize_request. If
 120 * we are doing a group prealloc we try to normalize the request to
 121 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 122 * dependent on the cluster size; for non-bigalloc file systems, it is
 123 * 512 blocks. This can be tuned via
 124 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 125 * terms of number of blocks. If we have mounted the file system with -O
 126 * stripe=<value> option the group prealloc request is normalized to the
 127 * smallest multiple of the stripe value (sbi->s_stripe) which is
 128 * greater than the default mb_group_prealloc.
 129 *
 130 * If "mb_optimize_scan" mount option is set, we maintain in memory group info
 131 * structures in two data structures:
 132 *
 133 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
 134 *
 135 *    Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
 136 *
 137 *    This is an array of lists where the index in the array represents the
 138 *    largest free order in the buddy bitmap of the participating group infos of
 139 *    that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
 140 *    number of buddy bitmap orders possible) number of lists. Group-infos are
 141 *    placed in appropriate lists.
 142 *
 143 * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
 144 *
 145 *    Locking: sbi->s_mb_rb_lock (rwlock)
 146 *
 147 *    This is a red black tree consisting of group infos and the tree is sorted
 148 *    by average fragment sizes (which is calculated as ext4_group_info->bb_free
 149 *    / ext4_group_info->bb_fragments).
 150 *
 151 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
 152 * structures to decide the order in which groups are to be traversed for
 153 * fulfilling an allocation request.
 154 *
 155 * At CR = 0, we look for groups which have the largest_free_order >= the order
 156 * of the request. We directly look at the largest free order list in the data
 157 * structure (1) above where largest_free_order = order of the request. If that
 158 * list is empty, we look at remaining list in the increasing order of
 159 * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
 160 *
 161 * At CR = 1, we only consider groups where average fragment size > request
 162 * size. So, we lookup a group which has average fragment size just above or
 163 * equal to request size using our rb tree (data structure 2) in O(log N) time.
 164 *
 165 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
 166 * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
 167 *
 168 * The regular allocator (using the buddy cache) supports a few tunables.
 169 *
 170 * /sys/fs/ext4/<partition>/mb_min_to_scan
 171 * /sys/fs/ext4/<partition>/mb_max_to_scan
 172 * /sys/fs/ext4/<partition>/mb_order2_req
 173 * /sys/fs/ext4/<partition>/mb_linear_limit
 174 *
 175 * The regular allocator uses buddy scan only if the request len is power of
 176 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 177 * value of s_mb_order2_reqs can be tuned via
 178 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 179 * stripe size (sbi->s_stripe), we try to search for contiguous block in
 180 * stripe size. This should result in better allocation on RAID setups. If
 181 * not, we search in the specific group using bitmap for best extents. The
 182 * tunable min_to_scan and max_to_scan control the behaviour here.
 183 * min_to_scan indicate how long the mballoc __must__ look for a best
 184 * extent and max_to_scan indicates how long the mballoc __can__ look for a
 185 * best extent in the found extents. Searching for the blocks starts with
 186 * the group specified as the goal value in allocation context via
 187 * ac_g_ex. Each group is first checked based on the criteria whether it
 188 * can be used for allocation. ext4_mb_good_group explains how the groups are
 189 * checked.
 190 *
 191 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
 192 * get traversed linearly. That may result in subsequent allocations being not
 193 * close to each other. And so, the underlying device may get filled up in a
 194 * non-linear fashion. While that may not matter on non-rotational devices, for
 195 * rotational devices that may result in higher seek times. "mb_linear_limit"
 196 * tells mballoc how many groups mballoc should search linearly before
 197 * performing consulting above data structures for more efficient lookups. For
 198 * non rotational devices, this value defaults to 0 and for rotational devices
 199 * this is set to MB_DEFAULT_LINEAR_LIMIT.
 200 *
 201 * Both the prealloc space are getting populated as above. So for the first
 202 * request we will hit the buddy cache which will result in this prealloc
 203 * space getting filled. The prealloc space is then later used for the
 204 * subsequent request.
 205 */
 206
 207/*
 208 * mballoc operates on the following data:
 209 *  - on-disk bitmap
 210 *  - in-core buddy (actually includes buddy and bitmap)
 211 *  - preallocation descriptors (PAs)
 212 *
 213 * there are two types of preallocations:
 214 *  - inode
 215 *    assiged to specific inode and can be used for this inode only.
 216 *    it describes part of inode's space preallocated to specific
 217 *    physical blocks. any block from that preallocated can be used
 218 *    independent. the descriptor just tracks number of blocks left
 219 *    unused. so, before taking some block from descriptor, one must
 220 *    make sure corresponded logical block isn't allocated yet. this
 221 *    also means that freeing any block within descriptor's range
 222 *    must discard all preallocated blocks.
 223 *  - locality group
 224 *    assigned to specific locality group which does not translate to
 225 *    permanent set of inodes: inode can join and leave group. space
 226 *    from this type of preallocation can be used for any inode. thus
 227 *    it's consumed from the beginning to the end.
 228 *
 229 * relation between them can be expressed as:
 230 *    in-core buddy = on-disk bitmap + preallocation descriptors
 231 *
 232 * this mean blocks mballoc considers used are:
 233 *  - allocated blocks (persistent)
 234 *  - preallocated blocks (non-persistent)
 235 *
 236 * consistency in mballoc world means that at any time a block is either
 237 * free or used in ALL structures. notice: "any time" should not be read
 238 * literally -- time is discrete and delimited by locks.
 239 *
 240 *  to keep it simple, we don't use block numbers, instead we count number of
 241 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 242 *
 243 * all operations can be expressed as:
 244 *  - init buddy:                       buddy = on-disk + PAs
 245 *  - new PA:                           buddy += N; PA = N
 246 *  - use inode PA:                     on-disk += N; PA -= N
 247 *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
 248 *  - use locality group PA             on-disk += N; PA -= N
 249 *  - discard locality group PA         buddy -= PA; PA = 0
 250 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 251 *        is used in real operation because we can't know actual used
 252 *        bits from PA, only from on-disk bitmap
 253 *
 254 * if we follow this strict logic, then all operations above should be atomic.
 255 * given some of them can block, we'd have to use something like semaphores
 256 * killing performance on high-end SMP hardware. let's try to relax it using
 257 * the following knowledge:
 258 *  1) if buddy is referenced, it's already initialized
 259 *  2) while block is used in buddy and the buddy is referenced,
 260 *     nobody can re-allocate that block
 261 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 262 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
 263 *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
 264 *     block
 265 *
 266 * so, now we're building a concurrency table:
 267 *  - init buddy vs.
 268 *    - new PA
 269 *      blocks for PA are allocated in the buddy, buddy must be referenced
 270 *      until PA is linked to allocation group to avoid concurrent buddy init
 271 *    - use inode PA
 272 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 273 *      given (3) we care that PA-=N operation doesn't interfere with init
 274 *    - discard inode PA
 275 *      the simplest way would be to have buddy initialized by the discard
 276 *    - use locality group PA
 277 *      again PA-=N must be serialized with init
 278 *    - discard locality group PA
 279 *      the simplest way would be to have buddy initialized by the discard
 280 *  - new PA vs.
 281 *    - use inode PA
 282 *      i_data_sem serializes them
 283 *    - discard inode PA
 284 *      discard process must wait until PA isn't used by another process
 285 *    - use locality group PA
 286 *      some mutex should serialize them
 287 *    - discard locality group PA
 288 *      discard process must wait until PA isn't used by another process
 289 *  - use inode PA
 290 *    - use inode PA
 291 *      i_data_sem or another mutex should serializes them
 292 *    - discard inode PA
 293 *      discard process must wait until PA isn't used by another process
 294 *    - use locality group PA
 295 *      nothing wrong here -- they're different PAs covering different blocks
 296 *    - discard locality group PA
 297 *      discard process must wait until PA isn't used by another process
 298 *
 299 * now we're ready to make few consequences:
 300 *  - PA is referenced and while it is no discard is possible
 301 *  - PA is referenced until block isn't marked in on-disk bitmap
 302 *  - PA changes only after on-disk bitmap
 303 *  - discard must not compete with init. either init is done before
 304 *    any discard or they're serialized somehow
 305 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 306 *
 307 * a special case when we've used PA to emptiness. no need to modify buddy
 308 * in this case, but we should care about concurrent init
 309 *
 310 */
 311
 312 /*
 313 * Logic in few words:
 314 *
 315 *  - allocation:
 316 *    load group
 317 *    find blocks
 318 *    mark bits in on-disk bitmap
 319 *    release group
 320 *
 321 *  - use preallocation:
 322 *    find proper PA (per-inode or group)
 323 *    load group
 324 *    mark bits in on-disk bitmap
 325 *    release group
 326 *    release PA
 327 *
 328 *  - free:
 329 *    load group
 330 *    mark bits in on-disk bitmap
 331 *    release group
 332 *
 333 *  - discard preallocations in group:
 334 *    mark PAs deleted
 335 *    move them onto local list
 336 *    load on-disk bitmap
 337 *    load group
 338 *    remove PA from object (inode or locality group)
 339 *    mark free blocks in-core
 340 *
 341 *  - discard inode's preallocations:
 342 */
 343
 344/*
 345 * Locking rules
 346 *
 347 * Locks:
 348 *  - bitlock on a group        (group)
 349 *  - object (inode/locality)   (object)
 350 *  - per-pa lock               (pa)
 351 *  - cr0 lists lock            (cr0)
 352 *  - cr1 tree lock             (cr1)
 353 *
 354 * Paths:
 355 *  - new pa
 356 *    object
 357 *    group
 358 *
 359 *  - find and use pa:
 360 *    pa
 361 *
 362 *  - release consumed pa:
 363 *    pa
 364 *    group
 365 *    object
 366 *
 367 *  - generate in-core bitmap:
 368 *    group
 369 *        pa
 370 *
 371 *  - discard all for given object (inode, locality group):
 372 *    object
 373 *        pa
 374 *    group
 375 *
 376 *  - discard all for given group:
 377 *    group
 378 *        pa
 379 *    group
 380 *        object
 381 *
 382 *  - allocation path (ext4_mb_regular_allocator)
 383 *    group
 384 *    cr0/cr1
 385 */
 386static struct kmem_cache *ext4_pspace_cachep;
 387static struct kmem_cache *ext4_ac_cachep;
 388static struct kmem_cache *ext4_free_data_cachep;
 389
 390/* We create slab caches for groupinfo data structures based on the
 391 * superblock block size.  There will be one per mounted filesystem for
 392 * each unique s_blocksize_bits */
 393#define NR_GRPINFO_CACHES 8
 394static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 395
 396static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 397        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 398        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 399        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
 400};
 401
 402static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 403                                        ext4_group_t group);
 404static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 405                                                ext4_group_t group);
 406static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 407
 408static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
 409                               ext4_group_t group, int cr);
 410
 411/*
 412 * The algorithm using this percpu seq counter goes below:
 413 * 1. We sample the percpu discard_pa_seq counter before trying for block
 414 *    allocation in ext4_mb_new_blocks().
 415 * 2. We increment this percpu discard_pa_seq counter when we either allocate
 416 *    or free these blocks i.e. while marking those blocks as used/free in
 417 *    mb_mark_used()/mb_free_blocks().
 418 * 3. We also increment this percpu seq counter when we successfully identify
 419 *    that the bb_prealloc_list is not empty and hence proceed for discarding
 420 *    of those PAs inside ext4_mb_discard_group_preallocations().
 421 *
 422 * Now to make sure that the regular fast path of block allocation is not
 423 * affected, as a small optimization we only sample the percpu seq counter
 424 * on that cpu. Only when the block allocation fails and when freed blocks
 425 * found were 0, that is when we sample percpu seq counter for all cpus using
 426 * below function ext4_get_discard_pa_seq_sum(). This happens after making
 427 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
 428 */
 429static DEFINE_PER_CPU(u64, discard_pa_seq);
 430static inline u64 ext4_get_discard_pa_seq_sum(void)
 431{
 432        int __cpu;
 433        u64 __seq = 0;
 434
 435        for_each_possible_cpu(__cpu)
 436                __seq += per_cpu(discard_pa_seq, __cpu);
 437        return __seq;
 438}
 439
 440static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 441{
 442#if BITS_PER_LONG == 64
 443        *bit += ((unsigned long) addr & 7UL) << 3;
 444        addr = (void *) ((unsigned long) addr & ~7UL);
 445#elif BITS_PER_LONG == 32
 446        *bit += ((unsigned long) addr & 3UL) << 3;
 447        addr = (void *) ((unsigned long) addr & ~3UL);
 448#else
 449#error "how many bits you are?!"
 450#endif
 451        return addr;
 452}
 453
 454static inline int mb_test_bit(int bit, void *addr)
 455{
 456        /*
 457         * ext4_test_bit on architecture like powerpc
 458         * needs unsigned long aligned address
 459         */
 460        addr = mb_correct_addr_and_bit(&bit, addr);
 461        return ext4_test_bit(bit, addr);
 462}
 463
 464static inline void mb_set_bit(int bit, void *addr)
 465{
 466        addr = mb_correct_addr_and_bit(&bit, addr);
 467        ext4_set_bit(bit, addr);
 468}
 469
 470static inline void mb_clear_bit(int bit, void *addr)
 471{
 472        addr = mb_correct_addr_and_bit(&bit, addr);
 473        ext4_clear_bit(bit, addr);
 474}
 475
 476static inline int mb_test_and_clear_bit(int bit, void *addr)
 477{
 478        addr = mb_correct_addr_and_bit(&bit, addr);
 479        return ext4_test_and_clear_bit(bit, addr);
 480}
 481
 482static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 483{
 484        int fix = 0, ret, tmpmax;
 485        addr = mb_correct_addr_and_bit(&fix, addr);
 486        tmpmax = max + fix;
 487        start += fix;
 488
 489        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
 490        if (ret > max)
 491                return max;
 492        return ret;
 493}
 494
 495static inline int mb_find_next_bit(void *addr, int max, int start)
 496{
 497        int fix = 0, ret, tmpmax;
 498        addr = mb_correct_addr_and_bit(&fix, addr);
 499        tmpmax = max + fix;
 500        start += fix;
 501
 502        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
 503        if (ret > max)
 504                return max;
 505        return ret;
 506}
 507
 508static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 509{
 510        char *bb;
 511
 512        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 513        BUG_ON(max == NULL);
 514
 515        if (order > e4b->bd_blkbits + 1) {
 516                *max = 0;
 517                return NULL;
 518        }
 519
 520        /* at order 0 we see each particular block */
 521        if (order == 0) {
 522                *max = 1 << (e4b->bd_blkbits + 3);
 523                return e4b->bd_bitmap;
 524        }
 525
 526        bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 527        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 528
 529        return bb;
 530}
 531
 532#ifdef DOUBLE_CHECK
 533static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 534                           int first, int count)
 535{
 536        int i;
 537        struct super_block *sb = e4b->bd_sb;
 538
 539        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 540                return;
 541        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 542        for (i = 0; i < count; i++) {
 543                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 544                        ext4_fsblk_t blocknr;
 545
 546                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 547                        blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
 548                        ext4_grp_locked_error(sb, e4b->bd_group,
 549                                              inode ? inode->i_ino : 0,
 550                                              blocknr,
 551                                              "freeing block already freed "
 552                                              "(bit %u)",
 553                                              first + i);
 554                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
 555                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 556                }
 557                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 558        }
 559}
 560
 561static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 562{
 563        int i;
 564
 565        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 566                return;
 567        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 568        for (i = 0; i < count; i++) {
 569                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 570                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
 571        }
 572}
 573
 574static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 575{
 576        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 577                return;
 578        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
 579                unsigned char *b1, *b2;
 580                int i;
 581                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
 582                b2 = (unsigned char *) bitmap;
 583                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 584                        if (b1[i] != b2[i]) {
 585                                ext4_msg(e4b->bd_sb, KERN_ERR,
 586                                         "corruption in group %u "
 587                                         "at byte %u(%u): %x in copy != %x "
 588                                         "on disk/prealloc",
 589                                         e4b->bd_group, i, i * 8, b1[i], b2[i]);
 590                                BUG();
 591                        }
 592                }
 593        }
 594}
 595
 596static void mb_group_bb_bitmap_alloc(struct super_block *sb,
 597                        struct ext4_group_info *grp, ext4_group_t group)
 598{
 599        struct buffer_head *bh;
 600
 601        grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
 602        if (!grp->bb_bitmap)
 603                return;
 604
 605        bh = ext4_read_block_bitmap(sb, group);
 606        if (IS_ERR_OR_NULL(bh)) {
 607                kfree(grp->bb_bitmap);
 608                grp->bb_bitmap = NULL;
 609                return;
 610        }
 611
 612        memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
 613        put_bh(bh);
 614}
 615
 616static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
 617{
 618        kfree(grp->bb_bitmap);
 619}
 620
 621#else
 622static inline void mb_free_blocks_double(struct inode *inode,
 623                                struct ext4_buddy *e4b, int first, int count)
 624{
 625        return;
 626}
 627static inline void mb_mark_used_double(struct ext4_buddy *e4b,
 628                                                int first, int count)
 629{
 630        return;
 631}
 632static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 633{
 634        return;
 635}
 636
 637static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
 638                        struct ext4_group_info *grp, ext4_group_t group)
 639{
 640        return;
 641}
 642
 643static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
 644{
 645        return;
 646}
 647#endif
 648
 649#ifdef AGGRESSIVE_CHECK
 650
 651#define MB_CHECK_ASSERT(assert)                                         \
 652do {                                                                    \
 653        if (!(assert)) {                                                \
 654                printk(KERN_EMERG                                       \
 655                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
 656                        function, file, line, # assert);                \
 657                BUG();                                                  \
 658        }                                                               \
 659} while (0)
 660
 661static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 662                                const char *function, int line)
 663{
 664        struct super_block *sb = e4b->bd_sb;
 665        int order = e4b->bd_blkbits + 1;
 666        int max;
 667        int max2;
 668        int i;
 669        int j;
 670        int k;
 671        int count;
 672        struct ext4_group_info *grp;
 673        int fragments = 0;
 674        int fstart;
 675        struct list_head *cur;
 676        void *buddy;
 677        void *buddy2;
 678
 679        if (e4b->bd_info->bb_check_counter++ % 10)
 680                return 0;
 681
 682        while (order > 1) {
 683                buddy = mb_find_buddy(e4b, order, &max);
 684                MB_CHECK_ASSERT(buddy);
 685                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
 686                MB_CHECK_ASSERT(buddy2);
 687                MB_CHECK_ASSERT(buddy != buddy2);
 688                MB_CHECK_ASSERT(max * 2 == max2);
 689
 690                count = 0;
 691                for (i = 0; i < max; i++) {
 692
 693                        if (mb_test_bit(i, buddy)) {
 694                                /* only single bit in buddy2 may be 1 */
 695                                if (!mb_test_bit(i << 1, buddy2)) {
 696                                        MB_CHECK_ASSERT(
 697                                                mb_test_bit((i<<1)+1, buddy2));
 698                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
 699                                        MB_CHECK_ASSERT(
 700                                                mb_test_bit(i << 1, buddy2));
 701                                }
 702                                continue;
 703                        }
 704
 705                        /* both bits in buddy2 must be 1 */
 706                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
 707                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 708
 709                        for (j = 0; j < (1 << order); j++) {
 710                                k = (i * (1 << order)) + j;
 711                                MB_CHECK_ASSERT(
 712                                        !mb_test_bit(k, e4b->bd_bitmap));
 713                        }
 714                        count++;
 715                }
 716                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
 717                order--;
 718        }
 719
 720        fstart = -1;
 721        buddy = mb_find_buddy(e4b, 0, &max);
 722        for (i = 0; i < max; i++) {
 723                if (!mb_test_bit(i, buddy)) {
 724                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
 725                        if (fstart == -1) {
 726                                fragments++;
 727                                fstart = i;
 728                        }
 729                        continue;
 730                }
 731                fstart = -1;
 732                /* check used bits only */
 733                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
 734                        buddy2 = mb_find_buddy(e4b, j, &max2);
 735                        k = i >> j;
 736                        MB_CHECK_ASSERT(k < max2);
 737                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
 738                }
 739        }
 740        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
 741        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 742
 743        grp = ext4_get_group_info(sb, e4b->bd_group);
 744        list_for_each(cur, &grp->bb_prealloc_list) {
 745                ext4_group_t groupnr;
 746                struct ext4_prealloc_space *pa;
 747                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 748                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
 749                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
 750                for (i = 0; i < pa->pa_len; i++)
 751                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 752        }
 753        return 0;
 754}
 755#undef MB_CHECK_ASSERT
 756#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
 757                                        __FILE__, __func__, __LINE__)
 758#else
 759#define mb_check_buddy(e4b)
 760#endif
 761
 762/*
 763 * Divide blocks started from @first with length @len into
 764 * smaller chunks with power of 2 blocks.
 765 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
 766 * then increase bb_counters[] for corresponded chunk size.
 767 */
 768static void ext4_mb_mark_free_simple(struct super_block *sb,
 769                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 770                                        struct ext4_group_info *grp)
 771{
 772        struct ext4_sb_info *sbi = EXT4_SB(sb);
 773        ext4_grpblk_t min;
 774        ext4_grpblk_t max;
 775        ext4_grpblk_t chunk;
 776        unsigned int border;
 777
 778        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 779
 780        border = 2 << sb->s_blocksize_bits;
 781
 782        while (len > 0) {
 783                /* find how many blocks can be covered since this position */
 784                max = ffs(first | border) - 1;
 785
 786                /* find how many blocks of power 2 we need to mark */
 787                min = fls(len) - 1;
 788
 789                if (max < min)
 790                        min = max;
 791                chunk = 1 << min;
 792
 793                /* mark multiblock chunks only */
 794                grp->bb_counters[min]++;
 795                if (min > 0)
 796                        mb_clear_bit(first >> min,
 797                                     buddy + sbi->s_mb_offsets[min]);
 798
 799                len -= chunk;
 800                first += chunk;
 801        }
 802}
 803
 804static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
 805                        int (*cmp)(struct rb_node *, struct rb_node *))
 806{
 807        struct rb_node **iter = &root->rb_node, *parent = NULL;
 808
 809        while (*iter) {
 810                parent = *iter;
 811                if (cmp(new, *iter) > 0)
 812                        iter = &((*iter)->rb_left);
 813                else
 814                        iter = &((*iter)->rb_right);
 815        }
 816
 817        rb_link_node(new, parent, iter);
 818        rb_insert_color(new, root);
 819}
 820
 821static int
 822ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
 823{
 824        struct ext4_group_info *grp1 = rb_entry(rb1,
 825                                                struct ext4_group_info,
 826                                                bb_avg_fragment_size_rb);
 827        struct ext4_group_info *grp2 = rb_entry(rb2,
 828                                                struct ext4_group_info,
 829                                                bb_avg_fragment_size_rb);
 830        int num_frags_1, num_frags_2;
 831
 832        num_frags_1 = grp1->bb_fragments ?
 833                grp1->bb_free / grp1->bb_fragments : 0;
 834        num_frags_2 = grp2->bb_fragments ?
 835                grp2->bb_free / grp2->bb_fragments : 0;
 836
 837        return (num_frags_2 - num_frags_1);
 838}
 839
 840/*
 841 * Reinsert grpinfo into the avg_fragment_size tree with new average
 842 * fragment size.
 843 */
 844static void
 845mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
 846{
 847        struct ext4_sb_info *sbi = EXT4_SB(sb);
 848
 849        if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
 850                return;
 851
 852        write_lock(&sbi->s_mb_rb_lock);
 853        if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
 854                rb_erase(&grp->bb_avg_fragment_size_rb,
 855                                &sbi->s_mb_avg_fragment_size_root);
 856                RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
 857        }
 858
 859        ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
 860                &grp->bb_avg_fragment_size_rb,
 861                ext4_mb_avg_fragment_size_cmp);
 862        write_unlock(&sbi->s_mb_rb_lock);
 863}
 864
 865/*
 866 * Choose next group by traversing largest_free_order lists. Updates *new_cr if
 867 * cr level needs an update.
 868 */
 869static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
 870                        int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
 871{
 872        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 873        struct ext4_group_info *iter, *grp;
 874        int i;
 875
 876        if (ac->ac_status == AC_STATUS_FOUND)
 877                return;
 878
 879        if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
 880                atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
 881
 882        grp = NULL;
 883        for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
 884                if (list_empty(&sbi->s_mb_largest_free_orders[i]))
 885                        continue;
 886                read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
 887                if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
 888                        read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
 889                        continue;
 890                }
 891                grp = NULL;
 892                list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
 893                                    bb_largest_free_order_node) {
 894                        if (sbi->s_mb_stats)
 895                                atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
 896                        if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
 897                                grp = iter;
 898                                break;
 899                        }
 900                }
 901                read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
 902                if (grp)
 903                        break;
 904        }
 905
 906        if (!grp) {
 907                /* Increment cr and search again */
 908                *new_cr = 1;
 909        } else {
 910                *group = grp->bb_group;
 911                ac->ac_last_optimal_group = *group;
 912                ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
 913        }
 914}
 915
 916/*
 917 * Choose next group by traversing average fragment size tree. Updates *new_cr
 918 * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
 919 * the linear search should continue for one iteration since there's lock
 920 * contention on the rb tree lock.
 921 */
 922static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
 923                int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
 924{
 925        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 926        int avg_fragment_size, best_so_far;
 927        struct rb_node *node, *found;
 928        struct ext4_group_info *grp;
 929
 930        /*
 931         * If there is contention on the lock, instead of waiting for the lock
 932         * to become available, just continue searching lineraly. We'll resume
 933         * our rb tree search later starting at ac->ac_last_optimal_group.
 934         */
 935        if (!read_trylock(&sbi->s_mb_rb_lock)) {
 936                ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
 937                return;
 938        }
 939
 940        if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
 941                if (sbi->s_mb_stats)
 942                        atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
 943                /* We have found something at CR 1 in the past */
 944                grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
 945                for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
 946                     found = rb_next(found)) {
 947                        grp = rb_entry(found, struct ext4_group_info,
 948                                       bb_avg_fragment_size_rb);
 949                        if (sbi->s_mb_stats)
 950                                atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
 951                        if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
 952                                break;
 953                }
 954                goto done;
 955        }
 956
 957        node = sbi->s_mb_avg_fragment_size_root.rb_node;
 958        best_so_far = 0;
 959        found = NULL;
 960
 961        while (node) {
 962                grp = rb_entry(node, struct ext4_group_info,
 963                               bb_avg_fragment_size_rb);
 964                avg_fragment_size = 0;
 965                if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
 966                        avg_fragment_size = grp->bb_fragments ?
 967                                grp->bb_free / grp->bb_fragments : 0;
 968                        if (!best_so_far || avg_fragment_size < best_so_far) {
 969                                best_so_far = avg_fragment_size;
 970                                found = node;
 971                        }
 972                }
 973                if (avg_fragment_size > ac->ac_g_ex.fe_len)
 974                        node = node->rb_right;
 975                else
 976                        node = node->rb_left;
 977        }
 978
 979done:
 980        if (found) {
 981                grp = rb_entry(found, struct ext4_group_info,
 982                               bb_avg_fragment_size_rb);
 983                *group = grp->bb_group;
 984                ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
 985        } else {
 986                *new_cr = 2;
 987        }
 988
 989        read_unlock(&sbi->s_mb_rb_lock);
 990        ac->ac_last_optimal_group = *group;
 991}
 992
 993static inline int should_optimize_scan(struct ext4_allocation_context *ac)
 994{
 995        if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
 996                return 0;
 997        if (ac->ac_criteria >= 2)
 998                return 0;
 999        if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
1000                return 0;
1001        return 1;
1002}
1003
1004/*
1005 * Return next linear group for allocation. If linear traversal should not be
1006 * performed, this function just returns the same group
1007 */
1008static int
1009next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
1010{
1011        if (!should_optimize_scan(ac))
1012                goto inc_and_return;
1013
1014        if (ac->ac_groups_linear_remaining) {
1015                ac->ac_groups_linear_remaining--;
1016                goto inc_and_return;
1017        }
1018
1019        if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
1020                ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
1021                goto inc_and_return;
1022        }
1023
1024        return group;
1025inc_and_return:
1026        /*
1027         * Artificially restricted ngroups for non-extent
1028         * files makes group > ngroups possible on first loop.
1029         */
1030        return group + 1 >= ngroups ? 0 : group + 1;
1031}
1032
1033/*
1034 * ext4_mb_choose_next_group: choose next group for allocation.
1035 *
1036 * @ac        Allocation Context
1037 * @new_cr    This is an output parameter. If the there is no good group
1038 *            available at current CR level, this field is updated to indicate
1039 *            the new cr level that should be used.
1040 * @group     This is an input / output parameter. As an input it indicates the
1041 *            next group that the allocator intends to use for allocation. As
1042 *            output, this field indicates the next group that should be used as
1043 *            determined by the optimization functions.
1044 * @ngroups   Total number of groups
1045 */
1046static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
1047                int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
1048{
1049        *new_cr = ac->ac_criteria;
1050
1051        if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
1052                return;
1053
1054        if (*new_cr == 0) {
1055                ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
1056        } else if (*new_cr == 1) {
1057                ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
1058        } else {
1059                /*
1060                 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
1061                 * bb_free. But until that happens, we should never come here.
1062                 */
1063                WARN_ON(1);
1064        }
1065}
1066
1067/*
1068 * Cache the order of the largest free extent we have available in this block
1069 * group.
1070 */
1071static void
1072mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
1073{
1074        struct ext4_sb_info *sbi = EXT4_SB(sb);
1075        int i;
1076
1077        if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
1078                write_lock(&sbi->s_mb_largest_free_orders_locks[
1079                                              grp->bb_largest_free_order]);
1080                list_del_init(&grp->bb_largest_free_order_node);
1081                write_unlock(&sbi->s_mb_largest_free_orders_locks[
1082                                              grp->bb_largest_free_order]);
1083        }
1084        grp->bb_largest_free_order = -1; /* uninit */
1085
1086        for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
1087                if (grp->bb_counters[i] > 0) {
1088                        grp->bb_largest_free_order = i;
1089                        break;
1090                }
1091        }
1092        if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
1093            grp->bb_largest_free_order >= 0 && grp->bb_free) {
1094                write_lock(&sbi->s_mb_largest_free_orders_locks[
1095                                              grp->bb_largest_free_order]);
1096                list_add_tail(&grp->bb_largest_free_order_node,
1097                      &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
1098                write_unlock(&sbi->s_mb_largest_free_orders_locks[
1099                                              grp->bb_largest_free_order]);
1100        }
1101}
1102
1103static noinline_for_stack
1104void ext4_mb_generate_buddy(struct super_block *sb,
1105                                void *buddy, void *bitmap, ext4_group_t group)
1106{
1107        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1108        struct ext4_sb_info *sbi = EXT4_SB(sb);
1109        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
1110        ext4_grpblk_t i = 0;
1111        ext4_grpblk_t first;
1112        ext4_grpblk_t len;
1113        unsigned free = 0;
1114        unsigned fragments = 0;
1115        unsigned long long period = get_cycles();
1116
1117        /* initialize buddy from bitmap which is aggregation
1118         * of on-disk bitmap and preallocations */
1119        i = mb_find_next_zero_bit(bitmap, max, 0);
1120        grp->bb_first_free = i;
1121        while (i < max) {
1122                fragments++;
1123                first = i;
1124                i = mb_find_next_bit(bitmap, max, i);
1125                len = i - first;
1126                free += len;
1127                if (len > 1)
1128                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
1129                else
1130                        grp->bb_counters[0]++;
1131                if (i < max)
1132                        i = mb_find_next_zero_bit(bitmap, max, i);
1133        }
1134        grp->bb_fragments = fragments;
1135
1136        if (free != grp->bb_free) {
1137                ext4_grp_locked_error(sb, group, 0, 0,
1138                                      "block bitmap and bg descriptor "
1139                                      "inconsistent: %u vs %u free clusters",
1140                                      free, grp->bb_free);
1141                /*
1142                 * If we intend to continue, we consider group descriptor
1143                 * corrupt and update bb_free using bitmap value
1144                 */
1145                grp->bb_free = free;
1146                ext4_mark_group_bitmap_corrupted(sb, group,
1147                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1148        }
1149        mb_set_largest_free_order(sb, grp);
1150
1151        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
1152
1153        period = get_cycles() - period;
1154        atomic_inc(&sbi->s_mb_buddies_generated);
1155        atomic64_add(period, &sbi->s_mb_generation_time);
1156        mb_update_avg_fragment_size(sb, grp);
1157}
1158
1159/* The buddy information is attached the buddy cache inode
1160 * for convenience. The information regarding each group
1161 * is loaded via ext4_mb_load_buddy. The information involve
1162 * block bitmap and buddy information. The information are
1163 * stored in the inode as
1164 *
1165 * {                        page                        }
1166 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1167 *
1168 *
1169 * one block each for bitmap and buddy information.
1170 * So for each group we take up 2 blocks. A page can
1171 * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
1172 * So it can have information regarding groups_per_page which
1173 * is blocks_per_page/2
1174 *
1175 * Locking note:  This routine takes the block group lock of all groups
1176 * for this page; do not hold this lock when calling this routine!
1177 */
1178
1179static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
1180{
1181        ext4_group_t ngroups;
1182        int blocksize;
1183        int blocks_per_page;
1184        int groups_per_page;
1185        int err = 0;
1186        int i;
1187        ext4_group_t first_group, group;
1188        int first_block;
1189        struct super_block *sb;
1190        struct buffer_head *bhs;
1191        struct buffer_head **bh = NULL;
1192        struct inode *inode;
1193        char *data;
1194        char *bitmap;
1195        struct ext4_group_info *grinfo;
1196
1197        inode = page->mapping->host;
1198        sb = inode->i_sb;
1199        ngroups = ext4_get_groups_count(sb);
1200        blocksize = i_blocksize(inode);
1201        blocks_per_page = PAGE_SIZE / blocksize;
1202
1203        mb_debug(sb, "init page %lu\n", page->index);
1204
1205        groups_per_page = blocks_per_page >> 1;
1206        if (groups_per_page == 0)
1207                groups_per_page = 1;
1208
1209        /* allocate buffer_heads to read bitmaps */
1210        if (groups_per_page > 1) {
1211                i = sizeof(struct buffer_head *) * groups_per_page;
1212                bh = kzalloc(i, gfp);
1213                if (bh == NULL) {
1214                        err = -ENOMEM;
1215                        goto out;
1216                }
1217        } else
1218                bh = &bhs;
1219
1220        first_group = page->index * blocks_per_page / 2;
1221
1222        /* read all groups the page covers into the cache */
1223        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1224                if (group >= ngroups)
1225                        break;
1226
1227                grinfo = ext4_get_group_info(sb, group);
1228                /*
1229                 * If page is uptodate then we came here after online resize
1230                 * which added some new uninitialized group info structs, so
1231                 * we must skip all initialized uptodate buddies on the page,
1232                 * which may be currently in use by an allocating task.
1233                 */
1234                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
1235                        bh[i] = NULL;
1236                        continue;
1237                }
1238                bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
1239                if (IS_ERR(bh[i])) {
1240                        err = PTR_ERR(bh[i]);
1241                        bh[i] = NULL;
1242                        goto out;
1243                }
1244                mb_debug(sb, "read bitmap for group %u\n", group);
1245        }
1246
1247        /* wait for I/O completion */
1248        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
1249                int err2;
1250
1251                if (!bh[i])
1252                        continue;
1253                err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
1254                if (!err)
1255                        err = err2;
1256        }
1257
1258        first_block = page->index * blocks_per_page;
1259        for (i = 0; i < blocks_per_page; i++) {
1260                group = (first_block + i) >> 1;
1261                if (group >= ngroups)
1262                        break;
1263
1264                if (!bh[group - first_group])
1265                        /* skip initialized uptodate buddy */
1266                        continue;
1267
1268                if (!buffer_verified(bh[group - first_group]))
1269                        /* Skip faulty bitmaps */
1270                        continue;
1271                err = 0;
1272
1273                /*
1274                 * data carry information regarding this
1275                 * particular group in the format specified
1276                 * above
1277                 *
1278                 */
1279                data = page_address(page) + (i * blocksize);
1280                bitmap = bh[group - first_group]->b_data;
1281
1282                /*
1283                 * We place the buddy block and bitmap block
1284                 * close together
1285                 */
1286                if ((first_block + i) & 1) {
1287                        /* this is block of buddy */
1288                        BUG_ON(incore == NULL);
1289                        mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
1290                                group, page->index, i * blocksize);
1291                        trace_ext4_mb_buddy_bitmap_load(sb, group);
1292                        grinfo = ext4_get_group_info(sb, group);
1293                        grinfo->bb_fragments = 0;
1294                        memset(grinfo->bb_counters, 0,
1295                               sizeof(*grinfo->bb_counters) *
1296                               (MB_NUM_ORDERS(sb)));
1297                        /*
1298                         * incore got set to the group block bitmap below
1299                         */
1300                        ext4_lock_group(sb, group);
1301                        /* init the buddy */
1302                        memset(data, 0xff, blocksize);
1303                        ext4_mb_generate_buddy(sb, data, incore, group);
1304                        ext4_unlock_group(sb, group);
1305                        incore = NULL;
1306                } else {
1307                        /* this is block of bitmap */
1308                        BUG_ON(incore != NULL);
1309                        mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
1310                                group, page->index, i * blocksize);
1311                        trace_ext4_mb_bitmap_load(sb, group);
1312
1313                        /* see comments in ext4_mb_put_pa() */
1314                        ext4_lock_group(sb, group);
1315                        memcpy(data, bitmap, blocksize);
1316
1317                        /* mark all preallocated blks used in in-core bitmap */
1318                        ext4_mb_generate_from_pa(sb, data, group);
1319                        ext4_mb_generate_from_freelist(sb, data, group);
1320                        ext4_unlock_group(sb, group);
1321
1322                        /* set incore so that the buddy information can be
1323                         * generated using this
1324                         */
1325                        incore = data;
1326                }
1327        }
1328        SetPageUptodate(page);
1329
1330out:
1331        if (bh) {
1332                for (i = 0; i < groups_per_page; i++)
1333                        brelse(bh[i]);
1334                if (bh != &bhs)
1335                        kfree(bh);
1336        }
1337        return err;
1338}
1339
1340/*
1341 * Lock the buddy and bitmap pages. This make sure other parallel init_group
1342 * on the same buddy page doesn't happen whild holding the buddy page lock.
1343 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
1344 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
1345 */
1346static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1347                ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
1348{
1349        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
1350        int block, pnum, poff;
1351        int blocks_per_page;
1352        struct page *page;
1353
1354        e4b->bd_buddy_page = NULL;
1355        e4b->bd_bitmap_page = NULL;
1356
1357        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1358        /*
1359         * the buddy cache inode stores the block bitmap
1360         * and buddy information in consecutive blocks.
1361         * So for each group we need two blocks.
1362         */
1363        block = group * 2;
1364        pnum = block / blocks_per_page;
1365        poff = block % blocks_per_page;
1366        page = find_or_create_page(inode->i_mapping, pnum, gfp);
1367        if (!page)
1368                return -ENOMEM;
1369        BUG_ON(page->mapping != inode->i_mapping);
1370        e4b->bd_bitmap_page = page;
1371        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1372
1373        if (blocks_per_page >= 2) {
1374                /* buddy and bitmap are on the same page */
1375                return 0;
1376        }
1377
1378        block++;
1379        pnum = block / blocks_per_page;
1380        page = find_or_create_page(inode->i_mapping, pnum, gfp);
1381        if (!page)
1382                return -ENOMEM;
1383        BUG_ON(page->mapping != inode->i_mapping);
1384        e4b->bd_buddy_page = page;
1385        return 0;
1386}
1387
1388static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1389{
1390        if (e4b->bd_bitmap_page) {
1391                unlock_page(e4b->bd_bitmap_page);
1392                put_page(e4b->bd_bitmap_page);
1393        }
1394        if (e4b->bd_buddy_page) {
1395                unlock_page(e4b->bd_buddy_page);
1396                put_page(e4b->bd_buddy_page);
1397        }
1398}
1399
1400/*
1401 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1402 * block group lock of all groups for this page; do not hold the BG lock when
1403 * calling this routine!
1404 */
1405static noinline_for_stack
1406int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1407{
1408
1409        struct ext4_group_info *this_grp;
1410        struct ext4_buddy e4b;
1411        struct page *page;
1412        int ret = 0;
1413
1414        might_sleep();
1415        mb_debug(sb, "init group %u\n", group);
1416        this_grp = ext4_get_group_info(sb, group);
1417        /*
1418         * This ensures that we don't reinit the buddy cache
1419         * page which map to the group from which we are already
1420         * allocating. If we are looking at the buddy cache we would
1421         * have taken a reference using ext4_mb_load_buddy and that
1422         * would have pinned buddy page to page cache.
1423         * The call to ext4_mb_get_buddy_page_lock will mark the
1424         * page accessed.
1425         */
1426        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1427        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1428                /*
1429                 * somebody initialized the group
1430                 * return without doing anything
1431                 */
1432                goto err;
1433        }
1434
1435        page = e4b.bd_bitmap_page;
1436        ret = ext4_mb_init_cache(page, NULL, gfp);
1437        if (ret)
1438                goto err;
1439        if (!PageUptodate(page)) {
1440                ret = -EIO;
1441                goto err;
1442        }
1443
1444        if (e4b.bd_buddy_page == NULL) {
1445                /*
1446                 * If both the bitmap and buddy are in
1447                 * the same page we don't need to force
1448                 * init the buddy
1449                 */
1450                ret = 0;
1451                goto err;
1452        }
1453        /* init buddy cache */
1454        page = e4b.bd_buddy_page;
1455        ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1456        if (ret)
1457                goto err;
1458        if (!PageUptodate(page)) {
1459                ret = -EIO;
1460                goto err;
1461        }
1462err:
1463        ext4_mb_put_buddy_page_lock(&e4b);
1464        return ret;
1465}
1466
1467/*
1468 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1469 * block group lock of all groups for this page; do not hold the BG lock when
1470 * calling this routine!
1471 */
1472static noinline_for_stack int
1473ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1474                       struct ext4_buddy *e4b, gfp_t gfp)
1475{
1476        int blocks_per_page;
1477        int block;
1478        int pnum;
1479        int poff;
1480        struct page *page;
1481        int ret;
1482        struct ext4_group_info *grp;
1483        struct ext4_sb_info *sbi = EXT4_SB(sb);
1484        struct inode *inode = sbi->s_buddy_cache;
1485
1486        might_sleep();
1487        mb_debug(sb, "load group %u\n", group);
1488
1489        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1490        grp = ext4_get_group_info(sb, group);
1491
1492        e4b->bd_blkbits = sb->s_blocksize_bits;
1493        e4b->bd_info = grp;
1494        e4b->bd_sb = sb;
1495        e4b->bd_group = group;
1496        e4b->bd_buddy_page = NULL;
1497        e4b->bd_bitmap_page = NULL;
1498
1499        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1500                /*
1501                 * we need full data about the group
1502                 * to make a good selection
1503                 */
1504                ret = ext4_mb_init_group(sb, group, gfp);
1505                if (ret)
1506                        return ret;
1507        }
1508
1509        /*
1510         * the buddy cache inode stores the block bitmap
1511         * and buddy information in consecutive blocks.
1512         * So for each group we need two blocks.
1513         */
1514        block = group * 2;
1515        pnum = block / blocks_per_page;
1516        poff = block % blocks_per_page;
1517
1518        /* we could use find_or_create_page(), but it locks page
1519         * what we'd like to avoid in fast path ... */
1520        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1521        if (page == NULL || !PageUptodate(page)) {
1522                if (page)
1523                        /*
1524                         * drop the page reference and try
1525                         * to get the page with lock. If we
1526                         * are not uptodate that implies
1527                         * somebody just created the page but
1528                         * is yet to initialize the same. So
1529                         * wait for it to initialize.
1530                         */
1531                        put_page(page);
1532                page = find_or_create_page(inode->i_mapping, pnum, gfp);
1533                if (page) {
1534                        BUG_ON(page->mapping != inode->i_mapping);
1535                        if (!PageUptodate(page)) {
1536                                ret = ext4_mb_init_cache(page, NULL, gfp);
1537                                if (ret) {
1538                                        unlock_page(page);
1539                                        goto err;
1540                                }
1541                                mb_cmp_bitmaps(e4b, page_address(page) +
1542                                               (poff * sb->s_blocksize));
1543                        }
1544                        unlock_page(page);
1545                }
1546        }
1547        if (page == NULL) {
1548                ret = -ENOMEM;
1549                goto err;
1550        }
1551        if (!PageUptodate(page)) {
1552                ret = -EIO;
1553                goto err;
1554        }
1555
1556        /* Pages marked accessed already */
1557        e4b->bd_bitmap_page = page;
1558        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1559
1560        block++;
1561        pnum = block / blocks_per_page;
1562        poff = block % blocks_per_page;
1563
1564        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1565        if (page == NULL || !PageUptodate(page)) {
1566                if (page)
1567                        put_page(page);
1568                page = find_or_create_page(inode->i_mapping, pnum, gfp);
1569                if (page) {
1570                        BUG_ON(page->mapping != inode->i_mapping);
1571                        if (!PageUptodate(page)) {
1572                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1573                                                         gfp);
1574                                if (ret) {
1575                                        unlock_page(page);
1576                                        goto err;
1577                                }
1578                        }
1579                        unlock_page(page);
1580                }
1581        }
1582        if (page == NULL) {
1583                ret = -ENOMEM;
1584                goto err;
1585        }
1586        if (!PageUptodate(page)) {
1587                ret = -EIO;
1588                goto err;
1589        }
1590
1591        /* Pages marked accessed already */
1592        e4b->bd_buddy_page = page;
1593        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1594
1595        return 0;
1596
1597err:
1598        if (page)
1599                put_page(page);
1600        if (e4b->bd_bitmap_page)
1601                put_page(e4b->bd_bitmap_page);
1602        if (e4b->bd_buddy_page)
1603                put_page(e4b->bd_buddy_page);
1604        e4b->bd_buddy = NULL;
1605        e4b->bd_bitmap = NULL;
1606        return ret;
1607}
1608
1609static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1610                              struct ext4_buddy *e4b)
1611{
1612        return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1613}
1614
1615static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1616{
1617        if (e4b->bd_bitmap_page)
1618                put_page(e4b->bd_bitmap_page);
1619        if (e4b->bd_buddy_page)
1620                put_page(e4b->bd_buddy_page);
1621}
1622
1623
1624static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1625{
1626        int order = 1, max;
1627        void *bb;
1628
1629        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1630        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1631
1632        while (order <= e4b->bd_blkbits + 1) {
1633                bb = mb_find_buddy(e4b, order, &max);
1634                if (!mb_test_bit(block >> order, bb)) {
1635                        /* this block is part of buddy of order 'order' */
1636                        return order;
1637                }
1638                order++;
1639        }
1640        return 0;
1641}
1642
1643static void mb_clear_bits(void *bm, int cur, int len)
1644{
1645        __u32 *addr;
1646
1647        len = cur + len;
1648        while (cur < len) {
1649                if ((cur & 31) == 0 && (len - cur) >= 32) {
1650                        /* fast path: clear whole word at once */
1651                        addr = bm + (cur >> 3);
1652                        *addr = 0;
1653                        cur += 32;
1654                        continue;
1655                }
1656                mb_clear_bit(cur, bm);
1657                cur++;
1658        }
1659}
1660
1661/* clear bits in given range
1662 * will return first found zero bit if any, -1 otherwise
1663 */
1664static int mb_test_and_clear_bits(void *bm, int cur, int len)
1665{
1666        __u32 *addr;
1667        int zero_bit = -1;
1668
1669        len = cur + len;
1670        while (cur < len) {
1671                if ((cur & 31) == 0 && (len - cur) >= 32) {
1672                        /* fast path: clear whole word at once */
1673                        addr = bm + (cur >> 3);
1674                        if (*addr != (__u32)(-1) && zero_bit == -1)
1675                                zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1676                        *addr = 0;
1677                        cur += 32;
1678                        continue;
1679                }
1680                if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1681                        zero_bit = cur;
1682                cur++;
1683        }
1684
1685        return zero_bit;
1686}
1687
1688void ext4_set_bits(void *bm, int cur, int len)
1689{
1690        __u32 *addr;
1691
1692        len = cur + len;
1693        while (cur < len) {
1694                if ((cur & 31) == 0 && (len - cur) >= 32) {
1695                        /* fast path: set whole word at once */
1696                        addr = bm + (cur >> 3);
1697                        *addr = 0xffffffff;
1698                        cur += 32;
1699                        continue;
1700                }
1701                mb_set_bit(cur, bm);
1702                cur++;
1703        }
1704}
1705
1706static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1707{
1708        if (mb_test_bit(*bit + side, bitmap)) {
1709                mb_clear_bit(*bit, bitmap);
1710                (*bit) -= side;
1711                return 1;
1712        }
1713        else {
1714                (*bit) += side;
1715                mb_set_bit(*bit, bitmap);
1716                return -1;
1717        }
1718}
1719
1720static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1721{
1722        int max;
1723        int order = 1;
1724        void *buddy = mb_find_buddy(e4b, order, &max);
1725
1726        while (buddy) {
1727                void *buddy2;
1728
1729                /* Bits in range [first; last] are known to be set since
1730                 * corresponding blocks were allocated. Bits in range
1731                 * (first; last) will stay set because they form buddies on
1732                 * upper layer. We just deal with borders if they don't
1733                 * align with upper layer and then go up.
1734                 * Releasing entire group is all about clearing
1735                 * single bit of highest order buddy.
1736                 */
1737
1738                /* Example:
1739                 * ---------------------------------
1740                 * |   1   |   1   |   1   |   1   |
1741                 * ---------------------------------
1742                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1743                 * ---------------------------------
1744                 *   0   1   2   3   4   5   6   7
1745                 *      \_____________________/
1746                 *
1747                 * Neither [1] nor [6] is aligned to above layer.
1748                 * Left neighbour [0] is free, so mark it busy,
1749                 * decrease bb_counters and extend range to
1750                 * [0; 6]
1751                 * Right neighbour [7] is busy. It can't be coaleasced with [6], so
1752                 * mark [6] free, increase bb_counters and shrink range to
1753                 * [0; 5].
1754                 * Then shift range to [0; 2], go up and do the same.
1755                 */
1756
1757
1758                if (first & 1)
1759                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1760                if (!(last & 1))
1761                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1762                if (first > last)
1763                        break;
1764                order++;
1765
1766                if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1767                        mb_clear_bits(buddy, first, last - first + 1);
1768                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1769                        break;
1770                }
1771                first >>= 1;
1772                last >>= 1;
1773                buddy = buddy2;
1774        }
1775}
1776
1777static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1778                           int first, int count)
1779{
1780        int left_is_free = 0;
1781        int right_is_free = 0;
1782        int block;
1783        int last = first + count - 1;
1784        struct super_block *sb = e4b->bd_sb;
1785
1786        if (WARN_ON(count == 0))
1787                return;
1788        BUG_ON(last >= (sb->s_blocksize << 3));
1789        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1790        /* Don't bother if the block group is corrupt. */
1791        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1792                return;
1793
1794        mb_check_buddy(e4b);
1795        mb_free_blocks_double(inode, e4b, first, count);
1796
1797        this_cpu_inc(discard_pa_seq);
1798        e4b->bd_info->bb_free += count;
1799        if (first < e4b->bd_info->bb_first_free)
1800                e4b->bd_info->bb_first_free = first;
1801
1802        /* access memory sequentially: check left neighbour,
1803         * clear range and then check right neighbour
1804         */
1805        if (first != 0)
1806                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1807        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1808        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1809                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1810
1811        if (unlikely(block != -1)) {
1812                struct ext4_sb_info *sbi = EXT4_SB(sb);
1813                ext4_fsblk_t blocknr;
1814
1815                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1816                blocknr += EXT4_C2B(sbi, block);
1817                if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1818                        ext4_grp_locked_error(sb, e4b->bd_group,
1819                                              inode ? inode->i_ino : 0,
1820                                              blocknr,
1821                                              "freeing already freed block (bit %u); block bitmap corrupt.",
1822                                              block);
1823                        ext4_mark_group_bitmap_corrupted(
1824                                sb, e4b->bd_group,
1825                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1826                }
1827                goto done;
1828        }
1829
1830        /* let's maintain fragments counter */
1831        if (left_is_free && right_is_free)
1832                e4b->bd_info->bb_fragments--;
1833        else if (!left_is_free && !right_is_free)
1834                e4b->bd_info->bb_fragments++;
1835
1836        /* buddy[0] == bd_bitmap is a special case, so handle
1837         * it right away and let mb_buddy_mark_free stay free of
1838         * zero order checks.
1839         * Check if neighbours are to be coaleasced,
1840         * adjust bitmap bb_counters and borders appropriately.
1841         */
1842        if (first & 1) {
1843                first += !left_is_free;
1844                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1845        }
1846        if (!(last & 1)) {
1847                last -= !right_is_free;
1848                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1849        }
1850
1851        if (first <= last)
1852                mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1853
1854done:
1855        mb_set_largest_free_order(sb, e4b->bd_info);
1856        mb_update_avg_fragment_size(sb, e4b->bd_info);
1857        mb_check_buddy(e4b);
1858}
1859
1860static int mb_find_extent(struct ext4_buddy *e4b, int block,
1861                                int needed, struct ext4_free_extent *ex)
1862{
1863        int next = block;
1864        int max, order;
1865        void *buddy;
1866
1867        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1868        BUG_ON(ex == NULL);
1869
1870        buddy = mb_find_buddy(e4b, 0, &max);
1871        BUG_ON(buddy == NULL);
1872        BUG_ON(block >= max);
1873        if (mb_test_bit(block, buddy)) {
1874                ex->fe_len = 0;
1875                ex->fe_start = 0;
1876                ex->fe_group = 0;
1877                return 0;
1878        }
1879
1880        /* find actual order */
1881        order = mb_find_order_for_block(e4b, block);
1882        block = block >> order;
1883
1884        ex->fe_len = 1 << order;
1885        ex->fe_start = block << order;
1886        ex->fe_group = e4b->bd_group;
1887
1888        /* calc difference from given start */
1889        next = next - ex->fe_start;
1890        ex->fe_len -= next;
1891        ex->fe_start += next;
1892
1893        while (needed > ex->fe_len &&
1894               mb_find_buddy(e4b, order, &max)) {
1895
1896                if (block + 1 >= max)
1897                        break;
1898
1899                next = (block + 1) * (1 << order);
1900                if (mb_test_bit(next, e4b->bd_bitmap))
1901                        break;
1902
1903                order = mb_find_order_for_block(e4b, next);
1904
1905                block = next >> order;
1906                ex->fe_len += 1 << order;
1907        }
1908
1909        if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
1910                /* Should never happen! (but apparently sometimes does?!?) */
1911                WARN_ON(1);
1912                ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
1913                        "corruption or bug in mb_find_extent "
1914                        "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
1915                        block, order, needed, ex->fe_group, ex->fe_start,
1916                        ex->fe_len, ex->fe_logical);
1917                ex->fe_len = 0;
1918                ex->fe_start = 0;
1919                ex->fe_group = 0;
1920        }
1921        return ex->fe_len;
1922}
1923
1924static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1925{
1926        int ord;
1927        int mlen = 0;
1928        int max = 0;
1929        int cur;
1930        int start = ex->fe_start;
1931        int len = ex->fe_len;
1932        unsigned ret = 0;
1933        int len0 = len;
1934        void *buddy;
1935
1936        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1937        BUG_ON(e4b->bd_group != ex->fe_group);
1938        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1939        mb_check_buddy(e4b);
1940        mb_mark_used_double(e4b, start, len);
1941
1942        this_cpu_inc(discard_pa_seq);
1943        e4b->bd_info->bb_free -= len;
1944        if (e4b->bd_info->bb_first_free == start)
1945                e4b->bd_info->bb_first_free += len;
1946
1947        /* let's maintain fragments counter */
1948        if (start != 0)
1949                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1950        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1951                max = !mb_test_bit(start + len, e4b->bd_bitmap);
1952        if (mlen && max)
1953                e4b->bd_info->bb_fragments++;
1954        else if (!mlen && !max)
1955                e4b->bd_info->bb_fragments--;
1956
1957        /* let's maintain buddy itself */
1958        while (len) {
1959                ord = mb_find_order_for_block(e4b, start);
1960
1961                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1962                        /* the whole chunk may be allocated at once! */
1963                        mlen = 1 << ord;
1964                        buddy = mb_find_buddy(e4b, ord, &max);
1965                        BUG_ON((start >> ord) >= max);
1966                        mb_set_bit(start >> ord, buddy);
1967                        e4b->bd_info->bb_counters[ord]--;
1968                        start += mlen;
1969                        len -= mlen;
1970                        BUG_ON(len < 0);
1971                        continue;
1972                }
1973
1974                /* store for history */
1975                if (ret == 0)
1976                        ret = len | (ord << 16);
1977
1978                /* we have to split large buddy */
1979                BUG_ON(ord <= 0);
1980                buddy = mb_find_buddy(e4b, ord, &max);
1981                mb_set_bit(start >> ord, buddy);
1982                e4b->bd_info->bb_counters[ord]--;
1983
1984                ord--;
1985                cur = (start >> ord) & ~1U;
1986                buddy = mb_find_buddy(e4b, ord, &max);
1987                mb_clear_bit(cur, buddy);
1988                mb_clear_bit(cur + 1, buddy);
1989                e4b->bd_info->bb_counters[ord]++;
1990                e4b->bd_info->bb_counters[ord]++;
1991        }
1992        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1993
1994        mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
1995        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1996        mb_check_buddy(e4b);
1997
1998        return ret;
1999}
2000
2001/*
2002 * Must be called under group lock!
2003 */
2004static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
2005                                        struct ext4_buddy *e4b)
2006{
2007        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2008        int ret;
2009
2010        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
2011        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2012
2013        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
2014        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
2015        ret = mb_mark_used(e4b, &ac->ac_b_ex);
2016
2017        /* preallocation can change ac_b_ex, thus we store actually
2018         * allocated blocks for history */
2019        ac->ac_f_ex = ac->ac_b_ex;
2020
2021        ac->ac_status = AC_STATUS_FOUND;
2022        ac->ac_tail = ret & 0xffff;
2023        ac->ac_buddy = ret >> 16;
2024
2025        /*
2026         * take the page reference. We want the page to be pinned
2027         * so that we don't get a ext4_mb_init_cache_call for this
2028         * group until we update the bitmap. That would mean we
2029         * double allocate blocks. The reference is dropped
2030         * in ext4_mb_release_context
2031         */
2032        ac->ac_bitmap_page = e4b->bd_bitmap_page;
2033        get_page(ac->ac_bitmap_page);
2034        ac->ac_buddy_page = e4b->bd_buddy_page;
2035        get_page(ac->ac_buddy_page);
2036        /* store last allocated for subsequent stream allocation */
2037        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2038                spin_lock(&sbi->s_md_lock);
2039                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2040                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2041                spin_unlock(&sbi->s_md_lock);
2042        }
2043        /*
2044         * As we've just preallocated more space than
2045         * user requested originally, we store allocated
2046         * space in a special descriptor.
2047         */
2048        if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2049                ext4_mb_new_preallocation(ac);
2050
2051}
2052
2053static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
2054                                        struct ext4_buddy *e4b,
2055                                        int finish_group)
2056{
2057        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2058        struct ext4_free_extent *bex = &ac->ac_b_ex;
2059        struct ext4_free_extent *gex = &ac->ac_g_ex;
2060        struct ext4_free_extent ex;
2061        int max;
2062
2063        if (ac->ac_status == AC_STATUS_FOUND)
2064                return;
2065        /*
2066         * We don't want to scan for a whole year
2067         */
2068        if (ac->ac_found > sbi->s_mb_max_to_scan &&
2069                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2070                ac->ac_status = AC_STATUS_BREAK;
2071                return;
2072        }
2073
2074        /*
2075         * Haven't found good chunk so far, let's continue
2076         */
2077        if (bex->fe_len < gex->fe_len)
2078                return;
2079
2080        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
2081                        && bex->fe_group == e4b->bd_group) {
2082                /* recheck chunk's availability - we don't know
2083                 * when it was found (within this lock-unlock
2084                 * period or not) */
2085                max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
2086                if (max >= gex->fe_len) {
2087                        ext4_mb_use_best_found(ac, e4b);
2088                        return;
2089                }
2090        }
2091}
2092
2093/*
2094 * The routine checks whether found extent is good enough. If it is,
2095 * then the extent gets marked used and flag is set to the context
2096 * to stop scanning. Otherwise, the extent is compared with the
2097 * previous found extent and if new one is better, then it's stored
2098 * in the context. Later, the best found extent will be used, if
2099 * mballoc can't find good enough extent.
2100 *
2101 * FIXME: real allocation policy is to be designed yet!
2102 */
2103static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
2104                                        struct ext4_free_extent *ex,
2105                                        struct ext4_buddy *e4b)
2106{
2107        struct ext4_free_extent *bex = &ac->ac_b_ex;
2108        struct ext4_free_extent *gex = &ac->ac_g_ex;
2109
2110        BUG_ON(ex->fe_len <= 0);
2111        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2112        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2113        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
2114
2115        ac->ac_found++;
2116
2117        /*
2118         * The special case - take what you catch first
2119         */
2120        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2121                *bex = *ex;
2122                ext4_mb_use_best_found(ac, e4b);
2123                return;
2124        }
2125
2126        /*
2127         * Let's check whether the chuck is good enough
2128         */
2129        if (ex->fe_len == gex->fe_len) {
2130                *bex = *ex;
2131                ext4_mb_use_best_found(ac, e4b);
2132                return;
2133        }
2134
2135        /*
2136         * If this is first found extent, just store it in the context
2137         */
2138        if (bex->fe_len == 0) {
2139                *bex = *ex;
2140                return;
2141        }
2142
2143        /*
2144         * If new found extent is better, store it in the context
2145         */
2146        if (bex->fe_len < gex->fe_len) {
2147                /* if the request isn't satisfied, any found extent
2148                 * larger than previous best one is better */
2149                if (ex->fe_len > bex->fe_len)
2150                        *bex = *ex;
2151        } else if (ex->fe_len > gex->fe_len) {
2152                /* if the request is satisfied, then we try to find
2153                 * an extent that still satisfy the request, but is
2154                 * smaller than previous one */
2155                if (ex->fe_len < bex->fe_len)
2156                        *bex = *ex;
2157        }
2158
2159        ext4_mb_check_limits(ac, e4b, 0);
2160}
2161
2162static noinline_for_stack
2163int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
2164                                        struct ext4_buddy *e4b)
2165{
2166        struct ext4_free_extent ex = ac->ac_b_ex;
2167        ext4_group_t group = ex.fe_group;
2168        int max;
2169        int err;
2170
2171        BUG_ON(ex.fe_len <= 0);
2172        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2173        if (err)
2174                return err;
2175
2176        ext4_lock_group(ac->ac_sb, group);
2177        max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
2178
2179        if (max > 0) {
2180                ac->ac_b_ex = ex;
2181                ext4_mb_use_best_found(ac, e4b);
2182        }
2183
2184        ext4_unlock_group(ac->ac_sb, group);
2185        ext4_mb_unload_buddy(e4b);
2186
2187        return 0;
2188}
2189
2190static noinline_for_stack
2191int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
2192                                struct ext4_buddy *e4b)
2193{
2194        ext4_group_t group = ac->ac_g_ex.fe_group;
2195        int max;
2196        int err;
2197        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2198        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2199        struct ext4_free_extent ex;
2200
2201        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
2202                return 0;
2203        if (grp->bb_free == 0)
2204                return 0;
2205
2206        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2207        if (err)
2208                return err;
2209
2210        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
2211                ext4_mb_unload_buddy(e4b);
2212                return 0;
2213        }
2214
2215        ext4_lock_group(ac->ac_sb, group);
2216        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
2217                             ac->ac_g_ex.fe_len, &ex);
2218        ex.fe_logical = 0xDEADFA11; /* debug value */
2219
2220        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
2221                ext4_fsblk_t start;
2222
2223                start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
2224                        ex.fe_start;
2225                /* use do_div to get remainder (would be 64-bit modulo) */
2226                if (do_div(start, sbi->s_stripe) == 0) {
2227                        ac->ac_found++;
2228                        ac->ac_b_ex = ex;
2229                        ext4_mb_use_best_found(ac, e4b);
2230                }
2231        } else if (max >= ac->ac_g_ex.fe_len) {
2232                BUG_ON(ex.fe_len <= 0);
2233                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2234                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2235                ac->ac_found++;
2236                ac->ac_b_ex = ex;
2237                ext4_mb_use_best_found(ac, e4b);
2238        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
2239                /* Sometimes, caller may want to merge even small
2240                 * number of blocks to an existing extent */
2241                BUG_ON(ex.fe_len <= 0);
2242                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2243                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2244                ac->ac_found++;
2245                ac->ac_b_ex = ex;
2246                ext4_mb_use_best_found(ac, e4b);
2247        }
2248        ext4_unlock_group(ac->ac_sb, group);
2249        ext4_mb_unload_buddy(e4b);
2250
2251        return 0;
2252}
2253
2254/*
2255 * The routine scans buddy structures (not bitmap!) from given order
2256 * to max order and tries to find big enough chunk to satisfy the req
2257 */
2258static noinline_for_stack
2259void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
2260                                        struct ext4_buddy *e4b)
2261{
2262        struct super_block *sb = ac->ac_sb;
2263        struct ext4_group_info *grp = e4b->bd_info;
2264        void *buddy;
2265        int i;
2266        int k;
2267        int max;
2268
2269        BUG_ON(ac->ac_2order <= 0);
2270        for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
2271                if (grp->bb_counters[i] == 0)
2272                        continue;
2273
2274                buddy = mb_find_buddy(e4b, i, &max);
2275                BUG_ON(buddy == NULL);
2276
2277                k = mb_find_next_zero_bit(buddy, max, 0);
2278                if (k >= max) {
2279                        ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
2280                                "%d free clusters of order %d. But found 0",
2281                                grp->bb_counters[i], i);
2282                        ext4_mark_group_bitmap_corrupted(ac->ac_sb,
2283                                         e4b->bd_group,
2284                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2285                        break;
2286                }
2287                ac->ac_found++;
2288
2289                ac->ac_b_ex.fe_len = 1 << i;
2290                ac->ac_b_ex.fe_start = k << i;
2291                ac->ac_b_ex.fe_group = e4b->bd_group;
2292
2293                ext4_mb_use_best_found(ac, e4b);
2294
2295                BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
2296
2297                if (EXT4_SB(sb)->s_mb_stats)
2298                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
2299
2300                break;
2301        }
2302}
2303
2304/*
2305 * The routine scans the group and measures all found extents.
2306 * In order to optimize scanning, caller must pass number of
2307 * free blocks in the group, so the routine can know upper limit.
2308 */
2309static noinline_for_stack
2310void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
2311                                        struct ext4_buddy *e4b)
2312{
2313        struct super_block *sb = ac->ac_sb;
2314        void *bitmap = e4b->bd_bitmap;
2315        struct ext4_free_extent ex;
2316        int i;
2317        int free;
2318
2319        free = e4b->bd_info->bb_free;
2320        if (WARN_ON(free <= 0))
2321                return;
2322
2323        i = e4b->bd_info->bb_first_free;
2324
2325        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
2326                i = mb_find_next_zero_bit(bitmap,
2327                                                EXT4_CLUSTERS_PER_GROUP(sb), i);
2328                if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
2329                        /*
2330                         * IF we have corrupt bitmap, we won't find any
2331                         * free blocks even though group info says we
2332                         * have free blocks
2333                         */
2334                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2335                                        "%d free clusters as per "
2336                                        "group info. But bitmap says 0",
2337                                        free);
2338                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2339                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2340                        break;
2341                }
2342
2343                mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
2344                if (WARN_ON(ex.fe_len <= 0))
2345                        break;
2346                if (free < ex.fe_len) {
2347                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2348                                        "%d free clusters as per "
2349                                        "group info. But got %d blocks",
2350                                        free, ex.fe_len);
2351                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2352                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2353                        /*
2354                         * The number of free blocks differs. This mostly
2355                         * indicate that the bitmap is corrupt. So exit
2356                         * without claiming the space.
2357                         */
2358                        break;
2359                }
2360                ex.fe_logical = 0xDEADC0DE; /* debug value */
2361                ext4_mb_measure_extent(ac, &ex, e4b);
2362
2363                i += ex.fe_len;
2364                free -= ex.fe_len;
2365        }
2366
2367        ext4_mb_check_limits(ac, e4b, 1);
2368}
2369
2370/*
2371 * This is a special case for storages like raid5
2372 * we try to find stripe-aligned chunks for stripe-size-multiple requests
2373 */
2374static noinline_for_stack
2375void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
2376                                 struct ext4_buddy *e4b)
2377{
2378        struct super_block *sb = ac->ac_sb;
2379        struct ext4_sb_info *sbi = EXT4_SB(sb);
2380        void *bitmap = e4b->bd_bitmap;
2381        struct ext4_free_extent ex;
2382        ext4_fsblk_t first_group_block;
2383        ext4_fsblk_t a;
2384        ext4_grpblk_t i;
2385        int max;
2386
2387        BUG_ON(sbi->s_stripe == 0);
2388
2389        /* find first stripe-aligned block in group */
2390        first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2391
2392        a = first_group_block + sbi->s_stripe - 1;
2393        do_div(a, sbi->s_stripe);
2394        i = (a * sbi->s_stripe) - first_group_block;
2395
2396        while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2397                if (!mb_test_bit(i, bitmap)) {
2398                        max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2399                        if (max >= sbi->s_stripe) {
2400                                ac->ac_found++;
2401                                ex.fe_logical = 0xDEADF00D; /* debug value */
2402                                ac->ac_b_ex = ex;
2403                                ext4_mb_use_best_found(ac, e4b);
2404                                break;
2405                        }
2406                }
2407                i += sbi->s_stripe;
2408        }
2409}
2410
2411/*
2412 * This is also called BEFORE we load the buddy bitmap.
2413 * Returns either 1 or 0 indicating that the group is either suitable
2414 * for the allocation or not.
2415 */
2416static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
2417                                ext4_group_t group, int cr)
2418{
2419        ext4_grpblk_t free, fragments;
2420        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2421        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2422
2423        BUG_ON(cr < 0 || cr >= 4);
2424
2425        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2426                return false;
2427
2428        free = grp->bb_free;
2429        if (free == 0)
2430                return false;
2431
2432        fragments = grp->bb_fragments;
2433        if (fragments == 0)
2434                return false;
2435
2436        switch (cr) {
2437        case 0:
2438                BUG_ON(ac->ac_2order == 0);
2439
2440                /* Avoid using the first bg of a flexgroup for data files */
2441                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2442                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2443                    ((group % flex_size) == 0))
2444                        return false;
2445
2446                if (free < ac->ac_g_ex.fe_len)
2447                        return false;
2448
2449                if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
2450                        return true;
2451
2452                if (grp->bb_largest_free_order < ac->ac_2order)
2453                        return false;
2454
2455                return true;
2456        case 1:
2457                if ((free / fragments) >= ac->ac_g_ex.fe_len)
2458                        return true;
2459                break;
2460        case 2:
2461                if (free >= ac->ac_g_ex.fe_len)
2462                        return true;
2463                break;
2464        case 3:
2465                return true;
2466        default:
2467                BUG();
2468        }
2469
2470        return false;
2471}
2472
2473/*
2474 * This could return negative error code if something goes wrong
2475 * during ext4_mb_init_group(). This should not be called with
2476 * ext4_lock_group() held.
2477 */
2478static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2479                                     ext4_group_t group, int cr)
2480{
2481        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2482        struct super_block *sb = ac->ac_sb;
2483        struct ext4_sb_info *sbi = EXT4_SB(sb);
2484        bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2485        ext4_grpblk_t free;
2486        int ret = 0;
2487
2488        if (sbi->s_mb_stats)
2489                atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
2490        if (should_lock)
2491                ext4_lock_group(sb, group);
2492        free = grp->bb_free;
2493        if (free == 0)
2494                goto out;
2495        if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2496                goto out;
2497        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2498                goto out;
2499        if (should_lock)
2500                ext4_unlock_group(sb, group);
2501
2502        /* We only do this if the grp has never been initialized */
2503        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2504                struct ext4_group_desc *gdp =
2505                        ext4_get_group_desc(sb, group, NULL);
2506                int ret;
2507
2508                /* cr=0/1 is a very optimistic search to find large
2509                 * good chunks almost for free.  If buddy data is not
2510                 * ready, then this optimization makes no sense.  But
2511                 * we never skip the first block group in a flex_bg,
2512                 * since this gets used for metadata block allocation,
2513                 * and we want to make sure we locate metadata blocks
2514                 * in the first block group in the flex_bg if possible.
2515                 */
2516                if (cr < 2 &&
2517                    (!sbi->s_log_groups_per_flex ||
2518                     ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2519                    !(ext4_has_group_desc_csum(sb) &&
2520                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2521                        return 0;
2522                ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2523                if (ret)
2524                        return ret;
2525        }
2526
2527        if (should_lock)
2528                ext4_lock_group(sb, group);
2529        ret = ext4_mb_good_group(ac, group, cr);
2530out:
2531        if (should_lock)
2532                ext4_unlock_group(sb, group);
2533        return ret;
2534}
2535
2536/*
2537 * Start prefetching @nr block bitmaps starting at @group.
2538 * Return the next group which needs to be prefetched.
2539 */
2540ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2541                              unsigned int nr, int *cnt)
2542{
2543        ext4_group_t ngroups = ext4_get_groups_count(sb);
2544        struct buffer_head *bh;
2545        struct blk_plug plug;
2546
2547        blk_start_plug(&plug);
2548        while (nr-- > 0) {
2549                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2550                                                                  NULL);
2551                struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2552
2553                /*
2554                 * Prefetch block groups with free blocks; but don't
2555                 * bother if it is marked uninitialized on disk, since
2556                 * it won't require I/O to read.  Also only try to
2557                 * prefetch once, so we avoid getblk() call, which can
2558                 * be expensive.
2559                 */
2560                if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2561                    EXT4_MB_GRP_NEED_INIT(grp) &&
2562                    ext4_free_group_clusters(sb, gdp) > 0 &&
2563                    !(ext4_has_group_desc_csum(sb) &&
2564                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2565                        bh = ext4_read_block_bitmap_nowait(sb, group, true);
2566                        if (bh && !IS_ERR(bh)) {
2567                                if (!buffer_uptodate(bh) && cnt)
2568                                        (*cnt)++;
2569                                brelse(bh);
2570                        }
2571                }
2572                if (++group >= ngroups)
2573                        group = 0;
2574        }
2575        blk_finish_plug(&plug);
2576        return group;
2577}
2578
2579/*
2580 * Prefetching reads the block bitmap into the buffer cache; but we
2581 * need to make sure that the buddy bitmap in the page cache has been
2582 * initialized.  Note that ext4_mb_init_group() will block if the I/O
2583 * is not yet completed, or indeed if it was not initiated by
2584 * ext4_mb_prefetch did not start the I/O.
2585 *
2586 * TODO: We should actually kick off the buddy bitmap setup in a work
2587 * queue when the buffer I/O is completed, so that we don't block
2588 * waiting for the block allocation bitmap read to finish when
2589 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2590 */
2591void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2592                           unsigned int nr)
2593{
2594        while (nr-- > 0) {
2595                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2596                                                                  NULL);
2597                struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2598
2599                if (!group)
2600                        group = ext4_get_groups_count(sb);
2601                group--;
2602                grp = ext4_get_group_info(sb, group);
2603
2604                if (EXT4_MB_GRP_NEED_INIT(grp) &&
2605                    ext4_free_group_clusters(sb, gdp) > 0 &&
2606                    !(ext4_has_group_desc_csum(sb) &&
2607                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2608                        if (ext4_mb_init_group(sb, group, GFP_NOFS))
2609                                break;
2610                }
2611        }
2612}
2613
2614static noinline_for_stack int
2615ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2616{
2617        ext4_group_t prefetch_grp = 0, ngroups, group, i;
2618        int cr = -1;
2619        int err = 0, first_err = 0;
2620        unsigned int nr = 0, prefetch_ios = 0;
2621        struct ext4_sb_info *sbi;
2622        struct super_block *sb;
2623        struct ext4_buddy e4b;
2624        int lost;
2625
2626        sb = ac->ac_sb;
2627        sbi = EXT4_SB(sb);
2628        ngroups = ext4_get_groups_count(sb);
2629        /* non-extent files are limited to low blocks/groups */
2630        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2631                ngroups = sbi->s_blockfile_groups;
2632
2633        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2634
2635        /* first, try the goal */
2636        err = ext4_mb_find_by_goal(ac, &e4b);
2637        if (err || ac->ac_status == AC_STATUS_FOUND)
2638                goto out;
2639
2640        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2641                goto out;
2642
2643        /*
2644         * ac->ac_2order is set only if the fe_len is a power of 2
2645         * if ac->ac_2order is set we also set criteria to 0 so that we
2646         * try exact allocation using buddy.
2647         */
2648        i = fls(ac->ac_g_ex.fe_len);
2649        ac->ac_2order = 0;
2650        /*
2651         * We search using buddy data only if the order of the request
2652         * is greater than equal to the sbi_s_mb_order2_reqs
2653         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2654         * We also support searching for power-of-two requests only for
2655         * requests upto maximum buddy size we have constructed.
2656         */
2657        if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
2658                /*
2659                 * This should tell if fe_len is exactly power of 2
2660                 */
2661                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2662                        ac->ac_2order = array_index_nospec(i - 1,
2663                                                           MB_NUM_ORDERS(sb));
2664        }
2665
2666        /* if stream allocation is enabled, use global goal */
2667        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2668                /* TBD: may be hot point */
2669                spin_lock(&sbi->s_md_lock);
2670                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2671                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2672                spin_unlock(&sbi->s_md_lock);
2673        }
2674
2675        /* Let's just scan groups to find more-less suitable blocks */
2676        cr = ac->ac_2order ? 0 : 1;
2677        /*
2678         * cr == 0 try to get exact allocation,
2679         * cr == 3  try to get anything
2680         */
2681repeat:
2682        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2683                ac->ac_criteria = cr;
2684                /*
2685                 * searching for the right group start
2686                 * from the goal value specified
2687                 */
2688                group = ac->ac_g_ex.fe_group;
2689                ac->ac_last_optimal_group = group;
2690                ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
2691                prefetch_grp = group;
2692
2693                for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
2694                             i++) {
2695                        int ret = 0, new_cr;
2696
2697                        cond_resched();
2698
2699                        ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
2700                        if (new_cr != cr) {
2701                                cr = new_cr;
2702                                goto repeat;
2703                        }
2704
2705                        /*
2706                         * Batch reads of the block allocation bitmaps
2707                         * to get multiple READs in flight; limit
2708                         * prefetching at cr=0/1, otherwise mballoc can
2709                         * spend a lot of time loading imperfect groups
2710                         */
2711                        if ((prefetch_grp == group) &&
2712                            (cr > 1 ||
2713                             prefetch_ios < sbi->s_mb_prefetch_limit)) {
2714                                unsigned int curr_ios = prefetch_ios;
2715
2716                                nr = sbi->s_mb_prefetch;
2717                                if (ext4_has_feature_flex_bg(sb)) {
2718                                        nr = 1 << sbi->s_log_groups_per_flex;
2719                                        nr -= group & (nr - 1);
2720                                        nr = min(nr, sbi->s_mb_prefetch);
2721                                }
2722                                prefetch_grp = ext4_mb_prefetch(sb, group,
2723                                                        nr, &prefetch_ios);
2724                                if (prefetch_ios == curr_ios)
2725                                        nr = 0;
2726                        }
2727
2728                        /* This now checks without needing the buddy page */
2729                        ret = ext4_mb_good_group_nolock(ac, group, cr);
2730                        if (ret <= 0) {
2731                                if (!first_err)
2732                                        first_err = ret;
2733                                continue;
2734                        }
2735
2736                        err = ext4_mb_load_buddy(sb, group, &e4b);
2737                        if (err)
2738                                goto out;
2739
2740                        ext4_lock_group(sb, group);
2741
2742                        /*
2743                         * We need to check again after locking the
2744                         * block group
2745                         */
2746                        ret = ext4_mb_good_group(ac, group, cr);
2747                        if (ret == 0) {
2748                                ext4_unlock_group(sb, group);
2749                                ext4_mb_unload_buddy(&e4b);
2750                                continue;
2751                        }
2752
2753                        ac->ac_groups_scanned++;
2754                        if (cr == 0)
2755                                ext4_mb_simple_scan_group(ac, &e4b);
2756                        else if (cr == 1 && sbi->s_stripe &&
2757                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2758                                ext4_mb_scan_aligned(ac, &e4b);
2759                        else
2760                                ext4_mb_complex_scan_group(ac, &e4b);
2761
2762                        ext4_unlock_group(sb, group);
2763                        ext4_mb_unload_buddy(&e4b);
2764
2765                        if (ac->ac_status != AC_STATUS_CONTINUE)
2766                                break;
2767                }
2768                /* Processed all groups and haven't found blocks */
2769                if (sbi->s_mb_stats && i == ngroups)
2770                        atomic64_inc(&sbi->s_bal_cX_failed[cr]);
2771        }
2772
2773        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2774            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2775                /*
2776                 * We've been searching too long. Let's try to allocate
2777                 * the best chunk we've found so far
2778                 */
2779                ext4_mb_try_best_found(ac, &e4b);
2780                if (ac->ac_status != AC_STATUS_FOUND) {
2781                        /*
2782                         * Someone more lucky has already allocated it.
2783                         * The only thing we can do is just take first
2784                         * found block(s)
2785                         */
2786                        lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2787                        mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2788                                 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2789                                 ac->ac_b_ex.fe_len, lost);
2790
2791                        ac->ac_b_ex.fe_group = 0;
2792                        ac->ac_b_ex.fe_start = 0;
2793                        ac->ac_b_ex.fe_len = 0;
2794                        ac->ac_status = AC_STATUS_CONTINUE;
2795                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
2796                        cr = 3;
2797                        goto repeat;
2798                }
2799        }
2800
2801        if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2802                atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
2803out:
2804        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2805                err = first_err;
2806
2807        mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2808                 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2809                 ac->ac_flags, cr, err);
2810
2811        if (nr)
2812                ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2813
2814        return err;
2815}
2816
2817static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2818{
2819        struct super_block *sb = PDE_DATA(file_inode(seq->file));
2820        ext4_group_t group;
2821
2822        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2823                return NULL;
2824        group = *pos + 1;
2825        return (void *) ((unsigned long) group);
2826}
2827
2828static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2829{
2830        struct super_block *sb = PDE_DATA(file_inode(seq->file));
2831        ext4_group_t group;
2832
2833        ++*pos;
2834        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2835                return NULL;
2836        group = *pos + 1;
2837        return (void *) ((unsigned long) group);
2838}
2839
2840static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2841{
2842        struct super_block *sb = PDE_DATA(file_inode(seq->file));
2843        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2844        int i;
2845        int err, buddy_loaded = 0;
2846        struct ext4_buddy e4b;
2847        struct ext4_group_info *grinfo;
2848        unsigned char blocksize_bits = min_t(unsigned char,
2849                                             sb->s_blocksize_bits,
2850                                             EXT4_MAX_BLOCK_LOG_SIZE);
2851        struct sg {
2852                struct ext4_group_info info;
2853                ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2854        } sg;
2855
2856        group--;
2857        if (group == 0)
2858                seq_puts(seq, "#group: free  frags first ["
2859                              " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
2860                              " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
2861
2862        i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2863                sizeof(struct ext4_group_info);
2864
2865        grinfo = ext4_get_group_info(sb, group);
2866        /* Load the group info in memory only if not already loaded. */
2867        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2868                err = ext4_mb_load_buddy(sb, group, &e4b);
2869                if (err) {
2870                        seq_printf(seq, "#%-5u: I/O error\n", group);
2871                        return 0;
2872                }
2873                buddy_loaded = 1;
2874        }
2875
2876        memcpy(&sg, ext4_get_group_info(sb, group), i);
2877
2878        if (buddy_loaded)
2879                ext4_mb_unload_buddy(&e4b);
2880
2881        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2882                        sg.info.bb_fragments, sg.info.bb_first_free);
2883        for (i = 0; i <= 13; i++)
2884                seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
2885                                sg.info.bb_counters[i] : 0);
2886        seq_puts(seq, " ]\n");
2887
2888        return 0;
2889}
2890
2891static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2892{
2893}
2894
2895const struct seq_operations ext4_mb_seq_groups_ops = {
2896        .start  = ext4_mb_seq_groups_start,
2897        .next   = ext4_mb_seq_groups_next,
2898        .stop   = ext4_mb_seq_groups_stop,
2899        .show   = ext4_mb_seq_groups_show,
2900};
2901
2902int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
2903{
2904        struct super_block *sb = (struct super_block *)seq->private;
2905        struct ext4_sb_info *sbi = EXT4_SB(sb);
2906
2907        seq_puts(seq, "mballoc:\n");
2908        if (!sbi->s_mb_stats) {
2909                seq_puts(seq, "\tmb stats collection turned off.\n");
2910                seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
2911                return 0;
2912        }
2913        seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
2914        seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
2915
2916        seq_printf(seq, "\tgroups_scanned: %u\n",  atomic_read(&sbi->s_bal_groups_scanned));
2917
2918        seq_puts(seq, "\tcr0_stats:\n");
2919        seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
2920        seq_printf(seq, "\t\tgroups_considered: %llu\n",
2921                   atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
2922        seq_printf(seq, "\t\tuseless_loops: %llu\n",
2923                   atomic64_read(&sbi->s_bal_cX_failed[0]));
2924        seq_printf(seq, "\t\tbad_suggestions: %u\n",
2925                   atomic_read(&sbi->s_bal_cr0_bad_suggestions));
2926
2927        seq_puts(seq, "\tcr1_stats:\n");
2928        seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
2929        seq_printf(seq, "\t\tgroups_considered: %llu\n",
2930                   atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
2931        seq_printf(seq, "\t\tuseless_loops: %llu\n",
2932                   atomic64_read(&sbi->s_bal_cX_failed[1]));
2933        seq_printf(seq, "\t\tbad_suggestions: %u\n",
2934                   atomic_read(&sbi->s_bal_cr1_bad_suggestions));
2935
2936        seq_puts(seq, "\tcr2_stats:\n");
2937        seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
2938        seq_printf(seq, "\t\tgroups_considered: %llu\n",
2939                   atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
2940        seq_printf(seq, "\t\tuseless_loops: %llu\n",
2941                   atomic64_read(&sbi->s_bal_cX_failed[2]));
2942
2943        seq_puts(seq, "\tcr3_stats:\n");
2944        seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
2945        seq_printf(seq, "\t\tgroups_considered: %llu\n",
2946                   atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
2947        seq_printf(seq, "\t\tuseless_loops: %llu\n",
2948                   atomic64_read(&sbi->s_bal_cX_failed[3]));
2949        seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
2950        seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
2951        seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
2952        seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
2953        seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
2954
2955        seq_printf(seq, "\tbuddies_generated: %u/%u\n",
2956                   atomic_read(&sbi->s_mb_buddies_generated),
2957                   ext4_get_groups_count(sb));
2958        seq_printf(seq, "\tbuddies_time_used: %llu\n",
2959                   atomic64_read(&sbi->s_mb_generation_time));
2960        seq_printf(seq, "\tpreallocated: %u\n",
2961                   atomic_read(&sbi->s_mb_preallocated));
2962        seq_printf(seq, "\tdiscarded: %u\n",
2963                   atomic_read(&sbi->s_mb_discarded));
2964        return 0;
2965}
2966
2967static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
2968{
2969        struct super_block *sb = PDE_DATA(file_inode(seq->file));
2970        unsigned long position;
2971
2972        read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
2973
2974        if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
2975                return NULL;
2976        position = *pos + 1;
2977        return (void *) ((unsigned long) position);
2978}
2979
2980static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
2981{
2982        struct super_block *sb = PDE_DATA(file_inode(seq->file));
2983        unsigned long position;
2984
2985        ++*pos;
2986        if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
2987                return NULL;
2988        position = *pos + 1;
2989        return (void *) ((unsigned long) position);
2990}
2991
2992static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
2993{
2994        struct super_block *sb = PDE_DATA(file_inode(seq->file));
2995        struct ext4_sb_info *sbi = EXT4_SB(sb);
2996        unsigned long position = ((unsigned long) v);
2997        struct ext4_group_info *grp;
2998        struct rb_node *n;
2999        unsigned int count, min, max;
3000
3001        position--;
3002        if (position >= MB_NUM_ORDERS(sb)) {
3003                seq_puts(seq, "fragment_size_tree:\n");
3004                n = rb_first(&sbi->s_mb_avg_fragment_size_root);
3005                if (!n) {
3006                        seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
3007                        return 0;
3008                }
3009                grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
3010                min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
3011                count = 1;
3012                while (rb_next(n)) {
3013                        count++;
3014                        n = rb_next(n);
3015                }
3016                grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
3017                max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
3018
3019                seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
3020                           min, max, count);
3021                return 0;
3022        }
3023
3024        if (position == 0) {
3025                seq_printf(seq, "optimize_scan: %d\n",
3026                           test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
3027                seq_puts(seq, "max_free_order_lists:\n");
3028        }
3029        count = 0;
3030        list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
3031                            bb_largest_free_order_node)
3032                count++;
3033        seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3034                   (unsigned int)position, count);
3035
3036        return 0;
3037}
3038
3039static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
3040{
3041        struct super_block *sb = PDE_DATA(file_inode(seq->file));
3042
3043        read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
3044}
3045
3046const struct seq_operations ext4_mb_seq_structs_summary_ops = {
3047        .start  = ext4_mb_seq_structs_summary_start,
3048        .next   = ext4_mb_seq_structs_summary_next,
3049        .stop   = ext4_mb_seq_structs_summary_stop,
3050        .show   = ext4_mb_seq_structs_summary_show,
3051};
3052
3053static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
3054{
3055        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3056        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
3057
3058        BUG_ON(!cachep);
3059        return cachep;
3060}
3061
3062/*
3063 * Allocate the top-level s_group_info array for the specified number
3064 * of groups
3065 */
3066int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
3067{
3068        struct ext4_sb_info *sbi = EXT4_SB(sb);
3069        unsigned size;
3070        struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
3071
3072        size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
3073                EXT4_DESC_PER_BLOCK_BITS(sb);
3074        if (size <= sbi->s_group_info_size)
3075                return 0;
3076
3077        size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
3078        new_groupinfo = kvzalloc(size, GFP_KERNEL);
3079        if (!new_groupinfo) {
3080                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
3081                return -ENOMEM;
3082        }
3083        rcu_read_lock();
3084        old_groupinfo = rcu_dereference(sbi->s_group_info);
3085        if (old_groupinfo)
3086                memcpy(new_groupinfo, old_groupinfo,
3087                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
3088        rcu_read_unlock();
3089        rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
3090        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
3091        if (old_groupinfo)
3092                ext4_kvfree_array_rcu(old_groupinfo);
3093        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
3094                   sbi->s_group_info_size);
3095        return 0;
3096}
3097
3098/* Create and initialize ext4_group_info data for the given group. */
3099int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
3100                          struct ext4_group_desc *desc)
3101{
3102        int i;
3103        int metalen = 0;
3104        int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
3105        struct ext4_sb_info *sbi = EXT4_SB(sb);
3106        struct ext4_group_info **meta_group_info;
3107        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3108
3109        /*
3110         * First check if this group is the first of a reserved block.
3111         * If it's true, we have to allocate a new table of pointers
3112         * to ext4_group_info structures
3113         */
3114        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3115                metalen = sizeof(*meta_group_info) <<
3116                        EXT4_DESC_PER_BLOCK_BITS(sb);
3117                meta_group_info = kmalloc(metalen, GFP_NOFS);
3118                if (meta_group_info == NULL) {
3119                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
3120                                 "for a buddy group");
3121                        goto exit_meta_group_info;
3122                }
3123                rcu_read_lock();
3124                rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
3125                rcu_read_unlock();
3126        }
3127
3128        meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
3129        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
3130
3131        meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
3132        if (meta_group_info[i] == NULL) {
3133                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
3134                goto exit_group_info;
3135        }
3136        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
3137                &(meta_group_info[i]->bb_state));
3138
3139        /*
3140         * initialize bb_free to be able to skip
3141         * empty groups without initialization
3142         */
3143        if (ext4_has_group_desc_csum(sb) &&
3144            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3145                meta_group_info[i]->bb_free =
3146                        ext4_free_clusters_after_init(sb, group, desc);
3147        } else {
3148                meta_group_info[i]->bb_free =
3149                        ext4_free_group_clusters(sb, desc);
3150        }
3151
3152        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
3153        init_rwsem(&meta_group_info[i]->alloc_sem);
3154        meta_group_info[i]->bb_free_root = RB_ROOT;
3155        INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
3156        RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
3157        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
3158        meta_group_info[i]->bb_group = group;
3159
3160        mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
3161        return 0;
3162
3163exit_group_info:
3164        /* If a meta_group_info table has been allocated, release it now */
3165        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3166                struct ext4_group_info ***group_info;
3167
3168                rcu_read_lock();
3169                group_info = rcu_dereference(sbi->s_group_info);
3170                kfree(group_info[idx]);
3171                group_info[idx] = NULL;
3172                rcu_read_unlock();
3173        }
3174exit_meta_group_info:
3175        return -ENOMEM;
3176} /* ext4_mb_add_groupinfo */
3177
3178static int ext4_mb_init_backend(struct super_block *sb)
3179{
3180        ext4_group_t ngroups = ext4_get_groups_count(sb);
3181        ext4_group_t i;
3182        struct ext4_sb_info *sbi = EXT4_SB(sb);
3183        int err;
3184        struct ext4_group_desc *desc;
3185        struct ext4_group_info ***group_info;
3186        struct kmem_cache *cachep;
3187
3188        err = ext4_mb_alloc_groupinfo(sb, ngroups);
3189        if (err)
3190                return err;
3191
3192        sbi->s_buddy_cache = new_inode(sb);
3193        if (sbi->s_buddy_cache == NULL) {
3194                ext4_msg(sb, KERN_ERR, "can't get new inode");
3195                goto err_freesgi;
3196        }
3197        /* To avoid potentially colliding with an valid on-disk inode number,
3198         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
3199         * not in the inode hash, so it should never be found by iget(), but
3200         * this will avoid confusion if it ever shows up during debugging. */
3201        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
3202        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
3203        for (i = 0; i < ngroups; i++) {
3204                cond_resched();
3205                desc = ext4_get_group_desc(sb, i, NULL);
3206                if (desc == NULL) {
3207                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
3208                        goto err_freebuddy;
3209                }
3210                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
3211                        goto err_freebuddy;
3212        }
3213
3214        if (ext4_has_feature_flex_bg(sb)) {
3215                /* a single flex group is supposed to be read by a single IO.
3216                 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
3217                 * unsigned integer, so the maximum shift is 32.
3218                 */
3219                if (sbi->s_es->s_log_groups_per_flex >= 32) {
3220                        ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
3221                        goto err_freebuddy;
3222                }
3223                sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
3224                        BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
3225                sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
3226        } else {
3227                sbi->s_mb_prefetch = 32;
3228        }
3229        if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
3230                sbi->s_mb_prefetch = ext4_get_groups_count(sb);
3231        /* now many real IOs to prefetch within a single allocation at cr=0
3232         * given cr=0 is an CPU-related optimization we shouldn't try to
3233         * load too many groups, at some point we should start to use what
3234         * we've got in memory.
3235         * with an average random access time 5ms, it'd take a second to get
3236         * 200 groups (* N with flex_bg), so let's make this limit 4
3237         */
3238        sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
3239        if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
3240                sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
3241
3242        return 0;
3243
3244err_freebuddy:
3245        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3246        while (i-- > 0)
3247                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
3248        i = sbi->s_group_info_size;
3249        rcu_read_lock();
3250        group_info = rcu_dereference(sbi->s_group_info);
3251        while (i-- > 0)
3252                kfree(group_info[i]);
3253        rcu_read_unlock();
3254        iput(sbi->s_buddy_cache);
3255err_freesgi:
3256        rcu_read_lock();
3257        kvfree(rcu_dereference(sbi->s_group_info));
3258        rcu_read_unlock();
3259        return -ENOMEM;
3260}
3261
3262static void ext4_groupinfo_destroy_slabs(void)
3263{
3264        int i;
3265
3266        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
3267                kmem_cache_destroy(ext4_groupinfo_caches[i]);
3268                ext4_groupinfo_caches[i] = NULL;
3269        }
3270}
3271
3272static int ext4_groupinfo_create_slab(size_t size)
3273{
3274        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
3275        int slab_size;
3276        int blocksize_bits = order_base_2(size);
3277        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3278        struct kmem_cache *cachep;
3279
3280        if (cache_index >= NR_GRPINFO_CACHES)
3281                return -EINVAL;
3282
3283        if (unlikely(cache_index < 0))
3284                cache_index = 0;
3285
3286        mutex_lock(&ext4_grpinfo_slab_create_mutex);
3287        if (ext4_groupinfo_caches[cache_index]) {
3288                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3289                return 0;       /* Already created */
3290        }
3291
3292        slab_size = offsetof(struct ext4_group_info,
3293                                bb_counters[blocksize_bits + 2]);
3294
3295        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
3296                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
3297                                        NULL);
3298
3299        ext4_groupinfo_caches[cache_index] = cachep;
3300
3301        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3302        if (!cachep) {
3303                printk(KERN_EMERG
3304                       "EXT4-fs: no memory for groupinfo slab cache\n");
3305                return -ENOMEM;
3306        }
3307
3308        return 0;
3309}
3310
3311int ext4_mb_init(struct super_block *sb)
3312{
3313        struct ext4_sb_info *sbi = EXT4_SB(sb);
3314        unsigned i, j;
3315        unsigned offset, offset_incr;
3316        unsigned max;
3317        int ret;
3318
3319        i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
3320
3321        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
3322        if (sbi->s_mb_offsets == NULL) {
3323                ret = -ENOMEM;
3324                goto out;
3325        }
3326
3327        i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
3328        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
3329        if (sbi->s_mb_maxs == NULL) {
3330                ret = -ENOMEM;
3331                goto out;
3332        }
3333
3334        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
3335        if (ret < 0)
3336                goto out;
3337
3338        /* order 0 is regular bitmap */
3339        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
3340        sbi->s_mb_offsets[0] = 0;
3341
3342        i = 1;
3343        offset = 0;
3344        offset_incr = 1 << (sb->s_blocksize_bits - 1);
3345        max = sb->s_blocksize << 2;
3346        do {
3347                sbi->s_mb_offsets[i] = offset;
3348                sbi->s_mb_maxs[i] = max;
3349                offset += offset_incr;
3350                offset_incr = offset_incr >> 1;
3351                max = max >> 1;
3352                i++;
3353        } while (i < MB_NUM_ORDERS(sb));
3354
3355        sbi->s_mb_avg_fragment_size_root = RB_ROOT;
3356        sbi->s_mb_largest_free_orders =
3357                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3358                        GFP_KERNEL);
3359        if (!sbi->s_mb_largest_free_orders) {
3360                ret = -ENOMEM;
3361                goto out;
3362        }
3363        sbi->s_mb_largest_free_orders_locks =
3364                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3365                        GFP_KERNEL);
3366        if (!sbi->s_mb_largest_free_orders_locks) {
3367                ret = -ENOMEM;
3368                goto out;
3369        }
3370        for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3371                INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
3372                rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
3373        }
3374        rwlock_init(&sbi->s_mb_rb_lock);
3375
3376        spin_lock_init(&sbi->s_md_lock);
3377        sbi->s_mb_free_pending = 0;
3378        INIT_LIST_HEAD(&sbi->s_freed_data_list);
3379
3380        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
3381        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
3382        sbi->s_mb_stats = MB_DEFAULT_STATS;
3383        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
3384        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
3385        sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
3386        /*
3387         * The default group preallocation is 512, which for 4k block
3388         * sizes translates to 2 megabytes.  However for bigalloc file
3389         * systems, this is probably too big (i.e, if the cluster size
3390         * is 1 megabyte, then group preallocation size becomes half a
3391         * gigabyte!).  As a default, we will keep a two megabyte
3392         * group pralloc size for cluster sizes up to 64k, and after
3393         * that, we will force a minimum group preallocation size of
3394         * 32 clusters.  This translates to 8 megs when the cluster
3395         * size is 256k, and 32 megs when the cluster size is 1 meg,
3396         * which seems reasonable as a default.
3397         */
3398        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
3399                                       sbi->s_cluster_bits, 32);
3400        /*
3401         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
3402         * to the lowest multiple of s_stripe which is bigger than
3403         * the s_mb_group_prealloc as determined above. We want
3404         * the preallocation size to be an exact multiple of the
3405         * RAID stripe size so that preallocations don't fragment
3406         * the stripes.
3407         */
3408        if (sbi->s_stripe > 1) {
3409                sbi->s_mb_group_prealloc = roundup(
3410                        sbi->s_mb_group_prealloc, sbi->s_stripe);
3411        }
3412
3413        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
3414        if (sbi->s_locality_groups == NULL) {
3415                ret = -ENOMEM;
3416                goto out;
3417        }
3418        for_each_possible_cpu(i) {
3419                struct ext4_locality_group *lg;
3420                lg = per_cpu_ptr(sbi->s_locality_groups, i);
3421                mutex_init(&lg->lg_mutex);
3422                for (j = 0; j < PREALLOC_TB_SIZE; j++)
3423                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
3424                spin_lock_init(&lg->lg_prealloc_lock);
3425        }
3426
3427        if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
3428                sbi->s_mb_max_linear_groups = 0;
3429        else
3430                sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
3431        /* init file for buddy data */
3432        ret = ext4_mb_init_backend(sb);
3433        if (ret != 0)
3434                goto out_free_locality_groups;
3435
3436        return 0;
3437
3438out_free_locality_groups:
3439        free_percpu(sbi->s_locality_groups);
3440        sbi->s_locality_groups = NULL;
3441out:
3442        kfree(sbi->s_mb_largest_free_orders);
3443        kfree(sbi->s_mb_largest_free_orders_locks);
3444        kfree(sbi->s_mb_offsets);
3445        sbi->s_mb_offsets = NULL;
3446        kfree(sbi->s_mb_maxs);
3447        sbi->s_mb_maxs = NULL;
3448        return ret;
3449}
3450
3451/* need to called with the ext4 group lock held */
3452static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
3453{
3454        struct ext4_prealloc_space *pa;
3455        struct list_head *cur, *tmp;
3456        int count = 0;
3457
3458        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
3459                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3460                list_del(&pa->pa_group_list);
3461                count++;
3462                kmem_cache_free(ext4_pspace_cachep, pa);
3463        }
3464        return count;
3465}
3466
3467int ext4_mb_release(struct super_block *sb)
3468{
3469        ext4_group_t ngroups = ext4_get_groups_count(sb);
3470        ext4_group_t i;
3471        int num_meta_group_infos;
3472        struct ext4_group_info *grinfo, ***group_info;
3473        struct ext4_sb_info *sbi = EXT4_SB(sb);
3474        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3475        int count;
3476
3477        if (sbi->s_group_info) {
3478                for (i = 0; i < ngroups; i++) {
3479                        cond_resched();
3480                        grinfo = ext4_get_group_info(sb, i);
3481                        mb_group_bb_bitmap_free(grinfo);
3482                        ext4_lock_group(sb, i);
3483                        count = ext4_mb_cleanup_pa(grinfo);
3484                        if (count)
3485                                mb_debug(sb, "mballoc: %d PAs left\n",
3486                                         count);
3487                        ext4_unlock_group(sb, i);
3488                        kmem_cache_free(cachep, grinfo);
3489                }
3490                num_meta_group_infos = (ngroups +
3491                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
3492                        EXT4_DESC_PER_BLOCK_BITS(sb);
3493                rcu_read_lock();
3494                group_info = rcu_dereference(sbi->s_group_info);
3495                for (i = 0; i < num_meta_group_infos; i++)
3496                        kfree(group_info[i]);
3497                kvfree(group_info);
3498                rcu_read_unlock();
3499        }
3500        kfree(sbi->s_mb_largest_free_orders);
3501        kfree(sbi->s_mb_largest_free_orders_locks);
3502        kfree(sbi->s_mb_offsets);
3503        kfree(sbi->s_mb_maxs);
3504        iput(sbi->s_buddy_cache);
3505        if (sbi->s_mb_stats) {
3506                ext4_msg(sb, KERN_INFO,
3507                       "mballoc: %u blocks %u reqs (%u success)",
3508                                atomic_read(&sbi->s_bal_allocated),
3509                                atomic_read(&sbi->s_bal_reqs),
3510                                atomic_read(&sbi->s_bal_success));
3511                ext4_msg(sb, KERN_INFO,
3512                      "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
3513                                "%u 2^N hits, %u breaks, %u lost",
3514                                atomic_read(&sbi->s_bal_ex_scanned),
3515                                atomic_read(&sbi->s_bal_groups_scanned),
3516                                atomic_read(&sbi->s_bal_goals),
3517                                atomic_read(&sbi->s_bal_2orders),
3518                                atomic_read(&sbi->s_bal_breaks),
3519                                atomic_read(&sbi->s_mb_lost_chunks));
3520                ext4_msg(sb, KERN_INFO,
3521                       "mballoc: %u generated and it took %llu",
3522                                atomic_read(&sbi->s_mb_buddies_generated),
3523                                atomic64_read(&sbi->s_mb_generation_time));
3524                ext4_msg(sb, KERN_INFO,
3525                       "mballoc: %u preallocated, %u discarded",
3526                                atomic_read(&sbi->s_mb_preallocated),
3527                                atomic_read(&sbi->s_mb_discarded));
3528        }
3529
3530        free_percpu(sbi->s_locality_groups);
3531
3532        return 0;
3533}
3534
3535static inline int ext4_issue_discard(struct super_block *sb,
3536                ext4_group_t block_group, ext4_grpblk_t cluster, int count,
3537                struct bio **biop)
3538{
3539        ext4_fsblk_t discard_block;
3540
3541        discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
3542                         ext4_group_first_block_no(sb, block_group));
3543        count = EXT4_C2B(EXT4_SB(sb), count);
3544        trace_ext4_discard_blocks(sb,
3545                        (unsigned long long) discard_block, count);
3546        if (biop) {
3547                return __blkdev_issue_discard(sb->s_bdev,
3548                        (sector_t)discard_block << (sb->s_blocksize_bits - 9),
3549                        (sector_t)count << (sb->s_blocksize_bits - 9),
3550                        GFP_NOFS, 0, biop);
3551        } else
3552                return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
3553}
3554
3555static void ext4_free_data_in_buddy(struct super_block *sb,
3556                                    struct ext4_free_data *entry)
3557{
3558        struct ext4_buddy e4b;
3559        struct ext4_group_info *db;
3560        int err, count = 0, count2 = 0;
3561
3562        mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
3563                 entry->efd_count, entry->efd_group, entry);
3564
3565        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
3566        /* we expect to find existing buddy because it's pinned */
3567        BUG_ON(err != 0);
3568
3569        spin_lock(&EXT4_SB(sb)->s_md_lock);
3570        EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
3571        spin_unlock(&EXT4_SB(sb)->s_md_lock);
3572
3573        db = e4b.bd_info;
3574        /* there are blocks to put in buddy to make them really free */
3575        count += entry->efd_count;
3576        count2++;
3577        ext4_lock_group(sb, entry->efd_group);
3578        /* Take it out of per group rb tree */
3579        rb_erase(&entry->efd_node, &(db->bb_free_root));
3580        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
3581
3582        /*
3583         * Clear the trimmed flag for the group so that the next
3584         * ext4_trim_fs can trim it.
3585         * If the volume is mounted with -o discard, online discard
3586         * is supported and the free blocks will be trimmed online.
3587         */
3588        if (!test_opt(sb, DISCARD))
3589                EXT4_MB_GRP_CLEAR_TRIMMED(db);
3590
3591        if (!db->bb_free_root.rb_node) {
3592                /* No more items in the per group rb tree
3593                 * balance refcounts from ext4_mb_free_metadata()
3594                 */
3595                put_page(e4b.bd_buddy_page);
3596                put_page(e4b.bd_bitmap_page);
3597        }
3598        ext4_unlock_group(sb, entry->efd_group);
3599        kmem_cache_free(ext4_free_data_cachep, entry);
3600        ext4_mb_unload_buddy(&e4b);
3601
3602        mb_debug(sb, "freed %d blocks in %d structures\n", count,
3603                 count2);
3604}
3605
3606/*
3607 * This function is called by the jbd2 layer once the commit has finished,
3608 * so we know we can free the blocks that were released with that commit.
3609 */
3610void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
3611{
3612        struct ext4_sb_info *sbi = EXT4_SB(sb);
3613        struct ext4_free_data *entry, *tmp;
3614        struct bio *discard_bio = NULL;
3615        struct list_head freed_data_list;
3616        struct list_head *cut_pos = NULL;
3617        int err;
3618
3619        INIT_LIST_HEAD(&freed_data_list);
3620
3621        spin_lock(&sbi->s_md_lock);
3622        list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
3623                if (entry->efd_tid != commit_tid)
3624                        break;
3625                cut_pos = &entry->efd_list;
3626        }
3627        if (cut_pos)
3628                list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
3629                                  cut_pos);
3630        spin_unlock(&sbi->s_md_lock);
3631
3632        if (test_opt(sb, DISCARD)) {
3633                list_for_each_entry(entry, &freed_data_list, efd_list) {
3634                        err = ext4_issue_discard(sb, entry->efd_group,
3635                                                 entry->efd_start_cluster,
3636                                                 entry->efd_count,
3637                                                 &discard_bio);
3638                        if (err && err != -EOPNOTSUPP) {
3639                                ext4_msg(sb, KERN_WARNING, "discard request in"
3640                                         " group:%d block:%d count:%d failed"
3641                                         " with %d", entry->efd_group,
3642                                         entry->efd_start_cluster,
3643                                         entry->efd_count, err);
3644                        } else if (err == -EOPNOTSUPP)
3645                                break;
3646                }
3647
3648                if (discard_bio) {
3649                        submit_bio_wait(discard_bio);
3650                        bio_put(discard_bio);
3651                }
3652        }
3653
3654        list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
3655                ext4_free_data_in_buddy(sb, entry);
3656}
3657
3658int __init ext4_init_mballoc(void)
3659{
3660        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
3661                                        SLAB_RECLAIM_ACCOUNT);
3662        if (ext4_pspace_cachep == NULL)
3663                goto out;
3664
3665        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
3666                                    SLAB_RECLAIM_ACCOUNT);
3667        if (ext4_ac_cachep == NULL)
3668                goto out_pa_free;
3669
3670        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
3671                                           SLAB_RECLAIM_ACCOUNT);
3672        if (ext4_free_data_cachep == NULL)
3673                goto out_ac_free;
3674
3675        return 0;
3676
3677out_ac_free:
3678        kmem_cache_destroy(ext4_ac_cachep);
3679out_pa_free:
3680        kmem_cache_destroy(ext4_pspace_cachep);
3681out:
3682        return -ENOMEM;
3683}
3684
3685void ext4_exit_mballoc(void)
3686{
3687        /*
3688         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
3689         * before destroying the slab cache.
3690         */
3691        rcu_barrier();
3692        kmem_cache_destroy(ext4_pspace_cachep);
3693        kmem_cache_destroy(ext4_ac_cachep);
3694        kmem_cache_destroy(ext4_free_data_cachep);
3695        ext4_groupinfo_destroy_slabs();
3696}
3697
3698
3699/*
3700 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
3701 * Returns 0 if success or error code
3702 */
3703static noinline_for_stack int
3704ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3705                                handle_t *handle, unsigned int reserv_clstrs)
3706{
3707        struct buffer_head *bitmap_bh = NULL;
3708        struct ext4_group_desc *gdp;
3709        struct buffer_head *gdp_bh;
3710        struct ext4_sb_info *sbi;
3711        struct super_block *sb;
3712        ext4_fsblk_t block;
3713        int err, len;
3714
3715        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3716        BUG_ON(ac->ac_b_ex.fe_len <= 0);
3717
3718        sb = ac->ac_sb;
3719        sbi = EXT4_SB(sb);
3720
3721        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3722        if (IS_ERR(bitmap_bh)) {
3723                err = PTR_ERR(bitmap_bh);
3724                bitmap_bh = NULL;
3725                goto out_err;
3726        }
3727
3728        BUFFER_TRACE(bitmap_bh, "getting write access");
3729        err = ext4_journal_get_write_access(handle, bitmap_bh);
3730        if (err)
3731                goto out_err;
3732
3733        err = -EIO;
3734        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3735        if (!gdp)
3736                goto out_err;
3737
3738        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
3739                        ext4_free_group_clusters(sb, gdp));
3740
3741        BUFFER_TRACE(gdp_bh, "get_write_access");
3742        err = ext4_journal_get_write_access(handle, gdp_bh);
3743        if (err)
3744                goto out_err;
3745
3746        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3747
3748        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3749        if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
3750                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
3751                           "fs metadata", block, block+len);
3752                /* File system mounted not to panic on error
3753                 * Fix the bitmap and return EFSCORRUPTED
3754                 * We leak some of the blocks here.
3755                 */
3756                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3757                ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3758                              ac->ac_b_ex.fe_len);
3759                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3760                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3761                if (!err)
3762                        err = -EFSCORRUPTED;
3763                goto out_err;
3764        }
3765
3766        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3767#ifdef AGGRESSIVE_CHECK
3768        {
3769                int i;
3770                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3771                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3772                                                bitmap_bh->b_data));
3773                }
3774        }
3775#endif
3776        ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3777                      ac->ac_b_ex.fe_len);
3778        if (ext4_has_group_desc_csum(sb) &&
3779            (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3780                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3781                ext4_free_group_clusters_set(sb, gdp,
3782                                             ext4_free_clusters_after_init(sb,
3783                                                ac->ac_b_ex.fe_group, gdp));
3784        }
3785        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3786        ext4_free_group_clusters_set(sb, gdp, len);
3787        ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
3788        ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
3789
3790        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3791        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
3792        /*
3793         * Now reduce the dirty block count also. Should not go negative
3794         */
3795        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3796                /* release all the reserved blocks if non delalloc */
3797                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3798                                   reserv_clstrs);
3799
3800        if (sbi->s_log_groups_per_flex) {
3801                ext4_group_t flex_group = ext4_flex_group(sbi,
3802                                                          ac->ac_b_ex.fe_group);
3803                atomic64_sub(ac->ac_b_ex.fe_len,
3804                             &sbi_array_rcu_deref(sbi, s_flex_groups,
3805                                                  flex_group)->free_clusters);
3806        }
3807
3808        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3809        if (err)
3810                goto out_err;
3811        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3812
3813out_err:
3814        brelse(bitmap_bh);
3815        return err;
3816}
3817
3818/*
3819 * Idempotent helper for Ext4 fast commit replay path to set the state of
3820 * blocks in bitmaps and update counters.
3821 */
3822void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
3823                        int len, int state)
3824{
3825        struct buffer_head *bitmap_bh = NULL;
3826        struct ext4_group_desc *gdp;
3827        struct buffer_head *gdp_bh;
3828        struct ext4_sb_info *sbi = EXT4_SB(sb);
3829        ext4_group_t group;
3830        ext4_grpblk_t blkoff;
3831        int i, clen, err;
3832        int already;
3833
3834        clen = EXT4_B2C(sbi, len);
3835
3836        ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
3837        bitmap_bh = ext4_read_block_bitmap(sb, group);
3838        if (IS_ERR(bitmap_bh)) {
3839                err = PTR_ERR(bitmap_bh);
3840                bitmap_bh = NULL;
3841                goto out_err;
3842        }
3843
3844        err = -EIO;
3845        gdp = ext4_get_group_desc(sb, group, &gdp_bh);
3846        if (!gdp)
3847                goto out_err;
3848
3849        ext4_lock_group(sb, group);
3850        already = 0;
3851        for (i = 0; i < clen; i++)
3852                if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
3853                        already++;
3854
3855        if (state)
3856                ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
3857        else
3858                mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
3859        if (ext4_has_group_desc_csum(sb) &&
3860            (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3861                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3862                ext4_free_group_clusters_set(sb, gdp,
3863                                             ext4_free_clusters_after_init(sb,
3864                                                group, gdp));
3865        }
3866        if (state)
3867                clen = ext4_free_group_clusters(sb, gdp) - clen + already;
3868        else
3869                clen = ext4_free_group_clusters(sb, gdp) + clen - already;
3870
3871        ext4_free_group_clusters_set(sb, gdp, clen);
3872        ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
3873        ext4_group_desc_csum_set(sb, group, gdp);
3874
3875        ext4_unlock_group(sb, group);
3876
3877        if (sbi->s_log_groups_per_flex) {
3878                ext4_group_t flex_group = ext4_flex_group(sbi, group);
3879
3880                atomic64_sub(len,
3881                             &sbi_array_rcu_deref(sbi, s_flex_groups,
3882                                                  flex_group)->free_clusters);
3883        }
3884
3885        err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
3886        if (err)
3887                goto out_err;
3888        sync_dirty_buffer(bitmap_bh);
3889        err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
3890        sync_dirty_buffer(gdp_bh);
3891
3892out_err:
3893        brelse(bitmap_bh);
3894}
3895
3896/*
3897 * here we normalize request for locality group
3898 * Group request are normalized to s_mb_group_prealloc, which goes to
3899 * s_strip if we set the same via mount option.
3900 * s_mb_group_prealloc can be configured via
3901 * /sys/fs/ext4/<partition>/mb_group_prealloc
3902 *
3903 * XXX: should we try to preallocate more than the group has now?
3904 */
3905static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3906{
3907        struct super_block *sb = ac->ac_sb;
3908        struct ext4_locality_group *lg = ac->ac_lg;
3909
3910        BUG_ON(lg == NULL);
3911        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3912        mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
3913}
3914
3915/*
3916 * Normalization means making request better in terms of
3917 * size and alignment
3918 */
3919static noinline_for_stack void
3920ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3921                                struct ext4_allocation_request *ar)
3922{
3923        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3924        int bsbits, max;
3925        ext4_lblk_t end;
3926        loff_t size, start_off;
3927        loff_t orig_size __maybe_unused;
3928        ext4_lblk_t start;
3929        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3930        struct ext4_prealloc_space *pa;
3931
3932        /* do normalize only data requests, metadata requests
3933           do not need preallocation */
3934        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3935                return;
3936
3937        /* sometime caller may want exact blocks */
3938        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3939                return;
3940
3941        /* caller may indicate that preallocation isn't
3942         * required (it's a tail, for example) */
3943        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3944                return;
3945
3946        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3947                ext4_mb_normalize_group_request(ac);
3948                return ;
3949        }
3950
3951        bsbits = ac->ac_sb->s_blocksize_bits;
3952
3953        /* first, let's learn actual file size
3954         * given current request is allocated */
3955        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3956        size = size << bsbits;
3957        if (size < i_size_read(ac->ac_inode))
3958                size = i_size_read(ac->ac_inode);
3959        orig_size = size;
3960
3961        /* max size of free chunks */
3962        max = 2 << bsbits;
3963
3964#define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
3965                (req <= (size) || max <= (chunk_size))
3966
3967        /* first, try to predict filesize */
3968        /* XXX: should this table be tunable? */
3969        start_off = 0;
3970        if (size <= 16 * 1024) {
3971                size = 16 * 1024;
3972        } else if (size <= 32 * 1024) {
3973                size = 32 * 1024;
3974        } else if (size <= 64 * 1024) {
3975                size = 64 * 1024;
3976        } else if (size <= 128 * 1024) {
3977                size = 128 * 1024;
3978        } else if (size <= 256 * 1024) {
3979                size = 256 * 1024;
3980        } else if (size <= 512 * 1024) {
3981                size = 512 * 1024;
3982        } else if (size <= 1024 * 1024) {
3983                size = 1024 * 1024;
3984        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3985                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3986                                                (21 - bsbits)) << 21;
3987                size = 2 * 1024 * 1024;
3988        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3989                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3990                                                        (22 - bsbits)) << 22;
3991                size = 4 * 1024 * 1024;
3992        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3993                                        (8<<20)>>bsbits, max, 8 * 1024)) {
3994                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3995                                                        (23 - bsbits)) << 23;
3996                size = 8 * 1024 * 1024;
3997        } else {
3998                start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3999                size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
4000                                              ac->ac_o_ex.fe_len) << bsbits;
4001        }
4002        size = size >> bsbits;
4003        start = start_off >> bsbits;
4004
4005        /* don't cover already allocated blocks in selected range */
4006        if (ar->pleft && start <= ar->lleft) {
4007                size -= ar->lleft + 1 - start;
4008                start = ar->lleft + 1;
4009        }
4010        if (ar->pright && start + size - 1 >= ar->lright)
4011                size -= start + size - ar->lright;
4012
4013        /*
4014         * Trim allocation request for filesystems with artificially small
4015         * groups.
4016         */
4017        if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
4018                size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
4019
4020        end = start + size;
4021
4022        /* check we don't cross already preallocated blocks */
4023        rcu_read_lock();
4024        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
4025                ext4_lblk_t pa_end;
4026
4027                if (pa->pa_deleted)
4028                        continue;
4029                spin_lock(&pa->pa_lock);
4030                if (pa->pa_deleted) {
4031                        spin_unlock(&pa->pa_lock);
4032                        continue;
4033                }
4034
4035                pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
4036                                                  pa->pa_len);
4037
4038                /* PA must not overlap original request */
4039                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
4040                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
4041
4042                /* skip PAs this normalized request doesn't overlap with */
4043                if (pa->pa_lstart >= end || pa_end <= start) {
4044                        spin_unlock(&pa->pa_lock);
4045                        continue;
4046                }
4047                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
4048
4049                /* adjust start or end to be adjacent to this pa */
4050                if (pa_end <= ac->ac_o_ex.fe_logical) {
4051                        BUG_ON(pa_end < start);
4052                        start = pa_end;
4053                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
4054                        BUG_ON(pa->pa_lstart > end);
4055                        end = pa->pa_lstart;
4056                }
4057                spin_unlock(&pa->pa_lock);
4058        }
4059        rcu_read_unlock();
4060        size = end - start;
4061
4062        /* XXX: extra loop to check we really don't overlap preallocations */
4063        rcu_read_lock();
4064        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
4065                ext4_lblk_t pa_end;
4066
4067                spin_lock(&pa->pa_lock);
4068                if (pa->pa_deleted == 0) {
4069                        pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
4070                                                          pa->pa_len);
4071                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
4072                }
4073                spin_unlock(&pa->pa_lock);
4074        }
4075        rcu_read_unlock();
4076
4077        if (start + size <= ac->ac_o_ex.fe_logical &&
4078                        start > ac->ac_o_ex.fe_logical) {
4079                ext4_msg(ac->ac_sb, KERN_ERR,
4080                         "start %lu, size %lu, fe_logical %lu",
4081                         (unsigned long) start, (unsigned long) size,
4082                         (unsigned long) ac->ac_o_ex.fe_logical);
4083                BUG();
4084        }
4085        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
4086
4087        /* now prepare goal request */
4088
4089        /* XXX: is it better to align blocks WRT to logical
4090         * placement or satisfy big request as is */
4091        ac->ac_g_ex.fe_logical = start;
4092        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
4093
4094        /* define goal start in order to merge */
4095        if (ar->pright && (ar->lright == (start + size))) {
4096                /* merge to the right */
4097                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
4098                                                &ac->ac_f_ex.fe_group,
4099                                                &ac->ac_f_ex.fe_start);
4100                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4101        }
4102        if (ar->pleft && (ar->lleft + 1 == start)) {
4103                /* merge to the left */
4104                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
4105                                                &ac->ac_f_ex.fe_group,
4106                                                &ac->ac_f_ex.fe_start);
4107                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
4108        }
4109
4110        mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
4111                 orig_size, start);
4112}
4113
4114static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
4115{
4116        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4117
4118        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
4119                atomic_inc(&sbi->s_bal_reqs);
4120                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
4121                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
4122                        atomic_inc(&sbi->s_bal_success);
4123                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
4124                atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
4125                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
4126                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
4127                        atomic_inc(&sbi->s_bal_goals);
4128                if (ac->ac_found > sbi->s_mb_max_to_scan)
4129                        atomic_inc(&sbi->s_bal_breaks);
4130        }
4131
4132        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
4133                trace_ext4_mballoc_alloc(ac);
4134        else
4135                trace_ext4_mballoc_prealloc(ac);
4136}
4137
4138/*
4139 * Called on failure; free up any blocks from the inode PA for this
4140 * context.  We don't need this for MB_GROUP_PA because we only change
4141 * pa_free in ext4_mb_release_context(), but on failure, we've already
4142 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
4143 */
4144static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
4145{
4146        struct ext4_prealloc_space *pa = ac->ac_pa;
4147        struct ext4_buddy e4b;
4148        int err;
4149
4150        if (pa == NULL) {
4151                if (ac->ac_f_ex.fe_len == 0)
4152                        return;
4153                err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
4154                if (err) {
4155                        /*
4156                         * This should never happen since we pin the
4157                         * pages in the ext4_allocation_context so
4158                         * ext4_mb_load_buddy() should never fail.
4159                         */
4160                        WARN(1, "mb_load_buddy failed (%d)", err);
4161                        return;
4162                }
4163                ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4164                mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
4165                               ac->ac_f_ex.fe_len);
4166                ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
4167                ext4_mb_unload_buddy(&e4b);
4168                return;
4169        }
4170        if (pa->pa_type == MB_INODE_PA)
4171                pa->pa_free += ac->ac_b_ex.fe_len;
4172}
4173
4174/*
4175 * use blocks preallocated to inode
4176 */
4177static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
4178                                struct ext4_prealloc_space *pa)
4179{
4180        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4181        ext4_fsblk_t start;
4182        ext4_fsblk_t end;
4183        int len;
4184
4185        /* found preallocated blocks, use them */
4186        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
4187        end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
4188                  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
4189        len = EXT4_NUM_B2C(sbi, end - start);
4190        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
4191                                        &ac->ac_b_ex.fe_start);
4192        ac->ac_b_ex.fe_len = len;
4193        ac->ac_status = AC_STATUS_FOUND;
4194        ac->ac_pa = pa;
4195
4196        BUG_ON(start < pa->pa_pstart);
4197        BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
4198        BUG_ON(pa->pa_free < len);
4199        pa->pa_free -= len;
4200
4201        mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
4202}
4203
4204/*
4205 * use blocks preallocated to locality group
4206 */
4207static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
4208                                struct ext4_prealloc_space *pa)
4209{
4210        unsigned int len = ac->ac_o_ex.fe_len;
4211
4212        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
4213                                        &ac->ac_b_ex.fe_group,
4214                                        &ac->ac_b_ex.fe_start);
4215        ac->ac_b_ex.fe_len = len;
4216        ac->ac_status = AC_STATUS_FOUND;
4217        ac->ac_pa = pa;
4218
4219        /* we don't correct pa_pstart or pa_plen here to avoid
4220         * possible race when the group is being loaded concurrently
4221         * instead we correct pa later, after blocks are marked
4222         * in on-disk bitmap -- see ext4_mb_release_context()
4223         * Other CPUs are prevented from allocating from this pa by lg_mutex
4224         */
4225        mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
4226                 pa->pa_lstart-len, len, pa);
4227}
4228
4229/*
4230 * Return the prealloc space that have minimal distance
4231 * from the goal block. @cpa is the prealloc
4232 * space that is having currently known minimal distance
4233 * from the goal block.
4234 */
4235static struct ext4_prealloc_space *
4236ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
4237                        struct ext4_prealloc_space *pa,
4238                        struct ext4_prealloc_space *cpa)
4239{
4240        ext4_fsblk_t cur_distance, new_distance;
4241
4242        if (cpa == NULL) {
4243                atomic_inc(&pa->pa_count);
4244                return pa;
4245        }
4246        cur_distance = abs(goal_block - cpa->pa_pstart);
4247        new_distance = abs(goal_block - pa->pa_pstart);
4248
4249        if (cur_distance <= new_distance)
4250                return cpa;
4251
4252        /* drop the previous reference */
4253        atomic_dec(&cpa->pa_count);
4254        atomic_inc(&pa->pa_count);
4255        return pa;
4256}
4257
4258/*
4259 * search goal blocks in preallocated space
4260 */
4261static noinline_for_stack bool
4262ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
4263{
4264        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4265        int order, i;
4266        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
4267        struct ext4_locality_group *lg;
4268        struct ext4_prealloc_space *pa, *cpa = NULL;
4269        ext4_fsblk_t goal_block;
4270
4271        /* only data can be preallocated */
4272        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4273                return false;
4274
4275        /* first, try per-file preallocation */
4276        rcu_read_lock();
4277        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
4278
4279                /* all fields in this condition don't change,
4280                 * so we can skip locking for them */
4281                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
4282                    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
4283                                               EXT4_C2B(sbi, pa->pa_len)))
4284                        continue;
4285
4286                /* non-extent files can't have physical blocks past 2^32 */
4287                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
4288                    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
4289                     EXT4_MAX_BLOCK_FILE_PHYS))
4290                        continue;
4291
4292                /* found preallocated blocks, use them */
4293                spin_lock(&pa->pa_lock);
4294                if (pa->pa_deleted == 0 && pa->pa_free) {
4295                        atomic_inc(&pa->pa_count);
4296                        ext4_mb_use_inode_pa(ac, pa);
4297                        spin_unlock(&pa->pa_lock);
4298                        ac->ac_criteria = 10;
4299                        rcu_read_unlock();
4300                        return true;
4301                }
4302                spin_unlock(&pa->pa_lock);
4303        }
4304        rcu_read_unlock();
4305
4306        /* can we use group allocation? */
4307        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
4308                return false;
4309
4310        /* inode may have no locality group for some reason */
4311        lg = ac->ac_lg;
4312        if (lg == NULL)
4313                return false;
4314        order  = fls(ac->ac_o_ex.fe_len) - 1;
4315        if (order > PREALLOC_TB_SIZE - 1)
4316                /* The max size of hash table is PREALLOC_TB_SIZE */
4317                order = PREALLOC_TB_SIZE - 1;
4318
4319        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
4320        /*
4321         * search for the prealloc space that is having
4322         * minimal distance from the goal block.
4323         */
4324        for (i = order; i < PREALLOC_TB_SIZE; i++) {
4325                rcu_read_lock();
4326                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
4327                                        pa_inode_list) {
4328                        spin_lock(&pa->pa_lock);
4329                        if (pa->pa_deleted == 0 &&
4330                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
4331
4332                                cpa = ext4_mb_check_group_pa(goal_block,
4333                                                                pa, cpa);
4334                        }
4335                        spin_unlock(&pa->pa_lock);
4336                }
4337                rcu_read_unlock();
4338        }
4339        if (cpa) {
4340                ext4_mb_use_group_pa(ac, cpa);
4341                ac->ac_criteria = 20;
4342                return true;
4343        }
4344        return false;
4345}
4346
4347/*
4348 * the function goes through all block freed in the group
4349 * but not yet committed and marks them used in in-core bitmap.
4350 * buddy must be generated from this bitmap
4351 * Need to be called with the ext4 group lock held
4352 */
4353static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
4354                                                ext4_group_t group)
4355{
4356        struct rb_node *n;
4357        struct ext4_group_info *grp;
4358        struct ext4_free_data *entry;
4359
4360        grp = ext4_get_group_info(sb, group);
4361        n = rb_first(&(grp->bb_free_root));
4362
4363        while (n) {
4364                entry = rb_entry(n, struct ext4_free_data, efd_node);
4365                ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
4366                n = rb_next(n);
4367        }
4368        return;
4369}
4370
4371/*
4372 * the function goes through all preallocation in this group and marks them
4373 * used in in-core bitmap. buddy must be generated from this bitmap
4374 * Need to be called with ext4 group lock held
4375 */
4376static noinline_for_stack
4377void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
4378                                        ext4_group_t group)
4379{
4380        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
4381        struct ext4_prealloc_space *pa;
4382        struct list_head *cur;
4383        ext4_group_t groupnr;
4384        ext4_grpblk_t start;
4385        int preallocated = 0;
4386        int len;
4387
4388        /* all form of preallocation discards first load group,
4389         * so the only competing code is preallocation use.
4390         * we don't need any locking here
4391         * notice we do NOT ignore preallocations with pa_deleted
4392         * otherwise we could leave used blocks available for
4393         * allocation in buddy when concurrent ext4_mb_put_pa()
4394         * is dropping preallocation
4395         */
4396        list_for_each(cur, &grp->bb_prealloc_list) {
4397                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
4398                spin_lock(&pa->pa_lock);
4399                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4400                                             &groupnr, &start);
4401                len = pa->pa_len;
4402                spin_unlock(&pa->pa_lock);
4403                if (unlikely(len == 0))
4404                        continue;
4405                BUG_ON(groupnr != group);
4406                ext4_set_bits(bitmap, start, len);
4407                preallocated += len;
4408        }
4409        mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
4410}
4411
4412static void ext4_mb_mark_pa_deleted(struct super_block *sb,
4413                                    struct ext4_prealloc_space *pa)
4414{
4415        struct ext4_inode_info *ei;
4416
4417        if (pa->pa_deleted) {
4418                ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
4419                             pa->pa_type, pa->pa_pstart, pa->pa_lstart,
4420                             pa->pa_len);
4421                return;
4422        }
4423
4424        pa->pa_deleted = 1;
4425
4426        if (pa->pa_type == MB_INODE_PA) {
4427                ei = EXT4_I(pa->pa_inode);
4428                atomic_dec(&ei->i_prealloc_active);
4429        }
4430}
4431
4432static void ext4_mb_pa_callback(struct rcu_head *head)
4433{
4434        struct ext4_prealloc_space *pa;
4435        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
4436
4437        BUG_ON(atomic_read(&pa->pa_count));
4438        BUG_ON(pa->pa_deleted == 0);
4439        kmem_cache_free(ext4_pspace_cachep, pa);
4440}
4441
4442/*
4443 * drops a reference to preallocated space descriptor
4444 * if this was the last reference and the space is consumed
4445 */
4446static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
4447                        struct super_block *sb, struct ext4_prealloc_space *pa)
4448{
4449        ext4_group_t grp;
4450        ext4_fsblk_t grp_blk;
4451
4452        /* in this short window concurrent discard can set pa_deleted */
4453        spin_lock(&pa->pa_lock);
4454        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
4455                spin_unlock(&pa->pa_lock);
4456                return;
4457        }
4458
4459        if (pa->pa_deleted == 1) {
4460                spin_unlock(&pa->pa_lock);
4461                return;
4462        }
4463
4464        ext4_mb_mark_pa_deleted(sb, pa);
4465        spin_unlock(&pa->pa_lock);
4466
4467        grp_blk = pa->pa_pstart;
4468        /*
4469         * If doing group-based preallocation, pa_pstart may be in the
4470         * next group when pa is used up
4471         */
4472        if (pa->pa_type == MB_GROUP_PA)
4473                grp_blk--;
4474
4475        grp = ext4_get_group_number(sb, grp_blk);
4476
4477        /*
4478         * possible race:
4479         *
4480         *  P1 (buddy init)                     P2 (regular allocation)
4481         *                                      find block B in PA
4482         *  copy on-disk bitmap to buddy
4483         *                                      mark B in on-disk bitmap
4484         *                                      drop PA from group
4485         *  mark all PAs in buddy
4486         *
4487         * thus, P1 initializes buddy with B available. to prevent this
4488         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
4489