linux/fs/xfs/xfs_icache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_sb.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_trans_priv.h"
  17#include "xfs_inode_item.h"
  18#include "xfs_quota.h"
  19#include "xfs_trace.h"
  20#include "xfs_icache.h"
  21#include "xfs_bmap_util.h"
  22#include "xfs_dquot_item.h"
  23#include "xfs_dquot.h"
  24#include "xfs_reflink.h"
  25#include "xfs_ialloc.h"
  26
  27#include <linux/iversion.h>
  28
  29/*
  30 * Allocate and initialise an xfs_inode.
  31 */
  32struct xfs_inode *
  33xfs_inode_alloc(
  34        struct xfs_mount        *mp,
  35        xfs_ino_t               ino)
  36{
  37        struct xfs_inode        *ip;
  38
  39        /*
  40         * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
  41         * and return NULL here on ENOMEM.
  42         */
  43        ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
  44
  45        if (inode_init_always(mp->m_super, VFS_I(ip))) {
  46                kmem_cache_free(xfs_inode_zone, ip);
  47                return NULL;
  48        }
  49
  50        /* VFS doesn't initialise i_mode! */
  51        VFS_I(ip)->i_mode = 0;
  52
  53        XFS_STATS_INC(mp, vn_active);
  54        ASSERT(atomic_read(&ip->i_pincount) == 0);
  55        ASSERT(ip->i_ino == 0);
  56
  57        /* initialise the xfs inode */
  58        ip->i_ino = ino;
  59        ip->i_mount = mp;
  60        memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
  61        ip->i_afp = NULL;
  62        ip->i_cowfp = NULL;
  63        memset(&ip->i_df, 0, sizeof(ip->i_df));
  64        ip->i_flags = 0;
  65        ip->i_delayed_blks = 0;
  66        ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
  67        ip->i_nblocks = 0;
  68        ip->i_forkoff = 0;
  69        ip->i_sick = 0;
  70        ip->i_checked = 0;
  71        INIT_WORK(&ip->i_ioend_work, xfs_end_io);
  72        INIT_LIST_HEAD(&ip->i_ioend_list);
  73        spin_lock_init(&ip->i_ioend_lock);
  74
  75        return ip;
  76}
  77
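/*
 * RCU callback that performs the final teardown of an inode: destroy the
 * remaining data, attr and CoW forks, tear down the inode log item if one is
 * still attached, and return the inode to its slab cache.
 */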
  78STATIC void
  79xfs_inode_free_callback(
  80        struct rcu_head         *head)
  81{
  82        struct inode            *inode = container_of(head, struct inode, i_rcu);
  83        struct xfs_inode        *ip = XFS_I(inode);
  84
  85        switch (VFS_I(ip)->i_mode & S_IFMT) {
  86        case S_IFREG:
  87        case S_IFDIR:
  88        case S_IFLNK:
  89                xfs_idestroy_fork(&ip->i_df);
  90                break;
  91        }
  92
  93        if (ip->i_afp) {
  94                xfs_idestroy_fork(ip->i_afp);
  95                kmem_cache_free(xfs_ifork_zone, ip->i_afp);
  96        }
  97        if (ip->i_cowfp) {
  98                xfs_idestroy_fork(ip->i_cowfp);
  99                kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
 100        }
 101        if (ip->i_itemp) {
 102                ASSERT(!test_bit(XFS_LI_IN_AIL,
 103                                 &ip->i_itemp->ili_item.li_flags));
 104                xfs_inode_item_destroy(ip);
 105                ip->i_itemp = NULL;
 106        }
 107
 108        kmem_cache_free(xfs_inode_zone, ip);
 109}
 110
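/*
 * Free an inode that can no longer be referenced.  The actual memory release
 * is deferred to an RCU grace period via xfs_inode_free_callback().
 */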
 111static void
 112__xfs_inode_free(
 113        struct xfs_inode        *ip)
 114{
 115        /* asserts to verify all state is correct here */
 116        ASSERT(atomic_read(&ip->i_pincount) == 0);
 117        ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
 118        XFS_STATS_DEC(ip->i_mount, vn_active);
 119
 120        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 121}
 122
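/*
 * Mark an incore inode as being reclaimed (with an invalid inode number so
 * that RCU lookups skip it) and hand it off to __xfs_inode_free().
 */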
 123void
 124xfs_inode_free(
 125        struct xfs_inode        *ip)
 126{
 127        ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
 128
 129        /*
 130         * Because we use RCU freeing we need to ensure the inode always
 131         * appears to be reclaimed with an invalid inode number when in the
 132         * free state. The ip->i_flags_lock provides the barrier against lookup
 133         * races.
 134         */
 135        spin_lock(&ip->i_flags_lock);
 136        ip->i_flags = XFS_IRECLAIM;
 137        ip->i_ino = 0;
 138        spin_unlock(&ip->i_flags_lock);
 139
 140        __xfs_inode_free(ip);
 141}
 142
 143/*
 144 * Queue background inode reclaim work if there are reclaimable inodes and there
 145 * isn't reclaim work already scheduled or in progress.
 146 */
 147static void
 148xfs_reclaim_work_queue(
 149        struct xfs_mount        *mp)
 150{
 151
 152        rcu_read_lock();
 153        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 154                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 155                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 156        }
 157        rcu_read_unlock();
 158}
 159
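/*
 * Account for a newly reclaimable inode in this AG.  The first such inode
 * also propagates the reclaim tag up into the per-mount perag radix tree and
 * kicks off background reclaim.  Caller must hold pag->pag_ici_lock.
 */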
 160static void
 161xfs_perag_set_reclaim_tag(
 162        struct xfs_perag        *pag)
 163{
 164        struct xfs_mount        *mp = pag->pag_mount;
 165
 166        lockdep_assert_held(&pag->pag_ici_lock);
 167        if (pag->pag_ici_reclaimable++)
 168                return;
 169
 170        /* propagate the reclaim tag up into the perag radix tree */
 171        spin_lock(&mp->m_perag_lock);
 172        radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
 173                           XFS_ICI_RECLAIM_TAG);
 174        spin_unlock(&mp->m_perag_lock);
 175
 176        /* schedule periodic background inode reclaim */
 177        xfs_reclaim_work_queue(mp);
 178
 179        trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
 180}
 181
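/*
 * Drop the reclaimable inode count for this AG.  When the last reclaimable
 * inode goes away, clear the reclaim tag from the per-mount perag radix tree.
 * Caller must hold pag->pag_ici_lock.
 */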
 182static void
 183xfs_perag_clear_reclaim_tag(
 184        struct xfs_perag        *pag)
 185{
 186        struct xfs_mount        *mp = pag->pag_mount;
 187
 188        lockdep_assert_held(&pag->pag_ici_lock);
 189        if (--pag->pag_ici_reclaimable)
 190                return;
 191
 192        /* clear the reclaim tag from the perag radix tree */
 193        spin_lock(&mp->m_perag_lock);
 194        radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
 195                             XFS_ICI_RECLAIM_TAG);
 196        spin_unlock(&mp->m_perag_lock);
 197        trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
 198}
 199
 200
 201/*
 202 * We set the inode flag atomically with the radix tree tag.
 203 * Once we get tag lookups on the radix tree, this inode flag
 204 * can go away.
 205 */
 206void
 207xfs_inode_set_reclaim_tag(
 208        struct xfs_inode        *ip)
 209{
 210        struct xfs_mount        *mp = ip->i_mount;
 211        struct xfs_perag        *pag;
 212
 213        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 214        spin_lock(&pag->pag_ici_lock);
 215        spin_lock(&ip->i_flags_lock);
 216
 217        radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
 218                           XFS_ICI_RECLAIM_TAG);
 219        xfs_perag_set_reclaim_tag(pag);
 220        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 221
 222        spin_unlock(&ip->i_flags_lock);
 223        spin_unlock(&pag->pag_ici_lock);
 224        xfs_perag_put(pag);
 225}
 226
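/*
 * Clear the reclaim tag for a single inode in the AG inode radix tree and
 * update the per-AG reclaim accounting accordingly.
 */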
 227STATIC void
 228xfs_inode_clear_reclaim_tag(
 229        struct xfs_perag        *pag,
 230        xfs_ino_t               ino)
 231{
 232        radix_tree_tag_clear(&pag->pag_ici_root,
 233                             XFS_INO_TO_AGINO(pag->pag_mount, ino),
 234                             XFS_ICI_RECLAIM_TAG);
 235        xfs_perag_clear_reclaim_tag(pag);
 236}
 237
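/*
 * Sleep until the XFS_INEW flag has been cleared, i.e. until another thread
 * has finished instantiating this inode.
 */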
 238static void
 239xfs_inew_wait(
 240        struct xfs_inode        *ip)
 241{
 242        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
 243        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
 244
 245        do {
 246                prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 247                if (!xfs_iflags_test(ip, XFS_INEW))
 248                        break;
 249                schedule();
 250        } while (true);
 251        finish_wait(wq, &wait.wq_entry);
 252}
 253
 254/*
 255 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 256 * part of the structure. This is made more complex by the fact we store
 257 * information about the on-disk values in the VFS inode and so we can't just
 258 * overwrite the values unconditionally. Hence we save the parameters we
 259 * need to retain across reinitialisation, and rewrite them into the VFS inode
 260 * after reinitialisation even if it fails.
 261 */
 262static int
 263xfs_reinit_inode(
 264        struct xfs_mount        *mp,
 265        struct inode            *inode)
 266{
 267        int             error;
 268        uint32_t        nlink = inode->i_nlink;
 269        uint32_t        generation = inode->i_generation;
 270        uint64_t        version = inode_peek_iversion(inode);
 271        umode_t         mode = inode->i_mode;
 272        dev_t           dev = inode->i_rdev;
 273        kuid_t          uid = inode->i_uid;
 274        kgid_t          gid = inode->i_gid;
 275
 276        error = inode_init_always(mp->m_super, inode);
 277
 278        set_nlink(inode, nlink);
 279        inode->i_generation = generation;
 280        inode_set_iversion_queried(inode, version);
 281        inode->i_mode = mode;
 282        inode->i_rdev = dev;
 283        inode->i_uid = uid;
 284        inode->i_gid = gid;
 285        return error;
 286}
 287
 288/*
  289 * If we are allocating a new inode, then check that what was returned is
  290 * actually a free, empty inode. If we are not allocating an inode, then
  291 * check that we didn't find a free inode.
 292 *
 293 * Returns:
 294 *      0               if the inode free state matches the lookup context
 295 *      -ENOENT         if the inode is free and we are not allocating
 296 *      -EFSCORRUPTED   if there is any state mismatch at all
 297 */
 298static int
 299xfs_iget_check_free_state(
 300        struct xfs_inode        *ip,
 301        int                     flags)
 302{
 303        if (flags & XFS_IGET_CREATE) {
 304                /* should be a free inode */
 305                if (VFS_I(ip)->i_mode != 0) {
 306                        xfs_warn(ip->i_mount,
 307"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
 308                                ip->i_ino, VFS_I(ip)->i_mode);
 309                        return -EFSCORRUPTED;
 310                }
 311
 312                if (ip->i_nblocks != 0) {
 313                        xfs_warn(ip->i_mount,
 314"Corruption detected! Free inode 0x%llx has blocks allocated!",
 315                                ip->i_ino);
 316                        return -EFSCORRUPTED;
 317                }
 318                return 0;
 319        }
 320
 321        /* should be an allocated inode */
 322        if (VFS_I(ip)->i_mode == 0)
 323                return -ENOENT;
 324
 325        return 0;
 326}
 327
 328/*
  329 * Check the validity of the inode we just found in the cache
 330 */
 331static int
 332xfs_iget_cache_hit(
 333        struct xfs_perag        *pag,
 334        struct xfs_inode        *ip,
 335        xfs_ino_t               ino,
 336        int                     flags,
 337        int                     lock_flags) __releases(RCU)
 338{
 339        struct inode            *inode = VFS_I(ip);
 340        struct xfs_mount        *mp = ip->i_mount;
 341        int                     error;
 342
 343        /*
 344         * check for re-use of an inode within an RCU grace period due to the
 345         * radix tree nodes not being updated yet. We monitor for this by
 346         * setting the inode number to zero before freeing the inode structure.
 347         * If the inode has been reallocated and set up, then the inode number
 348         * will not match, so check for that, too.
 349         */
 350        spin_lock(&ip->i_flags_lock);
 351        if (ip->i_ino != ino) {
 352                trace_xfs_iget_skip(ip);
 353                XFS_STATS_INC(mp, xs_ig_frecycle);
 354                error = -EAGAIN;
 355                goto out_error;
 356        }
 357
 358
 359        /*
 360         * If we are racing with another cache hit that is currently
 361         * instantiating this inode or currently recycling it out of
  362         * reclaimable state, wait for the initialisation to complete
 363         * before continuing.
 364         *
 365         * XXX(hch): eventually we should do something equivalent to
 366         *           wait_on_inode to wait for these flags to be cleared
 367         *           instead of polling for it.
 368         */
 369        if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 370                trace_xfs_iget_skip(ip);
 371                XFS_STATS_INC(mp, xs_ig_frecycle);
 372                error = -EAGAIN;
 373                goto out_error;
 374        }
 375
 376        /*
 377         * Check the inode free state is valid. This also detects lookup
 378         * racing with unlinks.
 379         */
 380        error = xfs_iget_check_free_state(ip, flags);
 381        if (error)
 382                goto out_error;
 383
 384        /*
 385         * If IRECLAIMABLE is set, we've torn down the VFS inode already.
  386         * Need to carefully get it back into a usable state.
 387         */
 388        if (ip->i_flags & XFS_IRECLAIMABLE) {
 389                trace_xfs_iget_reclaim(ip);
 390
 391                if (flags & XFS_IGET_INCORE) {
 392                        error = -EAGAIN;
 393                        goto out_error;
 394                }
 395
 396                /*
 397                 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
 398                 * from stomping over us while we recycle the inode.  We can't
 399                 * clear the radix tree reclaimable tag yet as it requires
 400                 * pag_ici_lock to be held exclusive.
 401                 */
 402                ip->i_flags |= XFS_IRECLAIM;
 403
 404                spin_unlock(&ip->i_flags_lock);
 405                rcu_read_unlock();
 406
 407                ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 408                error = xfs_reinit_inode(mp, inode);
 409                if (error) {
 410                        bool wake;
 411                        /*
 412                         * Re-initializing the inode failed, and we are in deep
 413                         * trouble.  Try to re-add it to the reclaim list.
 414                         */
 415                        rcu_read_lock();
 416                        spin_lock(&ip->i_flags_lock);
 417                        wake = !!__xfs_iflags_test(ip, XFS_INEW);
 418                        ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
 419                        if (wake)
 420                                wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
 421                        ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 422                        trace_xfs_iget_reclaim_fail(ip);
 423                        goto out_error;
 424                }
 425
 426                spin_lock(&pag->pag_ici_lock);
 427                spin_lock(&ip->i_flags_lock);
 428
 429                /*
 430                 * Clear the per-lifetime state in the inode as we are now
 431                 * effectively a new inode and need to return to the initial
 432                 * state before reuse occurs.
 433                 */
 434                ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 435                ip->i_flags |= XFS_INEW;
 436                xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 437                inode->i_state = I_NEW;
 438                ip->i_sick = 0;
 439                ip->i_checked = 0;
 440
 441                spin_unlock(&ip->i_flags_lock);
 442                spin_unlock(&pag->pag_ici_lock);
 443        } else {
 444                /* If the VFS inode is being torn down, pause and try again. */
 445                if (!igrab(inode)) {
 446                        trace_xfs_iget_skip(ip);
 447                        error = -EAGAIN;
 448                        goto out_error;
 449                }
 450
 451                /* We've got a live one. */
 452                spin_unlock(&ip->i_flags_lock);
 453                rcu_read_unlock();
 454                trace_xfs_iget_hit(ip);
 455        }
 456
 457        if (lock_flags != 0)
 458                xfs_ilock(ip, lock_flags);
 459
 460        if (!(flags & XFS_IGET_INCORE))
 461                xfs_iflags_clear(ip, XFS_ISTALE);
 462        XFS_STATS_INC(mp, xs_ig_found);
 463
 464        return 0;
 465
 466out_error:
 467        spin_unlock(&ip->i_flags_lock);
 468        rcu_read_unlock();
 469        return error;
 470}
 471
 472
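/*
 * Cache miss: allocate a new incore inode, read the on-disk inode (unless we
 * are creating a brand new inode on a v5 filesystem without IKEEP), and
 * insert it into the per-AG inode radix tree with XFS_INEW set.
 */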
 473static int
 474xfs_iget_cache_miss(
 475        struct xfs_mount        *mp,
 476        struct xfs_perag        *pag,
 477        xfs_trans_t             *tp,
 478        xfs_ino_t               ino,
 479        struct xfs_inode        **ipp,
 480        int                     flags,
 481        int                     lock_flags)
 482{
 483        struct xfs_inode        *ip;
 484        int                     error;
 485        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 486        int                     iflags;
 487
 488        ip = xfs_inode_alloc(mp, ino);
 489        if (!ip)
 490                return -ENOMEM;
 491
 492        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
 493        if (error)
 494                goto out_destroy;
 495
 496        /*
 497         * For version 5 superblocks, if we are initialising a new inode and we
 498         * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
 499         * simply build the new inode core with a random generation number.
 500         *
 501         * For version 4 (and older) superblocks, log recovery is dependent on
 502         * the i_flushiter field being initialised from the current on-disk
 503         * value and hence we must also read the inode off disk even when
 504         * initializing new inodes.
 505         */
 506        if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
 507            (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 508                VFS_I(ip)->i_generation = prandom_u32();
 509        } else {
 510                struct xfs_buf          *bp;
 511
 512                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
 513                if (error)
 514                        goto out_destroy;
 515
 516                error = xfs_inode_from_disk(ip,
 517                                xfs_buf_offset(bp, ip->i_imap.im_boffset));
 518                if (!error)
 519                        xfs_buf_set_ref(bp, XFS_INO_REF);
 520                xfs_trans_brelse(tp, bp);
 521
 522                if (error)
 523                        goto out_destroy;
 524        }
 525
 526        trace_xfs_iget_miss(ip);
 527
 528        /*
 529         * Check the inode free state is valid. This also detects lookup
 530         * racing with unlinks.
 531         */
 532        error = xfs_iget_check_free_state(ip, flags);
 533        if (error)
 534                goto out_destroy;
 535
 536        /*
 537         * Preload the radix tree so we can insert safely under the
 538         * write spinlock. Note that we cannot sleep inside the preload
 539         * region. Since we can be called from transaction context, don't
 540         * recurse into the file system.
 541         */
 542        if (radix_tree_preload(GFP_NOFS)) {
 543                error = -EAGAIN;
 544                goto out_destroy;
 545        }
 546
 547        /*
 548         * Because the inode hasn't been added to the radix-tree yet it can't
 549         * be found by another thread, so we can do the non-sleeping lock here.
 550         */
 551        if (lock_flags) {
 552                if (!xfs_ilock_nowait(ip, lock_flags))
 553                        BUG();
 554        }
 555
 556        /*
 557         * These values must be set before inserting the inode into the radix
  558         * tree because, the moment it is inserted, a concurrent lookup (allowed by
  559         * the RCU locking mechanism) can find it, and that lookup must see that this
 560         * is an inode currently under construction (i.e. that XFS_INEW is set).
 561         * The ip->i_flags_lock that protects the XFS_INEW flag forms the
 562         * memory barrier that ensures this detection works correctly at lookup
 563         * time.
 564         */
 565        iflags = XFS_INEW;
 566        if (flags & XFS_IGET_DONTCACHE)
 567                d_mark_dontcache(VFS_I(ip));
 568        ip->i_udquot = NULL;
 569        ip->i_gdquot = NULL;
 570        ip->i_pdquot = NULL;
 571        xfs_iflags_set(ip, iflags);
 572
 573        /* insert the new inode */
 574        spin_lock(&pag->pag_ici_lock);
 575        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 576        if (unlikely(error)) {
 577                WARN_ON(error != -EEXIST);
 578                XFS_STATS_INC(mp, xs_ig_dup);
 579                error = -EAGAIN;
 580                goto out_preload_end;
 581        }
 582        spin_unlock(&pag->pag_ici_lock);
 583        radix_tree_preload_end();
 584
 585        *ipp = ip;
 586        return 0;
 587
 588out_preload_end:
 589        spin_unlock(&pag->pag_ici_lock);
 590        radix_tree_preload_end();
 591        if (lock_flags)
 592                xfs_iunlock(ip, lock_flags);
 593out_destroy:
 594        __destroy_inode(VFS_I(ip));
 595        xfs_inode_free(ip);
 596        return error;
 597}
 598
 599/*
 600 * Look up an inode by number in the given file system.  The inode is looked up
 601 * in the cache held in each AG.  If the inode is found in the cache, initialise
 602 * the vfs inode if necessary.
 603 *
 604 * If it is not in core, read it in from the file system's device, add it to the
 605 * cache and initialise the vfs inode.
 606 *
 607 * The inode is locked according to the value of the lock_flags parameter.
 608 * Inode lookup is only done during metadata operations and not as part of the
 609 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 610 */
 611int
 612xfs_iget(
 613        struct xfs_mount        *mp,
 614        struct xfs_trans        *tp,
 615        xfs_ino_t               ino,
 616        uint                    flags,
 617        uint                    lock_flags,
 618        struct xfs_inode        **ipp)
 619{
 620        struct xfs_inode        *ip;
 621        struct xfs_perag        *pag;
 622        xfs_agino_t             agino;
 623        int                     error;
 624
 625        ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
 626
 627        /* reject inode numbers outside existing AGs */
 628        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 629                return -EINVAL;
 630
 631        XFS_STATS_INC(mp, xs_ig_attempts);
 632
 633        /* get the perag structure and ensure that it's inode capable */
 634        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 635        agino = XFS_INO_TO_AGINO(mp, ino);
 636
 637again:
 638        error = 0;
 639        rcu_read_lock();
 640        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 641
 642        if (ip) {
 643                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 644                if (error)
 645                        goto out_error_or_again;
 646        } else {
 647                rcu_read_unlock();
 648                if (flags & XFS_IGET_INCORE) {
 649                        error = -ENODATA;
 650                        goto out_error_or_again;
 651                }
 652                XFS_STATS_INC(mp, xs_ig_missed);
 653
 654                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 655                                                        flags, lock_flags);
 656                if (error)
 657                        goto out_error_or_again;
 658        }
 659        xfs_perag_put(pag);
 660
 661        *ipp = ip;
 662
 663        /*
 664         * If we have a real type for an on-disk inode, we can setup the inode
 665         * now.  If it's a new inode being created, xfs_ialloc will handle it.
 666         */
 667        if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
 668                xfs_setup_existing_inode(ip);
 669        return 0;
 670
 671out_error_or_again:
 672        if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
 673                delay(1);
 674                goto again;
 675        }
 676        xfs_perag_put(pag);
 677        return error;
 678}
 679
 680/*
 681 * "Is this a cached inode that's also allocated?"
 682 *
 683 * Look up an inode by number in the given file system.  If the inode is
 684 * in cache and isn't in purgatory, return 1 if the inode is allocated
 685 * and 0 if it is not.  For all other cases (not in cache, being torn
 686 * down, etc.), return a negative error code.
 687 *
 688 * The caller has to prevent inode allocation and freeing activity,
 689 * presumably by locking the AGI buffer.   This is to ensure that an
 690 * inode cannot transition from allocated to freed until the caller is
 691 * ready to allow that.  If the inode is in an intermediate state (new,
 692 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 693 * inode is not in the cache, -ENOENT will be returned.  The caller must
 694 * deal with these scenarios appropriately.
 695 *
 696 * This is a specialized use case for the online scrubber; if you're
 697 * reading this, you probably want xfs_iget.
 698 */
 699int
 700xfs_icache_inode_is_allocated(
 701        struct xfs_mount        *mp,
 702        struct xfs_trans        *tp,
 703        xfs_ino_t               ino,
 704        bool                    *inuse)
 705{
 706        struct xfs_inode        *ip;
 707        int                     error;
 708
 709        error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
 710        if (error)
 711                return error;
 712
 713        *inuse = !!(VFS_I(ip)->i_mode);
 714        xfs_irele(ip);
 715        return 0;
 716}
 717
 718/*
 719 * The inode lookup is done in batches to keep the amount of lock traffic and
  720 * radix tree lookups to a minimum. The batch size is a trade-off between
 721 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 722 * be too greedy.
 723 */
 724#define XFS_LOOKUP_BATCH        32
 725
 726/*
 727 * Decide if the given @ip is eligible to be a part of the inode walk, and
 728 * grab it if so.  Returns true if it's ready to go or false if we should just
 729 * ignore it.
 730 */
 731STATIC bool
 732xfs_inode_walk_ag_grab(
 733        struct xfs_inode        *ip,
 734        int                     flags)
 735{
 736        struct inode            *inode = VFS_I(ip);
 737        bool                    newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);
 738
 739        ASSERT(rcu_read_lock_held());
 740
 741        /* Check for stale RCU freed inode */
 742        spin_lock(&ip->i_flags_lock);
 743        if (!ip->i_ino)
 744                goto out_unlock_noent;
 745
 746        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
 747        if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
 748            __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
 749                goto out_unlock_noent;
 750        spin_unlock(&ip->i_flags_lock);
 751
 752        /* nothing to sync during shutdown */
 753        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 754                return false;
 755
  756        /* If we can't grab the inode, it must be on its way to reclaim. */
 757        if (!igrab(inode))
 758                return false;
 759
 760        /* inode is valid */
 761        return true;
 762
 763out_unlock_noent:
 764        spin_unlock(&ip->i_flags_lock);
 765        return false;
 766}
 767
 768/*
 769 * For a given per-AG structure @pag, grab, @execute, and rele all incore
 770 * inodes with the given radix tree @tag.
 771 */
 772STATIC int
 773xfs_inode_walk_ag(
 774        struct xfs_perag        *pag,
 775        int                     iter_flags,
 776        int                     (*execute)(struct xfs_inode *ip, void *args),
 777        void                    *args,
 778        int                     tag)
 779{
 780        struct xfs_mount        *mp = pag->pag_mount;
 781        uint32_t                first_index;
 782        int                     last_error = 0;
 783        int                     skipped;
 784        bool                    done;
 785        int                     nr_found;
 786
 787restart:
 788        done = false;
 789        skipped = 0;
 790        first_index = 0;
 791        nr_found = 0;
 792        do {
 793                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 794                int             error = 0;
 795                int             i;
 796
 797                rcu_read_lock();
 798
 799                if (tag == XFS_ICI_NO_TAG)
 800                        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 801                                        (void **)batch, first_index,
 802                                        XFS_LOOKUP_BATCH);
 803                else
 804                        nr_found = radix_tree_gang_lookup_tag(
 805                                        &pag->pag_ici_root,
 806                                        (void **) batch, first_index,
 807                                        XFS_LOOKUP_BATCH, tag);
 808
 809                if (!nr_found) {
 810                        rcu_read_unlock();
 811                        break;
 812                }
 813
 814                /*
  815                 * Grab the inodes before we drop the lock. If we found
  816                 * nothing, nr_found == 0 and the loop will be skipped.
 817                 */
 818                for (i = 0; i < nr_found; i++) {
 819                        struct xfs_inode *ip = batch[i];
 820
 821                        if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
 822                                batch[i] = NULL;
 823
 824                        /*
 825                         * Update the index for the next lookup. Catch
 826                         * overflows into the next AG range which can occur if
 827                         * we have inodes in the last block of the AG and we
 828                         * are currently pointing to the last inode.
 829                         *
 830                         * Because we may see inodes that are from the wrong AG
 831                         * due to RCU freeing and reallocation, only update the
  832                         * index if it lies in this AG. It was a race that led
 833                         * us to see this inode, so another lookup from the
 834                         * same index will not find it again.
 835                         */
 836                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
 837                                continue;
 838                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 839                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 840                                done = true;
 841                }
 842
 843                /* unlock now we've grabbed the inodes. */
 844                rcu_read_unlock();
 845
 846                for (i = 0; i < nr_found; i++) {
 847                        if (!batch[i])
 848                                continue;
 849                        if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
 850                            xfs_iflags_test(batch[i], XFS_INEW))
 851                                xfs_inew_wait(batch[i]);
 852                        error = execute(batch[i], args);
 853                        xfs_irele(batch[i]);
 854                        if (error == -EAGAIN) {
 855                                skipped++;
 856                                continue;
 857                        }
 858                        if (error && last_error != -EFSCORRUPTED)
 859                                last_error = error;
 860                }
 861
 862                /* bail out if the filesystem is corrupted.  */
 863                if (error == -EFSCORRUPTED)
 864                        break;
 865
 866                cond_resched();
 867
 868        } while (nr_found && !done);
 869
 870        if (skipped) {
 871                delay(1);
 872                goto restart;
 873        }
 874        return last_error;
 875}
 876
 877/* Fetch the next (possibly tagged) per-AG structure. */
 878static inline struct xfs_perag *
 879xfs_inode_walk_get_perag(
 880        struct xfs_mount        *mp,
 881        xfs_agnumber_t          agno,
 882        int                     tag)
 883{
 884        if (tag == XFS_ICI_NO_TAG)
 885                return xfs_perag_get(mp, agno);
 886        return xfs_perag_get_tag(mp, agno, tag);
 887}
 888
 889/*
 890 * Call the @execute function on all incore inodes matching the radix tree
 891 * @tag.
 892 */
 893int
 894xfs_inode_walk(
 895        struct xfs_mount        *mp,
 896        int                     iter_flags,
 897        int                     (*execute)(struct xfs_inode *ip, void *args),
 898        void                    *args,
 899        int                     tag)
 900{
 901        struct xfs_perag        *pag;
 902        int                     error = 0;
 903        int                     last_error = 0;
 904        xfs_agnumber_t          ag;
 905
 906        ag = 0;
 907        while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
 908                ag = pag->pag_agno + 1;
 909                error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
 910                xfs_perag_put(pag);
 911                if (error) {
 912                        last_error = error;
 913                        if (error == -EFSCORRUPTED)
 914                                break;
 915                }
 916        }
 917        return last_error;
 918}
 919
 920/*
 921 * Grab the inode for reclaim exclusively.
 922 *
 923 * We have found this inode via a lookup under RCU, so the inode may have
 924 * already been freed, or it may be in the process of being recycled by
 925 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 926 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 927 * will not be set. Hence we need to check for both these flag conditions to
 928 * avoid inodes that are no longer reclaim candidates.
 929 *
 930 * Note: checking for other state flags here, under the i_flags_lock or not, is
 931 * racy and should be avoided. Those races should be resolved only after we have
 932 * ensured that we are able to reclaim this inode and the world can see that we
 933 * are going to reclaim it.
 934 *
 935 * Return true if we grabbed it, false otherwise.
 936 */
 937static bool
 938xfs_reclaim_inode_grab(
 939        struct xfs_inode        *ip)
 940{
 941        ASSERT(rcu_read_lock_held());
 942
 943        spin_lock(&ip->i_flags_lock);
 944        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
 945            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
 946                /* not a reclaim candidate. */
 947                spin_unlock(&ip->i_flags_lock);
 948                return false;
 949        }
 950        __xfs_iflags_set(ip, XFS_IRECLAIM);
 951        spin_unlock(&ip->i_flags_lock);
 952        return true;
 953}
 954
 955/*
 956 * Inode reclaim is non-blocking, so the default action if progress cannot be
 957 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 958 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
  959 * blocking anymore, and hence we can wait for the inode to become reclaimable
  960 * (e.g. wait for it to be unpinned) and then reclaim it.
 961 *
 962 * We do no IO here - if callers require inodes to be cleaned they must push the
 963 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 964 * done in the background in a non-blocking manner, and enables memory reclaim
 965 * to make progress without blocking.
 966 */
 967static void
 968xfs_reclaim_inode(
 969        struct xfs_inode        *ip,
 970        struct xfs_perag        *pag)
 971{
 972        xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
 973
 974        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 975                goto out;
 976        if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
 977                goto out_iunlock;
 978
 979        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 980                xfs_iunpin_wait(ip);
 981                xfs_iflush_abort(ip);
 982                goto reclaim;
 983        }
 984        if (xfs_ipincount(ip))
 985                goto out_clear_flush;
 986        if (!xfs_inode_clean(ip))
 987                goto out_clear_flush;
 988
 989        xfs_iflags_clear(ip, XFS_IFLUSHING);
 990reclaim:
 991
 992        /*
 993         * Because we use RCU freeing we need to ensure the inode always appears
 994         * to be reclaimed with an invalid inode number when in the free state.
 995         * We do this as early as possible under the ILOCK so that
 996         * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
 997         * detect races with us here. By doing this, we guarantee that once
 998         * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
 999         * it will see either a valid inode that will serialise correctly, or it
1000         * will see an invalid inode that it can skip.
1001         */
1002        spin_lock(&ip->i_flags_lock);
1003        ip->i_flags = XFS_IRECLAIM;
1004        ip->i_ino = 0;
1005        spin_unlock(&ip->i_flags_lock);
1006
1007        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1008
1009        XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
1010        /*
1011         * Remove the inode from the per-AG radix tree.
1012         *
1013         * Because radix_tree_delete won't complain even if the item was never
 1014         * added to the tree, assert that it has been there before, to catch
 1015         * problems with the inode lifetime early on.
1016         */
1017        spin_lock(&pag->pag_ici_lock);
1018        if (!radix_tree_delete(&pag->pag_ici_root,
1019                                XFS_INO_TO_AGINO(ip->i_mount, ino)))
1020                ASSERT(0);
1021        xfs_perag_clear_reclaim_tag(pag);
1022        spin_unlock(&pag->pag_ici_lock);
1023
1024        /*
1025         * Here we do an (almost) spurious inode lock in order to coordinate
1026         * with inode cache radix tree lookups.  This is because the lookup
1027         * can reference the inodes in the cache without taking references.
1028         *
1029         * We make that OK here by ensuring that we wait until the inode is
1030         * unlocked after the lookup before we go ahead and free it.
1031         */
1032        xfs_ilock(ip, XFS_ILOCK_EXCL);
1033        xfs_qm_dqdetach(ip);
1034        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1035        ASSERT(xfs_inode_clean(ip));
1036
1037        __xfs_inode_free(ip);
1038        return;
1039
1040out_clear_flush:
1041        xfs_iflags_clear(ip, XFS_IFLUSHING);
1042out_iunlock:
1043        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1044out:
1045        xfs_iflags_clear(ip, XFS_IRECLAIM);
1046}
1047
1048/*
1049 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
1050 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 1051 * then a shutdown during the filesystem unmount reclaim walk would leak all
 1052 * the unreclaimed inodes.
 1053 *
 1054 * The walk does not return a status; callers that need to block until all
 1055 * reclaimable inodes have been reclaimed (such as xfs_reclaim_inodes())
 1056 * simply loop until no AG carries the reclaim tag any more.
1057 */
1058static void
1059xfs_reclaim_inodes_ag(
1060        struct xfs_mount        *mp,
1061        int                     *nr_to_scan)
1062{
1063        struct xfs_perag        *pag;
1064        xfs_agnumber_t          ag = 0;
1065
1066        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1067                unsigned long   first_index = 0;
1068                int             done = 0;
1069                int             nr_found = 0;
1070
1071                ag = pag->pag_agno + 1;
1072
1073                first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1074                do {
1075                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1076                        int     i;
1077
1078                        rcu_read_lock();
1079                        nr_found = radix_tree_gang_lookup_tag(
1080                                        &pag->pag_ici_root,
1081                                        (void **)batch, first_index,
1082                                        XFS_LOOKUP_BATCH,
1083                                        XFS_ICI_RECLAIM_TAG);
1084                        if (!nr_found) {
1085                                done = 1;
1086                                rcu_read_unlock();
1087                                break;
1088                        }
1089
1090                        /*
 1091                         * Grab the inodes before we drop the lock. If we found
 1092                         * nothing, nr_found == 0 and the loop will be skipped.
1093                         */
1094                        for (i = 0; i < nr_found; i++) {
1095                                struct xfs_inode *ip = batch[i];
1096
1097                                if (done || !xfs_reclaim_inode_grab(ip))
1098                                        batch[i] = NULL;
1099
1100                                /*
1101                                 * Update the index for the next lookup. Catch
1102                                 * overflows into the next AG range which can
1103                                 * occur if we have inodes in the last block of
1104                                 * the AG and we are currently pointing to the
1105                                 * last inode.
1106                                 *
1107                                 * Because we may see inodes that are from the
1108                                 * wrong AG due to RCU freeing and
1109                                 * reallocation, only update the index if it
 1110                                 * lies in this AG. It was a race that led us
1111                                 * to see this inode, so another lookup from
1112                                 * the same index will not find it again.
1113                                 */
1114                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
1115                                                                pag->pag_agno)
1116                                        continue;
1117                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1118                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1119                                        done = 1;
1120                        }
1121
1122                        /* unlock now we've grabbed the inodes. */
1123                        rcu_read_unlock();
1124
1125                        for (i = 0; i < nr_found; i++) {
1126                                if (batch[i])
1127                                        xfs_reclaim_inode(batch[i], pag);
1128                        }
1129
1130                        *nr_to_scan -= XFS_LOOKUP_BATCH;
1131                        cond_resched();
1132                } while (nr_found && !done && *nr_to_scan > 0);
1133
1134                if (done)
1135                        first_index = 0;
1136                WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1137                xfs_perag_put(pag);
1138        }
1139}
1140
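/*
 * Reclaim all reclaimable inodes, blocking until they are gone: keep pushing
 * the AIL to clean dirty inodes and rescanning until no AG carries the
 * reclaim tag any more (e.g. during unmount).
 */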
1141void
1142xfs_reclaim_inodes(
1143        struct xfs_mount        *mp)
1144{
1145        int             nr_to_scan = INT_MAX;
1146
1147        while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
1148                xfs_ail_push_all_sync(mp->m_ail);
1149                xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1150        }
1151}
1152
1153/*
1154 * The shrinker infrastructure determines how many inodes we should scan for
1155 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
1156 * push the AIL here. We also want to proactively free up memory if we can to
 1157 * minimise the amount of work memory reclaim has to do, so we kick the
1158 * background reclaim if it isn't already scheduled.
1159 */
1160long
1161xfs_reclaim_inodes_nr(
1162        struct xfs_mount        *mp,
1163        int                     nr_to_scan)
1164{
1165        /* kick background reclaimer and push the AIL */
1166        xfs_reclaim_work_queue(mp);
1167        xfs_ail_push_all(mp->m_ail);
1168
1169        xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1170        return 0;
1171}
1172
1173/*
1174 * Return the number of reclaimable inodes in the filesystem for
1175 * the shrinker to determine how much to reclaim.
1176 */
1177int
1178xfs_reclaim_inodes_count(
1179        struct xfs_mount        *mp)
1180{
1181        struct xfs_perag        *pag;
1182        xfs_agnumber_t          ag = 0;
1183        int                     reclaimable = 0;
1184
1185        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1186                ag = pag->pag_agno + 1;
1187                reclaimable += pag->pag_ici_reclaimable;
1188                xfs_perag_put(pag);
1189        }
1190        return reclaimable;
1191}
1192
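/*
 * Match an inode against the uid/gid/prid filters in @eofb.  Every filter
 * that is set must match for the inode to be selected (intersection
 * semantics); compare with xfs_inode_match_id_union() below.
 */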
1193STATIC bool
1194xfs_inode_match_id(
1195        struct xfs_inode        *ip,
1196        struct xfs_eofblocks    *eofb)
1197{
1198        if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1199            !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1200                return false;
1201
1202        if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1203            !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1204                return false;
1205
1206        if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1207            ip->i_projid != eofb->eof_prid)
1208                return false;
1209
1210        return true;
1211}
1212
1213/*
1214 * A union-based inode filtering algorithm. Process the inode if any of the
1215 * criteria match. This is for global/internal scans only.
1216 */
1217STATIC bool
1218xfs_inode_match_id_union(
1219        struct xfs_inode        *ip,
1220        struct xfs_eofblocks    *eofb)
1221{
1222        if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1223            uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1224                return true;
1225
1226        if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1227            gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1228                return true;
1229
1230        if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1231            ip->i_projid == eofb->eof_prid)
1232                return true;
1233
1234        return false;
1235}
1236
1237/*
1238 * Is this inode @ip eligible for eof/cow block reclamation, given some
1239 * filtering parameters @eofb?  The inode is eligible if @eofb is null or
1240 * if the predicate functions match.
1241 */
1242static bool
1243xfs_inode_matches_eofb(
1244        struct xfs_inode        *ip,
1245        struct xfs_eofblocks    *eofb)
1246{
1247        bool                    match;
1248
1249        if (!eofb)
1250                return true;
1251
1252        if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1253                match = xfs_inode_match_id_union(ip, eofb);
1254        else
1255                match = xfs_inode_match_id(ip, eofb);
1256        if (!match)
1257                return false;
1258
1259        /* skip the inode if the file size is too small */
1260        if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
1261            XFS_ISIZE(ip) < eofb->eof_min_file_size)
1262                return false;
1263
1264        return true;
1265}
1266
1267/*
1268 * This is a fast pass over the inode cache to try to get reclaim moving on as
1269 * many inodes as possible in a short period of time. It kicks itself every few
1270 * seconds, as well as being kicked by the inode cache shrinker when memory
1271 * goes low.
1272 */
1273void
1274xfs_reclaim_worker(
1275        struct work_struct *work)
1276{
1277        struct xfs_mount *mp = container_of(to_delayed_work(work),
1278                                        struct xfs_mount, m_reclaim_work);
1279        int             nr_to_scan = INT_MAX;
1280
1281        xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1282        xfs_reclaim_work_queue(mp);
1283}
1284
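/*
 * Scan callback that trims post-EOF preallocations from a single inode.
 * Returns 0 if the inode was skipped or had nothing to free, -EAGAIN if a
 * waiting (sync) scan should revisit it, or the result of
 * xfs_free_eofblocks().
 */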
1285STATIC int
1286xfs_inode_free_eofblocks(
1287        struct xfs_inode        *ip,
1288        void                    *args,
1289        unsigned int            *lockflags)
1290{
1291        struct xfs_eofblocks    *eofb = args;
1292        bool                    wait;
1293
1294        wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
1295
1296        if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1297                return 0;
1298
1299        /*
1300         * If the mapping is dirty the operation can block and wait for some
1301         * time. Unless we are waiting, skip it.
1302         */
1303        if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1304                return 0;
1305
1306        if (!xfs_inode_matches_eofb(ip, eofb))
1307                return 0;
1308
1309        /*
1310         * If the caller is waiting, return -EAGAIN to keep the background
1311         * scanner moving and revisit the inode in a subsequent pass.
1312         */
1313        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1314                if (wait)
1315                        return -EAGAIN;
1316                return 0;
1317        }
1318        *lockflags |= XFS_IOLOCK_EXCL;
1319
1320        if (xfs_can_free_eofblocks(ip, false))
1321                return xfs_free_eofblocks(ip);
1322
1323        /* inode could be preallocated or append-only */
1324        trace_xfs_inode_free_eofblocks_invalid(ip);
1325        xfs_inode_clear_eofblocks_tag(ip);
1326        return 0;
1327}
1328
1329/*
1330 * Background scanning to trim preallocated space. This is queued based on the
1331 * 'speculative_prealloc_lifetime' tunable (5m by default).
1332 */
1333static inline void
1334xfs_blockgc_queue(
1335        struct xfs_perag        *pag)
1336{
1337        rcu_read_lock();
1338        if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
1339                queue_delayed_work(pag->pag_mount->m_gc_workqueue,
1340                                   &pag->pag_blockgc_work,
1341                                   msecs_to_jiffies(xfs_blockgc_secs * 1000));
1342        rcu_read_unlock();
1343}
1344
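/*
 * Set a blockgc inode flag (XFS_IEOFBLOCKS or XFS_ICOWBLOCKS) and tag the
 * inode in the per-AG radix tree.  The first tagged inode in an AG also
 * propagates the tag up into the perag tree and schedules the background
 * blockgc worker.
 */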
1345static void
1346xfs_blockgc_set_iflag(
1347        struct xfs_inode        *ip,
1348        unsigned long           iflag)
1349{
1350        struct xfs_mount        *mp = ip->i_mount;
1351        struct xfs_perag        *pag;
1352        int                     tagged;
1353
1354        ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1355
1356        /*
1357         * Don't bother locking the AG and looking up in the radix trees
1358         * if we already know that we have the tag set.
1359         */
1360        if (ip->i_flags & iflag)
1361                return;
1362        spin_lock(&ip->i_flags_lock);
1363        ip->i_flags |= iflag;
1364        spin_unlock(&ip->i_flags_lock);
1365
1366        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1367        spin_lock(&pag->pag_ici_lock);
1368
1369        tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG);
1370        radix_tree_tag_set(&pag->pag_ici_root,
1371                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1372                           XFS_ICI_BLOCKGC_TAG);
1373        if (!tagged) {
1374                /* propagate the blockgc tag up into the perag radix tree */
1375                spin_lock(&ip->i_mount->m_perag_lock);
1376                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1377                                   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1378                                   XFS_ICI_BLOCKGC_TAG);
1379                spin_unlock(&ip->i_mount->m_perag_lock);
1380
1381                /* kick off background trimming */
1382                xfs_blockgc_queue(pag);
1383
1384                trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1,
1385                                _RET_IP_);
1386        }
1387
1388        spin_unlock(&pag->pag_ici_lock);
1389        xfs_perag_put(pag);
1390}
1391
1392void
1393xfs_inode_set_eofblocks_tag(
1394        xfs_inode_t     *ip)
1395{
1396        trace_xfs_inode_set_eofblocks_tag(ip);
1397        return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1398}
1399
1400static void
1401xfs_blockgc_clear_iflag(
1402        struct xfs_inode        *ip,
1403        unsigned long           iflag)
1404{
1405        struct xfs_mount        *mp = ip->i_mount;
1406        struct xfs_perag        *pag;
1407        bool                    clear_tag;
1408
1409        ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1410
1411        spin_lock(&ip->i_flags_lock);
1412        ip->i_flags &= ~iflag;
1413        clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
1414        spin_unlock(&ip->i_flags_lock);
1415
1416        if (!clear_tag)
1417                return;
1418
1419        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1420        spin_lock(&pag->pag_ici_lock);
1421
1422        radix_tree_tag_clear(&pag->pag_ici_root,
1423                             XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1424                             XFS_ICI_BLOCKGC_TAG);
1425        if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) {
1426                /* clear the blockgc tag from the perag radix tree */
1427                spin_lock(&ip->i_mount->m_perag_lock);
1428                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1429                                     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1430                                     XFS_ICI_BLOCKGC_TAG);
1431                spin_unlock(&ip->i_mount->m_perag_lock);
1432                trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1,
1433                                _RET_IP_);
1434        }
1435
1436        spin_unlock(&pag->pag_ici_lock);
1437        xfs_perag_put(pag);
1438}
1439
1440void
1441xfs_inode_clear_eofblocks_tag(
1442        xfs_inode_t     *ip)
1443{
1444        trace_xfs_inode_clear_eofblocks_tag(ip);
1445        return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1446}
1447
1448/*
1449 * Set ourselves up to free CoW blocks from this file.  If it's already clean
1450 * then we can bail out quickly, but otherwise we must back off if the file
1451 * is undergoing some kind of write.
1452 */
1453static bool
1454xfs_prep_free_cowblocks(
1455        struct xfs_inode        *ip)
1456{
1457        /*
1458         * Just clear the tag if we have an empty cow fork or none at all. It's
1459         * possible the inode was fully unshared since it was originally tagged.
1460         */
1461        if (!xfs_inode_has_cow_data(ip)) {
1462                trace_xfs_inode_free_cowblocks_invalid(ip);
1463                xfs_inode_clear_cowblocks_tag(ip);
1464                return false;
1465        }
1466
1467        /*
1468         * If the mapping is dirty or under writeback we cannot touch the
1469         * CoW fork.  Leave it alone if we're in the midst of a directio.
1470         */
1471        if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
1472            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1473            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
1474            atomic_read(&VFS_I(ip)->i_dio_count))
1475                return false;
1476
1477        return true;
1478}
1479
1480/*
1481 * Automatic CoW Reservation Freeing
1482 *
1483 * These functions automatically garbage collect leftover CoW reservations
1484 * that were made on behalf of a cowextsize hint when we start to run out
1485 * of quota or when the reservations sit around for too long.  If the file
1486 * has dirty pages or is undergoing writeback, its CoW reservations will
1487 * be retained.
1488 *
1489 * The actual garbage collection piggybacks off the same code that runs
1490 * the speculative EOF preallocation garbage collector.
1491 */
1492STATIC int
1493xfs_inode_free_cowblocks(
1494        struct xfs_inode        *ip,
1495        void                    *args,
1496        unsigned int            *lockflags)
1497{
1498        struct xfs_eofblocks    *eofb = args;
1499        bool                    wait;
1500        int                     ret = 0;
1501
1502        wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
1503
1504        if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1505                return 0;
1506
1507        if (!xfs_prep_free_cowblocks(ip))
1508                return 0;
1509
1510        if (!xfs_inode_matches_eofb(ip, eofb))
1511                return 0;
1512
1513        /*
1514         * If the caller is waiting, return -EAGAIN to keep the background
1515         * scanner moving and revisit the inode in a subsequent pass.
1516         */
1517        if (!(*lockflags & XFS_IOLOCK_EXCL) &&
1518            !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1519                if (wait)
1520                        return -EAGAIN;
1521                return 0;
1522        }
1523        *lockflags |= XFS_IOLOCK_EXCL;
1524
1525        if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1526                if (wait)
1527                        return -EAGAIN;
1528                return 0;
1529        }
1530        *lockflags |= XFS_MMAPLOCK_EXCL;
1531
1532        /*
1533         * Check again: nobody else should be able to dirty blocks or change
1534         * the reflink iflag now that we hold the first two locks.
1535         */
1536        if (xfs_prep_free_cowblocks(ip))
1537                ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1538        return ret;
1539}
1540
1541void
1542xfs_inode_set_cowblocks_tag(
1543        struct xfs_inode        *ip)
1544{
1545        trace_xfs_inode_set_cowblocks_tag(ip);
1546        xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
1547}
1548
1549void
1550xfs_inode_clear_cowblocks_tag(
1551        struct xfs_inode        *ip)
1552{
1553        trace_xfs_inode_clear_cowblocks_tag(ip);
1554        xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1555}
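
/*
 * Illustrative only: a minimal sketch (not in-tree code) showing how the two
 * wrappers above are meant to be paired.  A hypothetical caller that has just
 * remapped or cancelled CoW extents would keep the ICOWBLOCKS tag in sync
 * with whether the CoW fork still holds any data.
 */
static void
xfs_example_sync_cowblocks_tag(
        struct xfs_inode        *ip)
{
        if (xfs_inode_has_cow_data(ip))
                xfs_inode_set_cowblocks_tag(ip);
        else
                xfs_inode_clear_cowblocks_tag(ip);
}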
1556
1557#define for_each_perag_tag(mp, next_agno, pag, tag) \
1558        for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
1559                (pag) != NULL; \
1560                (next_agno) = (pag)->pag_agno + 1, \
1561                xfs_perag_put(pag), \
1562                (pag) = xfs_perag_get_tag((mp), (next_agno), (tag)))
1563
1564
1565/* Disable post-EOF and CoW block auto-reclamation. */
1566void
1567xfs_blockgc_stop(
1568        struct xfs_mount        *mp)
1569{
1570        struct xfs_perag        *pag;
1571        xfs_agnumber_t          agno;
1572
1573        for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1574                cancel_delayed_work_sync(&pag->pag_blockgc_work);
1575}
1576
1577/* Enable post-EOF and CoW block auto-reclamation. */
1578void
1579xfs_blockgc_start(
1580        struct xfs_mount        *mp)
1581{
1582        struct xfs_perag        *pag;
1583        xfs_agnumber_t          agno;
1584
1585        for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1586                xfs_blockgc_queue(pag);
1587}
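
/*
 * Illustrative only: the two functions above are meant to be used as a pair
 * around periods when no new blockgc work may start (for example, freeze or
 * a read-only remount).  This hypothetical wrapper sketches that pairing; it
 * is not the real freeze/thaw code.
 */
static void
xfs_example_toggle_blockgc(
        struct xfs_mount        *mp,
        bool                    enable)
{
        if (enable)
                xfs_blockgc_start(mp);
        else
                xfs_blockgc_stop(mp);
}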
1588
1589/* Scan one incore inode for block preallocations that we can remove. */
1590static int
1591xfs_blockgc_scan_inode(
1592        struct xfs_inode        *ip,
1593        void                    *args)
1594{
1595        unsigned int            lockflags = 0;
1596        int                     error;
1597
1598        error = xfs_inode_free_eofblocks(ip, args, &lockflags);
1599        if (error)
1600                goto unlock;
1601
1602        error = xfs_inode_free_cowblocks(ip, args, &lockflags);
1603unlock:
1604        if (lockflags)
1605                xfs_iunlock(ip, lockflags);
1606        return error;
1607}
1608
1609/* Background worker that trims preallocated space. */
1610void
1611xfs_blockgc_worker(
1612        struct work_struct      *work)
1613{
1614        struct xfs_perag        *pag = container_of(to_delayed_work(work),
1615                                        struct xfs_perag, pag_blockgc_work);
1616        struct xfs_mount        *mp = pag->pag_mount;
1617        int                     error;
1618
1619        if (!sb_start_write_trylock(mp->m_super))
1620                return;
1621        error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL,
1622                        XFS_ICI_BLOCKGC_TAG);
1623        if (error)
1624                xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1625                                pag->pag_agno, error);
1626        sb_end_write(mp->m_super);
1627        xfs_blockgc_queue(pag);
1628}
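
/*
 * Illustrative only: a hypothetical helper (not in-tree) showing how the
 * per-AG delayed work above could be drained with the for_each_perag_tag
 * iterator, e.g. if a caller wanted every pending blockgc pass to complete
 * before sampling free space counters.
 */
static void
xfs_example_flush_blockgc(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag;
        xfs_agnumber_t          agno;

        for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
                flush_delayed_work(&pag->pag_blockgc_work);
}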
1629
1630/*
1631 * Try to free space in the filesystem by purging eofblocks and cowblocks.
1632 */
1633int
1634xfs_blockgc_free_space(
1635        struct xfs_mount        *mp,
1636        struct xfs_eofblocks    *eofb)
1637{
1638        trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
1639
1640        return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb,
1641                        XFS_ICI_BLOCKGC_TAG);
1642}
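
/*
 * Illustrative only: a sketch (not in-tree code) of building a filter for the
 * walk above.  This assumes the XFS_EOF_FLAGS_MINFILESIZE filter flag and the
 * eof_min_file_size field of struct xfs_eofblocks; it requests a synchronous
 * scan that skips files smaller than min_size.
 */
static int
xfs_example_free_space_over(
        struct xfs_mount        *mp,
        __u64                   min_size)
{
        struct xfs_eofblocks    eofb = {
                .eof_flags              = XFS_EOF_FLAGS_SYNC |
                                          XFS_EOF_FLAGS_MINFILESIZE,
                .eof_min_file_size      = min_size,
        };

        return xfs_blockgc_free_space(mp, &eofb);
}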
1643
1644/*
1645 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
1646 * quota caused an allocation failure, so we make a best effort by including
1647 * each quota under low free space conditions (less than 1% free space) in the
1648 * scan.
1649 *
1650 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
1651 * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or
1652 * MMAPLOCK.
1653 */
1654int
1655xfs_blockgc_free_dquots(
1656        struct xfs_mount        *mp,
1657        struct xfs_dquot        *udqp,
1658        struct xfs_dquot        *gdqp,
1659        struct xfs_dquot        *pdqp,
1660        unsigned int            eof_flags)
1661{
1662        struct xfs_eofblocks    eofb = {0};
1663        bool                    do_work = false;
1664
1665        if (!udqp && !gdqp && !pdqp)
1666                return 0;
1667
1668        /*
1669         * Run a scan to free blocks, using the union filter so that a single
1670         * pass covers every applicable quota.
1671         */
1672        eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
1673
1674        if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1675                eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
1676                eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1677                do_work = true;
1678        }
1679
1680        if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1681                eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
1682                eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1683                do_work = true;
1684        }
1685
1686        if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1687                eofb.eof_prid = pdqp->q_id;
1688                eofb.eof_flags |= XFS_EOF_FLAGS_PRID;
1689                do_work = true;
1690        }
1691
1692        if (!do_work)
1693                return 0;
1694
1695        return xfs_blockgc_free_space(mp, &eofb);
1696}
1697
1698/* Run cow/eofblocks scans on the quotas attached to the inode. */
1699int
1700xfs_blockgc_free_quota(
1701        struct xfs_inode        *ip,
1702        unsigned int            eof_flags)
1703{
1704        return xfs_blockgc_free_dquots(ip->i_mount,
1705                        xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1706                        xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1707                        xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags);
1708}
1709
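/*
 * Illustrative only: a hedged sketch of the retry pattern that
 * xfs_blockgc_free_quota() is designed for.  If a quota reservation fails
 * with -EDQUOT or -ENOSPC, flush speculative preallocations charged to the
 * inode's dquots and retry exactly once.  The reserve callback and the helper
 * name are hypothetical, and the caller must not hold the inode's ILOCK here,
 * per the locking comment above xfs_blockgc_free_dquots().
 */
static int
xfs_example_reserve_retry(
        struct xfs_inode        *ip,
        int                     (*reserve)(struct xfs_inode *ip))
{
        bool                    retried = false;
        int                     error;

retry:
        error = reserve(ip);
        if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
                xfs_blockgc_free_quota(ip, 0);
                retried = true;
                goto retry;
        }
        return error;
}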