linux/fs/xfs/xfs_icache.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"

#include <linux/iversion.h>

/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes.  These can correspond with incore inode
 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals that are not related to tags; these must be < 0. */
	XFS_ICWALK_DQRELE	= -1,

	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};

#define XFS_ICWALK_NULL_TAG	(-1U)

/* Compute the inode radix tree tag for this goal. */
static inline unsigned int
xfs_icwalk_tag(enum xfs_icwalk_goal goal)
{
	return goal < 0 ? XFS_ICWALK_NULL_TAG : goal;
}

static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */
#define XFS_ICWALK_FLAG_DROP_UDQUOT	(1U << 31)
#define XFS_ICWALK_FLAG_DROP_GDQUOT	(1U << 30)
#define XFS_ICWALK_FLAG_DROP_PDQUOT	(1U << 29)

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_DROP_UDQUOT | \
					 XFS_ICWALK_FLAG_DROP_GDQUOT | \
					 XFS_ICWALK_FLAG_DROP_PDQUOT | \
					 XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
	ip->i_nblocks = 0;
	ip->i_forkoff = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	if (ip->i_afp) {
		xfs_idestroy_fork(ip->i_afp);
		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
	}
	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and there
 * isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * Background scanning to trim preallocated space. This is queued based on the
 * 'speculative_prealloc_lifetime' tunable (5m by default).
 */
static inline void
xfs_blockgc_queue(
	struct xfs_perag	*pag)
{
	rcu_read_lock();
	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
		queue_delayed_work(pag->pag_mount->m_gc_workqueue,
				   &pag->pag_blockgc_work,
				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
	rcu_read_unlock();
}

/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_set_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	bool			was_tagged;

	lockdep_assert_held(&pag->pag_ici_lock);

	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable++;

	if (was_tagged)
		return;

	/* propagate the tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	/* start background work */
	switch (tag) {
	case XFS_ICI_RECLAIM_TAG:
		xfs_reclaim_work_queue(mp);
		break;
	case XFS_ICI_BLOCKGC_TAG:
		xfs_blockgc_queue(pag);
		break;
	}

	trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_clear_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);

	/*
	 * Reclaim can signal (with a null agino) that it cleared its own tag
	 * by removing the inode from the radix tree.
	 */
	if (agino != NULLAGINO)
		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
	else
		ASSERT(tag == XFS_ICI_RECLAIM_TAG);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable--;

	if (radix_tree_tagged(&pag->pag_ici_root, tag))
		return;

	/* clear the tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_mark_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

static inline void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact that we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state.  Drops the i_flags_lock and the rcu read lock.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode.  We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	if (error) {
		bool	wake;

		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble.  Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		wake = !!__xfs_iflags_test(ip, XFS_INEW);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		if (wake)
			wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *           wait_on_inode to wait for these flags to be cleared
	 *           instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM))
		goto out_skip;

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/* Drops i_flags_lock and RCU read lock. */
		error = xfs_iget_recycle(pag, ip);
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}

static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the i_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
	    (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip,
				xfs_buf_offset(bp, ip->i_imap.im_boffset));
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.  The inode is looked up
 * in the cache held in each AG.  If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.  If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
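
/*
 * Illustrative sketch, not part of the original file: roughly how a
 * metadata-path caller might use xfs_iget().  The helper name and the use
 * of a NULL transaction are assumptions made for the example; the
 * XFS_ILOCK_SHARED/xfs_irele() pairing follows the locking rules in the
 * comment above.
 */
#if 0
static int
example_inspect_inode(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;
	int			error;

	/* Look up the inode, taking the shared ILOCK on the way in. */
	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;

	/* ... examine the inode while the ILOCK is held ... */

	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	xfs_irele(ip);
	return 0;
}
#endif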

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, set *inuse to true if the inode is
 * allocated and false if it is not, then return 0.  For all other cases
 * (not in cache, being torn down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.  This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

#ifdef CONFIG_XFS_QUOTA
/* Decide if we want to grab this inode to drop its dquots. */
static bool
xfs_dqrele_igrab(
	struct xfs_inode	*ip)
{
	bool			ret = false;

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock;

	/*
	 * Skip inodes that are anywhere in the reclaim machinery because we
	 * drop dquots before tagging an inode for reclamation.
	 */
	if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE))
		goto out_unlock;

	/*
	 * The inode looks alive; try to grab a VFS reference so that it won't
	 * get destroyed.  If we got the reference, return true to say that
	 * we grabbed the inode.
	 *
	 * If we can't get the reference, then we know the inode had its VFS
	 * state torn down and hasn't yet entered the reclaim machinery.  Since
	 * we also know that dquots are detached from an inode before it enters
	 * reclaim, we can skip the inode.
	 */
	ret = igrab(VFS_I(ip)) != NULL;

out_unlock:
	spin_unlock(&ip->i_flags_lock);
	return ret;
}

/* Drop this inode's dquots. */
static void
xfs_dqrele_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if (xfs_iflags_test(ip, XFS_INEW))
		xfs_inew_wait(ip);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) {
		xfs_qm_dqrele(ip->i_udquot);
		ip->i_udquot = NULL;
	}
	if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) {
		xfs_qm_dqrele(ip->i_gdquot);
		ip->i_gdquot = NULL;
	}
	if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) {
		xfs_qm_dqrele(ip->i_pdquot);
		ip->i_pdquot = NULL;
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_irele(ip);
}

/*
 * Detach all dquots from incore inodes if we can.  The caller must already
 * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will
 * not get reattached.
 */
int
xfs_dqrele_all_inodes(
	struct xfs_mount	*mp,
	unsigned int		qflags)
{
	struct xfs_icwalk	icw = { .icw_flags = 0 };

	if (qflags & XFS_UQUOTA_ACCT)
		icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT;
	if (qflags & XFS_GQUOTA_ACCT)
		icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT;
	if (qflags & XFS_PQUOTA_ACCT)
		icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;

	return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw);
}
#else
# define xfs_dqrele_igrab(ip)		(false)
# define xfs_dqrele_inode(ip, priv)	((void)0)
#endif /* CONFIG_XFS_QUOTA */
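
/*
 * Illustrative sketch: the quotaoff path is the expected caller of
 * xfs_dqrele_all_inodes(), passing the accounting flags being turned off
 * so the walk drops only the matching dquot references.  The helper name
 * is made up; the real call site lives in the quota code, not here.
 */
#if 0
static int
example_shed_dquots(
	struct xfs_mount	*mp)
{
	/* Turning off user and group accounting: shed those dquots. */
	return xfs_dqrele_all_inodes(mp, XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT);
}
#endif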

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 * will not be set. Hence we need to check for both these flag conditions to
 * avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided. Those races should be resolved only after we have
 * ensured that we are able to reclaim this inode and the world can see that we
 * are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to be able to reclaim
 * it.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it was there before, to catch
	 * problems with the inode lifetime early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

/* Reclaim sick inodes if we're unmounting or the fs went down. */
static inline bool
xfs_want_reclaim_sick(
	struct xfs_mount	*mp)
{
	return (mp->m_flags & XFS_MOUNT_UNMOUNTING) ||
	       (mp->m_flags & XFS_MOUNT_NORECOVERY) ||
	       XFS_FORCED_SHUTDOWN(mp);
}

void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= 0,
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here. We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	unsigned long		nr_to_scan)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
long
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	long			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}
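
/*
 * Illustrative sketch: the superblock shrinker glue in xfs_super.c drives
 * the two functions above, roughly as below.  ->nr_cached_objects asks
 * "how much could you free?" and ->free_cached_objects performs a bounded
 * scan.  The function names here are stand-ins for the real callbacks.
 */
#if 0
static long
example_nr_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Count the inodes tagged reclaimable across all AGs. */
	return xfs_reclaim_inodes_count(XFS_M(sb));
}

static long
example_free_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Run a reclaim walk bounded by the shrinker's scan target. */
	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}
#endif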

STATIC bool
xfs_icwalk_match_id(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid != icw->icw_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match. This is for global/internal scans only.
 */
STATIC bool
xfs_icwalk_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid == icw->icw_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @icw?  The inode is eligible if @icw is null or
 * if the predicate functions match.
 */
static bool
xfs_icwalk_match(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	bool			match;

	if (!icw)
		return true;

	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
		match = xfs_icwalk_match_id_union(ip, icw);
	else
		match = xfs_icwalk_match_id(ip, icw);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
	    XFS_ISIZE(ip) < icw->icw_min_file_size)
		return false;

	return true;
}
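
/*
 * Illustrative sketch: constructing a filter that xfs_icwalk_match()
 * would accept.  This mirrors how userspace-driven eofblocks scans are
 * expressed as a struct xfs_icwalk; the helper name and the 64k size
 * cutoff are made up for the example.
 */
#if 0
static int
example_trim_uid_files(
	struct xfs_mount	*mp,
	kuid_t			uid)
{
	struct xfs_icwalk	icw = {
		.icw_flags		= XFS_ICWALK_FLAG_UID |
					  XFS_ICWALK_FLAG_MINFILESIZE,
		.icw_uid		= uid,
		/* only bother with files larger than 64k */
		.icw_min_file_size	= 65536,
	};

	/* Free post-EOF and CoW blocks on every matching inode. */
	return xfs_blockgc_free_space(mp, &icw);
}
#endif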

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
		return 0;

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (xfs_can_free_eofblocks(ip, false))
		return xfs_free_eofblocks(ip);

	/* inode could be preallocated or append-only */
	trace_xfs_inode_free_eofblocks_invalid(ip);
	xfs_inode_clear_eofblocks_tag(ip);
	return 0;
}

static void
xfs_blockgc_set_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & iflag)
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= iflag;
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}

static void
xfs_blockgc_clear_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	bool			clear_tag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~iflag;
	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
	spin_unlock(&ip->i_flags_lock);

	if (!clear_tag)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;
	int			ret = 0;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
		return 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_MMAPLOCK_EXCL;

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
	return ret;
}

void
xfs_inode_set_cowblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}

void
xfs_inode_clear_cowblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_stop(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		cancel_delayed_work_sync(&pag->pag_blockgc_work);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_start(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		xfs_blockgc_queue(pag);
}

/* Don't try to run block gc on an inode that's in any of these states. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
/*
 * Decide if the given @ip is eligible for garbage collection of speculative
 * preallocations, and grab it if so.  Returns true if it's ready to go or
 * false if we should just ignore it.
 */
static bool
xfs_blockgc_igrab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/* Scan one incore inode for block preallocations that we can remove. */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		lockflags = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
	if (error)
		goto unlock;

	error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
unlock:
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	xfs_irele(ip);
	return error;
}

/* Background worker that trims preallocated space. */
void
xfs_blockgc_worker(
	struct work_struct	*work)
{
	struct xfs_perag	*pag = container_of(to_delayed_work(work),
					struct xfs_perag, pag_blockgc_work);
	struct xfs_mount	*mp = pag->pag_mount;
	int			error;

	if (!sb_start_write_trylock(mp->m_super))
		return;
	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
	if (error)
		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
				pag->pag_agno, error);
	sb_end_write(mp->m_super);
	xfs_blockgc_queue(pag);
}

/*
 * Try to free space in the filesystem by purging eofblocks and cowblocks.
 */
int
xfs_blockgc_free_space(
	struct xfs_mount	*mp,
	struct xfs_icwalk	*icw)
{
	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);

	return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
}

/*
 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
 * quota caused an allocation failure, so we make a best effort by including
 * each quota under low free space conditions (less than 1% free space) in the
 * scan.
 *
 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
 * MMAPLOCK.
 */
int
xfs_blockgc_free_dquots(
        struct xfs_mount        *mp,
        struct xfs_dquot        *udqp,
        struct xfs_dquot        *gdqp,
        struct xfs_dquot        *pdqp,
        unsigned int            iwalk_flags)
{
        struct xfs_icwalk       icw = {0};
        bool                    do_work = false;

        if (!udqp && !gdqp && !pdqp)
                return 0;

        /*
         * Run a scan to free blocks using the union filter to cover all
         * applicable quotas in a single scan.
         */
        icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;

        if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
                icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
                icw.icw_flags |= XFS_ICWALK_FLAG_UID;
                do_work = true;
        }

        if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
                icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
                icw.icw_flags |= XFS_ICWALK_FLAG_GID;
                do_work = true;
        }

        if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
                icw.icw_prid = pdqp->q_id;
                icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
                do_work = true;
        }

        if (!do_work)
                return 0;

        return xfs_blockgc_free_space(mp, &icw);
}
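
/*
 * Illustrative sketch (not part of the original file): quota reservation
 * paths can use this to reclaim speculative preallocations on EDQUOT and
 * then retry once.  make_the_reservation() is a hypothetical stand-in for
 * the caller's failed reservation:
 *
 *	error = make_the_reservation(tp);
 *	if (error == -EDQUOT && !cleared_space) {
 *		cleared_space = true;
 *		error = xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
 *		if (!error)
 *			goto retry;
 *	}
 */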

/* Run cow/eofblocks scans on the quotas attached to the inode. */
int
xfs_blockgc_free_quota(
        struct xfs_inode        *ip,
        unsigned int            iwalk_flags)
{
        return xfs_blockgc_free_dquots(ip->i_mount,
                        xfs_inode_dquot(ip, XFS_DQTYPE_USER),
                        xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
                        xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
}

/* XFS Inode Cache Walking Code */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade-off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH        32
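
/*
 * With 8-byte pointers, the on-stack batch array used below costs
 * XFS_LOOKUP_BATCH * sizeof(struct xfs_inode *) = 32 * 8 = 256 bytes.
 */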


/*
 * Decide if we want to grab this inode in anticipation of doing work towards
 * the goal.
 */
static inline bool
xfs_icwalk_igrab(
        enum xfs_icwalk_goal    goal,
        struct xfs_inode        *ip,
        struct xfs_icwalk       *icw)
{
        switch (goal) {
        case XFS_ICWALK_DQRELE:
                return xfs_dqrele_igrab(ip);
        case XFS_ICWALK_BLOCKGC:
                return xfs_blockgc_igrab(ip);
        case XFS_ICWALK_RECLAIM:
                return xfs_reclaim_igrab(ip, icw);
        default:
                return false;
        }
}

/*
 * Process an inode.  Each processing function must handle any state changes
 * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
 */
static inline int
xfs_icwalk_process_inode(
        enum xfs_icwalk_goal    goal,
        struct xfs_inode        *ip,
        struct xfs_perag        *pag,
        struct xfs_icwalk       *icw)
{
        int                     error = 0;

        switch (goal) {
        case XFS_ICWALK_DQRELE:
                xfs_dqrele_inode(ip, icw);
                break;
        case XFS_ICWALK_BLOCKGC:
                error = xfs_blockgc_scan_inode(ip, icw);
                break;
        case XFS_ICWALK_RECLAIM:
                xfs_reclaim_inode(ip, pag);
                break;
        }
        return error;
}

/*
 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
 * process them in some manner.
 */
static int
xfs_icwalk_ag(
        struct xfs_perag        *pag,
        enum xfs_icwalk_goal    goal,
        struct xfs_icwalk       *icw)
{
        struct xfs_mount        *mp = pag->pag_mount;
        uint32_t                first_index;
        int                     last_error = 0;
        int                     skipped;
        bool                    done;
        int                     nr_found;

restart:
        done = false;
        skipped = 0;
        if (goal == XFS_ICWALK_RECLAIM)
                first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
        else
                first_index = 0;
        nr_found = 0;
        do {
                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                unsigned int    tag = xfs_icwalk_tag(goal);
                int             error = 0;
                int             i;

                rcu_read_lock();

                if (tag == XFS_ICWALK_NULL_TAG)
                        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                else
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **) batch, first_index,
                                        XFS_LOOKUP_BATCH, tag);

                if (!nr_found) {
                        done = true;
                        rcu_read_unlock();
                        break;
                }

                /*
                 * Grab the inodes before we drop the lock.  If we found
                 * nothing, nr_found == 0 and the loop below will be skipped.
                 */
                for (i = 0; i < nr_found; i++) {
                        struct xfs_inode *ip = batch[i];

                        if (done || !xfs_icwalk_igrab(goal, ip, icw))
                                batch[i] = NULL;

                        /*
                         * Update the index for the next lookup.  Catch
                         * overflows into the next AG range which can occur if
                         * we have inodes in the last block of the AG and we
                         * are currently pointing to the last inode.  If the
                         * incremented agino wraps, first_index will compare
                         * lower than the current inode's agino and we mark
                         * the walk done.
                         *
                         * Because we may see inodes that are from the wrong AG
                         * due to RCU freeing and reallocation, only update the
                         * index if it lies in this AG.  It was a race that led
                         * us to see this inode, so another lookup from the
                         * same index will not find it again.
                         */
                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
                                continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = true;
                }

                /* unlock now that we've grabbed the inodes. */
                rcu_read_unlock();

                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
                                continue;
                        error = xfs_icwalk_process_inode(goal, batch[i], pag,
                                        icw);
                        if (error == -EAGAIN) {
                                skipped++;
                                continue;
                        }
                        if (error && last_error != -EFSCORRUPTED)
                                last_error = error;
                }

                /* bail out if the filesystem is corrupted. */
                if (error == -EFSCORRUPTED)
                        break;

                cond_resched();

                if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
                        icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
                        if (icw->icw_scan_limit <= 0)
                                break;
                }
        } while (nr_found && !done);

        if (goal == XFS_ICWALK_RECLAIM) {
                if (done)
                        first_index = 0;
                WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
        }

        if (skipped) {
                delay(1);
                goto restart;
        }
        return last_error;
}

/* Fetch the next (possibly tagged) per-AG structure. */
static inline struct xfs_perag *
xfs_icwalk_get_perag(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        enum xfs_icwalk_goal    goal)
{
        unsigned int            tag = xfs_icwalk_tag(goal);

        if (tag == XFS_ICWALK_NULL_TAG)
                return xfs_perag_get(mp, agno);
        return xfs_perag_get_tag(mp, agno, tag);
}

/* Walk all incore inodes to achieve a given goal. */
static int
xfs_icwalk(
        struct xfs_mount        *mp,
        enum xfs_icwalk_goal    goal,
        struct xfs_icwalk       *icw)
{
        struct xfs_perag        *pag;
        int                     error = 0;
        int                     last_error = 0;
        xfs_agnumber_t          agno = 0;

        while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) {
                agno = pag->pag_agno + 1;
                error = xfs_icwalk_ag(pag, goal, icw);
                xfs_perag_put(pag);
                if (error) {
                        last_error = error;
                        if (error == -EFSCORRUPTED)
                                break;
                }
        }
        return last_error;
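
        /*
         * Compile-time assertion only: BUILD_BUG_ON() emits no object code,
         * so it is harmless after the return statement.  It verifies that
         * the private walk flags defined at the top of this file never
         * overlap the externally visible XFS_ICWALK_FLAGS_VALID set.
         */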
        BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}