linux/fs/xfs/xfs_inode.c
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_icache.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define XFS_ITRUNC_MAX_EXTENTS  2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t
xfs_get_extsz_hint(
        struct xfs_inode        *ip)
{
        if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
                return ip->i_d.di_extsize;
        if (XFS_IS_REALTIME_INODE(ip))
                return ip->i_mount->m_sb.sb_rextsize;
        return 0;
}
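
/*
 * Worked example (editorial sketch, not in the original source): how the
 * three cases above resolve for a hypothetical inode.
 *
 *	di_flags has XFS_DIFLAG_EXTSIZE and di_extsize = 16
 *		xfs_get_extsz_hint(ip) returns 16 (per-inode hint wins)
 *	realtime inode with sb_rextsize = 8
 *		xfs_get_extsz_hint(ip) returns 8 (realtime extent size)
 *	neither set
 *		xfs_get_extsz_hint(ip) returns 0 (no hint)
 */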

/*
 * This is a wrapper routine around the xfs_ilock() routine used to centralize
 * some grungy code.  It is used in places that wish to lock the inode solely
 * for reading the extents.  The reason these places can't just call
 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
 * extents from disk for a file in b-tree format.  If the inode is in b-tree
 * format, then we need to lock the inode exclusively until the extents are read
 * in.  Locking it exclusively all the time would limit our parallelism
 * unnecessarily, though.  What we do instead is check to see if the extents
 * have been read in yet, and only lock the inode exclusively if they have not.
 *
 * The function returns a value which should be given to the corresponding
 * xfs_iunlock_map_shared().  This value is the mode in which the lock was
 * actually taken.
 */
uint
xfs_ilock_map_shared(
        xfs_inode_t     *ip)
{
        uint    lock_mode;

        if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
            ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
                lock_mode = XFS_ILOCK_EXCL;
        } else {
                lock_mode = XFS_ILOCK_SHARED;
        }

        xfs_ilock(ip, lock_mode);

        return lock_mode;
}
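
/*
 * Example pairing (editorial sketch): callers thread the returned lock mode
 * through to the matching unlock so the mode actually taken is the mode
 * dropped.  xfs_bmapi_read() here is just an illustrative extent reader.
 *
 *	uint lock_mode;
 *
 *	lock_mode = xfs_ilock_map_shared(ip);
 *	... read the extent list, e.g. via xfs_bmapi_read(ip, ...) ...
 *	xfs_iunlock_map_shared(ip, lock_mode);
 */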

/*
 * This is simply the unlock routine to go with xfs_ilock_map_shared().
 * All it does is call xfs_iunlock() with the given lock_mode.
 */
void
xfs_iunlock_map_shared(
        xfs_inode_t     *ip,
        unsigned int    lock_mode)
{
        xfs_iunlock(ip, lock_mode);
}

/*
 * The xfs inode contains 2 locks: a multi-reader lock called the
 * i_iolock and a multi-reader lock called the i_lock.  This routine
 * allows either or both of the locks to be obtained.
 *
 * The 2 locks should always be ordered so that the IO lock is
 * obtained first in order to prevent deadlock.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be locked.  It can be:
 *              XFS_IOLOCK_SHARED,
 *              XFS_IOLOCK_EXCL,
 *              XFS_ILOCK_SHARED,
 *              XFS_ILOCK_EXCL,
 *              XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 *              XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 *              XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 *              XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 */
void
xfs_ilock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

        if (lock_flags & XFS_ILOCK_EXCL)
                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
        else if (lock_flags & XFS_ILOCK_SHARED)
                mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
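
/*
 * Usage sketch (editorial): taking both locks with one call honours the
 * ordering rule above, since xfs_ilock() always acquires the IO lock before
 * the inode lock.
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	... modify file data and inode core ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 */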

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be locked.  See the comment for xfs_ilock() for a list
 *       of valid values.
 */
int
xfs_ilock_nowait(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL) {
                if (!mrtryupdate(&ip->i_iolock))
                        goto out;
        } else if (lock_flags & XFS_IOLOCK_SHARED) {
                if (!mrtryaccess(&ip->i_iolock))
                        goto out;
        }
        if (lock_flags & XFS_ILOCK_EXCL) {
                if (!mrtryupdate(&ip->i_lock))
                        goto out_undo_iolock;
        } else if (lock_flags & XFS_ILOCK_SHARED) {
                if (!mrtryaccess(&ip->i_lock))
                        goto out_undo_iolock;
        }
        return 1;

 out_undo_iolock:
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);
 out:
        return 0;
}
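
/*
 * Usage sketch (editorial): a trylock-with-fallback pattern built on the
 * 0/1 return value; the blocking path runs only when the non-blocking
 * attempt fails.
 *
 *	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 *		xfs_ilock(ip, XFS_ILOCK_EXCL);	(sleep until available)
 */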

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be unlocked.  See the comment for xfs_ilock() for a list
 *       of valid values for this parameter.
 */
void
xfs_iunlock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
        ASSERT(lock_flags != 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrunlock_excl(&ip->i_lock);
        else if (lock_flags & XFS_ILOCK_SHARED)
                mrunlock_shared(&ip->i_lock);

        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * Give up write locks.  The i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrdemote(&ip->i_lock);
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrdemote(&ip->i_iolock);

        trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
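
/*
 * Usage sketch (editorial): demotion lets a writer finish exclusive work
 * and keep reading without a drop/retake window in which the inode could
 * change underneath it.
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	... exclusive setup ...
 *	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);	(now held shared)
 *	... shared-mode reads ...
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 */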

#ifdef DEBUG
int
xfs_isilocked(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
                if (!(lock_flags & XFS_ILOCK_SHARED))
                        return !!ip->i_lock.mr_writer;
                return rwsem_is_locked(&ip->i_lock.mr_lock);
        }

        if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
                if (!(lock_flags & XFS_IOLOCK_SHARED))
                        return !!ip->i_iolock.mr_writer;
                return rwsem_is_locked(&ip->i_iolock.mr_lock);
        }

        ASSERT(0);
        return 0;
}
#endif

void
__xfs_iflock(
        struct xfs_inode        *ip)
{
        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

        do {
                prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                if (xfs_isiflocked(ip))
                        io_schedule();
        } while (!xfs_iflock_nowait(ip));

        finish_wait(wq, &wait.wait);
}
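
/*
 * Editorial note: __xfs_iflock() is the sleeping slow path.  In this era
 * the xfs_iflock() wrapper in xfs_inode.h is (to the best of our reading)
 * the usual entry point, trying the lock first and only sleeping here on
 * contention:
 *
 *	static inline void xfs_iflock(struct xfs_inode *ip)
 *	{
 *		if (!xfs_iflock_nowait(ip))
 *			__xfs_iflock(ip);
 *	}
 */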

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
        xfs_ifork_t             *ifp,
        int                     nrecs,
        xfs_exntfmt_t           fmt)
{
        xfs_bmbt_irec_t         irec;
        xfs_bmbt_rec_host_t     rec;
        int                     i;

        for (i = 0; i < nrecs; i++) {
                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                rec.l0 = get_unaligned(&ep->l0);
                rec.l1 = get_unaligned(&ep->l1);
                xfs_bmbt_get_all(&rec, &irec);
                if (fmt == XFS_EXTFMT_NOSTATE)
                        ASSERT(irec.br_state == XFS_EXT_NORM);
        }
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
        xfs_mount_t     *mp,
        xfs_buf_t       *bp)
{
        int             i;
        int             j;
        xfs_dinode_t    *dip;

        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

        for (i = 0; i < j; i++) {
                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
                                        i * mp->m_sb.sb_inodesize);
                if (!dip->di_next_unlinked) {
                        xfs_alert(mp,
        "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
                                bp);
                        ASSERT(dip->di_next_unlinked);
                }
        }
}
#endif

static void
xfs_inode_buf_verify(
        struct xfs_buf  *bp)
{
        struct xfs_mount *mp = bp->b_target->bt_mount;
        int             i;
        int             ni;

        /*
         * Validate the magic number and version of every inode in the buffer
         */
        ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
        for (i = 0; i < ni; i++) {
                int             di_ok;
                xfs_dinode_t    *dip;

                dip = (struct xfs_dinode *)xfs_buf_offset(bp,
                                        (i << mp->m_sb.sb_inodelog));
                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
                            XFS_DINODE_GOOD_VERSION(dip->di_version);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
                                                XFS_ERRTAG_ITOBP_INOTOBP,
                                                XFS_RANDOM_ITOBP_INOTOBP))) {
                        xfs_buf_ioerror(bp, EFSCORRUPTED);
                        XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
                                             mp, dip);
#ifdef DEBUG
                        xfs_emerg(mp,
                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
                                (unsigned long long)bp->b_bn, i,
                                be16_to_cpu(dip->di_magic));
                        ASSERT(0);
#endif
                }
        }
        xfs_inobp_check(mp, bp);
}


static void
xfs_inode_buf_read_verify(
        struct xfs_buf  *bp)
{
        xfs_inode_buf_verify(bp);
}

static void
xfs_inode_buf_write_verify(
        struct xfs_buf  *bp)
{
        xfs_inode_buf_verify(bp);
}

const struct xfs_buf_ops xfs_inode_buf_ops = {
        .verify_read = xfs_inode_buf_read_verify,
        .verify_write = xfs_inode_buf_write_verify,
};
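
/*
 * Usage sketch (editorial): a reader attaches the verifier table when the
 * buffer is read, exactly as xfs_imap_to_bp() does below; ->verify_read then
 * runs on I/O completion and ->verify_write before writeback.
 *
 *	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno, len,
 *				   flags, &bp, &xfs_inode_buf_ops);
 */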


/*
 * This routine is called to map an inode to the buffer containing the on-disk
 * version of the inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 * pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and dipp are
 * undefined.
 */
int
xfs_imap_to_bp(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        struct xfs_imap         *imap,
        struct xfs_dinode       **dipp,
        struct xfs_buf          **bpp,
        uint                    buf_flags,
        uint                    iget_flags)
{
        struct xfs_buf          *bp;
        int                     error;

        buf_flags |= XBF_UNMAPPED;
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
                                   (int)imap->im_len, buf_flags, &bp,
                                   &xfs_inode_buf_ops);
        if (error) {
                if (error == EAGAIN) {
                        ASSERT(buf_flags & XBF_TRYLOCK);
                        return error;
                }

                if (error == EFSCORRUPTED &&
                    (iget_flags & XFS_IGET_UNTRUSTED))
                        return XFS_ERROR(EINVAL);

                xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
                        __func__, error);
                return error;
        }

        *bpp = bp;
        *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
        return 0;
}
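
/*
 * Example call (editorial sketch): xfs_iread() below uses this routine to
 * get both the buffer and the on-disk inode pointer in one step.
 *
 *	struct xfs_dinode	*dip;
 *	struct xfs_buf		*bp;
 *
 *	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
 *	if (error)
 *		return error;
 *	... use dip, then release bp with xfs_trans_brelse(tp, bp) ...
 */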

/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means set if_rdev to the proper value.  For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip)
{
        xfs_attr_shortform_t    *atp;
        int                     size;
        int                     error = 0;
        xfs_fsize_t             di_size;

        if (unlikely(be32_to_cpu(dip->di_nextents) +
                     be16_to_cpu(dip->di_anextents) >
                     be64_to_cpu(dip->di_nblocks))) {
                xfs_warn(ip->i_mount,
                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
                        (unsigned long long)ip->i_ino,
                        (int)(be32_to_cpu(dip->di_nextents) +
                              be16_to_cpu(dip->di_anextents)),
                        (unsigned long long)
                                be64_to_cpu(dip->di_nblocks));
                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
                xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
                        (unsigned long long)ip->i_ino,
                        dip->di_forkoff);
                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
                     !ip->i_mount->m_rtdev_targp)) {
                xfs_warn(ip->i_mount,
                        "corrupt dinode %Lu, has realtime flag set.",
                        ip->i_ino);
                XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
                                     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        switch (ip->i_d.di_mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
                                              ip->i_mount, dip);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                ip->i_d.di_size = 0;
                ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
                break;

        case S_IFREG:
        case S_IFLNK:
        case S_IFDIR:
                switch (dip->di_format) {
                case XFS_DINODE_FMT_LOCAL:
                        /*
                         * no local regular files yet
                         */
                        if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
                                xfs_warn(ip->i_mount,
                        "corrupt inode %Lu (local format for regular file).",
                                        (unsigned long long) ip->i_ino);
                                XFS_CORRUPTION_ERROR("xfs_iformat(4)",
                                                     XFS_ERRLEVEL_LOW,
                                                     ip->i_mount, dip);
                                return XFS_ERROR(EFSCORRUPTED);
                        }

                        di_size = be64_to_cpu(dip->di_size);
                        if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
                                xfs_warn(ip->i_mount,
                        "corrupt inode %Lu (bad size %Ld for local inode).",
                                        (unsigned long long) ip->i_ino,
                                        (long long) di_size);
                                XFS_CORRUPTION_ERROR("xfs_iformat(5)",
                                                     XFS_ERRLEVEL_LOW,
                                                     ip->i_mount, dip);
                                return XFS_ERROR(EFSCORRUPTED);
                        }

                        size = (int)di_size;
                        error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
                        break;
                case XFS_DINODE_FMT_EXTENTS:
                        error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
                        break;
                case XFS_DINODE_FMT_BTREE:
                        error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
                        break;
                default:
                        XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
                                         ip->i_mount);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                break;

        default:
                XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }
        if (error) {
                return error;
        }
        if (!XFS_DFORK_Q(dip))
                return 0;

        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);

        switch (dip->di_aformat) {
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
                size = be16_to_cpu(atp->hdr.totsize);

                if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
                        xfs_warn(ip->i_mount,
                                "corrupt inode %Lu (bad attr fork size %Ld).",
                                (unsigned long long) ip->i_ino,
                                (long long) size);
                        XFS_CORRUPTION_ERROR("xfs_iformat(8)",
                                             XFS_ERRLEVEL_LOW,
                                             ip->i_mount, dip);
                        return XFS_ERROR(EFSCORRUPTED);
                }

                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
                break;
        case XFS_DINODE_FMT_EXTENTS:
                error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
                break;
        case XFS_DINODE_FMT_BTREE:
                error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
                break;
        default:
                error = XFS_ERROR(EFSCORRUPTED);
                break;
        }
        if (error) {
                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
                ip->i_afp = NULL;
                xfs_idestroy_fork(ip, XFS_DATA_FORK);
        }
        return error;
}

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in if_real_bytes.
 */
STATIC int
xfs_iformat_local(
        xfs_inode_t     *ip,
        xfs_dinode_t    *dip,
        int             whichfork,
        int             size)
{
        xfs_ifork_t     *ifp;
        int             real_size;

        /*
         * If the size is unreasonable, then something
         * is wrong and we just bail out rather than crash in
         * kmem_alloc() or memcpy() below.
         */
        if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
                xfs_warn(ip->i_mount,
        "corrupt inode %Lu (bad size %d for local fork, size = %d).",
                        (unsigned long long) ip->i_ino, size,
                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
                XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }
        ifp = XFS_IFORK_PTR(ip, whichfork);
        real_size = 0;
        if (size == 0)
                ifp->if_u1.if_data = NULL;
        else if (size <= sizeof(ifp->if_u2.if_inline_data))
                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
        else {
                real_size = roundup(size, 4);
                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
        }
        ifp->if_bytes = size;
        ifp->if_real_bytes = real_size;
        if (size)
                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFINLINE;
        return 0;
}
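
/*
 * Worked example (editorial) of the sizing rules above, assuming a local
 * fork larger than if_inline_data: a 50-byte symlink target is copied into
 * a heap buffer whose size is rounded up to a 4-byte multiple.
 *
 *	size = 50
 *	real_size = roundup(50, 4) = 52		(bytes allocated)
 *	ifp->if_bytes = 50			(logical fork size)
 *	ifp->if_real_bytes = 52			(allocation size)
 */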

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
        xfs_inode_t     *ip,
        xfs_dinode_t    *dip,
        int             whichfork)
{
        xfs_bmbt_rec_t  *dp;
        xfs_ifork_t     *ifp;
        int             nex;
        int             size;
        int             i;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        nex = XFS_DFORK_NEXTENTS(dip, whichfork);
        size = nex * (uint)sizeof(xfs_bmbt_rec_t);

        /*
         * If the number of extents is unreasonable, then something
         * is wrong and we just bail out rather than crash in
         * kmem_alloc() or memcpy() below.
         */
        if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
                xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
                        (unsigned long long) ip->i_ino, nex);
                XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        ifp->if_real_bytes = 0;
        if (nex == 0)
                ifp->if_u1.if_extents = NULL;
        else if (nex <= XFS_INLINE_EXTS)
                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
        else
                xfs_iext_add(ifp, 0, nex);

        ifp->if_bytes = size;
        if (size) {
                dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
                for (i = 0; i < nex; i++, dp++) {
                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                        ep->l0 = get_unaligned_be64(&dp->l0);
                        ep->l1 = get_unaligned_be64(&dp->l1);
                }
                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
                if (whichfork != XFS_DATA_FORK ||
                        XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
                                if (unlikely(xfs_check_nostate_extents(
                                    ifp, 0, nex))) {
                                        XFS_ERROR_REPORT("xfs_iformat_extents(2)",
                                                         XFS_ERRLEVEL_LOW,
                                                         ip->i_mount);
                                        return XFS_ERROR(EFSCORRUPTED);
                                }
        }
        ifp->if_flags |= XFS_IFEXTENTS;
        return 0;
}

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip,
        int                     whichfork)
{
        xfs_bmdr_block_t        *dfp;
        xfs_ifork_t             *ifp;
        /* REFERENCED */
        int                     nrecs;
        int                     size;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
        size = XFS_BMAP_BROOT_SPACE(dfp);
        nrecs = be16_to_cpu(dfp->bb_numrecs);

        /*
         * blow out if -- fork has less extents than can fit in
         * fork (fork shouldn't be a btree format), root btree
         * block has more records than can fit into the fork,
         * or the number of extents is greater than the number of
         * blocks.
         */
        if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
                        XFS_IFORK_MAXEXT(ip, whichfork) ||
                     XFS_BMDR_SPACE_CALC(nrecs) >
                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
                     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
                xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
                        (unsigned long long) ip->i_ino);
                XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
                                 ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        ifp->if_broot_bytes = size;
        ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
        ASSERT(ifp->if_broot != NULL);
        /*
         * Copy and convert from the on-disk structure
         * to the in-memory structure.
         */
        xfs_bmdr_to_bmbt(ip->i_mount, dfp,
                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
                         ifp->if_broot, size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFBROOT;

        return 0;
}

STATIC void
xfs_dinode_from_disk(
        xfs_icdinode_t          *to,
        xfs_dinode_t            *from)
{
        to->di_magic = be16_to_cpu(from->di_magic);
        to->di_mode = be16_to_cpu(from->di_mode);
        to->di_version = from->di_version;
        to->di_format = from->di_format;
        to->di_onlink = be16_to_cpu(from->di_onlink);
        to->di_uid = be32_to_cpu(from->di_uid);
        to->di_gid = be32_to_cpu(from->di_gid);
        to->di_nlink = be32_to_cpu(from->di_nlink);
        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = be16_to_cpu(from->di_flushiter);
        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
        to->di_size = be64_to_cpu(from->di_size);
        to->di_nblocks = be64_to_cpu(from->di_nblocks);
        to->di_extsize = be32_to_cpu(from->di_extsize);
        to->di_nextents = be32_to_cpu(from->di_nextents);
        to->di_anextents = be16_to_cpu(from->di_anextents);
        to->di_forkoff = from->di_forkoff;
        to->di_aformat  = from->di_aformat;
        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
        to->di_flags    = be16_to_cpu(from->di_flags);
        to->di_gen      = be32_to_cpu(from->di_gen);
}

void
xfs_dinode_to_disk(
        xfs_dinode_t            *to,
        xfs_icdinode_t          *from)
{
        to->di_magic = cpu_to_be16(from->di_magic);
        to->di_mode = cpu_to_be16(from->di_mode);
        to->di_version = from->di_version;
        to->di_format = from->di_format;
        to->di_onlink = cpu_to_be16(from->di_onlink);
        to->di_uid = cpu_to_be32(from->di_uid);
        to->di_gid = cpu_to_be32(from->di_gid);
        to->di_nlink = cpu_to_be32(from->di_nlink);
        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = cpu_to_be16(from->di_flushiter);
        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
        to->di_size = cpu_to_be64(from->di_size);
        to->di_nblocks = cpu_to_be64(from->di_nblocks);
        to->di_extsize = cpu_to_be32(from->di_extsize);
        to->di_nextents = cpu_to_be32(from->di_nextents);
        to->di_anextents = cpu_to_be16(from->di_anextents);
        to->di_forkoff = from->di_forkoff;
        to->di_aformat = from->di_aformat;
        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
        to->di_dmstate = cpu_to_be16(from->di_dmstate);
        to->di_flags = cpu_to_be16(from->di_flags);
        to->di_gen = cpu_to_be32(from->di_gen);
}

STATIC uint
_xfs_dic2xflags(
        __uint16_t              di_flags)
{
        uint                    flags = 0;

        if (di_flags & XFS_DIFLAG_ANY) {
                if (di_flags & XFS_DIFLAG_REALTIME)
                        flags |= XFS_XFLAG_REALTIME;
                if (di_flags & XFS_DIFLAG_PREALLOC)
                        flags |= XFS_XFLAG_PREALLOC;
                if (di_flags & XFS_DIFLAG_IMMUTABLE)
                        flags |= XFS_XFLAG_IMMUTABLE;
                if (di_flags & XFS_DIFLAG_APPEND)
                        flags |= XFS_XFLAG_APPEND;
                if (di_flags & XFS_DIFLAG_SYNC)
                        flags |= XFS_XFLAG_SYNC;
                if (di_flags & XFS_DIFLAG_NOATIME)
                        flags |= XFS_XFLAG_NOATIME;
                if (di_flags & XFS_DIFLAG_NODUMP)
                        flags |= XFS_XFLAG_NODUMP;
                if (di_flags & XFS_DIFLAG_RTINHERIT)
                        flags |= XFS_XFLAG_RTINHERIT;
                if (di_flags & XFS_DIFLAG_PROJINHERIT)
                        flags |= XFS_XFLAG_PROJINHERIT;
                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
                        flags |= XFS_XFLAG_NOSYMLINKS;
                if (di_flags & XFS_DIFLAG_EXTSIZE)
                        flags |= XFS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
                        flags |= XFS_XFLAG_EXTSZINHERIT;
                if (di_flags & XFS_DIFLAG_NODEFRAG)
                        flags |= XFS_XFLAG_NODEFRAG;
                if (di_flags & XFS_DIFLAG_FILESTREAM)
                        flags |= XFS_XFLAG_FILESTREAM;
        }

        return flags;
}

uint
xfs_ip2xflags(
        xfs_inode_t             *ip)
{
        xfs_icdinode_t          *dic = &ip->i_d;

        return _xfs_dic2xflags(dic->di_flags) |
                                (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
        xfs_dinode_t            *dip)
{
        return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
                                (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
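
/*
 * Usage sketch (editorial): the two wrappers differ only in where the flags
 * come from; the policy check shown is hypothetical.
 *
 *	uint xflags = xfs_ip2xflags(ip);	(from the in-core icdinode)
 *
 *	if (xflags & XFS_XFLAG_IMMUTABLE)
 *		return XFS_ERROR(EPERM);	(hypothetical caller policy)
 */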

/*
 * Read the disk inode attributes into the in-core inode structure.
 */
int
xfs_iread(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        uint            iget_flags)
{
        xfs_buf_t       *bp;
        xfs_dinode_t    *dip;
        int             error;

        /*
         * Fill in the location information in the in-core inode.
         */
        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
        if (error)
                return error;

        /*
         * Get pointers to the on-disk inode and the buffer containing it.
         */
        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
        if (error)
                return error;

        /*
         * If we got something that isn't an inode it means someone
         * (nfs or dmi) has a stale handle.
         */
        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
#ifdef DEBUG
                xfs_alert(mp,
                        "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
                        __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
#endif /* DEBUG */
                error = XFS_ERROR(EINVAL);
                goto out_brelse;
        }

        /*
         * If the on-disk inode is already linked to a directory
         * entry, copy all of the inode into the in-core inode.
         * xfs_iformat() handles copying in the inode format
         * specific information.
         * Otherwise, just get the truly permanent information.
         */
        if (dip->di_mode) {
                xfs_dinode_from_disk(&ip->i_d, dip);
                error = xfs_iformat(ip, dip);
                if (error) {
#ifdef DEBUG
                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
                                __func__, error);
#endif /* DEBUG */
                        goto out_brelse;
                }
        } else {
                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
                ip->i_d.di_version = dip->di_version;
                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
                /*
                 * Make sure to pull in the mode here as well in
                 * case the inode is released without being used.
                 * This ensures that xfs_inactive() will see that
                 * the inode is already free and not try to mess
                 * with the uninitialized part of it.
                 */
                ip->i_d.di_mode = 0;
        }

        /*
         * The inode format changed when we moved the link count and
         * made it 32 bits long.  If this is an old format inode,
         * convert it in memory to look like a new one.  If it gets
         * flushed to disk we will convert back before flushing or
         * logging it.  We zero out the new projid field and the old link
         * count field.  We'll handle clearing the pad field (the remains
         * of the old uuid field) when we actually convert the inode to
         * the new format. We don't change the version number so that we
         * can distinguish this from a real new format inode.
         */
        if (ip->i_d.di_version == 1) {
                ip->i_d.di_nlink = ip->i_d.di_onlink;
                ip->i_d.di_onlink = 0;
                xfs_set_projid(ip, 0);
        }

        ip->i_delayed_blks = 0;

        /*
         * Mark the buffer containing the inode as something to keep
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
        xfs_buf_set_ref(bp, XFS_INO_REF);

        /*
         * Use xfs_trans_brelse() to release the buffer containing the
         * on-disk inode, because it was acquired with xfs_trans_read_buf()
         * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
         * brelse().  If we're within a transaction, then xfs_trans_brelse()
         * will only release the buffer if it is not dirty within the
         * transaction.  It will be OK to release the buffer in this case,
         * because inodes on disk are never destroyed and we will be
         * locking the new in-core inode before putting it in the hash
         * table where other processes can find it.  Thus we don't have
         * to worry about the inode being changed just because we released
         * the buffer.
         */
 out_brelse:
        xfs_trans_brelse(tp, bp);
        return error;
}

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        int             whichfork)
{
        int             error;
        xfs_ifork_t     *ifp;
        xfs_extnum_t    nextents;

        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
                                 ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }
        nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
        ifp = XFS_IFORK_PTR(ip, whichfork);

        /*
         * We know that the size is valid (it's checked in iformat_btree)
         */
        ifp->if_bytes = ifp->if_real_bytes = 0;
        ifp->if_flags |= XFS_IFEXTENTS;
        xfs_iext_add(ifp, 0, nextents);
        error = xfs_bmap_read_extents(tp, ip, whichfork);
        if (error) {
                xfs_iext_destroy(ifp);
                ifp->if_flags &= ~XFS_IFEXTENTS;
                return error;
        }
        xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
        return 0;
}
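
/*
 * Usage sketch (editorial): this is the lazy read that the locking dance in
 * xfs_ilock_map_shared() above exists to protect; callers typically check
 * and read under XFS_ILOCK_EXCL.
 *
 *	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 *		error = xfs_iread_extents(tp, ip, whichfork);
 *		if (error)
 *			return error;
 *	}
 */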

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget() to obtain the in-core
 * version of the allocated inode.  Finally, fill in the inode and
 * log its initial contents.  In this case, ialloc_context would be
 * set to NULL.
 *
 * If xfs_dialloc() does not have an available inode, it will replenish
 * its supply by doing an allocation. Since we can only do one
 * allocation within a transaction without deadlocks, we must commit
 * the current transaction before returning the inode itself.
 * In this case, therefore, we will set ialloc_context and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
        umode_t         mode,
        xfs_nlink_t     nlink,
        xfs_dev_t       rdev,
        prid_t          prid,
        int             okalloc,
        xfs_buf_t       **ialloc_context,
        xfs_inode_t     **ipp)
{
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
        uint            flags;
        int             error;
        timespec_t      tv;
        int             filestreams = 0;

        /*
         * Call the space management code to pick
         * the on-disk inode to be allocated.
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
                            ialloc_context, &ino);
        if (error)
                return error;
        if (*ialloc_context || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
        }
        ASSERT(*ialloc_context == NULL);

        /*
         * Get the in-core inode with the lock held exclusively.
         * This is because we're setting fields here we need
         * to prevent others from looking at until we're done.
         */
        error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
                         XFS_ILOCK_EXCL, &ip);
        if (error)
                return error;
        ASSERT(ip != NULL);

        ip->i_d.di_mode = mode;
        ip->i_d.di_onlink = 0;
        ip->i_d.di_nlink = nlink;
        ASSERT(ip->i_d.di_nlink == nlink);
        ip->i_d.di_uid = current_fsuid();
        ip->i_d.di_gid = current_fsgid();
        xfs_set_projid(ip, prid);
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

        /*
         * If the superblock version is up to where we support new format
         * inodes and this is currently an old format inode, then change
         * the inode version number now.  This way we only do the conversion
         * here rather than here and in the flush/logging code.
         */
        if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
            ip->i_d.di_version == 1) {
                ip->i_d.di_version = 2;
                /*
                 * We've already zeroed the old link count, the projid field,
                 * and the pad field.
                 */
        }

        /*
         * Project ids won't be stored on disk if we are using a version 1 inode.
         */
        if ((prid != 0) && (ip->i_d.di_version == 1))
                xfs_bump_ino_vers2(tp, ip);

        if (pip && XFS_INHERIT_GID(pip)) {
                ip->i_d.di_gid = pip->i_d.di_gid;
                if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
                        ip->i_d.di_mode |= S_ISGID;
                }
        }

        /*
         * If the group ID of the new file does not match the effective group
         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
         * (and only if the irix_sgid_inherit compatibility variable is set).
         */
        if ((irix_sgid_inherit) &&
            (ip->i_d.di_mode & S_ISGID) &&
            (!in_group_p((gid_t)ip->i_d.di_gid))) {
                ip->i_d.di_mode &= ~S_ISGID;
        }

        ip->i_d.di_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);

        nanotime(&tv);
        ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
        ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
        ip->i_d.di_atime = ip->i_d.di_mtime;
        ip->i_d.di_ctime = ip->i_d.di_mtime;

        /*
         * di_gen will have been taken care of in xfs_iread.
         */
        ip->i_d.di_extsize = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_dmstate = 0;
        ip->i_d.di_flags = 0;
        flags = XFS_ILOG_CORE;
        switch (mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
                ip->i_df.if_u2.if_rdev = rdev;
                ip->i_df.if_flags = 0;
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
                /*
                 * we can't set up filestreams until after the VFS inode
                 * is set up properly.
                 */
                if (pip && xfs_inode_is_filestream(pip))
                        filestreams = 1;
                /* fall through */
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
                        uint    di_flags = 0;

                        if (S_ISDIR(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        } else if (S_ISREG(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_REALTIME;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSIZE;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        }
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
                            xfs_inherit_noatime)
                                di_flags |= XFS_DIFLAG_NOATIME;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
                            xfs_inherit_nodump)
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
                            xfs_inherit_sync)
                                di_flags |= XFS_DIFLAG_SYNC;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
                            xfs_inherit_nosymlinks)
                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
                        if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                                di_flags |= XFS_DIFLAG_PROJINHERIT;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
                            xfs_inherit_nodefrag)
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;
                        ip->i_d.di_flags |= di_flags;
                }
                /* FALLTHROUGH */
        case S_IFLNK:
                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
                ip->i_df.if_flags = XFS_IFEXTENTS;
                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
                ip->i_df.if_u1.if_extents = NULL;
                break;
        default:
                ASSERT(0);
        }
        /*
         * Attribute fork settings for new inode.
         */
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_anextents = 0;

        /*
         * Log the new values stuffed into the inode.
         */
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, flags);

        /* now that we have an i_mode we can setup inode ops and unlock */
        xfs_setup_inode(ip);

        /* now we have set up the vfs inode we can associate the filestream */
        if (filestreams) {
                error = xfs_filestream_associate(pip, ip);
                if (error < 0)
                        return -error;
                if (!error)
                        xfs_iflags_set(ip, XFS_IFILESTREAM);
        }

        *ipp = ip;
        return 0;
}
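
/*
 * Caller sketch (editorial) of the two-phase protocol described above: when
 * ialloc_context comes back non-NULL, the caller commits the transaction
 * while holding that buffer, starts a new one, and retries.  Simplified and
 * with error handling omitted; the real caller of this era is
 * xfs_dir_ialloc().
 *
 *	xfs_buf_t	*ialloc_context = NULL;
 *	xfs_inode_t	*ip = NULL;
 *
 *	error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 *			   &ialloc_context, &ip);
 *	if (!error && ialloc_context) {
 *		... commit tp, start a new transaction ...
 *		error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 *				   okalloc, &ialloc_context, &ip);
 *	}
 */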
1357
1358/*
1359 * Free up the underlying blocks past new_size.  The new size must be smaller
1360 * than the current size.  This routine can be used both for the attribute and
1361 * data fork, and does not modify the inode size, which is left to the caller.
1362 *
1363 * The transaction passed to this routine must have made a permanent log
1364 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1365 * given transaction and start new ones, so make sure everything involved in
1366 * the transaction is tidy before calling here.  A transaction will be
1367 * returned to the caller to be committed.  The incoming transaction must
1368 * already include the inode, and both inode locks must be held exclusively.
1369 * The inode must also be "held" within the transaction.  On return the inode
1370 * will be "held" within the returned transaction.  This routine does NOT
1371 * require any disk space to be reserved for it within the transaction.
1372 *
1373 * If we get an error, we must return with the inode locked and linked into the
1374 * current transaction. This keeps things simple for the higher level code,
1375 * because it always knows that the inode is locked and held in the transaction
1376 * that returns to it whether errors occur or not.  We don't mark the inode
1377 * dirty on error so that transactions can be easily aborted if possible.
1378 */
1379int
1380xfs_itruncate_extents(
1381        struct xfs_trans        **tpp,
1382        struct xfs_inode        *ip,
1383        int                     whichfork,
1384        xfs_fsize_t             new_size)
1385{
1386        struct xfs_mount        *mp = ip->i_mount;
1387        struct xfs_trans        *tp = *tpp;
1388        struct xfs_trans        *ntp;
1389        xfs_bmap_free_t         free_list;
1390        xfs_fsblock_t           first_block;
1391        xfs_fileoff_t           first_unmap_block;
1392        xfs_fileoff_t           last_block;
1393        xfs_filblks_t           unmap_len;
1394        int                     committed;
1395        int                     error = 0;
1396        int                     done = 0;
1397
1398        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1399        ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1400               xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1401        ASSERT(new_size <= XFS_ISIZE(ip));
1402        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1403        ASSERT(ip->i_itemp != NULL);
1404        ASSERT(ip->i_itemp->ili_lock_flags == 0);
1405        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1406
1407        trace_xfs_itruncate_extents_start(ip, new_size);
1408
1409        /*
1410         * Since it is possible for space to become allocated beyond
1411         * the end of the file (in a crash where the space is allocated
1412         * but the inode size is not yet updated), simply remove any
1413         * blocks which show up between the new EOF and the maximum
1414         * possible file size.  If the first block to be removed is
1415         * beyond the maximum file size (ie it is the same as last_block),
1416         * then there is nothing to do.
1417         */
1418        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1419        last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1420        if (first_unmap_block == last_block)
1421                return 0;
1422
1423        ASSERT(first_unmap_block < last_block);
1424        unmap_len = last_block - first_unmap_block + 1;
1425        while (!done) {
1426                xfs_bmap_init(&free_list, &first_block);
1427                error = xfs_bunmapi(tp, ip,
1428                                    first_unmap_block, unmap_len,
1429                                    xfs_bmapi_aflag(whichfork),
1430                                    XFS_ITRUNC_MAX_EXTENTS,
1431                                    &first_block, &free_list,
1432                                    &done);
1433                if (error)
1434                        goto out_bmap_cancel;
1435
1436                /*
1437                 * Duplicate the transaction that has the permanent
1438                 * reservation and commit the old transaction.
1439                 */
1440                error = xfs_bmap_finish(&tp, &free_list, &committed);
1441                if (committed)
1442                        xfs_trans_ijoin(tp, ip, 0);
1443                if (error)
1444                        goto out_bmap_cancel;
1445
1446                if (committed) {
1447                        /*
1448                         * Mark the inode dirty so it will be logged and
1449                         * moved forward in the log as part of every commit.
1450                         */
1451                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1452                }
1453
1454                ntp = xfs_trans_dup(tp);
1455                error = xfs_trans_commit(tp, 0);
1456                tp = ntp;
1457
1458                xfs_trans_ijoin(tp, ip, 0);
1459
1460                if (error)
1461                        goto out;
1462
1463                /*
1464                 * Transaction commit worked ok so we can drop the extra ticket
1465                 * reference that we gained in xfs_trans_dup()
1466                 */
1467                xfs_log_ticket_put(tp->t_ticket);
1468                error = xfs_trans_reserve(tp, 0,
1469                                        XFS_ITRUNCATE_LOG_RES(mp), 0,
1470                                        XFS_TRANS_PERM_LOG_RES,
1471                                        XFS_ITRUNCATE_LOG_COUNT);
1472                if (error)
1473                        goto out;
1474        }
1475
1476        /*
1477         * Always re-log the inode so that our permanent transaction can keep
1478         * on rolling it forward in the log.
1479         */
1480        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1481
1482        trace_xfs_itruncate_extents_end(ip, new_size);
1483
1484out:
1485        *tpp = tp;
1486        return error;
1487out_bmap_cancel:
1488        /*
1489         * If the bunmapi call encounters an error, return to the caller where
1490         * the transaction can be properly aborted.  We just need to make sure
1491         * we're not holding any resources that we were not when we came in.
1492         */
1493        xfs_bmap_cancel(&free_list);
1494        goto out;
1495}
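
/*
 * Illustrative sketch only (not part of the original file): the
 * transaction "roll" performed inside the loop above, factored out for
 * clarity.  Assumes the inode is already joined to and held in *tpp,
 * exactly as xfs_itruncate_extents() requires.
 */
STATIC int
xfs_itruncate_roll_sketch(
        struct xfs_trans        **tpp,
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *ntp;
        int                     error;

        /* Duplicate first so the permanent reservation survives the commit. */
        ntp = xfs_trans_dup(*tpp);
        error = xfs_trans_commit(*tpp, 0);
        *tpp = ntp;
        xfs_trans_ijoin(ntp, ip, 0);
        if (error)
                return error;

        /* Drop the extra ticket reference gained in xfs_trans_dup(). */
        xfs_log_ticket_put(ntp->t_ticket);
        return xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                                 XFS_TRANS_PERM_LOG_RES,
                                 XFS_ITRUNCATE_LOG_COUNT);
}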
1496
1497/*
1498 * This is called when the inode's link count goes to 0.
1499 * We place the on-disk inode on a list in the AGI.  It
1500 * will be pulled from this list when the inode is freed.
1501 */
1502int
1503xfs_iunlink(
1504        xfs_trans_t     *tp,
1505        xfs_inode_t     *ip)
1506{
1507        xfs_mount_t     *mp;
1508        xfs_agi_t       *agi;
1509        xfs_dinode_t    *dip;
1510        xfs_buf_t       *agibp;
1511        xfs_buf_t       *ibp;
1512        xfs_agino_t     agino;
1513        short           bucket_index;
1514        int             offset;
1515        int             error;
1516
1517        ASSERT(ip->i_d.di_nlink == 0);
1518        ASSERT(ip->i_d.di_mode != 0);
1519
1520        mp = tp->t_mountp;
1521
1522        /*
1523         * Get the agi buffer first.  It ensures lock ordering
1524         * on the list.
1525         */
1526        error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1527        if (error)
1528                return error;
1529        agi = XFS_BUF_TO_AGI(agibp);
1530
1531        /*
1532         * Get the index into the agi hash table for the
1533         * list this inode will go on.
1534         */
1535        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1536        ASSERT(agino != 0);
1537        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1538        ASSERT(agi->agi_unlinked[bucket_index]);
1539        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1540
1541        if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1542                /*
1543                 * There is already another inode in the bucket we need
1544                 * to add ourselves to.  Add us at the front of the list.
1545                 * Here we put the head pointer into our next pointer,
1546                 * and then we fall through to point the head at us.
1547                 */
1548                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1549                                       0, 0);
1550                if (error)
1551                        return error;
1552
1553                ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1554                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1555                offset = ip->i_imap.im_boffset +
1556                        offsetof(xfs_dinode_t, di_next_unlinked);
1557                xfs_trans_inode_buf(tp, ibp);
1558                xfs_trans_log_buf(tp, ibp, offset,
1559                                  (offset + sizeof(xfs_agino_t) - 1));
1560                xfs_inobp_check(mp, ibp);
1561        }
1562
1563        /*
1564         * Point the bucket head pointer at the inode being inserted.
1565         */
1566        ASSERT(agino != 0);
1567        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1568        offset = offsetof(xfs_agi_t, agi_unlinked) +
1569                (sizeof(xfs_agino_t) * bucket_index);
1570        xfs_trans_log_buf(tp, agibp, offset,
1571                          (offset + sizeof(xfs_agino_t) - 1));
1572        return 0;
1573}
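
/*
 * Illustrative sketch only (not part of the original file): how an
 * inode number maps onto an AGI unlinked-list bucket, assuming the
 * on-disk format's XFS_AGI_UNLINKED_BUCKETS of 64.
 */
static inline short
xfs_iunlink_bucket_sketch(
        struct xfs_mount        *mp,
        xfs_ino_t               ino)
{
        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);

        /* e.g. agino 131 lands in bucket 131 % 64 == 3 */
        return agino % XFS_AGI_UNLINKED_BUCKETS;
}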
1574
1575/*
1576 * Pull the on-disk inode from the AGI unlinked list.
1577 */
1578STATIC int
1579xfs_iunlink_remove(
1580        xfs_trans_t     *tp,
1581        xfs_inode_t     *ip)
1582{
1583        xfs_ino_t       next_ino;
1584        xfs_mount_t     *mp;
1585        xfs_agi_t       *agi;
1586        xfs_dinode_t    *dip;
1587        xfs_buf_t       *agibp;
1588        xfs_buf_t       *ibp;
1589        xfs_agnumber_t  agno;
1590        xfs_agino_t     agino;
1591        xfs_agino_t     next_agino;
1592        xfs_buf_t       *last_ibp;
1593        xfs_dinode_t    *last_dip = NULL;
1594        short           bucket_index;
1595        int             offset, last_offset = 0;
1596        int             error;
1597
1598        mp = tp->t_mountp;
1599        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1600
1601        /*
1602         * Get the agi buffer first.  It ensures lock ordering
1603         * on the list.
1604         */
1605        error = xfs_read_agi(mp, tp, agno, &agibp);
1606        if (error)
1607                return error;
1608
1609        agi = XFS_BUF_TO_AGI(agibp);
1610
1611        /*
1612         * Get the index into the agi hash table for the
1613         * list this inode will go on.
1614         */
1615        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1616        ASSERT(agino != 0);
1617        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1618        ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1619        ASSERT(agi->agi_unlinked[bucket_index]);
1620
1621        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
1622                /*
1623                 * We're at the head of the list.  Get the inode's on-disk
1624                 * buffer to see if there is anyone after us on the list.
1625                 * Only modify our next pointer if it is not already NULLAGINO.
1626                 * This saves us the overhead of dealing with the buffer when
1627                 * there is no need to change it.
1628                 */
1629                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1630                                       0, 0);
1631                if (error) {
1632                        xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
1633                                __func__, error);
1634                        return error;
1635                }
1636                next_agino = be32_to_cpu(dip->di_next_unlinked);
1637                ASSERT(next_agino != 0);
1638                if (next_agino != NULLAGINO) {
1639                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1640                        offset = ip->i_imap.im_boffset +
1641                                offsetof(xfs_dinode_t, di_next_unlinked);
1642                        xfs_trans_inode_buf(tp, ibp);
1643                        xfs_trans_log_buf(tp, ibp, offset,
1644                                          (offset + sizeof(xfs_agino_t) - 1));
1645                        xfs_inobp_check(mp, ibp);
1646                } else {
1647                        xfs_trans_brelse(tp, ibp);
1648                }
1649                /*
1650                 * Point the bucket head pointer at the next inode.
1651                 */
1652                ASSERT(next_agino != 0);
1653                ASSERT(next_agino != agino);
1654                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
1655                offset = offsetof(xfs_agi_t, agi_unlinked) +
1656                        (sizeof(xfs_agino_t) * bucket_index);
1657                xfs_trans_log_buf(tp, agibp, offset,
1658                                  (offset + sizeof(xfs_agino_t) - 1));
1659        } else {
1660                /*
1661                 * We need to search the list for the inode being freed.
1662                 */
1663                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1664                last_ibp = NULL;
1665                while (next_agino != agino) {
1666                        struct xfs_imap imap;
1667
1668                        if (last_ibp)
1669                                xfs_trans_brelse(tp, last_ibp);
1670
1671                        imap.im_blkno = 0;
1672                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1673
1674                        error = xfs_imap(mp, tp, next_ino, &imap, 0);
1675                        if (error) {
1676                                xfs_warn(mp,
1677        "%s: xfs_imap returned error %d.",
1678                                         __func__, error);
1679                                return error;
1680                        }
1681
1682                        error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
1683                                               &last_ibp, 0, 0);
1684                        if (error) {
1685                                xfs_warn(mp,
1686        "%s: xfs_imap_to_bp returned error %d.",
1687                                        __func__, error);
1688                                return error;
1689                        }
1690
1691                        last_offset = imap.im_boffset;
1692                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
1693                        ASSERT(next_agino != NULLAGINO);
1694                        ASSERT(next_agino != 0);
1695                }
1696
1697                /*
1698                 * Now last_ibp points to the buffer previous to us on the
1699                 * unlinked list.  Pull us from the list.
1700                 */
1701                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1702                                       0, 0);
1703                if (error) {
1704                        xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
1705                                __func__, error);
1706                        return error;
1707                }
1708                next_agino = be32_to_cpu(dip->di_next_unlinked);
1709                ASSERT(next_agino != 0);
1710                ASSERT(next_agino != agino);
1711                if (next_agino != NULLAGINO) {
1712                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1713                        offset = ip->i_imap.im_boffset +
1714                                offsetof(xfs_dinode_t, di_next_unlinked);
1715                        xfs_trans_inode_buf(tp, ibp);
1716                        xfs_trans_log_buf(tp, ibp, offset,
1717                                          (offset + sizeof(xfs_agino_t) - 1));
1718                        xfs_inobp_check(mp, ibp);
1719                } else {
1720                        xfs_trans_brelse(tp, ibp);
1721                }
1722                /*
1723                 * Point the previous inode on the list to the next inode.
1724                 */
1725                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
1726                ASSERT(next_agino != 0);
1727                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
1728                xfs_trans_inode_buf(tp, last_ibp);
1729                xfs_trans_log_buf(tp, last_ibp, offset,
1730                                  (offset + sizeof(xfs_agino_t) - 1));
1731                xfs_inobp_check(mp, last_ibp);
1732        }
1733        return 0;
1734}
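
/*
 * Illustrative sketch only (not part of the original file): the list
 * surgery performed by xfs_iunlink_remove() above, reduced to a plain
 * in-memory singly linked list.  "head" stands in for
 * agi_unlinked[bucket_index] and "next" for di_next_unlinked; the real
 * code must additionally log every modified field with
 * xfs_trans_log_buf() so the unlink is atomic within the transaction.
 */
struct sketch_node {
        xfs_agino_t             agino;
        struct sketch_node      *next;
};

static void
sketch_unlink_remove(
        struct sketch_node      **head,
        struct sketch_node      *victim)
{
        struct sketch_node      **linkp = head;

        /* Walk the links until we find the pointer to the victim. */
        while (*linkp != victim)
                linkp = &(*linkp)->next;

        /* Point the previous link past the victim, then terminate it. */
        *linkp = victim->next;
        victim->next = NULL;            /* the NULLAGINO equivalent */
}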
1735
1736/*
1737 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1738 * inodes that are in memory - they all must be marked stale and attached to
1739 * the cluster buffer.
1740 */
1741STATIC int
1742xfs_ifree_cluster(
1743        xfs_inode_t     *free_ip,
1744        xfs_trans_t     *tp,
1745        xfs_ino_t       inum)
1746{
1747        xfs_mount_t             *mp = free_ip->i_mount;
1748        int                     blks_per_cluster;
1749        int                     nbufs;
1750        int                     ninodes;
1751        int                     i, j;
1752        xfs_daddr_t             blkno;
1753        xfs_buf_t               *bp;
1754        xfs_inode_t             *ip;
1755        xfs_inode_log_item_t    *iip;
1756        xfs_log_item_t          *lip;
1757        struct xfs_perag        *pag;
1758
1759        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1760        if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1761                blks_per_cluster = 1;
1762                ninodes = mp->m_sb.sb_inopblock;
1763                nbufs = XFS_IALLOC_BLOCKS(mp);
1764        } else {
1765                blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
1766                                        mp->m_sb.sb_blocksize;
1767                ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
1768                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1769        }
1770
1771        for (j = 0; j < nbufs; j++, inum += ninodes) {
1772                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1773                                         XFS_INO_TO_AGBNO(mp, inum));
1774
1775                /*
1776                 * We obtain and lock the backing buffer first in the process
1777                 * here, as we have to ensure that any dirty inode that we
1778                 * can't get the flush lock on is attached to the buffer.
1779                 * If we scan the in-memory inodes first, then buffer IO can
1780                 * complete before we get a lock on it, and hence we may fail
1781                 * to mark all the active inodes on the buffer stale.
1782                 */
1783                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1784                                        mp->m_bsize * blks_per_cluster,
1785                                        XBF_UNMAPPED);
1786
1787                if (!bp)
1788                        return ENOMEM;
1789
1790                /*
1791                 * This buffer may not have been correctly initialised as we
1792                 * didn't read it from disk. That's not important because we are
1793                 * only using it to mark the buffer as stale in the log, and to
1794                 * attach stale cached inodes on it. That means it will never be
1795                 * dispatched for IO. If it is, we want to know about it, and we
1796                 * want it to fail. We can achieve this by adding a write
1797                 * verifier to the buffer.
1798                 */
1799                 bp->b_ops = &xfs_inode_buf_ops;
1800
1801                /*
1802                 * Walk the inodes already attached to the buffer and mark them
1803                 * stale. These will all have the flush locks held, so an
1804                 * in-memory inode walk can't lock them. By marking them all
1805                 * stale first, we will not attempt to lock them in the loop
1806                 * below as the XFS_ISTALE flag will be set.
1807                 */
1808                lip = bp->b_fspriv;
1809                while (lip) {
1810                        if (lip->li_type == XFS_LI_INODE) {
1811                                iip = (xfs_inode_log_item_t *)lip;
1812                                ASSERT(iip->ili_logged == 1);
1813                                lip->li_cb = xfs_istale_done;
1814                                xfs_trans_ail_copy_lsn(mp->m_ail,
1815                                                        &iip->ili_flush_lsn,
1816                                                        &iip->ili_item.li_lsn);
1817                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1818                        }
1819                        lip = lip->li_bio_list;
1820                }
1821
1822
1823                /*
1824                 * For each inode in memory attempt to add it to the inode
1825                 * buffer and set it up for being staled on buffer IO
1826                 * completion.  This is safe as we've locked out tail pushing
1827                 * and flushing by locking the buffer.
1828                 *
1829                 * We have already marked every inode that was part of a
1830                 * transaction stale above, which means there is no point in
1831                 * even trying to lock them.
1832                 */
1833                for (i = 0; i < ninodes; i++) {
1834retry:
1835                        rcu_read_lock();
1836                        ip = radix_tree_lookup(&pag->pag_ici_root,
1837                                        XFS_INO_TO_AGINO(mp, (inum + i)));
1838
1839                        /* Inode not in memory, nothing to do */
1840                        if (!ip) {
1841                                rcu_read_unlock();
1842                                continue;
1843                        }
1844
1845                        /*
1846                         * because this is an RCU protected lookup, we could
1847                         * find a recently freed or even reallocated inode
1848                         * during the lookup. We need to check under the
1849                         * i_flags_lock for a valid inode here. Skip it if it
1850                         * is not valid, the wrong inode or stale.
1851                         */
1852                        spin_lock(&ip->i_flags_lock);
1853                        if (ip->i_ino != inum + i ||
1854                            __xfs_iflags_test(ip, XFS_ISTALE)) {
1855                                spin_unlock(&ip->i_flags_lock);
1856                                rcu_read_unlock();
1857                                continue;
1858                        }
1859                        spin_unlock(&ip->i_flags_lock);
1860
1861                        /*
1862                         * Don't try to lock/unlock the current inode, but we
1863                         * _cannot_ skip the other inodes that we did not find
1864                         * in the list attached to the buffer and are not
1865                         * already marked stale. If we can't lock it, back off
1866                         * and retry.
1867                         */
1868                        if (ip != free_ip &&
1869                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1870                                rcu_read_unlock();
1871                                delay(1);
1872                                goto retry;
1873                        }
1874                        rcu_read_unlock();
1875
1876                        xfs_iflock(ip);
1877                        xfs_iflags_set(ip, XFS_ISTALE);
1878
1879                        /*
1880                         * we don't need to attach clean inodes or those only
1881                         * with unlogged changes (which we throw away, anyway).
1882                         */
1883                        iip = ip->i_itemp;
1884                        if (!iip || xfs_inode_clean(ip)) {
1885                                ASSERT(ip != free_ip);
1886                                xfs_ifunlock(ip);
1887                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1888                                continue;
1889                        }
1890
1891                        iip->ili_last_fields = iip->ili_fields;
1892                        iip->ili_fields = 0;
1893                        iip->ili_logged = 1;
1894                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1895                                                &iip->ili_item.li_lsn);
1896
1897                        xfs_buf_attach_iodone(bp, xfs_istale_done,
1898                                                  &iip->ili_item);
1899
1900                        if (ip != free_ip)
1901                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
1902                }
1903
1904                xfs_trans_stale_inode_buf(tp, bp);
1905                xfs_trans_binval(tp, bp);
1906        }
1907
1908        xfs_perag_put(pag);
1909        return 0;
1910}
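
/*
 * Illustrative example only (not part of the original file): the
 * cluster geometry computed at the top of xfs_ifree_cluster(), with
 * hypothetical values of a 4k block size, an 8k inode cluster size,
 * 512 byte inodes and a 64 inode allocation chunk:
 *
 *      blks_per_cluster = 8192 / 4096                  = 2
 *      ninodes          = 2 * (4096 / 512)             = 16
 *      nbufs            = XFS_IALLOC_BLOCKS(mp) / 2    = 8 / 2 = 4
 *
 * i.e. the 64 inode chunk is staled as 4 cluster buffers of 16 inodes
 * each, with inum advancing by ninodes per buffer.
 */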
1911
1912/*
1913 * This is called to return an inode to the inode free list.
1914 * The inode should already be truncated to 0 length and have
1915 * no pages associated with it.  This routine also assumes that
1916 * the inode is already a part of the transaction.
1917 *
1918 * The on-disk copy of the inode will have been added to the list
1919 * of unlinked inodes in the AGI. We need to remove the inode from
1920 * that list atomically with respect to freeing it here.
1921 */
1922int
1923xfs_ifree(
1924        xfs_trans_t     *tp,
1925        xfs_inode_t     *ip,
1926        xfs_bmap_free_t *flist)
1927{
1928        int                     error;
1929        int                     delete;
1930        xfs_ino_t               first_ino;
1931        xfs_dinode_t            *dip;
1932        xfs_buf_t               *ibp;
1933
1934        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1935        ASSERT(ip->i_d.di_nlink == 0);
1936        ASSERT(ip->i_d.di_nextents == 0);
1937        ASSERT(ip->i_d.di_anextents == 0);
1938        ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1939        ASSERT(ip->i_d.di_nblocks == 0);
1940
1941        /*
1942         * Pull the on-disk inode from the AGI unlinked list.
1943         */
1944        error = xfs_iunlink_remove(tp, ip);
1945        if (error != 0) {
1946                return error;
1947        }
1948
1949        error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
1950        if (error != 0) {
1951                return error;
1952        }
1953        ip->i_d.di_mode = 0;            /* mark incore inode as free */
1954        ip->i_d.di_flags = 0;
1955        ip->i_d.di_dmevmask = 0;
1956        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
1957        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1958        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1959        /*
1960         * Bump the generation count so no one will be confused
1961         * by reincarnations of this inode.
1962         */
1963        ip->i_d.di_gen++;
1964
1965        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1966
1967        error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
1968                               0, 0);
1969        if (error)
1970                return error;
1971
1972        /*
1973         * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
1974         * from picking up this inode when it is reclaimed (its incore state
1975         * initialized but not flushed to disk yet). The in-core di_mode is
1976         * already cleared and a corresponding transaction logged.
1977         * The hack here just synchronizes the in-core to on-disk
1978         * di_mode value in advance before the actual inode sync to disk.
1979         * This is OK because the inode is already unlinked and would never
1980         * change its di_mode again for this inode generation.
1981         * This is a temporary hack that would require a proper fix
1982         * in the future.
1983         */
1984        dip->di_mode = 0;
1985
1986        if (delete) {
1987                error = xfs_ifree_cluster(ip, tp, first_ino);
1988        }
1989
1990        return error;
1991}
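
/*
 * Illustrative sketch only (not part of the original file), loosely
 * modelled on the xfs_inactive() caller: xfs_ifree() runs inside a
 * permanent transaction with a bmap free list that the caller finishes
 * afterwards.  Full error handling of a failed finish is elided.
 */
STATIC int
xfs_ifree_caller_sketch(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip)
{
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        int                     committed;
        int                     error;

        xfs_bmap_init(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                xfs_bmap_cancel(&free_list);
                return error;
        }
        /* Finish any extent frees that xfs_ifree() queued up. */
        return xfs_bmap_finish(&tp, &free_list, &committed);
}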
1992
1993/*
1994 * Reallocate the space for if_broot based on the number of records
1995 * being added or deleted as indicated in rec_diff.  Move the records
1996 * and pointers in if_broot to fit the new size.  When shrinking this
1997 * will eliminate holes between the records and pointers created by
1998 * the caller.  When growing this will create holes to be filled in
1999 * by the caller.
2000 *
2001 * The caller must not request to add more records than would fit in
2002 * the on-disk inode root.  If the if_broot is currently NULL, then
2003 * if we are adding records, one will be allocated.  The caller must also
2004 * not request that the number of records go below zero, although
2005 * it can go to zero.
2006 *
2007 * ip -- the inode whose if_broot area is changing
2008 * rec_diff -- the change in the number of records, positive or negative,
2009 *       requested for the if_broot array.
2010 */
2011void
2012xfs_iroot_realloc(
2013        xfs_inode_t             *ip,
2014        int                     rec_diff,
2015        int                     whichfork)
2016{
2017        struct xfs_mount        *mp = ip->i_mount;
2018        int                     cur_max;
2019        xfs_ifork_t             *ifp;
2020        struct xfs_btree_block  *new_broot;
2021        int                     new_max;
2022        size_t                  new_size;
2023        char                    *np;
2024        char                    *op;
2025
2026        /*
2027         * Handle the degenerate case quietly.
2028         */
2029        if (rec_diff == 0) {
2030                return;
2031        }
2032
2033        ifp = XFS_IFORK_PTR(ip, whichfork);
2034        if (rec_diff > 0) {
2035                /*
2036                 * If there wasn't any memory allocated before, just
2037                 * allocate it now and get out.
2038                 */
2039                if (ifp->if_broot_bytes == 0) {
2040                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2041                        ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2042                        ifp->if_broot_bytes = (int)new_size;
2043                        return;
2044                }
2045
2046                /*
2047                 * If there is already an existing if_broot, then we need
2048                 * to realloc() it and shift the pointers to their new
2049                 * location.  The records don't change location because
2050                 * they are kept butted up against the btree block header.
2051                 */
2052                cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2053                new_max = cur_max + rec_diff;
2054                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2055                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2056                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2057                                KM_SLEEP | KM_NOFS);
2058                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2059                                                     ifp->if_broot_bytes);
2060                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2061                                                     (int)new_size);
2062                ifp->if_broot_bytes = (int)new_size;
2063                ASSERT(ifp->if_broot_bytes <=
2064                        XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2065                memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2066                return;
2067        }
2068
2069        /*
2070         * rec_diff is less than 0.  In this case, we are shrinking the
2071         * if_broot buffer.  It must already exist.  If we go to zero
2072         * records, just get rid of the root and clear the status bit.
2073         */
2074        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2075        cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2076        new_max = cur_max + rec_diff;
2077        ASSERT(new_max >= 0);
2078        if (new_max > 0)
2079                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2080        else
2081                new_size = 0;
2082        if (new_size > 0) {
2083                new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2084                /*
2085                 * First copy over the btree block header.
2086                 */
2087                memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2088        } else {
2089                new_broot = NULL;
2090                ifp->if_flags &= ~XFS_IFBROOT;
2091        }
2092
2093        /*
2094         * Only copy the records and pointers if there are any.
2095         */
2096        if (new_max > 0) {
2097                /*
2098                 * First copy the records.
2099                 */
2100                op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2101                np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2102                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2103
2104                /*
2105                 * Then copy the pointers.
2106                 */
2107                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2108                                                     ifp->if_broot_bytes);
2109                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2110                                                     (int)new_size);
2111                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2112        }
2113        kmem_free(ifp->if_broot);
2114        ifp->if_broot = new_broot;
2115        ifp->if_broot_bytes = (int)new_size;
2116        ASSERT(ifp->if_broot_bytes <=
2117                XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2118        return;
2119}
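
/*
 * Illustrative example only (not part of the original file): the broot
 * sizing arithmetic above, assuming the pre-CRC long btree block
 * header of 24 bytes and 8 byte keys and pointers, i.e.
 * XFS_BMAP_BROOT_SPACE_CALC(n) == 24 + 16 * n:
 *
 *      growing from 4 to 6 records:
 *              old size = 24 + 16 * 4 =  88 bytes
 *              new size = 24 + 16 * 6 = 120 bytes
 *
 * The records stay butted up against the block header, so only the
 * pointer array is memmove()d to its new offset.
 */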
2120
2121
2122/*
2123 * This is called when the amount of space needed for if_data
2124 * is increased or decreased.  The change in size is indicated by
2125 * the number of bytes that need to be added or deleted in the
2126 * byte_diff parameter.
2127 *
2128 * If the amount of space needed has decreased below the size of the
2129 * inline buffer, then switch to using the inline buffer.  Otherwise,
2130 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2131 * to what is needed.
2132 *
2133 * ip -- the inode whose if_data area is changing
2134 * byte_diff -- the change in the number of bytes, positive or negative,
2135 *       requested for the if_data array.
2136 */
2137void
2138xfs_idata_realloc(
2139        xfs_inode_t     *ip,
2140        int             byte_diff,
2141        int             whichfork)
2142{
2143        xfs_ifork_t     *ifp;
2144        int             new_size;
2145        int             real_size;
2146
2147        if (byte_diff == 0) {
2148                return;
2149        }
2150
2151        ifp = XFS_IFORK_PTR(ip, whichfork);
2152        new_size = (int)ifp->if_bytes + byte_diff;
2153        ASSERT(new_size >= 0);
2154
2155        if (new_size == 0) {
2156                if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2157                        kmem_free(ifp->if_u1.if_data);
2158                }
2159                ifp->if_u1.if_data = NULL;
2160                real_size = 0;
2161        } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2162                /*
2163                 * If the valid extents/data can fit in if_inline_ext/data,
2164                 * copy them from the malloc'd vector and free it.
2165                 */
2166                if (ifp->if_u1.if_data == NULL) {
2167                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2168                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2169                        ASSERT(ifp->if_real_bytes != 0);
2170                        memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2171                              new_size);
2172                        kmem_free(ifp->if_u1.if_data);
2173                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2174                }
2175                real_size = 0;
2176        } else {
2177                /*
2178                 * Stuck with malloc/realloc.
2179                 * For inline data, the underlying buffer must be
2180                 * a multiple of 4 bytes in size so that it can be
2181                 * logged and stay on word boundaries.  We enforce
2182                 * that here.
2183                 */
2184                real_size = roundup(new_size, 4);
2185                if (ifp->if_u1.if_data == NULL) {
2186                        ASSERT(ifp->if_real_bytes == 0);
2187                        ifp->if_u1.if_data = kmem_alloc(real_size,
2188                                                        KM_SLEEP | KM_NOFS);
2189                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2190                        /*
2191                         * Only do the realloc if the underlying size
2192                         * is really changing.
2193                         */
2194                        if (ifp->if_real_bytes != real_size) {
2195                                ifp->if_u1.if_data =
2196                                        kmem_realloc(ifp->if_u1.if_data,
2197                                                        real_size,
2198                                                        ifp->if_real_bytes,
2199                                                        KM_SLEEP | KM_NOFS);
2200                        }
2201                } else {
2202                        ASSERT(ifp->if_real_bytes == 0);
2203                        ifp->if_u1.if_data = kmem_alloc(real_size,
2204                                                        KM_SLEEP | KM_NOFS);
2205                        memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2206                                ifp->if_bytes);
2207                }
2208        }
2209        ifp->if_real_bytes = real_size;
2210        ifp->if_bytes = new_size;
2211        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2212}
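
/*
 * Illustrative example only (not part of the original file): the
 * word-alignment rule above in action.  Growing a local-format fork to
 * 13 bytes allocates roundup(13, 4) == 16 bytes, leaving if_bytes = 13
 * and if_real_bytes = 16; shrinking back to sizeof(if_inline_data)
 * bytes or less copies the data into the inline buffer and resets
 * if_real_bytes to 0.
 */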
2213
2214void
2215xfs_idestroy_fork(
2216        xfs_inode_t     *ip,
2217        int             whichfork)
2218{
2219        xfs_ifork_t     *ifp;
2220
2221        ifp = XFS_IFORK_PTR(ip, whichfork);
2222        if (ifp->if_broot != NULL) {
2223                kmem_free(ifp->if_broot);
2224                ifp->if_broot = NULL;
2225        }
2226
2227        /*
2228         * If the format is local, then we can't have an extents
2229         * array so just look for an inline data array.  If we're
2230         * not local then we may or may not have an extents list,
2231         * so check and free it up if we do.
2232         */
2233        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2234                if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2235                    (ifp->if_u1.if_data != NULL)) {
2236                        ASSERT(ifp->if_real_bytes != 0);
2237                        kmem_free(ifp->if_u1.if_data);
2238                        ifp->if_u1.if_data = NULL;
2239                        ifp->if_real_bytes = 0;
2240                }
2241        } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2242                   ((ifp->if_flags & XFS_IFEXTIREC) ||
2243                    ((ifp->if_u1.if_extents != NULL) &&
2244                     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2245                ASSERT(ifp->if_real_bytes != 0);
2246                xfs_iext_destroy(ifp);
2247        }
2248        ASSERT(ifp->if_u1.if_extents == NULL ||
2249               ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2250        ASSERT(ifp->if_real_bytes == 0);
2251        if (whichfork == XFS_ATTR_FORK) {
2252                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2253                ip->i_afp = NULL;
2254        }
2255}
2256
2257/*
2258 * This is called to unpin an inode.  The caller must have the inode locked
2259 * in at least shared mode so that the buffer cannot be subsequently pinned
2260 * once someone is waiting for it to be unpinned.
2261 */
2262static void
2263xfs_iunpin(
2264        struct xfs_inode        *ip)
2265{
2266        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2267
2268        trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2269
2270        /* Give the log a push to start the unpinning I/O */
2271        xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2272
2273}
2274
2275static void
2276__xfs_iunpin_wait(
2277        struct xfs_inode        *ip)
2278{
2279        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2280        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2281
2282        xfs_iunpin(ip);
2283
2284        do {
2285                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2286                if (xfs_ipincount(ip))
2287                        io_schedule();
2288        } while (xfs_ipincount(ip));
2289        finish_wait(wq, &wait.wait);
2290}
2291
2292void
2293xfs_iunpin_wait(
2294        struct xfs_inode        *ip)
2295{
2296        if (xfs_ipincount(ip))
2297                __xfs_iunpin_wait(ip);
2298}
2299
2300/*
2301 * xfs_iextents_copy()
2302 *
2303 * This is called to copy the REAL extents (as opposed to the delayed
2304 * allocation extents) from the inode into the given buffer.  It
2305 * returns the number of bytes copied into the buffer.
2306 *
2307 * If there are no delayed allocation extents, then we can just
2308 * memcpy() the extents into the buffer.  Otherwise, we need to
2309 * examine each extent in turn and skip those which are delayed.
2310 */
2311int
2312xfs_iextents_copy(
2313        xfs_inode_t             *ip,
2314        xfs_bmbt_rec_t          *dp,
2315        int                     whichfork)
2316{
2317        int                     copied;
2318        int                     i;
2319        xfs_ifork_t             *ifp;
2320        int                     nrecs;
2321        xfs_fsblock_t           start_block;
2322
2323        ifp = XFS_IFORK_PTR(ip, whichfork);
2324        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2325        ASSERT(ifp->if_bytes > 0);
2326
2327        nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2328        XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2329        ASSERT(nrecs > 0);
2330
2331        /*
2332         * There may be delayed allocation extents in the
2333         * inode, so copy the extents one at a time and skip
2334         * the delayed ones.  There must be at least one
2335         * non-delayed extent.
2336         */
2337        copied = 0;
2338        for (i = 0; i < nrecs; i++) {
2339                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2340                start_block = xfs_bmbt_get_startblock(ep);
2341                if (isnullstartblock(start_block)) {
2342                        /*
2343                         * It's a delayed allocation extent, so skip it.
2344                         */
2345                        continue;
2346                }
2347
2348                /* Translate to on disk format */
2349                put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2350                put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2351                dp++;
2352                copied++;
2353        }
2354        ASSERT(copied != 0);
2355        xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2356
2357        return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2358}
2359
2360/*
2361 * Each of the following cases stores data into the same region
2362 * of the on-disk inode, so only one of them can be valid at
2363 * any given time. While it is possible to have conflicting formats
2364 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2365 * in EXTENTS format, this can only happen when the fork has
2366 * changed formats after being modified but before being flushed.
2367 * In these cases, the format always takes precedence, because the
2368 * format indicates the current state of the fork.
2369 */
2370/*ARGSUSED*/
2371STATIC void
2372xfs_iflush_fork(
2373        xfs_inode_t             *ip,
2374        xfs_dinode_t            *dip,
2375        xfs_inode_log_item_t    *iip,
2376        int                     whichfork,
2377        xfs_buf_t               *bp)
2378{
2379        char                    *cp;
2380        xfs_ifork_t             *ifp;
2381        xfs_mount_t             *mp;
2382#ifdef XFS_TRANS_DEBUG
2383        int                     first;
2384#endif
2385        static const short      brootflag[2] =
2386                { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2387        static const short      dataflag[2] =
2388                { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2389        static const short      extflag[2] =
2390                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2391
2392        if (!iip)
2393                return;
2394        ifp = XFS_IFORK_PTR(ip, whichfork);
2395        /*
2396         * This can happen if we gave up in iformat in an error path,
2397         * for the attribute fork.
2398         */
2399        if (!ifp) {
2400                ASSERT(whichfork == XFS_ATTR_FORK);
2401                return;
2402        }
2403        cp = XFS_DFORK_PTR(dip, whichfork);
2404        mp = ip->i_mount;
2405        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2406        case XFS_DINODE_FMT_LOCAL:
2407                if ((iip->ili_fields & dataflag[whichfork]) &&
2408                    (ifp->if_bytes > 0)) {
2409                        ASSERT(ifp->if_u1.if_data != NULL);
2410                        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2411                        memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2412                }
2413                break;
2414
2415        case XFS_DINODE_FMT_EXTENTS:
2416                ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2417                       !(iip->ili_fields & extflag[whichfork]));
2418                if ((iip->ili_fields & extflag[whichfork]) &&
2419                    (ifp->if_bytes > 0)) {
2420                        ASSERT(xfs_iext_get_ext(ifp, 0));
2421                        ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2422                        (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2423                                whichfork);
2424                }
2425                break;
2426
2427        case XFS_DINODE_FMT_BTREE:
2428                if ((iip->ili_fields & brootflag[whichfork]) &&
2429                    (ifp->if_broot_bytes > 0)) {
2430                        ASSERT(ifp->if_broot != NULL);
2431                        ASSERT(ifp->if_broot_bytes <=
2432                               (XFS_IFORK_SIZE(ip, whichfork) +
2433                                XFS_BROOT_SIZE_ADJ));
2434                        xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2435                                (xfs_bmdr_block_t *)cp,
2436                                XFS_DFORK_SIZE(dip, mp, whichfork));
2437                }
2438                break;
2439
2440        case XFS_DINODE_FMT_DEV:
2441                if (iip->ili_fields & XFS_ILOG_DEV) {
2442                        ASSERT(whichfork == XFS_DATA_FORK);
2443                        xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2444                }
2445                break;
2446
2447        case XFS_DINODE_FMT_UUID:
2448                if (iip->ili_fields & XFS_ILOG_UUID) {
2449                        ASSERT(whichfork == XFS_DATA_FORK);
2450                        memcpy(XFS_DFORK_DPTR(dip),
2451                               &ip->i_df.if_u2.if_uuid,
2452                               sizeof(uuid_t));
2453                }
2454                break;
2455
2456        default:
2457                ASSERT(0);
2458                break;
2459        }
2460}
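
/*
 * Illustrative example only (not part of the original file): the
 * precedence rule described above xfs_iflush_fork().  If a local
 * format directory converts to extents format after being logged but
 * before a flush, the log item can carry a stale XFS_ILOG_DDATA flag
 * alongside XFS_ILOG_DEXT; the switch keys off XFS_IFORK_FORMAT(),
 * which now reports EXTENTS, so only the extent copy-out runs and the
 * stale data flag is ignored.
 */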
2461
2462STATIC int
2463xfs_iflush_cluster(
2464        xfs_inode_t     *ip,
2465        xfs_buf_t       *bp)
2466{
2467        xfs_mount_t             *mp = ip->i_mount;
2468        struct xfs_perag        *pag;
2469        unsigned long           first_index, mask;
2470        unsigned long           inodes_per_cluster;
2471        int                     ilist_size;
2472        xfs_inode_t             **ilist;
2473        xfs_inode_t             *iq;
2474        int                     nr_found;
2475        int                     clcount = 0;
2476        int                     bufwasdelwri;
2477        int                     i;
2478
2479        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2480
2481        inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2482        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2483        ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2484        if (!ilist)
2485                goto out_put;
2486
2487        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2488        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2489        rcu_read_lock();
2490        /* really need a gang lookup range call here */
2491        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2492                                        first_index, inodes_per_cluster);
2493        if (nr_found == 0)
2494                goto out_free;
2495
2496        for (i = 0; i < nr_found; i++) {
2497                iq = ilist[i];
2498                if (iq == ip)
2499                        continue;
2500
2501                /*
2502                 * because this is an RCU protected lookup, we could find a
2503                 * recently freed or even reallocated inode during the lookup.
2504                 * We need to check under the i_flags_lock for a valid inode
2505                 * here. Skip it if it is not valid or the wrong inode.
2506                 */
2507                spin_lock(&iq->i_flags_lock);
2508                if (!iq->i_ino ||
2509                    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2510                        spin_unlock(&iq->i_flags_lock);
2511                        continue;
2512                }
2513                spin_unlock(&iq->i_flags_lock);
2514
2515                /*
2516                 * Do an un-protected check to see if the inode is dirty and
2517                 * is a candidate for flushing.  These checks will be repeated
2518                 * later after the appropriate locks are acquired.
2519                 */
2520                if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2521                        continue;
2522
2523                /*
2524                 * Try to get locks.  If any are unavailable or it is pinned,
2525                 * then this inode cannot be flushed and is skipped.
2526                 */
2527
2528                if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2529                        continue;
2530                if (!xfs_iflock_nowait(iq)) {
2531                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
2532                        continue;
2533                }
2534                if (xfs_ipincount(iq)) {
2535                        xfs_ifunlock(iq);
2536                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
2537                        continue;
2538                }
2539
2540                /*
2541                 * arriving here means that this inode can be flushed.  First
2542                 * re-check that it's dirty before flushing.
2543                 */
2544                if (!xfs_inode_clean(iq)) {
2545                        int     error;
2546                        error = xfs_iflush_int(iq, bp);
2547                        if (error) {
2548                                xfs_iunlock(iq, XFS_ILOCK_SHARED);
2549                                goto cluster_corrupt_out;
2550                        }
2551                        clcount++;
2552                } else {
2553                        xfs_ifunlock(iq);
2554                }
2555                xfs_iunlock(iq, XFS_ILOCK_SHARED);
2556        }
2557
2558        if (clcount) {
2559                XFS_STATS_INC(xs_icluster_flushcnt);
2560                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2561        }
2562
2563out_free:
2564        rcu_read_unlock();
2565        kmem_free(ilist);
2566out_put:
2567        xfs_perag_put(pag);
2568        return 0;
2569
2570
2571cluster_corrupt_out:
2572        /*
2573         * Corruption detected in the clustering loop.  Invalidate the
2574         * inode buffer and shut down the filesystem.
2575         */
2576        rcu_read_unlock();
2577        /*
2578         * Clean up the buffer.  If it was delwri, just release it --
2579         * brelse can handle it with no problems.  If not, shut down the
2580         * filesystem before releasing the buffer.
2581         */
2582        bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2583        if (bufwasdelwri)
2584                xfs_buf_relse(bp);
2585
2586        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2587
2588        if (!bufwasdelwri) {
2589                /*
2590                 * Just like incore_relse: if we have b_iodone functions,
2591                 * mark the buffer as an error and call them.  Otherwise
2592                 * mark it as stale and brelse.
2593                 */
2594                if (bp->b_iodone) {
2595                        XFS_BUF_UNDONE(bp);
2596                        xfs_buf_stale(bp);
2597                        xfs_buf_ioerror(bp, EIO);
2598                        xfs_buf_ioend(bp, 0);
2599                } else {
2600                        xfs_buf_stale(bp);
2601                        xfs_buf_relse(bp);
2602                }
2603        }
2604
2605        /*
2606         * Unlocks the flush lock
2607         */
2608        xfs_iflush_abort(iq, false);
2609        kmem_free(ilist);
2610        xfs_perag_put(pag);
2611        return XFS_ERROR(EFSCORRUPTED);
2612}
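
/*
 * Illustrative example only (not part of the original file): the
 * cluster mask arithmetic in xfs_iflush_cluster() above, with
 * hypothetical values of an 8k inode cluster and 256 byte inodes
 * (sb_inodelog == 8): inodes_per_cluster == 32 and
 * mask == ~(32 - 1), so aginos 0..31 share first_index 0,
 * aginos 32..63 share first_index 32, and so on.
 */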
2613
2614/*
2615 * Flush dirty inode metadata into the backing buffer.
2616 *
2617 * The caller must have the inode lock and the inode flush lock held.  The
2618 * inode lock will still be held upon return to the caller, and the inode
2619 * flush lock will be released after the inode has reached the disk.
2620 *
2621 * The caller must write out the buffer returned in *bpp and release it.
2622 */
2623int
2624xfs_iflush(
2625        struct xfs_inode        *ip,
2626        struct xfs_buf          **bpp)
2627{
2628        struct xfs_mount        *mp = ip->i_mount;
2629        struct xfs_buf          *bp;
2630        struct xfs_dinode       *dip;
2631        int                     error;
2632
2633        XFS_STATS_INC(xs_iflush_count);
2634
2635        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2636        ASSERT(xfs_isiflocked(ip));
2637        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2638               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2639
2640        *bpp = NULL;
2641
2642        xfs_iunpin_wait(ip);
2643
2644        /*
2645         * For stale inodes we cannot rely on the backing buffer remaining
2646         * stale in cache for the remaining life of the stale inode and so
2647         * xfs_imap_to_bp() below may give us a buffer that no longer contains
2648         * inodes below. We have to check this after ensuring the inode is
2649         * unpinned so that it is safe to reclaim the stale inode after the
2650         * flush call.
2651         */
2652        if (xfs_iflags_test(ip, XFS_ISTALE)) {
2653                xfs_ifunlock(ip);
2654                return 0;
2655        }
2656
2657        /*
2658         * This may have been unpinned because the filesystem is shutting
2659         * down forcibly. If that's the case we must not write this inode
2660         * to disk, because the log record didn't make it to disk.
2661         *
2662         * We also have to remove the log item from the AIL in this case,
2663         * as we wait for an empty AIL as part of the unmount process.
2664         */
2665        if (XFS_FORCED_SHUTDOWN(mp)) {
2666                error = XFS_ERROR(EIO);
2667                goto abort_out;
2668        }
2669
2670        /*
2671         * Get the buffer containing the on-disk inode.
2672         */
2673        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
2674                               0);
2675        if (error || !bp) {
2676                xfs_ifunlock(ip);
2677                return error;
2678        }
2679
2680        /*
2681         * First flush out the inode that xfs_iflush was called with.
2682         */
2683        error = xfs_iflush_int(ip, bp);
2684        if (error)
2685                goto corrupt_out;
2686
2687        /*
2688         * If the buffer is pinned then push on the log now so we won't
2689         * get stuck waiting in the write for too long.
2690         */
2691        if (xfs_buf_ispinned(bp))
2692                xfs_log_force(mp, 0);
2693
2694        /*
2695         * inode clustering:
2696         * see if other inodes can be gathered into this write
2697         */
2698        error = xfs_iflush_cluster(ip, bp);
2699        if (error)
2700                goto cluster_corrupt_out;
2701
2702        *bpp = bp;
2703        return 0;
2704
2705corrupt_out:
2706        xfs_buf_relse(bp);
2707        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2708cluster_corrupt_out:
2709        error = XFS_ERROR(EFSCORRUPTED);
2710abort_out:
2711        /*
2712         * Unlocks the flush lock
2713         */
2714        xfs_iflush_abort(ip, false);
2715        return error;
2716}
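/*
 * Editorial sketch (not part of the original source): one way a caller
 * can drive xfs_iflush() under the contract documented above -- take the
 * inode lock and the flush lock, flush, then write out and release the
 * returned buffer.  The function name is hypothetical and error handling
 * is minimal.
 */
STATIC int
xfs_iflush_example(
        struct xfs_inode        *ip)
{
        struct xfs_buf          *bp;
        int                     error;

        xfs_ilock(ip, XFS_ILOCK_SHARED);
        xfs_iflock(ip);                 /* inode flush lock */
        error = xfs_iflush(ip, &bp);    /* releases the flush lock on error */
        if (!error && bp) {             /* bp stays NULL for stale inodes */
                error = xfs_bwrite(bp); /* caller writes out *bpp... */
                xfs_buf_relse(bp);      /* ...and releases it */
        }
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
}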
2717
2718
2719STATIC int
2720xfs_iflush_int(
2721        xfs_inode_t             *ip,
2722        xfs_buf_t               *bp)
2723{
2724        xfs_inode_log_item_t    *iip;
2725        xfs_dinode_t            *dip;
2726        xfs_mount_t             *mp;
2727#ifdef XFS_TRANS_DEBUG
2728        int                     first;
2729#endif
2730
2731        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2732        ASSERT(xfs_isiflocked(ip));
2733        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2734               ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2735
2736        iip = ip->i_itemp;
2737        mp = ip->i_mount;
2738
2739        /* set *dip = inode's place in the buffer */
2740        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
2741
2742        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2743                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2744                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2745                        "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2746                        __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2747                goto corrupt_out;
2748        }
2749        if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2750                                mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2751                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2752                        "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2753                        __func__, ip->i_ino, ip, ip->i_d.di_magic);
2754                goto corrupt_out;
2755        }
2756        if (S_ISREG(ip->i_d.di_mode)) {
2757                if (XFS_TEST_ERROR(
2758                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2759                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2760                    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2761                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2762                                "%s: Bad regular inode %Lu, ptr 0x%p",
2763                                __func__, ip->i_ino, ip);
2764                        goto corrupt_out;
2765                }
2766        } else if (S_ISDIR(ip->i_d.di_mode)) {
2767                if (XFS_TEST_ERROR(
2768                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2769                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2770                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2771                    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2772                        xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2773                                "%s: Bad directory inode %Lu, ptr 0x%p",
2774                                __func__, ip->i_ino, ip);
2775                        goto corrupt_out;
2776                }
2777        }
2778        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2779                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2780                                XFS_RANDOM_IFLUSH_5)) {
2781                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2782                        "%s: detected corrupt incore inode %Lu, "
2783                        "total extents = %d, nblocks = %Ld, ptr 0x%p",
2784                        __func__, ip->i_ino,
2785                        ip->i_d.di_nextents + ip->i_d.di_anextents,
2786                        ip->i_d.di_nblocks, ip);
2787                goto corrupt_out;
2788        }
2789        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2790                                mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2791                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2792                        "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2793                        __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2794                goto corrupt_out;
2795        }
2796        /*
2797         * bump the flush iteration count, used to detect flushes which
2798         * postdate a log record during recovery.
2799         */
2800
2801        ip->i_d.di_flushiter++;
2802
2803        /*
2804         * Copy the dirty parts of the inode into the on-disk
2805         * inode.  We always copy out the core of the inode,
2806         * because if the inode is dirty at all the core must
2807         * be.
2808         */
2809        xfs_dinode_to_disk(dip, &ip->i_d);
2810
2811        /* Wrap, we never let the log put out DI_MAX_FLUSH */
2812        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
2813                ip->i_d.di_flushiter = 0;
2814
2815        /*
2816         * If this is really an old format inode and the superblock version
2817         * has not been updated to support only new format inodes, then
2818         * convert back to the old inode format.  If the superblock version
2819         * has been updated, then make the conversion permanent.
2820         */
2821        ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
2822        if (ip->i_d.di_version == 1) {
2823                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
2824                        /*
2825                         * Convert it back.
2826                         */
2827                        ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
2828                        dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
2829                } else {
2830                        /*
2831                         * The superblock version has already been bumped,
2832                         * so just make the conversion to the new inode
2833                         * format permanent.
2834                         */
2835                        ip->i_d.di_version = 2;
2836                        dip->di_version = 2;
2837                        ip->i_d.di_onlink = 0;
2838                        dip->di_onlink = 0;
2839                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
2840                        memset(&(dip->di_pad[0]), 0,
2841                              sizeof(dip->di_pad));
2842                        ASSERT(xfs_get_projid(ip) == 0);
2843                }
2844        }
2845
2846        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
2847        if (XFS_IFORK_Q(ip))
2848                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
2849        xfs_inobp_check(mp, bp);
2850
2851        /*
2852         * We've recorded everything logged in the inode, so we'd like to clear
2853         * the ili_fields bits so we don't log and flush things unnecessarily.
2854         * However, we can't stop logging all this information until the data
2855         * we've copied into the disk buffer is written to disk.  If we did we
2856         * might overwrite the copy of the inode in the log with all the data
2857         * after re-logging only part of it, and in the face of a crash we
2858         * wouldn't have all the data we need to recover.
2859         *
2860         * What we do is move the bits to the ili_last_fields field.  When
2861         * logging the inode, these bits are moved back to the ili_fields field.
2862         * In the xfs_iflush_done() routine we clear ili_last_fields, since we
2863         * know that the information those bits represent is permanently on
2864         * disk.  As long as the flush completes before the inode is logged
2865         * again, then both ili_fields and ili_last_fields will be cleared.
2866         *
2867         * We can play with the ili_fields bits here, because the inode lock
2868         * must be held exclusively in order to set bits there and the flush
2869         * lock protects the ili_last_fields bits.  Set ili_logged so the flush
2870         * done routine can tell whether or not to look in the AIL.  Also, store
2871         * the current LSN of the inode so that we can tell whether the item has
2872         * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
2873         * need the AIL lock, because it is a 64 bit value that cannot be read
2874         * atomically.
2875         */
2876        if (iip != NULL && iip->ili_fields != 0) {
2877                iip->ili_last_fields = iip->ili_fields;
2878                iip->ili_fields = 0;
2879                iip->ili_logged = 1;
2880
2881                xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2882                                        &iip->ili_item.li_lsn);
2883
2884                /*
2885                 * Attach the function xfs_iflush_done to the inode's
2886                 * buffer.  This will remove the inode from the AIL
2887                 * and unlock the inode's flush lock when the inode is
2888                 * completely written to disk.
2889                 */
2890                xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
2891
2892                ASSERT(bp->b_fspriv != NULL);
2893                ASSERT(bp->b_iodone != NULL);
2894        } else {
2895                /*
2896                 * We're flushing an inode which is not in the AIL and has
2897                 * not been logged.  For this case we can immediately drop
2898                 * the inode flush lock because we can avoid the whole
2899                 * AIL state thing.  It's OK to drop the flush lock now,
2900                 * because we've already locked the buffer and to do anything
2901                 * you really need both.
2902                 */
2903                if (iip != NULL) {
2904                        ASSERT(iip->ili_logged == 0);
2905                        ASSERT(iip->ili_last_fields == 0);
2906                        ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
2907                }
2908                xfs_ifunlock(ip);
2909        }
2910
2911        return 0;
2912
2913corrupt_out:
2914        return XFS_ERROR(EFSCORRUPTED);
2915}
2916
2917/*
2918 * Return a pointer to the extent record at file index idx.
2919 */
2920xfs_bmbt_rec_host_t *
2921xfs_iext_get_ext(
2922        xfs_ifork_t     *ifp,           /* inode fork pointer */
2923        xfs_extnum_t    idx)            /* index of target extent */
2924{
2925        ASSERT(idx >= 0);
2926        ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
2927
2928        if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
2929                return ifp->if_u1.if_ext_irec->er_extbuf;
2930        } else if (ifp->if_flags & XFS_IFEXTIREC) {
2931                xfs_ext_irec_t  *erp;           /* irec pointer */
2932                int             erp_idx = 0;    /* irec index */
2933                xfs_extnum_t    page_idx = idx; /* ext index in target list */
2934
2935                erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
2936                return &erp->er_extbuf[page_idx];
2937        } else if (ifp->if_bytes) {
2938                return &ifp->if_u1.if_extents[idx];
2939        } else {
2940                return NULL;
2941        }
2942}
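/*
 * Editorial sketch (not part of the original source): the common pattern
 * for walking every incore extent record via xfs_iext_get_ext(), as the
 * callers in xfs_bmap.c do.  The function name is hypothetical.
 */
STATIC void
xfs_iext_walk_example(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        xfs_bmbt_irec_t rec;            /* unpacked extent record */
        xfs_extnum_t    nextents;       /* number of extents in fork */
        xfs_extnum_t    idx;            /* current extent index */

        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        for (idx = 0; idx < nextents; idx++) {
                /* expand the packed record into host-endian form */
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &rec);
                /* use rec.br_startoff/br_startblock/br_blockcount here */
        }
}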
2943
2944/*
2945 * Insert new item(s) into the extent records for incore inode
2946 * fork 'ifp'.  'count' new items are inserted at index 'idx'.
2947 */
2948void
2949xfs_iext_insert(
2950        xfs_inode_t     *ip,            /* incore inode pointer */
2951        xfs_extnum_t    idx,            /* starting index of new items */
2952        xfs_extnum_t    count,          /* number of inserted items */
2953        xfs_bmbt_irec_t *new,           /* items to insert */
2954        int             state)          /* type of extent conversion */
2955{
2956        xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
2957        xfs_extnum_t    i;              /* extent record index */
2958
2959        trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
2960
2961        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2962        xfs_iext_add(ifp, idx, count);
2963        for (i = idx; i < idx + count; i++, new++)
2964                xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
2965}
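/*
 * Editorial sketch (not part of the original source): inserting a single
 * new mapping into the data fork at index idx.  Passing BMAP_ATTRFORK in
 * 'state' would target the attribute fork instead.  The function name is
 * hypothetical.
 */
STATIC void
xfs_iext_insert_one_example(
        xfs_inode_t     *ip,            /* incore inode pointer */
        xfs_extnum_t    idx,            /* index for the new item */
        xfs_bmbt_irec_t *new)           /* item to insert */
{
        /* a 'state' without BMAP_ATTRFORK selects the data fork */
        xfs_iext_insert(ip, idx, 1, new, 0);
}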
2966
2967/*
2968 * This is called when the amount of space required for incore file
2969 * extents needs to be increased. The ext_diff parameter stores the
2970 * number of new extents being added and the idx parameter contains
2971 * the extent index where the new extents will be added. If the new
2972 * extents are being appended, then we just need to (re)allocate and
2973 * initialize the space. Otherwise, if the new extents are being
2974 * inserted into the middle of the existing entries, a bit more work
2975 * is required to make room for the new extents to be inserted. The
2976 * caller is responsible for filling in the new extent entries upon
2977 * return.
2978 */
2979void
2980xfs_iext_add(
2981        xfs_ifork_t     *ifp,           /* inode fork pointer */
2982        xfs_extnum_t    idx,            /* index to begin adding exts */
2983        int             ext_diff)       /* number of extents to add */
2984{
2985        int             byte_diff;      /* new bytes being added */
2986        int             new_size;       /* size of extents after adding */
2987        xfs_extnum_t    nextents;       /* number of extents in file */
2988
2989        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2990        ASSERT((idx >= 0) && (idx <= nextents));
2991        byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
2992        new_size = ifp->if_bytes + byte_diff;
2993        /*
2994         * If the new number of extents (nextents + ext_diff)
2995         * fits inside the inode, then continue to use the inline
2996         * extent buffer.
2997         */
2998        if (nextents + ext_diff <= XFS_INLINE_EXTS) {
2999                if (idx < nextents) {
3000                        memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
3001                                &ifp->if_u2.if_inline_ext[idx],
3002                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3003                        memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
3004                }
3005                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3006                ifp->if_real_bytes = 0;
3007        }
3008        /*
3009         * Otherwise use a linear (direct) extent list.
3010         * If the extents are currently inside the inode,
3011         * xfs_iext_realloc_direct will switch us from
3012         * inline to direct extent allocation mode.
3013         */
3014        else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
3015                xfs_iext_realloc_direct(ifp, new_size);
3016                if (idx < nextents) {
3017                        memmove(&ifp->if_u1.if_extents[idx + ext_diff],
3018                                &ifp->if_u1.if_extents[idx],
3019                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3020                        memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
3021                }
3022        }
3023        /* Indirection array */
3024        else {
3025                xfs_ext_irec_t  *erp;
3026                int             erp_idx = 0;
3027                int             page_idx = idx;
3028
3029                ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
3030                if (ifp->if_flags & XFS_IFEXTIREC) {
3031                        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
3032                } else {
3033                        xfs_iext_irec_init(ifp);
3034                        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3035                        erp = ifp->if_u1.if_ext_irec;
3036                }
3037                /* Extents fit in target extent page */
3038                if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3039                        if (page_idx < erp->er_extcount) {
3040                                memmove(&erp->er_extbuf[page_idx + ext_diff],
3041                                        &erp->er_extbuf[page_idx],
3042                                        (erp->er_extcount - page_idx) *
3043                                        sizeof(xfs_bmbt_rec_t));
3044                                memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3045                        }
3046                        erp->er_extcount += ext_diff;
3047                        xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3048                }
3049                /* Insert a new extent page */
3050                else if (erp) {
3051                        xfs_iext_add_indirect_multi(ifp,
3052                                erp_idx, page_idx, ext_diff);
3053                }
3054                /*
3055                 * If extent(s) are being appended to the last page in
3056                 * the indirection array and the new extent(s) don't fit
3057                 * in the page, then erp is NULL and erp_idx is set to
3058                 * the next index needed in the indirection array.
3059                 */
3060                else {
3061                        int     count = ext_diff;
3062
3063                        while (count) {
3064                                erp = xfs_iext_irec_new(ifp, erp_idx);
3065                                erp->er_extcount = count;
3066                                count -= MIN(count, (int)XFS_LINEAR_EXTS);
3067                                if (count) {
3068                                        erp_idx++;
3069                                }
3070                        }
3071                }
3072        }
3073        ifp->if_bytes = new_size;
3074}
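/*
 * Editorial sketch (not part of the original source): the three storage
 * tiers xfs_iext_add() chooses between, written out as a helper.  The
 * function name and return encoding are illustrative only.
 */
STATIC int
xfs_iext_tier_example(
        xfs_extnum_t    nextents)       /* extents after the addition */
{
        if (nextents <= XFS_INLINE_EXTS)
                return 0;               /* inline buffer inside the fork */
        if (nextents <= XFS_LINEAR_EXTS)
                return 1;               /* one direct (linear) buffer */
        return 2;                       /* indirection array of pages */
}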
3075
3076/*
3077 * This is called when incore extents are being added to the indirection
3078 * array and the new extents do not fit in the target extent list. The
3079 * erp_idx parameter contains the irec index for the target extent list
3080 * in the indirection array, and the idx parameter contains the extent
3081 * index within the list. The number of extents being added is stored
3082 * in the count parameter.
3083 *
3084 *    |-------|   |-------|
3085 *    |       |   |       |    idx - number of extents before idx
3086 *    |  idx  |   | count |
3087 *    |       |   |       |    count - number of extents being inserted at idx
3088 *    |-------|   |-------|
3089 *    | count |   | nex2  |    nex2 - number of extents after idx + count
3090 *    |-------|   |-------|
3091 */
3092void
3093xfs_iext_add_indirect_multi(
3094        xfs_ifork_t     *ifp,                   /* inode fork pointer */
3095        int             erp_idx,                /* target extent irec index */
3096        xfs_extnum_t    idx,                    /* index within target list */
3097        int             count)                  /* new extents being added */
3098{
3099        int             byte_diff;              /* new bytes being added */
3100        xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
3101        xfs_extnum_t    ext_diff;               /* number of extents to add */
3102        xfs_extnum_t    ext_cnt;                /* new extents still needed */
3103        xfs_extnum_t    nex2;                   /* extents after idx + count */
3104        xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
3105        int             nlists;                 /* number of irec's (lists) */
3106
3107        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3108        erp = &ifp->if_u1.if_ext_irec[erp_idx];
3109        nex2 = erp->er_extcount - idx;
3110        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3111
3112        /*
3113         * Save the second part of the target extent list
3114         * (all extents from index idx onward).  */
3115        if (nex2) {
3116                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3117                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3118                memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3119                erp->er_extcount -= nex2;
3120                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3121                memset(&erp->er_extbuf[idx], 0, byte_diff);
3122        }
3123
3124        /*
3125         * Add the new extents to the end of the target
3126         * list, then allocate new irec record(s) and
3127         * extent buffer(s) as needed to store the rest
3128         * of the new extents.
3129         */
3130        ext_cnt = count;
3131        ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3132        if (ext_diff) {
3133                erp->er_extcount += ext_diff;
3134                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3135                ext_cnt -= ext_diff;
3136        }
3137        while (ext_cnt) {
3138                erp_idx++;
3139                erp = xfs_iext_irec_new(ifp, erp_idx);
3140                ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3141                erp->er_extcount = ext_diff;
3142                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3143                ext_cnt -= ext_diff;
3144        }
3145
3146        /* Add nex2 extents back to indirection array */
3147        if (nex2) {
3148                xfs_extnum_t    ext_avail;
3149                int             i;
3150
3151                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3152                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3153                i = 0;
3154                /*
3155                 * If nex2 extents fit in the current page, append
3156                 * nex2_ep after the new extents.
3157                 */
3158                if (nex2 <= ext_avail) {
3159                        i = erp->er_extcount;
3160                }
3161                /*
3162                 * Otherwise, check if space is available in the
3163                 * next page.
3164                 */
3165                else if ((erp_idx < nlists - 1) &&
3166                         (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3167                          ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3168                        erp_idx++;
3169                        erp++;
3170                        /* Create a hole for nex2 extents */
3171                        memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3172                                erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3173                }
3174                /*
3175                 * Final choice, create a new extent page for
3176                 * nex2 extents.
3177                 */
3178                else {
3179                        erp_idx++;
3180                        erp = xfs_iext_irec_new(ifp, erp_idx);
3181                }
3182                memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3183                kmem_free(nex2_ep);
3184                erp->er_extcount += nex2;
3185                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3186        }
3187}
3188
3189/*
3190 * This is called when the amount of space required for incore file
3191 * extents needs to be decreased. The ext_diff parameter stores the
3192 * number of extents to be removed and the idx parameter contains
3193 * the extent index where the extents will be removed from.
3194 *
3195 * If the amount of space needed has decreased below the linear
3196 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3197 * extent array.  Otherwise, use kmem_realloc() to adjust the
3198 * size to what is needed.
3199 */
3200void
3201xfs_iext_remove(
3202        xfs_inode_t     *ip,            /* incore inode pointer */
3203        xfs_extnum_t    idx,            /* index to begin removing exts */
3204        int             ext_diff,       /* number of extents to remove */
3205        int             state)          /* type of extent conversion */
3206{
3207        xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3208        xfs_extnum_t    nextents;       /* number of extents in file */
3209        int             new_size;       /* size of extents after removal */
3210
3211        trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3212
3213        ASSERT(ext_diff > 0);
3214        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3215        new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3216
3217        if (new_size == 0) {
3218                xfs_iext_destroy(ifp);
3219        } else if (ifp->if_flags & XFS_IFEXTIREC) {
3220                xfs_iext_remove_indirect(ifp, idx, ext_diff);
3221        } else if (ifp->if_real_bytes) {
3222                xfs_iext_remove_direct(ifp, idx, ext_diff);
3223        } else {
3224                xfs_iext_remove_inline(ifp, idx, ext_diff);
3225        }
3226        ifp->if_bytes = new_size;
3227}
3228
3229/*
3230 * This removes ext_diff extents from the inline buffer, beginning
3231 * at extent index idx.
3232 */
3233void
3234xfs_iext_remove_inline(
3235        xfs_ifork_t     *ifp,           /* inode fork pointer */
3236        xfs_extnum_t    idx,            /* index to begin removing exts */
3237        int             ext_diff)       /* number of extents to remove */
3238{
3239        int             nextents;       /* number of extents in file */
3240
3241        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3242        ASSERT(idx < XFS_INLINE_EXTS);
3243        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3244        ASSERT(((nextents - ext_diff) > 0) &&
3245                (nextents - ext_diff) < XFS_INLINE_EXTS);
3246
3247        if (idx + ext_diff < nextents) {
3248                memmove(&ifp->if_u2.if_inline_ext[idx],
3249                        &ifp->if_u2.if_inline_ext[idx + ext_diff],
3250                        (nextents - (idx + ext_diff)) *
3251                         sizeof(xfs_bmbt_rec_t));
3252                memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3253                        0, ext_diff * sizeof(xfs_bmbt_rec_t));
3254        } else {
3255                memset(&ifp->if_u2.if_inline_ext[idx], 0,
3256                        ext_diff * sizeof(xfs_bmbt_rec_t));
3257        }
3258}
3259
3260/*
3261 * This removes ext_diff extents from a linear (direct) extent list,
3262 * beginning at extent index idx. If the extents are being removed
3263 * from the end of the list (i.e. truncate) then we just need to re-
3264 * allocate the list to remove the extra space. Otherwise, if the
3265 * extents are being removed from the middle of the existing extent
3266 * entries, then we first need to move the extent records beginning
3267 * at idx + ext_diff up in the list to overwrite the records being
3268 * removed, then remove the extra space via kmem_realloc.
3269 */
3270void
3271xfs_iext_remove_direct(
3272        xfs_ifork_t     *ifp,           /* inode fork pointer */
3273        xfs_extnum_t    idx,            /* index to begin removing exts */
3274        int             ext_diff)       /* number of extents to remove */
3275{
3276        xfs_extnum_t    nextents;       /* number of extents in file */
3277        int             new_size;       /* size of extents after removal */
3278
3279        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3280        new_size = ifp->if_bytes -
3281                (ext_diff * sizeof(xfs_bmbt_rec_t));
3282        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3283
3284        if (new_size == 0) {
3285                xfs_iext_destroy(ifp);
3286                return;
3287        }
3288        /* Move extents up in the list (if needed) */
3289        if (idx + ext_diff < nextents) {
3290                memmove(&ifp->if_u1.if_extents[idx],
3291                        &ifp->if_u1.if_extents[idx + ext_diff],
3292                        (nextents - (idx + ext_diff)) *
3293                         sizeof(xfs_bmbt_rec_t));
3294        }
3295        memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3296                0, ext_diff * sizeof(xfs_bmbt_rec_t));
3297        /*
3298         * Reallocate the direct extent list. If the extents
3299         * will fit inside the inode then xfs_iext_realloc_direct
3300         * will switch from direct to inline extent allocation
3301         * mode for us.
3302         */
3303        xfs_iext_realloc_direct(ifp, new_size);
3304        ifp->if_bytes = new_size;
3305}
3306
3307/*
3308 * This is called when incore extents are being removed from the
3309 * indirection array and the extents being removed span multiple extent
3310 * buffers. The idx parameter contains the file extent index where we
3311 * want to begin removing extents, and the count parameter contains
3312 * how many extents need to be removed.
3313 *
3314 *    |-------|   |-------|
3315 *    | nex1  |   |       |    nex1 - number of extents before idx
3316 *    |-------|   | count |
3317 *    |       |   |       |    count - number of extents being removed at idx
3318 *    | count |   |-------|
3319 *    |       |   | nex2  |    nex2 - number of extents after idx + count
3320 *    |-------|   |-------|
3321 */
3322void
3323xfs_iext_remove_indirect(
3324        xfs_ifork_t     *ifp,           /* inode fork pointer */
3325        xfs_extnum_t    idx,            /* index to begin removing extents */
3326        int             count)          /* number of extents to remove */
3327{
3328        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3329        int             erp_idx = 0;    /* indirection array index */
3330        xfs_extnum_t    ext_cnt;        /* extents left to remove */
3331        xfs_extnum_t    ext_diff;       /* extents to remove in current list */
3332        xfs_extnum_t    nex1;           /* number of extents before idx */
3333        xfs_extnum_t    nex2;           /* extents after idx + count */
3334        int             page_idx = idx; /* index in target extent list */
3335
3336        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3337        erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
3338        ASSERT(erp != NULL);
3339        nex1 = page_idx;
3340        ext_cnt = count;
3341        while (ext_cnt) {
3342                nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3343                ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3344                /*
3345                 * Check for deletion of entire list;
3346                 * xfs_iext_irec_remove() updates extent offsets.
3347                 */
3348                if (ext_diff == erp->er_extcount) {
3349                        xfs_iext_irec_remove(ifp, erp_idx);
3350                        ext_cnt -= ext_diff;
3351                        nex1 = 0;
3352                        if (ext_cnt) {
3353                                ASSERT(erp_idx < ifp->if_real_bytes /
3354                                        XFS_IEXT_BUFSZ);
3355                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3356                                nex1 = 0;
3357                                continue;
3358                        } else {
3359                                break;
3360                        }
3361                }
3362                /* Move extents up (if needed) */
3363                if (nex2) {
3364                        memmove(&erp->er_extbuf[nex1],
3365                                &erp->er_extbuf[nex1 + ext_diff],
3366                                nex2 * sizeof(xfs_bmbt_rec_t));
3367                }
3368                /* Zero out rest of page */
3369                memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3370                        ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3371                /* Update remaining counters */
3372                erp->er_extcount -= ext_diff;
3373                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3374                ext_cnt -= ext_diff;
3375                nex1 = 0;
3376                erp_idx++;
3377                erp++;
3378        }
3379        ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3380        xfs_iext_irec_compact(ifp);
3381}
3382
3383/*
3384 * Create, destroy, or resize a linear (direct) block of extents.
3385 */
3386void
3387xfs_iext_realloc_direct(
3388        xfs_ifork_t     *ifp,           /* inode fork pointer */
3389        int             new_size)       /* new size of extents */
3390{
3391        int             rnew_size;      /* real new size of extents */
3392
3393        rnew_size = new_size;
3394
3395        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3396                ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3397                 (new_size != ifp->if_real_bytes)));
3398
3399        /* Free extent records */
3400        if (new_size == 0) {
3401                xfs_iext_destroy(ifp);
3402        }
3403        /* Resize direct extent list and zero any new bytes */
3404        else if (ifp->if_real_bytes) {
3405                /* Check if extents will fit inside the inode */
3406                if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3407                        xfs_iext_direct_to_inline(ifp, new_size /
3408                                (uint)sizeof(xfs_bmbt_rec_t));
3409                        ifp->if_bytes = new_size;
3410                        return;
3411                }
3412                if (!is_power_of_2(new_size)) {
3413                        rnew_size = roundup_pow_of_two(new_size);
3414                }
3415                if (rnew_size != ifp->if_real_bytes) {
3416                        ifp->if_u1.if_extents =
3417                                kmem_realloc(ifp->if_u1.if_extents,
3418                                                rnew_size,
3419                                                ifp->if_real_bytes, KM_NOFS);
3420                }
3421                if (rnew_size > ifp->if_real_bytes) {
3422                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3423                                (uint)sizeof(xfs_bmbt_rec_t)], 0,
3424                                rnew_size - ifp->if_real_bytes);
3425                }
3426        }
3427        /*
3428         * Switch from the inline extent buffer to a direct
3429         * extent list. Be sure to include the inline extent
3430         * bytes in new_size.
3431         */
3432        else {
3433                new_size += ifp->if_bytes;
3434                if (!is_power_of_2(new_size)) {
3435                        rnew_size = roundup_pow_of_two(new_size);
3436                }
3437                xfs_iext_inline_to_direct(ifp, rnew_size);
3438        }
3439        ifp->if_real_bytes = rnew_size;
3440        ifp->if_bytes = new_size;
3441}
3442
3443/*
3444 * Switch from linear (direct) extent records to inline buffer.
3445 */
3446void
3447xfs_iext_direct_to_inline(
3448        xfs_ifork_t     *ifp,           /* inode fork pointer */
3449        xfs_extnum_t    nextents)       /* number of extents in file */
3450{
3451        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3452        ASSERT(nextents <= XFS_INLINE_EXTS);
3453        /*
3454         * The inline buffer was zeroed when we switched
3455         * from inline to direct extent allocation mode,
3456         * so we don't need to clear it here.
3457         */
3458        memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3459                nextents * sizeof(xfs_bmbt_rec_t));
3460        kmem_free(ifp->if_u1.if_extents);
3461        ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3462        ifp->if_real_bytes = 0;
3463}
3464
3465/*
3466 * Switch from inline buffer to linear (direct) extent records.
3467 * new_size should already be rounded up to the next power of 2
3468 * by the caller (when appropriate), so use new_size as it is.
3469 * However, since new_size may be rounded up, we can't update
3470 * if_bytes here. It is the caller's responsibility to update
3471 * if_bytes upon return.
3472 */
3473void
3474xfs_iext_inline_to_direct(
3475        xfs_ifork_t     *ifp,           /* inode fork pointer */
3476        int             new_size)       /* number of extents in file */
3477{
3478        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3479        memset(ifp->if_u1.if_extents, 0, new_size);
3480        if (ifp->if_bytes) {
3481                memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3482                        ifp->if_bytes);
3483                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3484                        sizeof(xfs_bmbt_rec_t));
3485        }
3486        ifp->if_real_bytes = new_size;
3487}
3488
3489/*
3490 * Resize an extent indirection array to new_size bytes.
3491 */
3492STATIC void
3493xfs_iext_realloc_indirect(
3494        xfs_ifork_t     *ifp,           /* inode fork pointer */
3495        int             new_size)       /* new indirection array size */
3496{
3497        int             nlists;         /* number of irec's (ex lists) */
3498        int             size;           /* current indirection array size */
3499
3500        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3501        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3502        size = nlists * sizeof(xfs_ext_irec_t);
3503        ASSERT(ifp->if_real_bytes);
3504        ASSERT((new_size >= 0) && (new_size != size));
3505        if (new_size == 0) {
3506                xfs_iext_destroy(ifp);
3507        } else {
3508                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
3509                        kmem_realloc(ifp->if_u1.if_ext_irec,
3510                                new_size, size, KM_NOFS);
3511        }
3512}
3513
3514/*
3515 * Switch from indirection array to linear (direct) extent allocations.
3516 */
3517STATIC void
3518xfs_iext_indirect_to_direct(
3519         xfs_ifork_t    *ifp)           /* inode fork pointer */
3520{
3521        xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
3522        xfs_extnum_t    nextents;       /* number of extents in file */
3523        int             size;           /* size of file extents */
3524
3525        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3526        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3527        ASSERT(nextents <= XFS_LINEAR_EXTS);
3528        size = nextents * sizeof(xfs_bmbt_rec_t);
3529
3530        xfs_iext_irec_compact_pages(ifp);
3531        ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
3532
3533        ep = ifp->if_u1.if_ext_irec->er_extbuf;
3534        kmem_free(ifp->if_u1.if_ext_irec);
3535        ifp->if_flags &= ~XFS_IFEXTIREC;
3536        ifp->if_u1.if_extents = ep;
3537        ifp->if_bytes = size;
3538        if (nextents < XFS_LINEAR_EXTS) {
3539                xfs_iext_realloc_direct(ifp, size);
3540        }
3541}
3542
3543/*
3544 * Free incore file extents.
3545 */
3546void
3547xfs_iext_destroy(
3548        xfs_ifork_t     *ifp)           /* inode fork pointer */
3549{
3550        if (ifp->if_flags & XFS_IFEXTIREC) {
3551                int     erp_idx;
3552                int     nlists;
3553
3554                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3555                for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
3556                        xfs_iext_irec_remove(ifp, erp_idx);
3557                }
3558                ifp->if_flags &= ~XFS_IFEXTIREC;
3559        } else if (ifp->if_real_bytes) {
3560                kmem_free(ifp->if_u1.if_extents);
3561        } else if (ifp->if_bytes) {
3562                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3563                        sizeof(xfs_bmbt_rec_t));
3564        }
3565        ifp->if_u1.if_extents = NULL;
3566        ifp->if_real_bytes = 0;
3567        ifp->if_bytes = 0;
3568}
3569
3570/*
3571 * Return a pointer to the extent record for file system block bno.
3572 */
3573xfs_bmbt_rec_host_t *                   /* pointer to found extent record */
3574xfs_iext_bno_to_ext(
3575        xfs_ifork_t     *ifp,           /* inode fork pointer */
3576        xfs_fileoff_t   bno,            /* block number to search for */
3577        xfs_extnum_t    *idxp)          /* index of target extent */
3578{
3579        xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
3580        xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
3581        xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
3582        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3583        int             high;           /* upper boundary in search */
3584        xfs_extnum_t    idx = 0;        /* index of target extent */
3585        int             low;            /* lower boundary in search */
3586        xfs_extnum_t    nextents;       /* number of file extents */
3587        xfs_fileoff_t   startoff = 0;   /* start offset of extent */
3588
3589        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3590        if (nextents == 0) {
3591                *idxp = 0;
3592                return NULL;
3593        }
3594        low = 0;
3595        if (ifp->if_flags & XFS_IFEXTIREC) {
3596                /* Find target extent list */
3597                int     erp_idx = 0;
3598                erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
3599                base = erp->er_extbuf;
3600                high = erp->er_extcount - 1;
3601        } else {
3602                base = ifp->if_u1.if_extents;
3603                high = nextents - 1;
3604        }
3605        /* Binary search extent records */
3606        while (low <= high) {
3607                idx = (low + high) >> 1;
3608                ep = base + idx;
3609                startoff = xfs_bmbt_get_startoff(ep);
3610                blockcount = xfs_bmbt_get_blockcount(ep);
3611                if (bno < startoff) {
3612                        high = idx - 1;
3613                } else if (bno >= startoff + blockcount) {
3614                        low = idx + 1;
3615                } else {
3616                        /* Convert back to file-based extent index */
3617                        if (ifp->if_flags & XFS_IFEXTIREC) {
3618                                idx += erp->er_extoff;
3619                        }
3620                        *idxp = idx;
3621                        return ep;
3622                }
3623        }
3624        /* Convert back to file-based extent index */
3625        if (ifp->if_flags & XFS_IFEXTIREC) {
3626                idx += erp->er_extoff;
3627        }
3628        if (bno >= startoff + blockcount) {
3629                if (++idx == nextents) {
3630                        ep = NULL;
3631                } else {
3632                        ep = xfs_iext_get_ext(ifp, idx);
3633                }
3634        }
3635        *idxp = idx;
3636        return ep;
3637}
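/*
 * Editorial sketch (not part of the original source): using
 * xfs_iext_bno_to_ext() to test whether file block bno is covered by an
 * extent.  A NULL return means bno lies beyond the last extent; a
 * non-NULL return may still be the extent *after* a hole containing bno,
 * hence the range check.  The function name is hypothetical.
 */
STATIC bool
xfs_iext_bno_mapped_example(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_fileoff_t   bno)            /* block number to test */
{
        xfs_extnum_t            idx;
        xfs_bmbt_rec_host_t     *ep;

        ep = xfs_iext_bno_to_ext(ifp, bno, &idx);
        return ep != NULL &&
               bno >= xfs_bmbt_get_startoff(ep) &&
               bno <  xfs_bmbt_get_startoff(ep) +
                      xfs_bmbt_get_blockcount(ep);
}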
3638
3639/*
3640 * Return a pointer to the indirection array entry containing the
3641 * extent record for filesystem block bno. Store the index of the
3642 * target irec in *erp_idxp.
3643 */
3644xfs_ext_irec_t *                        /* pointer to found extent record */
3645xfs_iext_bno_to_irec(
3646        xfs_ifork_t     *ifp,           /* inode fork pointer */
3647        xfs_fileoff_t   bno,            /* block number to search for */
3648        int             *erp_idxp)      /* irec index of target ext list */
3649{
3650        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3651        xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
3652        int             erp_idx;        /* indirection array index */
3653        int             nlists;         /* number of extent irec's (lists) */
3654        int             high;           /* binary search upper limit */
3655        int             low;            /* binary search lower limit */
3656
3657        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3658        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3659        erp_idx = 0;
3660        low = 0;
3661        high = nlists - 1;
3662        while (low <= high) {
3663                erp_idx = (low + high) >> 1;
3664                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3665                erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
3666                if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
3667                        high = erp_idx - 1;
3668                } else if (erp_next && bno >=
3669                           xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
3670                        low = erp_idx + 1;
3671                } else {
3672                        break;
3673                }
3674        }
3675        *erp_idxp = erp_idx;
3676        return erp;
3677}
3678
3679/*
3680 * Return a pointer to the indirection array entry containing the
3681 * extent record at file extent index *idxp. Store the index of the
3682 * target irec in *erp_idxp and store the page index of the target
3683 * extent record in *idxp.
3684 */
3685xfs_ext_irec_t *
3686xfs_iext_idx_to_irec(
3687        xfs_ifork_t     *ifp,           /* inode fork pointer */
3688        xfs_extnum_t    *idxp,          /* extent index (file -> page) */
3689        int             *erp_idxp,      /* pointer to target irec */
3690        int             realloc)        /* new bytes were just added */
3691{
3692        xfs_ext_irec_t  *prev;          /* pointer to previous irec */
3693        xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
3694        int             erp_idx;        /* indirection array index */
3695        int             nlists;         /* number of irec's (ex lists) */
3696        int             high;           /* binary search upper limit */
3697        int             low;            /* binary search lower limit */
3698        xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
3699
3700        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3701        ASSERT(page_idx >= 0);
3702        ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3703        ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3704
3705        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3706        erp_idx = 0;
3707        low = 0;
3708        high = nlists - 1;
3709
3710        /* Binary search extent irec's */
3711        while (low <= high) {
3712                erp_idx = (low + high) >> 1;
3713                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3714                prev = erp_idx > 0 ? erp - 1 : NULL;
3715                if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
3716                     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
3717                        high = erp_idx - 1;
3718                } else if (page_idx > erp->er_extoff + erp->er_extcount ||
3719                           (page_idx == erp->er_extoff + erp->er_extcount &&
3720                            !realloc)) {
3721                        low = erp_idx + 1;
3722                } else if (page_idx == erp->er_extoff + erp->er_extcount &&
3723                           erp->er_extcount == XFS_LINEAR_EXTS) {
3724                        ASSERT(realloc);
3725                        page_idx = 0;
3726                        erp_idx++;
3727                        erp = erp_idx < nlists ? erp + 1 : NULL;
3728                        break;
3729                } else {
3730                        page_idx -= erp->er_extoff;
3731                        break;
3732                }
3733        }
3734        *idxp = page_idx;
3735        *erp_idxp = erp_idx;
3736        return erp;
3737}
3738
3739/*
3740 * Allocate and initialize an indirection array once the space needed
3741 * for incore extents increases above XFS_IEXT_BUFSZ.
3742 */
3743void
3744xfs_iext_irec_init(
3745        xfs_ifork_t     *ifp)           /* inode fork pointer */
3746{
3747        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3748        xfs_extnum_t    nextents;       /* number of extents in file */
3749
3750        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3751        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3752        ASSERT(nextents <= XFS_LINEAR_EXTS);
3753
3754        erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
3755
3756        if (nextents == 0) {
3757                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3758        } else if (!ifp->if_real_bytes) {
3759                xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
3760        } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
3761                xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
3762        }
3763        erp->er_extbuf = ifp->if_u1.if_extents;
3764        erp->er_extcount = nextents;
3765        erp->er_extoff = 0;
3766
3767        ifp->if_flags |= XFS_IFEXTIREC;
3768        ifp->if_real_bytes = XFS_IEXT_BUFSZ;
3769        ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
3770        ifp->if_u1.if_ext_irec = erp;
3771
3772        return;
3773}
3774
3775/*
3776 * Allocate and initialize a new entry in the indirection array.
3777 */
3778xfs_ext_irec_t *
3779xfs_iext_irec_new(
3780        xfs_ifork_t     *ifp,           /* inode fork pointer */
3781        int             erp_idx)        /* index for new irec */
3782{
3783        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3784        int             i;              /* loop counter */
3785        int             nlists;         /* number of irec's (ex lists) */
3786
3787        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3788        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3789
3790        /* Resize indirection array */
3791        xfs_iext_realloc_indirect(ifp, ++nlists *
3792                                  sizeof(xfs_ext_irec_t));
3793        /*
3794         * Move records down in the array so the
3795         * new page can use erp_idx.
3796         */
3797        erp = ifp->if_u1.if_ext_irec;
3798        for (i = nlists - 1; i > erp_idx; i--) {
3799                memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
3800        }
3801        ASSERT(i == erp_idx);
3802
3803        /* Initialize new extent record */
3804        erp = ifp->if_u1.if_ext_irec;
3805        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3806        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3807        memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
3808        erp[erp_idx].er_extcount = 0;
3809        erp[erp_idx].er_extoff = erp_idx > 0 ?
3810                erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
3811        return &erp[erp_idx];
3812}
3813
3814/*
3815 * Remove a record from the indirection array.
3816 */
3817void
3818xfs_iext_irec_remove(
3819        xfs_ifork_t     *ifp,           /* inode fork pointer */
3820        int             erp_idx)        /* irec index to remove */
3821{
3822        xfs_ext_irec_t  *erp;           /* indirection array pointer */
3823        int             i;              /* loop counter */
3824        int             nlists;         /* number of irec's (ex lists) */
3825
3826        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3827        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3828        erp = &ifp->if_u1.if_ext_irec[erp_idx];
3829        if (erp->er_extbuf) {
3830                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
3831                        -erp->er_extcount);
3832                kmem_free(erp->er_extbuf);
3833        }
3834        /* Compact extent records */
3835        erp = ifp->if_u1.if_ext_irec;
3836        for (i = erp_idx; i < nlists - 1; i++) {
3837                memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
3838        }
3839        /*
3840         * Manually free the last extent record from the indirection
3841         * array.  A call to xfs_iext_realloc_indirect() with a size
3842         * of zero would result in a call to xfs_iext_destroy() which
3843         * would in turn call this function again, creating a nasty
3844         * infinite loop.
3845         */
3846        if (--nlists) {
3847                xfs_iext_realloc_indirect(ifp,
3848                        nlists * sizeof(xfs_ext_irec_t));
3849        } else {
3850                kmem_free(ifp->if_u1.if_ext_irec);
3851        }
3852        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3853}
3854
3855/*
3856 * This is called to clean up large amounts of unused memory allocated
3857 * by the indirection array.  Before compacting anything though, verify
3858 * that the indirection array is still needed and switch back to the
3859 * linear extent list (or even the inline buffer) if possible.  The
3860 * compaction policy is as follows:
3861 *
3862 *    Full Compaction: Extents fit into a single page (or inline buffer)
3863 * Partial Compaction: Extents occupy less than 50% of allocated space
3864 *      No Compaction: Extents occupy at least 50% of allocated space
3865 */
3866void
3867xfs_iext_irec_compact(
3868        xfs_ifork_t     *ifp)           /* inode fork pointer */
3869{
3870        xfs_extnum_t    nextents;       /* number of extents in file */
3871        int             nlists;         /* number of irec's (ex lists) */
3872
3873        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3874        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3875        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3876
3877        if (nextents == 0) {
3878                xfs_iext_destroy(ifp);
3879        } else if (nextents <= XFS_INLINE_EXTS) {
3880                xfs_iext_indirect_to_direct(ifp);
3881                xfs_iext_direct_to_inline(ifp, nextents);
3882        } else if (nextents <= XFS_LINEAR_EXTS) {
3883                xfs_iext_indirect_to_direct(ifp);
3884        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3885                xfs_iext_irec_compact_pages(ifp);
3886        }
3887}
3888
3889/*
3890 * Combine extents from neighboring extent pages.
3891 */
3892void
3893xfs_iext_irec_compact_pages(
3894        xfs_ifork_t     *ifp)           /* inode fork pointer */
3895{
3896        xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
3897        int             erp_idx = 0;    /* indirection array index */
3898        int             nlists;         /* number of irec's (ex lists) */
3899
3900        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3901        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3902        while (erp_idx < nlists - 1) {
3903                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3904                erp_next = erp + 1;
3905                if (erp_next->er_extcount <=
3906                    (XFS_LINEAR_EXTS - erp->er_extcount)) {
3907                        memcpy(&erp->er_extbuf[erp->er_extcount],
3908                                erp_next->er_extbuf, erp_next->er_extcount *
3909                                sizeof(xfs_bmbt_rec_t));
3910                        erp->er_extcount += erp_next->er_extcount;
3911                        /*
3912                         * Free page before removing extent record
3913                         * so er_extoffs don't get modified in
3914                         * xfs_iext_irec_remove.
3915                         */
3916                        kmem_free(erp_next->er_extbuf);
3917                        erp_next->er_extbuf = NULL;
3918                        xfs_iext_irec_remove(ifp, erp_idx + 1);
3919                        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3920                } else {
3921                        erp_idx++;
3922                }
3923        }
3924}
3925
3926/*
3927 * This is called to update the er_extoff field in the indirection
3928 * array when extents have been added or removed from one of the
3929 * extent lists. erp_idx contains the irec index to begin updating
3930 * at and ext_diff contains the number of extents that were added
3931 * or removed.
3932 */
3933void
3934xfs_iext_irec_update_extoffs(
3935        xfs_ifork_t     *ifp,           /* inode fork pointer */
3936        int             erp_idx,        /* irec index to update */
3937        int             ext_diff)       /* number of new extents */
3938{
3939        int             i;              /* loop counter */
3940        int             nlists;         /* number of irec's (ex lists) */
3941
3942        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3943        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3944        for (i = erp_idx; i < nlists; i++) {
3945                ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3946        }
3947}
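/*
 * Editorial sketch (not part of the original source): the invariant the
 * er_extoff updates above maintain -- each irec's er_extoff equals the
 * sum of the extent counts of all earlier irecs.  The function name is
 * hypothetical.
 */
STATIC void
xfs_iext_irec_verify_example(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        xfs_extnum_t    off = 0;        /* running extent offset */
        int             nlists;         /* number of irecs (ext lists) */
        int             i;

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        for (i = 0; i < nlists; i++) {
                ASSERT(ifp->if_u1.if_ext_irec[i].er_extoff == off);
                off += ifp->if_u1.if_ext_irec[i].er_extcount;
        }
}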
3948
3949/*
3950 * Test whether it is appropriate to check an inode for and free post EOF
3951 * blocks. The 'force' parameter determines whether we should also consider
3952 * regular files that are marked preallocated or append-only.
3953 */
3954bool
3955xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3956{
3957        /* prealloc/delalloc exists only on regular files */
3958        if (!S_ISREG(ip->i_d.di_mode))
3959                return false;
3960
3961        /*
3962         * Zero sized files with no cached pages and delalloc blocks will not
3963         * have speculative prealloc/delalloc blocks to remove.
3964         */
3965        if (VFS_I(ip)->i_size == 0 &&
3966            VN_CACHED(VFS_I(ip)) == 0 &&
3967            ip->i_delayed_blks == 0)
3968                return false;
3969
3970        /* If we haven't read in the extent list, then don't do it now. */
3971        if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3972                return false;
3973
3974        /*
3975         * Do not free real preallocated or append-only files unless the file
3976         * has delalloc blocks and we are forced to remove them.
3977         */
3978        if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3979                if (!force || ip->i_delayed_blks == 0)
3980                        return false;
3981
3982        return true;
3983}
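/*
 * Editorial sketch (not part of the original source): callers gate EOF
 * block trimming on the predicate above, roughly the way xfs_release()
 * and xfs_inactive() do.  xfs_free_eofblocks() lives elsewhere in this
 * tree; its exact signature here is an assumption, as is the function
 * name below.
 */
STATIC int
xfs_trim_eofblocks_example(
        struct xfs_inode        *ip)
{
        int     error = 0;

        if (xfs_can_free_eofblocks(ip, false))
                error = xfs_free_eofblocks(ip->i_mount, ip, true);
        return error;
}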
3984
3985