linux/fs/xfs/xfs_log_recover.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_types.h"
  21#include "xfs_bit.h"
  22#include "xfs_log.h"
  23#include "xfs_inum.h"
  24#include "xfs_trans.h"
  25#include "xfs_sb.h"
  26#include "xfs_ag.h"
  27#include "xfs_mount.h"
  28#include "xfs_error.h"
  29#include "xfs_bmap_btree.h"
  30#include "xfs_alloc_btree.h"
  31#include "xfs_ialloc_btree.h"
  32#include "xfs_dinode.h"
  33#include "xfs_inode.h"
  34#include "xfs_inode_item.h"
  35#include "xfs_alloc.h"
  36#include "xfs_ialloc.h"
  37#include "xfs_log_priv.h"
  38#include "xfs_buf_item.h"
  39#include "xfs_log_recover.h"
  40#include "xfs_extfree_item.h"
  41#include "xfs_trans_priv.h"
  42#include "xfs_quota.h"
  43#include "xfs_utils.h"
  44#include "xfs_trace.h"
  45
  46STATIC int
  47xlog_find_zeroed(
  48        struct xlog     *,
  49        xfs_daddr_t     *);
  50STATIC int
  51xlog_clear_stale_blocks(
  52        struct xlog     *,
  53        xfs_lsn_t);
  54#if defined(DEBUG)
  55STATIC void
  56xlog_recover_check_summary(
  57        struct xlog *);
  58#else
  59#define xlog_recover_check_summary(log)
  60#endif
  61
  62/*
  63 * This structure is used during recovery to record the buf log items which
  64 * have been canceled and should not be replayed.
  65 */
  66struct xfs_buf_cancel {
  67        xfs_daddr_t             bc_blkno;
  68        uint                    bc_len;
  69        int                     bc_refcount;
  70        struct list_head        bc_list;
  71};
  72
  73/*
  74 * Sector aligned buffer routines for buffer create/read/write/access
  75 */
  76
  77/*
  78 * Verify the given count of basic blocks is valid number of blocks
  79 * to specify for an operation involving the given XFS log buffer.
  80 * Returns nonzero if the count is valid, 0 otherwise.
  81 */
  82
  83static inline int
  84xlog_buf_bbcount_valid(
  85        struct xlog     *log,
  86        int             bbcount)
  87{
  88        return bbcount > 0 && bbcount <= log->l_logBBsize;
  89}
  90
  91/*
  92 * Allocate a buffer to hold log data.  The buffer needs to be able
  93 * to map to a range of nbblks basic blocks at any valid (basic
  94 * block) offset within the log.
  95 */
  96STATIC xfs_buf_t *
  97xlog_get_bp(
  98        struct xlog     *log,
  99        int             nbblks)
 100{
 101        struct xfs_buf  *bp;
 102
 103        if (!xlog_buf_bbcount_valid(log, nbblks)) {
 104                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 105                        nbblks);
 106                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 107                return NULL;
 108        }
 109
 110        /*
 111         * We do log I/O in units of log sectors (a power-of-2
 112         * multiple of the basic block size), so we round up the
 113         * requested size to accommodate the basic blocks required
 114         * for complete log sectors.
 115         *
 116         * In addition, the buffer may be used for a non-sector-
 117         * aligned block offset, in which case an I/O of the
 118         * requested size could extend beyond the end of the
 119         * buffer.  If the requested size is only 1 basic block it
 120         * will never straddle a sector boundary, so this won't be
 121         * an issue.  Nor will this be a problem if the log I/O is
 122         * done in basic blocks (sector size 1).  But otherwise we
 123         * extend the buffer by one extra log sector to ensure
 124         * there's space to accommodate this possibility.
 125         */
 126        if (nbblks > 1 && log->l_sectBBsize > 1)
 127                nbblks += log->l_sectBBsize;
 128        nbblks = round_up(nbblks, log->l_sectBBsize);
 129
 130        bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
 131        if (bp)
 132                xfs_buf_unlock(bp);
 133        return bp;
 134}
 135
 136STATIC void
 137xlog_put_bp(
 138        xfs_buf_t       *bp)
 139{
 140        xfs_buf_free(bp);
 141}
 142
 143/*
 144 * Return the address of the start of the given block number's data
 145 * in a log buffer.  The buffer covers a log sector-aligned region.
 146 */
 147STATIC xfs_caddr_t
 148xlog_align(
 149        struct xlog     *log,
 150        xfs_daddr_t     blk_no,
 151        int             nbblks,
 152        struct xfs_buf  *bp)
 153{
 154        xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 155
 156        ASSERT(offset + nbblks <= bp->b_length);
 157        return bp->b_addr + BBTOB(offset);
 158}
 159
 160
 161/*
 162 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 163 */
 164STATIC int
 165xlog_bread_noalign(
 166        struct xlog     *log,
 167        xfs_daddr_t     blk_no,
 168        int             nbblks,
 169        struct xfs_buf  *bp)
 170{
 171        int             error;
 172
 173        if (!xlog_buf_bbcount_valid(log, nbblks)) {
 174                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 175                        nbblks);
 176                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 177                return EFSCORRUPTED;
 178        }
 179
 180        blk_no = round_down(blk_no, log->l_sectBBsize);
 181        nbblks = round_up(nbblks, log->l_sectBBsize);
 182
 183        ASSERT(nbblks > 0);
 184        ASSERT(nbblks <= bp->b_length);
 185
 186        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 187        XFS_BUF_READ(bp);
 188        bp->b_io_length = nbblks;
 189        bp->b_error = 0;
 190
 191        xfsbdstrat(log->l_mp, bp);
 192        error = xfs_buf_iowait(bp);
 193        if (error)
 194                xfs_buf_ioerror_alert(bp, __func__);
 195        return error;
 196}
 197
 198STATIC int
 199xlog_bread(
 200        struct xlog     *log,
 201        xfs_daddr_t     blk_no,
 202        int             nbblks,
 203        struct xfs_buf  *bp,
 204        xfs_caddr_t     *offset)
 205{
 206        int             error;
 207
 208        error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 209        if (error)
 210                return error;
 211
 212        *offset = xlog_align(log, blk_no, nbblks, bp);
 213        return 0;
 214}
 215
 216/*
 217 * Read at an offset into the buffer. Returns with the buffer in it's original
 218 * state regardless of the result of the read.
 219 */
 220STATIC int
 221xlog_bread_offset(
 222        struct xlog     *log,
 223        xfs_daddr_t     blk_no,         /* block to read from */
 224        int             nbblks,         /* blocks to read */
 225        struct xfs_buf  *bp,
 226        xfs_caddr_t     offset)
 227{
 228        xfs_caddr_t     orig_offset = bp->b_addr;
 229        int             orig_len = BBTOB(bp->b_length);
 230        int             error, error2;
 231
 232        error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 233        if (error)
 234                return error;
 235
 236        error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 237
 238        /* must reset buffer pointer even on error */
 239        error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 240        if (error)
 241                return error;
 242        return error2;
 243}
 244
 245/*
 246 * Write out the buffer at the given block for the given number of blocks.
 247 * The buffer is kept locked across the write and is returned locked.
 248 * This can only be used for synchronous log writes.
 249 */
 250STATIC int
 251xlog_bwrite(
 252        struct xlog     *log,
 253        xfs_daddr_t     blk_no,
 254        int             nbblks,
 255        struct xfs_buf  *bp)
 256{
 257        int             error;
 258
 259        if (!xlog_buf_bbcount_valid(log, nbblks)) {
 260                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 261                        nbblks);
 262                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 263                return EFSCORRUPTED;
 264        }
 265
 266        blk_no = round_down(blk_no, log->l_sectBBsize);
 267        nbblks = round_up(nbblks, log->l_sectBBsize);
 268
 269        ASSERT(nbblks > 0);
 270        ASSERT(nbblks <= bp->b_length);
 271
 272        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 273        XFS_BUF_ZEROFLAGS(bp);
 274        xfs_buf_hold(bp);
 275        xfs_buf_lock(bp);
 276        bp->b_io_length = nbblks;
 277        bp->b_error = 0;
 278
 279        error = xfs_bwrite(bp);
 280        if (error)
 281                xfs_buf_ioerror_alert(bp, __func__);
 282        xfs_buf_relse(bp);
 283        return error;
 284}
 285
 286#ifdef DEBUG
 287/*
 288 * dump debug superblock and log record information
 289 */
 290STATIC void
 291xlog_header_check_dump(
 292        xfs_mount_t             *mp,
 293        xlog_rec_header_t       *head)
 294{
 295        xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d\n",
 296                __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
 297        xfs_debug(mp, "    log : uuid = %pU, fmt = %d\n",
 298                &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 299}
 300#else
 301#define xlog_header_check_dump(mp, head)
 302#endif
 303
 304/*
 305 * check log record header for recovery
 306 */
 307STATIC int
 308xlog_header_check_recover(
 309        xfs_mount_t             *mp,
 310        xlog_rec_header_t       *head)
 311{
 312        ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 313
 314        /*
 315         * IRIX doesn't write the h_fmt field and leaves it zeroed
 316         * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 317         * a dirty log created in IRIX.
 318         */
 319        if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 320                xfs_warn(mp,
 321        "dirty log written in incompatible format - can't recover");
 322                xlog_header_check_dump(mp, head);
 323                XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 324                                 XFS_ERRLEVEL_HIGH, mp);
 325                return XFS_ERROR(EFSCORRUPTED);
 326        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 327                xfs_warn(mp,
 328        "dirty log entry has mismatched uuid - can't recover");
 329                xlog_header_check_dump(mp, head);
 330                XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 331                                 XFS_ERRLEVEL_HIGH, mp);
 332                return XFS_ERROR(EFSCORRUPTED);
 333        }
 334        return 0;
 335}
 336
 337/*
 338 * read the head block of the log and check the header
 339 */
 340STATIC int
 341xlog_header_check_mount(
 342        xfs_mount_t             *mp,
 343        xlog_rec_header_t       *head)
 344{
 345        ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 346
 347        if (uuid_is_nil(&head->h_fs_uuid)) {
 348                /*
 349                 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 350                 * h_fs_uuid is nil, we assume this log was last mounted
 351                 * by IRIX and continue.
 352                 */
 353                xfs_warn(mp, "nil uuid in log - IRIX style log");
 354        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 355                xfs_warn(mp, "log has mismatched uuid - can't recover");
 356                xlog_header_check_dump(mp, head);
 357                XFS_ERROR_REPORT("xlog_header_check_mount",
 358                                 XFS_ERRLEVEL_HIGH, mp);
 359                return XFS_ERROR(EFSCORRUPTED);
 360        }
 361        return 0;
 362}
 363
 364STATIC void
 365xlog_recover_iodone(
 366        struct xfs_buf  *bp)
 367{
 368        if (bp->b_error) {
 369                /*
 370                 * We're not going to bother about retrying
 371                 * this during recovery. One strike!
 372                 */
 373                xfs_buf_ioerror_alert(bp, __func__);
 374                xfs_force_shutdown(bp->b_target->bt_mount,
 375                                        SHUTDOWN_META_IO_ERROR);
 376        }
 377        bp->b_iodone = NULL;
 378        xfs_buf_ioend(bp, 0);
 379}
 380
 381/*
 382 * This routine finds (to an approximation) the first block in the physical
 383 * log which contains the given cycle.  It uses a binary search algorithm.
 384 * Note that the algorithm can not be perfect because the disk will not
 385 * necessarily be perfect.
 386 */
 387STATIC int
 388xlog_find_cycle_start(
 389        struct xlog     *log,
 390        struct xfs_buf  *bp,
 391        xfs_daddr_t     first_blk,
 392        xfs_daddr_t     *last_blk,
 393        uint            cycle)
 394{
 395        xfs_caddr_t     offset;
 396        xfs_daddr_t     mid_blk;
 397        xfs_daddr_t     end_blk;
 398        uint            mid_cycle;
 399        int             error;
 400
 401        end_blk = *last_blk;
 402        mid_blk = BLK_AVG(first_blk, end_blk);
 403        while (mid_blk != first_blk && mid_blk != end_blk) {
 404                error = xlog_bread(log, mid_blk, 1, bp, &offset);
 405                if (error)
 406                        return error;
 407                mid_cycle = xlog_get_cycle(offset);
 408                if (mid_cycle == cycle)
 409                        end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 410                else
 411                        first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 412                mid_blk = BLK_AVG(first_blk, end_blk);
 413        }
 414        ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 415               (mid_blk == end_blk && mid_blk-1 == first_blk));
 416
 417        *last_blk = end_blk;
 418
 419        return 0;
 420}
 421
 422/*
 423 * Check that a range of blocks does not contain stop_on_cycle_no.
 424 * Fill in *new_blk with the block offset where such a block is
 425 * found, or with -1 (an invalid block number) if there is no such
 426 * block in the range.  The scan needs to occur from front to back
 427 * and the pointer into the region must be updated since a later
 428 * routine will need to perform another test.
 429 */
 430STATIC int
 431xlog_find_verify_cycle(
 432        struct xlog     *log,
 433        xfs_daddr_t     start_blk,
 434        int             nbblks,
 435        uint            stop_on_cycle_no,
 436        xfs_daddr_t     *new_blk)
 437{
 438        xfs_daddr_t     i, j;
 439        uint            cycle;
 440        xfs_buf_t       *bp;
 441        xfs_daddr_t     bufblks;
 442        xfs_caddr_t     buf = NULL;
 443        int             error = 0;
 444
 445        /*
 446         * Greedily allocate a buffer big enough to handle the full
 447         * range of basic blocks we'll be examining.  If that fails,
 448         * try a smaller size.  We need to be able to read at least
 449         * a log sector, or we're out of luck.
 450         */
 451        bufblks = 1 << ffs(nbblks);
 452        while (bufblks > log->l_logBBsize)
 453                bufblks >>= 1;
 454        while (!(bp = xlog_get_bp(log, bufblks))) {
 455                bufblks >>= 1;
 456                if (bufblks < log->l_sectBBsize)
 457                        return ENOMEM;
 458        }
 459
 460        for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 461                int     bcount;
 462
 463                bcount = min(bufblks, (start_blk + nbblks - i));
 464
 465                error = xlog_bread(log, i, bcount, bp, &buf);
 466                if (error)
 467                        goto out;
 468
 469                for (j = 0; j < bcount; j++) {
 470                        cycle = xlog_get_cycle(buf);
 471                        if (cycle == stop_on_cycle_no) {
 472                                *new_blk = i+j;
 473                                goto out;
 474                        }
 475
 476                        buf += BBSIZE;
 477                }
 478        }
 479
 480        *new_blk = -1;
 481
 482out:
 483        xlog_put_bp(bp);
 484        return error;
 485}
 486
 487/*
 488 * Potentially backup over partial log record write.
 489 *
 490 * In the typical case, last_blk is the number of the block directly after
 491 * a good log record.  Therefore, we subtract one to get the block number
 492 * of the last block in the given buffer.  extra_bblks contains the number
 493 * of blocks we would have read on a previous read.  This happens when the
 494 * last log record is split over the end of the physical log.
 495 *
 496 * extra_bblks is the number of blocks potentially verified on a previous
 497 * call to this routine.
 498 */
 499STATIC int
 500xlog_find_verify_log_record(
 501        struct xlog             *log,
 502        xfs_daddr_t             start_blk,
 503        xfs_daddr_t             *last_blk,
 504        int                     extra_bblks)
 505{
 506        xfs_daddr_t             i;
 507        xfs_buf_t               *bp;
 508        xfs_caddr_t             offset = NULL;
 509        xlog_rec_header_t       *head = NULL;
 510        int                     error = 0;
 511        int                     smallmem = 0;
 512        int                     num_blks = *last_blk - start_blk;
 513        int                     xhdrs;
 514
 515        ASSERT(start_blk != 0 || *last_blk != start_blk);
 516
 517        if (!(bp = xlog_get_bp(log, num_blks))) {
 518                if (!(bp = xlog_get_bp(log, 1)))
 519                        return ENOMEM;
 520                smallmem = 1;
 521        } else {
 522                error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 523                if (error)
 524                        goto out;
 525                offset += ((num_blks - 1) << BBSHIFT);
 526        }
 527
 528        for (i = (*last_blk) - 1; i >= 0; i--) {
 529                if (i < start_blk) {
 530                        /* valid log record not found */
 531                        xfs_warn(log->l_mp,
 532                "Log inconsistent (didn't find previous header)");
 533                        ASSERT(0);
 534                        error = XFS_ERROR(EIO);
 535                        goto out;
 536                }
 537
 538                if (smallmem) {
 539                        error = xlog_bread(log, i, 1, bp, &offset);
 540                        if (error)
 541                                goto out;
 542                }
 543
 544                head = (xlog_rec_header_t *)offset;
 545
 546                if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 547                        break;
 548
 549                if (!smallmem)
 550                        offset -= BBSIZE;
 551        }
 552
 553        /*
 554         * We hit the beginning of the physical log & still no header.  Return
 555         * to caller.  If caller can handle a return of -1, then this routine
 556         * will be called again for the end of the physical log.
 557         */
 558        if (i == -1) {
 559                error = -1;
 560                goto out;
 561        }
 562
 563        /*
 564         * We have the final block of the good log (the first block
 565         * of the log record _before_ the head. So we check the uuid.
 566         */
 567        if ((error = xlog_header_check_mount(log->l_mp, head)))
 568                goto out;
 569
 570        /*
 571         * We may have found a log record header before we expected one.
 572         * last_blk will be the 1st block # with a given cycle #.  We may end
 573         * up reading an entire log record.  In this case, we don't want to
 574         * reset last_blk.  Only when last_blk points in the middle of a log
 575         * record do we update last_blk.
 576         */
 577        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 578                uint    h_size = be32_to_cpu(head->h_size);
 579
 580                xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 581                if (h_size % XLOG_HEADER_CYCLE_SIZE)
 582                        xhdrs++;
 583        } else {
 584                xhdrs = 1;
 585        }
 586
 587        if (*last_blk - i + extra_bblks !=
 588            BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 589                *last_blk = i;
 590
 591out:
 592        xlog_put_bp(bp);
 593        return error;
 594}
 595
 596/*
 597 * Head is defined to be the point of the log where the next log write
  598 * could go.  This means that incomplete LR writes at the end are
 599 * eliminated when calculating the head.  We aren't guaranteed that previous
 600 * LR have complete transactions.  We only know that a cycle number of
 601 * current cycle number -1 won't be present in the log if we start writing
 602 * from our current block number.
 603 *
 604 * last_blk contains the block number of the first block with a given
 605 * cycle number.
 606 *
 607 * Return: zero if normal, non-zero if error.
 608 */
 609STATIC int
 610xlog_find_head(
 611        struct xlog     *log,
 612        xfs_daddr_t     *return_head_blk)
 613{
 614        xfs_buf_t       *bp;
 615        xfs_caddr_t     offset;
 616        xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 617        int             num_scan_bblks;
 618        uint            first_half_cycle, last_half_cycle;
 619        uint            stop_on_cycle;
 620        int             error, log_bbnum = log->l_logBBsize;
 621
 622        /* Is the end of the log device zeroed? */
 623        if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
 624                *return_head_blk = first_blk;
 625
 626                /* Is the whole lot zeroed? */
 627                if (!first_blk) {
 628                        /* Linux XFS shouldn't generate totally zeroed logs -
 629                         * mkfs etc write a dummy unmount record to a fresh
 630                         * log so we can store the uuid in there
 631                         */
 632                        xfs_warn(log->l_mp, "totally zeroed log");
 633                }
 634
 635                return 0;
 636        } else if (error) {
 637                xfs_warn(log->l_mp, "empty log check failed");
 638                return error;
 639        }
 640
 641        first_blk = 0;                  /* get cycle # of 1st block */
 642        bp = xlog_get_bp(log, 1);
 643        if (!bp)
 644                return ENOMEM;
 645
 646        error = xlog_bread(log, 0, 1, bp, &offset);
 647        if (error)
 648                goto bp_err;
 649
 650        first_half_cycle = xlog_get_cycle(offset);
 651
 652        last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 653        error = xlog_bread(log, last_blk, 1, bp, &offset);
 654        if (error)
 655                goto bp_err;
 656
 657        last_half_cycle = xlog_get_cycle(offset);
 658        ASSERT(last_half_cycle != 0);
 659
 660        /*
 661         * If the 1st half cycle number is equal to the last half cycle number,
 662         * then the entire log is stamped with the same cycle number.  In this
 663         * case, head_blk can't be set to zero (which makes sense).  The below
 664         * math doesn't work out properly with head_blk equal to zero.  Instead,
 665         * we set it to log_bbnum which is an invalid block number, but this
  666         * value makes the math correct.  If head_blk doesn't change through
 667         * all the tests below, *head_blk is set to zero at the very end rather
 668         * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 669         * in a circular file.
 670         */
 671        if (first_half_cycle == last_half_cycle) {
 672                /*
 673                 * In this case we believe that the entire log should have
 674                 * cycle number last_half_cycle.  We need to scan backwards
 675                 * from the end verifying that there are no holes still
 676                 * containing last_half_cycle - 1.  If we find such a hole,
 677                 * then the start of that hole will be the new head.  The
 678                 * simple case looks like
 679                 *        x | x ... | x - 1 | x
 680                 * Another case that fits this picture would be
 681                 *        x | x + 1 | x ... | x
 682                 * In this case the head really is somewhere at the end of the
 683                 * log, as one of the latest writes at the beginning was
 684                 * incomplete.
 685                 * One more case is
 686                 *        x | x + 1 | x ... | x - 1 | x
 687                 * This is really the combination of the above two cases, and
 688                 * the head has to end up at the start of the x-1 hole at the
 689                 * end of the log.
 690                 *
 691                 * In the 256k log case, we will read from the beginning to the
 692                 * end of the log and search for cycle numbers equal to x-1.
 693                 * We don't worry about the x+1 blocks that we encounter,
 694                 * because we know that they cannot be the head since the log
 695                 * started with x.
 696                 */
 697                head_blk = log_bbnum;
 698                stop_on_cycle = last_half_cycle - 1;
 699        } else {
 700                /*
 701                 * In this case we want to find the first block with cycle
 702                 * number matching last_half_cycle.  We expect the log to be
 703                 * some variation on
 704                 *        x + 1 ... | x ... | x
 705                 * The first block with cycle number x (last_half_cycle) will
 706                 * be where the new head belongs.  First we do a binary search
 707                 * for the first occurrence of last_half_cycle.  The binary
 708                 * search may not be totally accurate, so then we scan back
 709                 * from there looking for occurrences of last_half_cycle before
 710                 * us.  If that backwards scan wraps around the beginning of
 711                 * the log, then we look for occurrences of last_half_cycle - 1
 712                 * at the end of the log.  The cases we're looking for look
 713                 * like
 714                 *                               v binary search stopped here
 715                 *        x + 1 ... | x | x + 1 | x ... | x
 716                 *                   ^ but we want to locate this spot
 717                 * or
 718                 *        <---------> less than scan distance
 719                 *        x + 1 ... | x ... | x - 1 | x
 720                 *                           ^ we want to locate this spot
 721                 */
 722                stop_on_cycle = last_half_cycle;
 723                if ((error = xlog_find_cycle_start(log, bp, first_blk,
 724                                                &head_blk, last_half_cycle)))
 725                        goto bp_err;
 726        }
 727
 728        /*
 729         * Now validate the answer.  Scan back some number of maximum possible
 730         * blocks and make sure each one has the expected cycle number.  The
 731         * maximum is determined by the total possible amount of buffering
 732         * in the in-core log.  The following number can be made tighter if
 733         * we actually look at the block size of the filesystem.
 734         */
 735        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 736        if (head_blk >= num_scan_bblks) {
 737                /*
 738                 * We are guaranteed that the entire check can be performed
 739                 * in one buffer.
 740                 */
 741                start_blk = head_blk - num_scan_bblks;
 742                if ((error = xlog_find_verify_cycle(log,
 743                                                start_blk, num_scan_bblks,
 744                                                stop_on_cycle, &new_blk)))
 745                        goto bp_err;
 746                if (new_blk != -1)
 747                        head_blk = new_blk;
 748        } else {                /* need to read 2 parts of log */
 749                /*
 750                 * We are going to scan backwards in the log in two parts.
 751                 * First we scan the physical end of the log.  In this part
 752                 * of the log, we are looking for blocks with cycle number
 753                 * last_half_cycle - 1.
 754                 * If we find one, then we know that the log starts there, as
 755                 * we've found a hole that didn't get written in going around
 756                 * the end of the physical log.  The simple case for this is
 757                 *        x + 1 ... | x ... | x - 1 | x
 758                 *        <---------> less than scan distance
 759                 * If all of the blocks at the end of the log have cycle number
 760                 * last_half_cycle, then we check the blocks at the start of
 761                 * the log looking for occurrences of last_half_cycle.  If we
 762                 * find one, then our current estimate for the location of the
 763                 * first occurrence of last_half_cycle is wrong and we move
 764                 * back to the hole we've found.  This case looks like
 765                 *        x + 1 ... | x | x + 1 | x ...
 766                 *                               ^ binary search stopped here
 767                 * Another case we need to handle that only occurs in 256k
 768                 * logs is
 769                 *        x + 1 ... | x ... | x+1 | x ...
 770                 *                   ^ binary search stops here
 771                 * In a 256k log, the scan at the end of the log will see the
 772                 * x + 1 blocks.  We need to skip past those since that is
 773                 * certainly not the head of the log.  By searching for
 774                 * last_half_cycle-1 we accomplish that.
 775                 */
 776                ASSERT(head_blk <= INT_MAX &&
 777                        (xfs_daddr_t) num_scan_bblks >= head_blk);
 778                start_blk = log_bbnum - (num_scan_bblks - head_blk);
 779                if ((error = xlog_find_verify_cycle(log, start_blk,
 780                                        num_scan_bblks - (int)head_blk,
 781                                        (stop_on_cycle - 1), &new_blk)))
 782                        goto bp_err;
 783                if (new_blk != -1) {
 784                        head_blk = new_blk;
 785                        goto validate_head;
 786                }
 787
 788                /*
 789                 * Scan beginning of log now.  The last part of the physical
 790                 * log is good.  This scan needs to verify that it doesn't find
 791                 * the last_half_cycle.
 792                 */
 793                start_blk = 0;
 794                ASSERT(head_blk <= INT_MAX);
 795                if ((error = xlog_find_verify_cycle(log,
 796                                        start_blk, (int)head_blk,
 797                                        stop_on_cycle, &new_blk)))
 798                        goto bp_err;
 799                if (new_blk != -1)
 800                        head_blk = new_blk;
 801        }
 802
 803validate_head:
 804        /*
 805         * Now we need to make sure head_blk is not pointing to a block in
 806         * the middle of a log record.
 807         */
 808        num_scan_bblks = XLOG_REC_SHIFT(log);
 809        if (head_blk >= num_scan_bblks) {
 810                start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 811
 812                /* start ptr at last block ptr before head_blk */
 813                if ((error = xlog_find_verify_log_record(log, start_blk,
 814                                                        &head_blk, 0)) == -1) {
 815                        error = XFS_ERROR(EIO);
 816                        goto bp_err;
 817                } else if (error)
 818                        goto bp_err;
 819        } else {
 820                start_blk = 0;
 821                ASSERT(head_blk <= INT_MAX);
 822                if ((error = xlog_find_verify_log_record(log, start_blk,
 823                                                        &head_blk, 0)) == -1) {
 824                        /* We hit the beginning of the log during our search */
 825                        start_blk = log_bbnum - (num_scan_bblks - head_blk);
 826                        new_blk = log_bbnum;
 827                        ASSERT(start_blk <= INT_MAX &&
 828                                (xfs_daddr_t) log_bbnum-start_blk >= 0);
 829                        ASSERT(head_blk <= INT_MAX);
 830                        if ((error = xlog_find_verify_log_record(log,
 831                                                        start_blk, &new_blk,
 832                                                        (int)head_blk)) == -1) {
 833                                error = XFS_ERROR(EIO);
 834                                goto bp_err;
 835                        } else if (error)
 836                                goto bp_err;
 837                        if (new_blk != log_bbnum)
 838                                head_blk = new_blk;
 839                } else if (error)
 840                        goto bp_err;
 841        }
 842
 843        xlog_put_bp(bp);
 844        if (head_blk == log_bbnum)
 845                *return_head_blk = 0;
 846        else
 847                *return_head_blk = head_blk;
 848        /*
 849         * When returning here, we have a good block number.  Bad block
 850         * means that during a previous crash, we didn't have a clean break
 851         * from cycle number N to cycle number N-1.  In this case, we need
 852         * to find the first block with cycle number N-1.
 853         */
 854        return 0;
 855
 856 bp_err:
 857        xlog_put_bp(bp);
 858
 859        if (error)
 860                xfs_warn(log->l_mp, "failed to find log head");
 861        return error;
 862}
 863
 864/*
 865 * Find the sync block number or the tail of the log.
 866 *
 867 * This will be the block number of the last record to have its
 868 * associated buffers synced to disk.  Every log record header has
 869 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 870 * to get a sync block number.  The only concern is to figure out which
 871 * log record header to believe.
 872 *
 873 * The following algorithm uses the log record header with the largest
 874 * lsn.  The entire log record does not need to be valid.  We only care
 875 * that the header is valid.
 876 *
 877 * We could speed up search by using current head_blk buffer, but it is not
 878 * available.
 879 */
 880STATIC int
 881xlog_find_tail(
 882        struct xlog             *log,
 883        xfs_daddr_t             *head_blk,
 884        xfs_daddr_t             *tail_blk)
 885{
 886        xlog_rec_header_t       *rhead;
 887        xlog_op_header_t        *op_head;
 888        xfs_caddr_t             offset = NULL;
 889        xfs_buf_t               *bp;
 890        int                     error, i, found;
 891        xfs_daddr_t             umount_data_blk;
 892        xfs_daddr_t             after_umount_blk;
 893        xfs_lsn_t               tail_lsn;
 894        int                     hblks;
 895
 896        found = 0;
 897
 898        /*
 899         * Find previous log record
 900         */
 901        if ((error = xlog_find_head(log, head_blk)))
 902                return error;
 903
 904        bp = xlog_get_bp(log, 1);
 905        if (!bp)
 906                return ENOMEM;
 907        if (*head_blk == 0) {                           /* special case */
 908                error = xlog_bread(log, 0, 1, bp, &offset);
 909                if (error)
 910                        goto done;
 911
 912                if (xlog_get_cycle(offset) == 0) {
 913                        *tail_blk = 0;
 914                        /* leave all other log inited values alone */
 915                        goto done;
 916                }
 917        }
 918
 919        /*
 920         * Search backwards looking for log record header block
 921         */
 922        ASSERT(*head_blk < INT_MAX);
 923        for (i = (int)(*head_blk) - 1; i >= 0; i--) {
 924                error = xlog_bread(log, i, 1, bp, &offset);
 925                if (error)
 926                        goto done;
 927
 928                if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 929                        found = 1;
 930                        break;
 931                }
 932        }
 933        /*
 934         * If we haven't found the log record header block, start looking
 935         * again from the end of the physical log.  XXXmiken: There should be
 936         * a check here to make sure we didn't search more than N blocks in
 937         * the previous code.
 938         */
 939        if (!found) {
 940                for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
 941                        error = xlog_bread(log, i, 1, bp, &offset);
 942                        if (error)
 943                                goto done;
 944
 945                        if (*(__be32 *)offset ==
 946                            cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 947                                found = 2;
 948                                break;
 949                        }
 950                }
 951        }
 952        if (!found) {
 953                xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
 954                ASSERT(0);
 955                return XFS_ERROR(EIO);
 956        }
 957
 958        /* find blk_no of tail of log */
 959        rhead = (xlog_rec_header_t *)offset;
 960        *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 961
 962        /*
 963         * Reset log values according to the state of the log when we
 964         * crashed.  In the case where head_blk == 0, we bump curr_cycle
 965         * one because the next write starts a new cycle rather than
 966         * continuing the cycle of the last good log record.  At this
 967         * point we have guaranteed that all partial log records have been
 968         * accounted for.  Therefore, we know that the last good log record
 969         * written was complete and ended exactly on the end boundary
 970         * of the physical log.
 971         */
 972        log->l_prev_block = i;
 973        log->l_curr_block = (int)*head_blk;
 974        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 975        if (found == 2)
 976                log->l_curr_cycle++;
 977        atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
 978        atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
 979        xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
 980                                        BBTOB(log->l_curr_block));
 981        xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
 982                                        BBTOB(log->l_curr_block));
 983
 984        /*
 985         * Look for unmount record.  If we find it, then we know there
 986         * was a clean unmount.  Since 'i' could be the last block in
 987         * the physical log, we convert to a log block before comparing
 988         * to the head_blk.
 989         *
 990         * Save the current tail lsn to use to pass to
 991         * xlog_clear_stale_blocks() below.  We won't want to clear the
 992         * unmount record if there is one, so we pass the lsn of the
 993         * unmount record rather than the block after it.
 994         */
 995        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 996                int     h_size = be32_to_cpu(rhead->h_size);
 997                int     h_version = be32_to_cpu(rhead->h_version);
 998
 999                if ((h_version & XLOG_VERSION_2) &&
1000                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1001                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1002                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
1003                                hblks++;
1004                } else {
1005                        hblks = 1;
1006                }
1007        } else {
1008                hblks = 1;
1009        }
1010        after_umount_blk = (i + hblks + (int)
1011                BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
1012        tail_lsn = atomic64_read(&log->l_tail_lsn);
1013        if (*head_blk == after_umount_blk &&
1014            be32_to_cpu(rhead->h_num_logops) == 1) {
1015                umount_data_blk = (i + hblks) % log->l_logBBsize;
1016                error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1017                if (error)
1018                        goto done;
1019
1020                op_head = (xlog_op_header_t *)offset;
1021                if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1022                        /*
1023                         * Set tail and last sync so that newly written
1024                         * log records will point recovery to after the
1025                         * current unmount record.
1026                         */
1027                        xlog_assign_atomic_lsn(&log->l_tail_lsn,
1028                                        log->l_curr_cycle, after_umount_blk);
1029                        xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1030                                        log->l_curr_cycle, after_umount_blk);
1031                        *tail_blk = after_umount_blk;
1032
1033                        /*
1034                         * Note that the unmount was clean. If the unmount
1035                         * was not clean, we need to know this to rebuild the
1036                         * superblock counters from the perag headers if we
1037                         * have a filesystem using non-persistent counters.
1038                         */
1039                        log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1040                }
1041        }
1042
1043        /*
1044         * Make sure that there are no blocks in front of the head
1045         * with the same cycle number as the head.  This can happen
1046         * because we allow multiple outstanding log writes concurrently,
1047         * and the later writes might make it out before earlier ones.
1048         *
1049         * We use the lsn from before modifying it so that we'll never
1050         * overwrite the unmount record after a clean unmount.
1051         *
1052         * Do this only if we are going to recover the filesystem
1053         *
1054         * NOTE: This used to say "if (!readonly)"
1055         * However on Linux, we can & do recover a read-only filesystem.
1056         * We only skip recovery if NORECOVERY is specified on mount,
1057         * in which case we would not be here.
1058         *
1059         * But... if the -device- itself is readonly, just skip this.
1060         * We can't recover this device anyway, so it won't matter.
1061         */
1062        if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1063                error = xlog_clear_stale_blocks(log, tail_lsn);
1064
1065done:
1066        xlog_put_bp(bp);
1067
1068        if (error)
1069                xfs_warn(log->l_mp, "failed to locate log tail");
1070        return error;
1071}
1072
1073/*
1074 * Is the log zeroed at all?
1075 *
1076 * The last binary search should be changed to perform an X block read
1077 * once X becomes small enough.  You can then search linearly through
1078 * the X blocks.  This will cut down on the number of reads we need to do.
1079 *
1080 * If the log is partially zeroed, this routine will pass back the blkno
1081 * of the first block with cycle number 0.  It won't have a complete LR
1082 * preceding it.
1083 *
1084 * Return:
1085 *      0  => the log is completely written to
1086 *      -1 => use *blk_no as the first block of the log
1087 *      >0 => error has occurred
1088 */
1089STATIC int
1090xlog_find_zeroed(
1091        struct xlog     *log,
1092        xfs_daddr_t     *blk_no)
1093{
1094        xfs_buf_t       *bp;
1095        xfs_caddr_t     offset;
1096        uint            first_cycle, last_cycle;
1097        xfs_daddr_t     new_blk, last_blk, start_blk;
1098        xfs_daddr_t     num_scan_bblks;
1099        int             error, log_bbnum = log->l_logBBsize;
1100
1101        *blk_no = 0;
1102
1103        /* check totally zeroed log */
1104        bp = xlog_get_bp(log, 1);
1105        if (!bp)
1106                return ENOMEM;
1107        error = xlog_bread(log, 0, 1, bp, &offset);
1108        if (error)
1109                goto bp_err;
1110
1111        first_cycle = xlog_get_cycle(offset);
1112        if (first_cycle == 0) {         /* completely zeroed log */
1113                *blk_no = 0;
1114                xlog_put_bp(bp);
1115                return -1;
1116        }
1117
1118        /* check partially zeroed log */
1119        error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1120        if (error)
1121                goto bp_err;
1122
1123        last_cycle = xlog_get_cycle(offset);
1124        if (last_cycle != 0) {          /* log completely written to */
1125                xlog_put_bp(bp);
1126                return 0;
1127        } else if (first_cycle != 1) {
1128                /*
1129                 * If the cycle of the last block is zero, the cycle of
1130                 * the first block must be 1. If it's not, maybe we're
1131                 * not looking at a log... Bail out.
1132                 */
1133                xfs_warn(log->l_mp,
1134                        "Log inconsistent or not a log (last==0, first!=1)");
1135                return XFS_ERROR(EINVAL);
1136        }
1137
1138        /* we have a partially zeroed log */
1139        last_blk = log_bbnum-1;
1140        if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1141                goto bp_err;
1142
1143        /*
1144         * Validate the answer.  Because there is no way to guarantee that
1145         * the entire log is made up of log records which are the same size,
1146         * we scan over the defined maximum blocks.  At this point, the maximum
1147         * is not chosen to mean anything special.   XXXmiken
1148         */
1149        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1150        ASSERT(num_scan_bblks <= INT_MAX);
1151
1152        if (last_blk < num_scan_bblks)
1153                num_scan_bblks = last_blk;
1154        start_blk = last_blk - num_scan_bblks;
1155
1156        /*
1157         * We search for any instances of cycle number 0 that occur before
1158         * our current estimate of the head.  What we're trying to detect is
1159         *        1 ... | 0 | 1 | 0...
1160         *                       ^ binary search ends here
1161         */
1162        if ((error = xlog_find_verify_cycle(log, start_blk,
1163                                         (int)num_scan_bblks, 0, &new_blk)))
1164                goto bp_err;
1165        if (new_blk != -1)
1166                last_blk = new_blk;
1167
1168        /*
1169         * Potentially backup over partial log record write.  We don't need
1170         * to search the end of the log because we know it is zero.
1171         */
1172        if ((error = xlog_find_verify_log_record(log, start_blk,
1173                                &last_blk, 0)) == -1) {
1174            error = XFS_ERROR(EIO);
1175            goto bp_err;
1176        } else if (error)
1177            goto bp_err;
1178
1179        *blk_no = last_blk;
1180bp_err:
1181        xlog_put_bp(bp);
1182        if (error)
1183                return error;
1184        return -1;
1185}
1186
1187/*
1188 * These are simple subroutines used by xlog_clear_stale_blocks() below
1189 * to initialize a buffer full of empty log record headers and write
1190 * them into the log.
1191 */
1192STATIC void
1193xlog_add_record(
1194        struct xlog             *log,
1195        xfs_caddr_t             buf,
1196        int                     cycle,
1197        int                     block,
1198        int                     tail_cycle,
1199        int                     tail_block)
1200{
1201        xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1202
1203        memset(buf, 0, BBSIZE);
1204        recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1205        recp->h_cycle = cpu_to_be32(cycle);
1206        recp->h_version = cpu_to_be32(
1207                        xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1208        recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1209        recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1210        recp->h_fmt = cpu_to_be32(XLOG_FMT);
1211        memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1212}
1213
1214STATIC int
1215xlog_write_log_records(
1216        struct xlog     *log,
1217        int             cycle,
1218        int             start_block,
1219        int             blocks,
1220        int             tail_cycle,
1221        int             tail_block)
1222{
1223        xfs_caddr_t     offset;
1224        xfs_buf_t       *bp;
1225        int             balign, ealign;
1226        int             sectbb = log->l_sectBBsize;
1227        int             end_block = start_block + blocks;
1228        int             bufblks;
1229        int             error = 0;
1230        int             i, j = 0;
1231
1232        /*
1233         * Greedily allocate a buffer big enough to handle the full
1234         * range of basic blocks to be written.  If that fails, try
1235         * a smaller size.  We need to be able to write at least a
1236         * log sector, or we're out of luck.
1237         */
1238        bufblks = 1 << ffs(blocks);
1239        while (bufblks > log->l_logBBsize)
1240                bufblks >>= 1;
1241        while (!(bp = xlog_get_bp(log, bufblks))) {
1242                bufblks >>= 1;
1243                if (bufblks < sectbb)
1244                        return ENOMEM;
1245        }
1246
1247        /* We may need to do a read at the start to fill in part of
1248         * the buffer in the starting sector not covered by the first
1249         * write below.
1250         */
1251        balign = round_down(start_block, sectbb);
1252        if (balign != start_block) {
1253                error = xlog_bread_noalign(log, start_block, 1, bp);
1254                if (error)
1255                        goto out_put_bp;
1256
1257                j = start_block - balign;
1258        }
1259
1260        for (i = start_block; i < end_block; i += bufblks) {
1261                int             bcount, endcount;
1262
1263                bcount = min(bufblks, end_block - start_block);
1264                endcount = bcount - j;
1265
1266                /* We may need to do a read at the end to fill in part of
1267                 * the buffer in the final sector not covered by the write.
1268                 * If this is the same sector as the above read, skip it.
1269                 */
1270                ealign = round_down(end_block, sectbb);
1271                if (j == 0 && (start_block + endcount > ealign)) {
1272                        offset = bp->b_addr + BBTOB(ealign - start_block);
1273                        error = xlog_bread_offset(log, ealign, sectbb,
1274                                                        bp, offset);
1275                        if (error)
1276                                break;
1277
1278                }
1279
1280                offset = xlog_align(log, start_block, endcount, bp);
1281                for (; j < endcount; j++) {
1282                        xlog_add_record(log, offset, cycle, i+j,
1283                                        tail_cycle, tail_block);
1284                        offset += BBSIZE;
1285                }
1286                error = xlog_bwrite(log, start_block, endcount, bp);
1287                if (error)
1288                        break;
1289                start_block += endcount;
1290                j = 0;
1291        }
1292
1293 out_put_bp:
1294        xlog_put_bp(bp);
1295        return error;
1296}
1297
1298/*
1299 * This routine is called to blow away any incomplete log writes out
1300 * in front of the log head.  We do this so that we won't become confused
1301 * if we come up, write only a little bit more, and then crash again.
1302 * If we leave the partial log records out there, this situation could
1303 * cause us to think those partial writes are valid blocks since they
1304 * have the current cycle number.  We get rid of them by overwriting them
1305 * with empty log records with the old cycle number rather than the
1306 * current one.
1307 *
1308 * The tail lsn is passed in rather than taken from
1309 * the log so that we will not write over the unmount record after a
1310 * clean unmount in a 512 block log.  Doing so would leave the log without
1311 * any valid log records in it until a new one was written.  If we crashed
1312 * during that time we would not be able to recover.
1313 */
1314STATIC int
1315xlog_clear_stale_blocks(
1316        struct xlog     *log,
1317        xfs_lsn_t       tail_lsn)
1318{
1319        int             tail_cycle, head_cycle;
1320        int             tail_block, head_block;
1321        int             tail_distance, max_distance;
1322        int             distance;
1323        int             error;
1324
1325        tail_cycle = CYCLE_LSN(tail_lsn);
1326        tail_block = BLOCK_LSN(tail_lsn);
1327        head_cycle = log->l_curr_cycle;
1328        head_block = log->l_curr_block;
1329
1330        /*
1331         * Figure out the distance between the new head of the log
1332         * and the tail.  We want to write over any blocks beyond the
1333         * head that we may have written just before the crash, but
1334         * we don't want to overwrite the tail of the log.
1335         */
1336        if (head_cycle == tail_cycle) {
1337                /*
1338                 * The tail is behind the head in the physical log,
1339                 * so the distance from the head to the tail is the
1340                 * distance from the head to the end of the log plus
1341                 * the distance from the beginning of the log to the
1342                 * tail.
1343                 */
1344                if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1345                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1346                                         XFS_ERRLEVEL_LOW, log->l_mp);
1347                        return XFS_ERROR(EFSCORRUPTED);
1348                }
1349                tail_distance = tail_block + (log->l_logBBsize - head_block);
1350        } else {
1351                /*
1352                 * The head is behind the tail in the physical log,
1353                 * so the distance from the head to the tail is just
1354                 * the tail block minus the head block.
1355                 */
1356                if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1357                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1358                                         XFS_ERRLEVEL_LOW, log->l_mp);
1359                        return XFS_ERROR(EFSCORRUPTED);
1360                }
1361                tail_distance = tail_block - head_block;
1362        }
1363
1364        /*
1365         * If the head is right up against the tail, we can't clear
1366         * anything.
1367         */
1368        if (tail_distance <= 0) {
1369                ASSERT(tail_distance == 0);
1370                return 0;
1371        }
1372
1373        max_distance = XLOG_TOTAL_REC_SHIFT(log);
1374        /*
1375         * Take the smaller of the maximum amount of outstanding I/O
1376         * we could have and the distance to the tail to clear out.
1377         * We take the smaller so that we don't overwrite the tail and
1378         * we don't waste all day writing from the head to the tail
1379         * for no reason.
1380         */
1381        max_distance = MIN(max_distance, tail_distance);
1382
1383        if ((head_block + max_distance) <= log->l_logBBsize) {
1384                /*
1385                 * We can stomp all the blocks we need to without
1386                 * wrapping around the end of the log.  Just do it
1387                 * in a single write.  Use the cycle number of the
1388                 * current cycle minus one so that the log will look like:
1389                 *     n ... | n - 1 ...
1390                 */
1391                error = xlog_write_log_records(log, (head_cycle - 1),
1392                                head_block, max_distance, tail_cycle,
1393                                tail_block);
1394                if (error)
1395                        return error;
1396        } else {
1397                /*
1398                 * We need to wrap around the end of the physical log in
1399                 * order to clear all the blocks.  Do it in two separate
1400                 * I/Os.  The first write should be from the head to the
1401                 * end of the physical log, and it should use the current
1402                 * cycle number minus one just like above.
1403                 */
1404                distance = log->l_logBBsize - head_block;
1405                error = xlog_write_log_records(log, (head_cycle - 1),
1406                                head_block, distance, tail_cycle,
1407                                tail_block);
1408
1409                if (error)
1410                        return error;
1411
1412                /*
1413                 * Now write the blocks at the start of the physical log.
1414                 * This writes the remainder of the blocks we want to clear.
1415                 * It uses the current cycle number since we're now on the
1416                 * same cycle as the head so that we get:
1417                 *    n ... n ... | n - 1 ...
1418                 *    ^^^^^ blocks we're writing
1419                 */
1420                distance = max_distance - (log->l_logBBsize - head_block);
1421                error = xlog_write_log_records(log, head_cycle, 0, distance,
1422                                tail_cycle, tail_block);
1423                if (error)
1424                        return error;
1425        }
1426
1427        return 0;
1428}
1429
1430/******************************************************************************
1431 *
1432 *              Log recover routines
1433 *
1434 ******************************************************************************
1435 */
1436
1437STATIC xlog_recover_t *
1438xlog_recover_find_tid(
1439        struct hlist_head       *head,
1440        xlog_tid_t              tid)
1441{
1442        xlog_recover_t          *trans;
1443        struct hlist_node       *n;
1444
1445        hlist_for_each_entry(trans, n, head, r_list) {
1446                if (trans->r_log_tid == tid)
1447                        return trans;
1448        }
1449        return NULL;
1450}
1451
1452STATIC void
1453xlog_recover_new_tid(
1454        struct hlist_head       *head,
1455        xlog_tid_t              tid,
1456        xfs_lsn_t               lsn)
1457{
1458        xlog_recover_t          *trans;
1459
1460        trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1461        trans->r_log_tid   = tid;
1462        trans->r_lsn       = lsn;
1463        INIT_LIST_HEAD(&trans->r_itemq);
1464
1465        INIT_HLIST_NODE(&trans->r_list);
1466        hlist_add_head(&trans->r_list, head);
1467}
1468
1469STATIC void
1470xlog_recover_add_item(
1471        struct list_head        *head)
1472{
1473        xlog_recover_item_t     *item;
1474
1475        item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1476        INIT_LIST_HEAD(&item->ri_list);
1477        list_add_tail(&item->ri_list, head);
1478}
1479
1480STATIC int
1481xlog_recover_add_to_cont_trans(
1482        struct xlog             *log,
1483        struct xlog_recover     *trans,
1484        xfs_caddr_t             dp,
1485        int                     len)
1486{
1487        xlog_recover_item_t     *item;
1488        xfs_caddr_t             ptr, old_ptr;
1489        int                     old_len;
1490
1491        if (list_empty(&trans->r_itemq)) {
1492                /* finish copying rest of trans header */
1493                xlog_recover_add_item(&trans->r_itemq);
1494                ptr = (xfs_caddr_t) &trans->r_theader +
1495                                sizeof(xfs_trans_header_t) - len;
1496                memcpy(ptr, dp, len); /* d, s, l */
1497                return 0;
1498        }
1499        /* take the tail entry */
1500        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1501
1502        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1503        old_len = item->ri_buf[item->ri_cnt-1].i_len;
1504
1505        ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1506        memcpy(&ptr[old_len], dp, len); /* d, s, l */
1507        item->ri_buf[item->ri_cnt-1].i_len += len;
1508        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1509        trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1510        return 0;
1511}
1512
1513/*
1514 * The next region to add is the start of a new region.  It could be
1515 * a whole region or it could be the first part of a new region.  Because
1516 * of this, the assumption here is that the type and size fields of all
1517 * format structures fit into the first 32 bits of the structure.
1518 *
1519 * This works because all regions must be 32 bit aligned.  Therefore, we
1520 * either have both fields or we have neither field.  In the case we have
1521 * neither field, the data part of the region is zero length.  We only have
1522 * a log_op_header and can throw away the header since a new one will appear
1523 * later.  If we have at least 4 bytes, then we can determine how many regions
1524 * will appear in the current log item.
1525 */
1526STATIC int
1527xlog_recover_add_to_trans(
1528        struct xlog             *log,
1529        struct xlog_recover     *trans,
1530        xfs_caddr_t             dp,
1531        int                     len)
1532{
1533        xfs_inode_log_format_t  *in_f;                  /* any will do */
1534        xlog_recover_item_t     *item;
1535        xfs_caddr_t             ptr;
1536
1537        if (!len)
1538                return 0;
1539        if (list_empty(&trans->r_itemq)) {
1540                /* we need to catch log corruptions here */
1541                if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1542                        xfs_warn(log->l_mp, "%s: bad header magic number",
1543                                __func__);
1544                        ASSERT(0);
1545                        return XFS_ERROR(EIO);
1546                }
1547                if (len == sizeof(xfs_trans_header_t))
1548                        xlog_recover_add_item(&trans->r_itemq);
1549                memcpy(&trans->r_theader, dp, len); /* d, s, l */
1550                return 0;
1551        }
1552
1553        ptr = kmem_alloc(len, KM_SLEEP);
1554        memcpy(ptr, dp, len);
1555        in_f = (xfs_inode_log_format_t *)ptr;
1556
1557        /* take the tail entry */
1558        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1559        if (item->ri_total != 0 &&
1560             item->ri_total == item->ri_cnt) {
1561                /* tail item is in use, get a new one */
1562                xlog_recover_add_item(&trans->r_itemq);
1563                item = list_entry(trans->r_itemq.prev,
1564                                        xlog_recover_item_t, ri_list);
1565        }
1566
1567        if (item->ri_total == 0) {              /* first region to be added */
1568                if (in_f->ilf_size == 0 ||
1569                    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1570                        xfs_warn(log->l_mp,
1571                "bad number of regions (%d) in inode log format",
1572                                  in_f->ilf_size);
1573                        ASSERT(0);
1574                        return XFS_ERROR(EIO);
1575                }
1576
1577                item->ri_total = in_f->ilf_size;
1578                item->ri_buf =
1579                        kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1580                                    KM_SLEEP);
1581        }
1582        ASSERT(item->ri_total > item->ri_cnt);
1583        /* Description region is ri_buf[0] */
1584        item->ri_buf[item->ri_cnt].i_addr = ptr;
1585        item->ri_buf[item->ri_cnt].i_len  = len;
1586        item->ri_cnt++;
1587        trace_xfs_log_recover_item_add(log, trans, item, 0);
1588        return 0;
1589}
1590
1591/*
1592 * Sort the log items in the transaction. Cancelled buffers need
1593 * to be put first so they are processed before any items that might
1594 * modify the buffers. If they are cancelled, then the modifications
1595 * don't need to be replayed.
1596 */
1597STATIC int
1598xlog_recover_reorder_trans(
1599        struct xlog             *log,
1600        struct xlog_recover     *trans,
1601        int                     pass)
1602{
1603        xlog_recover_item_t     *item, *n;
1604        LIST_HEAD(sort_list);
1605
1606        list_splice_init(&trans->r_itemq, &sort_list);
1607        list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1608                xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1609
1610                switch (ITEM_TYPE(item)) {
1611                case XFS_LI_BUF:
1612                        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1613                                trace_xfs_log_recover_item_reorder_head(log,
1614                                                        trans, item, pass);
1615                                list_move(&item->ri_list, &trans->r_itemq);
1616                                break;
1617                        }
1618                case XFS_LI_INODE:
1619                case XFS_LI_DQUOT:
1620                case XFS_LI_QUOTAOFF:
1621                case XFS_LI_EFD:
1622                case XFS_LI_EFI:
1623                        trace_xfs_log_recover_item_reorder_tail(log,
1624                                                        trans, item, pass);
1625                        list_move_tail(&item->ri_list, &trans->r_itemq);
1626                        break;
1627                default:
1628                        xfs_warn(log->l_mp,
1629                                "%s: unrecognized type of log operation",
1630                                __func__);
1631                        ASSERT(0);
1632                        return XFS_ERROR(EIO);
1633                }
1634        }
1635        ASSERT(list_empty(&sort_list));
1636        return 0;
1637}
1638
1639/*
1640 * Build up the table of buf cancel records so that we don't replay
1641 * cancelled data in the second pass.  For buffer records that are
1642 * not cancel records, there is nothing to do here so we just return.
1643 *
1644 * If we get a cancel record which is already in the table, this indicates
1645 * that the buffer was cancelled multiple times.  In order to ensure
1646 * that during pass 2 we keep the record in the table until we reach its
1647 * last occurrence in the log, we keep a reference count in the cancel
1648 * record in the table to tell us how many times we expect to see this
1649 * record during the second pass.
1650 */
1651STATIC int
1652xlog_recover_buffer_pass1(
1653        struct xlog                     *log,
1654        struct xlog_recover_item        *item)
1655{
1656        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1657        struct list_head        *bucket;
1658        struct xfs_buf_cancel   *bcp;
1659
1660        /*
1661         * If this isn't a cancel buffer item, then just return.
1662         */
1663        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1664                trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1665                return 0;
1666        }
1667
1668        /*
1669         * Insert an xfs_buf_cancel record into the hash table of them.
1670         * If there is already an identical record, bump its reference count.
1671         */
1672        bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1673        list_for_each_entry(bcp, bucket, bc_list) {
1674                if (bcp->bc_blkno == buf_f->blf_blkno &&
1675                    bcp->bc_len == buf_f->blf_len) {
1676                        bcp->bc_refcount++;
1677                        trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1678                        return 0;
1679                }
1680        }
1681
1682        bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1683        bcp->bc_blkno = buf_f->blf_blkno;
1684        bcp->bc_len = buf_f->blf_len;
1685        bcp->bc_refcount = 1;
1686        list_add_tail(&bcp->bc_list, bucket);
1687
1688        trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1689        return 0;
1690}
1691
1692/*
1693 * Check to see whether the buffer being recovered has a corresponding
1694 * entry in the buffer cancel record table.  If it does then return 1
1695 * so that it will be cancelled, otherwise return 0.  If the buffer is
1696 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1697 * the refcount on the entry in the table and remove it from the table
1698 * if this is the last reference.
1699 *
1700 * We remove the cancel record from the table when we encounter its
1701 * last occurrence in the log so that if the same buffer is re-used
1702 * again after its last cancellation we actually replay the changes
1703 * made at that point.
1704 */
1705STATIC int
1706xlog_check_buffer_cancelled(
1707        struct xlog             *log,
1708        xfs_daddr_t             blkno,
1709        uint                    len,
1710        ushort                  flags)
1711{
1712        struct list_head        *bucket;
1713        struct xfs_buf_cancel   *bcp;
1714
1715        if (log->l_buf_cancel_table == NULL) {
1716                /*
1717                 * There is nothing in the table built in pass one,
1718                 * so this buffer must not be cancelled.
1719                 */
1720                ASSERT(!(flags & XFS_BLF_CANCEL));
1721                return 0;
1722        }
1723
1724        /*
1725         * Search for an entry in the  cancel table that matches our buffer.
1726         */
1727        bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1728        list_for_each_entry(bcp, bucket, bc_list) {
1729                if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1730                        goto found;
1731        }
1732
1733        /*
1734         * We didn't find a corresponding entry in the table, so return 0 so
1735         * that the buffer is NOT cancelled.
1736         */
1737        ASSERT(!(flags & XFS_BLF_CANCEL));
1738        return 0;
1739
1740found:
1741        /*
1742         * We've go a match, so return 1 so that the recovery of this buffer
1743         * is cancelled.  If this buffer is actually a buffer cancel log
1744         * item, then decrement the refcount on the one in the table and
1745         * remove it if this is the last reference.
1746         */
1747        if (flags & XFS_BLF_CANCEL) {
1748                if (--bcp->bc_refcount == 0) {
1749                        list_del(&bcp->bc_list);
1750                        kmem_free(bcp);
1751                }
1752        }
1753        return 1;
1754}
1755
1756/*
1757 * Perform recovery for a buffer full of inodes.  In these buffers, the only
1758 * data which should be recovered is that which corresponds to the
1759 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
1760 * data for the inodes is always logged through the inodes themselves rather
1761 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1762 *
1763 * The only time when buffers full of inodes are fully recovered is when the
1764 * buffer is full of newly allocated inodes.  In this case the buffer will
1765 * not be marked as an inode buffer and so will be sent to
1766 * xlog_recover_do_reg_buffer() below during recovery.
1767 */
1768STATIC int
1769xlog_recover_do_inode_buffer(
1770        struct xfs_mount        *mp,
1771        xlog_recover_item_t     *item,
1772        struct xfs_buf          *bp,
1773        xfs_buf_log_format_t    *buf_f)
1774{
1775        int                     i;
1776        int                     item_index = 0;
1777        int                     bit = 0;
1778        int                     nbits = 0;
1779        int                     reg_buf_offset = 0;
1780        int                     reg_buf_bytes = 0;
1781        int                     next_unlinked_offset;
1782        int                     inodes_per_buf;
1783        xfs_agino_t             *logged_nextp;
1784        xfs_agino_t             *buffer_nextp;
1785
1786        trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1787
1788        inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1789        for (i = 0; i < inodes_per_buf; i++) {
1790                next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1791                        offsetof(xfs_dinode_t, di_next_unlinked);
1792
1793                while (next_unlinked_offset >=
1794                       (reg_buf_offset + reg_buf_bytes)) {
1795                        /*
1796                         * The next di_next_unlinked field is beyond
1797                         * the current logged region.  Find the next
1798                         * logged region that contains or is beyond
1799                         * the current di_next_unlinked field.
1800                         */
1801                        bit += nbits;
1802                        bit = xfs_next_bit(buf_f->blf_data_map,
1803                                           buf_f->blf_map_size, bit);
1804
1805                        /*
1806                         * If there are no more logged regions in the
1807                         * buffer, then we're done.
1808                         */
1809                        if (bit == -1)
1810                                return 0;
1811
1812                        nbits = xfs_contig_bits(buf_f->blf_data_map,
1813                                                buf_f->blf_map_size, bit);
1814                        ASSERT(nbits > 0);
1815                        reg_buf_offset = bit << XFS_BLF_SHIFT;
1816                        reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1817                        item_index++;
1818                }
1819
1820                /*
1821                 * If the current logged region starts after the current
1822                 * di_next_unlinked field, then move on to the next
1823                 * di_next_unlinked field.
1824                 */
1825                if (next_unlinked_offset < reg_buf_offset)
1826                        continue;
1827
1828                ASSERT(item->ri_buf[item_index].i_addr != NULL);
1829                ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1830                ASSERT((reg_buf_offset + reg_buf_bytes) <=
1831                                                        BBTOB(bp->b_io_length));
1832
1833                /*
1834                 * The current logged region contains a copy of the
1835                 * current di_next_unlinked field.  Extract its value
1836                 * and copy it to the buffer copy.
1837                 */
1838                logged_nextp = item->ri_buf[item_index].i_addr +
1839                                next_unlinked_offset - reg_buf_offset;
1840                if (unlikely(*logged_nextp == 0)) {
1841                        xfs_alert(mp,
1842                "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1843                "Trying to replay bad (0) inode di_next_unlinked field.",
1844                                item, bp);
1845                        XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1846                                         XFS_ERRLEVEL_LOW, mp);
1847                        return XFS_ERROR(EFSCORRUPTED);
1848                }
1849
1850                buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1851                                              next_unlinked_offset);
1852                *buffer_nextp = *logged_nextp;
1853        }
1854
1855        return 0;
1856}
1857
1858/*
1859 * Perform a 'normal' buffer recovery.  Each logged region of the
1860 * buffer should be copied over the corresponding region in the
1861 * given buffer.  The bitmap in the buf log format structure indicates
1862 * where to place the logged data.
1863 */
1864STATIC void
1865xlog_recover_do_reg_buffer(
1866        struct xfs_mount        *mp,
1867        xlog_recover_item_t     *item,
1868        struct xfs_buf          *bp,
1869        xfs_buf_log_format_t    *buf_f)
1870{
1871        int                     i;
1872        int                     bit;
1873        int                     nbits;
1874        int                     error;
1875
1876        trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1877
1878        bit = 0;
1879        i = 1;  /* 0 is the buf format structure */
1880        while (1) {
1881                bit = xfs_next_bit(buf_f->blf_data_map,
1882                                   buf_f->blf_map_size, bit);
1883                if (bit == -1)
1884                        break;
1885                nbits = xfs_contig_bits(buf_f->blf_data_map,
1886                                        buf_f->blf_map_size, bit);
1887                ASSERT(nbits > 0);
1888                ASSERT(item->ri_buf[i].i_addr != NULL);
1889                ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1890                ASSERT(BBTOB(bp->b_io_length) >=
1891                       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
1892
1893                /*
1894                 * Do a sanity check if this is a dquot buffer. Just checking
1895                 * the first dquot in the buffer should do. XXXThis is
1896                 * probably a good thing to do for other buf types also.
1897                 */
1898                error = 0;
1899                if (buf_f->blf_flags &
1900                   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1901                        if (item->ri_buf[i].i_addr == NULL) {
1902                                xfs_alert(mp,
1903                                        "XFS: NULL dquot in %s.", __func__);
1904                                goto next;
1905                        }
1906                        if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1907                                xfs_alert(mp,
1908                                        "XFS: dquot too small (%d) in %s.",
1909                                        item->ri_buf[i].i_len, __func__);
1910                                goto next;
1911                        }
1912                        error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1913                                               -1, 0, XFS_QMOPT_DOWARN,
1914                                               "dquot_buf_recover");
1915                        if (error)
1916                                goto next;
1917                }
1918
1919                memcpy(xfs_buf_offset(bp,
1920                        (uint)bit << XFS_BLF_SHIFT),    /* dest */
1921                        item->ri_buf[i].i_addr,         /* source */
1922                        nbits<<XFS_BLF_SHIFT);          /* length */
1923 next:
1924                i++;
1925                bit += nbits;
1926        }
1927
1928        /* Shouldn't be any more regions */
1929        ASSERT(i == item->ri_total);
1930}
1931
1932/*
1933 * Do some primitive error checking on ondisk dquot data structures.
1934 */
1935int
1936xfs_qm_dqcheck(
1937        struct xfs_mount *mp,
1938        xfs_disk_dquot_t *ddq,
1939        xfs_dqid_t       id,
1940        uint             type,    /* used only when IO_dorepair is true */
1941        uint             flags,
1942        char             *str)
1943{
1944        xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
1945        int             errs = 0;
1946
1947        /*
1948         * We can encounter an uninitialized dquot buffer for 2 reasons:
1949         * 1. If we crash while deleting the quotainode(s), and those blks got
1950         *    used for user data. This is because we take the path of regular
1951         *    file deletion; however, the size field of quotainodes is never
1952         *    updated, so all the tricks that we play in itruncate_finish
1953         *    don't quite matter.
1954         *
1955         * 2. We don't play the quota buffers when there's a quotaoff logitem.
1956         *    But the allocation will be replayed so we'll end up with an
1957         *    uninitialized quota block.
1958         *
1959         * This is all fine; things are still consistent, and we haven't lost
1960         * any quota information. Just don't complain about bad dquot blks.
1961         */
1962        if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
1963                if (flags & XFS_QMOPT_DOWARN)
1964                        xfs_alert(mp,
1965                        "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1966                        str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1967                errs++;
1968        }
1969        if (ddq->d_version != XFS_DQUOT_VERSION) {
1970                if (flags & XFS_QMOPT_DOWARN)
1971                        xfs_alert(mp,
1972                        "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1973                        str, id, ddq->d_version, XFS_DQUOT_VERSION);
1974                errs++;
1975        }
1976
1977        if (ddq->d_flags != XFS_DQ_USER &&
1978            ddq->d_flags != XFS_DQ_PROJ &&
1979            ddq->d_flags != XFS_DQ_GROUP) {
1980                if (flags & XFS_QMOPT_DOWARN)
1981                        xfs_alert(mp,
1982                        "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1983                        str, id, ddq->d_flags);
1984                errs++;
1985        }
1986
1987        if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1988                if (flags & XFS_QMOPT_DOWARN)
1989                        xfs_alert(mp,
1990                        "%s : ondisk-dquot 0x%p, ID mismatch: "
1991                        "0x%x expected, found id 0x%x",
1992                        str, ddq, id, be32_to_cpu(ddq->d_id));
1993                errs++;
1994        }
1995
1996        if (!errs && ddq->d_id) {
1997                if (ddq->d_blk_softlimit &&
1998                    be64_to_cpu(ddq->d_bcount) >
1999                                be64_to_cpu(ddq->d_blk_softlimit)) {
2000                        if (!ddq->d_btimer) {
2001                                if (flags & XFS_QMOPT_DOWARN)
2002                                        xfs_alert(mp,
2003                        "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2004                                        str, (int)be32_to_cpu(ddq->d_id), ddq);
2005                                errs++;
2006                        }
2007                }
2008                if (ddq->d_ino_softlimit &&
2009                    be64_to_cpu(ddq->d_icount) >
2010                                be64_to_cpu(ddq->d_ino_softlimit)) {
2011                        if (!ddq->d_itimer) {
2012                                if (flags & XFS_QMOPT_DOWARN)
2013                                        xfs_alert(mp,
2014                        "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2015                                        str, (int)be32_to_cpu(ddq->d_id), ddq);
2016                                errs++;
2017                        }
2018                }
2019                if (ddq->d_rtb_softlimit &&
2020                    be64_to_cpu(ddq->d_rtbcount) >
2021                                be64_to_cpu(ddq->d_rtb_softlimit)) {
2022                        if (!ddq->d_rtbtimer) {
2023                                if (flags & XFS_QMOPT_DOWARN)
2024                                        xfs_alert(mp,
2025                        "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2026                                        str, (int)be32_to_cpu(ddq->d_id), ddq);
2027                                errs++;
2028                        }
2029                }
2030        }
2031
2032        if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2033                return errs;
2034
2035        if (flags & XFS_QMOPT_DOWARN)
2036                xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2037
2038        /*
2039         * Typically, a repair is only requested by quotacheck.
2040         */
2041        ASSERT(id != -1);
2042        ASSERT(flags & XFS_QMOPT_DQREPAIR);
2043        memset(d, 0, sizeof(xfs_dqblk_t));
2044
2045        d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2046        d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2047        d->dd_diskdq.d_flags = type;
2048        d->dd_diskdq.d_id = cpu_to_be32(id);
2049
2050        return errs;
2051}
2052
2053/*
2054 * Perform a dquot buffer recovery.
2055 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2056 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2057 * Else, treat it as a regular buffer and do recovery.
2058 */
2059STATIC void
2060xlog_recover_do_dquot_buffer(
2061        struct xfs_mount                *mp,
2062        struct xlog                     *log,
2063        struct xlog_recover_item        *item,
2064        struct xfs_buf                  *bp,
2065        struct xfs_buf_log_format       *buf_f)
2066{
2067        uint                    type;
2068
2069        trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2070
2071        /*
2072         * Filesystems are required to send in quota flags at mount time.
2073         */
2074        if (mp->m_qflags == 0) {
2075                return;
2076        }
2077
2078        type = 0;
2079        if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2080                type |= XFS_DQ_USER;
2081        if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2082                type |= XFS_DQ_PROJ;
2083        if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2084                type |= XFS_DQ_GROUP;
2085        /*
2086         * This type of quotas was turned off, so ignore this buffer
2087         */
2088        if (log->l_quotaoffs_flag & type)
2089                return;
2090
2091        xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2092}
2093
2094/*
2095 * This routine replays a modification made to a buffer at runtime.
2096 * There are actually two types of buffer, regular and inode, which
2097 * are handled differently.  Inode buffers are handled differently
2098 * in that we only recover a specific set of data from them, namely
2099 * the inode di_next_unlinked fields.  This is because all other inode
2100 * data is actually logged via inode records and any data we replay
2101 * here which overlaps that may be stale.
2102 *
2103 * When meta-data buffers are freed at run time we log a buffer item
2104 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2105 * of the buffer in the log should not be replayed at recovery time.
2106 * This is so that if the blocks covered by the buffer are reused for
2107 * file data before we crash we don't end up replaying old, freed
2108 * meta-data into a user's file.
2109 *
2110 * To handle the cancellation of buffer log items, we make two passes
2111 * over the log during recovery.  During the first we build a table of
2112 * those buffers which have been cancelled, and during the second we
2113 * only replay those buffers which do not have corresponding cancel
2114 * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
2115 * for more details on the implementation of the table of cancel records.
2116 */
2117STATIC int
2118xlog_recover_buffer_pass2(
2119        struct xlog                     *log,
2120        struct list_head                *buffer_list,
2121        struct xlog_recover_item        *item)
2122{
2123        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
2124        xfs_mount_t             *mp = log->l_mp;
2125        xfs_buf_t               *bp;
2126        int                     error;
2127        uint                    buf_flags;
2128
2129        /*
2130         * In this pass we only want to recover all the buffers which have
2131         * not been cancelled and are not cancellation buffers themselves.
2132         */
2133        if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2134                        buf_f->blf_len, buf_f->blf_flags)) {
2135                trace_xfs_log_recover_buf_cancel(log, buf_f);
2136                return 0;
2137        }
2138
2139        trace_xfs_log_recover_buf_recover(log, buf_f);
2140
2141        buf_flags = 0;
2142        if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2143                buf_flags |= XBF_UNMAPPED;
2144
2145        bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146                          buf_flags);
2147        if (!bp)
2148                return XFS_ERROR(ENOMEM);
2149        error = bp->b_error;
2150        if (error) {
2151                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2152                xfs_buf_relse(bp);
2153                return error;
2154        }
2155
2156        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2157                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2158        } else if (buf_f->blf_flags &
2159                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2160                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2161        } else {
2162                xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2163        }
2164        if (error)
2165                return XFS_ERROR(error);
2166
2167        /*
2168         * Perform delayed write on the buffer.  Asynchronous writes will be
2169         * slower when taking into account all the buffers to be flushed.
2170         *
2171         * Also make sure that only inode buffers with good sizes stay in
2172         * the buffer cache.  The kernel moves inodes in buffers of 1 block
2173         * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2174         * buffers in the log can be a different size if the log was generated
2175         * by an older kernel using unclustered inode buffers or a newer kernel
2176         * running with a different inode cluster size.  Regardless, if the
2177         * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2178         * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2179         * the buffer out of the buffer cache so that the buffer won't
2180         * overlap with future reads of those inodes.
2181         */
2182        if (XFS_DINODE_MAGIC ==
2183            be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2184            (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2185                        (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2186                xfs_buf_stale(bp);
2187                error = xfs_bwrite(bp);
2188        } else {
2189                ASSERT(bp->b_target->bt_mount == mp);
2190                bp->b_iodone = xlog_recover_iodone;
2191                xfs_buf_delwri_queue(bp, buffer_list);
2192        }
2193
2194        xfs_buf_relse(bp);
2195        return error;
2196}
2197
2198STATIC int
2199xlog_recover_inode_pass2(
2200        struct xlog                     *log,
2201        struct list_head                *buffer_list,
2202        struct xlog_recover_item        *item)
2203{
2204        xfs_inode_log_format_t  *in_f;
2205        xfs_mount_t             *mp = log->l_mp;
2206        xfs_buf_t               *bp;
2207        xfs_dinode_t            *dip;
2208        int                     len;
2209        xfs_caddr_t             src;
2210        xfs_caddr_t             dest;
2211        int                     error;
2212        int                     attr_index;
2213        uint                    fields;
2214        xfs_icdinode_t          *dicp;
2215        int                     need_free = 0;
2216
2217        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2218                in_f = item->ri_buf[0].i_addr;
2219        } else {
2220                in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2221                need_free = 1;
2222                error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2223                if (error)
2224                        goto error;
2225        }
2226
2227        /*
2228         * Inode buffers can be freed, look out for it,
2229         * and do not replay the inode.
2230         */
2231        if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2232                                        in_f->ilf_len, 0)) {
2233                error = 0;
2234                trace_xfs_log_recover_inode_cancel(log, in_f);
2235                goto error;
2236        }
2237        trace_xfs_log_recover_inode_recover(log, in_f);
2238
2239        bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
2240        if (!bp) {
2241                error = ENOMEM;
2242                goto error;
2243        }
2244        error = bp->b_error;
2245        if (error) {
2246                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2247                xfs_buf_relse(bp);
2248                goto error;
2249        }
2250        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2251        dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2252
2253        /*
2254         * Make sure the place we're flushing out to really looks
2255         * like an inode!
2256         */
2257        if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2258                xfs_buf_relse(bp);
2259                xfs_alert(mp,
2260        "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2261                        __func__, dip, bp, in_f->ilf_ino);
2262                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2263                                 XFS_ERRLEVEL_LOW, mp);
2264                error = EFSCORRUPTED;
2265                goto error;
2266        }
2267        dicp = item->ri_buf[1].i_addr;
2268        if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2269                xfs_buf_relse(bp);
2270                xfs_alert(mp,
2271                        "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2272                        __func__, item, in_f->ilf_ino);
2273                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2274                                 XFS_ERRLEVEL_LOW, mp);
2275                error = EFSCORRUPTED;
2276                goto error;
2277        }
2278
2279        /* Skip replay when the on disk inode is newer than the log one */
2280        if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2281                /*
2282                 * Deal with the wrap case, DI_MAX_FLUSH is less
2283                 * than smaller numbers
2284                 */
2285                if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2286                    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2287                        /* do nothing */
2288                } else {
2289                        xfs_buf_relse(bp);
2290                        trace_xfs_log_recover_inode_skip(log, in_f);
2291                        error = 0;
2292                        goto error;
2293                }
2294        }
2295        /* Take the opportunity to reset the flush iteration count */
2296        dicp->di_flushiter = 0;
2297
2298        if (unlikely(S_ISREG(dicp->di_mode))) {
2299                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2300                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2301                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2302                                         XFS_ERRLEVEL_LOW, mp, dicp);
2303                        xfs_buf_relse(bp);
2304                        xfs_alert(mp,
2305                "%s: Bad regular inode log record, rec ptr 0x%p, "
2306                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2307                                __func__, item, dip, bp, in_f->ilf_ino);
2308                        error = EFSCORRUPTED;
2309                        goto error;
2310                }
2311        } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2312                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2313                    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2314                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2315                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2316                                             XFS_ERRLEVEL_LOW, mp, dicp);
2317                        xfs_buf_relse(bp);
2318                        xfs_alert(mp,
2319                "%s: Bad dir inode log record, rec ptr 0x%p, "
2320                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2321                                __func__, item, dip, bp, in_f->ilf_ino);
2322                        error = EFSCORRUPTED;
2323                        goto error;
2324                }
2325        }
2326        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2327                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2328                                     XFS_ERRLEVEL_LOW, mp, dicp);
2329                xfs_buf_relse(bp);
2330                xfs_alert(mp,
2331        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2332        "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2333                        __func__, item, dip, bp, in_f->ilf_ino,
2334                        dicp->di_nextents + dicp->di_anextents,
2335                        dicp->di_nblocks);
2336                error = EFSCORRUPTED;
2337                goto error;
2338        }
2339        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2340                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2341                                     XFS_ERRLEVEL_LOW, mp, dicp);
2342                xfs_buf_relse(bp);
2343                xfs_alert(mp,
2344        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2345        "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2346                        item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2347                error = EFSCORRUPTED;
2348                goto error;
2349        }
2350        if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2351                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2352                                     XFS_ERRLEVEL_LOW, mp, dicp);
2353                xfs_buf_relse(bp);
2354                xfs_alert(mp,
2355                        "%s: Bad inode log record length %d, rec ptr 0x%p",
2356                        __func__, item->ri_buf[1].i_len, item);
2357                error = EFSCORRUPTED;
2358                goto error;
2359        }
2360
2361        /* The core is in in-core format */
2362        xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2363
2364        /* the rest is in on-disk format */
2365        if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2366                memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2367                        item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2368                        item->ri_buf[1].i_len  - sizeof(struct xfs_icdinode));
2369        }
2370
2371        fields = in_f->ilf_fields;
2372        switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2373        case XFS_ILOG_DEV:
2374                xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2375                break;
2376        case XFS_ILOG_UUID:
2377                memcpy(XFS_DFORK_DPTR(dip),
2378                       &in_f->ilf_u.ilfu_uuid,
2379                       sizeof(uuid_t));
2380                break;
2381        }
2382
2383        if (in_f->ilf_size == 2)
2384                goto write_inode_buffer;
2385        len = item->ri_buf[2].i_len;
2386        src = item->ri_buf[2].i_addr;
2387        ASSERT(in_f->ilf_size <= 4);
2388        ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2389        ASSERT(!(fields & XFS_ILOG_DFORK) ||
2390               (len == in_f->ilf_dsize));
2391
2392        switch (fields & XFS_ILOG_DFORK) {
2393        case XFS_ILOG_DDATA:
2394        case XFS_ILOG_DEXT:
2395                memcpy(XFS_DFORK_DPTR(dip), src, len);
2396                break;
2397
2398        case XFS_ILOG_DBROOT:
2399                xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2400                                 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2401                                 XFS_DFORK_DSIZE(dip, mp));
2402                break;
2403
2404        default:
2405                /*
2406                 * There are no data fork flags set.
2407                 */
2408                ASSERT((fields & XFS_ILOG_DFORK) == 0);
2409                break;
2410        }
2411
2412        /*
2413         * If we logged any attribute data, recover it.  There may or
2414         * may not have been any other non-core data logged in this
2415         * transaction.
2416         */
2417        if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2418                if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2419                        attr_index = 3;
2420                } else {
2421                        attr_index = 2;
2422                }
2423                len = item->ri_buf[attr_index].i_len;
2424                src = item->ri_buf[attr_index].i_addr;
2425                ASSERT(len == in_f->ilf_asize);
2426
2427                switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2428                case XFS_ILOG_ADATA:
2429                case XFS_ILOG_AEXT:
2430                        dest = XFS_DFORK_APTR(dip);
2431                        ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2432                        memcpy(dest, src, len);
2433                        break;
2434
2435                case XFS_ILOG_ABROOT:
2436                        dest = XFS_DFORK_APTR(dip);
2437                        xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2438                                         len, (xfs_bmdr_block_t*)dest,
2439                                         XFS_DFORK_ASIZE(dip, mp));
2440                        break;
2441
2442                default:
2443                        xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2444                        ASSERT(0);
2445                        xfs_buf_relse(bp);
2446                        error = EIO;
2447                        goto error;
2448                }
2449        }
2450
2451write_inode_buffer:
2452        ASSERT(bp->b_target->bt_mount == mp);
2453        bp->b_iodone = xlog_recover_iodone;
2454        xfs_buf_delwri_queue(bp, buffer_list);
2455        xfs_buf_relse(bp);
2456error:
2457        if (need_free)
2458                kmem_free(in_f);
2459        return XFS_ERROR(error);
2460}
2461
2462/*
2463 * Recover QUOTAOFF records. We simply make a note of it in the xlog
2464 * structure, so that we know not to do any dquot item or dquot buffer recovery,
2465 * of that type.
2466 */
2467STATIC int
2468xlog_recover_quotaoff_pass1(
2469        struct xlog                     *log,
2470        struct xlog_recover_item        *item)
2471{
2472        xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
2473        ASSERT(qoff_f);
2474
2475        /*
2476         * The logitem format's flag tells us if this was user quotaoff,
2477         * group/project quotaoff or both.
2478         */
2479        if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2480                log->l_quotaoffs_flag |= XFS_DQ_USER;
2481        if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2482                log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2483        if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2484                log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2485
2486        return (0);
2487}
2488
2489/*
2490 * Recover a dquot record
2491 */
2492STATIC int
2493xlog_recover_dquot_pass2(
2494        struct xlog                     *log,
2495        struct list_head                *buffer_list,
2496        struct xlog_recover_item        *item)
2497{
2498        xfs_mount_t             *mp = log->l_mp;
2499        xfs_buf_t               *bp;
2500        struct xfs_disk_dquot   *ddq, *recddq;
2501        int                     error;
2502        xfs_dq_logformat_t      *dq_f;
2503        uint                    type;
2504
2505
2506        /*
2507         * Filesystems are required to send in quota flags at mount time.
2508         */
2509        if (mp->m_qflags == 0)
2510                return (0);
2511
2512        recddq = item->ri_buf[1].i_addr;
2513        if (recddq == NULL) {
2514                xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2515                return XFS_ERROR(EIO);
2516        }
2517        if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2518                xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2519                        item->ri_buf[1].i_len, __func__);
2520                return XFS_ERROR(EIO);
2521        }
2522
2523        /*
2524         * This type of quotas was turned off, so ignore this record.
2525         */
2526        type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2527        ASSERT(type);
2528        if (log->l_quotaoffs_flag & type)
2529                return (0);
2530
2531        /*
2532         * At this point we know that quota was _not_ turned off.
2533         * Since the mount flags are not indicating to us otherwise, this
2534         * must mean that quota is on, and the dquot needs to be replayed.
2535         * Remember that we may not have fully recovered the superblock yet,
2536         * so we can't do the usual trick of looking at the SB quota bits.
2537         *
2538         * The other possibility, of course, is that the quota subsystem was
2539         * removed since the last mount - ENOSYS.
2540         */
2541        dq_f = item->ri_buf[0].i_addr;
2542        ASSERT(dq_f);
2543        error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2544                           "xlog_recover_dquot_pass2 (log copy)");
2545        if (error)
2546                return XFS_ERROR(EIO);
2547        ASSERT(dq_f->qlf_len == 1);
2548
2549        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550                                   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
2551        if (error)
2552                return error;
2553
2554        ASSERT(bp);
2555        ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2556
2557        /*
2558         * At least the magic num portion should be on disk because this
2559         * was among a chunk of dquots created earlier, and we did some
2560         * minimal initialization then.
2561         */
2562        error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2563                           "xlog_recover_dquot_pass2");
2564        if (error) {
2565                xfs_buf_relse(bp);
2566                return XFS_ERROR(EIO);
2567        }
2568
2569        memcpy(ddq, recddq, item->ri_buf[1].i_len);
2570
2571        ASSERT(dq_f->qlf_size == 2);
2572        ASSERT(bp->b_target->bt_mount == mp);
2573        bp->b_iodone = xlog_recover_iodone;
2574        xfs_buf_delwri_queue(bp, buffer_list);
2575        xfs_buf_relse(bp);
2576
2577        return (0);
2578}
2579
2580/*
2581 * This routine is called to create an in-core extent free intent
2582 * item from the efi format structure which was logged on disk.
2583 * It allocates an in-core efi, copies the extents from the format
2584 * structure into it, and adds the efi to the AIL with the given
2585 * LSN.
2586 */
2587STATIC int
2588xlog_recover_efi_pass2(
2589        struct xlog                     *log,
2590        struct xlog_recover_item        *item,
2591        xfs_lsn_t                       lsn)
2592{
2593        int                     error;
2594        xfs_mount_t             *mp = log->l_mp;
2595        xfs_efi_log_item_t      *efip;
2596        xfs_efi_log_format_t    *efi_formatp;
2597
2598        efi_formatp = item->ri_buf[0].i_addr;
2599
2600        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2601        if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2602                                         &(efip->efi_format)))) {
2603                xfs_efi_item_free(efip);
2604                return error;
2605        }
2606        atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2607
2608        spin_lock(&log->l_ailp->xa_lock);
2609        /*
2610         * xfs_trans_ail_update() drops the AIL lock.
2611         */
2612        xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2613        return 0;
2614}
2615
2616
2617/*
2618 * This routine is called when an efd format structure is found in
2619 * a committed transaction in the log.  It's purpose is to cancel
2620 * the corresponding efi if it was still in the log.  To do this
2621 * it searches the AIL for the efi with an id equal to that in the
2622 * efd format structure.  If we find it, we remove the efi from the
2623 * AIL and free it.
2624 */
2625STATIC int
2626xlog_recover_efd_pass2(
2627        struct xlog                     *log,
2628        struct xlog_recover_item        *item)
2629{
2630        xfs_efd_log_format_t    *efd_formatp;
2631        xfs_efi_log_item_t      *efip = NULL;
2632        xfs_log_item_t          *lip;
2633        __uint64_t              efi_id;
2634        struct xfs_ail_cursor   cur;
2635        struct xfs_ail          *ailp = log->l_ailp;
2636
2637        efd_formatp = item->ri_buf[0].i_addr;
2638        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2639                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2640               (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2641                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2642        efi_id = efd_formatp->efd_efi_id;
2643
2644        /*
2645         * Search for the efi with the id in the efd format structure
2646         * in the AIL.
2647         */
2648        spin_lock(&ailp->xa_lock);
2649        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2650        while (lip != NULL) {
2651                if (lip->li_type == XFS_LI_EFI) {
2652                        efip = (xfs_efi_log_item_t *)lip;
2653                        if (efip->efi_format.efi_id == efi_id) {
2654                                /*
2655                                 * xfs_trans_ail_delete() drops the
2656                                 * AIL lock.
2657                                 */
2658                                xfs_trans_ail_delete(ailp, lip,
2659                                                     SHUTDOWN_CORRUPT_INCORE);
2660                                xfs_efi_item_free(efip);
2661                                spin_lock(&ailp->xa_lock);
2662                                break;
2663                        }
2664                }
2665                lip = xfs_trans_ail_cursor_next(ailp, &cur);
2666        }
2667        xfs_trans_ail_cursor_done(ailp, &cur);
2668        spin_unlock(&ailp->xa_lock);
2669
2670        return 0;
2671}
2672
2673/*
2674 * Free up any resources allocated by the transaction
2675 *
2676 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2677 */
2678STATIC void
2679xlog_recover_free_trans(
2680        struct xlog_recover     *trans)
2681{
2682        xlog_recover_item_t     *item, *n;
2683        int                     i;
2684
2685        list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2686                /* Free the regions in the item. */
2687                list_del(&item->ri_list);
2688                for (i = 0; i < item->ri_cnt; i++)
2689                        kmem_free(item->ri_buf[i].i_addr);
2690                /* Free the item itself */
2691                kmem_free(item->ri_buf);
2692                kmem_free(item);
2693        }
2694        /* Free the transaction recover structure */
2695        kmem_free(trans);
2696}
2697
2698STATIC int
2699xlog_recover_commit_pass1(
2700        struct xlog                     *log,
2701        struct xlog_recover             *trans,
2702        struct xlog_recover_item        *item)
2703{
2704        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2705
2706        switch (ITEM_TYPE(item)) {
2707        case XFS_LI_BUF:
2708                return xlog_recover_buffer_pass1(log, item);
2709        case XFS_LI_QUOTAOFF:
2710                return xlog_recover_quotaoff_pass1(log, item);
2711        case XFS_LI_INODE:
2712        case XFS_LI_EFI:
2713        case XFS_LI_EFD:
2714        case XFS_LI_DQUOT:
2715                /* nothing to do in pass 1 */
2716                return 0;
2717        default:
2718                xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2719                        __func__, ITEM_TYPE(item));
2720                ASSERT(0);
2721                return XFS_ERROR(EIO);
2722        }
2723}
2724
2725STATIC int
2726xlog_recover_commit_pass2(
2727        struct xlog                     *log,
2728        struct xlog_recover             *trans,
2729        struct list_head                *buffer_list,
2730        struct xlog_recover_item        *item)
2731{
2732        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2733
2734        switch (ITEM_TYPE(item)) {
2735        case XFS_LI_BUF:
2736                return xlog_recover_buffer_pass2(log, buffer_list, item);
2737        case XFS_LI_INODE:
2738                return xlog_recover_inode_pass2(log, buffer_list, item);
2739        case XFS_LI_EFI:
2740                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2741        case XFS_LI_EFD:
2742                return xlog_recover_efd_pass2(log, item);
2743        case XFS_LI_DQUOT:
2744                return xlog_recover_dquot_pass2(log, buffer_list, item);
2745        case XFS_LI_QUOTAOFF:
2746                /* nothing to do in pass2 */
2747                return 0;
2748        default:
2749                xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2750                        __func__, ITEM_TYPE(item));
2751                ASSERT(0);
2752                return XFS_ERROR(EIO);
2753        }
2754}
2755
2756/*
2757 * Perform the transaction.
2758 *
2759 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
2760 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2761 */
2762STATIC int
2763xlog_recover_commit_trans(
2764        struct xlog             *log,
2765        struct xlog_recover     *trans,
2766        int                     pass)
2767{
2768        int                     error = 0, error2;
2769        xlog_recover_item_t     *item;
2770        LIST_HEAD               (buffer_list);
2771
2772        hlist_del(&trans->r_list);
2773
2774        error = xlog_recover_reorder_trans(log, trans, pass);
2775        if (error)
2776                return error;
2777
2778        list_for_each_entry(item, &trans->r_itemq, ri_list) {
2779                switch (pass) {
2780                case XLOG_RECOVER_PASS1:
2781                        error = xlog_recover_commit_pass1(log, trans, item);
2782                        break;
2783                case XLOG_RECOVER_PASS2:
2784                        error = xlog_recover_commit_pass2(log, trans,
2785                                                          &buffer_list, item);
2786                        break;
2787                default:
2788                        ASSERT(0);
2789                }
2790
2791                if (error)
2792                        goto out;
2793        }
2794
2795        xlog_recover_free_trans(trans);
2796
2797out:
2798        error2 = xfs_buf_delwri_submit(&buffer_list);
2799        return error ? error : error2;
2800}
2801
2802STATIC int
2803xlog_recover_unmount_trans(
2804        struct xlog             *log,
2805        struct xlog_recover     *trans)
2806{
2807        /* Do nothing now */
2808        xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2809        return 0;
2810}
2811
2812/*
2813 * There are two valid states of the r_state field.  0 indicates that the
2814 * transaction structure is in a normal state.  We have either seen the
2815 * start of the transaction or the last operation we added was not a partial
2816 * operation.  If the last operation we added to the transaction was a
2817 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2818 *
2819 * NOTE: skip LRs with 0 data length.
2820 */
2821STATIC int
2822xlog_recover_process_data(
2823        struct xlog             *log,
2824        struct hlist_head       rhash[],
2825        struct xlog_rec_header  *rhead,
2826        xfs_caddr_t             dp,
2827        int                     pass)
2828{
2829        xfs_caddr_t             lp;
2830        int                     num_logops;
2831        xlog_op_header_t        *ohead;
2832        xlog_recover_t          *trans;
2833        xlog_tid_t              tid;
2834        int                     error;
2835        unsigned long           hash;
2836        uint                    flags;
2837
2838        lp = dp + be32_to_cpu(rhead->h_len);
2839        num_logops = be32_to_cpu(rhead->h_num_logops);
2840
2841        /* check the log format matches our own - else we can't recover */
2842        if (xlog_header_check_recover(log->l_mp, rhead))
2843                return (XFS_ERROR(EIO));
2844
2845        while ((dp < lp) && num_logops) {
2846                ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2847                ohead = (xlog_op_header_t *)dp;
2848                dp += sizeof(xlog_op_header_t);
2849                if (ohead->oh_clientid != XFS_TRANSACTION &&
2850                    ohead->oh_clientid != XFS_LOG) {
2851                        xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2852                                        __func__, ohead->oh_clientid);
2853                        ASSERT(0);
2854                        return (XFS_ERROR(EIO));
2855                }
2856                tid = be32_to_cpu(ohead->oh_tid);
2857                hash = XLOG_RHASH(tid);
2858                trans = xlog_recover_find_tid(&rhash[hash], tid);
2859                if (trans == NULL) {               /* not found; add new tid */
2860                        if (ohead->oh_flags & XLOG_START_TRANS)
2861                                xlog_recover_new_tid(&rhash[hash], tid,
2862                                        be64_to_cpu(rhead->h_lsn));
2863                } else {
2864                        if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2865                                xfs_warn(log->l_mp, "%s: bad length 0x%x",
2866                                        __func__, be32_to_cpu(ohead->oh_len));
2867                                WARN_ON(1);
2868                                return (XFS_ERROR(EIO));
2869                        }
2870                        flags = ohead->oh_flags & ~XLOG_END_TRANS;
2871                        if (flags & XLOG_WAS_CONT_TRANS)
2872                                flags &= ~XLOG_CONTINUE_TRANS;
2873                        switch (flags) {
2874                        case XLOG_COMMIT_TRANS:
2875                                error = xlog_recover_commit_trans(log,
2876                                                                trans, pass);
2877                                break;
2878                        case XLOG_UNMOUNT_TRANS:
2879                                error = xlog_recover_unmount_trans(log, trans);
2880                                break;
2881                        case XLOG_WAS_CONT_TRANS:
2882                                error = xlog_recover_add_to_cont_trans(log,
2883                                                trans, dp,
2884                                                be32_to_cpu(ohead->oh_len));
2885                                break;
2886                        case XLOG_START_TRANS:
2887                                xfs_warn(log->l_mp, "%s: bad transaction",
2888                                        __func__);
2889                                ASSERT(0);
2890                                error = XFS_ERROR(EIO);
2891                                break;
2892                        case 0:
2893                        case XLOG_CONTINUE_TRANS:
2894                                error = xlog_recover_add_to_trans(log, trans,
2895                                                dp, be32_to_cpu(ohead->oh_len));
2896                                break;
2897                        default:
2898                                xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2899                                        __func__, flags);
2900                                ASSERT(0);
2901                                error = XFS_ERROR(EIO);
2902                                break;
2903                        }
2904                        if (error)
2905                                return error;
2906                }
2907                dp += be32_to_cpu(ohead->oh_len);
2908                num_logops--;
2909        }
2910        return 0;
2911}
2912
2913/*
2914 * Process an extent free intent item that was recovered from
2915 * the log.  We need to free the extents that it describes.
2916 */
2917STATIC int
2918xlog_recover_process_efi(
2919        xfs_mount_t             *mp,
2920        xfs_efi_log_item_t      *efip)
2921{
2922        xfs_efd_log_item_t      *efdp;
2923        xfs_trans_t             *tp;
2924        int                     i;
2925        int                     error = 0;
2926        xfs_extent_t            *extp;
2927        xfs_fsblock_t           startblock_fsb;
2928
2929        ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
2930
2931        /*
2932         * First check the validity of the extents described by the
2933         * EFI.  If any are bad, then assume that all are bad and
2934         * just toss the EFI.
2935         */
2936        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2937                extp = &(efip->efi_format.efi_extents[i]);
2938                startblock_fsb = XFS_BB_TO_FSB(mp,
2939                                   XFS_FSB_TO_DADDR(mp, extp->ext_start));
2940                if ((startblock_fsb == 0) ||
2941                    (extp->ext_len == 0) ||
2942                    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2943                    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2944                        /*
2945                         * This will pull the EFI from the AIL and
2946                         * free the memory associated with it.
2947                         */
2948                        xfs_efi_release(efip, efip->efi_format.efi_nextents);
2949                        return XFS_ERROR(EIO);
2950                }
2951        }
2952
2953        tp = xfs_trans_alloc(mp, 0);
2954        error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
2955        if (error)
2956                goto abort_error;
2957        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
2958
2959        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2960                extp = &(efip->efi_format.efi_extents[i]);
2961                error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
2962                if (error)
2963                        goto abort_error;
2964                xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
2965                                         extp->ext_len);
2966        }
2967
2968        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
2969        error = xfs_trans_commit(tp, 0);
2970        return error;
2971
2972abort_error:
2973        xfs_trans_cancel(tp, XFS_TRANS_ABORT);
2974        return error;
2975}
2976
2977/*
2978 * When this is called, all of the EFIs which did not have
2979 * corresponding EFDs should be in the AIL.  What we do now
2980 * is free the extents associated with each one.
2981 *
2982 * Since we process the EFIs in normal transactions, they
2983 * will be removed at some point after the commit.  This prevents
2984 * us from just walking down the list processing each one.
2985 * We'll use a flag in the EFI to skip those that we've already
2986 * processed and use the AIL iteration mechanism's generation
2987 * count to try to speed this up at least a bit.
2988 *
2989 * When we start, we know that the EFIs are the only things in
2990 * the AIL.  As we process them, however, other items are added
2991 * to the AIL.  Since everything added to the AIL must come after
2992 * everything already in the AIL, we stop processing as soon as
2993 * we see something other than an EFI in the AIL.
2994 */
2995STATIC int
2996xlog_recover_process_efis(
2997        struct xlog     *log)
2998{
2999        xfs_log_item_t          *lip;
3000        xfs_efi_log_item_t      *efip;
3001        int                     error = 0;
3002        struct xfs_ail_cursor   cur;
3003        struct xfs_ail          *ailp;
3004
3005        ailp = log->l_ailp;
3006        spin_lock(&ailp->xa_lock);
3007        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3008        while (lip != NULL) {
3009                /*
3010                 * We're done when we see something other than an EFI.
3011                 * There should be no EFIs left in the AIL now.
3012                 */
3013                if (lip->li_type != XFS_LI_EFI) {
3014#ifdef DEBUG
3015                        for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3016                                ASSERT(lip->li_type != XFS_LI_EFI);
3017#endif
3018                        break;
3019                }
3020
3021                /*
3022                 * Skip EFIs that we've already processed.
3023                 */
3024                efip = (xfs_efi_log_item_t *)lip;
3025                if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3026                        lip = xfs_trans_ail_cursor_next(ailp, &cur);
3027                        continue;
3028                }
3029
3030                spin_unlock(&ailp->xa_lock);
3031                error = xlog_recover_process_efi(log->l_mp, efip);
3032                spin_lock(&ailp->xa_lock);
3033                if (error)
3034                        goto out;
3035                lip = xfs_trans_ail_cursor_next(ailp, &cur);
3036        }
3037out:
3038        xfs_trans_ail_cursor_done(ailp, &cur);
3039        spin_unlock(&ailp->xa_lock);
3040        return error;
3041}
3042
3043/*
3044 * This routine performs a transaction to null out a bad inode pointer
3045 * in an agi unlinked inode hash bucket.
3046 */
3047STATIC void
3048xlog_recover_clear_agi_bucket(
3049        xfs_mount_t     *mp,
3050        xfs_agnumber_t  agno,
3051        int             bucket)
3052{
3053        xfs_trans_t     *tp;
3054        xfs_agi_t       *agi;
3055        xfs_buf_t       *agibp;
3056        int             offset;
3057        int             error;
3058
3059        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3060        error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3061                                  0, 0, 0);
3062        if (error)
3063                goto out_abort;
3064
3065        error = xfs_read_agi(mp, tp, agno, &agibp);
3066        if (error)
3067                goto out_abort;
3068
3069        agi = XFS_BUF_TO_AGI(agibp);
3070        agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3071        offset = offsetof(xfs_agi_t, agi_unlinked) +
3072                 (sizeof(xfs_agino_t) * bucket);
3073        xfs_trans_log_buf(tp, agibp, offset,
3074                          (offset + sizeof(xfs_agino_t) - 1));
3075
3076        error = xfs_trans_commit(tp, 0);
3077        if (error)
3078                goto out_error;
3079        return;
3080
3081out_abort:
3082        xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3083out_error:
3084        xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3085        return;
3086}
3087
3088STATIC xfs_agino_t
3089xlog_recover_process_one_iunlink(
3090        struct xfs_mount                *mp,
3091        xfs_agnumber_t                  agno,
3092        xfs_agino_t                     agino,
3093        int                             bucket)
3094{
3095        struct xfs_buf                  *ibp;
3096        struct xfs_dinode               *dip;
3097        struct xfs_inode                *ip;
3098        xfs_ino_t                       ino;
3099        int                             error;
3100
3101        ino = XFS_AGINO_TO_INO(mp, agno, agino);
3102        error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3103        if (error)
3104                goto fail;
3105
3106        /*
3107         * Get the on disk inode to find the next inode in the bucket.
3108         */
3109        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
3110        if (error)
3111                goto fail_iput;
3112
3113        ASSERT(ip->i_d.di_nlink == 0);
3114        ASSERT(ip->i_d.di_mode != 0);
3115
3116        /* setup for the next pass */
3117        agino = be32_to_cpu(dip->di_next_unlinked);
3118        xfs_buf_relse(ibp);
3119
3120        /*
3121         * Prevent any DMAPI event from being sent when the reference on
3122         * the inode is dropped.
3123         */
3124        ip->i_d.di_dmevmask = 0;
3125
3126        IRELE(ip);
3127        return agino;
3128
3129 fail_iput:
3130        IRELE(ip);
3131 fail:
3132        /*
3133         * We can't read in the inode this bucket points to, or this inode
3134         * is messed up.  Just ditch this bucket of inodes.  We will lose
3135         * some inodes and space, but at least we won't hang.
3136         *
3137         * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3138         * clear the inode pointer in the bucket.
3139         */
3140        xlog_recover_clear_agi_bucket(mp, agno, bucket);
3141        return NULLAGINO;
3142}
3143
3144/*
3145 * xlog_iunlink_recover
3146 *
3147 * This is called during recovery to process any inodes which
3148 * we unlinked but not freed when the system crashed.  These
3149 * inodes will be on the lists in the AGI blocks.  What we do
3150 * here is scan all the AGIs and fully truncate and free any
3151 * inodes found on the lists.  Each inode is removed from the
3152 * lists when it has been fully truncated and is freed.  The
3153 * freeing of the inode and its removal from the list must be
3154 * atomic.
3155 */
3156STATIC void
3157xlog_recover_process_iunlinks(
3158        struct xlog     *log)
3159{
3160        xfs_mount_t     *mp;
3161        xfs_agnumber_t  agno;
3162        xfs_agi_t       *agi;
3163        xfs_buf_t       *agibp;
3164        xfs_agino_t     agino;
3165        int             bucket;
3166        int             error;
3167        uint            mp_dmevmask;
3168
3169        mp = log->l_mp;
3170
3171        /*
3172         * Prevent any DMAPI event from being sent while in this function.
3173         */
3174        mp_dmevmask = mp->m_dmevmask;
3175        mp->m_dmevmask = 0;
3176
3177        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3178                /*
3179                 * Find the agi for this ag.
3180                 */
3181                error = xfs_read_agi(mp, NULL, agno, &agibp);
3182                if (error) {
3183                        /*
3184                         * AGI is b0rked. Don't process it.
3185                         *
3186                         * We should probably mark the filesystem as corrupt
3187                         * after we've recovered all the ag's we can....
3188                         */
3189                        continue;
3190                }
3191                /*
3192                 * Unlock the buffer so that it can be acquired in the normal
3193                 * course of the transaction to truncate and free each inode.
3194                 * Because we are not racing with anyone else here for the AGI
3195                 * buffer, we don't even need to hold it locked to read the
3196                 * initial unlinked bucket entries out of the buffer. We keep
3197                 * buffer reference though, so that it stays pinned in memory
3198                 * while we need the buffer.
3199                 */
3200                agi = XFS_BUF_TO_AGI(agibp);
3201                xfs_buf_unlock(agibp);
3202
3203                for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3204                        agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3205                        while (agino != NULLAGINO) {
3206                                agino = xlog_recover_process_one_iunlink(mp,
3207                                                        agno, agino, bucket);
3208                        }
3209                }
3210                xfs_buf_rele(agibp);
3211        }
3212
3213        mp->m_dmevmask = mp_dmevmask;
3214}
3215
3216
#ifdef DEBUG
/*
 * DEBUG only: XOR together the first @size bytes of the iclog data area,
 * taken as big-endian 32-bit words, and store the result in the iclog
 * header's h_chksum field.
 */
STATIC void
xlog_pack_data_checksum(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			size)
{
	__be32		*words = (__be32 *)iclog->ic_datap;
	int		nwords = size >> 2;	/* bytes -> 32-bit words */
	uint		chksum = 0;
	int		i;

	for (i = 0; i < nwords; i++)
		chksum ^= be32_to_cpu(words[i]);

	iclog->ic_header.h_chksum = cpu_to_be32(chksum);
}
#else
#define xlog_pack_data_checksum(log, iclog, size)
#endif
3239
3240/*
3241 * Stamp cycle number in every block
3242 */
3243void
3244xlog_pack_data(
3245        struct xlog             *log,
3246        struct xlog_in_core     *iclog,
3247        int                     roundoff)
3248{
3249        int                     i, j, k;
3250        int                     size = iclog->ic_offset + roundoff;
3251        __be32                  cycle_lsn;
3252        xfs_caddr_t             dp;
3253
3254        xlog_pack_data_checksum(log, iclog, size);
3255
3256        cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3257
3258        dp = iclog->ic_datap;
3259        for (i = 0; i < BTOBB(size) &&
3260                i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261                iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262                *(__be32 *)dp = cycle_lsn;
3263                dp += BBSIZE;
3264        }
3265
3266        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267                xlog_in_core_2_t *xhdr = iclog->ic_data;
3268
3269                for ( ; i < BTOBB(size); i++) {
3270                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272                        xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273                        *(__be32 *)dp = cycle_lsn;
3274                        dp += BBSIZE;
3275                }
3276
3277                for (i = 1; i < log->l_iclog_heads; i++) {
3278                        xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3279                }
3280        }
3281}
3282
3283STATIC void
3284xlog_unpack_data(
3285        struct xlog_rec_header  *rhead,
3286        xfs_caddr_t             dp,
3287        struct xlog             *log)
3288{
3289        int                     i, j, k;
3290
3291        for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292                  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3293                *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3294                dp += BBSIZE;
3295        }
3296
3297        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3298                xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3299                for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3300                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3301                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3302                        *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3303                        dp += BBSIZE;
3304                }
3305        }
3306}
3307
3308STATIC int
3309xlog_valid_rec_header(
3310        struct xlog             *log,
3311        struct xlog_rec_header  *rhead,
3312        xfs_daddr_t             blkno)
3313{
3314        int                     hlen;
3315
3316        if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
3317                XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3318                                XFS_ERRLEVEL_LOW, log->l_mp);
3319                return XFS_ERROR(EFSCORRUPTED);
3320        }
3321        if (unlikely(
3322            (!rhead->h_version ||
3323            (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3324                xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3325                        __func__, be32_to_cpu(rhead->h_version));
3326                return XFS_ERROR(EIO);
3327        }
3328
3329        /* LR body must have data or it wouldn't have been written */
3330        hlen = be32_to_cpu(rhead->h_len);
3331        if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3332                XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3333                                XFS_ERRLEVEL_LOW, log->l_mp);
3334                return XFS_ERROR(EFSCORRUPTED);
3335        }
3336        if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3337                XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3338                                XFS_ERRLEVEL_LOW, log->l_mp);
3339                return XFS_ERROR(EFSCORRUPTED);
3340        }
3341        return 0;
3342}
3343
3344/*
3345 * Read the log from tail to head and process the log records found.
3346 * Handle the two cases where the tail and head are in the same cycle
3347 * and where the active portion of the log wraps around the end of
3348 * the physical log separately.  The pass parameter is passed through
3349 * to the routines called to process the data and is not looked at
3350 * here.
3351 */
3352STATIC int
3353xlog_do_recovery_pass(
3354        struct xlog             *log,
3355        xfs_daddr_t             head_blk,
3356        xfs_daddr_t             tail_blk,
3357        int                     pass)
3358{
3359        xlog_rec_header_t       *rhead;
3360        xfs_daddr_t             blk_no;
3361        xfs_caddr_t             offset;
3362        xfs_buf_t               *hbp, *dbp;
3363        int                     error = 0, h_size;
3364        int                     bblks, split_bblks;
3365        int                     hblks, split_hblks, wrapped_hblks;
3366        struct hlist_head       rhash[XLOG_RHASH_SIZE];
3367
3368        ASSERT(head_blk != tail_blk);
3369
3370        /*
3371         * Read the header of the tail block and get the iclog buffer size from
3372         * h_size.  Use this to tell how many sectors make up the log header.
3373         */
3374        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3375                /*
3376                 * When using variable length iclogs, read first sector of
3377                 * iclog header and extract the header size from it.  Get a
3378                 * new hbp that is the correct size.
3379                 */
3380                hbp = xlog_get_bp(log, 1);
3381                if (!hbp)
3382                        return ENOMEM;
3383
3384                error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3385                if (error)
3386                        goto bread_err1;
3387
3388                rhead = (xlog_rec_header_t *)offset;
3389                error = xlog_valid_rec_header(log, rhead, tail_blk);
3390                if (error)
3391                        goto bread_err1;
3392                h_size = be32_to_cpu(rhead->h_size);
3393                if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3394                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3395                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3396                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
3397                                hblks++;
3398                        xlog_put_bp(hbp);
3399                        hbp = xlog_get_bp(log, hblks);
3400                } else {
3401                        hblks = 1;
3402                }
3403        } else {
3404                ASSERT(log->l_sectBBsize == 1);
3405                hblks = 1;
3406                hbp = xlog_get_bp(log, 1);
3407                h_size = XLOG_BIG_RECORD_BSIZE;
3408        }
3409
3410        if (!hbp)
3411                return ENOMEM;
3412        dbp = xlog_get_bp(log, BTOBB(h_size));
3413        if (!dbp) {
3414                xlog_put_bp(hbp);
3415                return ENOMEM;
3416        }
3417
3418        memset(rhash, 0, sizeof(rhash));
3419        if (tail_blk <= head_blk) {
3420                for (blk_no = tail_blk; blk_no < head_blk; ) {
3421                        error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3422                        if (error)
3423                                goto bread_err2;
3424
3425                        rhead = (xlog_rec_header_t *)offset;
3426                        error = xlog_valid_rec_header(log, rhead, blk_no);
3427                        if (error)
3428                                goto bread_err2;
3429
3430                        /* blocks in data section */
3431                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3432                        error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3433                                           &offset);
3434                        if (error)
3435                                goto bread_err2;
3436
3437                        xlog_unpack_data(rhead, offset, log);
3438                        if ((error = xlog_recover_process_data(log,
3439                                                rhash, rhead, offset, pass)))
3440                                goto bread_err2;
3441                        blk_no += bblks + hblks;
3442                }
3443        } else {
3444                /*
3445                 * Perform recovery around the end of the physical log.
3446                 * When the head is not on the same cycle number as the tail,
3447                 * we can't do a sequential recovery as above.
3448                 */
3449                blk_no = tail_blk;
3450                while (blk_no < log->l_logBBsize) {
3451                        /*
3452                         * Check for header wrapping around physical end-of-log
3453                         */
3454                        offset = hbp->b_addr;
3455                        split_hblks = 0;
3456                        wrapped_hblks = 0;
3457                        if (blk_no + hblks <= log->l_logBBsize) {
3458                                /* Read header in one read */
3459                                error = xlog_bread(log, blk_no, hblks, hbp,
3460                                                   &offset);
3461                                if (error)
3462                                        goto bread_err2;
3463                        } else {
3464                                /* This LR is split across physical log end */
3465                                if (blk_no != log->l_logBBsize) {
3466                                        /* some data before physical log end */
3467                                        ASSERT(blk_no <= INT_MAX);
3468                                        split_hblks = log->l_logBBsize - (int)blk_no;
3469                                        ASSERT(split_hblks > 0);
3470                                        error = xlog_bread(log, blk_no,
3471                                                           split_hblks, hbp,
3472                                                           &offset);
3473                                        if (error)
3474                                                goto bread_err2;
3475                                }
3476
3477                                /*
3478                                 * Note: this black magic still works with
3479                                 * large sector sizes (non-512) only because:
3480                                 * - we increased the buffer size originally
3481                                 *   by 1 sector giving us enough extra space
3482                                 *   for the second read;
3483                                 * - the log start is guaranteed to be sector
3484                                 *   aligned;
3485                                 * - we read the log end (LR header start)
3486                                 *   _first_, then the log start (LR header end)
3487                                 *   - order is important.
3488                                 */
3489                                wrapped_hblks = hblks - split_hblks;
3490                                error = xlog_bread_offset(log, 0,
3491                                                wrapped_hblks, hbp,
3492                                                offset + BBTOB(split_hblks));
3493                                if (error)
3494                                        goto bread_err2;
3495                        }
3496                        rhead = (xlog_rec_header_t *)offset;
3497                        error = xlog_valid_rec_header(log, rhead,
3498                                                split_hblks ? blk_no : 0);
3499                        if (error)
3500                                goto bread_err2;
3501
3502                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3503                        blk_no += hblks;
3504
3505                        /* Read in data for log record */
3506                        if (blk_no + bblks <= log->l_logBBsize) {
3507                                error = xlog_bread(log, blk_no, bblks, dbp,
3508                                                   &offset);
3509                                if (error)
3510                                        goto bread_err2;
3511                        } else {
3512                                /* This log record is split across the
3513                                 * physical end of log */
3514                                offset = dbp->b_addr;
3515                                split_bblks = 0;
3516                                if (blk_no != log->l_logBBsize) {
3517                                        /* some data is before the physical
3518                                         * end of log */
3519                                        ASSERT(!wrapped_hblks);
3520                                        ASSERT(blk_no <= INT_MAX);
3521                                        split_bblks =
3522                                                log->l_logBBsize - (int)blk_no;
3523                                        ASSERT(split_bblks > 0);
3524                                        error = xlog_bread(log, blk_no,
3525                                                        split_bblks, dbp,
3526                                                        &offset);
3527                                        if (error)
3528                                                goto bread_err2;
3529                                }
3530
3531                                /*
3532                                 * Note: this black magic still works with
3533                                 * large sector sizes (non-512) only because:
3534                                 * - we increased the buffer size originally
3535                                 *   by 1 sector giving us enough extra space
3536                                 *   for the second read;
3537                                 * - the log start is guaranteed to be sector
3538                                 *   aligned;
3539                                 * - we read the log end (LR header start)
3540                                 *   _first_, then the log start (LR header end)
3541                                 *   - order is important.
3542                                 */
3543                                error = xlog_bread_offset(log, 0,
3544                                                bblks - split_bblks, dbp,
3545                                                offset + BBTOB(split_bblks));
3546                                if (error)
3547                                        goto bread_err2;
3548                        }
3549                        xlog_unpack_data(rhead, offset, log);
3550                        if ((error = xlog_recover_process_data(log, rhash,
3551                                                        rhead, offset, pass)))
3552                                goto bread_err2;
3553                        blk_no += bblks;
3554                }
3555
3556                ASSERT(blk_no >= log->l_logBBsize);
3557                blk_no -= log->l_logBBsize;
3558
3559                /* read first part of physical log */
3560                while (blk_no < head_blk) {
3561                        error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3562                        if (error)
3563                                goto bread_err2;
3564
3565                        rhead = (xlog_rec_header_t *)offset;
3566                        error = xlog_valid_rec_header(log, rhead, blk_no);
3567                        if (error)
3568                                goto bread_err2;
3569
3570                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3571                        error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3572                                           &offset);
3573                        if (error)
3574                                goto bread_err2;
3575
3576                        xlog_unpack_data(rhead, offset, log);
3577                        if ((error = xlog_recover_process_data(log, rhash,
3578                                                        rhead, offset, pass)))
3579                                goto bread_err2;
3580                        blk_no += bblks + hblks;
3581                }
3582        }
3583
3584 bread_err2:
3585        xlog_put_bp(dbp);
3586 bread_err1:
3587        xlog_put_bp(hbp);
3588        return error;
3589}
3590
3591/*
3592 * Do the recovery of the log.  We actually do this in two phases.
3593 * The two passes are necessary in order to implement the function
3594 * of cancelling a record written into the log.  The first pass
3595 * determines those things which have been cancelled, and the
3596 * second pass replays log items normally except for those which
3597 * have been cancelled.  The handling of the replay and cancellations
3598 * takes place in the log item type specific routines.
3599 *
3600 * The table of items which have cancel records in the log is allocated
3601 * and freed at this level, since only here do we know when all of
3602 * the log recovery has been completed.
3603 */
3604STATIC int
3605xlog_do_log_recovery(
3606        struct xlog     *log,
3607        xfs_daddr_t     head_blk,
3608        xfs_daddr_t     tail_blk)
3609{
3610        int             error, i;
3611
3612        ASSERT(head_blk != tail_blk);
3613
3614        /*
3615         * First do a pass to find all of the cancelled buf log items.
3616         * Store them in the buf_cancel_table for use in the second pass.
3617         */
3618        log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3619                                                 sizeof(struct list_head),
3620                                                 KM_SLEEP);
3621        for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3622                INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3623
3624        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3625                                      XLOG_RECOVER_PASS1);
3626        if (error != 0) {
3627                kmem_free(log->l_buf_cancel_table);
3628                log->l_buf_cancel_table = NULL;
3629                return error;
3630        }
3631        /*
3632         * Then do a second pass to actually recover the items in the log.
3633         * When it is complete free the table of buf cancel items.
3634         */
3635        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3636                                      XLOG_RECOVER_PASS2);
3637#ifdef DEBUG
3638        if (!error) {
3639                int     i;
3640
3641                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3642                        ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3643        }
3644#endif  /* DEBUG */
3645
3646        kmem_free(log->l_buf_cancel_table);
3647        log->l_buf_cancel_table = NULL;
3648
3649        return error;
3650}
3651
3652/*
3653 * Do the actual recovery
3654 */
3655STATIC int
3656xlog_do_recover(
3657        struct xlog     *log,
3658        xfs_daddr_t     head_blk,
3659        xfs_daddr_t     tail_blk)
3660{
3661        int             error;
3662        xfs_buf_t       *bp;
3663        xfs_sb_t        *sbp;
3664
3665        /*
3666         * First replay the images in the log.
3667         */
3668        error = xlog_do_log_recovery(log, head_blk, tail_blk);
3669        if (error)
3670                return error;
3671
3672        /*
3673         * If IO errors happened during recovery, bail out.
3674         */
3675        if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3676                return (EIO);
3677        }
3678
3679        /*
3680         * We now update the tail_lsn since much of the recovery has completed
3681         * and there may be space available to use.  If there were no extent
3682         * or iunlinks, we can free up the entire log and set the tail_lsn to
3683         * be the last_sync_lsn.  This was set in xlog_find_tail to be the
3684         * lsn of the last known good LR on disk.  If there are extent frees
3685         * or iunlinks they will have some entries in the AIL; so we look at
3686         * the AIL to determine how to set the tail_lsn.
3687         */
3688        xlog_assign_tail_lsn(log->l_mp);
3689
3690        /*
3691         * Now that we've finished replaying all buffer and inode
3692         * updates, re-read in the superblock.
3693         */
3694        bp = xfs_getsb(log->l_mp, 0);
3695        XFS_BUF_UNDONE(bp);
3696        ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697        XFS_BUF_READ(bp);
3698        XFS_BUF_UNASYNC(bp);
3699        xfsbdstrat(log->l_mp, bp);
3700        error = xfs_buf_iowait(bp);
3701        if (error) {
3702                xfs_buf_ioerror_alert(bp, __func__);
3703                ASSERT(0);
3704                xfs_buf_relse(bp);
3705                return error;
3706        }
3707
3708        /* Convert superblock from on-disk format */
3709        sbp = &log->l_mp->m_sb;
3710        xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
3711        ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712        ASSERT(xfs_sb_good_version(sbp));
3713        xfs_buf_relse(bp);
3714
3715        /* We've re-read the superblock so re-initialize per-cpu counters */
3716        xfs_icsb_reinit_counters(log->l_mp);
3717
3718        xlog_recover_check_summary(log);
3719
3720        /* Normal transactions can now occur */
3721        log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3722        return 0;
3723}
3724
3725/*
3726 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3727 *
3728 * Return error or zero.
3729 */
3730int
3731xlog_recover(
3732        struct xlog     *log)
3733{
3734        xfs_daddr_t     head_blk, tail_blk;
3735        int             error;
3736
3737        /* find the tail of the log */
3738        if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3739                return error;
3740
3741        if (tail_blk != head_blk) {
3742                /* There used to be a comment here:
3743                 *
3744                 * disallow recovery on read-only mounts.  note -- mount
3745                 * checks for ENOSPC and turns it into an intelligent
3746                 * error message.
3747                 * ...but this is no longer true.  Now, unless you specify
3748                 * NORECOVERY (in which case this function would never be
3749                 * called), we just go ahead and recover.  We do this all
3750                 * under the vfs layer, so we can get away with it unless
3751                 * the device itself is read-only, in which case we fail.
3752                 */
3753                if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3754                        return error;
3755                }
3756
3757                xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3758                                log->l_mp->m_logname ? log->l_mp->m_logname
3759                                                     : "internal");
3760
3761                error = xlog_do_recover(log, head_blk, tail_blk);
3762                log->l_flags |= XLOG_RECOVERY_NEEDED;
3763        }
3764        return error;
3765}
3766
3767/*
3768 * In the first part of recovery we replay inodes and buffers and build
3769 * up the list of extent free items which need to be processed.  Here
3770 * we process the extent free items and clean up the on disk unlinked
3771 * inode lists.  This is separated from the first part of recovery so
3772 * that the root and real-time bitmap inodes can be read in from disk in
3773 * between the two stages.  This is necessary so that we can free space
3774 * in the real-time portion of the file system.
3775 */
3776int
3777xlog_recover_finish(
3778        struct xlog     *log)
3779{
3780        /*
3781         * Now we're ready to do the transactions needed for the
3782         * rest of recovery.  Start with completing all the extent
3783         * free intent records and then process the unlinked inode
3784         * lists.  At this point, we essentially run in normal mode
3785         * except that we're still performing recovery actions
3786         * rather than accepting new requests.
3787         */
3788        if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3789                int     error;
3790                error = xlog_recover_process_efis(log);
3791                if (error) {
3792                        xfs_alert(log->l_mp, "Failed to recover EFIs");
3793                        return error;
3794                }
3795                /*
3796                 * Sync the log to get all the EFIs out of the AIL.
3797                 * This isn't absolutely necessary, but it helps in
3798                 * case the unlink transactions would have problems
3799                 * pushing the EFIs out of the way.
3800                 */
3801                xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3802
3803                xlog_recover_process_iunlinks(log);
3804
3805                xlog_recover_check_summary(log);
3806
3807                xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3808                                log->l_mp->m_logname ? log->l_mp->m_logname
3809                                                     : "internal");
3810                log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3811        } else {
3812                xfs_info(log->l_mp, "Ending clean mount");
3813        }
3814        return 0;
3815}
3816
3817
#if defined(DEBUG)
/*
 * Debug-only pass over every allocation group: read the AGF and AGI
 * headers and raise an alert for any header that cannot be read.
 *
 * No comparison against the superblock counters is performed here.
 * The per-AG free block and inode totals were only ever accumulated
 * into locals that nothing read, so that dead computation has been
 * dropped; the function has no return value and no other output.
 *
 * NOTE(review): the header reads themselves are kept in case
 * xfs_read_agf()/xfs_read_agi() have side effects that later code
 * relies on -- confirm before removing them as well.
 */
void
xlog_recover_check_summary(
        struct xlog     *log)
{
        xfs_mount_t     *mp = log->l_mp;
        xfs_buf_t       *agfbp;
        xfs_buf_t       *agibp;
        xfs_agnumber_t  agno;
        int             error;

        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
                /*
                 * A failed AGF read is reported but does not stop the
                 * walk; the AGI of the same group is still attempted.
                 */
                error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
                if (error) {
                        xfs_alert(mp, "%s agf read failed agno %d error %d",
                                                __func__, agno, error);
                } else {
                        xfs_buf_relse(agfbp);
                }

                error = xfs_read_agi(mp, NULL, agno, &agibp);
                if (error) {
                        xfs_alert(mp, "%s agi read failed agno %d error %d",
                                                __func__, agno, error);
                } else {
                        xfs_buf_relse(agibp);
                }
        }
}
#endif /* DEBUG */
3868
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.