linux-old/fs/xfs/linux-2.4/xfs_lrw.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms of version 2 of the GNU General Public License as
   6 * published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it would be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11 *
  12 * Further, this software is distributed without any warranty that it is
  13 * free of the rightful claim of any third person regarding infringement
  14 * or the like.  Any license provided herein, whether implied or
  15 * otherwise, applies only to this software file.  Patent licenses, if
  16 * any, provided herein do not apply to combinations of this program with
  17 * other software, or any other product whatsoever.
  18 *
  19 * You should have received a copy of the GNU General Public License along
  20 * with this program; if not, write the Free Software Foundation, Inc., 59
  21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  22 *
  23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  24 * Mountain View, CA  94043, or:
  25 *
  26 * http://www.sgi.com
  27 *
  28 * For further information regarding this notice, see:
  29 *
  30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  31 */
  32/*
  33 *  fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
  34 *
  35 */
  36
  37#include "xfs.h"
  38
  39#include "xfs_fs.h"
  40#include "xfs_inum.h"
  41#include "xfs_log.h"
  42#include "xfs_trans.h"
  43#include "xfs_sb.h"
  44#include "xfs_ag.h"
  45#include "xfs_dir.h"
  46#include "xfs_dir2.h"
  47#include "xfs_alloc.h"
  48#include "xfs_dmapi.h"
  49#include "xfs_quota.h"
  50#include "xfs_mount.h"
  51#include "xfs_alloc_btree.h"
  52#include "xfs_bmap_btree.h"
  53#include "xfs_ialloc_btree.h"
  54#include "xfs_btree.h"
  55#include "xfs_ialloc.h"
  56#include "xfs_attr_sf.h"
  57#include "xfs_dir_sf.h"
  58#include "xfs_dir2_sf.h"
  59#include "xfs_dinode.h"
  60#include "xfs_inode.h"
  61#include "xfs_bmap.h"
  62#include "xfs_bit.h"
  63#include "xfs_rtalloc.h"
  64#include "xfs_error.h"
  65#include "xfs_itable.h"
  66#include "xfs_rw.h"
  67#include "xfs_refcache.h"
  68#include "xfs_acl.h"
  69#include "xfs_cap.h"
  70#include "xfs_mac.h"
  71#include "xfs_attr.h"
  72#include "xfs_inode_item.h"
  73#include "xfs_buf_item.h"
  74#include "xfs_utils.h"
  75#include "xfs_iomap.h"
  76
  77#include <linux/capability.h>
  78
  79
  80#if defined(XFS_RW_TRACE)
  81void
  82xfs_rw_enter_trace(
  83        int             tag,
  84        xfs_iocore_t    *io,
  85        const char      *buf,
  86        size_t          size,
  87        loff_t          offset,
  88        int             ioflags)
  89{
  90        xfs_inode_t     *ip = XFS_IO_INODE(io);
  91
  92        if (ip->i_rwtrace == NULL)
  93                return;
  94        ktrace_enter(ip->i_rwtrace,
  95                (void *)(unsigned long)tag,
  96                (void *)ip,
  97                (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
  98                (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
  99                (void *)(__psint_t)buf,
 100                (void *)((unsigned long)size),
 101                (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 102                (void *)((unsigned long)(offset & 0xffffffff)),
 103                (void *)((unsigned long)ioflags),
 104                (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
 105                (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
 106                (void *)NULL,
 107                (void *)NULL,
 108                (void *)NULL,
 109                (void *)NULL,
 110                (void *)NULL);
 111}
 112
 113void
 114xfs_inval_cached_trace(
 115        xfs_iocore_t    *io,
 116        xfs_off_t       offset,
 117        xfs_off_t       len,
 118        xfs_off_t       first,
 119        xfs_off_t       last)
 120{
 121        xfs_inode_t     *ip = XFS_IO_INODE(io);
 122
 123        if (ip->i_rwtrace == NULL)
 124                return;
 125        ktrace_enter(ip->i_rwtrace,
 126                (void *)(__psint_t)XFS_INVAL_CACHED,
 127                (void *)ip,
 128                (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 129                (void *)((unsigned long)(offset & 0xffffffff)),
 130                (void *)((unsigned long)((len >> 32) & 0xffffffff)),
 131                (void *)((unsigned long)(len & 0xffffffff)),
 132                (void *)((unsigned long)((first >> 32) & 0xffffffff)),
 133                (void *)((unsigned long)(first & 0xffffffff)),
 134                (void *)((unsigned long)((last >> 32) & 0xffffffff)),
 135                (void *)((unsigned long)(last & 0xffffffff)),
 136                (void *)NULL,
 137                (void *)NULL,
 138                (void *)NULL,
 139                (void *)NULL,
 140                (void *)NULL,
 141                (void *)NULL);
 142}
 143#endif
 144
 145/*
 146 *      xfs_iozero
 147 *
 148 *      xfs_iozero clears the specified range of buffer supplied,
 149 *      and marks all the affected blocks as valid and modified.  If
 150 *      an affected block is not allocated, it will be allocated.  If
 151 *      an affected block is not completely overwritten, and is not
 152 *      valid before the operation, it will be read from disk before
 153 *      being partially zeroed.
 154 */
 155STATIC int
 156xfs_iozero(
 157        struct inode            *ip,    /* inode                        */
 158        loff_t                  pos,    /* offset in file               */
 159        size_t                  count,  /* size of data to zero         */
 160        loff_t                  end_size)       /* max file size to set */
 161{
 162        unsigned                bytes;
 163        struct page             *page;
 164        struct address_space    *mapping;
 165        char                    *kaddr;
 166        int                     status;
 167
 168        mapping = ip->i_mapping;
 169        do {
 170                unsigned long index, offset;
 171
 172                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 173                index = pos >> PAGE_CACHE_SHIFT;
 174                bytes = PAGE_CACHE_SIZE - offset;
 175                if (bytes > count)
 176                        bytes = count;
 177
 178                status = -ENOMEM;
 179                page = grab_cache_page(mapping, index);
 180                if (!page)
 181                        break;
 182
 183                kaddr = kmap(page);
 184                status = mapping->a_ops->prepare_write(NULL, page, offset,
 185                                                        offset + bytes);
 186                if (status) {
 187                        goto unlock;
 188                }
 189
 190                memset((void *) (kaddr + offset), 0, bytes);
 191                flush_dcache_page(page);
 192                status = mapping->a_ops->commit_write(NULL, page, offset,
 193                                                        offset + bytes);
 194                if (!status) {
 195                        pos += bytes;
 196                        count -= bytes;
 197                        if (pos > i_size_read(ip))
 198                                i_size_write(ip, pos < end_size ? pos : end_size);
 199                }
 200
 201unlock:
 202                kunmap(page);
 203                unlock_page(page);
 204                page_cache_release(page);
 205                if (status)
 206                        break;
 207        } while (count);
 208
 209        return (-status);
 210}
 211
 212/*
 213 * xfs_inval_cached_pages
 214 * 
 215 * This routine is responsible for keeping direct I/O and buffered I/O
 216 * somewhat coherent.  From here we make sure that we're at least
 217 * temporarily holding the inode I/O lock exclusively and then call
 218 * the page cache to flush and invalidate any cached pages.  If there
 219 * are no cached pages this routine will be very quick.
 220 */
 221void
 222xfs_inval_cached_pages(
 223        vnode_t         *vp,
 224        xfs_iocore_t    *io,
 225        xfs_off_t       offset,
 226        int             write,
 227        int             relock)
 228{
 229        xfs_mount_t     *mp;
 230
 231        if (!VN_CACHED(vp)) {
 232                return;
 233        }
 234
 235        mp = io->io_mount;
 236
 237        /*
 238         * We need to get the I/O lock exclusively in order
 239         * to safely invalidate pages and mappings.
 240         */
 241        if (relock) {
 242                XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
 243                XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
 244        }
 245
 246        /* Writing beyond EOF creates a hole that must be zeroed */
 247        if (write && (offset > XFS_SIZE(mp, io))) {
 248                xfs_fsize_t     isize;
 249
 250                XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 251                isize = XFS_SIZE(mp, io);
 252                if (offset > isize) {
 253                        xfs_zero_eof(vp, io, offset, isize, offset);
 254                }
 255                XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 256        }
 257
 258        xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
 259        VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
 260        if (relock) {
 261                XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
 262        }
 263}
 264
 265ssize_t                 /* bytes read, or (-)  error */
 266xfs_read(
 267        bhv_desc_t      *bdp,
 268        struct file     *file,
 269        char            *buf,
 270        size_t          size,
 271        loff_t          *offset,
 272        int             ioflags,
 273        cred_t          *credp)
 274{
 275        ssize_t         ret;
 276        xfs_fsize_t     n;
 277        xfs_inode_t     *ip;
 278        xfs_mount_t     *mp;
 279
 280        ip = XFS_BHVTOI(bdp);
 281        mp = ip->i_mount;
 282
 283        XFS_STATS_INC(xs_read_calls);
 284
 285        if (unlikely(ioflags & IO_ISDIRECT)) {
 286                if ((ssize_t)size < 0)
 287                        return -XFS_ERROR(EINVAL);
 288                if (((__psint_t)buf & BBMASK) ||
 289                    (*offset & mp->m_blockmask) ||
 290                    (size & mp->m_blockmask)) {
 291                        if (*offset >= ip->i_d.di_size) {
 292                                return (0);
 293                        }
 294                        return -XFS_ERROR(EINVAL);
 295                }
 296        }
 297
 298        n = XFS_MAXIOFFSET(mp) - *offset;
 299        if ((n <= 0) || (size == 0))
 300                return 0;
 301
 302        if (n < size)
 303                size = n;
 304
 305        if (XFS_FORCED_SHUTDOWN(mp)) {
 306                return -EIO;
 307        }
 308
 309        if (!(ioflags & IO_ISLOCKED))
 310                xfs_ilock(ip, XFS_IOLOCK_SHARED);
 311
 312        if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
 313            !(ioflags & IO_INVIS)) {
 314                int error;
 315                vrwlock_t locktype = VRWLOCK_READ;
 316                int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 317
 318                error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, size,
 319                                      dmflags, &locktype);
 320                if (error) {
 321                        if (!(ioflags & IO_ISLOCKED))
 322                                xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 323                        return -error;
 324                }
 325        }
 326
 327        if (unlikely(ioflags & IO_ISDIRECT)) {
 328                xfs_rw_enter_trace(XFS_DIORD_ENTER, &ip->i_iocore,
 329                                        buf, size, *offset, ioflags);
 330                ret = (*offset < ip->i_d.di_size) ?
 331                        do_generic_direct_read(file, buf, size, offset) : 0;
 332                UPDATE_ATIME(file->f_dentry->d_inode);
 333        } else {
 334                xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 335                                        buf, size, *offset, ioflags);
 336                ret = generic_file_read(file, buf, size, offset);
 337        }
 338
 339        if (!(ioflags & IO_ISLOCKED))
 340                xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 341
 342        XFS_STATS_ADD(xs_read_bytes, ret);
 343
 344        if (unlikely(ioflags & IO_INVIS)) {
 345                /* generic_file_read updates the atime but we need to
 346                 * undo that because this I/O was supposed to be invisible.
 347                 */
 348                struct inode *inode = LINVFS_GET_IP(BHV_TO_VNODE(bdp));
 349                inode->i_atime = ip->i_d.di_atime.t_sec;
 350        } else {
 351                xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 352        }
 353
 354        return ret;
 355}
 356
 357/*
 358 * This routine is called to handle zeroing any space in the last
 359 * block of the file that is beyond the EOF.  We do this since the
 360 * size is being increased without writing anything to that block
 361 * and we don't want anyone to read the garbage on the disk.
 362 */
 363STATIC int                              /* error (positive) */
 364xfs_zero_last_block(
 365        struct inode    *ip,
 366        xfs_iocore_t    *io,
 367        xfs_off_t       offset,
 368        xfs_fsize_t     isize,
 369        xfs_fsize_t     end_size)
 370{
 371        xfs_fileoff_t   last_fsb;
 372        xfs_mount_t     *mp;
 373        int             nimaps;
 374        int             zero_offset;
 375        int             zero_len;
 376        int             isize_fsb_offset;
 377        int             error = 0;
 378        xfs_bmbt_irec_t imap;
 379        loff_t          loff;
 380        size_t          lsize;
 381
 382        ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
 383        ASSERT(offset > isize);
 384
 385        mp = io->io_mount;
 386
 387        isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
 388        if (isize_fsb_offset == 0) {
 389                /*
 390                 * There are no extra bytes in the last block on disk to
 391                 * zero, so return.
 392                 */
 393                return 0;
 394        }
 395
 396        last_fsb = XFS_B_TO_FSBT(mp, isize);
 397        nimaps = 1;
 398        error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
 399                          &nimaps, NULL);
 400        if (error) {
 401                return error;
 402        }
 403        ASSERT(nimaps > 0);
 404        /*
 405         * If the block underlying isize is just a hole, then there
 406         * is nothing to zero.
 407         */
 408        if (imap.br_startblock == HOLESTARTBLOCK) {
 409                return 0;
 410        }
 411        /*
 412         * Zero the part of the last block beyond the EOF, and write it
 413         * out sync.  We need to drop the ilock while we do this so we
 414         * don't deadlock when the buffer cache calls back to us.
 415         */
 416        XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
 417        loff = XFS_FSB_TO_B(mp, last_fsb);
 418        lsize = XFS_FSB_TO_B(mp, 1);
 419
 420        zero_offset = isize_fsb_offset;
 421        zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
 422
 423        error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
 424
 425        XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 426        ASSERT(error >= 0);
 427        return error;
 428}
 429
 430/*
 431 * Zero any on disk space between the current EOF and the new,
 432 * larger EOF.  This handles the normal case of zeroing the remainder
 433 * of the last block in the file and the unusual case of zeroing blocks
 434 * out beyond the size of the file.  This second case only happens
 435 * with fixed size extents and when the system crashes before the inode
 436 * size was updated but after blocks were allocated.  If fill is set,
 437 * then any holes in the range are filled and zeroed.  If not, the holes
 438 * are left alone as holes.
 439 */
 440
 441int                                     /* error (positive) */
 442xfs_zero_eof(
 443        vnode_t         *vp,
 444        xfs_iocore_t    *io,
 445        xfs_off_t       offset,         /* starting I/O offset */
 446        xfs_fsize_t     isize,          /* current inode size */
 447        xfs_fsize_t     end_size)       /* terminal inode size */
 448{
 449        struct inode    *ip = LINVFS_GET_IP(vp);
 450        xfs_fileoff_t   start_zero_fsb;
 451        xfs_fileoff_t   end_zero_fsb;
 452        xfs_fileoff_t   prev_zero_fsb;
 453        xfs_fileoff_t   zero_count_fsb;
 454        xfs_fileoff_t   last_fsb;
 455        xfs_extlen_t    buf_len_fsb;
 456        xfs_extlen_t    prev_zero_count;
 457        xfs_mount_t     *mp;
 458        int             nimaps;
 459        int             error = 0;
 460        xfs_bmbt_irec_t imap;
 461        loff_t          loff;
 462        size_t          lsize;
 463
 464        ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 465        ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 466
 467        mp = io->io_mount;
 468
 469        /*
 470         * First handle zeroing the block on which isize resides.
 471         * We only zero a part of that block so it is handled specially.
 472         */
 473        error = xfs_zero_last_block(ip, io, offset, isize, end_size);
 474        if (error) {
 475                ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 476                ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 477                return error;
 478        }
 479
 480        /*
 481         * Calculate the range between the new size and the old
 482         * where blocks needing to be zeroed may exist.  To get the
 483         * block where the last byte in the file currently resides,
 484         * we need to subtract one from the size and truncate back
 485         * to a block boundary.  We subtract 1 in case the size is
 486         * exactly on a block boundary.
 487         */
 488        last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
 489        start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
 490        end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
 491        ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
 492        if (last_fsb == end_zero_fsb) {
 493                /*
 494                 * The size was only incremented on its last block.
 495                 * We took care of that above, so just return.
 496                 */
 497                return 0;
 498        }
 499
 500        ASSERT(start_zero_fsb <= end_zero_fsb);
 501        prev_zero_fsb = NULLFILEOFF;
 502        prev_zero_count = 0;
 503        while (start_zero_fsb <= end_zero_fsb) {
 504                nimaps = 1;
 505                zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 506                error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
 507                                  0, NULL, 0, &imap, &nimaps, NULL);
 508                if (error) {
 509                        ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 510                        ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 511                        return error;
 512                }
 513                ASSERT(nimaps > 0);
 514
 515                if (imap.br_state == XFS_EXT_UNWRITTEN ||
 516                    imap.br_startblock == HOLESTARTBLOCK) {
 517                        /*
 518                         * This loop handles initializing pages that were
 519                         * partially initialized by the code below this
 520                         * loop. It basically zeroes the part of the page
 521                         * that sits on a hole and sets the page as P_HOLE
 522                         * and calls remapf if it is a mapped file.
 523                         */
 524                        prev_zero_fsb = NULLFILEOFF;
 525                        prev_zero_count = 0;
 526                        start_zero_fsb = imap.br_startoff +
 527                                         imap.br_blockcount;
 528                        ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 529                        continue;
 530                }
 531
 532                /*
 533                 * There are blocks in the range requested.
 534                 * Zero them a single write at a time.  We actually
 535                 * don't zero the entire range returned if it is
 536                 * too big and simply loop around to get the rest.
 537                 * That is not the most efficient thing to do, but it
 538                 * is simple and this path should not be exercised often.
 539                 */
 540                buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
 541                                              mp->m_writeio_blocks << 8);
 542                /*
 543                 * Drop the inode lock while we're doing the I/O.
 544                 * We'll still have the iolock to protect us.
 545                 */
 546                XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 547
 548                loff = XFS_FSB_TO_B(mp, start_zero_fsb);
 549                lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
 550
 551                error = xfs_iozero(ip, loff, lsize, end_size);
 552
 553                if (error) {
 554                        goto out_lock;
 555                }
 556
 557                prev_zero_fsb = start_zero_fsb;
 558                prev_zero_count = buf_len_fsb;
 559                start_zero_fsb = imap.br_startoff + buf_len_fsb;
 560                ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 561
 562                XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 563        }
 564
 565        return 0;
 566
 567out_lock:
 568
 569        XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 570        ASSERT(error >= 0);
 571        return error;
 572}
 573
 574ssize_t                         /* bytes written, or (-) error */
 575xfs_write(
 576        bhv_desc_t      *bdp,
 577        struct file     *file,
 578        const char      *buf,
 579        size_t          size,
 580        loff_t          *offset,
 581        int             ioflags,
 582        cred_t          *credp)
 583{
 584        xfs_inode_t     *xip;
 585        xfs_mount_t     *mp;
 586        ssize_t         ret;
 587        int             error = 0;
 588        xfs_fsize_t     isize, new_size;
 589        xfs_fsize_t     n, limit;
 590        xfs_iocore_t    *io;
 591        vnode_t         *vp;
 592        int             iolock;
 593        int             eventsent = 0;
 594        vrwlock_t       locktype;
 595
 596        XFS_STATS_INC(xs_write_calls);
 597
 598        vp = BHV_TO_VNODE(bdp);
 599        xip = XFS_BHVTOI(bdp);
 600
 601        if (size == 0)
 602                return 0;
 603
 604        io = &xip->i_iocore;
 605        mp = io->io_mount;
 606
 607        fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
 608
 609        if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
 610                return -EIO;
 611        }
 612
 613        if (unlikely(ioflags & IO_ISDIRECT)) {
 614                if (((__psint_t)buf & BBMASK) ||
 615                    (*offset & mp->m_blockmask) ||
 616                    (size  & mp->m_blockmask)) {
 617                        return XFS_ERROR(-EINVAL);
 618                }
 619                iolock = XFS_IOLOCK_SHARED;
 620                locktype = VRWLOCK_WRITE_DIRECT;
 621        } else {
 622                iolock = XFS_IOLOCK_EXCL;
 623                locktype = VRWLOCK_WRITE;
 624        }
 625
 626        if (ioflags & IO_ISLOCKED)
 627                iolock = 0;
 628
 629        xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 630
 631        isize = xip->i_d.di_size;
 632        limit = XFS_MAXIOFFSET(mp);
 633
 634        if (file->f_flags & O_APPEND)
 635                *offset = isize;
 636
 637start:
 638        n = limit - *offset;
 639        if (n <= 0) {
 640                xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 641                return -EFBIG;
 642        }
 643        if (n < size)
 644                size = n;
 645
 646        new_size = *offset + size;
 647        if (new_size > isize) {
 648                io->io_new_size = new_size;
 649        }
 650
 651        if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 652            !(ioflags & IO_INVIS) && !eventsent)) {
 653                loff_t          savedsize = *offset;
 654                int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 655
 656                xfs_iunlock(xip, XFS_ILOCK_EXCL);
 657                error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
 658                                      *offset, size,
 659                                      dmflags, &locktype);
 660                if (error) {
 661                        if (iolock) xfs_iunlock(xip, iolock);
 662                        return -error;
 663                }
 664                xfs_ilock(xip, XFS_ILOCK_EXCL);
 665                eventsent = 1;
 666
 667                /*
 668                 * The iolock was dropped and reaquired in XFS_SEND_DATA
 669                 * so we have to recheck the size when appending.
 670                 * We will only "goto start;" once, since having sent the
 671                 * event prevents another call to XFS_SEND_DATA, which is
 672                 * what allows the size to change in the first place.
 673                 */
 674                if ((file->f_flags & O_APPEND) &&
 675                    savedsize != xip->i_d.di_size) {
 676                        *offset = isize = xip->i_d.di_size;
 677                        goto start;
 678                }
 679        }
 680
 681        /*
 682         * If the offset is beyond the size of the file, we have a couple
 683         * of things to do. First, if there is already space allocated
 684         * we need to either create holes or zero the disk or ...
 685         *
 686         * If there is a page where the previous size lands, we need
 687         * to zero it out up to the new size.
 688         */
 689
 690        if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
 691                error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
 692                        isize, *offset + size);
 693                if (error) {
 694                        xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 695                        return(-error);
 696                }
 697        }
 698        xfs_iunlock(xip, XFS_ILOCK_EXCL);
 699
 700        /*
 701         * If we're writing the file then make sure to clear the
 702         * setuid and setgid bits if the process is not being run
 703         * by root.  This keeps people from modifying setuid and
 704         * setgid binaries.
 705         */
 706
 707        if (((xip->i_d.di_mode & S_ISUID) ||
 708            ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
 709                (S_ISGID | S_IXGRP))) &&
 710             !capable(CAP_FSETID)) {
 711                error = xfs_write_clear_setuid(xip);
 712                if (error) {
 713                        xfs_iunlock(xip, iolock);
 714                        return -error;
 715                }
 716        }
 717
 718
 719        if ((ssize_t) size < 0) {
 720                ret = -EINVAL;
 721                goto error;
 722        }
 723
 724        if (!access_ok(VERIFY_READ, buf, size)) {
 725                ret = -EINVAL;
 726                goto error;
 727        }
 728
 729retry:
 730        if (unlikely(ioflags & IO_ISDIRECT)) {
 731                xfs_inval_cached_pages(vp, io, *offset, 1, 1);
 732                xfs_rw_enter_trace(XFS_DIOWR_ENTER,
 733                                        io, buf, size, *offset, ioflags);
 734                ret = do_generic_direct_write(file, buf, size, offset);
 735        } else {
 736                xfs_rw_enter_trace(XFS_WRITE_ENTER,
 737                                        io, buf, size, *offset, ioflags);
 738                ret = do_generic_file_write(file, buf, size, offset);
 739        }
 740
 741        if (unlikely(ioflags & IO_INVIS)) {
 742                /* generic_file_write updates the mtime/ctime but we need
 743                 * to undo that because this I/O was supposed to be
 744                 * invisible.
 745                 */
 746                struct inode    *inode = LINVFS_GET_IP(vp);
 747                inode->i_mtime = xip->i_d.di_mtime.t_sec;
 748                inode->i_ctime = xip->i_d.di_ctime.t_sec;
 749        } else {
 750                xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 751        }
 752
 753        if ((ret == -ENOSPC) &&
 754            DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
 755            !(ioflags & IO_INVIS)) {
 756
 757                xfs_rwunlock(bdp, locktype);
 758                error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 759                                DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 760                                0, 0, 0); /* Delay flag intentionally  unused */
 761                if (error)
 762                        return -error;
 763                xfs_rwlock(bdp, locktype);
 764                *offset = xip->i_d.di_size;
 765                goto retry;
 766        }
 767
 768error:
 769        if (ret <= 0) {
 770                if (iolock)
 771                        xfs_rwunlock(bdp, locktype);
 772                return ret;
 773        }
 774
 775        XFS_STATS_ADD(xs_write_bytes, ret);
 776
 777        if (*offset > xip->i_d.di_size) {
 778                xfs_ilock(xip, XFS_ILOCK_EXCL);
 779                if (*offset > xip->i_d.di_size) {
 780                        struct inode    *inode = LINVFS_GET_IP(vp);
 781
 782                        xip->i_d.di_size = *offset;
 783                        i_size_write(inode, *offset);
 784                        xip->i_update_core = 1;
 785                        xip->i_update_size = 1;
 786                        mark_inode_dirty_sync(inode);
 787                }
 788                xfs_iunlock(xip, XFS_ILOCK_EXCL);
 789        }
 790
 791        /* Handle various SYNC-type writes */
 792        if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
 793
 794                /*
 795                 * If we're treating this as O_DSYNC and we have not updated the
 796                 * size, force the log.
 797                 */
 798
 799                if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
 800                        && !(xip->i_update_size)) {
 801                        /*
 802                         * If an allocation transaction occurred
 803                         * without extending the size, then we have to force
 804                         * the log up the proper point to ensure that the
 805                         * allocation is permanent.  We can't count on
 806                         * the fact that buffered writes lock out direct I/O
 807                         * writes - the direct I/O write could have extended
 808                         * the size nontransactionally, then finished before
 809                         * we started.  xfs_write_file will think that the file
 810                         * didn't grow but the update isn't safe unless the
 811                         * size change is logged.
 812                         *
 813                         * Force the log if we've committed a transaction
 814                         * against the inode or if someone else has and
 815                         * the commit record hasn't gone to disk (e.g.
 816                         * the inode is pinned).  This guarantees that
 817                         * all changes affecting the inode are permanent
 818                         * when we return.
 819                         */
 820
 821                        xfs_inode_log_item_t *iip;
 822                        xfs_lsn_t lsn;
 823
 824                        iip = xip->i_itemp;
 825                        if (iip && iip->ili_last_lsn) {
 826                                lsn = iip->ili_last_lsn;
 827                                xfs_log_force(mp, lsn,
 828                                                XFS_LOG_FORCE | XFS_LOG_SYNC);
 829                        } else if (xfs_ipincount(xip) > 0) {
 830                                xfs_log_force(mp, (xfs_lsn_t)0,
 831                                                XFS_LOG_FORCE | XFS_LOG_SYNC);
 832                        }
 833
 834                } else {
 835                        xfs_trans_t     *tp;
 836
 837                        /*
 838                         * O_SYNC or O_DSYNC _with_ a size update are handled
 839                         * the same way.
 840                         *
 841                         * If the write was synchronous then we need to make
 842                         * sure that the inode modification time is permanent.
 843                         * We'll have updated the timestamp above, so here
 844                         * we use a synchronous transaction to log the inode.
 845                         * It's not fast, but it's necessary.
 846                         *
 847                         * If this is a dsync write and the size got changed
 848                         * non-transactionally, then we need to ensure that
 849                         * the size change gets logged in a synchronous
 850                         * transaction.
 851                         */
 852
 853                        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
 854                        if ((error = xfs_trans_reserve(tp, 0,
 855                                                      XFS_SWRITE_LOG_RES(mp),
 856                                                      0, 0, 0))) {
 857                                /* Transaction reserve failed */
 858                                xfs_trans_cancel(tp, 0);
 859                        } else {
 860                                /* Transaction reserve successful */
 861                                xfs_ilock(xip, XFS_ILOCK_EXCL);
 862                                xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
 863                                xfs_trans_ihold(tp, xip);
 864                                xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
 865                                xfs_trans_set_sync(tp);
 866                                error = xfs_trans_commit(tp, 0, NULL);
 867                                xfs_iunlock(xip, XFS_ILOCK_EXCL);
 868                        }
 869                }
 870        } /* (ioflags & O_SYNC) */
 871
 872        /*
 873         * If we are coming from an nfsd thread then insert into the
 874         * reference cache.
 875         */
 876
 877        if (!strcmp(current->comm, "nfsd"))
 878                xfs_refcache_insert(xip);
 879
 880        /* Drop lock this way - the old refcache release is in here */
 881        if (iolock)
 882                xfs_rwunlock(bdp, locktype);
 883
 884        return(ret);
 885}
 886
 887/*
 888 * All xfs metadata buffers except log state machine buffers
 889 * get this attached as their b_bdstrat callback function.
 890 * This is so that we can catch a buffer
 891 * after prematurely unpinning it to forcibly shutdown the filesystem.
 892 */
 893int
 894xfs_bdstrat_cb(struct xfs_buf *bp)
 895{
 896        xfs_mount_t     *mp;
 897
 898        mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
 899        if (!XFS_FORCED_SHUTDOWN(mp)) {
 900                pagebuf_iorequest(bp);
 901                return 0;
 902        } else {
 903                xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
 904                /*
 905                 * Metadata write that didn't get logged but
 906                 * written delayed anyway. These aren't associated
 907                 * with a transaction, and can be ignored.
 908                 */
 909                if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
 910                    (XFS_BUF_ISREAD(bp)) == 0)
 911                        return (xfs_bioerror_relse(bp));
 912                else
 913                        return (xfs_bioerror(bp));
 914        }
 915}
 916
 917
 918int
 919xfs_bmap(bhv_desc_t     *bdp,
 920        xfs_off_t       offset,
 921        ssize_t         count,
 922        int             flags,
 923        xfs_iomap_t     *iomapp,
 924        int             *niomaps)
 925{
 926        xfs_inode_t     *ip = XFS_BHVTOI(bdp);
 927        xfs_iocore_t    *io = &ip->i_iocore;
 928
 929        ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 930        ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
 931               ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
 932
 933        return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
 934}
 935
 /*
  * Front end to the buffer strategy routine that keeps data from going
  * to disk once the filesystem is being shut down.  User data normally
  * takes this path; the superblock is one of the exceptions.
  *
  * Returns 0 after handing the buffer to pagebuf_iorequest(), or the
  * result of xfs_bioerror_relse() when the mount has been forcibly
  * shut down.
  */
 int
 xfsbdstrat(
         struct xfs_mount        *mp,
         struct xfs_buf          *bp)
 {
         ASSERT(mp);

         /* Filesystem is going down: do not start the I/O. */
         if (XFS_FORCED_SHUTDOWN(mp)) {
                 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
                 return (xfs_bioerror_relse(bp));
         }

         /*
          * Grio redirection would go here
          * if (XFS_BUF_IS_GRIO(bp)) {
          */
         pagebuf_iorequest(bp);
         return 0;
 }
 960
 961/*
 962 * If the underlying (data/log/rt) device is readonly, there are some
 963 * operations that cannot proceed.
 964 */
 965int
 966xfs_dev_is_read_only(
 967        xfs_mount_t             *mp,
 968        char                    *message)
 969{
 970        if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
 971            xfs_readonly_buftarg(mp->m_logdev_targp) ||
 972            (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
 973                cmn_err(CE_NOTE,
 974                        "XFS: %s required on read-only device.", message);
 975                cmn_err(CE_NOTE,
 976                        "XFS: write access unavailable, cannot proceed.");
 977                return EROFS;
 978        }
 979        return 0;
 980}
 981
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.