linux/fs/nfs/direct.c
   1/*
   2 * linux/fs/nfs/direct.c
   3 *
   4 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
   5 *
   6 * High-performance uncached I/O for the Linux NFS client
   7 *
   8 * There are important applications whose performance or correctness
   9 * depends on uncached access to file data.  Database clusters
  10 * (multiple copies of the same instance running on separate hosts)
  11 * implement their own cache coherency protocol that subsumes file
  12 * system cache protocols.  Applications that process datasets
  13 * considerably larger than the client's memory do not always benefit
  14 * from a local cache.  A streaming video server, for instance, has no
  15 * need to cache the contents of a file.
  16 *
  17 * When an application requests uncached I/O, all read and write requests
  18 * are made directly to the server; data stored or fetched via these
  19 * requests is not cached in the Linux page cache.  The client does not
  20 * correct unaligned requests from applications.  All requested bytes are
  21 * held on permanent storage before a direct write system call returns to
  22 * an application.
  23 *
  24 * Solaris implements an uncached I/O facility called directio() that
  25 * is used for backups and sequential I/O to very large files.  Solaris
  26 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
  27 * an undocumented mount option.
  28 *
  29 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
  30 * help from Andrew Morton.
  31 *
  32 * 18 Dec 2001  Initial implementation for 2.4  --cel
  33 * 08 Jul 2002  Version for 2.4.19, with bug fixes --trondmy
  34 * 08 Jun 2003  Port to 2.5 APIs  --cel
  35 * 31 Mar 2004  Handle direct I/O without VFS support  --cel
  36 * 15 Sep 2004  Parallel async reads  --cel
  37 * 04 May 2005  support O_DIRECT with aio  --cel
  38 *
  39 */
  40
  41#include <linux/errno.h>
  42#include <linux/sched.h>
  43#include <linux/kernel.h>
  44#include <linux/file.h>
  45#include <linux/pagemap.h>
  46#include <linux/kref.h>
  47#include <linux/slab.h>
  48#include <linux/task_io_accounting_ops.h>
  49#include <linux/module.h>
  50
  51#include <linux/nfs_fs.h>
  52#include <linux/nfs_page.h>
  53#include <linux/sunrpc/clnt.h>
  54
  55#include <asm/uaccess.h>
  56#include <linux/atomic.h>
  57
  58#include "internal.h"
  59#include "iostat.h"
  60#include "pnfs.h"
  61
  62#define NFSDBG_FACILITY         NFSDBG_VFS
  63
  64static struct kmem_cache *nfs_direct_cachep;
  65
  66/*
  67 * This represents a set of asynchronous requests that we're waiting on
  68 */
  69struct nfs_direct_req {
  70        struct kref             kref;           /* release manager */
  71
  72        /* I/O parameters */
  73        struct nfs_open_context *ctx;           /* file open context info */
  74        struct nfs_lock_context *l_ctx;         /* Lock context info */
  75        struct kiocb *          iocb;           /* controlling i/o request */
  76        struct inode *          inode;          /* target file of i/o */
  77
  78        /* completion state */
  79        atomic_t                io_count;       /* i/os we're waiting for */
  80        spinlock_t              lock;           /* protect completion state */
  81        ssize_t                 count,          /* bytes actually processed */
  82                                bytes_left,     /* bytes left to be sent */
  83                                error;          /* any reported error */
  84        struct completion       completion;     /* wait for i/o completion */
  85
  86        /* commit state */
  87        struct nfs_mds_commit_info mds_cinfo;   /* Storage for cinfo */
  88        struct pnfs_ds_commit_info ds_cinfo;    /* Storage for cinfo */
  89        struct work_struct      work;
  90        int                     flags;
  91#define NFS_ODIRECT_DO_COMMIT           (1)     /* an unstable reply was received */
  92#define NFS_ODIRECT_RESCHED_WRITES      (2)     /* write verification failed */
  93        struct nfs_writeverf    verf;           /* unstable write verifier */
  94};
  95
  96static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
  97static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
  98static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
  99static void nfs_direct_write_schedule_work(struct work_struct *work);
 100
 101static inline void get_dreq(struct nfs_direct_req *dreq)
 102{
 103        atomic_inc(&dreq->io_count);
 104}
 105
 106static inline int put_dreq(struct nfs_direct_req *dreq)
 107{
 108        return atomic_dec_and_test(&dreq->io_count);
 109}
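
/*
 * A minimal sketch of how the io_count reference above is used: the
 * scheduling path takes one reference up front, each nfs_pgio_header
 * takes another via the ->init_hdr callback (nfs_direct_pgio_init),
 * and every completion drops one.  Whoever drops the last reference
 * completes the nfs_direct_req:
 *
 *	get_dreq(dreq);			-- held while scheduling I/O
 *	... dispatch requests; each header calls get_dreq() ...
 *	if (put_dreq(dreq))		-- last outstanding I/O gone?
 *		nfs_direct_complete(dreq);
 */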
 110
 111/**
 112 * nfs_direct_IO - NFS address space operation for direct I/O
 113 * @rw: direction (read or write)
 114 * @iocb: target I/O control block
 115 * @iov: array of vectors that define I/O buffer
 116 * @pos: offset in file to begin the operation
 117 * @nr_segs: size of iovec array
 118 *
 119 * The presence of this routine in the address space ops vector means
 120 * the NFS client supports direct I/O. However, for most direct IO, we
 121 * shunt off direct read and write requests before the VFS gets them,
 122 * so this method is only ever called for swap.
 123 */
 124ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
 125{
 126#ifndef CONFIG_NFS_SWAP
 127        dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
 128                        iocb->ki_filp->f_path.dentry->d_name.name,
 129                        (long long) pos, nr_segs);
 130
 131        return -EINVAL;
 132#else
 133        VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
 134        VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
 135
 136        if (rw == READ || rw == KERNEL_READ)
 137                return nfs_file_direct_read(iocb, iov, nr_segs, pos,
 138                                rw == READ ? true : false);
 139        return nfs_file_direct_write(iocb, iov, nr_segs, pos,
 140                                rw == WRITE ? true : false);
 141#endif /* CONFIG_NFS_SWAP */
 142}
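
/*
 * Note on the uio flag threaded through the read/write paths below:
 * uio == false means the buffer consists of kernel pages (the swap
 * case above), so the schedulers pin exactly one page per request with
 * get_kernel_page() -- hence the VM_BUG_ONs on the iocb length -- while
 * uio == true means user-space memory, pinned with get_user_pages().
 */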
 143
 144static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 145{
 146        unsigned int i;
 147        for (i = 0; i < npages; i++)
 148                page_cache_release(pages[i]);
 149}
 150
 151void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 152                              struct nfs_direct_req *dreq)
 153{
 154        cinfo->lock = &dreq->lock;
 155        cinfo->mds = &dreq->mds_cinfo;
 156        cinfo->ds = &dreq->ds_cinfo;
 157        cinfo->dreq = dreq;
 158        cinfo->completion_ops = &nfs_direct_commit_completion_ops;
 159}
 160
 161static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 162{
 163        struct nfs_direct_req *dreq;
 164
 165        dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
 166        if (!dreq)
 167                return NULL;
 168
 169        kref_init(&dreq->kref);
 170        kref_get(&dreq->kref);
 171        init_completion(&dreq->completion);
 172        INIT_LIST_HEAD(&dreq->mds_cinfo.list);
 173        INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
 174        spin_lock_init(&dreq->lock);
 175
 176        return dreq;
 177}
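
/*
 * Note: kref_init() followed by the immediate kref_get() above leaves
 * the new dreq with two references.  One is dropped by
 * nfs_direct_complete() via nfs_direct_req_release(); the other is
 * dropped by the caller of nfs_direct_read()/nfs_direct_write() once
 * it has finished waiting.
 */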
 178
 179static void nfs_direct_req_free(struct kref *kref)
 180{
 181        struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 182
 183        if (dreq->l_ctx != NULL)
 184                nfs_put_lock_context(dreq->l_ctx);
 185        if (dreq->ctx != NULL)
 186                put_nfs_open_context(dreq->ctx);
 187        kmem_cache_free(nfs_direct_cachep, dreq);
 188}
 189
 190static void nfs_direct_req_release(struct nfs_direct_req *dreq)
 191{
 192        kref_put(&dreq->kref, nfs_direct_req_free);
 193}
 194
 195ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
 196{
 197        return dreq->bytes_left;
 198}
 199EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
 200
 201/*
 202 * Collects and returns the final error value/byte-count.
 203 */
 204static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
 205{
 206        ssize_t result = -EIOCBQUEUED;
 207
 208        /* Async requests don't wait here */
 209        if (dreq->iocb)
 210                goto out;
 211
 212        result = wait_for_completion_killable(&dreq->completion);
 213
 214        if (!result)
 215                result = dreq->error;
 216        if (!result)
 217                result = dreq->count;
 218
 219out:
 220        return (ssize_t) result;
 221}
 222
  223/*
  224 * Synchronous I/O uses a stack-allocated iocb, so we cannot assume the
  225 * iocb is still valid here; dreq->iocb is only set for async (aio) requests.
  226 */
 227static void nfs_direct_complete(struct nfs_direct_req *dreq)
 228{
 229        if (dreq->iocb) {
 230                long res = (long) dreq->error;
 231                if (!res)
 232                        res = (long) dreq->count;
 233                aio_complete(dreq->iocb, res, 0);
 234        }
 235        complete_all(&dreq->completion);
 236
 237        nfs_direct_req_release(dreq);
 238}
 239
 240static void nfs_direct_readpage_release(struct nfs_page *req)
 241{
 242        dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
 243                req->wb_context->dentry->d_inode->i_sb->s_id,
 244                (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 245                req->wb_bytes,
 246                (long long)req_offset(req));
 247        nfs_release_request(req);
 248}
 249
 250static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 251{
 252        unsigned long bytes = 0;
 253        struct nfs_direct_req *dreq = hdr->dreq;
 254
 255        if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
 256                goto out_put;
 257
 258        spin_lock(&dreq->lock);
 259        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
 260                dreq->error = hdr->error;
 261        else
 262                dreq->count += hdr->good_bytes;
 263        spin_unlock(&dreq->lock);
 264
 265        while (!list_empty(&hdr->pages)) {
 266                struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 267                struct page *page = req->wb_page;
 268
 269                if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
 270                        if (bytes > hdr->good_bytes)
 271                                zero_user(page, 0, PAGE_SIZE);
 272                        else if (hdr->good_bytes - bytes < PAGE_SIZE)
 273                                zero_user_segment(page,
 274                                        hdr->good_bytes & ~PAGE_MASK,
 275                                        PAGE_SIZE);
 276                }
 277                if (!PageCompound(page)) {
 278                        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
 279                                if (bytes < hdr->good_bytes)
 280                                        set_page_dirty(page);
 281                        } else
 282                                set_page_dirty(page);
 283                }
 284                bytes += req->wb_bytes;
 285                nfs_list_remove_request(req);
 286                nfs_direct_readpage_release(req);
 287        }
 288out_put:
 289        if (put_dreq(dreq))
 290                nfs_direct_complete(dreq);
 291        hdr->release(hdr);
 292}
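
/*
 * A worked example of the EOF zeroing above (illustrative numbers,
 * assuming PAGE_SIZE == 4096 and two page-sized, page-aligned
 * requests): on a short read with hdr->good_bytes == 6000 and
 * NFS_IOHDR_EOF set, the first page is left alone because
 * good_bytes - bytes == 6000 >= PAGE_SIZE, while for the second page
 * (bytes == 4096) the remainder is 1904 < PAGE_SIZE, so
 * zero_user_segment() clears it from offset 6000 & ~PAGE_MASK (1904)
 * up to PAGE_SIZE, wiping the tail the server never returned.
 */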
 293
 294static void nfs_read_sync_pgio_error(struct list_head *head)
 295{
 296        struct nfs_page *req;
 297
 298        while (!list_empty(head)) {
 299                req = nfs_list_entry(head->next);
 300                nfs_list_remove_request(req);
 301                nfs_release_request(req);
 302        }
 303}
 304
 305static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
 306{
 307        get_dreq(hdr->dreq);
 308}
 309
 310static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
 311        .error_cleanup = nfs_read_sync_pgio_error,
 312        .init_hdr = nfs_direct_pgio_init,
 313        .completion = nfs_direct_read_completion,
 314};
 315
 316/*
 317 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
  318 * operation.  If page allocation or get_user_pages() fails midway,
  319 * bail and stop sending more reads.  Read length accounting is
  320 * handled by nfs_direct_read_completion().  Otherwise, if
 321 * no requests have been sent, just return an error.
 322 */
 323static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
 324                                                const struct iovec *iov,
 325                                                loff_t pos, bool uio)
 326{
 327        struct nfs_direct_req *dreq = desc->pg_dreq;
 328        struct nfs_open_context *ctx = dreq->ctx;
 329        struct inode *inode = ctx->dentry->d_inode;
 330        unsigned long user_addr = (unsigned long)iov->iov_base;
 331        size_t count = iov->iov_len;
 332        size_t rsize = NFS_SERVER(inode)->rsize;
 333        unsigned int pgbase;
 334        int result;
 335        ssize_t started = 0;
 336        struct page **pagevec = NULL;
 337        unsigned int npages;
 338
 339        do {
 340                size_t bytes;
 341                int i;
 342
 343                pgbase = user_addr & ~PAGE_MASK;
 344                bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
 345
 346                result = -ENOMEM;
 347                npages = nfs_page_array_len(pgbase, bytes);
 348                if (!pagevec)
 349                        pagevec = kmalloc(npages * sizeof(struct page *),
 350                                          GFP_KERNEL);
 351                if (!pagevec)
 352                        break;
 353                if (uio) {
 354                        down_read(&current->mm->mmap_sem);
 355                        result = get_user_pages(current, current->mm, user_addr,
 356                                        npages, 1, 0, pagevec, NULL);
 357                        up_read(&current->mm->mmap_sem);
 358                        if (result < 0)
 359                                break;
 360                } else {
 361                        WARN_ON(npages != 1);
 362                        result = get_kernel_page(user_addr, 1, pagevec);
 363                        if (WARN_ON(result != 1))
 364                                break;
 365                }
 366
 367                if ((unsigned)result < npages) {
 368                        bytes = result * PAGE_SIZE;
 369                        if (bytes <= pgbase) {
 370                                nfs_direct_release_pages(pagevec, result);
 371                                break;
 372                        }
 373                        bytes -= pgbase;
 374                        npages = result;
 375                }
 376
 377                for (i = 0; i < npages; i++) {
 378                        struct nfs_page *req;
 379                        unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 380                        /* XXX do we need to do the eof zeroing found in async_filler? */
 381                        req = nfs_create_request(dreq->ctx, dreq->inode,
 382                                                 pagevec[i],
 383                                                 pgbase, req_len);
 384                        if (IS_ERR(req)) {
 385                                result = PTR_ERR(req);
 386                                break;
 387                        }
 388                        req->wb_index = pos >> PAGE_SHIFT;
 389                        req->wb_offset = pos & ~PAGE_MASK;
 390                        if (!nfs_pageio_add_request(desc, req)) {
 391                                result = desc->pg_error;
 392                                nfs_release_request(req);
 393                                break;
 394                        }
 395                        pgbase = 0;
 396                        bytes -= req_len;
 397                        started += req_len;
 398                        user_addr += req_len;
 399                        pos += req_len;
 400                        count -= req_len;
 401                        dreq->bytes_left -= req_len;
 402                }
  403                /* The nfs_page requests now hold references to these pages */
 404                nfs_direct_release_pages(pagevec, npages);
 405        } while (count != 0 && result >= 0);
 406
 407        kfree(pagevec);
 408
 409        if (started)
 410                return started;
 411        return result < 0 ? (ssize_t) result : -EFAULT;
 412}
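
/*
 * A worked example of the chunking above (illustrative numbers,
 * assuming PAGE_SIZE == 4096 and rsize >= the segment length): for a
 * segment with pgbase == 512 and iov_len == 9000, bytes == 9000 and
 * nfs_page_array_len(512, 9000) == 3, so three pages are pinned.  The
 * inner loop then emits nfs_page requests of 3584, 4096 and 1320 bytes
 * (the first starting at offset 512 within its page, the rest at 0),
 * advancing user_addr, pos and dreq->bytes_left by req_len each time.
 */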
 413
 414static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 415                                              const struct iovec *iov,
 416                                              unsigned long nr_segs,
 417                                              loff_t pos, bool uio)
 418{
 419        struct nfs_pageio_descriptor desc;
 420        ssize_t result = -EINVAL;
 421        size_t requested_bytes = 0;
 422        unsigned long seg;
 423
 424        NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
 425                             &nfs_direct_read_completion_ops);
 426        get_dreq(dreq);
 427        desc.pg_dreq = dreq;
 428
 429        for (seg = 0; seg < nr_segs; seg++) {
 430                const struct iovec *vec = &iov[seg];
 431                result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
 432                if (result < 0)
 433                        break;
 434                requested_bytes += result;
 435                if ((size_t)result < vec->iov_len)
 436                        break;
 437                pos += vec->iov_len;
 438        }
 439
 440        nfs_pageio_complete(&desc);
 441
 442        /*
 443         * If no bytes were started, return the error, and let the
 444         * generic layer handle the completion.
 445         */
 446        if (requested_bytes == 0) {
 447                nfs_direct_req_release(dreq);
 448                return result < 0 ? result : -EIO;
 449        }
 450
 451        if (put_dreq(dreq))
 452                nfs_direct_complete(dreq);
 453        return 0;
 454}
 455
 456static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 457                               unsigned long nr_segs, loff_t pos, bool uio)
 458{
 459        ssize_t result = -ENOMEM;
 460        struct inode *inode = iocb->ki_filp->f_mapping->host;
 461        struct nfs_direct_req *dreq;
 462        struct nfs_lock_context *l_ctx;
 463
 464        dreq = nfs_direct_req_alloc();
 465        if (dreq == NULL)
 466                goto out;
 467
 468        dreq->inode = inode;
 469        dreq->bytes_left = iov_length(iov, nr_segs);
 470        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 471        l_ctx = nfs_get_lock_context(dreq->ctx);
 472        if (IS_ERR(l_ctx)) {
 473                result = PTR_ERR(l_ctx);
 474                goto out_release;
 475        }
 476        dreq->l_ctx = l_ctx;
 477        if (!is_sync_kiocb(iocb))
 478                dreq->iocb = iocb;
 479
 480        NFS_I(inode)->read_io += iov_length(iov, nr_segs);
 481        result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 482        if (!result)
 483                result = nfs_direct_wait(dreq);
 484out_release:
 485        nfs_direct_req_release(dreq);
 486out:
 487        return result;
 488}
 489
 490static void nfs_inode_dio_write_done(struct inode *inode)
 491{
 492        nfs_zap_mapping(inode, inode->i_mapping);
 493        inode_dio_done(inode);
 494}
 495
 496#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 497static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 498{
 499        struct nfs_pageio_descriptor desc;
 500        struct nfs_page *req, *tmp;
 501        LIST_HEAD(reqs);
 502        struct nfs_commit_info cinfo;
 503        LIST_HEAD(failed);
 504
 505        nfs_init_cinfo_from_dreq(&cinfo, dreq);
 506        pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo);
 507        spin_lock(cinfo.lock);
 508        nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
 509        spin_unlock(cinfo.lock);
 510
 511        dreq->count = 0;
 512        get_dreq(dreq);
 513
 514        NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
 515                              &nfs_direct_write_completion_ops);
 516        desc.pg_dreq = dreq;
 517
 518        list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
 519                if (!nfs_pageio_add_request(&desc, req)) {
 520                        nfs_list_remove_request(req);
 521                        nfs_list_add_request(req, &failed);
 522                        spin_lock(cinfo.lock);
 523                        dreq->flags = 0;
 524                        dreq->error = -EIO;
 525                        spin_unlock(cinfo.lock);
 526                }
 527                nfs_release_request(req);
 528        }
 529        nfs_pageio_complete(&desc);
 530
 531        while (!list_empty(&failed)) {
 532                req = nfs_list_entry(failed.next);
 533                nfs_list_remove_request(req);
 534                nfs_unlock_and_release_request(req);
 535        }
 536
 537        if (put_dreq(dreq))
 538                nfs_direct_write_complete(dreq, dreq->inode);
 539}
 540
 541static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 542{
 543        struct nfs_direct_req *dreq = data->dreq;
 544        struct nfs_commit_info cinfo;
 545        struct nfs_page *req;
 546        int status = data->task.tk_status;
 547
 548        nfs_init_cinfo_from_dreq(&cinfo, dreq);
 549        if (status < 0) {
 550                dprintk("NFS: %5u commit failed with error %d.\n",
 551                        data->task.tk_pid, status);
 552                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 553        } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
 554                dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
 555                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 556        }
 557
 558        dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
 559        while (!list_empty(&data->pages)) {
 560                req = nfs_list_entry(data->pages.next);
 561                nfs_list_remove_request(req);
 562                if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
 563                        /* Note the rewrite will go through mds */
 564                        nfs_mark_request_commit(req, NULL, &cinfo);
 565                } else
 566                        nfs_release_request(req);
 567                nfs_unlock_and_release_request(req);
 568        }
 569
 570        if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
 571                nfs_direct_write_complete(dreq, data->inode);
 572}
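
/*
 * The verifier comparison above implements the usual unstable-write
 * rule: the verifier returned by the COMMIT must match the one saved
 * from the UNSTABLE WRITE replies (dreq->verf).  A mismatch means the
 * server may have lost the uncommitted data (for example, it rebooted),
 * so the requests are put back on the commit list and later re-sent as
 * stable writes via nfs_direct_write_reschedule().
 */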
 573
 574static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
 575{
 576        /* There is no lock to clear */
 577}
 578
 579static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
 580        .completion = nfs_direct_commit_complete,
 581        .error_cleanup = nfs_direct_error_cleanup,
 582};
 583
 584static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 585{
 586        int res;
 587        struct nfs_commit_info cinfo;
 588        LIST_HEAD(mds_list);
 589
 590        nfs_init_cinfo_from_dreq(&cinfo, dreq);
 591        nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
 592        res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
 593        if (res < 0) /* res == -ENOMEM */
 594                nfs_direct_write_reschedule(dreq);
 595}
 596
 597static void nfs_direct_write_schedule_work(struct work_struct *work)
 598{
 599        struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
 600        int flags = dreq->flags;
 601
 602        dreq->flags = 0;
 603        switch (flags) {
 604                case NFS_ODIRECT_DO_COMMIT:
 605                        nfs_direct_commit_schedule(dreq);
 606                        break;
 607                case NFS_ODIRECT_RESCHED_WRITES:
 608                        nfs_direct_write_reschedule(dreq);
 609                        break;
 610                default:
 611                        nfs_inode_dio_write_done(dreq->inode);
 612                        nfs_direct_complete(dreq);
 613        }
 614}
 615
 616static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 617{
 618        schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 619}
 620
 621#else
 622static void nfs_direct_write_schedule_work(struct work_struct *work)
 623{
 624}
 625
 626static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 627{
 628        nfs_inode_dio_write_done(inode);
 629        nfs_direct_complete(dreq);
 630}
 631#endif
 632
 633/*
 634 * NB: Return the value of the first error return code.  Subsequent
 635 *     errors after the first one are ignored.
 636 */
 637/*
 638 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
  639 * operation.  If page allocation or get_user_pages() fails midway,
  640 * bail and stop sending more writes.  Write length accounting is
  641 * handled by nfs_direct_write_completion().  Otherwise, if
 642 * no requests have been sent, just return an error.
 643 */
 644static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
 645                                                 const struct iovec *iov,
 646                                                 loff_t pos, bool uio)
 647{
 648        struct nfs_direct_req *dreq = desc->pg_dreq;
 649        struct nfs_open_context *ctx = dreq->ctx;
 650        struct inode *inode = ctx->dentry->d_inode;
 651        unsigned long user_addr = (unsigned long)iov->iov_base;
 652        size_t count = iov->iov_len;
 653        size_t wsize = NFS_SERVER(inode)->wsize;
 654        unsigned int pgbase;
 655        int result;
 656        ssize_t started = 0;
 657        struct page **pagevec = NULL;
 658        unsigned int npages;
 659
 660        do {
 661                size_t bytes;
 662                int i;
 663
 664                pgbase = user_addr & ~PAGE_MASK;
 665                bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
 666
 667                result = -ENOMEM;
 668                npages = nfs_page_array_len(pgbase, bytes);
 669                if (!pagevec)
 670                        pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
 671                if (!pagevec)
 672                        break;
 673
 674                if (uio) {
 675                        down_read(&current->mm->mmap_sem);
 676                        result = get_user_pages(current, current->mm, user_addr,
 677                                                npages, 0, 0, pagevec, NULL);
 678                        up_read(&current->mm->mmap_sem);
 679                        if (result < 0)
 680                                break;
 681                } else {
 682                        WARN_ON(npages != 1);
 683                        result = get_kernel_page(user_addr, 0, pagevec);
 684                        if (WARN_ON(result != 1))
 685                                break;
 686                }
 687
 688                if ((unsigned)result < npages) {
 689                        bytes = result * PAGE_SIZE;
 690                        if (bytes <= pgbase) {
 691                                nfs_direct_release_pages(pagevec, result);
 692                                break;
 693                        }
 694                        bytes -= pgbase;
 695                        npages = result;
 696                }
 697
 698                for (i = 0; i < npages; i++) {
 699                        struct nfs_page *req;
 700                        unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 701
 702                        req = nfs_create_request(dreq->ctx, dreq->inode,
 703                                                 pagevec[i],
 704                                                 pgbase, req_len);
 705                        if (IS_ERR(req)) {
 706                                result = PTR_ERR(req);
 707                                break;
 708                        }
 709                        nfs_lock_request(req);
 710                        req->wb_index = pos >> PAGE_SHIFT;
 711                        req->wb_offset = pos & ~PAGE_MASK;
 712                        if (!nfs_pageio_add_request(desc, req)) {
 713                                result = desc->pg_error;
 714                                nfs_unlock_and_release_request(req);
 715                                break;
 716                        }
 717                        pgbase = 0;
 718                        bytes -= req_len;
 719                        started += req_len;
 720                        user_addr += req_len;
 721                        pos += req_len;
 722                        count -= req_len;
 723                        dreq->bytes_left -= req_len;
 724                }
  725                /* The nfs_page requests now hold references to these pages */
 726                nfs_direct_release_pages(pagevec, npages);
 727        } while (count != 0 && result >= 0);
 728
 729        kfree(pagevec);
 730
 731        if (started)
 732                return started;
 733        return result < 0 ? (ssize_t) result : -EFAULT;
 734}
 735
 736static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 737{
 738        struct nfs_direct_req *dreq = hdr->dreq;
 739        struct nfs_commit_info cinfo;
 740        int bit = -1;
 741        struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 742
 743        if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
 744                goto out_put;
 745
 746        nfs_init_cinfo_from_dreq(&cinfo, dreq);
 747
 748        spin_lock(&dreq->lock);
 749
 750        if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
 751                dreq->flags = 0;
 752                dreq->error = hdr->error;
 753        }
 754        if (dreq->error != 0)
 755                bit = NFS_IOHDR_ERROR;
 756        else {
 757                dreq->count += hdr->good_bytes;
 758                if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
 759                        dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 760                        bit = NFS_IOHDR_NEED_RESCHED;
 761                } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
 762                        if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
 763                                bit = NFS_IOHDR_NEED_RESCHED;
 764                        else if (dreq->flags == 0) {
 765                                memcpy(&dreq->verf, hdr->verf,
 766                                       sizeof(dreq->verf));
 767                                bit = NFS_IOHDR_NEED_COMMIT;
 768                                dreq->flags = NFS_ODIRECT_DO_COMMIT;
 769                        } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
 770                                if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
 771                                        dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 772                                        bit = NFS_IOHDR_NEED_RESCHED;
 773                                } else
 774                                        bit = NFS_IOHDR_NEED_COMMIT;
 775                        }
 776                }
 777        }
 778        spin_unlock(&dreq->lock);
 779
 780        while (!list_empty(&hdr->pages)) {
 781                req = nfs_list_entry(hdr->pages.next);
 782                nfs_list_remove_request(req);
 783                switch (bit) {
 784                case NFS_IOHDR_NEED_RESCHED:
 785                case NFS_IOHDR_NEED_COMMIT:
 786                        kref_get(&req->wb_kref);
 787                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 788                }
 789                nfs_unlock_and_release_request(req);
 790        }
 791
 792out_put:
 793        if (put_dreq(dreq))
 794                nfs_direct_write_complete(dreq, hdr->inode);
 795        hdr->release(hdr);
 796}
 797
 798static void nfs_write_sync_pgio_error(struct list_head *head)
 799{
 800        struct nfs_page *req;
 801
 802        while (!list_empty(head)) {
 803                req = nfs_list_entry(head->next);
 804                nfs_list_remove_request(req);
 805                nfs_unlock_and_release_request(req);
 806        }
 807}
 808
 809static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
 810        .error_cleanup = nfs_write_sync_pgio_error,
 811        .init_hdr = nfs_direct_pgio_init,
 812        .completion = nfs_direct_write_completion,
 813};
 814
 815static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 816                                               const struct iovec *iov,
 817                                               unsigned long nr_segs,
 818                                               loff_t pos, bool uio)
 819{
 820        struct nfs_pageio_descriptor desc;
 821        struct inode *inode = dreq->inode;
 822        ssize_t result = 0;
 823        size_t requested_bytes = 0;
 824        unsigned long seg;
 825
 826        NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
 827                              &nfs_direct_write_completion_ops);
 828        desc.pg_dreq = dreq;
 829        get_dreq(dreq);
 830        atomic_inc(&inode->i_dio_count);
 831
 832        NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
 833        for (seg = 0; seg < nr_segs; seg++) {
 834                const struct iovec *vec = &iov[seg];
 835                result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
 836                if (result < 0)
 837                        break;
 838                requested_bytes += result;
 839                if ((size_t)result < vec->iov_len)
 840                        break;
 841                pos += vec->iov_len;
 842        }
 843        nfs_pageio_complete(&desc);
 844
 845        /*
 846         * If no bytes were started, return the error, and let the
 847         * generic layer handle the completion.
 848         */
 849        if (requested_bytes == 0) {
 850                inode_dio_done(inode);
 851                nfs_direct_req_release(dreq);
 852                return result < 0 ? result : -EIO;
 853        }
 854
 855        if (put_dreq(dreq))
 856                nfs_direct_write_complete(dreq, dreq->inode);
 857        return 0;
 858}
 859
 860static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 861                                unsigned long nr_segs, loff_t pos,
 862                                size_t count, bool uio)
 863{
 864        ssize_t result = -ENOMEM;
 865        struct inode *inode = iocb->ki_filp->f_mapping->host;
 866        struct nfs_direct_req *dreq;
 867        struct nfs_lock_context *l_ctx;
 868
 869        dreq = nfs_direct_req_alloc();
 870        if (!dreq)
 871                goto out;
 872
 873        dreq->inode = inode;
 874        dreq->bytes_left = count;
 875        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 876        l_ctx = nfs_get_lock_context(dreq->ctx);
 877        if (IS_ERR(l_ctx)) {
 878                result = PTR_ERR(l_ctx);
 879                goto out_release;
 880        }
 881        dreq->l_ctx = l_ctx;
 882        if (!is_sync_kiocb(iocb))
 883                dreq->iocb = iocb;
 884
 885        result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 886        if (!result)
 887                result = nfs_direct_wait(dreq);
 888out_release:
 889        nfs_direct_req_release(dreq);
 890out:
 891        return result;
 892}
 893
 894/**
 895 * nfs_file_direct_read - file direct read operation for NFS files
 896 * @iocb: target I/O control block
 897 * @iov: vector of user buffers into which to read data
 898 * @nr_segs: size of iov vector
 899 * @pos: byte offset in file where reading starts
     * @uio: true for user-space buffers, false for kernel pages (swap)
 900 *
 901 * We use this function for direct reads instead of calling
 902 * generic_file_aio_read() in order to avoid its check to see if
 903 * the request starts before the end of the file.  For that check
 904 * to work, we must generate a GETATTR before each direct read, and
 905 * even then there is a window between the GETATTR and the subsequent
 906 * READ where the file size could change.  Our preference is simply
 907 * to do all reads the application wants, and the server will take
 908 * care of managing the end of file boundary.
 909 *
 910 * This function also avoids an unnecessary local update of the file's
 911 * atime, as the NFS server sets the file's atime, and this
 912 * client must read the updated atime from the server back into its
 913 * cache.
 914 */
 915ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 916                                unsigned long nr_segs, loff_t pos, bool uio)
 917{
 918        ssize_t retval = -EINVAL;
 919        struct file *file = iocb->ki_filp;
 920        struct address_space *mapping = file->f_mapping;
 921        size_t count;
 922
 923        count = iov_length(iov, nr_segs);
 924        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 925
 926        dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
 927                file->f_path.dentry->d_parent->d_name.name,
 928                file->f_path.dentry->d_name.name,
 929                count, (long long) pos);
 930
 931        retval = 0;
 932        if (!count)
 933                goto out;
 934
 935        retval = nfs_sync_mapping(mapping);
 936        if (retval)
 937                goto out;
 938
 939        task_io_account_read(count);
 940
 941        retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
 942        if (retval > 0)
 943                iocb->ki_pos = pos + retval;
 944
 945out:
 946        return retval;
 947}
 948
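/*
 * From user space, this path is reached with O_DIRECT.  A minimal,
 * self-contained sketch of ordinary application code (the names and
 * the 4096-byte alignment are illustrative, not part of this file):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	ssize_t direct_read_block(const char *path, void *buf, size_t len)
 *	{
 *		ssize_t n;
 *		int fd = open(path, O_RDONLY | O_DIRECT);
 *
 *		if (fd < 0)
 *			return -1;
 *		n = read(fd, buf, len);	-- ends up in nfs_file_direct_read()
 *		close(fd);
 *		return n;
 *	}
 *
 * where buf comes from posix_memalign(&buf, 4096, len): the application
 * is responsible for alignment, since the client does not correct
 * unaligned requests (see the header comment).
 */
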
 949/**
 950 * nfs_file_direct_write - file direct write operation for NFS files
 951 * @iocb: target I/O control block
 952 * @iov: vector of user buffers from which to write data
 953 * @nr_segs: size of iov vector
 954 * @pos: byte offset in file where writing starts
     * @uio: true for user-space buffers, false for kernel pages (swap)
 955 *
 956 * We use this function for direct writes instead of calling
 957 * generic_file_aio_write() in order to avoid taking the inode
 958 * semaphore and updating the i_size.  The NFS server will set
 959 * the new i_size and this client must read the updated size
 960 * back into its cache.  We let the server do generic write
 961 * parameter checking and report problems.
 962 *
 963 * We eliminate local atime updates, see direct read above.
 964 *
 965 * We avoid unnecessary page cache invalidations for normal cached
 966 * readers of this file.
 967 *
 968 * Note that O_APPEND is not supported for NFS direct writes, as there
 969 * is no atomic O_APPEND write facility in the NFS protocol.
 970 */
 971ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 972                                unsigned long nr_segs, loff_t pos, bool uio)
 973{
 974        ssize_t retval = -EINVAL;
 975        struct file *file = iocb->ki_filp;
 976        struct address_space *mapping = file->f_mapping;
 977        size_t count;
 978
 979        count = iov_length(iov, nr_segs);
 980        nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 981
 982        dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
 983                file->f_path.dentry->d_parent->d_name.name,
 984                file->f_path.dentry->d_name.name,
 985                count, (long long) pos);
 986
 987        retval = generic_write_checks(file, &pos, &count, 0);
 988        if (retval)
 989                goto out;
 990
 991        retval = -EINVAL;
 992        if ((ssize_t) count < 0)
 993                goto out;
 994        retval = 0;
 995        if (!count)
 996                goto out;
 997
 998        retval = nfs_sync_mapping(mapping);
 999        if (retval)
1000                goto out;
1001
1002        task_io_account_write(count);
1003
1004        retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
1005        if (retval > 0) {
1006                struct inode *inode = mapping->host;
1007
1008                iocb->ki_pos = pos + retval;
1009                spin_lock(&inode->i_lock);
1010                if (i_size_read(inode) < iocb->ki_pos)
1011                        i_size_write(inode, iocb->ki_pos);
1012                spin_unlock(&inode->i_lock);
1013        }
1014out:
1015        return retval;
1016}
1017
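/*
 * The write side is driven the same way from user space (see the read
 * sketch above): open with O_WRONLY | O_DIRECT, write() from a suitably
 * aligned buffer, and the call returns only once the requested bytes
 * are on the server's stable storage (see the header comment).  Per the
 * comment above, O_APPEND is not supported on this path.
 */
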
1018/**
1019 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
1020 *
1021 */
1022int __init nfs_init_directcache(void)
1023{
1024        nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
1025                                                sizeof(struct nfs_direct_req),
1026                                                0, (SLAB_RECLAIM_ACCOUNT|
1027                                                        SLAB_MEM_SPREAD),
1028                                                NULL);
1029        if (nfs_direct_cachep == NULL)
1030                return -ENOMEM;
1031
1032        return 0;
1033}
1034
1035/**
1036 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1037 *
1038 */
1039void nfs_destroy_directcache(void)
1040{
1041        kmem_cache_destroy(nfs_direct_cachep);
1042}
1043