/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      u32 id)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;

        if (id == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
                        __func__, id, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }
        ld_type = find_pnfs_driver(id);
        if (!ld_type) {
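                /* Not registered yet: ask for the module alias built from
                 * LAYOUT_NFSV4_1_MODULE_PREFIX and the layout type, e.g.
                 * "nfs-layouttype4-1" for the files layout (assuming the
                 * usual prefix value), then retry the lookup.
                 */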
                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        dprintk("%s: No pNFS module found for %u.\n",
                                __func__, id);
                        goto out_no_driver;
                }
        }
        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
                kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
        put_rpccred(lo->plh_lc_cred);
        return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
}

static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        BUG_ON(!list_empty(&lo->plh_layouts));
        NFS_I(lo->plh_inode)->layout = NULL;
        pnfs_free_layout_hdr(lo);
}

static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
        if (atomic_dec_and_test(&lo->plh_refcount))
                destroy_layout_hdr(lo);
}

void
put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                destroy_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
        }
}

static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
}

static void free_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *ino = lseg->pls_layout->plh_inode;

        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
        /* Matched by get_layout_hdr in pnfs_insert_layout */
        put_layout_hdr(NFS_I(ino)->layout);
}

static void
put_lseg_common(struct pnfs_layout_segment *lseg)
{
        struct inode *inode = lseg->pls_layout->plh_inode;

        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        if (list_empty(&lseg->pls_layout->plh_segs)) {
                set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
                /* Matched by initial refcount set in alloc_init_layout_hdr */
                put_layout_hdr_locked(lseg->pls_layout);
        }
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

void
put_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        inode = lseg->pls_layout->plh_inode;
        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                LIST_HEAD(free_me);

                put_lseg_common(lseg);
                list_add(&lseg->pls_list, &free_me);
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&free_me);
        }
}
EXPORT_SYMBOL_GPL(put_lseg);

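/* End of a range expressed as the offset one past its last byte; a sum that
 * would overflow is clamped to NFS4_MAX_UINT64, i.e. the range runs to EOF.
 */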
static inline u64
end_offset(u64 start, u64 len)
{
        u64 end;

        end = start + len;
        return end >= start ? end : NFS4_MAX_UINT64;
}

/* last octet in a range */
static inline u64
last_byte_offset(u64 start, u64 len)
{
        u64 end;

        BUG_ON(!len);
        end = start + len;
        return end > start ? end - 1 : NFS4_MAX_UINT64;
}

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static inline int
lo_seg_contained(struct pnfs_layout_range *l1,
                 struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}

/*
 * do l1 and l2 intersect?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static inline int
lo_seg_intersecting(struct pnfs_layout_range *l1,
                    struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
               (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

static bool
should_free_lseg(struct pnfs_layout_range *lseg_range,
                 struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               lo_seg_intersecting(lseg_range, recall_range);
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        atomic_read(&lseg->pls_refcount));
                if (atomic_dec_and_test(&lseg->pls_refcount)) {
                        put_lseg_common(lseg);
                        list_add(&lseg->pls_list, tmp_list);
                        rv = 1;
                }
        }
        return rv;
}

/* Returns count of number of matching invalid lsegs remaining in list
 * after call.
 */
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            struct pnfs_layout_range *recall_range)
{
        struct pnfs_layout_segment *lseg, *next;
        int invalid = 0, removed = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs)) {
                /* Reset MDS Threshold I/O counters */
                NFS_I(lo->plh_inode)->write_io = 0;
                NFS_I(lo->plh_inode)->read_io = 0;
                if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
                        put_layout_hdr_locked(lo);
                return 0;
        }
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (!recall_range ||
                    should_free_lseg(&lseg->pls_range, recall_range)) {
                        dprintk("%s: freeing lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
                                lseg->pls_range.length);
                        invalid++;
                        removed += mark_lseg_invalid(lseg, tmp_list);
                }
        dprintk("%s:Return %i\n", __func__, invalid - removed);
        return invalid - removed;
}

/* note free_me must contain lsegs from a single layout_hdr */
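/* The first entry's pls_layout is used to find that layout_hdr; if it has
 * been marked NFS_LAYOUT_DESTROYED, it is also unhooked from the per-server
 * layout list (under cl_lock) before the segments are freed.
 */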
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;
        struct pnfs_layout_hdr *lo;

        if (list_empty(free_me))
                return;

        lo = list_first_entry(free_me, struct pnfs_layout_segment,
                              pls_list)->pls_layout;

        if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
                struct nfs_client *clp;

                clp = NFS_SERVER(lo->plh_inode)->nfs_client;
                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
                mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
        }
        spin_unlock(&nfsi->vfs_inode.i_lock);
        pnfs_free_lseg_list(&tmp_list);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        struct nfs_server *server;
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (!list_empty(&server->layouts))
                        list_splice_init(&server->layouts, &tmp_list);
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        while (!list_empty(&tmp_list)) {
                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
                                plh_layouts);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                list_del_init(&lo->plh_layouts);
                pnfs_destroy_layout(NFS_I(lo->plh_inode));
        }
}

/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
{
        u32 oldseq, newseq;

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);
        if ((int)(newseq - oldseq) > 0) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                if (update_barrier) {
                        u32 new_barrier = be32_to_cpu(new->seqid);

                        if ((int)(new_barrier - lo->plh_barrier))
                                lo->plh_barrier = new_barrier;
                } else {
                        /* Because of wraparound, we want to keep the barrier
                         * "close" to the current seqids.  It needs to be
                         * within 2**31 to count as "behind", so if it
                         * gets too near that limit, give us a little leeway
                         * and bring it to within 2**30.
                         * NOTE - and yes, this is all unsigned arithmetic.
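                         *
                         * Illustrative example (hypothetical values): with
                         * newseq = 0x90000000 and plh_barrier = 0x10000000,
                         * the difference 0x80000000 exceeds (3 << 29), so
                         * the barrier is pulled forward to 0x50000000.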
                         */
                        if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
                                lo->plh_barrier = newseq - (1 << 30);
                }
        }
}

/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
                        int lget)
{
        if ((stateid) &&
            (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
                return true;
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
                (list_empty(&lo->plh_segs) &&
                 (atomic_read(&lo->plh_outstanding) > lget));
}

int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
                              struct nfs4_state *open_state)
{
        int status = 0;

        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
        if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
                status = -EAGAIN;
        } else if (list_empty(&lo->plh_segs)) {
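                /* No layout segments held yet: the first LAYOUTGET carries
                 * the open stateid, copied under the seqlock so we read a
                 * consistent value.
                 */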
                int seq;

                do {
                        seq = read_seqbegin(&open_state->seqlock);
                        nfs4_stateid_copy(dst, &open_state->stateid);
                } while (read_seqretry(&open_state->seqlock, seq));
        } else
                nfs4_stateid_copy(dst, &lo->plh_stateid);
        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
        return status;
}

/*
* Get layout from server.
*    for now, assume that whole file layouts are requested.
*    arg->offset: 0
*    arg->length: all ones
*/
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           struct pnfs_layout_range *range,
           gfp_t gfp_flags)
{
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg = NULL;

        dprintk("--> %s\n", __func__);

        BUG_ON(ctx == NULL);
        lgp = kzalloc(sizeof(*lgp), gfp_flags);
        if (lgp == NULL)
                return NULL;

        lgp->args.minlength = PAGE_CACHE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        lgp->args.range = *range;
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        lgp->lsegpp = &lseg;
        lgp->gfp_flags = gfp_flags;

        /* Synchronously retrieve layout information from server and
         * store in lseg.
         */
        nfs4_proc_layoutget(lgp, gfp_flags);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
                set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
        }

        return lseg;
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
        struct nfs4_layoutreturn *lrp;
        nfs4_stateid stateid;
        int status = 0, empty;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || pnfs_test_layout_returned(lo)) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        stateid = nfsi->layout->plh_stateid;
        /* Reference matched in nfs4_layoutreturn_release */
        get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
        mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (empty) {
                spin_unlock(&ino->i_lock);
                put_layout_hdr(lo);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out;
        }
        lo->plh_block_lgets++;
        pnfs_mark_layout_returned(lo);
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);

        WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));

        lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags);
                set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags);
                pnfs_clear_layout_returned(lo);
                put_layout_hdr(lo);
                goto out;
        }

        lrp->args.stateid = stateid;
        lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
        lrp->args.inode = ino;
        lrp->args.layout = lo;
        lrp->clp = NFS_SERVER(ino)->nfs_client;

        status = nfs4_proc_layoutreturn(lrp);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

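/*
 * Return-on-close: invalidate any valid lsegs that were marked NFS_LSEG_ROC
 * and block further LAYOUTGETs.  Returns true if such segments were found
 * (and the layout is not subject to a bulk recall); the reference taken here
 * is matched in pnfs_roc_release() below.
 */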
bool pnfs_roc(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *tmp;
        LIST_HEAD(tmp_list);
        bool found = false;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                goto out_nolayout;
        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        mark_lseg_invalid(lseg, &tmp_list);
                        found = true;
                }
        if (!found)
                goto out_nolayout;
        lo->plh_block_lgets++;
        get_layout_hdr(lo); /* matched in pnfs_roc_release */
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        return true;

out_nolayout:
        spin_unlock(&ino->i_lock);
        return false;
}

void pnfs_roc_release(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        lo->plh_block_lgets--;
        put_layout_hdr_locked(lo);
        spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if ((int)(barrier - lo->plh_barrier) > 0)
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
}

bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_segment *lseg;
        bool found = false;

        spin_lock(&ino->i_lock);
        list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        found = true;
                        break;
                }
        if (!found) {
                struct pnfs_layout_hdr *lo = nfsi->layout;
                u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

                /* Since close does not return a layout stateid for use as
                 * a barrier, we choose the worst-case barrier.
                 */
                *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
        }
        spin_unlock(&ino->i_lock);
        return found;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
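 * A negative return sorts l1 before l2: lower offsets first, then longer
 * lengths, then RW before READ.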
 */
static s64
cmp_layout(struct pnfs_layout_range *l1,
           struct pnfs_layout_range *l2)
{
        s64 d;

        /* high offset > low offset */
        d = l1->offset - l2->offset;
        if (d)
                return d;

        /* short length > long length */
        d = l2->length - l1->length;
        if (d)
                return d;

        /* read > read/write */
        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}

static void
pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_segment *lp;

        dprintk("%s:Begin\n", __func__);

        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
                if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
                        __func__, lseg, lseg->pls_range.iomode,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
                goto out;
        }
        list_add_tail(&lseg->pls_list, &lo->plh_segs);
        dprintk("%s: inserted lseg %p "
                "iomode %d offset %llu length %llu at tail\n",
                __func__, lseg, lseg->pls_range.iomode,
                lseg->pls_range.offset, lseg->pls_range.length);
out:
        get_layout_hdr(lo);

        dprintk("%s:Return\n", __func__);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
                      struct nfs_open_context *ctx,
                      gfp_t gfp_flags)
{
        struct pnfs_layout_hdr *lo;

        lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
        INIT_LIST_HEAD(&lo->plh_layouts);
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_recall);
        lo->plh_inode = ino;
        lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
        return lo;
}

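/* Find the inode's layout header, allocating one if none exists.  i_lock is
 * dropped around the allocation, so after relocking we recheck whether a
 * racing thread installed a header first and free our copy if it did.
 */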
static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
                       struct nfs_open_context *ctx,
                       gfp_t gfp_flags)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;

        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

        assert_spin_locked(&ino->i_lock);
        if (nfsi->layout) {
                if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
                        return NULL;
                else
                        return nfsi->layout;
        }
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);

        if (likely(nfsi->layout == NULL))       /* Won the race? */
                nfsi->layout = new;
        else
                pnfs_free_layout_hdr(new);
        return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode       lseg    match
 * -----        -----   -----
 * ANY          READ    true
 * ANY          RW      true
 * RW           READ    false
 * RW           RW      true
 * READ         READ    true
 * READ         RW      true
 */
static int
is_matching_lseg(struct pnfs_layout_range *ls_range,
                 struct pnfs_layout_range *range)
{
        struct pnfs_layout_range range1;

        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
            !lo_seg_intersecting(ls_range, range))
                return 0;

        /* range1 covers only the first byte in the range */
        range1 = *range;
        range1.length = 1;
        return lo_seg_contained(ls_range, &range1);
}

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_range *range)
{
        struct pnfs_layout_segment *lseg, *ret = NULL;

        dprintk("%s:Begin\n", __func__);

        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    is_matching_lseg(&lseg->pls_range, range)) {
                        ret = get_lseg(lseg);
                        break;
                }
                if (lseg->pls_range.offset > range->offset)
                        break;
        }

        dprintk("%s:Return lseg %p ref %d\n",
                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
}

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From rfc 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed both thresholds before sending its read or write
 * requests to the data server.
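 *
 * For example (illustrative values only): with rd_sz = 1MB and
 * rd_io_sz = 64KB both set, a READ of a 4KB file with 16KB of cumulative
 * read_io falls below both thresholds, so the I/O is sent to the MDS.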
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
                                     struct inode *ino, int iomode)
{
        struct nfs4_threshold *t = ctx->mdsthreshold;
        struct nfs_inode *nfsi = NFS_I(ino);
        loff_t fsize = i_size_read(ino);
        bool size = false, size_set = false, io = false, io_set = false, ret = false;

        if (t == NULL)
                return ret;

        dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
                __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

        switch (iomode) {
        case IOMODE_READ:
                if (t->bm & THRESHOLD_RD) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->rd_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_RD_IO) {
                        dprintk("%s nfsi->read_io %llu\n", __func__,
                                nfsi->read_io);
                        io_set = true;
                        if (nfsi->read_io < t->rd_io_sz)
                                io = true;
                }
                break;
        case IOMODE_RW:
                if (t->bm & THRESHOLD_WR) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->wr_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_WR_IO) {
                        dprintk("%s nfsi->write_io %llu\n", __func__,
                                nfsi->write_io);
                        io_set = true;
                        if (nfsi->write_io < t->wr_io_sz)
                                io = true;
                }
                break;
        }
        if (size_set && io_set) {
                if (size && io)
                        ret = true;
        } else if (size || io)
                ret = true;

        dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
        return ret;
}

/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
                   struct nfs_open_context *ctx,
                   loff_t pos,
                   u64 count,
                   enum pnfs_iomode iomode,
                   gfp_t gfp_flags)
{
        struct pnfs_layout_range arg = {
                .iomode = iomode,
                .offset = pos,
                .length = count,
        };
        unsigned pg_offset;
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs_client *clp = server->nfs_client;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg = NULL;
        bool first = false;

        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
                return NULL;

        if (pnfs_within_mdsthreshold(ctx, ino, iomode))
                return NULL;

        spin_lock(&ino->i_lock);
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
                goto out_unlock;
        }

        /* Do we even need to bother with this? */
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                dprintk("%s matches recall, use MDS\n", __func__);
                goto out_unlock;
        }

        /* if LAYOUTGET already failed once we don't try again */
        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
                goto out_unlock;

        /* Check to see if the layout for the given range already exists */
        lseg = pnfs_find_lseg(lo, &arg);
        if (lseg)
                goto out_unlock;

        if (pnfs_layoutgets_blocked(lo, NULL, 0))
                goto out_unlock;
        atomic_inc(&lo->plh_outstanding);

        get_layout_hdr(lo);
        if (list_empty(&lo->plh_segs))
                first = true;

        /* Enable LAYOUTRETURNs */
        pnfs_clear_layout_returned(lo);

        spin_unlock(&ino->i_lock);
        if (first) {
                /* The lo must be on the clp list if there is any
                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
                 */
                spin_lock(&clp->cl_lock);
                BUG_ON(!list_empty(&lo->plh_layouts));
                list_add_tail(&lo->plh_layouts, &server->layouts);
                spin_unlock(&clp->cl_lock);
        }

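        /* Round the requested range out to page boundaries; a length of
         * NFS4_MAX_UINT64 already means "to EOF" and is left alone.
         */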
        pg_offset = arg.offset & ~PAGE_CACHE_MASK;
        if (pg_offset) {
                arg.offset -= pg_offset;
                arg.length += pg_offset;
        }
        if (arg.length != NFS4_MAX_UINT64)
                arg.length = PAGE_CACHE_ALIGN(arg.length);

        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
        if (!lseg && first) {
                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        atomic_dec(&lo->plh_outstanding);
        put_layout_hdr(lo);
out:
        dprintk("%s end, state 0x%lx lseg %p\n", __func__,
                nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
        return lseg;
out_unlock:
        spin_unlock(&ino->i_lock);
        goto out;
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);

int
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
        struct nfs4_layoutget_res *res = &lgp->res;
        struct pnfs_layout_segment *lseg;
        struct inode *ino = lo->plh_inode;
        int status = 0;

        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
        if (!lseg || IS_ERR(lseg)) {
                if (!lseg)
                        status = -ENOMEM;
                else
                        status = PTR_ERR(lseg);
                dprintk("%s: Could not allocate layout: error %d\n",
                       __func__, status);
                goto out;
        }

        spin_lock(&ino->i_lock);
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                dprintk("%s forget reply due to recall\n", __func__);
                goto out_forget_reply;
        }

        if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
                dprintk("%s forget reply due to state\n", __func__);
                goto out_forget_reply;
        }
        init_lseg(lo, lseg);
        lseg->pls_range = res->range;
        *lgp->lsegpp = get_lseg(lseg);
        pnfs_insert_layout(lo, lseg);

        if (res->return_on_close) {
                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
                set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
        }

        /* Done processing layoutget. Set the layout stateid */
        pnfs_set_layout_stateid(lo, &res->stateid, false);
        spin_unlock(&ino->i_lock);
out:
        return status;

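/*
 * "Forget" the reply: free the lseg we just allocated as though the
 * LAYOUTGET had never succeeded, rather than install state that conflicts
 * with a recall or an out-of-date stateid.
 */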
out_forget_reply:
        spin_unlock(&ino->i_lock);
        lseg->pls_layout = lo;
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
        goto out;
}

void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
        BUG_ON(pgio->pg_lseg != NULL);

        if (req->wb_offset != req->wb_pgbase) {
                nfs_pageio_reset_read_mds(pgio);
                return;
        }
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           req_offset(req),
                                           req->wb_bytes,
                                           IOMODE_READ,
                                           GFP_KERNEL);
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                nfs_pageio_reset_read_mds(pgio);

}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);

void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
        BUG_ON(pgio->pg_lseg != NULL);

        if (req->wb_offset != req->wb_pgbase) {
                nfs_pageio_reset_write_mds(pgio);
                return;
        }
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           req_offset(req),
                                           req->wb_bytes,
                                           IOMODE_RW,
                                           GFP_NOFS);
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                nfs_pageio_reset_write_mds(pgio);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);

void
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
                      const struct nfs_pgio_completion_ops *compl_ops)
{
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (ld == NULL)
                nfs_pageio_init_read(pgio, inode, compl_ops);
        else
                nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0);
}

void
pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
                       int ioflags,
                       const struct nfs_pgio_completion_ops *compl_ops)
{
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (ld == NULL)
                nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
        else
                nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
}

bool
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
                     struct nfs_page *req)
{
        if (pgio->pg_lseg == NULL)
                return nfs_generic_pg_test(pgio, prev, req);

        /*
         * Test if an nfs_page is fully contained in the pnfs_layout_range.
         * Note that this test makes several assumptions:
         * - that the previous nfs_page in the struct nfs_pageio_descriptor
         *   is known to lie within the range.
         * - that the nfs_page being tested is known to be contiguous with the
         *   previous nfs_page.
         * - Layout ranges are page aligned, so we only have to test the
         *   start offset of the request.
         *
         * Please also note that 'end_offset' is actually the offset of the
         * first byte that lies outside the pnfs_layout_range. FIXME?
         *
         */
        return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
                                         pgio->pg_lseg->pls_range.length);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

int pnfs_write_done_resend_to_mds(struct inode *inode,
                                struct list_head *head,
                                const struct nfs_pgio_completion_ops *compl_ops)
{
        struct nfs_pageio_descriptor pgio;
        LIST_HEAD(failed);

        /* Resend all requests through the MDS */
        nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
        while (!list_empty(head)) {
                struct nfs_page *req = nfs_list_entry(head->next);

                nfs_list_remove_request(req);
                if (!nfs_pageio_add_request(&pgio, req))
                        nfs_list_add_request(req, &failed);
        }
        nfs_pageio_complete(&pgio);

        if (!list_empty(&failed)) {
                /* For some reason our attempt to resend pages failed. Mark the
                 * overall send request as having failed, and let
                 * nfs_writeback_release_full deal with the error.
                 */
                list_move(&failed, head);
                return -EIO;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);

static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
{
        struct nfs_pgio_header *hdr = data->header;

        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
            PNFS_LAYOUTRET_ON_ERROR) {
                clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
                pnfs_return_layout(hdr->inode);
        }
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
                data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
                                                        &hdr->pages,
                                                        hdr->completion_ops);
}

/*
 * Called by non rpc-based layout drivers
 */
void pnfs_ld_write_done(struct nfs_write_data *data)
{
        struct nfs_pgio_header *hdr = data->header;

        if (!hdr->pnfs_error) {
                pnfs_set_layoutcommit(data);
                hdr->mds_ops->rpc_call_done(&data->task, data);
        } else
                pnfs_ld_handle_write_error(data);
        hdr->mds_ops->rpc_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);

static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
                struct nfs_write_data *data)
{
        struct nfs_pgio_header *hdr = data->header;

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                list_splice_tail_init(&hdr->pages, &desc->pg_list);
                nfs_pageio_reset_write_mds(desc);
                desc->pg_recoalesce = 1;
        }
        nfs_writedata_release(data);
}

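/*
 * Hand a write off to the layout driver.  A return of PNFS_NOT_ATTEMPTED
 * means the caller should fall back to writing through the MDS.
 */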
1325static enum pnfs_try_status
1326pnfs_try_to_write_data(struct nfs_write_data *wdata,
1327                        const struct rpc_call_ops *call_ops,
1328                        struct pnfs_layout_segment *lseg,
1329                        int how)
1330{
1331        struct nfs_pgio_header *hdr = wdata->header;
1332        struct inode *inode = hdr->inode;
1333        enum pnfs_try_status trypnfs;
1334        struct nfs_server *nfss = NFS_SERVER(inode);
1335
1336        hdr->mds_ops = call_ops;
1337
1338        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1339                inode->i_ino, wdata->args.count, wdata->args.offset, how);
1340        trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
1341        if (trypnfs != PNFS_NOT_ATTEMPTED)
1342                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
1343        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1344        return trypnfs;
1345}
1346
1347static void
1348pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
1349{
1350        struct nfs_write_data *data;
1351        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1352        struct pnfs_layout_segment *lseg = desc->pg_lseg;
1353
1354        desc->pg_lseg = NULL;
1355        while (!list_empty(head)) {
1356                enum pnfs_try_status trypnfs;
1357
1358                data = list_first_entry(head, struct nfs_write_data, list);
1359                list_del_init(&data->list);
1360
1361                trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
1362                if (trypnfs == PNFS_NOT_ATTEMPTED)
1363                        pnfs_write_through_mds(desc, data);
1364        }
1365        put_lseg(lseg);
1366}
1367
1368static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1369{
1370        put_lseg(hdr->lseg);
1371        nfs_writehdr_free(hdr);
1372}
1373EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1374
1375int
1376pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1377{
1378        struct nfs_write_header *whdr;
1379        struct nfs_pgio_header *hdr;
1380        int ret;
1381
1382        whdr = nfs_writehdr_alloc();
1383        if (!whdr) {
1384                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1385                put_lseg(desc->pg_lseg);
1386                desc->pg_lseg = NULL;
1387                return -ENOMEM;
1388        }
1389        hdr = &whdr->header;
1390        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1391        hdr->lseg = get_lseg(desc->pg_lseg);
1392        atomic_inc(&hdr->refcnt);
1393        ret = nfs_generic_flush(desc, hdr);
1394        if (ret != 0) {
1395                put_lseg(desc->pg_lseg);
1396                desc->pg_lseg = NULL;
1397        } else
1398                pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
1399        if (atomic_dec_and_test(&hdr->refcnt))
1400                hdr->completion_ops->completion(hdr);
1401        return ret;
1402}
1403EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1404
1405int pnfs_read_done_resend_to_mds(struct inode *inode,
1406                                struct list_head *head,
1407                                const struct nfs_pgio_completion_ops *compl_ops)
1408{
1409        struct nfs_pageio_descriptor pgio;
1410        LIST_HEAD(failed);
1411
1412        /* Resend all requests through the MDS */
1413        nfs_pageio_init_read(&pgio, inode, compl_ops);
1414        while (!list_empty(head)) {
1415                struct nfs_page *req = nfs_list_entry(head->next);
1416
1417                nfs_list_remove_request(req);
1418                if (!nfs_pageio_add_request(&pgio, req))
1419                        nfs_list_add_request(req, &failed);
1420        }
1421        nfs_pageio_complete(&pgio);
1422
1423        if (!list_empty(&failed)) {
1424                list_move(&failed, head);
1425                return -EIO;
1426        }
1427        return 0;
1428}
1429EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
1430
1431static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1432{
1433        struct nfs_pgio_header *hdr = data->header;
1434
1435        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
1436        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1437            PNFS_LAYOUTRET_ON_ERROR) {
1438                clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
1439                pnfs_return_layout(hdr->inode);
1440        }
1441        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1442                data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
1443                                                        &hdr->pages,
1444                                                        hdr->completion_ops);
1445}
1446
1447/*
1448 * Called by non rpc-based layout drivers
1449 */
1450void pnfs_ld_read_done(struct nfs_read_data *data)
1451{
1452        struct nfs_pgio_header *hdr = data->header;
1453
1454        if (likely(!hdr->pnfs_error)) {
1455                __nfs4_read_done_cb(data);
1456                hdr->mds_ops->rpc_call_done(&data->task, data);
1457        } else
1458                pnfs_ld_handle_read_error(data);
1459        hdr->mds_ops->rpc_release(data);
1460}
1461EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1462
1463static void
1464pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1465                struct nfs_read_data *data)
1466{
1467        struct nfs_pgio_header *hdr = data->header;
1468
1469        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1470                list_splice_tail_init(&hdr->pages, &desc->pg_list);
1471                nfs_pageio_reset_read_mds(desc);
1472                desc->pg_recoalesce = 1;
1473        }
1474        nfs_readdata_release(data);
1475}
1476
1477/*
1478 * Call the appropriate parallel I/O subsystem read function.
1479 */
1480static enum pnfs_try_status
1481pnfs_try_to_read_data(struct nfs_read_data *rdata,
1482                       const struct rpc_call_ops *call_ops,
1483                       struct pnfs_layout_segment *lseg)
1484{
1485        struct nfs_pgio_header *hdr = rdata->header;
1486        struct inode *inode = hdr->inode;
1487        struct nfs_server *nfss = NFS_SERVER(inode);
1488        enum pnfs_try_status trypnfs;
1489
1490        hdr->mds_ops = call_ops;
1491
1492        dprintk("%s: Reading ino:%lu %u@%llu\n",
1493                __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
1494
1495        trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
1496        if (trypnfs != PNFS_NOT_ATTEMPTED)
1497                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
1498        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1499        return trypnfs;
1500}
1501
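/*
 * A minimal sketch (illustrative only, not part of the original file) of the
 * general shape of a layout driver's ->read_pagelist() hook as
 * pnfs_try_to_read_data() above expects it.  Returning PNFS_NOT_ATTEMPTED
 * makes the core fall back to the MDS via pnfs_read_through_mds(); any other
 * return counts as a pNFS attempt.  my_ld_can_serve() and my_ld_issue_read()
 * are hypothetical.
 */
static enum pnfs_try_status my_ld_read_pagelist(struct nfs_read_data *rdata)
{
	struct pnfs_layout_segment *lseg = rdata->header->lseg;

	if (!my_ld_can_serve(lseg, rdata->args.offset, rdata->args.count))
		return PNFS_NOT_ATTEMPTED;	/* let the MDS handle it */

	my_ld_issue_read(rdata);	/* completes via pnfs_ld_read_done() */
	return PNFS_ATTEMPTED;
}
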
1502static void
1503pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
1504{
1505        struct nfs_read_data *data;
1506        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1507        struct pnfs_layout_segment *lseg = desc->pg_lseg;
1508
1509        desc->pg_lseg = NULL;
1510        while (!list_empty(head)) {
1511                enum pnfs_try_status trypnfs;
1512
1513                data = list_first_entry(head, struct nfs_read_data, list);
1514                list_del_init(&data->list);
1515
1516                trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
1517                if (trypnfs == PNFS_NOT_ATTEMPTED)
1518                        pnfs_read_through_mds(desc, data);
1519        }
1520        put_lseg(lseg);
1521}
1522
1523static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
1524{
1525        put_lseg(hdr->lseg);
1526        nfs_readhdr_free(hdr);
1527}
1528EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1529
1530int
1531pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1532{
1533        struct nfs_read_header *rhdr;
1534        struct nfs_pgio_header *hdr;
1535        int ret;
1536
1537        rhdr = nfs_readhdr_alloc();
1538        if (!rhdr) {
1539                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1540                ret = -ENOMEM;
1541                put_lseg(desc->pg_lseg);
1542                desc->pg_lseg = NULL;
1543                return ret;
1544        }
1545        hdr = &rhdr->header;
1546        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1547        hdr->lseg = get_lseg(desc->pg_lseg);
1548        atomic_inc(&hdr->refcnt);
1549        ret = nfs_generic_pagein(desc, hdr);
1550        if (ret != 0) {
1551                put_lseg(desc->pg_lseg);
1552                desc->pg_lseg = NULL;
1553        } else
1554                pnfs_do_multiple_reads(desc, &hdr->rpc_list);
1555        if (atomic_dec_and_test(&hdr->refcnt))
1556                hdr->completion_ops->completion(hdr);
1557        return ret;
1558}
1559EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
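
/*
 * A minimal sketch (illustrative only, not part of the original file):
 * pnfs_generic_pg_readpages() is normally wired up as the ->pg_doio hook of a
 * layout driver's read-side nfs_pageio_ops.  The generic pg_init/pg_test
 * helpers named below are assumed to be the ones exported elsewhere by this
 * file; a driver may supply its own instead.
 */
static const struct nfs_pageio_ops my_ld_pg_read_ops = {
	.pg_init	= pnfs_generic_pg_init_read,
	.pg_test	= pnfs_generic_pg_test,
	.pg_doio	= pnfs_generic_pg_readpages,
};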
1560
1561/*
1562 * There can be multiple RW segments.
1563 */
1564static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1565{
1566        struct pnfs_layout_segment *lseg;
1567
1568        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1569                if (lseg->pls_range.iomode == IOMODE_RW &&
1570                    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1571                        list_add(&lseg->pls_lc_list, listp);
1572        }
1573}
1574
1575void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1576{
1577        if (lseg->pls_range.iomode == IOMODE_RW) {
1578                dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
1579                set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
1580        } else {
1581                dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
1582                set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
1583        }
1584}
1585EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
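
/*
 * A minimal sketch (illustrative only, not part of the original file): a
 * layout driver typically calls pnfs_set_lo_fail() when a data server becomes
 * unusable, so that subsequent I/O for that iomode is routed through the MDS.
 * my_ld_handle_ds_error() is hypothetical.
 */
static void my_ld_handle_ds_error(struct nfs_pgio_header *hdr)
{
	pnfs_set_lo_fail(hdr->lseg);	/* sets the per-iomode fail bit */
	hdr->pnfs_error = -EIO;		/* resend through the MDS on completion */
}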
1586
1587void
1588pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1589{
1590        struct nfs_pgio_header *hdr = wdata->header;
1591        struct inode *inode = hdr->inode;
1592        struct nfs_inode *nfsi = NFS_I(inode);
1593        loff_t end_pos = wdata->mds_offset + wdata->res.count;
1594        bool mark_as_dirty = false;
1595
1596        spin_lock(&inode->i_lock);
1597        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1598                mark_as_dirty = true;
1599                dprintk("%s: Set layoutcommit for inode %lu\n",
1600                        __func__, inode->i_ino);
1601        }
1602        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
1603                /* references matched in nfs4_layoutcommit_release */
1604                get_lseg(hdr->lseg);
1605        }
1606        if (end_pos > nfsi->layout->plh_lwb)
1607                nfsi->layout->plh_lwb = end_pos;
1608        spin_unlock(&inode->i_lock);
1609        dprintk("%s: lseg %p end_pos %llu\n",
1610                __func__, hdr->lseg, nfsi->layout->plh_lwb);
1611
1612        /* If pnfs_layoutcommit_inode() runs after i_lock is dropped above, the
1613         * next call will be a no-op because NFS_INO_LAYOUTCOMMIT will not be set */
1614        if (mark_as_dirty)
1615                mark_inode_dirty_sync(inode);
1616}
1617EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
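
/*
 * A minimal sketch (illustrative only, not part of the original file): a
 * layout driver that wrote through the data servers calls
 * pnfs_set_layoutcommit() from its write completion path so that a
 * LAYOUTCOMMIT is sent later.  Skipping the call when the data server already
 * replied NFS_FILE_SYNC follows the rule described in the comment below;
 * my_ld_write_done() is a hypothetical wrapper.
 */
static void my_ld_write_done(struct nfs_write_data *wdata)
{
	if (wdata->res.verf->committed != NFS_FILE_SYNC)
		pnfs_set_layoutcommit(wdata);
	pnfs_ld_write_done(wdata);
}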
1618
1619void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1620{
1621        struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1622
1623        if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1624                nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1625}
1626
1627/*
1628 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1629 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
1630 * data to disk to allow the server to recover the data if it crashes.
1631 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
1632 * is off and either a COMMIT is sent to a data server or
1633 * WRITEs to a data server return NFS_DATA_SYNC.
1634 */
1635int
1636pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1637{
1638        struct nfs4_layoutcommit_data *data;
1639        struct nfs_inode *nfsi = NFS_I(inode);
1640        loff_t end_pos;
1641        int status = 0;
1642
1643        dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
1644
1645        if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1646                return 0;
1647
1648        /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1649        data = kzalloc(sizeof(*data), GFP_NOFS);
1650        if (!data) {
1651                status = -ENOMEM;
1652                goto out;
1653        }
1654
1655        if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1656                goto out_free;
1657
1658        if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1659                if (!sync) {
1660                        status = -EAGAIN;
1661                        goto out_free;
1662                }
1663                status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1664                                        nfs_wait_bit_killable, TASK_KILLABLE);
1665                if (status)
1666                        goto out_free;
1667        }
1668
1669        INIT_LIST_HEAD(&data->lseg_list);
1670        spin_lock(&inode->i_lock);
1671        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1672                clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1673                spin_unlock(&inode->i_lock);
1674                wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1675                goto out_free;
1676        }
1677
1678        pnfs_list_write_lseg(inode, &data->lseg_list);
1679
1680        end_pos = nfsi->layout->plh_lwb;
1681        nfsi->layout->plh_lwb = 0;
1682
1683        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
1684        spin_unlock(&inode->i_lock);
1685
1686        data->args.inode = inode;
1687        data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1688        nfs_fattr_init(&data->fattr);
1689        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1690        data->res.fattr = &data->fattr;
1691        data->args.lastbytewritten = end_pos - 1;
1692        data->res.server = NFS_SERVER(inode);
1693
1694        status = nfs4_proc_layoutcommit(data, sync);
1695out:
1696        if (status)
1697                mark_inode_dirty_sync(inode);
1698        dprintk("<-- %s status %d\n", __func__, status);
1699        return status;
1700out_free:
1701        kfree(data);
1702        goto out;
1703}
1704
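/*
 * A minimal sketch (illustrative only, not part of the original file):
 * pnfs_layoutcommit_inode() is driven from the inode writeback/fsync paths.
 * A hypothetical caller flushing an inode might look like this; note that
 * with sync == false a LAYOUTCOMMIT already in flight makes the call return
 * -EAGAIN rather than block, per the logic above.
 */
static int my_flush_layoutcommit(struct inode *inode, bool sync)
{
	/* Sends LAYOUTCOMMIT if NFS_INO_LAYOUTCOMMIT is set; otherwise a no-op */
	return pnfs_layoutcommit_inode(inode, sync);
}
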
1705struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1706{
1707        struct nfs4_threshold *thp;
1708
1709        thp = kzalloc(sizeof(*thp), GFP_NOFS);
1710        if (!thp) {
1711                dprintk("%s mdsthreshold allocation failed\n", __func__);
1712                return NULL;
1713        }
1714        return thp;
1715}
1716