darwin-xnu/bsd/nfs/nfs_socket.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3 *
   4 * @APPLE_LICENSE_HEADER_START@
   5 * 
   6 * The contents of this file constitute Original Code as defined in and
   7 * are subject to the Apple Public Source License Version 1.1 (the
   8 * "License").  You may not use this file except in compliance with the
   9 * License.  Please obtain a copy of the License at
  10 * http://www.apple.com/publicsource and read it before using this file.
  11 * 
  12 * This Original Code and all software distributed under the License are
  13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17 * License for the specific language governing rights and limitations
  18 * under the License.
  19 * 
  20 * @APPLE_LICENSE_HEADER_END@
  21 */
  22/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23/*
  24 * Copyright (c) 1989, 1991, 1993, 1995
  25 *      The Regents of the University of California.  All rights reserved.
  26 *
  27 * This code is derived from software contributed to Berkeley by
  28 * Rick Macklem at The University of Guelph.
  29 *
  30 * Redistribution and use in source and binary forms, with or without
  31 * modification, are permitted provided that the following conditions
  32 * are met:
  33 * 1. Redistributions of source code must retain the above copyright
  34 *    notice, this list of conditions and the following disclaimer.
  35 * 2. Redistributions in binary form must reproduce the above copyright
  36 *    notice, this list of conditions and the following disclaimer in the
  37 *    documentation and/or other materials provided with the distribution.
  38 * 3. All advertising materials mentioning features or use of this software
  39 *    must display the following acknowledgement:
  40 *      This product includes software developed by the University of
  41 *      California, Berkeley and its contributors.
  42 * 4. Neither the name of the University nor the names of its contributors
  43 *    may be used to endorse or promote products derived from this software
  44 *    without specific prior written permission.
  45 *
  46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  49 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  56 * SUCH DAMAGE.
  57 *
  58 *      @(#)nfs_socket.c        8.5 (Berkeley) 3/30/95
  59 * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $
  60 */
  61
  62/*
  63 * Socket operations for use by nfs
  64 */
  65
  66#include <sys/param.h>
  67#include <sys/systm.h>
  68#include <sys/proc.h>
  69#include <sys/kauth.h>
  70#include <sys/mount_internal.h>
  71#include <sys/kernel.h>
  72#include <sys/kpi_mbuf.h>
  73#include <sys/malloc.h>
  74#include <sys/vnode.h>
  75#include <sys/domain.h>
  76#include <sys/protosw.h>
  77#include <sys/socket.h>
  78#include <sys/syslog.h>
  79#include <sys/tprintf.h>
  80#include <sys/uio_internal.h>
  81#include <libkern/OSAtomic.h>
  82
  83#include <sys/time.h>
  84#include <kern/clock.h>
  85#include <kern/task.h>
  86#include <kern/thread.h>
  87#include <sys/user.h>
  88
  89#include <netinet/in.h>
  90#include <netinet/tcp.h>
  91
  92#include <nfs/rpcv2.h>
  93#include <nfs/nfsproto.h>
  94#include <nfs/nfs.h>
  95#include <nfs/xdr_subs.h>
  96#include <nfs/nfsm_subs.h>
  97#include <nfs/nfsmount.h>
  98#include <nfs/nfsnode.h>
  99#include <nfs/nfsrtt.h>
 100
 101#include <sys/kdebug.h>
 102
 103#define FSDBG(A, B, C, D, E) \
 104        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
 105                (int)(B), (int)(C), (int)(D), (int)(E), 0)
 106#define FSDBG_TOP(A, B, C, D, E) \
 107        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
 108                (int)(B), (int)(C), (int)(D), (int)(E), 0)
 109#define FSDBG_BOT(A, B, C, D, E) \
 110        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
 111                (int)(B), (int)(C), (int)(D), (int)(E), 0)
 112
 113/*
 114 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
 115 * Use the mean and mean deviation of rtt for the appropriate type of rpc
 116 * for the frequent rpcs and a default for the others.
 117 * The justification for doing "other" this way is that these rpcs
 118 * happen so infrequently that timer est. would probably be stale.
 119 * Also, since many of these rpcs are
 120 * non-idempotent, a conservative timeout is desired.
 121 * getattr, lookup - A+2D
 122 * read, write     - A+4D
 123 * other           - nm_timeo
 124 */
 125#define NFS_RTO(n, t) \
 126        ((t) == 0 ? (n)->nm_timeo : \
 127         ((t) < 3 ? \
 128          (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
 129          ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
 130#define NFS_SRTT(r)     (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
 131#define NFS_SDRTT(r)    (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
 132/*
 133 * External data, mostly RPC constants in XDR form
 134 */
 135extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
 136        rpc_msgaccepted, rpc_call, rpc_autherr,
 137        rpc_auth_kerb;
 138extern u_long nfs_prog;
 139extern struct nfsstats nfsstats;
 140extern int nfsv3_procid[NFS_NPROCS];
 141extern int nfs_ticks;
 142extern u_long nfs_xidwrap;
 143
 144/*
 145 * Defines which timer to use for the procnum.
 146 * 0 - default
 147 * 1 - getattr
 148 * 2 - lookup
 149 * 3 - read
 150 * 4 - write
 151 */
 152static int proct[NFS_NPROCS] = {
 153        0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0
 154};
 155
 156/*
 157 * There is a congestion window for outstanding rpcs maintained per mount
 158 * point. The cwnd size is adjusted in roughly the way that:
 159 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 160 * SIGCOMM '88". ACM, August 1988.
 161 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 162 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 163 * of rpcs is in progress.
 164 * (The sent count and cwnd are scaled for integer arith.)
 165 * Variants of "slow start" were tried and were found to be too much of a
 166 * performance hit (ave. rtt 3 times larger),
 167 * I suspect due to the large rtt that nfs rpcs have.
 168 */
 169#define NFS_CWNDSCALE   256
 170#define NFS_MAXCWND     (NFS_CWNDSCALE * 32)
 171static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
 172int nfsrtton = 0;
 173struct nfsrtt nfsrtt;
 174
 175static int      nfs_rcvlock(struct nfsreq *);
 176static void     nfs_rcvunlock(struct nfsreq *);
 177static int      nfs_receive(struct nfsreq *rep, mbuf_t *mp);
 178static int      nfs_reconnect(struct nfsreq *rep);
 179static void     nfs_repdequeue(struct nfsreq *rep);
 180
 181/* XXX */
 182boolean_t       current_thread_aborted(void);
 183kern_return_t   thread_terminate(thread_t);
 184
 185#ifndef NFS_NOSERVER 
 186static int      nfsrv_getstream(struct nfssvc_sock *,int);
 187
 188int (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *nd,
 189                                    struct nfssvc_sock *slp,
 190                                    proc_t procp,
 191                                    mbuf_t *mreqp) = {
 192        nfsrv_null,
 193        nfsrv_getattr,
 194        nfsrv_setattr,
 195        nfsrv_lookup,
 196        nfsrv3_access,
 197        nfsrv_readlink,
 198        nfsrv_read,
 199        nfsrv_write,
 200        nfsrv_create,
 201        nfsrv_mkdir,
 202        nfsrv_symlink,
 203        nfsrv_mknod,
 204        nfsrv_remove,
 205        nfsrv_rmdir,
 206        nfsrv_rename,
 207        nfsrv_link,
 208        nfsrv_readdir,
 209        nfsrv_readdirplus,
 210        nfsrv_statfs,
 211        nfsrv_fsinfo,
 212        nfsrv_pathconf,
 213        nfsrv_commit,
 214        nfsrv_noop
 215};
 216#endif /* NFS_NOSERVER */
 217
 218
 219/*
 220 * attempt to bind a socket to a reserved port
 221 */
 222static int
 223nfs_bind_resv(struct nfsmount *nmp)
 224{
 225        socket_t so = nmp->nm_so;
 226        struct sockaddr_in sin;
 227        int error;
 228        u_short tport;
 229
 230        if (!so)
 231                return (EINVAL);
 232
 233        sin.sin_len = sizeof (struct sockaddr_in);
 234        sin.sin_family = AF_INET;
 235        sin.sin_addr.s_addr = INADDR_ANY;
 236        tport = IPPORT_RESERVED - 1;
 237        sin.sin_port = htons(tport);
 238
 239        while (((error = sock_bind(so, (struct sockaddr *) &sin)) == EADDRINUSE) &&
 240               (--tport > IPPORT_RESERVED / 2))
 241                sin.sin_port = htons(tport);
 242        return (error);
 243}
 244
 245/*
 246 * variables for managing the nfs_bind_resv_thread
 247 */
 248int nfs_resv_mounts = 0;
 249static int nfs_bind_resv_thread_state = 0;
 250#define NFS_BIND_RESV_THREAD_STATE_INITTED      1
 251#define NFS_BIND_RESV_THREAD_STATE_RUNNING      2
 252lck_grp_t *nfs_bind_resv_lck_grp;
 253lck_grp_attr_t *nfs_bind_resv_lck_grp_attr;
 254lck_attr_t *nfs_bind_resv_lck_attr;
 255lck_mtx_t *nfs_bind_resv_mutex;
 256struct nfs_bind_resv_request {
 257        TAILQ_ENTRY(nfs_bind_resv_request) brr_chain;
 258        struct nfsmount *brr_nmp;
 259        int brr_error;
 260};
 261static TAILQ_HEAD(, nfs_bind_resv_request) nfs_bind_resv_request_queue;
 262
 263/*
 264 * thread to handle any reserved port bind requests
 265 */
 266static void
 267nfs_bind_resv_thread(void)
 268{
 269        struct nfs_bind_resv_request *brreq;
 270
 271        nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
 272
 273        while (nfs_resv_mounts > 0) {
 274                lck_mtx_lock(nfs_bind_resv_mutex);
 275                while ((brreq = TAILQ_FIRST(&nfs_bind_resv_request_queue))) {
 276                        TAILQ_REMOVE(&nfs_bind_resv_request_queue, brreq, brr_chain);
 277                        lck_mtx_unlock(nfs_bind_resv_mutex);
 278                        brreq->brr_error = nfs_bind_resv(brreq->brr_nmp);
 279                        wakeup(brreq);
 280                        lck_mtx_lock(nfs_bind_resv_mutex);
 281                }
 282                msleep((caddr_t)&nfs_bind_resv_request_queue,
 283                                nfs_bind_resv_mutex, PSOCK | PDROP,
 284                                "nfs_bind_resv_request_queue", 0);
 285        }
 286
 287        nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
 288        (void) thread_terminate(current_thread());
 289}
 290
 291int
 292nfs_bind_resv_thread_wake(void)
 293{
 294        if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING)
 295                return (EIO);
 296        wakeup(&nfs_bind_resv_request_queue);
 297        return (0);
 298}
 299
 300/*
 301 * underprivileged procs call this to request nfs_bind_resv_thread
 302 * to perform the reserved port binding for them.
 303 */
 304static int
 305nfs_bind_resv_nopriv(struct nfsmount *nmp)
 306{
 307        struct nfs_bind_resv_request brreq;
 308        int error;
 309
 310        if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_RUNNING) {
 311                if (nfs_bind_resv_thread_state < NFS_BIND_RESV_THREAD_STATE_INITTED) {
 312                        nfs_bind_resv_lck_grp_attr = lck_grp_attr_alloc_init();
 313                        lck_grp_attr_setstat(nfs_bind_resv_lck_grp_attr);
 314                        nfs_bind_resv_lck_grp = lck_grp_alloc_init("nfs_bind_resv", nfs_bind_resv_lck_grp_attr);
 315                        nfs_bind_resv_lck_attr = lck_attr_alloc_init();
 316                        nfs_bind_resv_mutex = lck_mtx_alloc_init(nfs_bind_resv_lck_grp, nfs_bind_resv_lck_attr);
 317                        TAILQ_INIT(&nfs_bind_resv_request_queue);
 318                        nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_INITTED;
 319                }
 320                kernel_thread(kernel_task, nfs_bind_resv_thread);
 321                nfs_bind_resv_thread_state = NFS_BIND_RESV_THREAD_STATE_RUNNING;
 322        }
 323
 324        brreq.brr_nmp = nmp;
 325        brreq.brr_error = 0;
 326
 327        lck_mtx_lock(nfs_bind_resv_mutex);
 328        TAILQ_INSERT_TAIL(&nfs_bind_resv_request_queue, &brreq, brr_chain);
 329        lck_mtx_unlock(nfs_bind_resv_mutex);
 330
 331        error = nfs_bind_resv_thread_wake();
 332        if (error) {
 333                TAILQ_REMOVE(&nfs_bind_resv_request_queue, &brreq, brr_chain);
 334                /* Note: we might be able to simply restart the thread */
 335                return (error);
 336        }
 337
 338        tsleep((caddr_t)&brreq, PSOCK, "nfsbindresv", 0);
 339
 340        return (brreq.brr_error);
 341}
 342
 343/*
 344 * Initialize sockets and congestion for a new NFS connection.
 345 * We do not free the sockaddr if error.
 346 */
 347int
 348nfs_connect(
 349        struct nfsmount *nmp,
 350        __unused struct nfsreq *rep)
 351{
 352        socket_t so;
 353        int error, rcvreserve, sndreserve;
 354        struct sockaddr *saddr;
 355        struct timeval timeo;
 356
 357        nmp->nm_so = 0;
 358        saddr = mbuf_data(nmp->nm_nam);
 359        error = sock_socket(saddr->sa_family, nmp->nm_sotype,
 360                                                nmp->nm_soproto, 0, 0, &nmp->nm_so);
 361        if (error) {
 362                goto bad;
 363        }
 364        so = nmp->nm_so;
 365
 366        /*
 367         * Some servers require that the client port be a reserved port number.
 368         */
 369        if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
 370                proc_t p;
 371                /*
 372                 * sobind() requires current_proc() to have superuser privs.
 373                 * If this bind is part of a reconnect, and the current proc
 374                 * doesn't have superuser privs, we hand the sobind() off to
 375                 * a kernel thread to process.
 376                 */
 377                if ((nmp->nm_state & NFSSTA_MOUNTED) &&
 378                    (p = current_proc()) && suser(kauth_cred_get(), 0)) {
 379                        /* request nfs_bind_resv_thread() to do bind */
 380                        error = nfs_bind_resv_nopriv(nmp);
 381                } else {
 382                        error = nfs_bind_resv(nmp);
 383                }
 384                if (error)
 385                        goto bad;
 386        }
 387
 388        /*
 389         * Protocols that do not require connections may be optionally left
 390         * unconnected for servers that reply from a port other than NFS_PORT.
 391         */
 392        if (nmp->nm_flag & NFSMNT_NOCONN) {
 393                if (nmp->nm_sotype == SOCK_STREAM) {
 394                        error = ENOTCONN;
 395                        goto bad;
 396                }
 397        } else {
 398                struct timeval  tv;
 399                tv.tv_sec = 2;
 400                tv.tv_usec = 0;
 401                error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT);
 402                if (error && error != EINPROGRESS) {
 403                        goto bad;
 404                }
 405                
 406                while ((error = sock_connectwait(so, &tv)) == EINPROGRESS) {
 407                        if (rep && (error = nfs_sigintr(nmp, rep, rep->r_procp))) {
 408                                goto bad;
 409                        }
 410                }
 411        }
 412        
 413        /*
 414         * Always time out on recieve, this allows us to reconnect the
 415         * socket to deal with network changes.
 416         */
 417        timeo.tv_usec = 0;
 418        timeo.tv_sec = 2;
 419        error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
 420        if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) {
 421                timeo.tv_sec = 5;
 422        } else {
 423                timeo.tv_sec = 0;
 424        }
 425        error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
 426        
 427        if (nmp->nm_sotype == SOCK_DGRAM) {
 428                sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
 429                rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
 430                        (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 431        } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
 432                sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 3;
 433                rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) *
 434                        (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 435        } else {
 436                int proto;
 437                int on = 1;
 438                
 439                sock_gettype(so, NULL, NULL, &proto);
 440                if (nmp->nm_sotype != SOCK_STREAM)
 441                        panic("nfscon sotype");
 442
 443                // Assume that SOCK_STREAM always requires a connection
 444                sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 445                
 446                if (proto == IPPROTO_TCP) {
 447                        sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
 448                }
 449
 450                sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) * 3;
 451                rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) *
 452                                (nmp->nm_readahead > 0 ? nmp->nm_readahead + 1 : 2);
 453        }
 454
 455        if (sndreserve > NFS_MAXSOCKBUF)
 456                sndreserve = NFS_MAXSOCKBUF;
 457        if (rcvreserve > NFS_MAXSOCKBUF)
 458                rcvreserve = NFS_MAXSOCKBUF;
 459        error = sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &sndreserve, sizeof(sndreserve));
 460        if (error) {
 461                goto bad;
 462        }
 463        error = sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &rcvreserve, sizeof(rcvreserve));
 464        if (error) {
 465                goto bad;
 466        }
 467
 468        sock_nointerrupt(so, 1);
 469
 470        /* Initialize other non-zero congestion variables */
 471        nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
 472                nmp->nm_srtt[3] = (NFS_TIMEO << 3);
 473        nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
 474                nmp->nm_sdrtt[3] = 0;
 475        nmp->nm_cwnd = NFS_MAXCWND / 2;     /* Initial send window */
 476        nmp->nm_sent = 0;
 477        FSDBG(529, nmp, nmp->nm_state, nmp->nm_soflags, nmp->nm_cwnd);
 478        nmp->nm_timeouts = 0;
 479        return (0);
 480
 481bad:
 482        nfs_disconnect(nmp);
 483        return (error);
 484}
 485
 486/*
 487 * Reconnect routine:
 488 * Called when a connection is broken on a reliable protocol.
 489 * - clean up the old socket
 490 * - nfs_connect() again
 491 * - set R_MUSTRESEND for all outstanding requests on mount point
 492 * If this fails the mount point is DEAD!
 493 * nb: Must be called with the nfs_sndlock() set on the mount point.
 494 */
 495static int
 496nfs_reconnect(struct nfsreq *rep)
 497{
 498        struct nfsreq *rp;
 499        struct nfsmount *nmp = rep->r_nmp;
 500        int error;
 501
 502        nfs_disconnect(nmp);
 503        while ((error = nfs_connect(nmp, rep))) {
 504                if (error == EINTR || error == ERESTART)
 505                        return (EINTR);
 506                if (error == EIO)
 507                        return (EIO);
 508                nfs_down(rep->r_nmp, rep->r_procp, error, NFSSTA_TIMEO,
 509                        "can not connect");
 510                rep->r_flags |= R_TPRINTFMSG;
 511                if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
 512                        /* we're not yet completely mounted and */
 513                        /* we can't reconnect, so we fail */
 514                        return (error);
 515                }
 516                if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp)))
 517                        return (error);
 518                tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
 519        }
 520
 521        /*
 522         * Loop through outstanding request list and fix up all requests
 523         * on old socket.
 524         */
 525        TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
 526                if (rp->r_nmp == nmp)
 527                        rp->r_flags |= R_MUSTRESEND;
 528        }
 529        return (0);
 530}
 531
 532/*
 533 * NFS disconnect. Clean up and unlink.
 534 */
 535void
 536nfs_disconnect(struct nfsmount *nmp)
 537{
 538        socket_t so;
 539
 540        if (nmp->nm_so) {
 541                so = nmp->nm_so;
 542                nmp->nm_so = 0;
 543                sock_shutdown(so, 2);
 544                sock_close(so);
 545        }
 546}
 547
 548/*
 549 * This is the nfs send routine. For connection based socket types, it
 550 * must be called with an nfs_sndlock() on the socket.
 551 * "rep == NULL" indicates that it has been called from a server.
 552 * For the client side:
 553 * - return EINTR if the RPC is terminated, 0 otherwise
 554 * - set R_MUSTRESEND if the send fails for any reason
 555 * - do any cleanup required by recoverable socket errors (???)
 556 * For the server side:
 557 * - return EINTR or ERESTART if interrupted by a signal
 558 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 559 * - do any cleanup required by recoverable socket errors (???)
 560 */
 561int
 562nfs_send(so, nam, top, rep)
 563        socket_t so;
 564        mbuf_t nam;
 565        mbuf_t top;
 566        struct nfsreq *rep;
 567{
 568        struct sockaddr *sendnam;
 569        int error, error2, sotype, flags;
 570        u_long xidqueued = 0;
 571        struct nfsreq *rp;
 572        char savenametolog[MAXPATHLEN];
 573        struct msghdr msg;
 574        
 575        if (rep) {
 576                error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
 577                if (error) {
 578                        mbuf_freem(top);
 579                        return (error);
 580                }
 581                if ((so = rep->r_nmp->nm_so) == NULL) {
 582                        rep->r_flags |= R_MUSTRESEND;
 583                        mbuf_freem(top);
 584                        return (0);
 585                }
 586                rep->r_flags &= ~R_MUSTRESEND;
 587                TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
 588                        if (rp == rep)
 589                                break;
 590                if (rp)
 591                        xidqueued = rp->r_xid;
 592        }
 593        sock_gettype(so, NULL, &sotype, NULL);
 594        if ((sotype == SOCK_STREAM) || (sock_isconnected(so)) ||
 595            (nam == 0))
 596                sendnam = (struct sockaddr *)0;
 597        else
 598                sendnam = mbuf_data(nam);
 599
 600        if (sotype == SOCK_SEQPACKET)
 601                flags = MSG_EOR;
 602        else
 603                flags = 0;
 604
 605        /* 
 606         * Save the name here in case mount point goes away if we block.
 607         * The name is using local stack and is large, but don't
 608         * want to block if we malloc.
 609         */
 610        if (rep)
 611                strncpy(savenametolog,
 612                        vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname,
 613                        MAXPATHLEN - 1);
 614        bzero(&msg, sizeof(msg));
 615        msg.msg_name = (caddr_t)sendnam;
 616        msg.msg_namelen = sendnam == 0 ? 0 : sendnam->sa_len;
 617        error = sock_sendmbuf(so, &msg, top, flags, NULL);
 618
 619        if (error) {
 620                if (rep) {
 621                        if (xidqueued) {
 622                                TAILQ_FOREACH(rp, &nfs_reqq, r_chain)
 623                                        if (rp == rep && rp->r_xid == xidqueued)
 624                                                break;
 625                                if (!rp)
 626                                        panic("nfs_send: error %d xid %x gone",
 627                                              error, xidqueued);
 628                        }
 629                        log(LOG_INFO, "nfs send error %d for server %s\n",
 630                            error, savenametolog);
 631                        /*
 632                         * Deal with errors for the client side.
 633                         */
 634                        error2 = nfs_sigintr(rep->r_nmp, rep, rep->r_procp);
 635                        if (error2) {
 636                                error = error2;
 637                        } else {
 638                                rep->r_flags |= R_MUSTRESEND;
 639                        }
 640                } else
 641                        log(LOG_INFO, "nfsd send error %d\n", error);
 642
 643                /*
 644                 * Handle any recoverable (soft) socket errors here. (???)
 645                 */
 646                if (error != EINTR && error != ERESTART && error != EIO &&
 647                        error != EWOULDBLOCK && error != EPIPE) {
 648                        error = 0;
 649                }
 650        }
 651        return (error);
 652}
 653
 654/*
 655 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 656 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 657 * Mark and consolidate the data into a new mbuf list.
 658 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 659 *     small mbufs.
 660 * For SOCK_STREAM we must be very careful to read an entire record once
 661 * we have read any of it, even if the system call has been interrupted.
 662 */
 663static int
 664nfs_receive(struct nfsreq *rep, mbuf_t *mp)
 665{
 666        socket_t so;
 667        struct iovec_32 aio;
 668        mbuf_t m, mlast;
 669        u_long len, fraglen;
 670        int error, error2, sotype;
 671        proc_t p = current_proc();      /* XXX */
 672        struct msghdr msg;
 673        size_t rcvlen;
 674        int lastfragment;
 675
 676        /*
 677         * Set up arguments for soreceive()
 678         */
 679        *mp = NULL;
 680        sotype = rep->r_nmp->nm_sotype;
 681
 682        /*
 683         * For reliable protocols, lock against other senders/receivers
 684         * in case a reconnect is necessary.
 685         * For SOCK_STREAM, first get the Record Mark to find out how much
 686         * more there is to get.
 687         * We must lock the socket against other receivers
 688         * until we have an entire rpc request/reply.
 689         */
 690        if (sotype != SOCK_DGRAM) {
 691                error = nfs_sndlock(rep);
 692                if (error)
 693                        return (error);
 694tryagain:
 695                /*
 696                 * Check for fatal errors and resending request.
 697                 */
 698                /*
 699                 * Ugh: If a reconnect attempt just happened, nm_so
 700                 * would have changed. NULL indicates a failed
 701                 * attempt that has essentially shut down this
 702                 * mount point.
 703                 */
 704                if ((error = nfs_sigintr(rep->r_nmp, rep, p)) || rep->r_mrep) {
 705                        nfs_sndunlock(rep);
 706                        if (error)
 707                                return (error);
 708                        return (EINTR);
 709                }
 710                so = rep->r_nmp->nm_so;
 711                if (!so) {
 712                        error = nfs_reconnect(rep);
 713                        if (error) {
 714                                nfs_sndunlock(rep);
 715                                return (error);
 716                        }
 717                        goto tryagain;
 718                }
 719                while (rep->r_flags & R_MUSTRESEND) {
 720                        error = mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m);
 721                        if (!error) {
 722                                OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
 723                                error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
 724                        }
 725                        /*
 726                         * we also hold rcv lock so rep is still
 727                         * legit this point
 728                         */
 729                        if (error) {
 730                                if (error == EINTR || error == ERESTART ||
 731                                    (error = nfs_reconnect(rep))) {
 732                                        nfs_sndunlock(rep);
 733                                        return (error);
 734                                }
 735                                goto tryagain;
 736                        }
 737                }
 738                nfs_sndunlock(rep);
 739                if (sotype == SOCK_STREAM) {
 740                        error = 0;
 741                        len = 0;
 742                        lastfragment = 0;
 743                        mlast = NULL;
 744                        while (!error && !lastfragment) {
 745                                aio.iov_base = (uintptr_t) &fraglen;
 746                                aio.iov_len = sizeof(u_long);
 747                                bzero(&msg, sizeof(msg));
 748                                msg.msg_iov = (struct iovec *) &aio;
 749                                msg.msg_iovlen = 1;
 750                                do {
 751                                   error = sock_receive(so, &msg, MSG_WAITALL, &rcvlen);
 752                                   if (!rep->r_nmp) /* if unmounted then bailout */
 753                                        goto shutout;
 754                                   if (error == EWOULDBLOCK && rep) {
 755                                        error2 = nfs_sigintr(rep->r_nmp, rep, p);
 756                                        if (error2)
 757                                                error = error2;
 758                                   }
 759                                } while (error == EWOULDBLOCK);
 760                                if (!error && rcvlen < aio.iov_len) {
 761                                    /* only log a message if we got a partial word */
 762                                    if (rcvlen != 0)
 763                                            log(LOG_INFO,
 764                                                 "short receive (%d/%d) from nfs server %s\n",
 765                                                 rcvlen, sizeof(u_long),
 766                                                 vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 767                                    error = EPIPE;
 768                                }
 769                                if (error)
 770                                        goto errout;
 771                                lastfragment = ntohl(fraglen) & 0x80000000;
 772                                fraglen = ntohl(fraglen) & ~0x80000000;
 773                                len += fraglen;
 774                                /*
 775                                 * This is SERIOUS! We are out of sync with the sender
 776                                 * and forcing a disconnect/reconnect is all I can do.
 777                                 */
 778                                if (len > NFS_MAXPACKET) {
 779                                    log(LOG_ERR, "%s (%d) from nfs server %s\n",
 780                                        "impossible RPC record length", len,
 781                                        vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 782                                    error = EFBIG;
 783                                    goto errout;
 784                                }
 785
 786                                m = NULL;
 787                                do {
 788                                    rcvlen = fraglen;
 789                                    error = sock_receivembuf(so, NULL, &m, MSG_WAITALL, &rcvlen);
 790                                    if (!rep->r_nmp) /* if unmounted then bailout */ {
 791                                        goto shutout;
 792                                    }
 793                                } while (error == EWOULDBLOCK || error == EINTR ||
 794                                         error == ERESTART);
 795
 796                                if (!error && fraglen > rcvlen) {
 797                                    log(LOG_INFO,
 798                                        "short receive (%d/%d) from nfs server %s\n",
 799                                        rcvlen, fraglen,
 800                                        vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 801                                    error = EPIPE;
 802                                    mbuf_freem(m);
 803                                }
 804                                if (!error) {
 805                                        if (!*mp) {
 806                                                *mp = m;
 807                                                mlast = m;
 808                                        } else {
 809                                                error = mbuf_setnext(mlast, m);
 810                                                if (error) {
 811                                                        printf("nfs_receive: mbuf_setnext failed %d\n", error);
 812                                                        mbuf_freem(m);
 813                                                }
 814                                        }
 815                                        while (mbuf_next(mlast))
 816                                                mlast = mbuf_next(mlast);
 817                                }
 818                        }
 819                } else {
 820                        bzero(&msg, sizeof(msg));
 821                        do {
 822                            rcvlen = 100000000;
 823                            error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
 824                            if (!rep->r_nmp) /* if unmounted then bailout */ {
 825                                goto shutout;
 826                            }   
 827                            if (error == EWOULDBLOCK && rep) {
 828                                error2 = nfs_sigintr(rep->r_nmp, rep, p);
 829                                if (error2) {
 830                                        return (error2);
 831                                }
 832                            }
 833                        } while (error == EWOULDBLOCK);
 834
 835                        if ((msg.msg_flags & MSG_EOR) == 0)
 836                                printf("Egad!!\n");
 837                        if (!error && *mp == NULL)
 838                                error = EPIPE;
 839                        len = rcvlen;
 840                }
 841errout:
 842                if (error && error != EINTR && error != ERESTART) {
 843                        mbuf_freem(*mp);
 844                        *mp = NULL;
 845                        if (error != EPIPE)
 846                                log(LOG_INFO,
 847                                    "receive error %d from nfs server %s\n", error,
 848                                    vfs_statfs(rep->r_nmp->nm_mountp)->f_mntfromname);
 849                        error = nfs_sndlock(rep);
 850                        if (!error) {
 851                                error = nfs_reconnect(rep);
 852                                if (!error)
 853                                        goto tryagain;
 854                                nfs_sndunlock(rep);
 855                        }
 856                }
 857        } else {
 858                /*
 859                 * We could have failed while rebinding the datagram socket
 860                 * so we need to attempt to rebind here.
 861                 */
 862                if ((so = rep->r_nmp->nm_so) == NULL) {
 863                        error = nfs_sndlock(rep);
 864                        if (!error) {
 865                                error = nfs_reconnect(rep);
 866                                nfs_sndunlock(rep);
 867                        }
 868                        if (error)
 869                                return (error);
 870                        if (!rep->r_nmp) /* if unmounted then bailout */
 871                                return (ENXIO);
 872                        so = rep->r_nmp->nm_so;
 873                }
 874                bzero(&msg, sizeof(msg));
 875                len = 0;
 876                do {
 877                        rcvlen = 1000000;
 878                        error = sock_receivembuf(so, &msg, mp, 0, &rcvlen);
 879                        if (!rep->r_nmp) /* if unmounted then bailout */
 880                                goto shutout;
 881                        if (error) {
 882                                error2 = nfs_sigintr(rep->r_nmp, rep, p);
 883                                if (error2) {
 884                                        error = error2;
 885                                        goto shutout;
 886                                }
 887                        }
 888                        /* Reconnect for all errors.  We may be receiving
 889                         * soft/hard/blocking errors because of a network
 890                         * change.
 891                         * XXX: we should rate limit or delay this
 892                         * to once every N attempts or something.
 893                         * although TCP doesn't seem to.
 894                         */
 895                        if (error) {
 896                                error2 = nfs_sndlock(rep);
 897                                if (!error2) {
 898                                        error2 = nfs_reconnect(rep);
 899                                        if (error2)
 900                                                error = error2;
 901                                        else if (!rep->r_nmp) /* if unmounted then bailout */
 902                                                error = ENXIO;
 903                                        else
 904                                                so = rep->r_nmp->nm_so;
 905                                        nfs_sndunlock(rep);
 906                                } else {
 907                                        error = error2;
 908                                }
 909                        }
 910                } while (error == EWOULDBLOCK);
 911        }
 912shutout:
 913        if (error) {
 914                mbuf_freem(*mp);
 915                *mp = NULL;
 916        }
 917        return (error);
 918}
 919
 920/*
 921 * Implement receipt of reply on a socket.
 922 * We must search through the list of received datagrams matching them
 923 * with outstanding requests using the xid, until ours is found.
 924 */
 925/* ARGSUSED */
 926int
 927nfs_reply(myrep)
 928        struct nfsreq *myrep;
 929{
 930        struct nfsreq *rep;
 931        struct nfsmount *nmp = myrep->r_nmp;
 932        long t1;
 933        mbuf_t mrep, md;
 934        u_long rxid, *tl;
 935        caddr_t dpos, cp2;
 936        int error;
 937
 938        /*
 939         * Loop around until we get our own reply
 940         */
 941        for (;;) {
 942                /*
 943                 * Lock against other receivers so that I don't get stuck in
 944                 * sbwait() after someone else has received my reply for me.
 945                 * Also necessary for connection based protocols to avoid
 946                 * race conditions during a reconnect.
 947                 * If nfs_rcvlock() returns EALREADY, that means that
 948                 * the reply has already been recieved by another
 949                 * process and we can return immediately.  In this
 950                 * case, the lock is not taken to avoid races with
 951                 * other processes.
 952                 */
 953                error = nfs_rcvlock(myrep);
 954                if (error == EALREADY)
 955                        return (0);
 956                if (error)
 957                        return (error);
 958                
 959                /*
 960                 * If we slept after putting bits otw, then reply may have
 961                 * arrived.  In which case returning is required, or we
 962                 * would hang trying to nfs_receive an already received reply.
 963                 */
 964                if (myrep->r_mrep != NULL) {
 965                        nfs_rcvunlock(myrep);
 966                        FSDBG(530, myrep->r_xid, myrep, myrep->r_nmp, -1);
 967                        return (0);
 968                }
 969                /*
 970                 * Get the next Rpc reply off the socket. Assume myrep->r_nmp
 971                 * is still intact by checks done in nfs_rcvlock.
 972                 */
 973                error = nfs_receive(myrep, &mrep);
 974                /*
 975                 * Bailout asap if nfsmount struct gone (unmounted). 
 976                 */
 977                if (!myrep->r_nmp) {
 978                        FSDBG(530, myrep->r_xid, myrep, nmp, -2);
 979                        if (mrep)
 980                                mbuf_freem(mrep);
 981                        return (ENXIO);
 982                }
 983                if (error) {
 984                        FSDBG(530, myrep->r_xid, myrep, nmp, error);
 985                        nfs_rcvunlock(myrep);
 986
 987                        /* Bailout asap if nfsmount struct gone (unmounted). */
 988                        if (!myrep->r_nmp) {
 989                                if (mrep)
 990                                        mbuf_freem(mrep);
 991                                return (ENXIO);
 992                        }
 993
 994                        /*
 995                         * Ignore routing errors on connectionless protocols??
 996                         */
 997                        if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
 998                                if (nmp->nm_so) {
 999                                        int clearerror;
1000                                        int optlen = sizeof(clearerror);
1001                                        sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1002                                }
1003                                continue;
1004                        }
1005                        if (mrep)
1006                                mbuf_freem(mrep);
1007                        return (error);
1008                }
1009
1010                /*
1011                 * We assume all is fine, but if we did not have an error
1012                 * and mrep is 0, better not dereference it. nfs_receive
1013                 * calls soreceive which carefully sets error=0 when it got
1014                 * errors on sbwait (tsleep). In most cases, I assume that's 
1015                 * so we could go back again. In tcp case, EPIPE is returned.
1016                 * In udp, case nfs_receive gets back here with no error and no
1017                 * mrep. Is the right fix to have soreceive check for process
1018                 * aborted after sbwait and return something non-zero? Should
1019                 * nfs_receive give an EPIPE?  Too risky to play with those
1020                 * two this late in game for a shutdown problem. Instead,
1021                 * just check here and get out. (ekn)
1022                 */
1023                if (!mrep) {
1024                        nfs_rcvunlock(myrep);
1025                        FSDBG(530, myrep->r_xid, myrep, nmp, -3);
1026                        return (ENXIO); /* sounds good */
1027                }
1028                        
1029                /*
1030                 * Get the xid and check that it is an rpc reply
1031                 */
1032                md = mrep;
1033                dpos = mbuf_data(md);
1034                nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
1035                rxid = *tl++;
1036                if (*tl != rpc_reply) {
1037                        OSAddAtomic(1, (SInt32*)&nfsstats.rpcinvalid);
1038                        mbuf_freem(mrep);
1039nfsmout:
1040                        if (nmp->nm_state & NFSSTA_RCVLOCK)
1041                                nfs_rcvunlock(myrep);
1042                        continue;
1043                }
1044
1045                /*
1046                 * Loop through the request list to match up the reply
1047                 * Iff no match, just drop the datagram
1048                 */
1049                TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1050                        if (rep->r_mrep == NULL && rxid == rep->r_xid) {
1051                                /* Found it.. */
1052                                rep->r_mrep = mrep;
1053                                rep->r_md = md;
1054                                rep->r_dpos = dpos;
1055                                /*
1056                                 * If we're tracking the round trip time
1057                                 * then we update the circular log here
1058                                 * with the stats from our current request.
1059                                 */
1060                                if (nfsrtton) {
1061                                        struct rttl *rt;
1062
1063                                        rt = &nfsrtt.rttl[nfsrtt.pos];
1064                                        rt->proc = rep->r_procnum;
1065                                        rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
1066                                        rt->sent = nmp->nm_sent;
1067                                        rt->cwnd = nmp->nm_cwnd;
1068                                        if (proct[rep->r_procnum] == 0)
1069                                                panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1070                                        rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
1071                                        rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
1072                                        rt->fsid = vfs_statfs(nmp->nm_mountp)->f_fsid;
1073                                        microtime(&rt->tstamp); // XXX unused
1074                                        if (rep->r_flags & R_TIMING)
1075                                                rt->rtt = rep->r_rtt;
1076                                        else
1077                                                rt->rtt = 1000000;
1078                                        nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
1079                                }
1080                                /*
1081                                 * Update congestion window.
1082                                 * Do the additive increase of
1083                                 * one rpc/rtt.
1084                                 */
1085                                FSDBG(530, rep->r_xid, rep, nmp->nm_sent,
1086                                      nmp->nm_cwnd);
1087                                if (nmp->nm_cwnd <= nmp->nm_sent) {
1088                                        nmp->nm_cwnd +=
1089                                           (NFS_CWNDSCALE * NFS_CWNDSCALE +
1090                                           (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
1091                                        if (nmp->nm_cwnd > NFS_MAXCWND)
1092                                                nmp->nm_cwnd = NFS_MAXCWND;
1093                                }
1094                                if (rep->r_flags & R_SENT) {
1095                                    rep->r_flags &= ~R_SENT;
1096                                    nmp->nm_sent -= NFS_CWNDSCALE;
1097                               }
1098                                /*
1099                                 * Update rtt using a gain of 0.125 on the mean
1100                                 * and a gain of 0.25 on the deviation.
1101                                 */
1102                                if (rep->r_flags & R_TIMING) {
1103                                        /*
1104                                         * Since the timer resolution of
1105                                         * NFS_HZ is so course, it can often
1106                                         * result in r_rtt == 0. Since
1107                                         * r_rtt == N means that the actual
1108                                         * rtt is between N+dt and N+2-dt ticks,
1109                                         * add 1.
1110                                         */
1111                                        if (proct[rep->r_procnum] == 0)
1112                                                panic("nfs_reply: proct[%d] is zero", rep->r_procnum);
1113                                        t1 = rep->r_rtt + 1;
1114                                        t1 -= (NFS_SRTT(rep) >> 3);
1115                                        NFS_SRTT(rep) += t1;
1116                                        if (t1 < 0)
1117                                                t1 = -t1;
1118                                        t1 -= (NFS_SDRTT(rep) >> 2);
1119                                        NFS_SDRTT(rep) += t1;
1120                                }
1121                                nmp->nm_timeouts = 0;
1122                                break;
1123                        }
1124                }
1125                nfs_rcvunlock(myrep);
1126                /*
1127                 * If not matched to a request, drop it.
1128                 * If it's mine, get out.
1129                 */
1130                if (rep == 0) {
1131                        OSAddAtomic(1, (SInt32*)&nfsstats.rpcunexpected);
1132                        mbuf_freem(mrep);
1133                } else if (rep == myrep) {
1134                        if (rep->r_mrep == NULL)
1135                                panic("nfs_reply: nil r_mrep");
1136                        return (0);
1137                }
1138                FSDBG(530, myrep->r_xid, myrep, rep,
1139                      rep ? rep->r_xid : myrep->r_flags);
1140        }
1141}
1142
1143/*
1144 * nfs_request - goes something like this
1145 *      - fill in request struct
1146 *      - links it into list
1147 *      - calls nfs_send() for first transmit
1148 *      - calls nfs_receive() to get reply
1149 *      - break down rpc header and return with nfs reply pointed to
1150 *        by mrep or error
1151 * nb: always frees up mreq mbuf list
1152 */
1153int
1154nfs_request(vp, mp, mrest, procnum, procp, cred, mrp, mdp, dposp, xidp)
1155        vnode_t vp;
1156        mount_t mp;
1157        mbuf_t mrest;
1158        int procnum;
1159        proc_t procp;
1160        kauth_cred_t cred;
1161        mbuf_t *mrp;
1162        mbuf_t *mdp;
1163        caddr_t *dposp;
1164        u_int64_t *xidp;
1165{
1166        mbuf_t m, mrep, m2;
1167        struct nfsreq re, *rep;
1168        u_long *tl;
1169        int i;
1170        struct nfsmount *nmp;
1171        mbuf_t md, mheadend;
1172        char nickv[RPCX_NICKVERF];
1173        time_t waituntil;
1174        caddr_t dpos, cp2;
1175        int t1, error = 0, mrest_len, auth_len, auth_type;
1176        int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
1177        int verf_len, verf_type;
1178        u_long xid;
1179        char *auth_str, *verf_str;
1180        NFSKERBKEY_T key;               /* save session key */
1181        int nmsotype;
1182        struct timeval now;
1183
1184        if (mrp)
1185                *mrp = NULL;
1186        if (xidp)
1187                *xidp = 0;
1188        nmp = VFSTONFS(mp);
1189
1190        rep = &re;
1191
1192        if (vp)
1193                nmp = VFSTONFS(vnode_mount(vp));
1194        if (nmp == NULL ||
1195            (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1196            (NFSSTA_FORCE|NFSSTA_TIMEO)) {
1197                mbuf_freem(mrest);
1198                return (ENXIO);
1199        }
1200        nmsotype = nmp->nm_sotype;
1201
1202        FSDBG_TOP(531, vp, procnum, nmp, rep);
1203
1204        rep->r_nmp = nmp;
1205        rep->r_vp = vp;
1206        rep->r_procp = procp;
1207        rep->r_procnum = procnum;
1208        microuptime(&now);
1209        rep->r_lastmsg = now.tv_sec -
1210            ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay));
1211        i = 0;
1212        m = mrest;
1213        while (m) {
1214                i += mbuf_len(m);
1215                m = mbuf_next(m);
1216        }
1217        mrest_len = i;
1218
1219        /*
1220         * Get the RPC header with authorization.
1221         */
1222kerbauth:
1223        nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1224        if (!nmp) {
1225                FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1226                mbuf_freem(mrest);
1227                return (ENXIO);
1228        }
1229        verf_str = auth_str = (char *)0;
1230        if (nmp->nm_flag & NFSMNT_KERB) {
1231                verf_str = nickv;
1232                verf_len = sizeof (nickv);
1233                auth_type = RPCAUTH_KERB4;
1234                bzero((caddr_t)key, sizeof (key));
1235                if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
1236                        &auth_len, verf_str, verf_len)) {
1237                        nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1238                        if (!nmp) {
1239                                FSDBG_BOT(531, 2, vp, error, rep);
1240                                mbuf_freem(mrest);
1241                                return (ENXIO);
1242                        }
1243                        error = nfs_getauth(nmp, rep, cred, &auth_str,
1244                                &auth_len, verf_str, &verf_len, key);
1245                        nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1246                        if (!error && !nmp)
1247                                error = ENXIO;
1248                        if (error) {
1249                                FSDBG_BOT(531, 2, vp, error, rep);
1250                                mbuf_freem(mrest);
1251                                return (error);
1252                        }
1253                }
1254        } else {
1255                auth_type = RPCAUTH_UNIX;
1256                if (cred->cr_ngroups < 1)
1257                        panic("nfsreq nogrps");
1258                auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1259                        nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1260                        5 * NFSX_UNSIGNED;
1261        }
1262        error = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1263             auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid, &m);
1264        if (auth_str)
1265                _FREE(auth_str, M_TEMP);
1266        if (error) {
1267                mbuf_freem(mrest);
1268                FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1269                return (error);
1270        }
1271        if (xidp)
1272                *xidp = ntohl(xid) + ((u_int64_t)nfs_xidwrap << 32);
1273
1274        /*
1275         * For stream protocols, insert a Sun RPC Record Mark.
1276         */
1277        if (nmsotype == SOCK_STREAM) {
1278                error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK);
1279                if (error) {
1280                        mbuf_freem(m);
1281                        FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1282                        return (error);
1283                }
1284                *((u_long*)mbuf_data(m)) =
1285                        htonl(0x80000000 | (mbuf_pkthdr_len(m) - NFSX_UNSIGNED));
1286        }
1287        rep->r_mreq = m;
1288        rep->r_xid = xid;
1289tryagain:
1290        nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1291        if (nmp && (nmp->nm_flag & NFSMNT_SOFT))
1292                rep->r_retry = nmp->nm_retry;
1293        else
1294                rep->r_retry = NFS_MAXREXMIT + 1;       /* past clip limit */
1295        rep->r_rtt = rep->r_rexmit = 0;
1296        if (proct[procnum] > 0)
1297                rep->r_flags = R_TIMING;
1298        else
1299                rep->r_flags = 0;
1300        rep->r_mrep = NULL;
1301
1302        /*
1303         * Do the client side RPC.
1304         */
1305        OSAddAtomic(1, (SInt32*)&nfsstats.rpcrequests);
1306        /*
1307         * Chain request into list of outstanding requests. Be sure
1308         * to put it LAST so timer finds oldest requests first.
1309         */
1310        TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1311
1312        /*
1313         * If backing off another request or avoiding congestion, don't
1314         * send this one now but let timer do it. If not timing a request,
1315         * do it now.
1316         */
1317        if (nmp && nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1318                           (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1319                           nmp->nm_sent < nmp->nm_cwnd)) {
1320                int connrequired = (nmp->nm_sotype == SOCK_STREAM);
1321
1322                if (connrequired)
1323                        error = nfs_sndlock(rep);
1324
1325                /*
1326                 * Set the R_SENT before doing the send in case another thread
1327                 * processes the reply before the nfs_send returns here
1328                 */
1329                if (!error) {
1330                        if ((rep->r_flags & R_MUSTRESEND) == 0) {
1331                                FSDBG(531, rep->r_xid, rep, nmp->nm_sent,
1332                                      nmp->nm_cwnd);
1333                                nmp->nm_sent += NFS_CWNDSCALE;
1334                                rep->r_flags |= R_SENT;
1335                        }
1336
1337                        error = mbuf_copym(m, 0, MBUF_COPYALL, MBUF_WAITOK, &m2);
1338                        if (!error)
1339                                error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1340                        if (connrequired)
1341                                nfs_sndunlock(rep);
1342                }
1343                nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1344                if (error) {
1345                        if (nmp)
1346                                nmp->nm_sent -= NFS_CWNDSCALE;
1347                        rep->r_flags &= ~R_SENT;
1348                }
1349        } else {
1350                rep->r_rtt = -1;
1351        }
1352
1353        /*
1354         * Wait for the reply from our send or the timer's.
1355         */
1356        if (!error || error == EPIPE)
1357                error = nfs_reply(rep);
1358
1359        /*
1360         * RPC done, unlink the request.
1361         */
1362        nfs_repdequeue(rep);
1363
1364        nmp = vp ? VFSTONFS(vnode_mount(vp)) : rep->r_nmp;
1365
1366        /*
1367         * Decrement the outstanding request count.
1368         */
1369        if (rep->r_flags & R_SENT) {
1370                rep->r_flags &= ~R_SENT;        /* paranoia */
1371                if (nmp) {
1372                        FSDBG(531, rep->r_xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1373                        nmp->nm_sent -= NFS_CWNDSCALE;
1374                }
1375        }
1376
1377        /*
1378         * If there was a successful reply and a tprintf msg.
1379         * tprintf a response.
1380         */
1381        if (!error)
1382                nfs_up(nmp, procp, NFSSTA_TIMEO,
1383                        (rep->r_flags & R_TPRINTFMSG) ? "is alive again" : NULL);
1384        mrep = rep->r_mrep;
1385        md = rep->r_md;
1386        dpos = rep->r_dpos;
1387        if (!error && !nmp)
1388                error = ENXIO;
1389        if (error) {
1390                mbuf_freem(rep->r_mreq);
1391                FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1392                return (error);
1393        }
1394
1395        /*
1396         * break down the rpc header and check if ok
1397         */
1398        nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
1399        if (*tl++ == rpc_msgdenied) {
1400                if (*tl == rpc_mismatch)
1401                        error = EOPNOTSUPP;
1402                else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1403                        if (!failed_auth) {
1404                                failed_auth++;
1405                                error = mbuf_setnext(mheadend, NULL);
1406                                mbuf_freem(mrep);
1407                                mbuf_freem(rep->r_mreq);
1408                                if (!error)
1409                                        goto kerbauth;
1410                                printf("nfs_request: mbuf_setnext failed\n");
1411                        } else
1412                                error = EAUTH;
1413                } else
1414                        error = EACCES;
1415                mbuf_freem(mrep);
1416                mbuf_freem(rep->r_mreq);
1417                FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1418                return (error);
1419        }
1420
1421        /*
1422         * Grab any Kerberos verifier, otherwise just throw it away.
1423         */
1424        verf_type = fxdr_unsigned(int, *tl++);
1425        i = fxdr_unsigned(int, *tl);
1426        if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1427                error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1428                if (error)
1429                        goto nfsmout;
1430        } else if (i > 0)
1431                nfsm_adv(nfsm_rndup(i));
1432        nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1433        /* 0 == ok */
1434        if (*tl == 0) {
1435                nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
1436                if (*tl != 0) {
1437                        error = fxdr_unsigned(int, *tl);
1438                        if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1439                                error == NFSERR_TRYLATER) {
1440                                mbuf_freem(mrep);
1441                                error = 0;
1442                                microuptime(&now);
1443                                waituntil = now.tv_sec + trylater_delay;
1444                                while (now.tv_sec < waituntil) {
1445                                        tsleep((caddr_t)&lbolt, PSOCK, "nfstrylater", 0);
1446                                        microuptime(&now);
1447                                }
1448                                trylater_delay *= 2;
1449                                if (trylater_delay > 60)
1450                                        trylater_delay = 60;
1451                                goto tryagain;
1452                        }
1453
1454                        /*
1455                         * If the File Handle was stale, invalidate the
1456                         * lookup cache, just in case.
1457                         */
1458                        if ((error == ESTALE) && vp)
1459                                cache_purge(vp);
1460                        if (nmp->nm_flag & NFSMNT_NFSV3) {
1461                                *mrp = mrep;
1462                                *mdp = md;
1463                                *dposp = dpos;
1464                                error |= NFSERR_RETERR;
1465                        } else {
1466                                mbuf_freem(mrep);
1467                                error &= ~NFSERR_RETERR;
1468                        }
1469                        mbuf_freem(rep->r_mreq);
1470                        FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1471                        return (error);
1472                }
1473
1474                *mrp = mrep;
1475                *mdp = md;
1476                *dposp = dpos;
1477                mbuf_freem(rep->r_mreq);
1478                FSDBG_BOT(531, 0xf0f0f0f0, rep->r_xid, nmp, rep);
1479                return (0);
1480        }
1481        mbuf_freem(mrep);
1482        error = EPROTONOSUPPORT;
1483nfsmout:
1484        mbuf_freem(rep->r_mreq);
1485        FSDBG_BOT(531, error, rep->r_xid, nmp, rep);
1486        return (error);
1487}
1488
1489#ifndef NFS_NOSERVER
1490/*
1491 * Generate the rpc reply header
1492 * siz arg. is used to decide if adding a cluster is worthwhile
1493 */
1494int
1495nfs_rephead(siz, nd, slp, err, mrq, mbp, bposp)
1496        int siz;
1497        struct nfsrv_descript *nd;
1498        struct nfssvc_sock *slp;
1499        int err;
1500        mbuf_t *mrq;
1501        mbuf_t *mbp;
1502        caddr_t *bposp;
1503{
1504        u_long *tl;
1505        mbuf_t mreq;
1506        caddr_t bpos;
1507        mbuf_t mb, mb2;
1508        int error, mlen;
1509
1510        /*
1511         * If this is a big reply, use a cluster else
1512         * try and leave leading space for the lower level headers.
1513         */
1514        siz += RPC_REPLYSIZ;
1515        if (siz >= nfs_mbuf_minclsize) {
1516                error = mbuf_getpacket(MBUF_WAITOK, &mreq);
1517        } else {
1518                error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mreq);
1519        }
1520        if (error) {
1521                /* unable to allocate packet */
1522                /* XXX nfsstat? */
1523                return (error);
1524        }
1525        mb = mreq;
1526        tl = mbuf_data(mreq);
1527        mlen = 6 * NFSX_UNSIGNED;
1528        if (siz < nfs_mbuf_minclsize) {
1529                /* leave space for lower level headers */
1530                tl += 80/sizeof(*tl);  /* XXX max_hdr? XXX */
1531                mbuf_setdata(mreq, tl, mlen);
1532        } else {
1533                mbuf_setlen(mreq, mlen);
1534        }
1535        bpos = ((caddr_t)tl) + mlen;
1536        *tl++ = txdr_unsigned(nd->nd_retxid);
1537        *tl++ = rpc_reply;
1538        if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1539                *tl++ = rpc_msgdenied;
1540                if (err & NFSERR_AUTHERR) {
1541                        *tl++ = rpc_autherr;
1542                        *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1543                        mlen -= NFSX_UNSIGNED;
1544                        mbuf_setlen(mreq, mlen);
1545                        bpos -= NFSX_UNSIGNED;
1546                } else {
1547                        *tl++ = rpc_mismatch;
1548                        *tl++ = txdr_unsigned(RPC_VER2);
1549                        *tl = txdr_unsigned(RPC_VER2);
1550                }
1551        } else {
1552                *tl++ = rpc_msgaccepted;
1553
1554                /*
1555                 * For Kerberos authentication, we must send the nickname
1556                 * verifier back, otherwise just RPCAUTH_NULL.
1557                 */
1558                if (nd->nd_flag & ND_KERBFULL) {
1559                    struct nfsuid *nuidp;
1560                    struct timeval ktvin, ktvout;
1561                    uid_t uid = kauth_cred_getuid(nd->nd_cr);
1562
1563                    lck_rw_lock_shared(&slp->ns_rwlock);
1564                    for (nuidp = NUIDHASH(slp, uid)->lh_first;
1565                        nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1566                        if (kauth_cred_getuid(nuidp->nu_cr) == uid &&
1567                            (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1568                             &nuidp->nu_haddr, nd->nd_nam2)))
1569                            break;
1570                    }
1571                    if (nuidp) {
1572                        ktvin.tv_sec =
1573                            txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1574                        ktvin.tv_usec =
1575                            txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1576
1577                        /*
1578                         * Encrypt the timestamp in ecb mode using the
1579                         * session key.
1580                         */
1581#if NFSKERB
1582                        XXX
1583#endif
1584
1585                        *tl++ = rpc_auth_kerb;
1586                        *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1587                        *tl = ktvout.tv_sec;
1588                        nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED);
1589                        *tl++ = ktvout.tv_usec;
1590                        *tl++ = txdr_unsigned(kauth_cred_getuid(nuidp->nu_cr));
1591                    } else {
1592                        *tl++ = 0;
1593                        *tl++ = 0;
1594                    }
1595                    lck_rw_done(&slp->ns_rwlock);
1596                } else {
1597                        *tl++ = 0;
1598                        *tl++ = 0;
1599                }
1600                switch (err) {
1601                case EPROGUNAVAIL:
1602                        *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1603                        break;
1604                case EPROGMISMATCH:
1605                        *tl = txdr_unsigned(RPC_PROGMISMATCH);
1606                        nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED);
1607                        // XXX hard coded versions
1608                        *tl++ = txdr_unsigned(2);
1609                        *tl = txdr_unsigned(3);
1610                        break;
1611                case EPROCUNAVAIL:
1612                        *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1613                        break;
1614                case EBADRPC:
1615                        *tl = txdr_unsigned(RPC_GARBAGE);
1616                        break;
1617                default:
1618                        *tl = 0;
1619                        if (err != NFSERR_RETVOID) {
1620                                nfsm_build(tl, u_long *, NFSX_UNSIGNED);
1621                                if (err)
1622                                    *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1623                                else
1624                                    *tl = 0;
1625                        }
1626                        break;
1627                }
1628        }
1629
1630        if (mrq != NULL)
1631                *mrq = mreq;
1632        *mbp = mb;
1633        *bposp = bpos;
1634        if (err != 0 && err != NFSERR_RETVOID) {
1635                OSAddAtomic(1, (SInt32*)&nfsstats.srvrpc_errs);
1636        }
1637        return (0);
1638}
1639
1640
1641#endif /* NFS_NOSERVER */
1642
1643
1644/*
1645 * From FreeBSD 1.58, a Matt Dillon fix...
1646 * Flag a request as being about to terminate.
1647 * The nm_sent count is decremented now to avoid deadlocks when the process
1648 * in soreceive() hasn't yet managed to send its own request.
1649 */
1650static void
1651nfs_softterm(struct nfsreq *rep)
1652{
1653
1654        rep->r_flags |= R_SOFTTERM;
1655        if (rep->r_flags & R_SENT) {
1656                FSDBG(532, rep->r_xid, rep, rep->r_nmp->nm_sent,
1657                      rep->r_nmp->nm_cwnd);
1658                rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1659                rep->r_flags &= ~R_SENT;
1660        }
1661}
1662
1663void
1664nfs_timer_funnel(void * arg)
1665{
1666        (void) thread_funnel_set(kernel_flock, TRUE);
1667        nfs_timer(arg);
1668        (void) thread_funnel_set(kernel_flock, FALSE);
1669
1670}
1671
1672/*
1673 * Ensure rep isn't in use by the timer, then dequeue it.
1674 */
1675static void
1676nfs_repdequeue(struct nfsreq *rep)
1677{
1678
1679        while ((rep->r_flags & R_BUSY)) {
1680                rep->r_flags |= R_WAITING;
1681                tsleep(rep, PSOCK, "repdeq", 0);
1682        }
1683        TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1684}
1685
1686/*
1687 * Busy (lock) a nfsreq, used by the nfs timer to make sure it's not
1688 * free()'d out from under it.
1689 */
1690static void
1691nfs_repbusy(struct nfsreq *rep)
1692{
1693
1694        if ((rep->r_flags & R_BUSY))
1695                panic("rep locked");
1696        rep->r_flags |= R_BUSY;
1697}
1698
1699/*
1700 * Unbusy the nfsreq passed in, return the next nfsreq in the chain busied.
1701 */
1702static struct nfsreq *
1703nfs_repnext(struct nfsreq *rep)
1704{
1705        struct nfsreq * nextrep;
1706
1707        if (rep == NULL)
1708                return (NULL);
1709        /*
1710         * We need to get and busy the next req before signalling the
1711         * current one, otherwise wakeup() may block us and we'll race to
1712         * grab the next req.
1713         */
1714        nextrep = TAILQ_NEXT(rep, r_chain);
1715        if (nextrep != NULL)
1716                nfs_repbusy(nextrep);
1717        /* unbusy and signal. */
1718        rep->r_flags &= ~R_BUSY;
1719        if ((rep->r_flags & R_WAITING)) {
1720                rep->r_flags &= ~R_WAITING;
1721                wakeup(rep);
1722        }
1723        return (nextrep);
1724}
1725
1726/*
1727 * Nfs timer routine
1728 * Scan the nfsreq list and retranmit any requests that have timed out
1729 * To avoid retransmission attempts on STREAM sockets (in the future) make
1730 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1731 */
1732void
1733nfs_timer(__unused void *arg)
1734{
1735        struct nfsreq *rep;
1736        mbuf_t m;
1737        socket_t so;
1738        struct nfsmount *nmp;
1739        int timeo;
1740        int error;
1741#ifndef NFS_NOSERVER
1742        struct nfssvc_sock *slp;
1743        u_quad_t cur_usec;
1744#endif /* NFS_NOSERVER */
1745        int flags, rexmit, cwnd, sent;
1746        u_long xid;
1747        struct timeval now;
1748
1749        rep = TAILQ_FIRST(&nfs_reqq);
1750        if (rep != NULL)
1751                nfs_repbusy(rep);
1752        microuptime(&now);
1753        for ( ; rep != NULL ; rep = nfs_repnext(rep)) {
1754                nmp = rep->r_nmp;
1755                if (!nmp) /* unmounted */
1756                    continue;
1757                if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
1758                        continue;
1759                if (nfs_sigintr(nmp, rep, rep->r_procp))
1760                        continue;
1761                if (nmp->nm_tprintf_initial_delay != 0 &&
1762                    (rep->r_rexmit > 2 || (rep->r_flags & R_RESENDERR)) &&
1763                    rep->r_lastmsg + nmp->nm_tprintf_delay < now.tv_sec) {
1764                        rep->r_lastmsg = now.tv_sec;
1765                        nfs_down(rep->r_nmp, rep->r_procp, 0, NFSSTA_TIMEO,
1766                                "not responding");
1767                        rep->r_flags |= R_TPRINTFMSG;
1768                        if (!(nmp->nm_state & NFSSTA_MOUNTED)) {
1769                                /* we're not yet completely mounted and */
1770                                /* we can't complete an RPC, so we fail */
1771                                OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1772                                nfs_softterm(rep);
1773                                continue;
1774                        }
1775                }
1776                if (rep->r_rtt >= 0) {
1777                        rep->r_rtt++;
1778                        if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1779                                timeo = nmp->nm_timeo;
1780                        else
1781                                timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1782                        /* ensure 62.5 ms floor */
1783                        while (16 * timeo < hz)
1784                            timeo *= 2;
1785                        if (nmp->nm_timeouts > 0)
1786                                timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1787                        if (rep->r_rtt <= timeo)
1788                                continue;
1789                        if (nmp->nm_timeouts < 8)
1790                                nmp->nm_timeouts++;
1791                }
1792                /*
1793                 * Check for too many retransmits.  This is never true for
1794                 * 'hard' mounts because we set r_retry to NFS_MAXREXMIT + 1
1795                 * and never allow r_rexmit to be more than NFS_MAXREXMIT.
1796                 */
1797                if (rep->r_rexmit >= rep->r_retry) {    /* too many */
1798                        OSAddAtomic(1, (SInt32*)&nfsstats.rpctimeouts);
1799                        nfs_softterm(rep);
1800                        continue;
1801                }
1802                if (nmp->nm_sotype != SOCK_DGRAM) {
1803                        if (++rep->r_rexmit > NFS_MAXREXMIT)
1804                                rep->r_rexmit = NFS_MAXREXMIT;
1805                        continue;
1806                }
1807                if ((so = nmp->nm_so) == NULL)
1808                        continue;
1809
1810                /*
1811                 * If there is enough space and the window allows..
1812                 *      Resend it
1813                 * Set r_rtt to -1 in case we fail to send it now.
1814                 */
1815                rep->r_rtt = -1;
1816                if (((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1817                    (rep->r_flags & R_SENT) ||
1818                    nmp->nm_sent < nmp->nm_cwnd) &&
1819                   (mbuf_copym(rep->r_mreq, 0, MBUF_COPYALL, MBUF_DONTWAIT, &m) == 0)){
1820                        struct msghdr   msg;
1821                        /*
1822                         * Iff first send, start timing
1823                         * else turn timing off, backoff timer
1824                         * and divide congestion window by 2.
1825                         * We update these *before* the send to avoid
1826                         * racing against receiving the reply.
1827                         * We save them so we can restore them on send error.
1828                         */
1829                        flags = rep->r_flags;
1830                        rexmit = rep->r_rexmit;
1831                        cwnd = nmp->nm_cwnd;
1832                        sent = nmp->nm_sent;
1833                        xid = rep->r_xid;
1834                        if (rep->r_flags & R_SENT) {
1835                                rep->r_flags &= ~R_TIMING;
1836                                if (++rep->r_rexmit > NFS_MAXREXMIT)
1837                                        rep->r_rexmit = NFS_MAXREXMIT;
1838                                nmp->nm_cwnd >>= 1;
1839                                if (nmp->nm_cwnd < NFS_CWNDSCALE)
1840                                        nmp->nm_cwnd = NFS_CWNDSCALE;
1841                                OSAddAtomic(1, (SInt32*)&nfsstats.rpcretries);
1842                        } else {
1843                                rep->r_flags |= R_SENT;
1844                                nmp->nm_sent += NFS_CWNDSCALE;
1845                        }
1846                        FSDBG(535, xid, rep, nmp->nm_sent, nmp->nm_cwnd);
1847
1848                        bzero(&msg, sizeof(msg));
1849                        if ((nmp->nm_flag & NFSMNT_NOCONN) == NFSMNT_NOCONN) {
1850                                msg.msg_name = mbuf_data(nmp->nm_nam);
1851                                msg.msg_namelen = mbuf_len(nmp->nm_nam);
1852                        }
1853                        error = sock_sendmbuf(so, &msg, m, MSG_DONTWAIT, NULL);
1854
1855                        FSDBG(535, xid, error, sent, cwnd);
1856
1857                        if (error) {
1858                                if (error == EWOULDBLOCK) {
1859                                        rep->r_flags = flags;
1860                                        rep->r_rexmit = rexmit;
1861                                        nmp->nm_cwnd = cwnd;
1862                                        nmp->nm_sent = sent;
1863                                        rep->r_xid = xid;
1864                                }
1865                                else {
1866                                        if (NFSIGNORE_SOERROR(nmp->nm_sotype, error)) {
1867                                                int clearerror;
1868                                                int optlen = sizeof(clearerror);
1869                                                sock_getsockopt(nmp->nm_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen);
1870                                        }
1871                                        rep->r_flags  = flags | R_RESENDERR;
1872                                        rep->r_rexmit = rexmit;
1873                                        nmp->nm_cwnd = cwnd;
1874                                        nmp->nm_sent = sent;
1875                                        if (flags & R_SENT)
1876                                                OSAddAtomic(-1, (SInt32*)&nfsstats.rpcretries);
1877                                }
1878                        } else
1879                                rep->r_rtt = 0;
1880                }
1881        }
1882        microuptime(&now);
1883#ifndef NFS_NOSERVER
1884        /*
1885         * Scan the write gathering queues for writes that need to be
1886         * completed now.
1887         */
1888        cur_usec = (u_quad_t)now.tv_sec * 1000000 + (u_quad_t)now.tv_usec;
1889        lck_mtx_lock(nfsd_mutex);
1890        TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1891            if (slp->ns_wgtime && (slp->ns_wgtime <= cur_usec))
1892                nfsrv_wakenfsd(slp);
1893        }
1894        while ((slp = TAILQ_FIRST(&nfssvc_deadsockhead))) {
1895                if ((slp->ns_timestamp + 5) > now.tv_sec)
1896                        break;
1897                TAILQ_REMOVE(&nfssvc_deadsockhead, slp, ns_chain);
1898                nfsrv_slpfree(slp);
1899        }
1900        lck_mtx_unlock(nfsd_mutex);
1901#endif /* NFS_NOSERVER */
1902
1903        if (nfsbuffreeuptimestamp + 30 <= now.tv_sec) {
1904                /*
1905                 * We haven't called nfs_buf_freeup() in a little while.
1906                 * So, see if we can free up any stale/unused bufs now.
1907                 */
1908                nfs_buf_freeup(1);
1909        }
1910
1911        timeout(nfs_timer_funnel, (void *)0, nfs_ticks);
1912
1913}
1914
1915
1916/*
1917 * Test for a termination condition pending on the process.
1918 * This is used to determine if we need to bail on a mount.
1919 * EIO is returned if there has been a soft timeout.
1920 * EINTR is returned if there is a signal pending that is not being ignored
1921 * and the mount is interruptable, or if we are a thread that is in the process
1922 * of cancellation (also SIGKILL posted).
1923 */
1924int
1925nfs_sigintr(nmp, rep, p)
1926        struct nfsmount *nmp;
1927        struct nfsreq *rep;
1928        proc_t p;
1929{
1930        sigset_t pending_sigs;
1931        int context_good = 0;
1932        struct nfsmount *repnmp;
1933        extern proc_t kernproc;
1934
1935        if (nmp == NULL)
1936                return (ENXIO);
1937        if (rep != NULL) {
1938                repnmp = rep->r_nmp;
1939                /* we've had a forced unmount. */
1940                if (repnmp == NULL)
1941                        return (ENXIO);
1942                /* request has timed out on a 'soft' mount. */
1943                if (rep->r_flags & R_SOFTTERM)
1944                        return (EIO);
1945                /*
1946                 * We're in the progress of a force unmount and there's
1947                 * been a timeout we're dead and fail IO.
1948                 */
1949                if ((repnmp->nm_state & (NFSSTA_FORCE|NFSSTA_TIMEO)) ==
1950                   (NFSSTA_FORCE|NFSSTA_TIMEO))
1951                        return (EIO);
1952                /* Someone is unmounting us, go soft and mark it. */
1953                if (repnmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) {
1954                        repnmp->nm_flag |= NFSMNT_SOFT;
1955                        nmp->nm_state |= NFSSTA_FORCE;
1956                }
1957                /*
1958                 * If the mount is hung and we've requested not to hang
1959                 * on remote filesystems, then bail now.
1960                 */
1961                if (p != NULL && (proc_noremotehang(p)) != 0 &&
1962                    (repnmp->nm_state & NFSSTA_TIMEO) != 0)
1963                        return (EIO);
1964        }
1965        /* XXX: is this valid?  this probably should be an assertion. */
1966        if (p == NULL)
1967                return (0);
1968
1969        /* Is this thread belongs to kernel task; then abort check  is not needed */
1970        if ((current_proc() != kernproc) && current_thread_aborted()) {
1971                return (EINTR);
1972        }
1973        /* mask off thread and process blocked signals. */
1974
1975        pending_sigs = proc_pendingsignals(p, NFSINT_SIGMASK);
1976        if (pending_sigs && (nmp->nm_flag & NFSMNT_INT) != 0)
1977                return (EINTR);
1978        return (0);
1979}
1980
1981/*
1982 * Lock a socket against others.
1983 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1984 * and also to avoid race conditions between the processes with nfs requests
1985 * in progress when a reconnect is necessary.
1986 */
1987int
1988nfs_sndlock(rep)
1989        struct nfsreq *rep;
1990{
1991        int *statep;
1992        proc_t p;
1993        int error, slpflag = 0, slptimeo = 0;
1994
1995        if (rep->r_nmp == NULL)
1996                return (ENXIO);
1997        statep = &rep->r_nmp->nm_state;
1998
1999        p = rep->r_procp;
2000        if (rep->r_nmp->nm_flag & NFSMNT_INT)
2001                slpflag = PCATCH;
2002        while (*statep & NFSSTA_SNDLOCK) {
2003                error = nfs_sigintr(rep->r_nmp, rep, p);
2004                if (error)
2005                        return (error);
2006                *statep |= NFSSTA_WANTSND;
2007                if (p != NULL && (proc_noremotehang(p)) != 0)
2008                        slptimeo = hz;
2009                tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
2010                if (slpflag == PCATCH) {
2011                        slpflag = 0;
2012                        slptimeo = 2 * hz;
2013                }
2014                /*
2015                 * Make sure while we slept that the mountpoint didn't go away.
2016                 * nfs_sigintr and callers expect it in tact.
2017                 */
2018                if (!rep->r_nmp) 
2019                        return (ENXIO); /* don't have lock until out of loop */
2020        }
2021        *statep |= NFSSTA_SNDLOCK;
2022        return (0);
2023}
2024
2025/*
2026 * Unlock the stream socket for others.
2027 */
2028void
2029nfs_sndunlock(rep)
2030        struct nfsreq *rep;
2031{
2032        int *statep;
2033
2034        if (rep->r_nmp == NULL)
2035                return;
2036        statep = &rep->r_nmp->nm_state;
2037        if ((*statep & NFSSTA_SNDLOCK) == 0)
2038                panic("nfs sndunlock");
2039        *statep &= ~NFSSTA_SNDLOCK;
2040        if (*statep & NFSSTA_WANTSND) {
2041                *statep &= ~NFSSTA_WANTSND;
2042                wakeup((caddr_t)statep);
2043        }
2044}
2045
2046static int
2047nfs_rcvlock(struct nfsreq *rep)
2048{
2049        int *statep;
2050        int error, slpflag, slptimeo = 0;
2051
2052        /* make sure we still have our mountpoint */
2053        if (!rep->r_nmp) {
2054                if (rep->r_mrep != NULL)
2055                        return (EALREADY);
2056                return (ENXIO);
2057        }
2058
2059        statep = &rep->r_nmp->nm_state;
2060        FSDBG_TOP(534, rep->r_xid, rep, rep->r_nmp, *statep);
2061        if (rep->r_nmp->nm_flag & NFSMNT_INT)
2062                slpflag = PCATCH;
2063        else
2064                slpflag = 0;
2065        while (*statep & NFSSTA_RCVLOCK) {
2066                if ((error = nfs_sigintr(rep->r_nmp, rep, rep->r_procp))) {
2067                        FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x100);
2068                        return (error);
2069                } else if (rep->r_mrep != NULL) {
2070                        /*
2071                         * Don't bother sleeping if reply already arrived
2072                         */
2073                        FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x101);
2074                        return (EALREADY);
2075                }
2076                FSDBG(534, rep->r_xid, rep, rep->r_nmp, 0x102);
2077                *statep |= NFSSTA_WANTRCV;
2078                /*
2079                 * We need to poll if we're P_NOREMOTEHANG so that we
2080                 * call nfs_sigintr periodically above.
2081                 */
2082                if (rep->r_procp != NULL &&
2083                    (proc_noremotehang(rep->r_procp)) != 0)
2084                        slptimeo = hz;
2085                tsleep((caddr_t)statep, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
2086                if (slpflag == PCATCH) {
2087                        slpflag = 0;
2088                        slptimeo = 2 * hz;
2089                }
2090                /*
2091                 * Make sure while we slept that the mountpoint didn't go away.
2092                 * nfs_sigintr and caller nfs_reply expect it intact.
2093                 */
2094                if (!rep->r_nmp)  {
2095                        FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, 0x103);
2096                        return (ENXIO); /* don't have lock until out of loop */
2097                }
2098        }
2099        /*
2100         * nfs_reply will handle it if reply already arrived.
2101         * (We may have slept or been preempted).
2102         */
2103        FSDBG_BOT(534, rep->r_xid, rep, rep->r_nmp, *statep);
2104        *statep |= NFSSTA_RCVLOCK;
2105        return (0);
2106}
2107
2108/*
2109 * Unlock the stream socket for others.
2110 */
2111static void
2112nfs_rcvunlock(struct nfsreq *rep)
2113{
2114        int *statep;
2115        
2116        if (rep->r_nmp == NULL)
2117                return;
2118        statep = &rep->r_nmp->nm_state;
2119
2120        FSDBG(533, statep, *statep, 0, 0);
2121        if ((*statep & NFSSTA_RCVLOCK) == 0)
2122                panic("nfs rcvunlock");
2123        *statep &= ~NFSSTA_RCVLOCK;
2124        if (*statep & NFSSTA_WANTRCV) {
2125                *statep &= ~NFSSTA_WANTRCV;
2126                wakeup((caddr_t)statep);
2127        }
2128}
2129
2130
2131#ifndef NFS_NOSERVER
2132/*
2133 * Socket upcall routine for the nfsd sockets.
2134 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2135 * Essentially do as much as possible non-blocking, else punt and it will
2136 * be called with MBUF_WAITOK from an nfsd.
2137 */
2138void
2139nfsrv_rcv(socket_t so, caddr_t arg, int waitflag)
2140{
2141        struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2142
2143        if (!nfs_numnfsd || !(slp->ns_flag & SLP_VALID))
2144                return;
2145
2146        lck_rw_lock_exclusive(&slp->ns_rwlock);
2147        nfsrv_rcv_locked(so, slp, waitflag);
2148        /* Note: ns_rwlock gets dropped when called with MBUF_DONTWAIT */
2149}
2150void
2151nfsrv_rcv_locked(socket_t so, struct nfssvc_sock *slp, int waitflag)
2152{
2153        mbuf_t m, mp, mhck, m2;
2154        int ns_flag=0, error;
2155        struct msghdr   msg;
2156        size_t bytes_read;
2157
2158        if ((slp->ns_flag & SLP_VALID) == 0) {
2159                if (waitflag == MBUF_DONTWAIT)
2160                        lck_rw_done(&slp->ns_rwlock);
2161                return;
2162        }
2163
2164#ifdef notdef
2165        /*
2166         * Define this to test for nfsds handling this under heavy load.
2167         */
2168        if (waitflag == MBUF_DONTWAIT) {
2169                ns_flag = SLP_NEEDQ;
2170                goto dorecs;
2171        }
2172#endif
2173        if (slp->ns_sotype == SOCK_STREAM) {
2174                /*
2175                 * If there are already records on the queue, defer soreceive()
2176                 * to an nfsd so that there is feedback to the TCP layer that
2177                 * the nfs servers are heavily loaded.
2178                 */
2179                if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
2180                        ns_flag = SLP_NEEDQ;
2181                        goto dorecs;
2182                }
2183
2184                /*
2185                 * Do soreceive().
2186                 */
2187                bytes_read = 1000000000;
2188                error = sock_receivembuf(so, NULL, &mp, MSG_DONTWAIT, &bytes_read);
2189                if (error || mp == NULL) {
2190                        if (error == EWOULDBLOCK)
2191                                ns_flag = SLP_NEEDQ;
2192                        else
2193                                ns_flag = SLP_DISCONN;
2194                        goto dorecs;
2195                }
2196                m = mp;
2197                if (slp->ns_rawend) {
2198                        if ((error = mbuf_setnext(slp->ns_rawend, m)))
2199                                panic("nfsrv_rcv: mbuf_setnext failed %d\n", error);
2200                        slp->ns_cc += bytes_read;
2201                } else {
2202                        slp->ns_raw = m;
2203                        slp->ns_cc = bytes_read;
2204                }
2205                while ((m2 = mbuf_next(m)))
2206                        m = m2;
2207                slp->ns_rawend = m;
2208
2209                /*
2210                 * Now try and parse record(s) out of the raw stream data.
2211                 */
2212                error = nfsrv_getstream(slp, waitflag);
2213                if (error) {
2214                        if (error == EPERM)
2215                                ns_flag = SLP_DISCONN;
2216                        else
2217                                ns_flag = SLP_NEEDQ;
2218                }
2219        } else {
2220                struct sockaddr_storage nam;
2221                
2222                bzero(&msg, sizeof(msg));
2223                msg.msg_name = (caddr_t)&nam;
2224                msg.msg_namelen = sizeof(nam);
2225                
2226                do {
2227                        bytes_read = 1000000000;
2228                        error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read);
2229                        if (mp) {
2230                                if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) {
2231                                        mbuf_setlen(mhck, nam.ss_len);
2232                                        bcopy(&nam, mbuf_data(mhck), nam.ss_len);
2233                                        m = mhck;
2234                                        if (mbuf_setnext(m, mp)) {
2235                                                /* trouble... just drop it */
2236                                                printf("nfsrv_rcv: mbuf_setnext failed\n");
2237                                                mbuf_free(mhck);
2238                                                m = mp;
2239                                        }
2240                                } else {
2241                                        m = mp;
2242                                }
2243                                if (slp->ns_recend)
2244                                        mbuf_setnextpkt(slp->ns_recend, m);
2245                                else
2246                                        slp->ns_rec = m;
2247                                slp->ns_recend = m;
2248                                mbuf_setnextpkt(m, NULL);
2249                        }
2250#if 0
2251                        if (error) {
2252                                /*
2253                                 * This may be needed in the future to support
2254                                 * non-byte-stream connection-oriented protocols
2255                                 * such as SCTP.
2256                                 */
2257                                /*
2258                                 * This (slp->ns_sotype == SOCK_STREAM) should really
2259                                 * be a check for PR_CONNREQUIRED.
2260                                 */
2261                                if ((slp->ns_sotype == SOCK_STREAM)
2262                                        && error != EWOULDBLOCK) {
2263                                        ns_flag = SLP_DISCONN;
2264                                        goto dorecs;
2265                                }
2266                        }
2267#endif
2268                } while (mp);
2269        }
2270
2271        /*
2272         * Now try and process the request records, non-blocking.
2273         */
2274dorecs:
2275        if (ns_flag)
2276                slp->ns_flag |= ns_flag;
2277        if (waitflag == MBUF_DONTWAIT) {
2278                int wake = (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)));
2279                lck_rw_done(&slp->ns_rwlock);
2280                if (wake && nfs_numnfsd) {
2281                        lck_mtx_lock(nfsd_mutex);
2282                        nfsrv_wakenfsd(slp);
2283                        lck_mtx_unlock(nfsd_mutex);
2284                }
2285        }
2286}
2287
2288/*
2289 * Try and extract an RPC request from the mbuf data list received on a
2290 * stream socket. The "waitflag" argument indicates whether or not it
2291 * can sleep.
2292 */
2293static int
2294nfsrv_getstream(slp, waitflag)
2295        struct nfssvc_sock *slp;
2296        int waitflag;
2297{
2298        mbuf_t m;
2299        char *cp1, *cp2, *mdata;
2300        int len, mlen, error;
2301        mbuf_t om, m2, recm;
2302        u_long recmark;
2303
2304        if (slp->ns_flag & SLP_GETSTREAM)
2305                panic("nfs getstream");
2306        slp->ns_flag |= SLP_GETSTREAM;
2307        for (;;) {
2308            if (slp->ns_reclen == 0) {
2309                if (slp->ns_cc < NFSX_UNSIGNED) {
2310                        slp->ns_flag &= ~SLP_GETSTREAM;
2311                        return (0);
2312                }
2313                m = slp->ns_raw;
2314                mdata = mbuf_data(m);
2315                mlen = mbuf_len(m);
2316                if (mlen >= NFSX_UNSIGNED) {
2317                        bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED);
2318                        mdata += NFSX_UNSIGNED;
2319                        mlen -= NFSX_UNSIGNED;
2320                        mbuf_setdata(m, mdata, mlen);
2321                } else {
2322                        cp1 = (caddr_t)&recmark;
2323                        cp2 = mdata;
2324                        while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2325                                while (mlen == 0) {
2326                                        m = mbuf_next(m);
2327                                        cp2 = mbuf_data(m);
2328                                        mlen = mbuf_len(m);
2329                                }
2330                                *cp1++ = *cp2++;
2331                                mlen--;
2332                                mbuf_setdata(m, cp2, mlen);
2333                        }
2334                }
2335                slp->ns_cc -= NFSX_UNSIGNED;
2336                recmark = ntohl(recmark);
2337                slp->ns_reclen = recmark & ~0x80000000;
2338                if (recmark & 0x80000000)
2339                        slp->ns_flag |= SLP_LASTFRAG;
2340                else
2341                        slp->ns_flag &= ~SLP_LASTFRAG;
2342                if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
2343                        slp->ns_flag &= ~SLP_GETSTREAM;
2344                        return (EPERM);
2345                }
2346            }
2347
2348            /*
2349             * Now get the record part.
2350             *
2351             * Note that slp->ns_reclen may be 0.  Linux sometimes
2352             * generates 0-length RPCs
2353             */
2354            recm = NULL;
2355            if (slp->ns_cc == slp->ns_reclen) {
2356                recm = slp->ns_raw;
2357                slp->ns_raw = slp->ns_rawend = NULL;
2358                slp->ns_cc = slp->ns_reclen = 0;
2359            } else if (slp->ns_cc > slp->ns_reclen) {
2360                len = 0;
2361                m = slp->ns_raw;
2362                mlen = mbuf_len(m);
2363                mdata = mbuf_data(m);
2364                om = NULL;
2365                while (len < slp->ns_reclen) {
2366                        if ((len + mlen) > slp->ns_reclen) {
2367                                if (mbuf_copym(m, 0, slp->ns_reclen - len, waitflag, &m2)) {
2368                                        slp->ns_flag &= ~SLP_GETSTREAM;
2369                                        return (EWOULDBLOCK);
2370                                }
2371                                if (om) {
2372                                        if (mbuf_setnext(om, m2)) {
2373                                                /* trouble... just drop it */
2374                                                printf("nfsrv_getstream: mbuf_setnext failed\n");
2375                                                mbuf_freem(m2);
2376                                                slp->ns_flag &= ~SLP_GETSTREAM;
2377                                                return (EWOULDBLOCK);
2378                                        }
2379                                        recm = slp->ns_raw;
2380                                } else {
2381                                        recm = m2;
2382                                }
2383                                mdata += slp->ns_reclen - len;
2384                                mlen -= slp->ns_reclen - len;
2385                                mbuf_setdata(m, mdata, mlen);
2386                                len = slp->ns_reclen;
2387                        } else if ((len + mlen) == slp->ns_reclen) {
2388                                om = m;
2389                                len += mlen;
2390                                m = mbuf_next(m);
2391                                recm = slp->ns_raw;
2392                                if (mbuf_setnext(om, NULL)) {
2393                                        printf("nfsrv_getstream: mbuf_setnext failed 2\n");
2394                                        slp->ns_flag &= ~SLP_GETSTREAM;
2395                                        return (EWOULDBLOCK);
2396                                }
2397                                mlen = mbuf_len(m);
2398                                mdata = mbuf_data(m);
2399                        } else {
2400                                om = m;
2401                                len += mlen;
2402                                m = mbuf_next(m);
2403                                mlen = mbuf_len(m);
2404                                mdata = mbuf_data(m);
2405                        }
2406                }
2407                slp->ns_raw = m;
2408                slp->ns_cc -= len;
2409                slp->ns_reclen = 0;
2410            } else {
2411                slp->ns_flag &= ~SLP_GETSTREAM;
2412                return (0);
2413            }
2414
2415            /*
2416             * Accumulate the fragments into a record.
2417             */
2418            if (slp->ns_frag == NULL) {
2419                slp->ns_frag = recm;
2420            } else {
2421                m = slp->ns_frag;
2422                while ((m2 = mbuf_next(m)))
2423                    m = m2;
2424                if ((error = mbuf_setnext(m, recm)))
2425                    panic("nfsrv_getstream: mbuf_setnext failed 3, %d\n", error);
2426            }
2427            if (slp->ns_flag & SLP_LASTFRAG) {
2428                if (slp->ns_recend)
2429                    mbuf_setnextpkt(slp->ns_recend, slp->ns_frag);
2430                else
2431                    slp->ns_rec = slp->ns_frag;
2432                slp->ns_recend = slp->ns_frag;
2433                slp->ns_frag = NULL;
2434            }
2435        }
2436}
2437
2438/*
2439 * Parse an RPC header.
2440 */
2441int
2442nfsrv_dorec(slp, nfsd, ndp)
2443        struct nfssvc_sock *slp;
2444        struct nfsd *nfsd;
2445        struct nfsrv_descript **ndp;
2446{
2447        mbuf_t m;
2448        mbuf_t nam;
2449        struct nfsrv_descript *nd;
2450        int error;
2451
2452        *ndp = NULL;
2453        if ((slp->ns_flag & SLP_VALID) == 0 || (slp->ns_rec == NULL))
2454                return (ENOBUFS);
2455        MALLOC_ZONE(nd, struct nfsrv_descript *,
2456                        sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
2457        if (!nd)
2458                return (ENOMEM);
2459        m = slp->ns_rec;
2460        slp->ns_rec = mbuf_nextpkt(m);
2461        if (slp->ns_rec)
2462                mbuf_setnextpkt(m, NULL);
2463        else
2464                slp->ns_recend = NULL;
2465        if (mbuf_type(m) == MBUF_TYPE_SONAME) {
2466                nam = m;
2467                m = mbuf_next(m);
2468                if ((error = mbuf_setnext(nam, NULL)))
2469                        panic("nfsrv_dorec: mbuf_setnext failed %d\n", error);
2470        } else
2471                nam = NULL;
2472        nd->nd_md = nd->nd_mrep = m;
2473        nd->nd_nam2 = nam;
2474        nd->nd_dpos = mbuf_data(m);
2475        error = nfs_getreq(nd, nfsd, TRUE);
2476        if (error) {
2477                if (nam)
2478                        mbuf_freem(nam);
2479                FREE_ZONE((caddr_t)nd,  sizeof *nd, M_NFSRVDESC);
2480                return (error);
2481        }
2482        *ndp = nd;
2483        nfsd->nfsd_nd = nd;
2484        return (0);
2485}
2486
2487/*
2488 * Parse an RPC request
2489 * - verify it
2490 * - fill in the cred struct.
2491 */
2492int
2493nfs_getreq(nd, nfsd, has_header)
2494        struct nfsrv_descript *nd;
2495        struct nfsd *nfsd;
2496        int has_header;
2497{
2498        int len, i;
2499        u_long *tl;
2500        long t1;
2501        uio_t uiop;
2502        caddr_t dpos, cp2, cp;
2503        u_long nfsvers, auth_type;
2504        uid_t nickuid;
2505        int error = 0, ticklen;
2506        mbuf_t mrep, md;
2507        struct nfsuid *nuidp;
2508        uid_t user_id;
2509        gid_t group_id;
2510        int ngroups;
2511        struct ucred temp_cred;
2512        struct timeval tvin, tvout, now;
2513        char uio_buf[ UIO_SIZEOF(1) ];
2514#if 0                           /* until encrypted keys are implemented */
2515        NFSKERBKEYSCHED_T keys; /* stores key schedule */
2516#endif
2517
2518        nd->nd_cr = NULL;
2519
2520        mrep = nd->nd_mrep;
2521        md = nd->nd_md;
2522        dpos = nd->nd_dpos;
2523        if (has_header) {
2524                nfsm_dissect(tl, u_long *, 10 * NFSX_UNSIGNED);
2525                nd->nd_retxid = fxdr_unsigned(u_long, *tl++);
2526                if (*tl++ != rpc_call) {
2527                        mbuf_freem(mrep);
2528                        return (EBADRPC);
2529                }
2530        } else
2531                nfsm_dissect(tl, u_long *, 8 * NFSX_UNSIGNED);
2532        nd->nd_repstat = 0;
2533        nd->nd_flag = 0;
2534        if (*tl++ != rpc_vers) {
2535                nd->nd_repstat = ERPCMISMATCH;
2536                nd->nd_procnum = NFSPROC_NOOP;
2537                return (0);
2538        }
2539        if (*tl != nfs_prog) {
2540                nd->nd_repstat = EPROGUNAVAIL;
2541                nd->nd_procnum = NFSPROC_NOOP;
2542                return (0);
2543        }
2544        tl++;
2545        nfsvers = fxdr_unsigned(u_long, *tl++);
2546        if ((nfsvers < NFS_VER2) || (nfsvers > NFS_VER3)) {
2547                nd->nd_repstat = EPROGMISMATCH;
2548                nd->nd_procnum = NFSPROC_NOOP;
2549                return (0);
2550        }
2551        else if (nfsvers == NFS_VER3)
2552                nd->nd_flag = ND_NFSV3;
2553        nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
2554        if (nd->nd_procnum == NFSPROC_NULL)
2555                return (0);
2556        if ((nd->nd_procnum >= NFS_NPROCS) ||
2557                (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2558                nd->nd_repstat = EPROCUNAVAIL;
2559                nd->nd_procnum = NFSPROC_NOOP;
2560                return (0);
2561        }
2562        if ((nd->nd_flag & ND_NFSV3) == 0)
2563                nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2564        auth_type = *tl++;
2565        len = fxdr_unsigned(int, *tl++);
2566        if (len < 0 || len > RPCAUTH_MAXSIZ) {
2567                mbuf_freem(mrep);
2568                return (EBADRPC);
2569        }
2570
2571        nd->nd_flag &= ~ND_KERBAUTH;
2572        /*
2573         * Handle auth_unix or auth_kerb.
2574         */
2575        if (auth_type == rpc_auth_unix) {
2576                len = fxdr_unsigned(int, *++tl);
2577                if (len < 0 || len > NFS_MAXNAMLEN) {
2578                        mbuf_freem(mrep);
2579                        return (EBADRPC);
2580                }
2581                bzero(&temp_cred, sizeof(temp_cred));
2582                nfsm_adv(nfsm_rndup(len));
2583                nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2584                user_id = fxdr_unsigned(uid_t, *tl++);
2585                group_id = fxdr_unsigned(gid_t, *tl++);
2586                temp_cred.cr_groups[0] = group_id;
2587                len = fxdr_unsigned(int, *tl);
2588                if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2589                        mbuf_freem(mrep);
2590                        return (EBADRPC);
2591                }
2592                nfsm_dissect(tl, u_long *, (len + 2) * NFSX_UNSIGNED);
2593                for (i = 1; i <= len; i++)
2594                    if (i < NGROUPS)
2595                        temp_cred.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2596                    else
2597                        tl++;
2598                ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2599                if (ngroups > 1)
2600                    nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2601                len = fxdr_unsigned(int, *++tl);
2602                if (len < 0 || len > RPCAUTH_MAXSIZ) {
2603                        mbuf_freem(mrep);
2604                        return (EBADRPC);
2605                }
2606                temp_cred.cr_uid = user_id;
2607                temp_cred.cr_ngroups = ngroups;
2608                nd->nd_cr = kauth_cred_create(&temp_cred); 
2609                if (nd->nd_cr == NULL) {
2610                        nd->nd_repstat = ENOMEM;
2611                        nd->nd_procnum = NFSPROC_NOOP;
2612                        return (0);
2613                }
2614                if (len > 0)
2615                        nfsm_adv(nfsm_rndup(len));
2616        } else if (auth_type == rpc_auth_kerb) {
2617                switch (fxdr_unsigned(int, *tl++)) {
2618                case RPCAKN_FULLNAME:
2619                        ticklen = fxdr_unsigned(int, *tl);
2620                        *((u_long *)nfsd->nfsd_authstr) = *tl;
2621                        uiop = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, 
2622                                                &uio_buf[0], sizeof(uio_buf));
2623                        if (!uiop) {
2624                                nd->nd_repstat = ENOMEM;
2625                                nd->nd_procnum = NFSPROC_NOOP;
2626                                return (0);
2627                        }
2628
2629                        // LP64todo - fix this
2630                        nfsd->nfsd_authlen = (nfsm_rndup(ticklen) + (NFSX_UNSIGNED * 2));
2631                        if ((nfsm_rndup(ticklen) + NFSX_UNSIGNED) > (len - 2 * NFSX_UNSIGNED)) {
2632                                mbuf_freem(mrep);
2633                                return (EBADRPC);
2634                        }
2635                        uio_addiov(uiop, CAST_USER_ADDR_T(&nfsd->nfsd_authstr[4]), RPCAUTH_MAXSIZ - 4);
2636                        // LP64todo - fix this
2637                        nfsm_mtouio(uiop, uio_resid(uiop));
2638                        nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2639                        if (*tl++ != rpc_auth_kerb ||
2640                                fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2641                                printf("Bad kerb verifier\n");
2642                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2643                                nd->nd_procnum = NFSPROC_NOOP;
2644                                return (0);
2645                        }
2646                        nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
2647                        tl = (u_long *)cp;
2648                        if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2649                                printf("Not fullname kerb verifier\n");
2650                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2651                                nd->nd_procnum = NFSPROC_NOOP;
2652                                return (0);
2653                        }
2654                        cp += NFSX_UNSIGNED;
2655                        bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2656                        nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2657                        nd->nd_flag |= ND_KERBFULL;
2658                        nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2659                        break;
2660                case RPCAKN_NICKNAME:
2661                        if (len != 2 * NFSX_UNSIGNED) {
2662                                printf("Kerb nickname short\n");
2663                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2664                                nd->nd_procnum = NFSPROC_NOOP;
2665                                return (0);
2666                        }
2667                        nickuid = fxdr_unsigned(uid_t, *tl);
2668                        nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
2669                        if (*tl++ != rpc_auth_kerb ||
2670                                fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2671                                printf("Kerb nick verifier bad\n");
2672                                nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2673                                nd->nd_procnum = NFSPROC_NOOP;
2674                                return (0);
2675                        }
2676                        nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED);
2677                        tvin.tv_sec = *tl++;
2678                        tvin.tv_usec = *tl;
2679
2680                        for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2681                            nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2682                                if (kauth_cred_getuid(nuidp->nu_cr) == nickuid &&
2683                                    (!nd->nd_nam2 ||
2684                                     netaddr_match(NU_NETFAM(nuidp),
2685                                      &nuidp->nu_haddr, nd->nd_nam2)))
2686                                        break;
2687                        }
2688                        if (!nuidp) {
2689                                nd->nd_repstat =
2690                                        (NFSERR_AUTHERR|AUTH_REJECTCRED);
2691                                nd->nd_procnum = NFSPROC_NOOP;
2692                                return (0);
2693                        }
2694
2695                        /*
2696                         * Now, decrypt the timestamp using the session key
2697                         * and validate it.
2698                         */
2699#if NFSKERB
2700                        XXX
2701#endif
2702
2703                        tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2704                        tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2705                        microtime(&now);
2706                        if (nuidp->nu_expire < now.tv_sec ||
2707                            nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2708                            (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2709                             nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2710                                nuidp->nu_expire = 0;
2711                                nd->nd_repstat =
2712                                    (NFSERR_AUTHERR|AUTH_REJECTVERF);
2713                                nd->nd_procnum = NFSPROC_NOOP;
2714                                return (0);
2715                        }
2716                        bzero(&temp_cred, sizeof(temp_cred));
2717                        ngroups = nuidp->nu_cr->cr_ngroups;
2718                        for (i = 0; i < ngroups; i++)
2719                                temp_cred.cr_groups[i] = nuidp->nu_cr->cr_groups[i];
2720                        if (ngroups > 1)
2721                                nfsrvw_sort(&temp_cred.cr_groups[0], ngroups);
2722
2723                        temp_cred.cr_uid = kauth_cred_getuid(nuidp->nu_cr);
2724                        temp_cred.cr_ngroups = ngroups;
2725                        nd->nd_cr = kauth_cred_create(&temp_cred); 
2726                        if (!nd->nd_cr) {
2727                                nd->nd_repstat = ENOMEM;
2728                                nd->nd_procnum = NFSPROC_NOOP;
2729                                return (0);
2730                        }
2731                        nd->nd_flag |= ND_KERBNICK;
2732                };
2733        } else {
2734                nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2735                nd->nd_procnum = NFSPROC_NOOP;
2736                return (0);
2737        }
2738
2739        nd->nd_md = md;
2740        nd->nd_dpos = dpos;
2741        return (0);
2742nfsmout:
2743        if (nd->nd_cr)
2744                kauth_cred_rele(nd->nd_cr);
2745        return (error);
2746}
2747
2748/*
2749 * Search for a sleeping nfsd and wake it up.
2750 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
2751 * running nfsds will go look for the work in the nfssvc_sock list.
2752 * Note: Must be called with nfsd_mutex held.
2753 */
2754void
2755nfsrv_wakenfsd(struct nfssvc_sock *slp)
2756{
2757        struct nfsd *nd;
2758
2759        if ((slp->ns_flag & SLP_VALID) == 0)
2760                return;
2761
2762        lck_rw_lock_exclusive(&slp->ns_rwlock);
2763
2764        if (nfsd_waiting) {
2765                TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2766                        if (nd->nfsd_flag & NFSD_WAITING) {
2767                                nd->nfsd_flag &= ~NFSD_WAITING;
2768                                if (nd->nfsd_slp)
2769                                        panic("nfsd wakeup");
2770                                slp->ns_sref++;
2771                                nd->nfsd_slp = slp;
2772                                lck_rw_done(&slp->ns_rwlock);
2773                                wakeup((caddr_t)nd);
2774                                return;
2775                        }
2776                }
2777        }
2778
2779        slp->ns_flag |= SLP_DOREC;
2780
2781        lck_rw_done(&slp->ns_rwlock);
2782
2783        nfsd_head_flag |= NFSD_CHECKSLP;
2784}
2785#endif /* NFS_NOSERVER */
2786
2787static int
2788nfs_msg(proc_t p,
2789        const char *server,
2790        const char *msg,
2791        int error)
2792{
2793        tpr_t tpr;
2794
2795        if (p)
2796                tpr = tprintf_open(p);
2797        else
2798                tpr = NULL;
2799        if (error)
2800                tprintf(tpr, "nfs server %s: %s, error %d\n", server, msg,
2801                    error);
2802        else
2803                tprintf(tpr, "nfs server %s: %s\n", server, msg);
2804        tprintf_close(tpr);
2805        return (0);
2806}
2807
2808void
2809nfs_down(nmp, proc, error, flags, msg)
2810        struct nfsmount *nmp;
2811        proc_t proc;
2812        int error, flags;
2813        const char *msg;
2814{
2815        if (nmp == NULL)
2816                return;
2817        if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) {
2818                vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 0);
2819                nmp->nm_state |= NFSSTA_TIMEO;
2820        }
2821        if ((flags & NFSSTA_LOCKTIMEO) && !(nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2822                vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 0);
2823                nmp->nm_state |= NFSSTA_LOCKTIMEO;
2824        }
2825        nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, error);
2826}
2827
2828void
2829nfs_up(nmp, proc, flags, msg)
2830        struct nfsmount *nmp;
2831        proc_t proc;
2832        int flags;
2833        const char *msg;
2834{
2835        if (nmp == NULL)
2836                return;
2837        if (msg)
2838                nfs_msg(proc, vfs_statfs(nmp->nm_mountp)->f_mntfromname, msg, 0);
2839        if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) {
2840                nmp->nm_state &= ~NFSSTA_TIMEO;
2841                vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESP, 1);
2842        }
2843        if ((flags & NFSSTA_LOCKTIMEO) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) {
2844                nmp->nm_state &= ~NFSSTA_LOCKTIMEO;
2845                vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_NOTRESPLOCK, 1);
2846        }
2847}
2848
2849
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.