1d96b98a3SKenneth D. Merry /*- 2d63027b6SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3d63027b6SPedro F. Giffuni * 4d96b98a3SKenneth D. Merry * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ 5d96b98a3SKenneth D. Merry * Copyright (c) 2013 Spectra Logic Corporation 6d96b98a3SKenneth D. Merry * 7d96b98a3SKenneth D. Merry * Redistribution and use in source and binary forms, with or without 8d96b98a3SKenneth D. Merry * modification, are permitted provided that the following conditions 9d96b98a3SKenneth D. Merry * are met: 10d96b98a3SKenneth D. Merry * 1. Redistributions of source code must retain the above copyright 11d96b98a3SKenneth D. Merry * notice, this list of conditions and the following disclaimer. 12d96b98a3SKenneth D. Merry * 2. Redistributions in binary form must reproduce the above copyright 13d96b98a3SKenneth D. Merry * notice, this list of conditions and the following disclaimer in the 14d96b98a3SKenneth D. Merry * documentation and/or other materials provided with the distribution. 15d96b98a3SKenneth D. Merry * 16d96b98a3SKenneth D. Merry * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17d96b98a3SKenneth D. Merry * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18d96b98a3SKenneth D. Merry * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19d96b98a3SKenneth D. Merry * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20d96b98a3SKenneth D. Merry * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21d96b98a3SKenneth D. Merry * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22d96b98a3SKenneth D. Merry * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23d96b98a3SKenneth D. Merry * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24d96b98a3SKenneth D. Merry * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25d96b98a3SKenneth D. Merry * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26d96b98a3SKenneth D. Merry * SUCH DAMAGE. 27d96b98a3SKenneth D. Merry */ 28d96b98a3SKenneth D. Merry 29d96b98a3SKenneth D. Merry #include <sys/cdefs.h> 30d96b98a3SKenneth D. Merry __FBSDID("$FreeBSD$"); 31d96b98a3SKenneth D. Merry 329897e357SRick Macklem #include <sys/types.h> 339897e357SRick Macklem #include <sys/mbuf.h> 349897e357SRick Macklem #include <sys/sbuf.h> 359897e357SRick Macklem 36d96b98a3SKenneth D. Merry #include <fs/nfs/nfsport.h> 379897e357SRick Macklem #include <fs/nfsserver/nfs_fha_new.h> 38d96b98a3SKenneth D. Merry 39d96b98a3SKenneth D. Merry #include <rpc/rpc.h> 409897e357SRick Macklem 419897e357SRick Macklem static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA"); 42d96b98a3SKenneth D. Merry 43d96b98a3SKenneth D. Merry static void fhanew_init(void *foo); 44d96b98a3SKenneth D. Merry static void fhanew_uninit(void *foo); 459897e357SRick Macklem static rpcproc_t fhanew_get_procnum(rpcproc_t procnum); 469897e357SRick Macklem static int fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, 479897e357SRick Macklem caddr_t *dpos); 489897e357SRick Macklem static int fhanew_is_read(rpcproc_t procnum); 499897e357SRick Macklem static int fhanew_is_write(rpcproc_t procnum); 509897e357SRick Macklem static int fhanew_get_offset(struct mbuf **md, caddr_t *dpos, 519897e357SRick Macklem int v3, struct fha_info *info); 529897e357SRick Macklem static int fhanew_no_offset(rpcproc_t procnum); 539897e357SRick Macklem static void fhanew_set_locktype(rpcproc_t procnum, 54d96b98a3SKenneth D. Merry struct fha_info *info); 55d96b98a3SKenneth D. Merry static int fhenew_stats_sysctl(SYSCTL_HANDLER_ARGS); 569897e357SRick Macklem static void fha_extract_info(struct svc_req *req, 579897e357SRick Macklem struct fha_info *i); 58d96b98a3SKenneth D. Merry 59cd406ac9SRick Macklem NFSD_VNET_DEFINE_STATIC(struct fha_params *, fhanew_softc); 60*a90b47abSRick Macklem NFSD_VNET_DEFINE_STATIC(struct fha_ctls, nfsfha_ctls); 61d96b98a3SKenneth D. Merry 62d96b98a3SKenneth D. Merry SYSCTL_DECL(_vfs_nfsd); 63*a90b47abSRick Macklem SYSCTL_NODE(_vfs_nfsd, OID_AUTO, fha, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 64*a90b47abSRick Macklem "NFS File Handle Affinity (FHA)"); 65*a90b47abSRick Macklem 66*a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha, 67*a90b47abSRick Macklem OID_AUTO, enable, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN, 68*a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).enable, 0, 69*a90b47abSRick Macklem "Enable NFS File Handle Affinity (FHA)"); 70*a90b47abSRick Macklem 71*a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha, 72*a90b47abSRick Macklem OID_AUTO, read, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN, 73*a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).read, 0, 74*a90b47abSRick Macklem "Enable NFS FHA read locality"); 75*a90b47abSRick Macklem 76*a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha, 77*a90b47abSRick Macklem OID_AUTO, write, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN, 78*a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).write, 0, 79*a90b47abSRick Macklem "Enable NFS FHA write locality"); 80*a90b47abSRick Macklem 81*a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha, 82*a90b47abSRick Macklem OID_AUTO, bin_shift, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN, 83*a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).bin_shift, 0, 84*a90b47abSRick Macklem "Maximum locality distance 2^(bin_shift) bytes"); 85*a90b47abSRick Macklem 86*a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha, 87*a90b47abSRick Macklem OID_AUTO, max_nfsds_per_fh, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN, 88*a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).max_nfsds_per_fh, 0, 89*a90b47abSRick Macklem "Maximum nfsd threads that " 90*a90b47abSRick Macklem "should be working on requests for the same file handle"); 91*a90b47abSRick Macklem 92*a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha, 93*a90b47abSRick Macklem OID_AUTO, max_reqs_per_nfsd, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN, 94*a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).max_reqs_per_nfsd, 0, "Maximum requests that " 95*a90b47abSRick Macklem "single nfsd thread should be working on at any time"); 96*a90b47abSRick Macklem 97*a90b47abSRick Macklem SYSCTL_PROC(_vfs_nfsd_fha, OID_AUTO, fhe_stats, 98*a90b47abSRick Macklem CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, 99*a90b47abSRick Macklem fhenew_stats_sysctl, "A", ""); 100d96b98a3SKenneth D. Merry 101d96b98a3SKenneth D. Merry extern int newnfs_nfsv3_procid[]; 1027e44856eSRick Macklem 103ed03776cSRick Macklem VNET_SYSINIT(nfs_fhanew, SI_SUB_VNET_DONE, SI_ORDER_ANY, fhanew_init, NULL); 104ed03776cSRick Macklem VNET_SYSUNINIT(nfs_fhanew, SI_SUB_VNET_DONE, SI_ORDER_ANY, fhanew_uninit, NULL); 105d96b98a3SKenneth D. Merry 106d96b98a3SKenneth D. Merry static void 107d96b98a3SKenneth D. Merry fhanew_init(void *foo) 108d96b98a3SKenneth D. Merry { 109d96b98a3SKenneth D. Merry struct fha_params *softc; 1109897e357SRick Macklem int i; 111d96b98a3SKenneth D. Merry 112cd406ac9SRick Macklem NFSD_VNET(fhanew_softc) = malloc(sizeof(struct fha_params), M_TEMP, 113cd406ac9SRick Macklem M_WAITOK | M_ZERO); 114cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc); 115d96b98a3SKenneth D. Merry 116d96b98a3SKenneth D. Merry snprintf(softc->server_name, sizeof(softc->server_name), 117d96b98a3SKenneth D. Merry FHANEW_SERVER_NAME); 118d96b98a3SKenneth D. Merry 1199897e357SRick Macklem for (i = 0; i < FHA_HASH_SIZE; i++) 1209897e357SRick Macklem mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF); 1219897e357SRick Macklem 1229897e357SRick Macklem /* 1239897e357SRick Macklem * Set the default tuning parameters. 1249897e357SRick Macklem */ 125*a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).enable = FHA_DEF_ENABLE; 126*a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).read = FHA_DEF_READ; 127*a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).write = FHA_DEF_WRITE; 128*a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).bin_shift = FHA_DEF_BIN_SHIFT; 129*a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH; 130*a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD; 1319897e357SRick Macklem 132cd406ac9SRick Macklem } 133d96b98a3SKenneth D. Merry 134d96b98a3SKenneth D. Merry static void 135d96b98a3SKenneth D. Merry fhanew_uninit(void *foo) 136d96b98a3SKenneth D. Merry { 137d96b98a3SKenneth D. Merry struct fha_params *softc; 1389897e357SRick Macklem int i; 139d96b98a3SKenneth D. Merry 140cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc); 141d96b98a3SKenneth D. Merry 1429897e357SRick Macklem for (i = 0; i < FHA_HASH_SIZE; i++) 1439897e357SRick Macklem mtx_destroy(&softc->fha_hash[i].mtx); 144cd406ac9SRick Macklem free(softc, M_TEMP); 145d96b98a3SKenneth D. Merry } 146d96b98a3SKenneth D. Merry 1479897e357SRick Macklem static rpcproc_t 148d96b98a3SKenneth D. Merry fhanew_get_procnum(rpcproc_t procnum) 149d96b98a3SKenneth D. Merry { 150d96b98a3SKenneth D. Merry if (procnum > NFSV2PROC_STATFS) 151d96b98a3SKenneth D. Merry return (-1); 152d96b98a3SKenneth D. Merry 153d96b98a3SKenneth D. Merry return (newnfs_nfsv3_procid[procnum]); 154d96b98a3SKenneth D. Merry } 155d96b98a3SKenneth D. Merry 1569897e357SRick Macklem static int 15710f8f58dSAlexander Motin fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, caddr_t *dpos) 158d96b98a3SKenneth D. Merry { 159d96b98a3SKenneth D. Merry struct nfsrv_descript lnd, *nd; 160d96b98a3SKenneth D. Merry uint32_t *tl; 16110f8f58dSAlexander Motin uint8_t *buf; 16210f8f58dSAlexander Motin uint64_t t; 16310f8f58dSAlexander Motin int error, len, i; 164d96b98a3SKenneth D. Merry 165d96b98a3SKenneth D. Merry error = 0; 166d96b98a3SKenneth D. Merry len = 0; 167d96b98a3SKenneth D. Merry nd = &lnd; 168d96b98a3SKenneth D. Merry 169d96b98a3SKenneth D. Merry nd->nd_md = *md; 170d96b98a3SKenneth D. Merry nd->nd_dpos = *dpos; 171d96b98a3SKenneth D. Merry 172d96b98a3SKenneth D. Merry if (v3) { 173d96b98a3SKenneth D. Merry NFSM_DISSECT_NONBLOCK(tl, uint32_t *, NFSX_UNSIGNED); 174d96b98a3SKenneth D. Merry if ((len = fxdr_unsigned(int, *tl)) <= 0 || len > NFSX_FHMAX) { 175d96b98a3SKenneth D. Merry error = EBADRPC; 176d96b98a3SKenneth D. Merry goto nfsmout; 177d96b98a3SKenneth D. Merry } 178d96b98a3SKenneth D. Merry } else { 179d96b98a3SKenneth D. Merry len = NFSX_V2FH; 180d96b98a3SKenneth D. Merry } 181d96b98a3SKenneth D. Merry 18210f8f58dSAlexander Motin t = 0; 183d96b98a3SKenneth D. Merry if (len != 0) { 18410f8f58dSAlexander Motin NFSM_DISSECT_NONBLOCK(buf, uint8_t *, len); 18510f8f58dSAlexander Motin for (i = 0; i < len; i++) 18610f8f58dSAlexander Motin t ^= ((uint64_t)buf[i] << (i & 7) * 8); 18710f8f58dSAlexander Motin } 18810f8f58dSAlexander Motin *fh = t; 189d96b98a3SKenneth D. Merry 190d96b98a3SKenneth D. Merry nfsmout: 191d96b98a3SKenneth D. Merry *md = nd->nd_md; 192d96b98a3SKenneth D. Merry *dpos = nd->nd_dpos; 193d96b98a3SKenneth D. Merry 194d96b98a3SKenneth D. Merry return (error); 195d96b98a3SKenneth D. Merry } 196d96b98a3SKenneth D. Merry 1979897e357SRick Macklem static int 198d96b98a3SKenneth D. Merry fhanew_is_read(rpcproc_t procnum) 199d96b98a3SKenneth D. Merry { 200d96b98a3SKenneth D. Merry if (procnum == NFSPROC_READ) 201d96b98a3SKenneth D. Merry return (1); 202d96b98a3SKenneth D. Merry else 203d96b98a3SKenneth D. Merry return (0); 204d96b98a3SKenneth D. Merry } 205d96b98a3SKenneth D. Merry 2069897e357SRick Macklem static int 207d96b98a3SKenneth D. Merry fhanew_is_write(rpcproc_t procnum) 208d96b98a3SKenneth D. Merry { 209d96b98a3SKenneth D. Merry if (procnum == NFSPROC_WRITE) 210d96b98a3SKenneth D. Merry return (1); 211d96b98a3SKenneth D. Merry else 212d96b98a3SKenneth D. Merry return (0); 213d96b98a3SKenneth D. Merry } 214d96b98a3SKenneth D. Merry 2159897e357SRick Macklem static int 216d96b98a3SKenneth D. Merry fhanew_get_offset(struct mbuf **md, caddr_t *dpos, int v3, 217d96b98a3SKenneth D. Merry struct fha_info *info) 218d96b98a3SKenneth D. Merry { 219d96b98a3SKenneth D. Merry struct nfsrv_descript lnd, *nd; 220d96b98a3SKenneth D. Merry uint32_t *tl; 221d96b98a3SKenneth D. Merry int error; 222d96b98a3SKenneth D. Merry 223d96b98a3SKenneth D. Merry error = 0; 224d96b98a3SKenneth D. Merry 225d96b98a3SKenneth D. Merry nd = &lnd; 226d96b98a3SKenneth D. Merry nd->nd_md = *md; 227d96b98a3SKenneth D. Merry nd->nd_dpos = *dpos; 228d96b98a3SKenneth D. Merry 229d96b98a3SKenneth D. Merry if (v3) { 230d96b98a3SKenneth D. Merry NFSM_DISSECT_NONBLOCK(tl, uint32_t *, 2 * NFSX_UNSIGNED); 231d96b98a3SKenneth D. Merry info->offset = fxdr_hyper(tl); 232d96b98a3SKenneth D. Merry } else { 233d96b98a3SKenneth D. Merry NFSM_DISSECT_NONBLOCK(tl, uint32_t *, NFSX_UNSIGNED); 234d96b98a3SKenneth D. Merry info->offset = fxdr_unsigned(uint32_t, *tl); 235d96b98a3SKenneth D. Merry } 236d96b98a3SKenneth D. Merry 237d96b98a3SKenneth D. Merry nfsmout: 238d96b98a3SKenneth D. Merry *md = nd->nd_md; 239d96b98a3SKenneth D. Merry *dpos = nd->nd_dpos; 240d96b98a3SKenneth D. Merry 241d96b98a3SKenneth D. Merry return (error); 242d96b98a3SKenneth D. Merry } 243d96b98a3SKenneth D. Merry 2449897e357SRick Macklem static int 245d96b98a3SKenneth D. Merry fhanew_no_offset(rpcproc_t procnum) 246d96b98a3SKenneth D. Merry { 247d96b98a3SKenneth D. Merry if (procnum == NFSPROC_FSSTAT || 248d96b98a3SKenneth D. Merry procnum == NFSPROC_FSINFO || 249d96b98a3SKenneth D. Merry procnum == NFSPROC_PATHCONF || 250d96b98a3SKenneth D. Merry procnum == NFSPROC_NOOP || 251d96b98a3SKenneth D. Merry procnum == NFSPROC_NULL) 252d96b98a3SKenneth D. Merry return (1); 253d96b98a3SKenneth D. Merry else 254d96b98a3SKenneth D. Merry return (0); 255d96b98a3SKenneth D. Merry } 256d96b98a3SKenneth D. Merry 2579897e357SRick Macklem static void 258d96b98a3SKenneth D. Merry fhanew_set_locktype(rpcproc_t procnum, struct fha_info *info) 259d96b98a3SKenneth D. Merry { 260d96b98a3SKenneth D. Merry switch (procnum) { 261d96b98a3SKenneth D. Merry case NFSPROC_NULL: 262d96b98a3SKenneth D. Merry case NFSPROC_GETATTR: 263d96b98a3SKenneth D. Merry case NFSPROC_LOOKUP: 264d96b98a3SKenneth D. Merry case NFSPROC_ACCESS: 265d96b98a3SKenneth D. Merry case NFSPROC_READLINK: 266d96b98a3SKenneth D. Merry case NFSPROC_READ: 267d96b98a3SKenneth D. Merry case NFSPROC_READDIR: 268d96b98a3SKenneth D. Merry case NFSPROC_READDIRPLUS: 269d96b98a3SKenneth D. Merry case NFSPROC_WRITE: 270d96b98a3SKenneth D. Merry info->locktype = LK_SHARED; 271d96b98a3SKenneth D. Merry break; 272d96b98a3SKenneth D. Merry case NFSPROC_SETATTR: 273d96b98a3SKenneth D. Merry case NFSPROC_CREATE: 274d96b98a3SKenneth D. Merry case NFSPROC_MKDIR: 275d96b98a3SKenneth D. Merry case NFSPROC_SYMLINK: 276d96b98a3SKenneth D. Merry case NFSPROC_MKNOD: 277d96b98a3SKenneth D. Merry case NFSPROC_REMOVE: 278d96b98a3SKenneth D. Merry case NFSPROC_RMDIR: 279d96b98a3SKenneth D. Merry case NFSPROC_RENAME: 280d96b98a3SKenneth D. Merry case NFSPROC_LINK: 281d96b98a3SKenneth D. Merry case NFSPROC_FSSTAT: 282d96b98a3SKenneth D. Merry case NFSPROC_FSINFO: 283d96b98a3SKenneth D. Merry case NFSPROC_PATHCONF: 284d96b98a3SKenneth D. Merry case NFSPROC_COMMIT: 285d96b98a3SKenneth D. Merry case NFSPROC_NOOP: 286d96b98a3SKenneth D. Merry info->locktype = LK_EXCLUSIVE; 287d96b98a3SKenneth D. Merry break; 288d96b98a3SKenneth D. Merry } 289d96b98a3SKenneth D. Merry } 290d96b98a3SKenneth D. Merry 2919897e357SRick Macklem /* 2929897e357SRick Macklem * This just specifies that offsets should obey affinity when within 2939897e357SRick Macklem * the same 1Mbyte (1<<20) chunk for the file (reads only for now). 2949897e357SRick Macklem */ 2959897e357SRick Macklem static void 2969897e357SRick Macklem fha_extract_info(struct svc_req *req, struct fha_info *i) 297d96b98a3SKenneth D. Merry { 2989897e357SRick Macklem struct mbuf *md; 2999897e357SRick Macklem caddr_t dpos; 3009897e357SRick Macklem static u_int64_t random_fh = 0; 3019897e357SRick Macklem int error; 3029897e357SRick Macklem int v3 = (req->rq_vers == 3); 3039897e357SRick Macklem rpcproc_t procnum; 3049897e357SRick Macklem 3059897e357SRick Macklem /* 3069897e357SRick Macklem * We start off with a random fh. If we get a reasonable 3079897e357SRick Macklem * procnum, we set the fh. If there's a concept of offset 3089897e357SRick Macklem * that we're interested in, we set that. 3099897e357SRick Macklem */ 3109897e357SRick Macklem i->fh = ++random_fh; 3119897e357SRick Macklem i->offset = 0; 3129897e357SRick Macklem i->locktype = LK_EXCLUSIVE; 3139897e357SRick Macklem i->read = i->write = 0; 3149897e357SRick Macklem 3159897e357SRick Macklem /* 3169897e357SRick Macklem * Extract the procnum and convert to v3 form if necessary, 3179897e357SRick Macklem * taking care to deal with out-of-range procnums. Caller will 3189897e357SRick Macklem * ensure that rq_vers is either 2 or 3. 3199897e357SRick Macklem */ 3209897e357SRick Macklem procnum = req->rq_proc; 3219897e357SRick Macklem if (!v3) { 3229897e357SRick Macklem rpcproc_t tmp_procnum; 3239897e357SRick Macklem 3249897e357SRick Macklem tmp_procnum = fhanew_get_procnum(procnum); 3259897e357SRick Macklem if (tmp_procnum == -1) 3269897e357SRick Macklem goto out; 3279897e357SRick Macklem procnum = tmp_procnum; 328d96b98a3SKenneth D. Merry } 329d96b98a3SKenneth D. Merry 3309897e357SRick Macklem /* 3319897e357SRick Macklem * We do affinity for most. However, we divide a realm of affinity 3329897e357SRick Macklem * by file offset so as to allow for concurrent random access. We 3339897e357SRick Macklem * only do this for reads today, but this may change when IFS supports 3349897e357SRick Macklem * efficient concurrent writes. 3359897e357SRick Macklem */ 3369897e357SRick Macklem if (fhanew_no_offset(procnum)) 3379897e357SRick Macklem goto out; 338d96b98a3SKenneth D. Merry 3399897e357SRick Macklem i->read = fhanew_is_read(procnum); 3409897e357SRick Macklem i->write = fhanew_is_write(procnum); 3419897e357SRick Macklem 3429897e357SRick Macklem error = newnfs_realign(&req->rq_args, M_NOWAIT); 3439897e357SRick Macklem if (error) 3449897e357SRick Macklem goto out; 3459897e357SRick Macklem md = req->rq_args; 3469897e357SRick Macklem dpos = mtod(md, caddr_t); 3479897e357SRick Macklem 3489897e357SRick Macklem /* Grab the filehandle. */ 3499897e357SRick Macklem error = fhanew_get_fh(&i->fh, v3, &md, &dpos); 3509897e357SRick Macklem if (error) 3519897e357SRick Macklem goto out; 3529897e357SRick Macklem 3539897e357SRick Macklem /* Content ourselves with zero offset for all but reads. */ 3549897e357SRick Macklem if (i->read || i->write) 3559897e357SRick Macklem fhanew_get_offset(&md, &dpos, v3, i); 3569897e357SRick Macklem 3579897e357SRick Macklem out: 3589897e357SRick Macklem fhanew_set_locktype(procnum, i); 3599897e357SRick Macklem } 3609897e357SRick Macklem 3619897e357SRick Macklem static struct fha_hash_entry * 3629897e357SRick Macklem fha_hash_entry_new(u_int64_t fh) 3639897e357SRick Macklem { 3649897e357SRick Macklem struct fha_hash_entry *e; 3659897e357SRick Macklem 3669897e357SRick Macklem e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK); 3679897e357SRick Macklem e->fh = fh; 3689897e357SRick Macklem e->num_rw = 0; 3699897e357SRick Macklem e->num_exclusive = 0; 3709897e357SRick Macklem e->num_threads = 0; 3719897e357SRick Macklem LIST_INIT(&e->threads); 3729897e357SRick Macklem 3739897e357SRick Macklem return (e); 3749897e357SRick Macklem } 3759897e357SRick Macklem 3769897e357SRick Macklem static void 3779897e357SRick Macklem fha_hash_entry_destroy(struct fha_hash_entry *e) 3789897e357SRick Macklem { 3799897e357SRick Macklem 3809897e357SRick Macklem mtx_assert(e->mtx, MA_OWNED); 3819897e357SRick Macklem KASSERT(e->num_rw == 0, 3829897e357SRick Macklem ("%d reqs on destroyed fhe %p", e->num_rw, e)); 3839897e357SRick Macklem KASSERT(e->num_exclusive == 0, 3849897e357SRick Macklem ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e)); 3859897e357SRick Macklem KASSERT(e->num_threads == 0, 3869897e357SRick Macklem ("%d threads on destroyed fhe %p", e->num_threads, e)); 3879897e357SRick Macklem free(e, M_NFS_FHA); 3889897e357SRick Macklem } 3899897e357SRick Macklem 3909897e357SRick Macklem static void 3919897e357SRick Macklem fha_hash_entry_remove(struct fha_hash_entry *e) 3929897e357SRick Macklem { 3939897e357SRick Macklem 3949897e357SRick Macklem mtx_assert(e->mtx, MA_OWNED); 3959897e357SRick Macklem LIST_REMOVE(e, link); 3969897e357SRick Macklem fha_hash_entry_destroy(e); 3979897e357SRick Macklem } 3989897e357SRick Macklem 3999897e357SRick Macklem static struct fha_hash_entry * 4009897e357SRick Macklem fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh) 4019897e357SRick Macklem { 4029897e357SRick Macklem struct fha_hash_slot *fhs; 4039897e357SRick Macklem struct fha_hash_entry *fhe, *new_fhe; 4049897e357SRick Macklem 4059897e357SRick Macklem fhs = &softc->fha_hash[fh % FHA_HASH_SIZE]; 4069897e357SRick Macklem new_fhe = fha_hash_entry_new(fh); 4079897e357SRick Macklem new_fhe->mtx = &fhs->mtx; 4089897e357SRick Macklem mtx_lock(&fhs->mtx); 4099897e357SRick Macklem LIST_FOREACH(fhe, &fhs->list, link) 4109897e357SRick Macklem if (fhe->fh == fh) 4119897e357SRick Macklem break; 4129897e357SRick Macklem if (!fhe) { 4139897e357SRick Macklem fhe = new_fhe; 4149897e357SRick Macklem LIST_INSERT_HEAD(&fhs->list, fhe, link); 4159897e357SRick Macklem } else 4169897e357SRick Macklem fha_hash_entry_destroy(new_fhe); 4179897e357SRick Macklem return (fhe); 4189897e357SRick Macklem } 4199897e357SRick Macklem 4209897e357SRick Macklem static void 4219897e357SRick Macklem fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread) 4229897e357SRick Macklem { 4239897e357SRick Macklem 4249897e357SRick Macklem mtx_assert(fhe->mtx, MA_OWNED); 4259897e357SRick Macklem thread->st_p2 = 0; 4269897e357SRick Macklem LIST_INSERT_HEAD(&fhe->threads, thread, st_alink); 4279897e357SRick Macklem fhe->num_threads++; 4289897e357SRick Macklem } 4299897e357SRick Macklem 4309897e357SRick Macklem static void 4319897e357SRick Macklem fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread) 4329897e357SRick Macklem { 4339897e357SRick Macklem 4349897e357SRick Macklem mtx_assert(fhe->mtx, MA_OWNED); 4359897e357SRick Macklem KASSERT(thread->st_p2 == 0, 4369897e357SRick Macklem ("%d reqs on removed thread %p", thread->st_p2, thread)); 4379897e357SRick Macklem LIST_REMOVE(thread, st_alink); 4389897e357SRick Macklem fhe->num_threads--; 4399897e357SRick Macklem } 4409897e357SRick Macklem 4419897e357SRick Macklem /* 4429897e357SRick Macklem * Account for an ongoing operation associated with this file. 4439897e357SRick Macklem */ 4449897e357SRick Macklem static void 4459897e357SRick Macklem fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count) 4469897e357SRick Macklem { 4479897e357SRick Macklem 4489897e357SRick Macklem mtx_assert(fhe->mtx, MA_OWNED); 4499897e357SRick Macklem if (LK_EXCLUSIVE == locktype) 4509897e357SRick Macklem fhe->num_exclusive += count; 4519897e357SRick Macklem else 4529897e357SRick Macklem fhe->num_rw += count; 4539897e357SRick Macklem } 4549897e357SRick Macklem 4559897e357SRick Macklem /* 4569897e357SRick Macklem * Get the service thread currently associated with the fhe that is 4579897e357SRick Macklem * appropriate to handle this operation. 4589897e357SRick Macklem */ 4599897e357SRick Macklem static SVCTHREAD * 4609897e357SRick Macklem fha_hash_entry_choose_thread(struct fha_params *softc, 4619897e357SRick Macklem struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread) 4629897e357SRick Macklem { 4639897e357SRick Macklem SVCTHREAD *thread, *min_thread = NULL; 4649897e357SRick Macklem int req_count, min_count = 0; 4659897e357SRick Macklem off_t offset1, offset2; 4669897e357SRick Macklem 4679897e357SRick Macklem LIST_FOREACH(thread, &fhe->threads, st_alink) { 4689897e357SRick Macklem req_count = thread->st_p2; 4699897e357SRick Macklem 4709897e357SRick Macklem /* If there are any writes in progress, use the first thread. */ 4719897e357SRick Macklem if (fhe->num_exclusive) { 4729897e357SRick Macklem #if 0 4739897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 4749897e357SRick Macklem "fha: %p(%d)w", thread, req_count); 4759897e357SRick Macklem #endif 4769897e357SRick Macklem return (thread); 4779897e357SRick Macklem } 4789897e357SRick Macklem 4799897e357SRick Macklem /* Check whether we should consider locality. */ 480*a90b47abSRick Macklem if ((i->read && !NFSD_VNET(nfsfha_ctls).read) || 481*a90b47abSRick Macklem (i->write && !NFSD_VNET(nfsfha_ctls).write)) 4829897e357SRick Macklem goto noloc; 4839897e357SRick Macklem 4849897e357SRick Macklem /* 4859897e357SRick Macklem * Check for locality, making sure that we won't 4869897e357SRick Macklem * exceed our per-thread load limit in the process. 4879897e357SRick Macklem */ 4889897e357SRick Macklem offset1 = i->offset; 4899897e357SRick Macklem offset2 = thread->st_p3; 4909897e357SRick Macklem 4919897e357SRick Macklem if (((offset1 >= offset2) 492*a90b47abSRick Macklem && ((offset1 - offset2) < (1 << NFSD_VNET(nfsfha_ctls).bin_shift))) 4939897e357SRick Macklem || ((offset2 > offset1) 494*a90b47abSRick Macklem && ((offset2 - offset1) < (1 << NFSD_VNET(nfsfha_ctls).bin_shift)))) { 495*a90b47abSRick Macklem if ((NFSD_VNET(nfsfha_ctls).max_reqs_per_nfsd == 0) || 496*a90b47abSRick Macklem (req_count < NFSD_VNET(nfsfha_ctls).max_reqs_per_nfsd)) { 4979897e357SRick Macklem #if 0 4989897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 4999897e357SRick Macklem "fha: %p(%d)r", thread, req_count); 5009897e357SRick Macklem #endif 5019897e357SRick Macklem return (thread); 5029897e357SRick Macklem } 5039897e357SRick Macklem } 5049897e357SRick Macklem 5059897e357SRick Macklem noloc: 5069897e357SRick Macklem /* 5079897e357SRick Macklem * We don't have a locality match, so skip this thread, 5089897e357SRick Macklem * but keep track of the most attractive thread in case 5099897e357SRick Macklem * we need to come back to it later. 5109897e357SRick Macklem */ 5119897e357SRick Macklem #if 0 5129897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 5139897e357SRick Macklem "fha: %p(%d)s off1 %llu off2 %llu", thread, 5149897e357SRick Macklem req_count, offset1, offset2); 5159897e357SRick Macklem #endif 5169897e357SRick Macklem if ((min_thread == NULL) || (req_count < min_count)) { 5179897e357SRick Macklem min_count = req_count; 5189897e357SRick Macklem min_thread = thread; 5199897e357SRick Macklem } 5209897e357SRick Macklem } 5219897e357SRick Macklem 5229897e357SRick Macklem /* 5239897e357SRick Macklem * We didn't find a good match yet. See if we can add 5249897e357SRick Macklem * a new thread to this file handle entry's thread list. 5259897e357SRick Macklem */ 526*a90b47abSRick Macklem if ((NFSD_VNET(nfsfha_ctls).max_nfsds_per_fh == 0) || 527*a90b47abSRick Macklem (fhe->num_threads < NFSD_VNET(nfsfha_ctls).max_nfsds_per_fh)) { 5289897e357SRick Macklem thread = this_thread; 5299897e357SRick Macklem #if 0 5309897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 5319897e357SRick Macklem "fha: %p(%d)t", thread, thread->st_p2); 5329897e357SRick Macklem #endif 5339897e357SRick Macklem fha_hash_entry_add_thread(fhe, thread); 5349897e357SRick Macklem } else { 5359897e357SRick Macklem /* 5369897e357SRick Macklem * We don't want to use any more threads for this file, so 5379897e357SRick Macklem * go back to the most attractive nfsd we're already using. 5389897e357SRick Macklem */ 5399897e357SRick Macklem thread = min_thread; 5409897e357SRick Macklem } 5419897e357SRick Macklem 5429897e357SRick Macklem return (thread); 5439897e357SRick Macklem } 5449897e357SRick Macklem 5459897e357SRick Macklem /* 5469897e357SRick Macklem * After getting a request, try to assign it to some thread. Usually we 5479897e357SRick Macklem * handle it ourselves. 5489897e357SRick Macklem */ 549d96b98a3SKenneth D. Merry SVCTHREAD * 550d96b98a3SKenneth D. Merry fhanew_assign(SVCTHREAD *this_thread, struct svc_req *req) 551d96b98a3SKenneth D. Merry { 552cd406ac9SRick Macklem struct fha_params *softc; 5539897e357SRick Macklem SVCTHREAD *thread; 5549897e357SRick Macklem struct fha_info i; 5559897e357SRick Macklem struct fha_hash_entry *fhe; 5569897e357SRick Macklem 557cd406ac9SRick Macklem NFSD_CURVNET_SET(NFSD_TD_TO_VNET(curthread)); 558cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc); 5599897e357SRick Macklem /* Check to see whether we're enabled. */ 560*a90b47abSRick Macklem if (NFSD_VNET(nfsfha_ctls).enable == 0) 5619897e357SRick Macklem goto thist; 5629897e357SRick Macklem 5639897e357SRick Macklem /* 5649897e357SRick Macklem * Only do placement if this is an NFS request. 5659897e357SRick Macklem */ 5669897e357SRick Macklem if (req->rq_prog != NFS_PROG) 5679897e357SRick Macklem goto thist; 5689897e357SRick Macklem 5699897e357SRick Macklem if (req->rq_vers != 2 && req->rq_vers != 3) 5709897e357SRick Macklem goto thist; 5719897e357SRick Macklem 5729897e357SRick Macklem fha_extract_info(req, &i); 5739897e357SRick Macklem 5749897e357SRick Macklem /* 5759897e357SRick Macklem * We save the offset associated with this request for later 5769897e357SRick Macklem * nfsd matching. 5779897e357SRick Macklem */ 5789897e357SRick Macklem fhe = fha_hash_entry_lookup(softc, i.fh); 5799897e357SRick Macklem req->rq_p1 = fhe; 5809897e357SRick Macklem req->rq_p2 = i.locktype; 5819897e357SRick Macklem req->rq_p3 = i.offset; 5829897e357SRick Macklem 5839897e357SRick Macklem /* 5849897e357SRick Macklem * Choose a thread, taking into consideration locality, thread load, 5859897e357SRick Macklem * and the number of threads already working on this file. 5869897e357SRick Macklem */ 5879897e357SRick Macklem thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread); 5889897e357SRick Macklem KASSERT(thread, ("fha_assign: NULL thread!")); 5899897e357SRick Macklem fha_hash_entry_add_op(fhe, i.locktype, 1); 5909897e357SRick Macklem thread->st_p2++; 5919897e357SRick Macklem thread->st_p3 = i.offset; 5929897e357SRick Macklem 5939897e357SRick Macklem /* 5949897e357SRick Macklem * Grab the pool lock here to not let chosen thread go away before 5959897e357SRick Macklem * the new request inserted to its queue while we drop fhe lock. 5969897e357SRick Macklem */ 5979897e357SRick Macklem mtx_lock(&thread->st_lock); 5989897e357SRick Macklem mtx_unlock(fhe->mtx); 5999897e357SRick Macklem 600cd406ac9SRick Macklem NFSD_CURVNET_RESTORE(); 6019897e357SRick Macklem return (thread); 6029897e357SRick Macklem thist: 6039897e357SRick Macklem req->rq_p1 = NULL; 604cd406ac9SRick Macklem NFSD_CURVNET_RESTORE(); 6059897e357SRick Macklem mtx_lock(&this_thread->st_lock); 6069897e357SRick Macklem return (this_thread); 6079897e357SRick Macklem } 6089897e357SRick Macklem 6099897e357SRick Macklem /* 6109897e357SRick Macklem * Called when we're done with an operation. The request has already 6119897e357SRick Macklem * been de-queued. 6129897e357SRick Macklem */ 6139897e357SRick Macklem void 6149897e357SRick Macklem fhanew_nd_complete(SVCTHREAD *thread, struct svc_req *req) 6159897e357SRick Macklem { 6169897e357SRick Macklem struct fha_hash_entry *fhe = req->rq_p1; 6179897e357SRick Macklem struct mtx *mtx; 6189897e357SRick Macklem 619cd406ac9SRick Macklem NFSD_CURVNET_SET(NFSD_TD_TO_VNET(curthread)); 6209897e357SRick Macklem /* 6219897e357SRick Macklem * This may be called for reqs that didn't go through 6229897e357SRick Macklem * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS. 6239897e357SRick Macklem */ 624cd406ac9SRick Macklem if (!fhe) { 625cd406ac9SRick Macklem NFSD_CURVNET_RESTORE(); 6269897e357SRick Macklem return; 627cd406ac9SRick Macklem } 6289897e357SRick Macklem 6299897e357SRick Macklem mtx = fhe->mtx; 6309897e357SRick Macklem mtx_lock(mtx); 6319897e357SRick Macklem fha_hash_entry_add_op(fhe, req->rq_p2, -1); 6329897e357SRick Macklem thread->st_p2--; 6339897e357SRick Macklem KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p", 6349897e357SRick Macklem thread->st_p2, thread)); 6359897e357SRick Macklem if (thread->st_p2 == 0) { 6369897e357SRick Macklem fha_hash_entry_remove_thread(fhe, thread); 6379897e357SRick Macklem if (0 == fhe->num_rw + fhe->num_exclusive) 6389897e357SRick Macklem fha_hash_entry_remove(fhe); 6399897e357SRick Macklem } 6409897e357SRick Macklem mtx_unlock(mtx); 641cd406ac9SRick Macklem NFSD_CURVNET_RESTORE(); 6429897e357SRick Macklem } 6439897e357SRick Macklem 6449897e357SRick Macklem static int 6459897e357SRick Macklem fhenew_stats_sysctl(SYSCTL_HANDLER_ARGS) 6469897e357SRick Macklem { 647cd406ac9SRick Macklem struct fha_params *softc; 6489897e357SRick Macklem int error, i; 6499897e357SRick Macklem struct sbuf sb; 6509897e357SRick Macklem struct fha_hash_entry *fhe; 6519897e357SRick Macklem bool_t first, hfirst; 6529897e357SRick Macklem SVCTHREAD *thread; 6539897e357SRick Macklem 6549897e357SRick Macklem sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN); 6559897e357SRick Macklem 656cd406ac9SRick Macklem NFSD_CURVNET_SET(NFSD_TD_TO_VNET(curthread)); 657cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc); 6589897e357SRick Macklem for (i = 0; i < FHA_HASH_SIZE; i++) 6599897e357SRick Macklem if (!LIST_EMPTY(&softc->fha_hash[i].list)) 6609897e357SRick Macklem break; 6619897e357SRick Macklem 6629897e357SRick Macklem if (i == FHA_HASH_SIZE) { 6639897e357SRick Macklem sbuf_printf(&sb, "No file handle entries.\n"); 6649897e357SRick Macklem goto out; 6659897e357SRick Macklem } 6669897e357SRick Macklem 6679897e357SRick Macklem hfirst = TRUE; 6689897e357SRick Macklem for (; i < FHA_HASH_SIZE; i++) { 6699897e357SRick Macklem mtx_lock(&softc->fha_hash[i].mtx); 6709897e357SRick Macklem if (LIST_EMPTY(&softc->fha_hash[i].list)) { 6719897e357SRick Macklem mtx_unlock(&softc->fha_hash[i].mtx); 6729897e357SRick Macklem continue; 6739897e357SRick Macklem } 6749897e357SRick Macklem sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i); 6759897e357SRick Macklem first = TRUE; 6769897e357SRick Macklem LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) { 6779897e357SRick Macklem sbuf_printf(&sb, "%sfhe %p: {\n", first ? " " : ", ", 6789897e357SRick Macklem fhe); 6799897e357SRick Macklem sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh); 6809897e357SRick Macklem sbuf_printf(&sb, " num_rw/exclusive: %d/%d\n", 6819897e357SRick Macklem fhe->num_rw, fhe->num_exclusive); 6829897e357SRick Macklem sbuf_printf(&sb, " num_threads: %d\n", 6839897e357SRick Macklem fhe->num_threads); 6849897e357SRick Macklem 6859897e357SRick Macklem LIST_FOREACH(thread, &fhe->threads, st_alink) { 6869897e357SRick Macklem sbuf_printf(&sb, " thread %p offset %ju " 6879897e357SRick Macklem "reqs %d\n", thread, 6889897e357SRick Macklem thread->st_p3, thread->st_p2); 6899897e357SRick Macklem } 6909897e357SRick Macklem 6919897e357SRick Macklem sbuf_printf(&sb, " }"); 6929897e357SRick Macklem first = FALSE; 6939897e357SRick Macklem } 6949897e357SRick Macklem sbuf_printf(&sb, "\n}"); 6959897e357SRick Macklem mtx_unlock(&softc->fha_hash[i].mtx); 6969897e357SRick Macklem hfirst = FALSE; 6979897e357SRick Macklem } 6989897e357SRick Macklem 6999897e357SRick Macklem out: 700cd406ac9SRick Macklem NFSD_CURVNET_RESTORE(); 7019897e357SRick Macklem sbuf_trim(&sb); 7029897e357SRick Macklem sbuf_finish(&sb); 7039897e357SRick Macklem error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); 7049897e357SRick Macklem sbuf_delete(&sb); 7059897e357SRick Macklem return (error); 706d96b98a3SKenneth D. Merry } 707