1d96b98a3SKenneth D. Merry /*-
2*4d846d26SWarner Losh * SPDX-License-Identifier: BSD-2-Clause
3d63027b6SPedro F. Giffuni *
4d96b98a3SKenneth D. Merry * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
5d96b98a3SKenneth D. Merry * Copyright (c) 2013 Spectra Logic Corporation
6d96b98a3SKenneth D. Merry *
7d96b98a3SKenneth D. Merry * Redistribution and use in source and binary forms, with or without
8d96b98a3SKenneth D. Merry * modification, are permitted provided that the following conditions
9d96b98a3SKenneth D. Merry * are met:
10d96b98a3SKenneth D. Merry * 1. Redistributions of source code must retain the above copyright
11d96b98a3SKenneth D. Merry * notice, this list of conditions and the following disclaimer.
12d96b98a3SKenneth D. Merry * 2. Redistributions in binary form must reproduce the above copyright
13d96b98a3SKenneth D. Merry * notice, this list of conditions and the following disclaimer in the
14d96b98a3SKenneth D. Merry * documentation and/or other materials provided with the distribution.
15d96b98a3SKenneth D. Merry *
16d96b98a3SKenneth D. Merry * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17d96b98a3SKenneth D. Merry * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18d96b98a3SKenneth D. Merry * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19d96b98a3SKenneth D. Merry * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20d96b98a3SKenneth D. Merry * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21d96b98a3SKenneth D. Merry * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22d96b98a3SKenneth D. Merry * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23d96b98a3SKenneth D. Merry * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24d96b98a3SKenneth D. Merry * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25d96b98a3SKenneth D. Merry * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26d96b98a3SKenneth D. Merry * SUCH DAMAGE.
27d96b98a3SKenneth D. Merry */
28d96b98a3SKenneth D. Merry
299897e357SRick Macklem #include <sys/types.h>
309897e357SRick Macklem #include <sys/mbuf.h>
319897e357SRick Macklem #include <sys/sbuf.h>
329897e357SRick Macklem
33d96b98a3SKenneth D. Merry #include <fs/nfs/nfsport.h>
349897e357SRick Macklem #include <fs/nfsserver/nfs_fha_new.h>
35d96b98a3SKenneth D. Merry
36d96b98a3SKenneth D. Merry #include <rpc/rpc.h>
379897e357SRick Macklem
389897e357SRick Macklem static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
39d96b98a3SKenneth D. Merry
40d96b98a3SKenneth D. Merry static void fhanew_init(void *foo);
41d96b98a3SKenneth D. Merry static void fhanew_uninit(void *foo);
429897e357SRick Macklem static rpcproc_t fhanew_get_procnum(rpcproc_t procnum);
439897e357SRick Macklem static int fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md,
449897e357SRick Macklem caddr_t *dpos);
459897e357SRick Macklem static int fhanew_is_read(rpcproc_t procnum);
469897e357SRick Macklem static int fhanew_is_write(rpcproc_t procnum);
479897e357SRick Macklem static int fhanew_get_offset(struct mbuf **md, caddr_t *dpos,
489897e357SRick Macklem int v3, struct fha_info *info);
499897e357SRick Macklem static int fhanew_no_offset(rpcproc_t procnum);
509897e357SRick Macklem static void fhanew_set_locktype(rpcproc_t procnum,
51d96b98a3SKenneth D. Merry struct fha_info *info);
52d96b98a3SKenneth D. Merry static int fhenew_stats_sysctl(SYSCTL_HANDLER_ARGS);
539897e357SRick Macklem static void fha_extract_info(struct svc_req *req,
549897e357SRick Macklem struct fha_info *i);
55d96b98a3SKenneth D. Merry
56cd406ac9SRick Macklem NFSD_VNET_DEFINE_STATIC(struct fha_params *, fhanew_softc);
57a90b47abSRick Macklem NFSD_VNET_DEFINE_STATIC(struct fha_ctls, nfsfha_ctls);
58d96b98a3SKenneth D. Merry
59d96b98a3SKenneth D. Merry SYSCTL_DECL(_vfs_nfsd);
60a90b47abSRick Macklem SYSCTL_NODE(_vfs_nfsd, OID_AUTO, fha, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
61a90b47abSRick Macklem "NFS File Handle Affinity (FHA)");
62a90b47abSRick Macklem
63a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha,
64a90b47abSRick Macklem OID_AUTO, enable, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN,
65a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).enable, 0,
66a90b47abSRick Macklem "Enable NFS File Handle Affinity (FHA)");
67a90b47abSRick Macklem
68a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha,
69a90b47abSRick Macklem OID_AUTO, read, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN,
70a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).read, 0,
71a90b47abSRick Macklem "Enable NFS FHA read locality");
72a90b47abSRick Macklem
73a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha,
74a90b47abSRick Macklem OID_AUTO, write, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN,
75a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).write, 0,
76a90b47abSRick Macklem "Enable NFS FHA write locality");
77a90b47abSRick Macklem
78a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha,
79a90b47abSRick Macklem OID_AUTO, bin_shift, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN,
80a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).bin_shift, 0,
81a90b47abSRick Macklem "Maximum locality distance 2^(bin_shift) bytes");
82a90b47abSRick Macklem
83a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha,
84a90b47abSRick Macklem OID_AUTO, max_nfsds_per_fh, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN,
85a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).max_nfsds_per_fh, 0,
86a90b47abSRick Macklem "Maximum nfsd threads that "
87a90b47abSRick Macklem "should be working on requests for the same file handle");
88a90b47abSRick Macklem
89a90b47abSRick Macklem SYSCTL_UINT(_vfs_nfsd_fha,
90a90b47abSRick Macklem OID_AUTO, max_reqs_per_nfsd, CTLFLAG_NFSD_VNET | CTLFLAG_RWTUN,
91a90b47abSRick Macklem &NFSD_VNET_NAME(nfsfha_ctls).max_reqs_per_nfsd, 0, "Maximum requests that "
92a90b47abSRick Macklem "single nfsd thread should be working on at any time");
93a90b47abSRick Macklem
94a90b47abSRick Macklem SYSCTL_PROC(_vfs_nfsd_fha, OID_AUTO, fhe_stats,
95a90b47abSRick Macklem CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
96a90b47abSRick Macklem fhenew_stats_sysctl, "A", "");
97d96b98a3SKenneth D. Merry
98d96b98a3SKenneth D. Merry extern int newnfs_nfsv3_procid[];
997e44856eSRick Macklem
100ed03776cSRick Macklem VNET_SYSINIT(nfs_fhanew, SI_SUB_VNET_DONE, SI_ORDER_ANY, fhanew_init, NULL);
101ed03776cSRick Macklem VNET_SYSUNINIT(nfs_fhanew, SI_SUB_VNET_DONE, SI_ORDER_ANY, fhanew_uninit, NULL);
102d96b98a3SKenneth D. Merry
103d96b98a3SKenneth D. Merry static void
fhanew_init(void * foo)104d96b98a3SKenneth D. Merry fhanew_init(void *foo)
105d96b98a3SKenneth D. Merry {
106d96b98a3SKenneth D. Merry struct fha_params *softc;
1079897e357SRick Macklem int i;
108d96b98a3SKenneth D. Merry
109cd406ac9SRick Macklem NFSD_VNET(fhanew_softc) = malloc(sizeof(struct fha_params), M_TEMP,
110cd406ac9SRick Macklem M_WAITOK | M_ZERO);
111cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc);
112d96b98a3SKenneth D. Merry
113d96b98a3SKenneth D. Merry snprintf(softc->server_name, sizeof(softc->server_name),
114d96b98a3SKenneth D. Merry FHANEW_SERVER_NAME);
115d96b98a3SKenneth D. Merry
1169897e357SRick Macklem for (i = 0; i < FHA_HASH_SIZE; i++)
1179897e357SRick Macklem mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
1189897e357SRick Macklem
1199897e357SRick Macklem /*
1209897e357SRick Macklem * Set the default tuning parameters.
1219897e357SRick Macklem */
122a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).enable = FHA_DEF_ENABLE;
123a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).read = FHA_DEF_READ;
124a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).write = FHA_DEF_WRITE;
125a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).bin_shift = FHA_DEF_BIN_SHIFT;
126a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
127a90b47abSRick Macklem NFSD_VNET(nfsfha_ctls).max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
1289897e357SRick Macklem
129cd406ac9SRick Macklem }
130d96b98a3SKenneth D. Merry
131d96b98a3SKenneth D. Merry static void
fhanew_uninit(void * foo)132d96b98a3SKenneth D. Merry fhanew_uninit(void *foo)
133d96b98a3SKenneth D. Merry {
134d96b98a3SKenneth D. Merry struct fha_params *softc;
1359897e357SRick Macklem int i;
136d96b98a3SKenneth D. Merry
137cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc);
138d96b98a3SKenneth D. Merry
1399897e357SRick Macklem for (i = 0; i < FHA_HASH_SIZE; i++)
1409897e357SRick Macklem mtx_destroy(&softc->fha_hash[i].mtx);
141cd406ac9SRick Macklem free(softc, M_TEMP);
142d96b98a3SKenneth D. Merry }
143d96b98a3SKenneth D. Merry
1449897e357SRick Macklem static rpcproc_t
fhanew_get_procnum(rpcproc_t procnum)145d96b98a3SKenneth D. Merry fhanew_get_procnum(rpcproc_t procnum)
146d96b98a3SKenneth D. Merry {
147d96b98a3SKenneth D. Merry if (procnum > NFSV2PROC_STATFS)
148d96b98a3SKenneth D. Merry return (-1);
149d96b98a3SKenneth D. Merry
150d96b98a3SKenneth D. Merry return (newnfs_nfsv3_procid[procnum]);
151d96b98a3SKenneth D. Merry }
152d96b98a3SKenneth D. Merry
1539897e357SRick Macklem static int
fhanew_get_fh(uint64_t * fh,int v3,struct mbuf ** md,caddr_t * dpos)15410f8f58dSAlexander Motin fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, caddr_t *dpos)
155d96b98a3SKenneth D. Merry {
156d96b98a3SKenneth D. Merry struct nfsrv_descript lnd, *nd;
157d96b98a3SKenneth D. Merry uint32_t *tl;
15810f8f58dSAlexander Motin uint8_t *buf;
15910f8f58dSAlexander Motin uint64_t t;
16010f8f58dSAlexander Motin int error, len, i;
161d96b98a3SKenneth D. Merry
162d96b98a3SKenneth D. Merry error = 0;
163d96b98a3SKenneth D. Merry len = 0;
164d96b98a3SKenneth D. Merry nd = &lnd;
165d96b98a3SKenneth D. Merry
166d96b98a3SKenneth D. Merry nd->nd_md = *md;
167d96b98a3SKenneth D. Merry nd->nd_dpos = *dpos;
168d96b98a3SKenneth D. Merry
169d96b98a3SKenneth D. Merry if (v3) {
170d96b98a3SKenneth D. Merry NFSM_DISSECT_NONBLOCK(tl, uint32_t *, NFSX_UNSIGNED);
171d96b98a3SKenneth D. Merry if ((len = fxdr_unsigned(int, *tl)) <= 0 || len > NFSX_FHMAX) {
172d96b98a3SKenneth D. Merry error = EBADRPC;
173d96b98a3SKenneth D. Merry goto nfsmout;
174d96b98a3SKenneth D. Merry }
175d96b98a3SKenneth D. Merry } else {
176d96b98a3SKenneth D. Merry len = NFSX_V2FH;
177d96b98a3SKenneth D. Merry }
178d96b98a3SKenneth D. Merry
17910f8f58dSAlexander Motin t = 0;
180d96b98a3SKenneth D. Merry if (len != 0) {
18110f8f58dSAlexander Motin NFSM_DISSECT_NONBLOCK(buf, uint8_t *, len);
18210f8f58dSAlexander Motin for (i = 0; i < len; i++)
18310f8f58dSAlexander Motin t ^= ((uint64_t)buf[i] << (i & 7) * 8);
18410f8f58dSAlexander Motin }
18510f8f58dSAlexander Motin *fh = t;
186d96b98a3SKenneth D. Merry
187d96b98a3SKenneth D. Merry nfsmout:
188d96b98a3SKenneth D. Merry *md = nd->nd_md;
189d96b98a3SKenneth D. Merry *dpos = nd->nd_dpos;
190d96b98a3SKenneth D. Merry
191d96b98a3SKenneth D. Merry return (error);
192d96b98a3SKenneth D. Merry }
193d96b98a3SKenneth D. Merry
1949897e357SRick Macklem static int
fhanew_is_read(rpcproc_t procnum)195d96b98a3SKenneth D. Merry fhanew_is_read(rpcproc_t procnum)
196d96b98a3SKenneth D. Merry {
197d96b98a3SKenneth D. Merry if (procnum == NFSPROC_READ)
198d96b98a3SKenneth D. Merry return (1);
199d96b98a3SKenneth D. Merry else
200d96b98a3SKenneth D. Merry return (0);
201d96b98a3SKenneth D. Merry }
202d96b98a3SKenneth D. Merry
2039897e357SRick Macklem static int
fhanew_is_write(rpcproc_t procnum)204d96b98a3SKenneth D. Merry fhanew_is_write(rpcproc_t procnum)
205d96b98a3SKenneth D. Merry {
206d96b98a3SKenneth D. Merry if (procnum == NFSPROC_WRITE)
207d96b98a3SKenneth D. Merry return (1);
208d96b98a3SKenneth D. Merry else
209d96b98a3SKenneth D. Merry return (0);
210d96b98a3SKenneth D. Merry }
211d96b98a3SKenneth D. Merry
2129897e357SRick Macklem static int
fhanew_get_offset(struct mbuf ** md,caddr_t * dpos,int v3,struct fha_info * info)213d96b98a3SKenneth D. Merry fhanew_get_offset(struct mbuf **md, caddr_t *dpos, int v3,
214d96b98a3SKenneth D. Merry struct fha_info *info)
215d96b98a3SKenneth D. Merry {
216d96b98a3SKenneth D. Merry struct nfsrv_descript lnd, *nd;
217d96b98a3SKenneth D. Merry uint32_t *tl;
218d96b98a3SKenneth D. Merry int error;
219d96b98a3SKenneth D. Merry
220d96b98a3SKenneth D. Merry error = 0;
221d96b98a3SKenneth D. Merry
222d96b98a3SKenneth D. Merry nd = &lnd;
223d96b98a3SKenneth D. Merry nd->nd_md = *md;
224d96b98a3SKenneth D. Merry nd->nd_dpos = *dpos;
225d96b98a3SKenneth D. Merry
226d96b98a3SKenneth D. Merry if (v3) {
227d96b98a3SKenneth D. Merry NFSM_DISSECT_NONBLOCK(tl, uint32_t *, 2 * NFSX_UNSIGNED);
228d96b98a3SKenneth D. Merry info->offset = fxdr_hyper(tl);
229d96b98a3SKenneth D. Merry } else {
230d96b98a3SKenneth D. Merry NFSM_DISSECT_NONBLOCK(tl, uint32_t *, NFSX_UNSIGNED);
231d96b98a3SKenneth D. Merry info->offset = fxdr_unsigned(uint32_t, *tl);
232d96b98a3SKenneth D. Merry }
233d96b98a3SKenneth D. Merry
234d96b98a3SKenneth D. Merry nfsmout:
235d96b98a3SKenneth D. Merry *md = nd->nd_md;
236d96b98a3SKenneth D. Merry *dpos = nd->nd_dpos;
237d96b98a3SKenneth D. Merry
238d96b98a3SKenneth D. Merry return (error);
239d96b98a3SKenneth D. Merry }
240d96b98a3SKenneth D. Merry
2419897e357SRick Macklem static int
fhanew_no_offset(rpcproc_t procnum)242d96b98a3SKenneth D. Merry fhanew_no_offset(rpcproc_t procnum)
243d96b98a3SKenneth D. Merry {
244d96b98a3SKenneth D. Merry if (procnum == NFSPROC_FSSTAT ||
245d96b98a3SKenneth D. Merry procnum == NFSPROC_FSINFO ||
246d96b98a3SKenneth D. Merry procnum == NFSPROC_PATHCONF ||
247d96b98a3SKenneth D. Merry procnum == NFSPROC_NOOP ||
248d96b98a3SKenneth D. Merry procnum == NFSPROC_NULL)
249d96b98a3SKenneth D. Merry return (1);
250d96b98a3SKenneth D. Merry else
251d96b98a3SKenneth D. Merry return (0);
252d96b98a3SKenneth D. Merry }
253d96b98a3SKenneth D. Merry
2549897e357SRick Macklem static void
fhanew_set_locktype(rpcproc_t procnum,struct fha_info * info)255d96b98a3SKenneth D. Merry fhanew_set_locktype(rpcproc_t procnum, struct fha_info *info)
256d96b98a3SKenneth D. Merry {
257d96b98a3SKenneth D. Merry switch (procnum) {
258d96b98a3SKenneth D. Merry case NFSPROC_NULL:
259d96b98a3SKenneth D. Merry case NFSPROC_GETATTR:
260d96b98a3SKenneth D. Merry case NFSPROC_LOOKUP:
261d96b98a3SKenneth D. Merry case NFSPROC_ACCESS:
262d96b98a3SKenneth D. Merry case NFSPROC_READLINK:
263d96b98a3SKenneth D. Merry case NFSPROC_READ:
264d96b98a3SKenneth D. Merry case NFSPROC_READDIR:
265d96b98a3SKenneth D. Merry case NFSPROC_READDIRPLUS:
266d96b98a3SKenneth D. Merry case NFSPROC_WRITE:
267d96b98a3SKenneth D. Merry info->locktype = LK_SHARED;
268d96b98a3SKenneth D. Merry break;
269d96b98a3SKenneth D. Merry case NFSPROC_SETATTR:
270d96b98a3SKenneth D. Merry case NFSPROC_CREATE:
271d96b98a3SKenneth D. Merry case NFSPROC_MKDIR:
272d96b98a3SKenneth D. Merry case NFSPROC_SYMLINK:
273d96b98a3SKenneth D. Merry case NFSPROC_MKNOD:
274d96b98a3SKenneth D. Merry case NFSPROC_REMOVE:
275d96b98a3SKenneth D. Merry case NFSPROC_RMDIR:
276d96b98a3SKenneth D. Merry case NFSPROC_RENAME:
277d96b98a3SKenneth D. Merry case NFSPROC_LINK:
278d96b98a3SKenneth D. Merry case NFSPROC_FSSTAT:
279d96b98a3SKenneth D. Merry case NFSPROC_FSINFO:
280d96b98a3SKenneth D. Merry case NFSPROC_PATHCONF:
281d96b98a3SKenneth D. Merry case NFSPROC_COMMIT:
282d96b98a3SKenneth D. Merry case NFSPROC_NOOP:
283d96b98a3SKenneth D. Merry info->locktype = LK_EXCLUSIVE;
284d96b98a3SKenneth D. Merry break;
285d96b98a3SKenneth D. Merry }
286d96b98a3SKenneth D. Merry }
287d96b98a3SKenneth D. Merry
2889897e357SRick Macklem /*
2899897e357SRick Macklem * This just specifies that offsets should obey affinity when within
2909897e357SRick Macklem * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
2919897e357SRick Macklem */
2929897e357SRick Macklem static void
fha_extract_info(struct svc_req * req,struct fha_info * i)2939897e357SRick Macklem fha_extract_info(struct svc_req *req, struct fha_info *i)
294d96b98a3SKenneth D. Merry {
2959897e357SRick Macklem struct mbuf *md;
2969897e357SRick Macklem caddr_t dpos;
2979897e357SRick Macklem static u_int64_t random_fh = 0;
2989897e357SRick Macklem int error;
2999897e357SRick Macklem int v3 = (req->rq_vers == 3);
3009897e357SRick Macklem rpcproc_t procnum;
3019897e357SRick Macklem
3029897e357SRick Macklem /*
3039897e357SRick Macklem * We start off with a random fh. If we get a reasonable
3049897e357SRick Macklem * procnum, we set the fh. If there's a concept of offset
3059897e357SRick Macklem * that we're interested in, we set that.
3069897e357SRick Macklem */
3079897e357SRick Macklem i->fh = ++random_fh;
3089897e357SRick Macklem i->offset = 0;
3099897e357SRick Macklem i->locktype = LK_EXCLUSIVE;
3109897e357SRick Macklem i->read = i->write = 0;
3119897e357SRick Macklem
3129897e357SRick Macklem /*
3139897e357SRick Macklem * Extract the procnum and convert to v3 form if necessary,
3149897e357SRick Macklem * taking care to deal with out-of-range procnums. Caller will
3159897e357SRick Macklem * ensure that rq_vers is either 2 or 3.
3169897e357SRick Macklem */
3179897e357SRick Macklem procnum = req->rq_proc;
3189897e357SRick Macklem if (!v3) {
3199897e357SRick Macklem rpcproc_t tmp_procnum;
3209897e357SRick Macklem
3219897e357SRick Macklem tmp_procnum = fhanew_get_procnum(procnum);
3229897e357SRick Macklem if (tmp_procnum == -1)
3239897e357SRick Macklem goto out;
3249897e357SRick Macklem procnum = tmp_procnum;
325d96b98a3SKenneth D. Merry }
326d96b98a3SKenneth D. Merry
3279897e357SRick Macklem /*
3289897e357SRick Macklem * We do affinity for most. However, we divide a realm of affinity
3299897e357SRick Macklem * by file offset so as to allow for concurrent random access. We
3309897e357SRick Macklem * only do this for reads today, but this may change when IFS supports
3319897e357SRick Macklem * efficient concurrent writes.
3329897e357SRick Macklem */
3339897e357SRick Macklem if (fhanew_no_offset(procnum))
3349897e357SRick Macklem goto out;
335d96b98a3SKenneth D. Merry
3369897e357SRick Macklem i->read = fhanew_is_read(procnum);
3379897e357SRick Macklem i->write = fhanew_is_write(procnum);
3389897e357SRick Macklem
3399897e357SRick Macklem error = newnfs_realign(&req->rq_args, M_NOWAIT);
3409897e357SRick Macklem if (error)
3419897e357SRick Macklem goto out;
3429897e357SRick Macklem md = req->rq_args;
3439897e357SRick Macklem dpos = mtod(md, caddr_t);
3449897e357SRick Macklem
3459897e357SRick Macklem /* Grab the filehandle. */
3469897e357SRick Macklem error = fhanew_get_fh(&i->fh, v3, &md, &dpos);
3479897e357SRick Macklem if (error)
3489897e357SRick Macklem goto out;
3499897e357SRick Macklem
3509897e357SRick Macklem /* Content ourselves with zero offset for all but reads. */
3519897e357SRick Macklem if (i->read || i->write)
3529897e357SRick Macklem fhanew_get_offset(&md, &dpos, v3, i);
3539897e357SRick Macklem
3549897e357SRick Macklem out:
3559897e357SRick Macklem fhanew_set_locktype(procnum, i);
3569897e357SRick Macklem }
3579897e357SRick Macklem
3589897e357SRick Macklem static struct fha_hash_entry *
fha_hash_entry_new(u_int64_t fh)3599897e357SRick Macklem fha_hash_entry_new(u_int64_t fh)
3609897e357SRick Macklem {
3619897e357SRick Macklem struct fha_hash_entry *e;
3629897e357SRick Macklem
3639897e357SRick Macklem e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
3649897e357SRick Macklem e->fh = fh;
3659897e357SRick Macklem e->num_rw = 0;
3669897e357SRick Macklem e->num_exclusive = 0;
3679897e357SRick Macklem e->num_threads = 0;
3689897e357SRick Macklem LIST_INIT(&e->threads);
3699897e357SRick Macklem
3709897e357SRick Macklem return (e);
3719897e357SRick Macklem }
3729897e357SRick Macklem
3739897e357SRick Macklem static void
fha_hash_entry_destroy(struct fha_hash_entry * e)3749897e357SRick Macklem fha_hash_entry_destroy(struct fha_hash_entry *e)
3759897e357SRick Macklem {
3769897e357SRick Macklem
3779897e357SRick Macklem mtx_assert(e->mtx, MA_OWNED);
3789897e357SRick Macklem KASSERT(e->num_rw == 0,
3799897e357SRick Macklem ("%d reqs on destroyed fhe %p", e->num_rw, e));
3809897e357SRick Macklem KASSERT(e->num_exclusive == 0,
3819897e357SRick Macklem ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
3829897e357SRick Macklem KASSERT(e->num_threads == 0,
3839897e357SRick Macklem ("%d threads on destroyed fhe %p", e->num_threads, e));
3849897e357SRick Macklem free(e, M_NFS_FHA);
3859897e357SRick Macklem }
3869897e357SRick Macklem
3879897e357SRick Macklem static void
fha_hash_entry_remove(struct fha_hash_entry * e)3889897e357SRick Macklem fha_hash_entry_remove(struct fha_hash_entry *e)
3899897e357SRick Macklem {
3909897e357SRick Macklem
3919897e357SRick Macklem mtx_assert(e->mtx, MA_OWNED);
3929897e357SRick Macklem LIST_REMOVE(e, link);
3939897e357SRick Macklem fha_hash_entry_destroy(e);
3949897e357SRick Macklem }
3959897e357SRick Macklem
3969897e357SRick Macklem static struct fha_hash_entry *
fha_hash_entry_lookup(struct fha_params * softc,u_int64_t fh)3979897e357SRick Macklem fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
3989897e357SRick Macklem {
3999897e357SRick Macklem struct fha_hash_slot *fhs;
4009897e357SRick Macklem struct fha_hash_entry *fhe, *new_fhe;
4019897e357SRick Macklem
4029897e357SRick Macklem fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
4039897e357SRick Macklem new_fhe = fha_hash_entry_new(fh);
4049897e357SRick Macklem new_fhe->mtx = &fhs->mtx;
4059897e357SRick Macklem mtx_lock(&fhs->mtx);
4069897e357SRick Macklem LIST_FOREACH(fhe, &fhs->list, link)
4079897e357SRick Macklem if (fhe->fh == fh)
4089897e357SRick Macklem break;
4099897e357SRick Macklem if (!fhe) {
4109897e357SRick Macklem fhe = new_fhe;
4119897e357SRick Macklem LIST_INSERT_HEAD(&fhs->list, fhe, link);
4129897e357SRick Macklem } else
4139897e357SRick Macklem fha_hash_entry_destroy(new_fhe);
4149897e357SRick Macklem return (fhe);
4159897e357SRick Macklem }
4169897e357SRick Macklem
4179897e357SRick Macklem static void
fha_hash_entry_add_thread(struct fha_hash_entry * fhe,SVCTHREAD * thread)4189897e357SRick Macklem fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
4199897e357SRick Macklem {
4209897e357SRick Macklem
4219897e357SRick Macklem mtx_assert(fhe->mtx, MA_OWNED);
4229897e357SRick Macklem thread->st_p2 = 0;
4239897e357SRick Macklem LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
4249897e357SRick Macklem fhe->num_threads++;
4259897e357SRick Macklem }
4269897e357SRick Macklem
4279897e357SRick Macklem static void
fha_hash_entry_remove_thread(struct fha_hash_entry * fhe,SVCTHREAD * thread)4289897e357SRick Macklem fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
4299897e357SRick Macklem {
4309897e357SRick Macklem
4319897e357SRick Macklem mtx_assert(fhe->mtx, MA_OWNED);
4329897e357SRick Macklem KASSERT(thread->st_p2 == 0,
4339897e357SRick Macklem ("%d reqs on removed thread %p", thread->st_p2, thread));
4349897e357SRick Macklem LIST_REMOVE(thread, st_alink);
4359897e357SRick Macklem fhe->num_threads--;
4369897e357SRick Macklem }
4379897e357SRick Macklem
4389897e357SRick Macklem /*
4399897e357SRick Macklem * Account for an ongoing operation associated with this file.
4409897e357SRick Macklem */
4419897e357SRick Macklem static void
fha_hash_entry_add_op(struct fha_hash_entry * fhe,int locktype,int count)4429897e357SRick Macklem fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
4439897e357SRick Macklem {
4449897e357SRick Macklem
4459897e357SRick Macklem mtx_assert(fhe->mtx, MA_OWNED);
4469897e357SRick Macklem if (LK_EXCLUSIVE == locktype)
4479897e357SRick Macklem fhe->num_exclusive += count;
4489897e357SRick Macklem else
4499897e357SRick Macklem fhe->num_rw += count;
4509897e357SRick Macklem }
4519897e357SRick Macklem
4529897e357SRick Macklem /*
4539897e357SRick Macklem * Get the service thread currently associated with the fhe that is
4549897e357SRick Macklem * appropriate to handle this operation.
4559897e357SRick Macklem */
4569897e357SRick Macklem static SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params * softc,struct fha_hash_entry * fhe,struct fha_info * i,SVCTHREAD * this_thread)4579897e357SRick Macklem fha_hash_entry_choose_thread(struct fha_params *softc,
4589897e357SRick Macklem struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
4599897e357SRick Macklem {
4609897e357SRick Macklem SVCTHREAD *thread, *min_thread = NULL;
4619897e357SRick Macklem int req_count, min_count = 0;
4629897e357SRick Macklem off_t offset1, offset2;
4639897e357SRick Macklem
4649897e357SRick Macklem LIST_FOREACH(thread, &fhe->threads, st_alink) {
4659897e357SRick Macklem req_count = thread->st_p2;
4669897e357SRick Macklem
4679897e357SRick Macklem /* If there are any writes in progress, use the first thread. */
4689897e357SRick Macklem if (fhe->num_exclusive) {
4699897e357SRick Macklem #if 0
4709897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
4719897e357SRick Macklem "fha: %p(%d)w", thread, req_count);
4729897e357SRick Macklem #endif
4739897e357SRick Macklem return (thread);
4749897e357SRick Macklem }
4759897e357SRick Macklem
4769897e357SRick Macklem /* Check whether we should consider locality. */
477a90b47abSRick Macklem if ((i->read && !NFSD_VNET(nfsfha_ctls).read) ||
478a90b47abSRick Macklem (i->write && !NFSD_VNET(nfsfha_ctls).write))
4799897e357SRick Macklem goto noloc;
4809897e357SRick Macklem
4819897e357SRick Macklem /*
4829897e357SRick Macklem * Check for locality, making sure that we won't
4839897e357SRick Macklem * exceed our per-thread load limit in the process.
4849897e357SRick Macklem */
4859897e357SRick Macklem offset1 = i->offset;
4869897e357SRick Macklem offset2 = thread->st_p3;
4879897e357SRick Macklem
4889897e357SRick Macklem if (((offset1 >= offset2)
489a90b47abSRick Macklem && ((offset1 - offset2) < (1 << NFSD_VNET(nfsfha_ctls).bin_shift)))
4909897e357SRick Macklem || ((offset2 > offset1)
491a90b47abSRick Macklem && ((offset2 - offset1) < (1 << NFSD_VNET(nfsfha_ctls).bin_shift)))) {
492a90b47abSRick Macklem if ((NFSD_VNET(nfsfha_ctls).max_reqs_per_nfsd == 0) ||
493a90b47abSRick Macklem (req_count < NFSD_VNET(nfsfha_ctls).max_reqs_per_nfsd)) {
4949897e357SRick Macklem #if 0
4959897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
4969897e357SRick Macklem "fha: %p(%d)r", thread, req_count);
4979897e357SRick Macklem #endif
4989897e357SRick Macklem return (thread);
4999897e357SRick Macklem }
5009897e357SRick Macklem }
5019897e357SRick Macklem
5029897e357SRick Macklem noloc:
5039897e357SRick Macklem /*
5049897e357SRick Macklem * We don't have a locality match, so skip this thread,
5059897e357SRick Macklem * but keep track of the most attractive thread in case
5069897e357SRick Macklem * we need to come back to it later.
5079897e357SRick Macklem */
5089897e357SRick Macklem #if 0
5099897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
5109897e357SRick Macklem "fha: %p(%d)s off1 %llu off2 %llu", thread,
5119897e357SRick Macklem req_count, offset1, offset2);
5129897e357SRick Macklem #endif
5139897e357SRick Macklem if ((min_thread == NULL) || (req_count < min_count)) {
5149897e357SRick Macklem min_count = req_count;
5159897e357SRick Macklem min_thread = thread;
5169897e357SRick Macklem }
5179897e357SRick Macklem }
5189897e357SRick Macklem
5199897e357SRick Macklem /*
5209897e357SRick Macklem * We didn't find a good match yet. See if we can add
5219897e357SRick Macklem * a new thread to this file handle entry's thread list.
5229897e357SRick Macklem */
523a90b47abSRick Macklem if ((NFSD_VNET(nfsfha_ctls).max_nfsds_per_fh == 0) ||
524a90b47abSRick Macklem (fhe->num_threads < NFSD_VNET(nfsfha_ctls).max_nfsds_per_fh)) {
5259897e357SRick Macklem thread = this_thread;
5269897e357SRick Macklem #if 0
5279897e357SRick Macklem ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
5289897e357SRick Macklem "fha: %p(%d)t", thread, thread->st_p2);
5299897e357SRick Macklem #endif
5309897e357SRick Macklem fha_hash_entry_add_thread(fhe, thread);
5319897e357SRick Macklem } else {
5329897e357SRick Macklem /*
5339897e357SRick Macklem * We don't want to use any more threads for this file, so
5349897e357SRick Macklem * go back to the most attractive nfsd we're already using.
5359897e357SRick Macklem */
5369897e357SRick Macklem thread = min_thread;
5379897e357SRick Macklem }
5389897e357SRick Macklem
5399897e357SRick Macklem return (thread);
5409897e357SRick Macklem }
5419897e357SRick Macklem
5429897e357SRick Macklem /*
5439897e357SRick Macklem * After getting a request, try to assign it to some thread. Usually we
5449897e357SRick Macklem * handle it ourselves.
5459897e357SRick Macklem */
546d96b98a3SKenneth D. Merry SVCTHREAD *
fhanew_assign(SVCTHREAD * this_thread,struct svc_req * req)547d96b98a3SKenneth D. Merry fhanew_assign(SVCTHREAD *this_thread, struct svc_req *req)
548d96b98a3SKenneth D. Merry {
549cd406ac9SRick Macklem struct fha_params *softc;
5509897e357SRick Macklem SVCTHREAD *thread;
5519897e357SRick Macklem struct fha_info i;
5529897e357SRick Macklem struct fha_hash_entry *fhe;
5539897e357SRick Macklem
554cd406ac9SRick Macklem NFSD_CURVNET_SET(NFSD_TD_TO_VNET(curthread));
555cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc);
5569897e357SRick Macklem /* Check to see whether we're enabled. */
557a90b47abSRick Macklem if (NFSD_VNET(nfsfha_ctls).enable == 0)
5589897e357SRick Macklem goto thist;
5599897e357SRick Macklem
5609897e357SRick Macklem /*
5619897e357SRick Macklem * Only do placement if this is an NFS request.
5629897e357SRick Macklem */
5639897e357SRick Macklem if (req->rq_prog != NFS_PROG)
5649897e357SRick Macklem goto thist;
5659897e357SRick Macklem
5669897e357SRick Macklem if (req->rq_vers != 2 && req->rq_vers != 3)
5679897e357SRick Macklem goto thist;
5689897e357SRick Macklem
5699897e357SRick Macklem fha_extract_info(req, &i);
5709897e357SRick Macklem
5719897e357SRick Macklem /*
5729897e357SRick Macklem * We save the offset associated with this request for later
5739897e357SRick Macklem * nfsd matching.
5749897e357SRick Macklem */
5759897e357SRick Macklem fhe = fha_hash_entry_lookup(softc, i.fh);
5769897e357SRick Macklem req->rq_p1 = fhe;
5779897e357SRick Macklem req->rq_p2 = i.locktype;
5789897e357SRick Macklem req->rq_p3 = i.offset;
5799897e357SRick Macklem
5809897e357SRick Macklem /*
5819897e357SRick Macklem * Choose a thread, taking into consideration locality, thread load,
5829897e357SRick Macklem * and the number of threads already working on this file.
5839897e357SRick Macklem */
5849897e357SRick Macklem thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
5859897e357SRick Macklem KASSERT(thread, ("fha_assign: NULL thread!"));
5869897e357SRick Macklem fha_hash_entry_add_op(fhe, i.locktype, 1);
5879897e357SRick Macklem thread->st_p2++;
5889897e357SRick Macklem thread->st_p3 = i.offset;
5899897e357SRick Macklem
5909897e357SRick Macklem /*
5919897e357SRick Macklem * Grab the pool lock here to not let chosen thread go away before
5929897e357SRick Macklem * the new request inserted to its queue while we drop fhe lock.
5939897e357SRick Macklem */
5949897e357SRick Macklem mtx_lock(&thread->st_lock);
5959897e357SRick Macklem mtx_unlock(fhe->mtx);
5969897e357SRick Macklem
597cd406ac9SRick Macklem NFSD_CURVNET_RESTORE();
5989897e357SRick Macklem return (thread);
5999897e357SRick Macklem thist:
6009897e357SRick Macklem req->rq_p1 = NULL;
601cd406ac9SRick Macklem NFSD_CURVNET_RESTORE();
6029897e357SRick Macklem mtx_lock(&this_thread->st_lock);
6039897e357SRick Macklem return (this_thread);
6049897e357SRick Macklem }
6059897e357SRick Macklem
6069897e357SRick Macklem /*
6079897e357SRick Macklem * Called when we're done with an operation. The request has already
6089897e357SRick Macklem * been de-queued.
6099897e357SRick Macklem */
6109897e357SRick Macklem void
fhanew_nd_complete(SVCTHREAD * thread,struct svc_req * req)6119897e357SRick Macklem fhanew_nd_complete(SVCTHREAD *thread, struct svc_req *req)
6129897e357SRick Macklem {
6139897e357SRick Macklem struct fha_hash_entry *fhe = req->rq_p1;
6149897e357SRick Macklem struct mtx *mtx;
6159897e357SRick Macklem
616cd406ac9SRick Macklem NFSD_CURVNET_SET(NFSD_TD_TO_VNET(curthread));
6179897e357SRick Macklem /*
6189897e357SRick Macklem * This may be called for reqs that didn't go through
6199897e357SRick Macklem * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
6209897e357SRick Macklem */
621cd406ac9SRick Macklem if (!fhe) {
622cd406ac9SRick Macklem NFSD_CURVNET_RESTORE();
6239897e357SRick Macklem return;
624cd406ac9SRick Macklem }
6259897e357SRick Macklem
6269897e357SRick Macklem mtx = fhe->mtx;
6279897e357SRick Macklem mtx_lock(mtx);
6289897e357SRick Macklem fha_hash_entry_add_op(fhe, req->rq_p2, -1);
6299897e357SRick Macklem thread->st_p2--;
6309897e357SRick Macklem KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
6319897e357SRick Macklem thread->st_p2, thread));
6329897e357SRick Macklem if (thread->st_p2 == 0) {
6339897e357SRick Macklem fha_hash_entry_remove_thread(fhe, thread);
6349897e357SRick Macklem if (0 == fhe->num_rw + fhe->num_exclusive)
6359897e357SRick Macklem fha_hash_entry_remove(fhe);
6369897e357SRick Macklem }
6379897e357SRick Macklem mtx_unlock(mtx);
638cd406ac9SRick Macklem NFSD_CURVNET_RESTORE();
6399897e357SRick Macklem }
6409897e357SRick Macklem
6419897e357SRick Macklem static int
fhenew_stats_sysctl(SYSCTL_HANDLER_ARGS)6429897e357SRick Macklem fhenew_stats_sysctl(SYSCTL_HANDLER_ARGS)
6439897e357SRick Macklem {
644cd406ac9SRick Macklem struct fha_params *softc;
6459897e357SRick Macklem int error, i;
6469897e357SRick Macklem struct sbuf sb;
6479897e357SRick Macklem struct fha_hash_entry *fhe;
6489897e357SRick Macklem bool_t first, hfirst;
6499897e357SRick Macklem SVCTHREAD *thread;
6509897e357SRick Macklem
6519897e357SRick Macklem sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
6529897e357SRick Macklem
653cd406ac9SRick Macklem NFSD_CURVNET_SET(NFSD_TD_TO_VNET(curthread));
654cd406ac9SRick Macklem softc = NFSD_VNET(fhanew_softc);
6559897e357SRick Macklem for (i = 0; i < FHA_HASH_SIZE; i++)
6569897e357SRick Macklem if (!LIST_EMPTY(&softc->fha_hash[i].list))
6579897e357SRick Macklem break;
6589897e357SRick Macklem
6599897e357SRick Macklem if (i == FHA_HASH_SIZE) {
6609897e357SRick Macklem sbuf_printf(&sb, "No file handle entries.\n");
6619897e357SRick Macklem goto out;
6629897e357SRick Macklem }
6639897e357SRick Macklem
6649897e357SRick Macklem hfirst = TRUE;
6659897e357SRick Macklem for (; i < FHA_HASH_SIZE; i++) {
6669897e357SRick Macklem mtx_lock(&softc->fha_hash[i].mtx);
6679897e357SRick Macklem if (LIST_EMPTY(&softc->fha_hash[i].list)) {
6689897e357SRick Macklem mtx_unlock(&softc->fha_hash[i].mtx);
6699897e357SRick Macklem continue;
6709897e357SRick Macklem }
6719897e357SRick Macklem sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
6729897e357SRick Macklem first = TRUE;
6739897e357SRick Macklem LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
6749897e357SRick Macklem sbuf_printf(&sb, "%sfhe %p: {\n", first ? " " : ", ",
6759897e357SRick Macklem fhe);
6769897e357SRick Macklem sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh);
6779897e357SRick Macklem sbuf_printf(&sb, " num_rw/exclusive: %d/%d\n",
6789897e357SRick Macklem fhe->num_rw, fhe->num_exclusive);
6799897e357SRick Macklem sbuf_printf(&sb, " num_threads: %d\n",
6809897e357SRick Macklem fhe->num_threads);
6819897e357SRick Macklem
6829897e357SRick Macklem LIST_FOREACH(thread, &fhe->threads, st_alink) {
6839897e357SRick Macklem sbuf_printf(&sb, " thread %p offset %ju "
6849897e357SRick Macklem "reqs %d\n", thread,
6859897e357SRick Macklem thread->st_p3, thread->st_p2);
6869897e357SRick Macklem }
6879897e357SRick Macklem
6889897e357SRick Macklem sbuf_printf(&sb, " }");
6899897e357SRick Macklem first = FALSE;
6909897e357SRick Macklem }
6919897e357SRick Macklem sbuf_printf(&sb, "\n}");
6929897e357SRick Macklem mtx_unlock(&softc->fha_hash[i].mtx);
6939897e357SRick Macklem hfirst = FALSE;
6949897e357SRick Macklem }
6959897e357SRick Macklem
6969897e357SRick Macklem out:
697cd406ac9SRick Macklem NFSD_CURVNET_RESTORE();
6989897e357SRick Macklem sbuf_trim(&sb);
6999897e357SRick Macklem sbuf_finish(&sb);
7009897e357SRick Macklem error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
7019897e357SRick Macklem sbuf_delete(&sb);
7029897e357SRick Macklem return (error);
703d96b98a3SKenneth D. Merry }
704