xref: /illumos-gate/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c (revision d48be21240dfd051b689384ce2b23479d757f2d8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2018-2021 Tintri by DDN, Inc.  All rights reserved.
14  * Copyright 2022 RackTop Systems, Inc.
15  */
16 
17 /*
18  * Support functions for smb2_ioctl/fsctl codes:
19  * FSCTL_OFFLOAD_READ
20  * FSCTL_OFFLOAD_WRITE
21  * (and related)
22  */
23 
24 #include <smbsrv/smb2_kproto.h>
25 #include <smbsrv/smb_fsops.h>
26 #include <smb/winioctl.h>
27 
28 /*
29  * Summary of how offload data transfer works:
30  *
31  * The client drives a server-side copy.  Outline:
32  * 1: open src_file
33  * 2: create dst_file and set its size
34  * 3: while src_file not all copied {
35  *        offload_read(src_file, &token);
36  *        while token not all copied {
37  *	      offload_write(dst_file, token);
38  *        }
39  *    }
40  *
41  * Each "offload read" request returns a "token" representing some
42  * portion of the source file.  The server decides what kind of
43  * token to use, and how much of the source file it should cover.
44  * The length represented may be less than the client requested.
45  * No data are copied during offload_read (just meta-data).
46  *
47  * Each "offload write" request copies some portion of the data
48  * represented by the "token" into the output file.  The amount
49  * of data copied may be less than the client requested, and the
50  * client keeps sending offload write requests until they have
51  * copied all the data represented by the current token.
52  */
53 
/*
 * Flag bit returned in the offload-read response:
 * [MS-FSA] OFFLOAD_READ_FLAG_ALL_ZERO_BEYOND_CURRENT_RANGE
 */
#define	OFFLOAD_READ_FLAG_ALL_ZERO_BEYOND	0x1

/*
 * Token types, per [MS-FSCC] 2.3.79 STORAGE_OFFLOAD_TOKEN.
 * Values 0xFFFF0002 through 0xFFFFFFFF are reserved.
 *
 * STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA is the well-known token
 * meaning "a source of zeros": an offload write with this token
 * just zeros the destination, and the payload (tok_other) is
 * ignored.
 *
 * STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1 is our vendor-specific token
 * type, whose payload is a struct tok_native1.
 */
#define	STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA	0xFFFF0001
#define	STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1	0x10001

/*
 * Size of a token on the wire, and the room left for the payload
 * after the 8-byte header (type, reserved, length).
 */
#define	TOKEN_TOTAL_SIZE	512
#define	TOKEN_MAX_PAYLOAD	(TOKEN_TOTAL_SIZE - 8)

/*
 * Low-order bits that must be clear in any block-aligned offset
 * or length.  Used for sanity checking client-supplied values.
 */
#define	OFFMASK		((uint64_t)DEV_BSIZE - 1)
75 
76 typedef struct smb_odx_token {
77 	uint32_t	tok_type;	/* big-endian on the wire */
78 	uint16_t	tok_reserved;	/* zero */
79 	uint16_t	tok_len;	/* big-endian on the wire */
80 	union {
81 		uint8_t u_tok_other[TOKEN_MAX_PAYLOAD];
82 		struct tok_native1 {
83 			smb2fid_t	tn1_fid;
84 			uint64_t	tn1_off;
85 			uint64_t	tn1_eof;
86 			uint32_t	tn1_tid;
87 		} u_tok_native1;
88 	} tok_u;
89 } smb_odx_token_t;
90 
/*
 * Working state for one FSCTL_OFFLOAD_WRITE call: the decoded
 * request fields (in_*), the response fields (out_*), and the
 * destination file size sampled before the I/O (wa_eof).
 */
typedef struct odx_write_args odx_write_args_t;

struct odx_write_args {
	/* Decoded from the request */
	uint32_t	in_struct_size;
	uint32_t	in_flags;
	uint64_t	in_dstoff;	/* offset in the destination file */
	uint64_t	in_xlen;	/* requested transfer length */
	uint64_t	in_xoff;	/* offset relative to the token start */

	/* Encoded into the response */
	uint32_t	out_struct_size;
	uint32_t	out_flags;
	uint64_t	out_xlen;	/* length actually transferred */

	/* Destination file size before this write */
	uint64_t	wa_eof;
};
102 
103 static int smb_odx_get_token(mbuf_chain_t *, smb_odx_token_t *);
104 static int smb_odx_get_token_native1(mbuf_chain_t *, struct tok_native1 *);
105 static int smb_odx_put_token(mbuf_chain_t *, smb_odx_token_t *);
106 static int smb_odx_put_token_native1(mbuf_chain_t *, struct tok_native1 *);
107 
108 static uint32_t smb2_fsctl_odx_write_zeros(smb_request_t *, odx_write_args_t *);
109 static uint32_t smb2_fsctl_odx_write_native1(smb_request_t *,
110     odx_write_args_t *, smb_odx_token_t *);
111 
112 
/* Set to zero to turn this feature off (for testing etc.) */
int smb2_odx_enable = 1;

/*
 * Upper bounds on how much one offload_read token covers, and on
 * how much one offload_write call transfers (respectively).
 *
 * An offload read token could represent the whole file, but we
 * make the client come back for a new token after every 256M so
 * we get a chance to look for "holes".  While we're in an
 * un-allocated part of the file we can hand out the special
 * "zero" token, and offload_write can then use the (more
 * efficient) smb_fsop_freesp instead of copying.
 *
 * Offload writes that copy data are limited to 16M per request
 * so the I/O doesn't take so long that the client might time out
 * the request.  Keep: write_max <= read_max
 */
uint32_t smb2_odx_read_max = 256 * 1024 * 1024;	/* 256M */
uint32_t smb2_odx_write_max = 16 * 1024 * 1024;	/* 16M */

/*
 * Size of the bounce buffer used for read/write during the copy
 * in offload write.  We kmem_alloc this, so don't make it HUGE.
 * It only needs to be large enough for the copy to proceed with
 * reasonable efficiency.
 *
 * 1M (currently the largest possible ZFS block size) would be
 * the natural choice, but we stay at kmem_max_cached to avoid
 * contention allocating from kmem_oversize_arena.
 */
uint32_t smb2_odx_buf_size = 128 * 1024;	/* 128k */
146 
147 
148 /*
149  * FSCTL_OFFLOAD_READ
150  * [MS-FSCC] 2.3.77
151  *
152  * Similar (in concept) to FSCTL_SRV_REQUEST_RESUME_KEY
153  *
154  * The returned data is an (opaque to the client) 512-byte "token"
155  * that represents the specified range (offset, length) of the
156  * source file.  The "token" we return here comes back to us in an
157  * FSCTL_OFFLOAD_READ.  We must stash whatever we'll need then in
158  * the token we return here.
159  *
160  * We want server-side copy to be able to copy "holes" efficiently,
161  * but would rather avoid the complexity of encoding a list of all
162  * allocated ranges into our returned token, so this compromise:
163  *
164  * When the current range is entirely within a "hole", we'll return
165  * the special "zeros" token, and the offload write using that token
166  * will use the simple and very efficient smb_fsop_freesp.  In this
167  * scenario, we'll have a copy stride of smb2_odx_read_max (256M).
168  *
169  * When there's any data in the range to copy, we'll return our
170  * "native" token, and the subsequent offload_write will walk the
171  * allocated ranges copying and/or zeroing as needed.  In this
172  * scenario, we'll have a copy stride of smb2_odx_write_max (16M).
173  *
174  * One additional optimization allowed by the protocol is that when
175  * we discover that there's no more data after the current range,
176  * we can set the flag ..._ALL_ZERO_BEYOND which tells that client
177  * they can stop copying here if they like.
178  */
179 uint32_t
180 smb2_fsctl_odx_read(smb_request_t *sr, smb_fsctl_t *fsctl)
181 {
182 	smb_attr_t src_attr;
183 	smb_odx_token_t *tok = NULL;
184 	struct tok_native1 *tn1;
185 	smb_ofile_t *ofile = sr->fid_ofile;
186 	uint64_t src_size, src_rnd_size;
187 	off64_t data, hole;
188 	uint32_t in_struct_size;
189 	uint32_t in_flags;
190 	uint32_t in_ttl;
191 	uint64_t in_file_off;
192 	uint64_t in_copy_len;
193 	uint64_t out_xlen;
194 	uint32_t out_struct_size = TOKEN_TOTAL_SIZE + 16;
195 	uint32_t out_flags = 0;
196 	uint32_t status;
197 	uint32_t tok_type;
198 	int rc;
199 
200 	if (smb2_odx_enable == 0)
201 		return (NT_STATUS_INVALID_DEVICE_REQUEST);
202 
203 	/*
204 	 * Make sure the (src) ofile granted access allows read.
205 	 * [MS-FSA] didn't mention this, so it's not clear where
206 	 * this should happen relative to other checks.  Usually
207 	 * access checks happen early.
208 	 */
209 	status = smb_ofile_access(ofile, ofile->f_cr, FILE_READ_DATA);
210 	if (status != NT_STATUS_SUCCESS)
211 		return (status);
212 
213 	/*
214 	 * Decode FSCTL_OFFLOAD_READ_INPUT struct,
215 	 * and do in/out size checks.
216 	 */
217 	rc = smb_mbc_decodef(
218 	    fsctl->in_mbc, "lll4.qq",
219 	    &in_struct_size,	/* l */
220 	    &in_flags,		/* l */
221 	    &in_ttl,		/* l */
222 	    /* reserved		4. */
223 	    &in_file_off,	/* q */
224 	    &in_copy_len);	/* q */
225 	if (rc != 0)
226 		return (NT_STATUS_BUFFER_TOO_SMALL);
227 	if (fsctl->MaxOutputResp < out_struct_size)
228 		return (NT_STATUS_BUFFER_TOO_SMALL);
229 
230 	/*
231 	 * More arg checking per MS-FSA
232 	 */
233 	if ((in_file_off & OFFMASK) != 0 ||
234 	    (in_copy_len & OFFMASK) != 0)
235 		return (NT_STATUS_INVALID_PARAMETER);
236 	if (in_struct_size != 32)
237 		return (NT_STATUS_INVALID_PARAMETER);
238 	if (in_file_off > INT64_MAX ||
239 	    (in_file_off + in_copy_len) < in_file_off)
240 		return (NT_STATUS_INVALID_PARAMETER);
241 
242 	/*
243 	 * [MS-FSA] (summarizing)
244 	 * If not data stream, or if sparse, encrypted, compressed...
245 	 * return STATUS_OFFLOAD_READ_FILE_NOT_SUPPORTED.
246 	 *
247 	 * We'll ignore most of those except to require:
248 	 * Plain file, not a stream.
249 	 */
250 	if (!smb_node_is_file(ofile->f_node))
251 		return (NT_STATUS_OFFLOAD_READ_FILE_NOT_SUPPORTED);
252 	if (SMB_IS_STREAM(ofile->f_node))
253 		return (NT_STATUS_OFFLOAD_READ_FILE_NOT_SUPPORTED);
254 
255 	/*
256 	 * [MS-FSA] If Open.Stream.IsDeleted ...
257 	 */
258 	if (ofile->f_node->flags & NODE_FLAGS_DELETE_COMMITTED)
259 		return (NT_STATUS_FILE_DELETED);
260 
261 	/*
262 	 * If CopyLength == 0, "return immediately success".
263 	 */
264 	if (in_copy_len == 0) {
265 		out_xlen = 0;
266 		tok_type = STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA;
267 		goto done;
268 	}
269 
270 	/*
271 	 * Check for lock conflicting with the read.
272 	 */
273 	status = smb_lock_range_access(sr, ofile->f_node,
274 	    in_file_off, in_copy_len, B_FALSE);
275 	if (status != 0)
276 		return (status); /* == FILE_LOCK_CONFLICT */
277 
278 	/*
279 	 * Get the file size (rounded to a full block)
280 	 * and check the requested offset.
281 	 */
282 	bzero(&src_attr, sizeof (src_attr));
283 	src_attr.sa_mask = SMB_AT_SIZE;
284 	status = smb2_ofile_getattr(sr, ofile, &src_attr);
285 	if (status != NT_STATUS_SUCCESS)
286 		return (status);
287 	src_size = src_attr.sa_vattr.va_size;
288 	if (in_file_off >= src_size)
289 		return (NT_STATUS_END_OF_FILE);
290 
291 	/*
292 	 * Limit the transfer length based on (rounded) EOF.
293 	 * Clients expect ranges of whole disk blocks.
294 	 * If we get a read in this rounded-up range,
295 	 * we'll supply zeros.
296 	 */
297 	src_rnd_size = (src_size + OFFMASK) & ~OFFMASK;
298 	out_xlen = in_copy_len;
299 	if ((in_file_off + out_xlen) > src_rnd_size)
300 		out_xlen = src_rnd_size - in_file_off;
301 
302 	/*
303 	 * Also, have the client come back for a new token after every
304 	 * smb2_odx_read_max bytes, so we'll have opportunities to
305 	 * recognize "holes" in the source file.
306 	 */
307 	if (out_xlen > smb2_odx_read_max)
308 		out_xlen = smb2_odx_read_max;
309 
310 	/*
311 	 * Ask the filesystem if there are any allocated regions in
312 	 * the requested range, and return either the "zeros" token
313 	 * or our "native" token as appropriate (details above).
314 	 */
315 	data = in_file_off;
316 	tok_type = STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1;
317 	rc = smb_fsop_next_alloc_range(ofile->f_cr, ofile->f_node,
318 	    &data, &hole);
319 	switch (rc) {
320 	case 0:
321 		/* Found some data.  Is it beyond this range? */
322 		if (data >= (in_file_off + out_xlen))
323 			tok_type = STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA;
324 		break;
325 	case ENXIO:
326 		/*
327 		 * No data here to EOF.  Use TOKEN_TYPE_ZERO_DATA,
328 		 * but only if we're not crossing src_size, because
329 		 * type zero cannot preserve unaligned src_size.
330 		 */
331 		if ((in_file_off + out_xlen) <= src_size)
332 			tok_type = STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA;
333 		out_flags |= OFFLOAD_READ_FLAG_ALL_ZERO_BEYOND;
334 		break;
335 	case ENOSYS:	/* FS does not support VOP_IOCTL... */
336 	case ENOTTY:	/* ... or _FIO_SEEK_DATA, _HOLE */
337 		break;
338 	default:
339 		cmn_err(CE_NOTE, "smb_fsop_next_alloc_range: rc=%d", rc);
340 		break;
341 	}
342 
343 done:
344 	/* Already checked MaxOutputResp */
345 	(void) smb_mbc_encodef(
346 	    fsctl->out_mbc, "llq",
347 	    out_struct_size,	/* l */
348 	    out_flags,		/* l */
349 	    out_xlen);		/* q */
350 
351 	/*
352 	 * Build the ODX token to return
353 	 */
354 	tok = smb_srm_zalloc(sr, sizeof (*tok));
355 	tok->tok_type = tok_type;
356 	tok->tok_reserved = 0;
357 	if (tok_type == STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1) {
358 		tok->tok_len = sizeof (*tn1);
359 		tn1 = &tok->tok_u.u_tok_native1;
360 		tn1->tn1_fid.persistent = ofile->f_persistid;
361 		tn1->tn1_fid.temporal = ofile->f_fid;
362 		tn1->tn1_off = in_file_off;
363 		tn1->tn1_eof = src_size;
364 		tn1->tn1_tid = sr->smb_tid;
365 	}
366 
367 	rc = smb_odx_put_token(fsctl->out_mbc, tok);
368 	if (rc != 0)
369 		return (NT_STATUS_BUFFER_TOO_SMALL);
370 
371 	return (NT_STATUS_SUCCESS);
372 }
373 
374 /*
375  * FSCTL_OFFLOAD_WRITE
376  * [MS-FSCC] 2.3.80
377  *
378  * Similar (in concept) to FSCTL_COPYCHUNK_WRITE
379  *
380  * Copies from a source file identified by a "token"
381  * (previously returned by FSCTL_OFFLOAD_READ)
382  * to the file on which the ioctl is issued.
383  */
384 uint32_t
385 smb2_fsctl_odx_write(smb_request_t *sr, smb_fsctl_t *fsctl)
386 {
387 	smb_attr_t dst_attr;
388 	odx_write_args_t args;
389 	smb_odx_token_t *tok = NULL;
390 	smb_ofile_t *ofile = sr->fid_ofile;
391 	uint32_t status = NT_STATUS_INVALID_PARAMETER;
392 	int rc;
393 
394 	bzero(&args, sizeof (args));
395 	args.out_struct_size = 16;
396 
397 	if (smb2_odx_enable == 0)
398 		return (NT_STATUS_INVALID_DEVICE_REQUEST);
399 
400 	/*
401 	 * Make sure the (dst) ofile granted_access allows write.
402 	 * [MS-FSA] didn't mention this, so it's not clear where
403 	 * this should happen relative to other checks.  Usually
404 	 * access checks happen early.
405 	 */
406 	status = smb_ofile_access(ofile, ofile->f_cr, FILE_WRITE_DATA);
407 	if (status != NT_STATUS_SUCCESS)
408 		return (status);
409 
410 	/*
411 	 * Decode FSCTL_OFFLOAD_WRITE_INPUT struct,
412 	 * and do in/out size checks.
413 	 */
414 	rc = smb_mbc_decodef(
415 	    fsctl->in_mbc, "llqqq",
416 	    &args.in_struct_size,	/* l */
417 	    &args.in_flags,		/* l */
418 	    &args.in_dstoff,		/* q */
419 	    &args.in_xlen,		/* q */
420 	    &args.in_xoff);		/* q */
421 	if (rc != 0)
422 		return (NT_STATUS_BUFFER_TOO_SMALL);
423 	tok = smb_srm_zalloc(sr, sizeof (*tok));
424 	rc = smb_odx_get_token(fsctl->in_mbc, tok);
425 	if (rc != 0)
426 		return (NT_STATUS_BUFFER_TOO_SMALL);
427 	if (fsctl->MaxOutputResp < args.out_struct_size)
428 		return (NT_STATUS_BUFFER_TOO_SMALL);
429 
430 	/*
431 	 * More arg checking per MS-FSA
432 	 */
433 	if ((args.in_dstoff & OFFMASK) != 0 ||
434 	    (args.in_xoff & OFFMASK) != 0 ||
435 	    (args.in_xlen & OFFMASK) != 0)
436 		return (NT_STATUS_INVALID_PARAMETER);
437 	if (args.in_struct_size != (TOKEN_TOTAL_SIZE + 32))
438 		return (NT_STATUS_INVALID_PARAMETER);
439 	if (args.in_dstoff > INT64_MAX ||
440 	    (args.in_dstoff + args.in_xlen) < args.in_dstoff)
441 		return (NT_STATUS_INVALID_PARAMETER);
442 
443 	/*
444 	 * If CopyLength == 0, "return immediately success".
445 	 */
446 	if (args.in_xlen == 0) {
447 		status = 0;
448 		goto done;
449 	}
450 
451 	/*
452 	 * [MS-FSA] (summarizing)
453 	 * If not data stream, or if sparse, encrypted, compressed...
454 	 * return STATUS_OFFLOAD_WRITE_FILE_NOT_SUPPORTED.
455 	 *
456 	 * We'll ignore most of those except to require:
457 	 * Plain file, not a stream.
458 	 */
459 	if (!smb_node_is_file(ofile->f_node))
460 		return (NT_STATUS_OFFLOAD_WRITE_FILE_NOT_SUPPORTED);
461 	if (SMB_IS_STREAM(ofile->f_node))
462 		return (NT_STATUS_OFFLOAD_WRITE_FILE_NOT_SUPPORTED);
463 
464 	/*
465 	 * [MS-FSA] If Open.Stream.IsDeleted ...
466 	 */
467 	if (ofile->f_node->flags & NODE_FLAGS_DELETE_COMMITTED)
468 		return (NT_STATUS_FILE_DELETED);
469 
470 	/*
471 	 * Check for lock conflicting with the write.
472 	 */
473 	status = smb_lock_range_access(sr, ofile->f_node,
474 	    args.in_dstoff, args.in_xlen, B_TRUE);
475 	if (status != 0)
476 		return (status); /* == FILE_LOCK_CONFLICT */
477 
478 	/*
479 	 * Need the file size
480 	 */
481 	bzero(&dst_attr, sizeof (dst_attr));
482 	dst_attr.sa_mask = SMB_AT_SIZE;
483 	status = smb2_ofile_getattr(sr, ofile, &dst_attr);
484 	if (status != NT_STATUS_SUCCESS)
485 		return (status);
486 	args.wa_eof = dst_attr.sa_vattr.va_size;
487 
488 	/*
489 	 * Destination offset vs. EOF
490 	 */
491 	if (args.in_dstoff > args.wa_eof)
492 		return (NT_STATUS_END_OF_FILE);
493 
494 	/*
495 	 * Finally, run the I/O
496 	 */
497 	switch (tok->tok_type) {
498 	case STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA:
499 		status = smb2_fsctl_odx_write_zeros(sr, &args);
500 		break;
501 	case STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1:
502 		status = smb2_fsctl_odx_write_native1(sr, &args, tok);
503 		break;
504 	default:
505 		status = NT_STATUS_INVALID_TOKEN;
506 		break;
507 	}
508 
509 done:
510 	/*
511 	 * Checked MaxOutputResp above, so we can ignore errors
512 	 * from mbc_encodef here.
513 	 */
514 	if (status == NT_STATUS_SUCCESS) {
515 		(void) smb_mbc_encodef(
516 		    fsctl->out_mbc, "llq",
517 		    args.out_struct_size,
518 		    args.out_flags,
519 		    args.out_xlen);
520 	}
521 
522 	return (status);
523 }
524 
525 /*
526  * Handle FSCTL_OFFLOAD_WRITE with token type
527  * STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA
528  *
529  * In this handler, the "token" represents a source of zeros,
530  * limited to the range: in_dstoff to (in_dstoff + in_xlen)
531  *
532  * ODX write handlers are allowed to return any transfer amount
533  * less than or equal to the requested size.  We want to limit
534  * the amount of I/O "work" we do per ODX write call.  Here,
535  * we're only doing meta-data operations, so we'll allow up to
536  * up to smb2_odx_read_max (256M) per call.
537  *
538  * The I/O "work" done by this function is to make zeros appear
539  * in the file in the range: in_dstoff, (in_dstoff + in_xlen).
540  * Rather than actually write zeros, we'll use VOP_SPACE to
541  * make "holes" in the file.  If any of the range we're asked
542  * to zero out is beyond the destination EOF, we can simply
543  * extend the file length (zeros will appear).
544  *
545  * The caller has verified block alignement of:
546  * args->in_dstoff, args->in_xoff, args->in_xlen
547  */
548 static uint32_t
549 smb2_fsctl_odx_write_zeros(smb_request_t *sr, odx_write_args_t *args)
550 {
551 	smb_ofile_t *dst_ofile = sr->fid_ofile;
552 	uint64_t xlen;
553 	int rc;
554 
555 	ASSERT(args->in_xlen > 0);
556 	args->out_xlen = 0;
557 
558 	/*
559 	 * Limit the I/O size. (per above)
560 	 */
561 	if (args->in_xlen > smb2_odx_read_max)
562 		args->in_xlen = smb2_odx_read_max;
563 
564 	/*
565 	 * Handle the part below destination EOF.
566 	 * (in_dstoff to wa_eof).
567 	 */
568 	if (args->in_dstoff < args->wa_eof) {
569 		xlen = args->in_xlen;
570 		if ((args->in_dstoff + xlen) > args->wa_eof) {
571 			xlen = args->wa_eof - args->in_dstoff;
572 			ASSERT(xlen < args->in_xlen);
573 		}
574 		rc = smb_fsop_freesp(sr, dst_ofile->f_cr, dst_ofile,
575 		    args->in_dstoff, xlen);
576 		if (rc != 0) {
577 			/* Let client fall-back to normal copy. */
578 			return (NT_STATUS_OFFLOAD_WRITE_FILE_NOT_SUPPORTED);
579 		}
580 	}
581 
582 	/*
583 	 * Now the part after destination EOF, if any.
584 	 * Just set the file size.
585 	 */
586 	if ((args->in_dstoff + args->in_xlen) > args->wa_eof) {
587 		smb_attr_t attr;
588 
589 		bzero(&attr, sizeof (smb_attr_t));
590 		attr.sa_mask = SMB_AT_SIZE;
591 		attr.sa_vattr.va_size = args->in_dstoff + args->in_xlen;
592 
593 		rc = smb_node_setattr(sr, dst_ofile->f_node,
594 		    dst_ofile->f_cr, dst_ofile, &attr);
595 		if (rc != 0) {
596 			return (smb_errno2status(rc));
597 		}
598 	}
599 
600 	args->out_xlen = args->in_xlen;
601 
602 	return (0);
603 }
604 
605 /*
606  * Handle FSCTL_OFFLOAD_WRITE with token type
607  * STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1
608  *
609  * For this handler, the token represents a valid range in the
610  * source file (tn1_off to tn1_eof).  The token contains enough
611  * information for us to find the tree and file handle that the
612  * client has open on the source file for this copy.
613  *
614  * ODX write handlers are allowed to return any transfer amount
615  * less than or equal to the requested size.  We want to limit
616  * the amount of I/O "work" we do per ODX write call.  Here,
617  * we're actually copying from another file, so limit transfers
618  * to smb2_odx_write_max (16M) per call.
619  *
620  * Copying past un-aligned end of source file:
621  *
622  * The MS-FSA spec. is silent about copying when the file length is
623  * not block aligned. Clients normally request copying a range that's
624  * the file size rounded up to a block boundary, and expect that copy
625  * to extend the destination as long as the copy has not crossed the
626  * EOF in the source file.  This means that the last block we copy
627  * will generally be a partial copy, where the first part comes from
628  * the source file, and the remainider is either zeros or truncated.
629  *
630  * Extending the destination file:
631  *
632  * With a whole file copy, we want the destination file length to
633  * match the source file length, even if it's not block aligned.
634  * We could just never extend the destination file, but there are
635  * WPTS tests that prove that ODX write IS supposed to extend the
636  * destination file when appropriate.  This is solved by having
637  * this write handler extend the destination file as long as the
638  * copy has not yet crossed EOF in the source file.  After we've
639  * past the source EOF with copying, we'll zero out the remainder
640  * of the block in which the copy stopped, stopping at either the
641  * end of the block or the end of the destination file, whichever
642  * comes first.  This guarantees that a future read anywhere in
643  * that range will see either data from the source file or zeros.
644  *
645  * Note that no matter which way we stopped copying, we MUST
646  * return a block-aligned transfer size in our response.
647  * The caller has verified block alignement of:
648  * args->in_dstoff, args->in_xoff, args->in_xlen
649  */
650 static uint32_t
651 smb2_fsctl_odx_write_native1(smb_request_t *sr,
652     odx_write_args_t *args, smb_odx_token_t *tok)
653 {
654 	struct tok_native1 *tn1;
655 	smb_ofile_t *dst_ofile = sr->fid_ofile;
656 	smb_ofile_t *src_ofile = NULL;
657 	void *buffer = NULL;
658 	size_t bufsize = smb2_odx_buf_size;
659 	uint64_t src_offset;
660 	uint32_t resid;
661 	uint32_t xlen;
662 	uint32_t status;
663 
664 	ASSERT(args->in_xlen > 0);
665 	args->out_xlen = 0;
666 
667 	/*
668 	 * Limit the I/O size. (per above)
669 	 */
670 	if (args->in_xlen > smb2_odx_write_max)
671 		args->in_xlen = smb2_odx_write_max;
672 
673 	/*
674 	 * Lookup the source ofile using the "token".
675 	 */
676 	tn1 = &tok->tok_u.u_tok_native1;
677 
678 	/*
679 	 * If the source ofile came from another tree, we need to
680 	 * get the other tree and use it for the fid lookup.
681 	 * Do that by temporarily changing sr->tid_tree around
682 	 * the call to smb_ofile_lookup_by_fid().
683 	 */
684 	if (tn1->tn1_tid != sr->smb_tid) {
685 		smb_tree_t *saved_tree;
686 		smb_tree_t *src_tree;
687 
688 		src_tree = smb_session_lookup_tree(sr->session,
689 		    (uint16_t)tn1->tn1_tid);
690 		if (src_tree == NULL) {
691 			status = NT_STATUS_INVALID_TOKEN;
692 			goto out;
693 		}
694 
695 		saved_tree = sr->tid_tree;
696 		sr->tid_tree = src_tree;
697 
698 		src_ofile = smb_ofile_lookup_by_fid(sr,
699 		    (uint16_t)tn1->tn1_fid.temporal);
700 
701 		sr->tid_tree = saved_tree;
702 		smb_tree_release(src_tree);
703 	} else {
704 		src_ofile = smb_ofile_lookup_by_fid(sr,
705 		    (uint16_t)tn1->tn1_fid.temporal);
706 	}
707 
708 	if (src_ofile == NULL ||
709 	    src_ofile->f_persistid != tn1->tn1_fid.persistent) {
710 		status = NT_STATUS_INVALID_TOKEN;
711 		goto out;
712 	}
713 
714 	/*
715 	 * Make sure src_ofile is open on a regular file, and
716 	 * granted access includes READ_DATA.  These were all
717 	 * validated in ODX READ, so if these checks fail it
718 	 * means somebody messed with the token or something.
719 	 */
720 	if (!smb_node_is_file(src_ofile->f_node)) {
721 		status = NT_STATUS_ACCESS_DENIED;
722 		goto out;
723 	}
724 	status = smb_ofile_access(src_ofile, src_ofile->f_cr, FILE_READ_DATA);
725 	if (status != NT_STATUS_SUCCESS)
726 		goto out;
727 
728 	/*
729 	 * Get a buffer used for copying, always smb2_odx_buf_size
730 	 *
731 	 * Rather than sleep for this relatively large allocation,
732 	 * allow the allocation to fail and return an error.
733 	 * The client should then fall back to normal copy.
734 	 */
735 	buffer = kmem_alloc(bufsize, KM_NOSLEEP_LAZY);
736 	if (buffer == NULL) {
737 		status = NT_STATUS_INSUFF_SERVER_RESOURCES;
738 		goto out;
739 	}
740 
741 	/*
742 	 * Note: in_xoff is relative to the beginning of the "token"
743 	 * (a range of the source file tn1_off, tn1_eof).  Make sure
744 	 * in_xoff is within the range represented by this token.
745 	 */
746 	src_offset = tn1->tn1_off + args->in_xoff;
747 	if (src_offset >= tn1->tn1_eof ||
748 	    src_offset < tn1->tn1_off) {
749 		status = NT_STATUS_INVALID_PARAMETER;
750 		goto out;
751 	}
752 
753 	/*
754 	 * Source offset+len vs. source EOF (see top comment)
755 	 */
756 	xlen = (uint32_t)args->in_xlen;
757 	if ((src_offset + xlen) > tn1->tn1_eof) {
758 		/*
759 		 * Copying would pass tn1_eof.  Reduce xlen.
760 		 */
761 		DTRACE_PROBE3(crossed__eof, smb_request_t *, sr,
762 		    odx_write_args_t *, args, smb_odx_token_t *, tok);
763 		xlen = (uint32_t)(tn1->tn1_eof - src_offset);
764 	}
765 
766 	/*
767 	 * Copy src to dst for xlen.  This MAY extend the dest file.
768 	 * Note: xlen may be not block-aligned now.  Handled below.
769 	 */
770 	resid = xlen;
771 	status = smb2_sparse_copy(sr, src_ofile, dst_ofile,
772 	    src_offset, args->in_dstoff, &resid, buffer, bufsize);
773 
774 	/*
775 	 * If the result was a partial copy, round down the reported
776 	 * transfer size to a block boundary. If we moved any data,
777 	 * suppress errors on this call.  If an error was suppressed,
778 	 * it will happen again and be returned on the next call.
779 	 */
780 	if (status != 0 || resid != 0) {
781 		xlen -= resid;
782 		xlen &= ~OFFMASK;
783 		args->out_xlen = xlen;
784 		/* If we moved any data, suppress errors. */
785 		if (xlen > 0)
786 			status = 0;
787 		goto out;
788 	}
789 
790 	/*
791 	 * If the copying covered the whole in_xlen, we're done.
792 	 * The test is >= here just so we can guarantee < below.
793 	 */
794 	if (xlen >= args->in_xlen) {
795 		args->out_xlen = args->in_xlen;
796 		goto out;
797 	}
798 
799 	/*
800 	 * Have: xlen < args->in_xlen
801 	 *
802 	 * Here we know xlen was reduced because the copy
803 	 * crossed the source EOF.  See top comment.
804 	 * Set the rounded-up transfer size now, and
805 	 * deal with the remainder of the last block.
806 	 */
807 	args->out_xlen = (xlen + OFFMASK) & ~OFFMASK;
808 
809 	/*
810 	 * If smb2_sparse_copy passed wa_eof, that means we've
811 	 * extended the file, so the remainder of the last block
812 	 * written is beyond the destination EOF was, so there's
813 	 * no need to zero out the remainder. "We're done".
814 	 */
815 	args->in_dstoff += xlen;
816 	if (args->in_dstoff >= args->wa_eof)
817 		goto out;
818 
819 	/*
820 	 * Have: in_dstoff < wa_eof
821 	 *
822 	 * Zero out the unwritten part of the last block that
823 	 * falls before the destination EOF. (Not extending.)
824 	 * Here, resid is the length of the part we'll zero.
825 	 */
826 	resid = args->out_xlen - xlen;
827 	if ((args->in_dstoff + resid) > args->wa_eof)
828 		resid = args->wa_eof - args->in_dstoff;
829 	if (resid > 0) {
830 		int rc;
831 		/*
832 		 * Zero out in_dstoff to wa_eof.
833 		 */
834 		rc = smb_fsop_freesp(sr, dst_ofile->f_cr, dst_ofile,
835 		    args->in_dstoff, resid);
836 		if (rc != 0) {
837 			status = smb_errno2status(rc);
838 		}
839 	}
840 
841 out:
842 	if (src_ofile != NULL)
843 		smb_ofile_release(src_ofile);
844 
845 	if (buffer != NULL)
846 		kmem_free(buffer, bufsize);
847 
848 	return (status);
849 }
850 
851 /*
852  * Get an smb_odx_token_t from the (input) mbuf chain.
853  * Consumes exactly TOKEN_TOTAL_SIZE bytes.
854  */
855 static int
856 smb_odx_get_token(mbuf_chain_t *mbc, smb_odx_token_t *tok)
857 {
858 	mbuf_chain_t tok_mbc;
859 	int start_pos = mbc->chain_offset;
860 	int rc;
861 
862 	if (MBC_ROOM_FOR(mbc, TOKEN_TOTAL_SIZE) == 0)
863 		return (-1);
864 
865 	/*
866 	 * No big-endian support in smb_mbc_encodef, so swap
867 	 * the big-endian fields: tok_type (32-bits),
868 	 * (reserved is 16-bit zero, so no swap),
869 	 * and tok_len (16-bits)
870 	 */
871 	rc = smb_mbc_decodef(
872 	    mbc, "l..w",
873 	    &tok->tok_type,
874 	    /* tok_reserved */
875 	    &tok->tok_len);
876 	if (rc != 0)
877 		return (rc);
878 	tok->tok_type = BSWAP_32(tok->tok_type);
879 	tok->tok_len = BSWAP_16(tok->tok_len);
880 
881 	if (tok->tok_len > TOKEN_MAX_PAYLOAD)
882 		return (-1);
883 	rc = MBC_SHADOW_CHAIN(&tok_mbc, mbc,
884 	    mbc->chain_offset, tok->tok_len);
885 	if (rc != 0)
886 		return (rc);
887 
888 	switch (tok->tok_type) {
889 	case STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA:
890 		/* no payload */
891 		break;
892 	case STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1:
893 		rc = smb_odx_get_token_native1(&tok_mbc,
894 		    &tok->tok_u.u_tok_native1);
895 		break;
896 	default:
897 		/* caller will error out */
898 		break;
899 	}
900 
901 	if (rc == 0) {
902 		/* Advance past what we shadowed. */
903 		mbc->chain_offset = start_pos + TOKEN_TOTAL_SIZE;
904 	}
905 
906 	return (rc);
907 }
908 
909 static int
910 smb_odx_get_token_native1(mbuf_chain_t *mbc, struct tok_native1 *tn1)
911 {
912 	int rc;
913 
914 	rc = smb_mbc_decodef(
915 	    mbc, "qqqql",
916 	    &tn1->tn1_fid.persistent,
917 	    &tn1->tn1_fid.temporal,
918 	    &tn1->tn1_off,
919 	    &tn1->tn1_eof,
920 	    &tn1->tn1_tid);
921 
922 	return (rc);
923 }
924 
925 /*
926  * Put an smb_odx_token_t into the (output) mbuf chain,
927  * padded to TOKEN_TOTAL_SIZE bytes.
928  */
929 static int
930 smb_odx_put_token(mbuf_chain_t *mbc, smb_odx_token_t *tok)
931 {
932 	int rc, padlen;
933 	int start_pos = mbc->chain_offset;
934 	int end_pos = start_pos + TOKEN_TOTAL_SIZE;
935 
936 	if (tok->tok_len > TOKEN_MAX_PAYLOAD)
937 		return (-1);
938 
939 	/*
940 	 * No big-endian support in smb_mbc_encodef, so swap
941 	 * the big-endian fields: tok_type (32-bits),
942 	 * (reserved is 16-bit zero, so no swap),
943 	 * and tok_len (16-bits)
944 	 */
945 	rc = smb_mbc_encodef(
946 	    mbc, "lww",
947 	    BSWAP_32(tok->tok_type),
948 	    0, /* tok_reserved */
949 	    BSWAP_16(tok->tok_len));
950 	if (rc != 0)
951 		return (rc);
952 
953 	switch (tok->tok_type) {
954 	case STORAGE_OFFLOAD_TOKEN_TYPE_ZERO_DATA:
955 		/* no payload */
956 		break;
957 	case STORAGE_OFFLOAD_TOKEN_TYPE_NATIVE1:
958 		rc = smb_odx_put_token_native1(mbc,
959 		    &tok->tok_u.u_tok_native1);
960 		break;
961 	default:
962 		ASSERT(0);
963 		return (-1);
964 	}
965 
966 	/* Pad out to TOKEN_TOTAL_SIZE bytes. */
967 	if (mbc->chain_offset < end_pos) {
968 		padlen = end_pos - mbc->chain_offset;
969 		(void) smb_mbc_encodef(mbc, "#.", padlen);
970 	}
971 	ASSERT(mbc->chain_offset == end_pos);
972 
973 	return (rc);
974 }
975 
976 static int
977 smb_odx_put_token_native1(mbuf_chain_t *mbc, struct tok_native1 *tn1)
978 {
979 	int rc;
980 
981 	rc = smb_mbc_encodef(
982 	    mbc, "qqqql",
983 	    tn1->tn1_fid.persistent,
984 	    tn1->tn1_fid.temporal,
985 	    tn1->tn1_off,
986 	    tn1->tn1_eof,
987 	    tn1->tn1_tid);
988 
989 	return (rc);
990 }
991