// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/uio.h>
#include "internal.h"

/*
 * Perform the cleanup rituals after an unbuffered write is complete.
 */
static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);

	_enter("R=%x", wreq->debug_id);

	/* Okay, declare that all I/O is complete. */
	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

	if (!wreq->error)
		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);

	if (wreq->origin == NETFS_DIO_WRITE &&
	    wreq->mapping->nrpages) {
		/* mmap may have got underfoot and we may now have folios
		 * locally covering the region we just wrote.  Attempt to
		 * discard the folios, but leave in place any that were
		 * modified locally.  ->write_iter() is prevented from
		 * interfering by the DIO counter.
		 */
		pgoff_t first = wreq->start >> PAGE_SHIFT;
		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;

		invalidate_inode_pages2_range(wreq->mapping, first, last);
	}

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_end(wreq->inode);

	_debug("finished");
	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

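	/* If there's an iocb, this was an async write: advance the file
	 * position and run the caller's completion handler.
	 */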
	if (wreq->iocb) {
		size_t written = umin(wreq->transferred, wreq->len);

		wreq->iocb->ki_pos += written;
		if (wreq->iocb->ki_complete) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
		}
		wreq->iocb = VFS_PTR_POISON;
	}

	netfs_clear_subrequests(wreq);
}

/*
 * Collect the result of an unbuffered write subrequest.
 */
static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
					   struct netfs_io_stream *stream,
					   struct netfs_io_subrequest *subreq)
{
	trace_netfs_collect_sreq(wreq, subreq);

	spin_lock(&wreq->lock);
	list_del_init(&subreq->rreq_link);
	spin_unlock(&wreq->lock);

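	/* Account for the data transferred and advance the request's source
	 * iterator past what this subrequest consumed.
	 */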
	wreq->transferred += subreq->transferred;
	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

	stream->collected_to = subreq->start + subreq->transferred;
	wreq->collected_to = stream->collected_to;
	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);

	trace_netfs_collect_stream(wreq, stream);
	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
}

/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.  We dispatch the subrequests serially and
 * wait for each to complete before dispatching the next, lest we leave a gap
 * in the data written due to a failure such as ENOSPC.  We could, however,
 * attempt to do preparation such as content encryption for the next subreq
 * whilst the current one is in progress.
 */
static int netfs_unbuffered_write(struct netfs_io_request *wreq)
{
	struct netfs_io_subrequest *subreq = NULL;
	struct netfs_io_stream *stream = &wreq->io_streams[0];
	int ret;

	_enter("%llx", wreq->len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	stream->collected_to = wreq->start;

	for (;;) {
		bool retry = false;

		if (!subreq) {
			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
			subreq = stream->construct;
			stream->construct = NULL;
		}

		/* Check if (re-)preparation failed. */
		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
			netfs_write_subrequest_terminated(subreq, subreq->error);
			wreq->error = subreq->error;
			break;
		}

		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
		if (!iov_iter_count(&subreq->io_iter))
			break;

		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
					       stream->sreq_max_len,
					       stream->sreq_max_segs);
		iov_iter_truncate(&subreq->io_iter, subreq->len);
		stream->submit_extendable_to = subreq->len;

		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
		stream->issue_write(subreq);

		/* Async, need to wait. */
		netfs_wait_for_in_progress_stream(wreq, stream);

		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
			retry = true;
		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			ret = subreq->error;
			wreq->error = ret;
			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
			subreq = NULL;
			break;
		}
		ret = 0;

		if (!retry) {
			netfs_unbuffered_write_collect(wreq, stream, subreq);
			subreq = NULL;
			if (wreq->transferred >= wreq->len)
				break;
			if (!wreq->iocb && signal_pending(current)) {
				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
				break;
			}
			continue;
		}

		/* We need to retry the last subrequest, so first reset the
		 * iterator, taking into account what, if anything, we managed
		 * to transfer.
		 */
		subreq->error = -EAGAIN;
		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
		if (subreq->transferred > 0)
			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
		    wreq->netfs_ops->retry_request)
			wreq->netfs_ops->retry_request(wreq, stream);

		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
		subreq->io_iter		= wreq->buffer.iter;
		subreq->start		= wreq->start + wreq->transferred;
		subreq->len		= wreq->len   - wreq->transferred;
		subreq->transferred	= 0;
		subreq->retry_count	+= 1;
		stream->sreq_max_len	= UINT_MAX;
		stream->sreq_max_segs	= INT_MAX;
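		/* The limits were reset above so that the filesystem's
		 * ->prepare_write(), if any, can impose fresh clamps on the
		 * resubmitted range.
		 */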

		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);

		if (stream->prepare_write) {
			stream->prepare_write(subreq);
			__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
			netfs_stat(&netfs_n_wh_retry_write_subreq);
		} else {
			struct iov_iter source;

			netfs_reset_iter(subreq);
			source = subreq->io_iter;
			netfs_reissue_write(stream, subreq, &source);
		}
	}

	netfs_unbuffered_write_done(wreq);
	_leave(" = %d", ret);
	return ret;
}

static void netfs_unbuffered_write_async(struct work_struct *work)
{
	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);

	netfs_unbuffered_write(wreq);
	netfs_put_request(wreq, netfs_rreq_trace_put_complete);
}

/*
 * Perform an unbuffered write where we may have to do an RMW operation on an
 * encrypted file.  This can also be used for direct I/O writes.
 */
ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
					   struct netfs_group *netfs_group)
{
	struct netfs_io_request *wreq;
	unsigned long long start = iocb->ki_pos;
	unsigned long long end = start + iov_iter_count(iter);
	ssize_t ret, n;
	size_t len = iov_iter_count(iter);
	bool async = !is_sync_kiocb(iocb);

	_enter("");

	/* We're going to need a bounce buffer if what we transmit is going to
	 * be different in some way to the source buffer, e.g. because it gets
	 * encrypted/compressed or because it needs expanding to a block size.
	 */
	// TODO

	_debug("uw %llx-%llx", start, end);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
				      iocb->ki_flags & IOCB_DIRECT ?
				      NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
	if (IS_ERR(wreq))
		return PTR_ERR(wreq);

	wreq->io_streams[0].avail = true;
	trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
				 netfs_write_trace_dio_write :
				 netfs_write_trace_unbuffered_write));

	{
		/* If this is an async op and we're not using a bounce buffer,
		 * we have to save the source buffer as the iterator is only
		 * good until we return.  In such a case, extract an iterator
		 * to represent as much of the output buffer as we can
		 * manage.  Note that the extraction might not be able to
		 * allocate a sufficiently large bvec array and may shorten the
		 * request.
		 */
		if (user_backed_iter(iter)) {
			n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
			if (n < 0) {
				ret = n;
				goto error_put;
			}
			wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
			wreq->direct_bv_count = n;
			wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
		} else {
			/* If this is a kernel-generated async DIO request,
			 * assume that any resources the iterator points to
			 * (eg. a bio_vec array) will persist till the end of
			 * the op.
			 */
			wreq->buffer.iter = *iter;
		}

		wreq->len = iov_iter_count(&wreq->buffer.iter);
	}

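	/* Tell the I/O machinery to work from wreq->buffer.iter rather than
	 * from pagecache folios.
	 */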
	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);

	/* Copy the data into the bounce buffer and encrypt it. */
	// TODO

	/* Dispatch the write. */
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);

	if (async) {
		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
		wreq->iocb = iocb;
		queue_work(system_dfl_wq, &wreq->work);
		ret = -EIOCBQUEUED;
	} else {
		ret = netfs_unbuffered_write(wreq);
		if (ret < 0) {
			_debug("begin = %zd", ret);
		} else {
			iocb->ki_pos += wreq->transferred;
			ret = wreq->transferred ?: wreq->error;
		}

		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
	}

	netfs_put_request(wreq, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_failed_request(wreq);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);

/**
 * netfs_unbuffered_write_iter - Unbuffered write to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Do an unbuffered write to a file, writing the data directly to the server
 * and not lodging the data in the pagecache.
 *
 * Return:
 * * Negative error code if no data has been written at all or if
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;
	loff_t pos = iocb->ki_pos;
	unsigned long long end = pos + iov_iter_count(from) - 1;

	_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	trace_netfs_write_iter(iocb, from);
	netfs_stat(&netfs_n_wh_dio_write);

	ret = netfs_start_io_direct(inode);
	if (ret < 0)
		return ret;
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(file);
	if (ret < 0)
		goto out;
	ret = file_update_time(file);
	if (ret < 0)
		goto out;
	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* We could block if there are any pages in the range. */
		ret = -EAGAIN;
		if (filemap_range_has_page(mapping, pos, end))
			if (filemap_invalidate_inode(inode, true, pos, end))
				goto out;
	} else {
		ret = filemap_write_and_wait_range(mapping, pos, end);
		if (ret < 0)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached pages from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	ret = filemap_invalidate_inode(inode, true, pos, end);
	if (ret < 0)
		goto out;
	end = iocb->ki_pos + iov_iter_count(from);
	if (end > ictx->zero_point)
		ictx->zero_point = end;

	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
			   FSCACHE_INVAL_DIO_WRITE);
	ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
out:
	netfs_end_io_direct(inode);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter);
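
/*
 * Example usage (illustrative sketch only, not part of this file): a network
 * filesystem built on netfslib might route O_DIRECT writes through
 * netfs_unbuffered_write_iter() from its ->write_iter() handler.  The
 * "myfs_*" name below is hypothetical; netfs_file_write_iter() is the
 * netfslib buffered-write entry point.
 */
#if 0
static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	/* Direct I/O bypasses the pagecache entirely. */
	if (iocb->ki_flags & IOCB_DIRECT)
		return netfs_unbuffered_write_iter(iocb, from);

	/* Otherwise take the buffered write path. */
	return netfs_file_write_iter(iocb, from);
}
#endif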