xref: /linux/fs/netfs/direct_write.c (revision 57885276cc16a2e2b76282c808a4e84cbecb3aae)
// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/uio.h>
#include "internal.h"

/*
 * Perform the cleanup rituals after an unbuffered write is complete.
 */
static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);

	_enter("R=%x", wreq->debug_id);

	/* Okay, declare that all I/O is complete. */
	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

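	/* On success, the write may have extended the file, so update the
	 * cached file size to cover the region written.
	 */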
	if (!wreq->error)
		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);

	if (wreq->origin == NETFS_DIO_WRITE &&
	    wreq->mapping->nrpages) {
		/* mmap may have got underfoot and we may now have folios
		 * locally covering the region we just wrote.  Attempt to
		 * discard the folios, but leave in place any that were
		 * modified locally.  ->write_iter() is prevented from
		 * interfering by the DIO counter.
		 */
		pgoff_t first = wreq->start >> PAGE_SHIFT;
		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;

		invalidate_inode_pages2_range(wreq->mapping, first, last);
	}

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_end(wreq->inode);

	_debug("finished");
	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

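	/* If the caller supplied an iocb, this was an async write: advance the
	 * file position by the amount written and run the completion handler.
	 */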
	if (wreq->iocb) {
		size_t written = umin(wreq->transferred, wreq->len);

		wreq->iocb->ki_pos += written;
		if (wreq->iocb->ki_complete) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
		}
		wreq->iocb = VFS_PTR_POISON;
	}

	netfs_clear_subrequests(wreq);
}

/*
 * Collect the result of an unbuffered write subrequest.
 */
static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
					   struct netfs_io_stream *stream,
					   struct netfs_io_subrequest *subreq)
{
	trace_netfs_collect_sreq(wreq, subreq);

	spin_lock(&wreq->lock);
	list_del_init(&subreq->rreq_link);
	spin_unlock(&wreq->lock);

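	/* Fold the subrequest's progress into the overall request and advance
	 * the source iterator past the data that was consumed.
	 */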
	wreq->transferred += subreq->transferred;
	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

	stream->collected_to = subreq->start + subreq->transferred;
	wreq->collected_to = stream->collected_to;
	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);

	trace_netfs_collect_stream(wreq, stream);
	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
}

/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.  We dispatch the subrequests serially and
 * wait for each to complete before dispatching the next, lest we leave a gap
 * in the data written due to a failure such as ENOSPC.  We could, however,
 * attempt to do preparation such as content encryption for the next subreq
 * whilst the current one is in progress.
 */
static int netfs_unbuffered_write(struct netfs_io_request *wreq)
{
	struct netfs_io_subrequest *subreq = NULL;
	struct netfs_io_stream *stream = &wreq->io_streams[0];
	int ret = 0;

	_enter("%llx", wreq->len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	stream->collected_to = wreq->start;

	for (;;) {
		bool retry = false;

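		/* Construct a new subrequest unless we still hold one from a
		 * previous pass that needs to be retried.
		 */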
		if (!subreq) {
			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
			subreq = stream->construct;
			stream->construct = NULL;
			stream->front = NULL;
		}

		/* Check if (re-)preparation failed. */
		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
			netfs_write_subrequest_terminated(subreq, subreq->error);
			wreq->error = subreq->error;
			break;
		}

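		/* Clamp the subrequest to the data still to be written; if
		 * nothing remains, the write is complete.
		 */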
		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
		if (!iov_iter_count(&subreq->io_iter))
			break;

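		/* Apply the length and segment limits that the filesystem set
		 * when the write was prepared.
		 */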
		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
					       stream->sreq_max_len,
					       stream->sreq_max_segs);
		iov_iter_truncate(&subreq->io_iter, subreq->len);
		stream->submit_extendable_to = subreq->len;

		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
		stream->issue_write(subreq);

		/* The write is issued asynchronously, so wait for it to complete. */
		netfs_wait_for_in_progress_stream(wreq, stream);

		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
			retry = true;
		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			ret = subreq->error;
			wreq->error = ret;
			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
			subreq = NULL;
			break;
		}
		ret = 0;

		if (!retry) {
			netfs_unbuffered_write_collect(wreq, stream, subreq);
			subreq = NULL;
			if (wreq->transferred >= wreq->len)
				break;
			if (!wreq->iocb && signal_pending(current)) {
				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
				break;
			}
			continue;
		}

		/* We need to retry the last subrequest, so first reset the
		 * iterator, taking into account what, if anything, we managed
		 * to transfer.
		 */
		subreq->error = -EAGAIN;
		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
		if (subreq->transferred > 0)
			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
		    wreq->netfs_ops->retry_request)
			wreq->netfs_ops->retry_request(wreq, stream);

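		/* Reset the subrequest to cover the unwritten remainder of the
		 * request and hand it back to the filesystem to be reprepared.
		 */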
		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
		subreq->io_iter		= wreq->buffer.iter;
		subreq->start		= wreq->start + wreq->transferred;
		subreq->len		= wreq->len   - wreq->transferred;
		subreq->transferred	= 0;
		subreq->retry_count	+= 1;
		stream->sreq_max_len	= UINT_MAX;
		stream->sreq_max_segs	= INT_MAX;

		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
		stream->prepare_write(subreq);

		__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
		netfs_stat(&netfs_n_wh_retry_write_subreq);
	}

	netfs_unbuffered_write_done(wreq);
	_leave(" = %d", ret);
	return ret;
}

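/*
 * Worker by which an asynchronous unbuffered write is performed off the
 * issuing thread.
 */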
static void netfs_unbuffered_write_async(struct work_struct *work)
{
	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);

	netfs_unbuffered_write(wreq);
	netfs_put_request(wreq, netfs_rreq_trace_put_complete);
}

/*
 * Perform an unbuffered write where we may have to do an RMW operation on an
 * encrypted file.  This can also be used for direct I/O writes.
 */
ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
					   struct netfs_group *netfs_group)
{
	struct netfs_io_request *wreq;
	unsigned long long start = iocb->ki_pos;
	unsigned long long end = start + iov_iter_count(iter);
	ssize_t ret, n;
	size_t len = iov_iter_count(iter);
	bool async = !is_sync_kiocb(iocb);

	_enter("");

	/* We're going to need a bounce buffer if what we transmit is going to
	 * be different in some way to the source buffer, e.g. because it gets
	 * encrypted/compressed or because it needs expanding to a block size.
	 */
	// TODO

	_debug("uw %llx-%llx", start, end);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
				      iocb->ki_flags & IOCB_DIRECT ?
				      NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
	if (IS_ERR(wreq))
		return PTR_ERR(wreq);

	wreq->io_streams[0].avail = true;
	trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
				 netfs_write_trace_dio_write :
				 netfs_write_trace_unbuffered_write));

	{
		/* If this is an async op and we're not using a bounce buffer,
		 * we have to save the source buffer as the iterator is only
		 * good until we return.  In such a case, extract an iterator
		 * to represent as much of the output buffer as we can
		 * manage.  Note that the extraction might not be able to
		 * allocate a sufficiently large bvec array and may shorten the
		 * request.
		 */
		if (user_backed_iter(iter)) {
			n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
			if (n < 0) {
				ret = n;
				goto error_put;
			}
			wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
			wreq->direct_bv_count = n;
			wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
		} else {
			/* If this is a kernel-generated async DIO request,
			 * assume that any resources the iterator points to
			 * (eg. a bio_vec array) will persist till the end of
			 * the op.
			 */
			wreq->buffer.iter = *iter;
		}

		wreq->len = iov_iter_count(&wreq->buffer.iter);
	}

	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);

	/* Copy the data into the bounce buffer and encrypt it. */
	// TODO

	/* Dispatch the write. */
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);

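	/* For an async kiocb, punt the write to a workqueue and return
	 * -EIOCBQUEUED; otherwise perform it synchronously and wait.
	 */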
	if (async) {
		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
		wreq->iocb = iocb;
		queue_work(system_dfl_wq, &wreq->work);
		ret = -EIOCBQUEUED;
	} else {
		ret = netfs_unbuffered_write(wreq);
		if (ret < 0) {
			_debug("begin = %zd", ret);
		} else {
			iocb->ki_pos += wreq->transferred;
			ret = wreq->transferred ?: wreq->error;
		}

		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
	}

	netfs_put_request(wreq, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_failed_request(wreq);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);

/**
 * netfs_unbuffered_write_iter - Unbuffered write to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Do an unbuffered write to a file, writing the data directly to the server
 * and not lodging the data in the pagecache.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;
	loff_t pos = iocb->ki_pos;
	unsigned long long end = pos + iov_iter_count(from) - 1;

	_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	trace_netfs_write_iter(iocb, from);
	netfs_stat(&netfs_n_wh_dio_write);

	ret = netfs_start_io_direct(inode);
	if (ret < 0)
		return ret;
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(file);
	if (ret < 0)
		goto out;
	ret = file_update_time(file);
	if (ret < 0)
		goto out;
	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* We could block if there are any pages in the range. */
		ret = -EAGAIN;
		if (filemap_range_has_page(mapping, pos, end))
			if (filemap_invalidate_inode(inode, true, pos, end))
				goto out;
	} else {
		ret = filemap_write_and_wait_range(mapping, pos, end);
		if (ret < 0)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached pages from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	ret = filemap_invalidate_inode(inode, true, pos, end);
	if (ret < 0)
		goto out;
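	/* The write will place data in the file up to this point, so the zero
	 * point (beyond which the file is assumed to hold no data) must be at
	 * least that far on.
	 */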
	end = iocb->ki_pos + iov_iter_count(from);
	if (end > ictx->zero_point)
		ictx->zero_point = end;

	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
			   FSCACHE_INVAL_DIO_WRITE);
	ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
out:
	netfs_end_io_direct(inode);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter);