// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/uio.h>
#include "internal.h"

/*
 * Perform the cleanup rituals after an unbuffered write is complete.
 */
static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);

	_enter("R=%x", wreq->debug_id);

	/* Okay, declare that all I/O is complete. */
	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

	if (!wreq->error)
		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);

	if (wreq->origin == NETFS_DIO_WRITE &&
	    wreq->mapping->nrpages) {
		/* mmap may have got underfoot and we may now have folios
		 * locally covering the region we just wrote.  Attempt to
		 * discard the folios, but leave in place any modified locally.
		 * ->write_iter() is prevented from interfering by the DIO
		 * counter.
		 */
		pgoff_t first = wreq->start >> PAGE_SHIFT;
		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;

		invalidate_inode_pages2_range(wreq->mapping, first, last);
	}

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_end(wreq->inode);

	_debug("finished");
	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

	if (wreq->iocb) {
		size_t written = umin(wreq->transferred, wreq->len);

		wreq->iocb->ki_pos += written;
		if (wreq->iocb->ki_complete) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
		}
		wreq->iocb = VFS_PTR_POISON;
	}

	netfs_clear_subrequests(wreq);
}

/*
 * Collect the subrequest results of unbuffered write subrequests.
 */
static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
					   struct netfs_io_stream *stream,
					   struct netfs_io_subrequest *subreq)
{
	trace_netfs_collect_sreq(wreq, subreq);

	spin_lock(&wreq->lock);
	list_del_init(&subreq->rreq_link);
	spin_unlock(&wreq->lock);

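	/* Add in what the subrequest managed to write and advance the source
	 * buffer past it.
	 */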
	wreq->transferred += subreq->transferred;
	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

	stream->collected_to = subreq->start + subreq->transferred;
	wreq->collected_to = stream->collected_to;
	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);

	trace_netfs_collect_stream(wreq, stream);
	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
}

/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.  We dispatch the subrequests serially and
 * wait for each to complete before dispatching the next, lest we leave a gap
 * in the data written due to a failure such as ENOSPC.  We could, however,
 * attempt to do preparation, such as content encryption, for the next subreq
 * whilst the current one is in progress.
 */
static int netfs_unbuffered_write(struct netfs_io_request *wreq)
{
	struct netfs_io_subrequest *subreq = NULL;
	struct netfs_io_stream *stream = &wreq->io_streams[0];
	int ret;

	_enter("%llx", wreq->len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

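	/* Nothing has been collected yet, so collection starts at the
	 * beginning of the requested region.
	 */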
	stream->collected_to = wreq->start;

	for (;;) {
		bool retry = false;

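		/* If we don't have a subrequest left over to retry, prepare a
		 * fresh one covering the as-yet untransferred part of the
		 * request and take it over from the stream.
		 */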
		if (!subreq) {
			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
			subreq = stream->construct;
			stream->construct = NULL;
			stream->front = NULL;
		}

		/* Check if (re-)preparation failed. */
		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
			netfs_write_subrequest_terminated(subreq, subreq->error);
			wreq->error = subreq->error;
			break;
		}

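		/* Don't write more than remains of the request; if nothing
		 * remains, we're done.
		 */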
		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
		if (!iov_iter_count(&subreq->io_iter))
			break;

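		/* Clamp the subrequest to the maximum length and segment
		 * count that the stream will currently accept.
		 */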
		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
					       stream->sreq_max_len,
					       stream->sreq_max_segs);
		iov_iter_truncate(&subreq->io_iter, subreq->len);
		stream->submit_extendable_to = subreq->len;

		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
		stream->issue_write(subreq);

		/* Async, need to wait. */
		netfs_wait_for_in_progress_stream(wreq, stream);

		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
			retry = true;
		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			ret = subreq->error;
			wreq->error = ret;
			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
			subreq = NULL;
			break;
		}
		ret = 0;

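		/* On success, fold the subrequest's results into the request
		 * and go round again for the next chunk, unless we've written
		 * everything or a signal is pending.
		 */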
		if (!retry) {
			netfs_unbuffered_write_collect(wreq, stream, subreq);
			subreq = NULL;
			if (wreq->transferred >= wreq->len)
				break;
			if (!wreq->iocb && signal_pending(current)) {
				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
				break;
			}
			continue;
		}

		/* We need to retry the last subrequest, so first reset the
		 * iterator, taking into account what, if anything, we managed
		 * to transfer.
		 */
		subreq->error = -EAGAIN;
		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
		if (subreq->transferred > 0)
			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
		    wreq->netfs_ops->retry_request)
			wreq->netfs_ops->retry_request(wreq, stream);

		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
		subreq->io_iter = wreq->buffer.iter;
		subreq->start = wreq->start + wreq->transferred;
		subreq->len = wreq->len - wreq->transferred;
		subreq->transferred = 0;
		subreq->retry_count += 1;
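		/* Reset the stream's I/O size limits; ->prepare_write() below
		 * may clamp them again for the reissued subrequest.
		 */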
		stream->sreq_max_len = UINT_MAX;
		stream->sreq_max_segs = INT_MAX;

		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
		stream->prepare_write(subreq);

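		/* Mark the subrequest as being in progress again and go round
		 * the loop to reissue it.
		 */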
		__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
		netfs_stat(&netfs_n_wh_retry_write_subreq);
	}

	netfs_unbuffered_write_done(wreq);
	_leave(" = %d", ret);
	return ret;
}

static void netfs_unbuffered_write_async(struct work_struct *work)
{
	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);

	netfs_unbuffered_write(wreq);
	netfs_put_request(wreq, netfs_rreq_trace_put_complete);
}

/*
 * Perform an unbuffered write where we may have to do an RMW operation on an
 * encrypted file.  This can also be used for direct I/O writes.
 */
ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
					   struct netfs_group *netfs_group)
{
	struct netfs_io_request *wreq;
	unsigned long long start = iocb->ki_pos;
	unsigned long long end = start + iov_iter_count(iter);
	ssize_t ret, n;
	size_t len = iov_iter_count(iter);
	bool async = !is_sync_kiocb(iocb);

	_enter("");

	/* We're going to need a bounce buffer if what we transmit is going to
	 * be different in some way to the source buffer, e.g. because it gets
	 * encrypted/compressed or because it needs expanding to a block size.
	 */
	// TODO

	_debug("uw %llx-%llx", start, end);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
				      iocb->ki_flags & IOCB_DIRECT ?
				      NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
	if (IS_ERR(wreq))
		return PTR_ERR(wreq);

	wreq->io_streams[0].avail = true;
	trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
				 netfs_write_trace_dio_write :
				 netfs_write_trace_unbuffered_write));

	{
		/* If this is an async op and we're not using a bounce buffer,
		 * we have to save the source buffer as the iterator is only
		 * good until we return.  In such a case, extract an iterator
		 * to represent as much of the output buffer as we can manage.
		 * Note that the extraction might not be able to allocate a
		 * sufficiently large bvec array and may shorten the request.
		 */
		if (user_backed_iter(iter)) {
			n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
			if (n < 0) {
				ret = n;
				goto error_put;
			}
			wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
			wreq->direct_bv_count = n;
			wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
		} else {
			/* If this is a kernel-generated async DIO request,
			 * assume that any resources the iterator points to
			 * (eg. a bio_vec array) will persist till the end of
			 * the op.
			 */
			wreq->buffer.iter = *iter;
		}

		wreq->len = iov_iter_count(&wreq->buffer.iter);
	}

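	/* Tell the I/O paths to take the data from the request's iterator
	 * rather than from the pagecache.
	 */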
	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);

	/* Copy the data into the bounce buffer and encrypt it. */
	// TODO

	/* Dispatch the write. */
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);

	if (async) {
		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
		wreq->iocb = iocb;
		queue_work(system_dfl_wq, &wreq->work);
		ret = -EIOCBQUEUED;
	} else {
		ret = netfs_unbuffered_write(wreq);
		if (ret < 0) {
			_debug("begin = %zd", ret);
		} else {
			iocb->ki_pos += wreq->transferred;
			ret = wreq->transferred ?: wreq->error;
		}

		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
	}

	netfs_put_request(wreq, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_failed_request(wreq);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);

/**
 * netfs_unbuffered_write_iter - Unbuffered write to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Do an unbuffered write to a file, writing the data directly to the server
 * and not lodging the data in the pagecache.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;
	loff_t pos = iocb->ki_pos;
	unsigned long long end = pos + iov_iter_count(from) - 1;

	_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	trace_netfs_write_iter(iocb, from);
	netfs_stat(&netfs_n_wh_dio_write);

	ret = netfs_start_io_direct(inode);
	if (ret < 0)
		return ret;
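	/* Validate and possibly trim the write, strip any setuid/setgid bits
	 * and update the file timestamps before issuing any I/O.
	 */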
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(file);
	if (ret < 0)
		goto out;
	ret = file_update_time(file);
	if (ret < 0)
		goto out;
	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* We could block if there are any pages in the range. */
		ret = -EAGAIN;
		if (filemap_range_has_page(mapping, pos, end))
			if (filemap_invalidate_inode(inode, true, pos, end))
				goto out;
	} else {
		ret = filemap_write_and_wait_range(mapping, pos, end);
		if (ret < 0)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached pages from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	ret = filemap_invalidate_inode(inode, true, pos, end);
	if (ret < 0)
		goto out;
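	/* The file will hold data at least as far as the end of this write,
	 * so push out the point beyond which it is assumed to hold no data.
	 */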
	end = iocb->ki_pos + iov_iter_count(from);
	if (end > ictx->zero_point)
		ictx->zero_point = end;

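	/* The write goes straight to the server, so anything the local cache
	 * holds for this file is now stale; invalidate it.
	 */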
	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
			   FSCACHE_INVAL_DIO_WRITE);
	ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
out:
	netfs_end_io_direct(inode);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter);