// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level buffered write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

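/*
 * Unconditionally attach the given dirty group (if any) to a folio's private
 * data, taking a reference on the group.
 */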
static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	if (netfs_group)
		folio_attach_private(folio, netfs_get_group(netfs_group));
}

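/*
 * Attach the given dirty group to a folio if it doesn't already carry one,
 * replacing a copy-to-cache mark if that's what is there; if no group is
 * given, clear a copy-to-cache mark instead.
 */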
static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	void *priv = folio_get_private(folio);

	if (unlikely(priv != netfs_group)) {
		if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
			folio_attach_private(folio, netfs_get_group(netfs_group));
		else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
			folio_detach_private(folio);
	}
}

/*
 * Grab a folio for writing and lock it. Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
						loff_t pos, size_t part)
{
	pgoff_t index = pos / PAGE_SIZE;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

	if (mapping_large_folio_support(mapping))
		fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}

/*
 * Update i_size and estimate the update to i_blocks to reflect the additional
 * data written into the pagecache until we can find out from the server what
 * the values actually are.
 */
void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
			 loff_t pos, size_t copied)
{
	loff_t i_size, end = pos + copied;
	blkcnt_t add;
	size_t gap;

	if (end <= i_size_read(inode))
		return;

	if (ctx->ops->update_i_size) {
		ctx->ops->update_i_size(inode, end);
		return;
	}

	spin_lock(&inode->i_lock);

	i_size = i_size_read(inode);
	if (end > i_size) {
		i_size_write(inode, end);
#if IS_ENABLED(CONFIG_FSCACHE)
		fscache_update_cookie(ctx->cache, NULL, &end);
#endif

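		/* Estimate the change in i_blocks: only bytes beyond the
		 * sector containing the old EOF can add new sectors, capped
		 * at the number of sectors needed to cover the new size.
		 */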
		gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
		if (copied > gap) {
			add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);

			inode->i_blocks = min_t(blkcnt_t,
						DIV_ROUND_UP(end, SECTOR_SIZE),
						inode->i_blocks + add);
		}
	}
	spin_unlock(&inode->i_lock);
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * Copy data into pagecache folios attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty folios are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified. Dirty folios may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.for_sync = true,
		.nr_to_write = LONG_MAX,
		.range_start = iocb->ki_pos,
		.range_end = iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct folio *folio = NULL, *writethrough = NULL;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
	ssize_t written = 0, ret, ret2;
	loff_t pos = iocb->ki_pos;
	size_t max_chunk = mapping_max_folio_size(mapping);
	bool maybe_trouble = false;

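	/* For O_SYNC/O_DSYNC writes, switch to writethrough mode: flush
	 * anything already dirty in the target range, then stream the new
	 * data to the server as it is copied into the pagecache.
	 */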
	if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))) {
		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
		if (ret < 0) {
			wbc_detach_inode(&wbc);
			goto out;
		}

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
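		/* For an async kiocb, the writethrough request will complete
		 * the iocb itself when it finishes.
		 */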
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		netfs_stat(&netfs_n_wh_writethrough);
	} else {
		netfs_stat(&netfs_n_wh_buffered_write);
	}

	do {
		struct netfs_folio *finfo;
		struct netfs_group *group;
		unsigned long long fpos;
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */

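		/* Estimate how much of the write could go into a single
		 * maximally-sized folio at this position; this is refined
		 * below once we see the folio we actually get.
		 */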
		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

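		/* Recompute the offset and length against the folio we were
		 * actually given, which may be smaller than we asked for.
		 */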
		flen = folio_size(folio);
		fpos = folio_pos(folio);
		offset = pos - fpos;
		part = min_t(size_t, flen - offset, part);

		/* Wait for writeback to complete. The writeback engine owns
		 * the info in folio->private and may change it until it
		 * removes the WB mark.
		 */
		if (folio_get_private(folio) &&
		    folio_wait_writeback_killable(folio)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* Decide how we should modify a folio. We might be attempting
		 * to do write-streaming, in which case we don't want to do a
		 * local RMW cycle if we can avoid it. If we're doing local
		 * caching or content crypto, we award that priority over
		 * avoiding RMW. If the file is open readably, then we also
		 * assume that we may want to read what we wrote.
		 */
		finfo = netfs_folio_info(folio);
		group = netfs_folio_group(folio);

		if (unlikely(group != netfs_group) &&
		    group != NETFS_FOLIO_COPY_TO_CACHE)
			goto flush_content;

		if (folio_test_uptodate(folio)) {
			if (mapping_writably_mapped(mapping))
				flush_dcache_folio(folio);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			netfs_set_group(folio, netfs_group);
			trace_netfs_folio(folio, netfs_folio_is_uptodate);
			goto copied;
		}

		/* If the page is above the zero-point then we assume that the
		 * server would just return a block of zeros or a short read if
		 * we try to read it.
		 */
		if (fpos >= ctx->zero_point) {
			folio_zero_segment(folio, 0, offset);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			folio_zero_segment(folio, offset + copied, flen);
			__netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			trace_netfs_folio(folio, netfs_modify_and_clear);
			goto copied;
		}

		/* See if we can write a whole folio in one go. */
		if (!maybe_trouble && offset == 0 && part >= flen) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			if (unlikely(copied < part)) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				folio_unlock(folio);
				goto retry;
			}
			__netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			trace_netfs_folio(folio, netfs_whole_folio_modify);
			goto copied;
		}


		/* We don't want to do a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled, and we don't really want to do a streaming write on
		 * a file that's open for reading, as ->read_folio() then has
		 * to be able to flush it.
		 */
		if ((file->f_mode & FMODE_READ) ||
		    netfs_is_cache_enabled(ctx)) {
			if (finfo) {
				netfs_stat(&netfs_n_wh_wstream_conflict);
				goto flush_content;
			}
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			/* Note that copy-to-cache may have been set. */

			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			netfs_set_group(folio, netfs_group);
			trace_netfs_folio(folio, netfs_just_prefetch);
			goto copied;
		}

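		/* Otherwise do a streaming write: copy the data into a folio
		 * that isn't up to date and track the dirty region in a
		 * netfs_folio record attached to folio->private.
		 */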
		if (!finfo) {
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			if (offset == 0 && copied == flen) {
				__netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace_netfs_folio(folio, netfs_streaming_filled_page);
				goto copied;
			}

			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			folio_attach_private(folio, (void *)((unsigned long)finfo |
					     NETFS_FOLIO_INFO));
			trace_netfs_folio(folio, netfs_streaming_write);
			goto copied;
		}

		/* We can continue a streaming write only if it continues on
		 * from the previous. If it overlaps, we must flush lest we
		 * suffer a partial copy and disjoint dirty regions.
		 */
		if (offset == finfo->dirty_offset + finfo->dirty_len) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
			} else {
				trace_netfs_folio(folio, netfs_streaming_write_cont);
			}
			goto copied;
		}


		/* Incompatible write; flush the folio and try again. */
	flush_content:
		trace_netfs_folio(folio, netfs_flush_content);
		folio_unlock(folio);
		folio_put(folio);
		ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
		if (ret < 0)
			goto out; /* The folio has already been unlocked and put */
		continue;

	copied:
		flush_dcache_folio(folio);

		/* Update the inode size if we moved the EOF marker */
		netfs_update_i_size(ctx, inode, pos, copied);
		pos += copied;
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
			folio_unlock(folio);
		} else {
			netfs_advance_writethrough(wreq, &wbc, folio, copied,
						   offset + copied == flen,
						   &writethrough);
			/* Folio unlocked */
		}
	retry:
		folio_put(folio);
		folio = NULL;

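		/* Let the VM throttle us if we're dirtying pagecache too
		 * fast.
		 */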
		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (likely(written)) {
		/* Set indication that ctime and mtime got updated in case
		 * close is deferred.
		 */
		set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
		if (unlikely(ctx->ops->post_modify))
			ctx->ops->post_modify(inode);
	}

	if (unlikely(wreq)) {
		ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
		wbc_detach_inode(&wbc);
		if (ret2 == -EIOCBQUEUED)
			return ret2;
		if (ret == 0 && ret2 < 0)
			ret = ret2;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

copy_failed:
	ret = -EFAULT;
error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already. The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
					 struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	trace_netfs_write_iter(iocb, from);

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_write_iter(iocb, from);

	ret = netfs_start_io_write(inode);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);

/*
 * Notification that a previously read-only page is about to become writable.
 * The caller indicates the precise page that needs to be written to, but
 * we only track group on a per-folio basis, so we block more often than
 * we might otherwise.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
	struct netfs_group *group;
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = file_inode(file);
	struct netfs_inode *ictx = netfs_inode(inode);
	vm_fault_t ret = VM_FAULT_NOPAGE;
	int err;

	_enter("%lx", folio->index);

	sb_start_pagefault(inode->i_sb);

	if (folio_lock_killable(folio) < 0)
		goto out;
	if (folio->mapping != mapping)
		goto unlock;
	if (folio_wait_writeback_killable(folio) < 0)
		goto unlock;

	/* Can we see a streaming write here? */
	if (WARN_ON(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto unlock;
	}

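	/* If the folio is dirty against a different group, write it back
	 * first and get the fault retried.
	 */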
	group = netfs_folio_group(folio);
	if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
		folio_unlock(folio);
		err = filemap_fdatawrite_range(mapping,
					       folio_pos(folio),
					       folio_pos(folio) + folio_size(folio));
		switch (err) {
		case 0:
			ret = VM_FAULT_RETRY;
			goto out;
		case -ENOMEM:
			ret = VM_FAULT_OOM;
			goto out;
		default:
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
	}

	if (folio_test_dirty(folio))
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
	else
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
	netfs_set_group(folio, netfs_group);
	file_update_time(file);
	set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
	if (ictx->ops->post_modify)
		ictx->ops->post_modify(inode);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
unlock:
	folio_unlock(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);