// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation.  We need to set PG_fscache on any
 * folios we're going to write back before we unlock them.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct folio *folio;
	pgoff_t start_page = rreq->start / PAGE_SIZE;
	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
	size_t account = 0;
	bool subreq_failed = false;

	XA_STATE(xas, &rreq->mapping->i_pages, start_page);

	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
		}
	}

	/* Walk through the pagecache and the I/O request lists simultaneously.
	 * We may have a mixture of cached and uncached sections and we only
	 * really want to write out the uncached sections.  This is slightly
	 * complicated by the possibility that we might have huge pages with a
	 * mixture inside.
	 */
	subreq = list_first_entry(&rreq->subrequests,
				  struct netfs_io_subrequest, rreq_link);
	subreq_failed = (subreq->error < 0);

	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_page) {
		loff_t pg_end;
		bool pg_failed = false;
		bool folio_started;

		if (xas_retry(&xas, folio))
			continue;

		pg_end = folio_pos(folio) + folio_size(folio) - 1;

		folio_started = false;
		for (;;) {
			loff_t sreq_end;

			if (!subreq) {
				pg_failed = true;
				break;
			}
			if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
				folio_start_fscache(folio);
				folio_started = true;
			}
			pg_failed |= subreq_failed;
			sreq_end = subreq->start + subreq->len - 1;
			if (pg_end < sreq_end)
				break;

			account += subreq->transferred;
			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
				subreq = list_next_entry(subreq, rreq_link);
				subreq_failed = (subreq->error < 0);
			} else {
				subreq = NULL;
				subreq_failed = false;
			}

			if (pg_end == sreq_end)
				break;
		}

		if (!pg_failed) {
			flush_dcache_folio(folio);
			folio_mark_uptodate(folio);
		}

		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
			if (folio_index(folio) == rreq->no_unlock_folio &&
			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
				_debug("no unlock");
			else
				folio_unlock(folio);
		}
	}
	rcu_read_unlock();

	task_io_account_read(account);
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
}

static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
					 loff_t *_start, size_t *_len, loff_t i_size)
{
	struct netfs_cache_resources *cres = &rreq->cache_resources;

	if (cres->ops && cres->ops->expand_readahead)
		cres->ops->expand_readahead(cres, _start, _len, i_size);
}
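
/* Illustrative sketch only (not part of this file): a cache backend hooks in
 * above via netfs_cache_ops::expand_readahead.  A backend that stores data in
 * fixed-size granules might widen the proposed window to granule boundaries
 * roughly like this; "mycache" and the 256KiB granule size are assumed example
 * values, not real constants:
 *
 *	static void mycache_expand_readahead(struct netfs_cache_resources *cres,
 *					     loff_t *_start, size_t *_len,
 *					     loff_t i_size)
 *	{
 *		loff_t granule = 256 * 1024;
 *		loff_t start = round_down(*_start, granule);
 *		loff_t end = round_up(*_start + *_len, granule);
 *
 *		*_start = start;
 *		*_len = end - start;
 *	}
 *
 * Rounding strictly outwards keeps the expanded request containing the
 * original region, as netfs_rreq_expand() below requires.
 */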

static void netfs_rreq_expand(struct netfs_io_request *rreq,
			      struct readahead_control *ractl)
{
	/* Give the cache a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

	/* Give the netfs a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	if (rreq->netfs_ops->expand_readahead)
		rreq->netfs_ops->expand_readahead(rreq);

	/* Expand the request if the cache wants it to start earlier.  Note
	 * that the expansion may get further extended if the VM wishes to
	 * insert THPs and the preferred start and/or end wind up in the middle
	 * of THPs.
	 *
	 * If this is the case, however, the THP size should be an integer
	 * multiple of the cache granule size, so we get a whole number of
	 * granules to deal with.
	 */
	if (rreq->start != readahead_pos(ractl) ||
	    rreq->len != readahead_length(ractl)) {
		readahead_expand(ractl, rreq->start, rreq->len);
		rreq->start = readahead_pos(ractl);
		rreq->len = readahead_length(ractl);

		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
				 netfs_read_trace_expanded);
	}
}

/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 * requests from different sources will get munged together.  If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
	int ret;

	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

	if (readahead_count(ractl) == 0)
		return;

	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
				   readahead_pos(ractl),
				   readahead_length(ractl),
				   NETFS_READAHEAD);
	if (IS_ERR(rreq))
		return;

	if (ctx->ops->begin_cache_operation) {
		ret = ctx->ops->begin_cache_operation(rreq);
		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
			goto cleanup_free;
	}

	netfs_stat(&netfs_n_rh_readahead);
	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
			 netfs_read_trace_readahead);

	netfs_rreq_expand(rreq, ractl);

	/* Drop the refs on the folios here rather than in the cache or
	 * filesystem.  The locks will be dropped in netfs_rreq_unlock_folios().
	 */
	while (readahead_folio(ractl))
		;

	netfs_begin_read(rreq, false);
	return;

cleanup_free:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
	return;
}
EXPORT_SYMBOL(netfs_readahead);
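
/* Illustrative sketch only (not part of this file): a network filesystem
 * normally plugs this helper, together with netfs_read_folio() below,
 * straight into its address_space_operations, e.g. for a hypothetical
 * filesystem "myfs":
 *
 *	const struct address_space_operations myfs_aops = {
 *		.readahead	= netfs_readahead,
 *		.read_folio	= netfs_read_folio,
 *	};
 *
 * assuming the inode's struct netfs_inode context has been initialised (see
 * netfs_inode_init()) before any reads are issued, as the kerneldoc above
 * requires.
 */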

/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
	struct address_space *mapping = folio_file_mapping(folio);
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	int ret;

	_enter("%lx", folio_index(folio));

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READPAGE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto alloc_error;
	}

	if (ctx->ops->begin_cache_operation) {
		ret = ctx->ops->begin_cache_operation(rreq);
		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
			goto discard;
	}

	netfs_stat(&netfs_n_rh_readpage);
	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
	return netfs_begin_read(rreq, true);

discard:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
	folio_unlock(folio);
	return ret;
}
EXPORT_SYMBOL(netfs_read_folio);

/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true.  Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
				  bool always_fill)
{
	struct inode *inode = folio_inode(folio);
	loff_t i_size = i_size_read(inode);
	size_t offset = offset_in_folio(folio, pos);
	size_t plen = folio_size(folio);

	if (unlikely(always_fill)) {
		if (pos - offset + len <= i_size)
			return false; /* Page entirely before EOF */
		zero_user_segment(&folio->page, 0, plen);
		folio_mark_uptodate(folio);
		return true;
	}

	/* Full folio write */
	if (offset == 0 && len >= plen)
		return true;

	/* Page entirely beyond the end of the file */
	if (pos - offset >= i_size)
		goto zero_out;

	/* Write that covers from the start of the folio to EOF or beyond */
	if (offset == 0 && (pos + len) >= i_size)
		goto zero_out;

	return false;
zero_out:
	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
	return true;
}
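
/* Worked example (illustrative numbers only): take a 4KiB folio at file
 * position 0 with i_size = 1000.  A 4096-byte write at pos 0 is a full folio
 * write and skips the read.  A 1200-byte write at pos 0 reaches past EOF, so
 * bytes 1200-4095 are zeroed and the read is skipped.  A 600-byte write at
 * pos 0 stops short of EOF, and a 500-byte write at pos 600 doesn't start at
 * the beginning of the folio, so both of those still require the folio to be
 * read.
 */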

/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.  If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_read, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked.  It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end.  It is permitted to sleep.  It should return 0 if the request
 * should go ahead or it may return an error.  It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
		      struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned int len, struct folio **_folio,
		      void **_fsdata)
{
	struct netfs_io_request *rreq;
	struct folio *folio;
	pgoff_t index = pos >> PAGE_SHIFT;
	int ret;

	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (ctx->ops->check_write_begin) {
		/* Allow the netfs (eg. ceph) to flush conflicts. */
		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
		if (ret < 0) {
			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
			goto error;
		}
		if (!folio)
			goto retry;
	}

	if (folio_test_uptodate(folio))
		goto have_folio;

	/* If the page is beyond the EOF, we want to clear it - unless it's
	 * within the cache granule containing the EOF, in which case we need
	 * to preload the granule.
	 */
	if (!netfs_is_cache_enabled(ctx) &&
	    netfs_skip_folio_read(folio, pos, len, false)) {
		netfs_stat(&netfs_n_rh_write_zskip);
		goto have_folio_no_wait;
	}

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}
	rreq->no_unlock_folio = folio_index(folio);
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

	if (ctx->ops->begin_cache_operation) {
		ret = ctx->ops->begin_cache_operation(rreq);
		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
			goto error_put;
	}

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

	/* Expand the request to meet caching requirements and download
	 * preferences.
	 */
	ractl._nr_pages = folio_nr_pages(folio);
	netfs_rreq_expand(rreq, &ractl);

	/* We hold the folio locks, so we can drop the references */
	folio_get(folio);
	while (readahead_folio(&ractl))
		;

	ret = netfs_begin_read(rreq, true);
	if (ret < 0)
		goto error;

have_folio:
	ret = folio_wait_fscache_killable(folio);
	if (ret < 0)
		goto error;
have_folio_no_wait:
	*_folio = folio;
	_leave(" = 0");
	return 0;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
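
/* Illustrative sketch only (not part of this file): a filesystem's own
 * ->write_begin() aop is typically a thin wrapper around the helper above;
 * "myfs" here is hypothetical:
 *
 *	static int myfs_write_begin(struct file *file,
 *				    struct address_space *mapping,
 *				    loff_t pos, unsigned int len,
 *				    struct page **pagep, void **fsdata)
 *	{
 *		struct folio *folio;
 *		int ret;
 *
 *		ret = netfs_write_begin(netfs_inode(mapping->host), file,
 *					mapping, pos, len, &folio, fsdata);
 *		if (ret < 0)
 *			return ret;
 *		*pagep = &folio->page;
 *		return 0;
 *	}
 *
 * On success the folio comes back locked and, unless the read could be
 * skipped, pre-read, ready for the caller's copy-in and ->write_end().
 */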