1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* Network filesystem high-level buffered read support. 3 * 4 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. 5 * Written by David Howells (dhowells@redhat.com) 6 */ 7 8 #include <linux/export.h> 9 #include <linux/task_io_accounting_ops.h> 10 #include "internal.h" 11 12 /* 13 * Unlock the folios in a read operation. We need to set PG_fscache on any 14 * folios we're going to write back before we unlock them. 15 */ 16 void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) 17 { 18 struct netfs_io_subrequest *subreq; 19 struct folio *folio; 20 pgoff_t start_page = rreq->start / PAGE_SIZE; 21 pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; 22 size_t account = 0; 23 bool subreq_failed = false; 24 25 XA_STATE(xas, &rreq->mapping->i_pages, start_page); 26 27 if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { 28 __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); 29 list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { 30 __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); 31 } 32 } 33 34 /* Walk through the pagecache and the I/O request lists simultaneously. 35 * We may have a mixture of cached and uncached sections and we only 36 * really want to write out the uncached sections. This is slightly 37 * complicated by the possibility that we might have huge pages with a 38 * mixture inside. 39 */ 40 subreq = list_first_entry(&rreq->subrequests, 41 struct netfs_io_subrequest, rreq_link); 42 subreq_failed = (subreq->error < 0); 43 44 trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); 45 46 rcu_read_lock(); 47 xas_for_each(&xas, folio, last_page) { 48 loff_t pg_end; 49 bool pg_failed = false; 50 51 if (xas_retry(&xas, folio)) 52 continue; 53 54 pg_end = folio_pos(folio) + folio_size(folio) - 1; 55 56 for (;;) { 57 loff_t sreq_end; 58 59 if (!subreq) { 60 pg_failed = true; 61 break; 62 } 63 if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) 64 folio_start_fscache(folio); 65 pg_failed |= subreq_failed; 66 sreq_end = subreq->start + subreq->len - 1; 67 if (pg_end < sreq_end) 68 break; 69 70 account += subreq->transferred; 71 if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { 72 subreq = list_next_entry(subreq, rreq_link); 73 subreq_failed = (subreq->error < 0); 74 } else { 75 subreq = NULL; 76 subreq_failed = false; 77 } 78 79 if (pg_end == sreq_end) 80 break; 81 } 82 83 if (!pg_failed) { 84 flush_dcache_folio(folio); 85 folio_mark_uptodate(folio); 86 } 87 88 if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { 89 if (folio_index(folio) == rreq->no_unlock_folio && 90 test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) 91 _debug("no unlock"); 92 else 93 folio_unlock(folio); 94 } 95 } 96 rcu_read_unlock(); 97 98 task_io_account_read(account); 99 if (rreq->netfs_ops->done) 100 rreq->netfs_ops->done(rreq); 101 } 102 103 static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, 104 loff_t *_start, size_t *_len, loff_t i_size) 105 { 106 struct netfs_cache_resources *cres = &rreq->cache_resources; 107 108 if (cres->ops && cres->ops->expand_readahead) 109 cres->ops->expand_readahead(cres, _start, _len, i_size); 110 } 111 112 static void netfs_rreq_expand(struct netfs_io_request *rreq, 113 struct readahead_control *ractl) 114 { 115 /* Give the cache a chance to change the request parameters. The 116 * resultant request must contain the original region. 117 */ 118 netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); 119 120 /* Give the netfs a chance to change the request parameters. The 121 * resultant request must contain the original region. 122 */ 123 if (rreq->netfs_ops->expand_readahead) 124 rreq->netfs_ops->expand_readahead(rreq); 125 126 /* Expand the request if the cache wants it to start earlier. Note 127 * that the expansion may get further extended if the VM wishes to 128 * insert THPs and the preferred start and/or end wind up in the middle 129 * of THPs. 130 * 131 * If this is the case, however, the THP size should be an integer 132 * multiple of the cache granule size, so we get a whole number of 133 * granules to deal with. 134 */ 135 if (rreq->start != readahead_pos(ractl) || 136 rreq->len != readahead_length(ractl)) { 137 readahead_expand(ractl, rreq->start, rreq->len); 138 rreq->start = readahead_pos(ractl); 139 rreq->len = readahead_length(ractl); 140 141 trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), 142 netfs_read_trace_expanded); 143 } 144 } 145 146 /** 147 * netfs_readahead - Helper to manage a read request 148 * @ractl: The description of the readahead request 149 * 150 * Fulfil a readahead request by drawing data from the cache if possible, or 151 * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O 152 * requests from different sources will get munged together. If necessary, the 153 * readahead window can be expanded in either direction to a more convenient 154 * alighment for RPC efficiency or to make storage in the cache feasible. 155 * 156 * The calling netfs must initialise a netfs context contiguous to the vfs 157 * inode before calling this. 158 * 159 * This is usable whether or not caching is enabled. 160 */ 161 void netfs_readahead(struct readahead_control *ractl) 162 { 163 struct netfs_io_request *rreq; 164 struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); 165 int ret; 166 167 _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); 168 169 if (readahead_count(ractl) == 0) 170 return; 171 172 rreq = netfs_alloc_request(ractl->mapping, ractl->file, 173 readahead_pos(ractl), 174 readahead_length(ractl), 175 NETFS_READAHEAD); 176 if (IS_ERR(rreq)) 177 return; 178 179 if (ctx->ops->begin_cache_operation) { 180 ret = ctx->ops->begin_cache_operation(rreq); 181 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) 182 goto cleanup_free; 183 } 184 185 netfs_stat(&netfs_n_rh_readahead); 186 trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), 187 netfs_read_trace_readahead); 188 189 netfs_rreq_expand(rreq, ractl); 190 191 /* Drop the refs on the folios here rather than in the cache or 192 * filesystem. The locks will be dropped in netfs_rreq_unlock(). 193 */ 194 while (readahead_folio(ractl)) 195 ; 196 197 netfs_begin_read(rreq, false); 198 return; 199 200 cleanup_free: 201 netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); 202 return; 203 } 204 EXPORT_SYMBOL(netfs_readahead); 205 206 /** 207 * netfs_read_folio - Helper to manage a read_folio request 208 * @file: The file to read from 209 * @folio: The folio to read 210 * 211 * Fulfil a read_folio request by drawing data from the cache if 212 * possible, or the netfs if not. Space beyond the EOF is zero-filled. 213 * Multiple I/O requests from different sources will get munged together. 214 * 215 * The calling netfs must initialise a netfs context contiguous to the vfs 216 * inode before calling this. 217 * 218 * This is usable whether or not caching is enabled. 219 */ 220 int netfs_read_folio(struct file *file, struct folio *folio) 221 { 222 struct address_space *mapping = folio_file_mapping(folio); 223 struct netfs_io_request *rreq; 224 struct netfs_inode *ctx = netfs_inode(mapping->host); 225 int ret; 226 227 _enter("%lx", folio_index(folio)); 228 229 rreq = netfs_alloc_request(mapping, file, 230 folio_file_pos(folio), folio_size(folio), 231 NETFS_READPAGE); 232 if (IS_ERR(rreq)) { 233 ret = PTR_ERR(rreq); 234 goto alloc_error; 235 } 236 237 if (ctx->ops->begin_cache_operation) { 238 ret = ctx->ops->begin_cache_operation(rreq); 239 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) 240 goto discard; 241 } 242 243 netfs_stat(&netfs_n_rh_readpage); 244 trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); 245 return netfs_begin_read(rreq, true); 246 247 discard: 248 netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); 249 alloc_error: 250 folio_unlock(folio); 251 return ret; 252 } 253 EXPORT_SYMBOL(netfs_read_folio); 254 255 /* 256 * Prepare a folio for writing without reading first 257 * @folio: The folio being prepared 258 * @pos: starting position for the write 259 * @len: length of write 260 * @always_fill: T if the folio should always be completely filled/cleared 261 * 262 * In some cases, write_begin doesn't need to read at all: 263 * - full folio write 264 * - write that lies in a folio that is completely beyond EOF 265 * - write that covers the folio from start to EOF or beyond it 266 * 267 * If any of these criteria are met, then zero out the unwritten parts 268 * of the folio and return true. Otherwise, return false. 269 */ 270 static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, 271 bool always_fill) 272 { 273 struct inode *inode = folio_inode(folio); 274 loff_t i_size = i_size_read(inode); 275 size_t offset = offset_in_folio(folio, pos); 276 size_t plen = folio_size(folio); 277 278 if (unlikely(always_fill)) { 279 if (pos - offset + len <= i_size) 280 return false; /* Page entirely before EOF */ 281 zero_user_segment(&folio->page, 0, plen); 282 folio_mark_uptodate(folio); 283 return true; 284 } 285 286 /* Full folio write */ 287 if (offset == 0 && len >= plen) 288 return true; 289 290 /* Page entirely beyond the end of the file */ 291 if (pos - offset >= i_size) 292 goto zero_out; 293 294 /* Write that covers from the start of the folio to EOF or beyond */ 295 if (offset == 0 && (pos + len) >= i_size) 296 goto zero_out; 297 298 return false; 299 zero_out: 300 zero_user_segments(&folio->page, 0, offset, offset + len, plen); 301 return true; 302 } 303 304 /** 305 * netfs_write_begin - Helper to prepare for writing 306 * @ctx: The netfs context 307 * @file: The file to read from 308 * @mapping: The mapping to read from 309 * @pos: File position at which the write will begin 310 * @len: The length of the write (may extend beyond the end of the folio chosen) 311 * @_folio: Where to put the resultant folio 312 * @_fsdata: Place for the netfs to store a cookie 313 * 314 * Pre-read data for a write-begin request by drawing data from the cache if 315 * possible, or the netfs if not. Space beyond the EOF is zero-filled. 316 * Multiple I/O requests from different sources will get munged together. If 317 * necessary, the readahead window can be expanded in either direction to a 318 * more convenient alighment for RPC efficiency or to make storage in the cache 319 * feasible. 320 * 321 * The calling netfs must provide a table of operations, only one of which, 322 * issue_op, is mandatory. 323 * 324 * The check_write_begin() operation can be provided to check for and flush 325 * conflicting writes once the folio is grabbed and locked. It is passed a 326 * pointer to the fsdata cookie that gets returned to the VM to be passed to 327 * write_end. It is permitted to sleep. It should return 0 if the request 328 * should go ahead or it may return an error. It may also unlock and put the 329 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0 330 * will cause the folio to be re-got and the process to be retried. 331 * 332 * The calling netfs must initialise a netfs context contiguous to the vfs 333 * inode before calling this. 334 * 335 * This is usable whether or not caching is enabled. 336 */ 337 int netfs_write_begin(struct netfs_inode *ctx, 338 struct file *file, struct address_space *mapping, 339 loff_t pos, unsigned int len, struct folio **_folio, 340 void **_fsdata) 341 { 342 struct netfs_io_request *rreq; 343 struct folio *folio; 344 pgoff_t index = pos >> PAGE_SHIFT; 345 int ret; 346 347 DEFINE_READAHEAD(ractl, file, NULL, mapping, index); 348 349 retry: 350 folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, 351 mapping_gfp_mask(mapping)); 352 if (IS_ERR(folio)) 353 return PTR_ERR(folio); 354 355 if (ctx->ops->check_write_begin) { 356 /* Allow the netfs (eg. ceph) to flush conflicts. */ 357 ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata); 358 if (ret < 0) { 359 trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); 360 goto error; 361 } 362 if (!folio) 363 goto retry; 364 } 365 366 if (folio_test_uptodate(folio)) 367 goto have_folio; 368 369 /* If the page is beyond the EOF, we want to clear it - unless it's 370 * within the cache granule containing the EOF, in which case we need 371 * to preload the granule. 372 */ 373 if (!netfs_is_cache_enabled(ctx) && 374 netfs_skip_folio_read(folio, pos, len, false)) { 375 netfs_stat(&netfs_n_rh_write_zskip); 376 goto have_folio_no_wait; 377 } 378 379 rreq = netfs_alloc_request(mapping, file, 380 folio_file_pos(folio), folio_size(folio), 381 NETFS_READ_FOR_WRITE); 382 if (IS_ERR(rreq)) { 383 ret = PTR_ERR(rreq); 384 goto error; 385 } 386 rreq->no_unlock_folio = folio_index(folio); 387 __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); 388 389 if (ctx->ops->begin_cache_operation) { 390 ret = ctx->ops->begin_cache_operation(rreq); 391 if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) 392 goto error_put; 393 } 394 395 netfs_stat(&netfs_n_rh_write_begin); 396 trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); 397 398 /* Expand the request to meet caching requirements and download 399 * preferences. 400 */ 401 ractl._nr_pages = folio_nr_pages(folio); 402 netfs_rreq_expand(rreq, &ractl); 403 404 /* We hold the folio locks, so we can drop the references */ 405 folio_get(folio); 406 while (readahead_folio(&ractl)) 407 ; 408 409 ret = netfs_begin_read(rreq, true); 410 if (ret < 0) 411 goto error; 412 413 have_folio: 414 ret = folio_wait_fscache_killable(folio); 415 if (ret < 0) 416 goto error; 417 have_folio_no_wait: 418 *_folio = folio; 419 _leave(" = 0"); 420 return 0; 421 422 error_put: 423 netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); 424 error: 425 if (folio) { 426 folio_unlock(folio); 427 folio_put(folio); 428 } 429 _leave(" = %d", ret); 430 return ret; 431 } 432 EXPORT_SYMBOL(netfs_write_begin); 433