/*
 * Request reply cache. This is currently a global cache, but this may
 * change in the future and be a per-client cache.
 *
 * This code is heavily inspired by the 44BSD implementation, although
 * it does things a bit differently.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */

#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/highmem.h>
#include <linux/log2.h>
#include <linux/hash.h>
#include <net/checksum.h>

#include "nfsd.h"
#include "cache.h"

#define NFSDDBG_FACILITY	NFSDDBG_REPCACHE

/*
 * We use this value to determine the number of hash buckets from the max
 * cache size, the idea being that when the cache is at its maximum number
 * of entries, then this should be the average number of entries per bucket.
 */
#define TARGET_BUCKET_SIZE	64

struct nfsd_drc_bucket {
	struct list_head lru_head;
	spinlock_t cache_lock;
};

static struct nfsd_drc_bucket	*drc_hashtbl;
static struct kmem_cache	*drc_slab;

/* max number of entries allowed in the cache */
static unsigned int		max_drc_entries;

/* number of significant bits in the hash value */
static unsigned int		maskbits;
static unsigned int		drc_hashsize;

/*
 * Stats and other tracking of the duplicate reply cache. All of these and
 * the "rc" fields in nfsdstats are protected by the cache_lock
 */

/* total number of entries */
static atomic_t			num_drc_entries;

/* cache misses due only to checksum comparison failures */
static unsigned int		payload_misses;

/* amount of memory (in bytes) currently consumed by the DRC */
static unsigned int		drc_mem_usage;

/* longest hash chain seen */
static unsigned int		longest_chain;

/* size of cache when we saw the longest hash chain */
static unsigned int		longest_chain_cachesize;

static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc);

static struct shrinker nfsd_reply_cache_shrinker = {
	.scan_objects = nfsd_reply_cache_scan,
	.count_objects = nfsd_reply_cache_count,
	.seeks	= 1,
};

/*
 * Put a cap on the size of the DRC based on the amount of available
 * low memory in the machine.
 *
 *  64MB:	 8192
 * 128MB:	11585
 * 256MB:	16384
 * 512MB:	23170
 *   1GB:	32768
 *   2GB:	46340
 *   4GB:	65536
 *   8GB:	92681
 *  16GB:      131072
 *
 * ...with a hard cap of 256k entries. In the worst case, each entry will be
 * ~1k, so the above numbers should give a rough max of the amount of memory
 * used in k.
 */
static unsigned int
nfsd_cache_size_limit(void)
{
	unsigned int limit;
	unsigned long low_pages = totalram_pages - totalhigh_pages;

	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
	return min_t(unsigned int, limit, 256*1024);
}

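/*
 * Worked example of the limit above (illustrative only, assuming 4KB
 * pages): with 1GB of low memory, low_pages = 262144, so
 * int_sqrt(low_pages) = 512 and limit = (16 * 512) << (12 - 10) = 32768
 * entries, matching the 1GB row in the table.
 */
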
/*
 * Compute the number of hash buckets we need. Divide the max cachesize by
 * the "target" max bucket size, and round up to next power of two.
 */
static unsigned int
nfsd_hashsize(unsigned int limit)
{
	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}

static u32
nfsd_cache_hash(__be32 xid)
{
	return hash_32(be32_to_cpu(xid), maskbits);
}

static struct svc_cacherep *
nfsd_reply_cache_alloc(void)
{
	struct svc_cacherep	*rp;

	rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
	if (rp) {
		rp->c_state = RC_UNUSED;
		rp->c_type = RC_NOCACHE;
		INIT_LIST_HEAD(&rp->c_lru);
	}
	return rp;
}

static void
nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
{
	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
		drc_mem_usage -= rp->c_replvec.iov_len;
		kfree(rp->c_replvec.iov_base);
	}
	list_del(&rp->c_lru);
	atomic_dec(&num_drc_entries);
	drc_mem_usage -= sizeof(*rp);
	kmem_cache_free(drc_slab, rp);
}

static void
nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
	spin_lock(&b->cache_lock);
	nfsd_reply_cache_free_locked(rp);
	spin_unlock(&b->cache_lock);
}

int nfsd_reply_cache_init(void)
{
	unsigned int hashsize;
	unsigned int i;
	int status = 0;

	max_drc_entries = nfsd_cache_size_limit();
	atomic_set(&num_drc_entries, 0);
	hashsize = nfsd_hashsize(max_drc_entries);
	maskbits = ilog2(hashsize);

	status = register_shrinker(&nfsd_reply_cache_shrinker);
	if (status)
		return status;

	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
					0, 0, NULL);
	if (!drc_slab)
		goto out_nomem;

	drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
	if (!drc_hashtbl)
		goto out_nomem;
	for (i = 0; i < hashsize; i++) {
		INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
		spin_lock_init(&drc_hashtbl[i].cache_lock);
	}
	drc_hashsize = hashsize;

	return 0;
out_nomem:
	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
	nfsd_reply_cache_shutdown();
	return -ENOMEM;
}

void nfsd_reply_cache_shutdown(void)
{
	struct svc_cacherep	*rp;
	unsigned int i;

	unregister_shrinker(&nfsd_reply_cache_shrinker);

	for (i = 0; i < drc_hashsize; i++) {
		struct list_head *head = &drc_hashtbl[i].lru_head;
		while (!list_empty(head)) {
			rp = list_first_entry(head, struct svc_cacherep, c_lru);
			nfsd_reply_cache_free_locked(rp);
		}
	}

	kfree(drc_hashtbl);
	drc_hashtbl = NULL;
	drc_hashsize = 0;

	kmem_cache_destroy(drc_slab);
	drc_slab = NULL;
}

/*
 * Move cache entry to end of LRU list, and update its timestamp.
 */
static void
lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
	rp->c_timestamp = jiffies;
	list_move_tail(&rp->c_lru, &b->lru_head);
}

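/*
 * Prune one bucket: free entries that have expired, and the oldest entries
 * while the cache as a whole is over max_drc_entries. Entries for calls
 * still in progress are skipped. The caller must hold b->cache_lock; the
 * list is kept in LRU order, so the scan stops at the first entry that
 * does not need freeing.
 */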
static long
prune_bucket(struct nfsd_drc_bucket *b)
{
	struct svc_cacherep *rp, *tmp;
	long freed = 0;

	list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
		/*
		 * Don't free entries attached to calls that are still
		 * in-progress, but do keep scanning the list.
		 */
		if (rp->c_state == RC_INPROG)
			continue;
		if (atomic_read(&num_drc_entries) <= max_drc_entries &&
		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
			break;
		nfsd_reply_cache_free_locked(rp);
		freed++;
	}
	return freed;
}

/*
 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
 * Also prune the oldest ones when the total exceeds the max number of entries.
 */
static long
prune_cache_entries(void)
{
	unsigned int i;
	long freed = 0;

	for (i = 0; i < drc_hashsize; i++) {
		struct nfsd_drc_bucket *b = &drc_hashtbl[i];

		if (list_empty(&b->lru_head))
			continue;
		spin_lock(&b->cache_lock);
		freed += prune_bucket(b);
		spin_unlock(&b->cache_lock);
	}
	return freed;
}

static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
	return atomic_read(&num_drc_entries);
}

static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	return prune_cache_entries();
}

/*
 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
 */
static __wsum
nfsd_cache_csum(struct svc_rqst *rqstp)
{
	int idx;
	unsigned int base;
	__wsum csum;
	struct xdr_buf *buf = &rqstp->rq_arg;
	const unsigned char *p = buf->head[0].iov_base;
	size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
				RC_CSUMLEN);
	size_t len = min(buf->head[0].iov_len, csum_len);

	/* rq_arg.head first */
	csum = csum_partial(p, len, 0);
	csum_len -= len;

	/* Continue into page array */
	idx = buf->page_base / PAGE_SIZE;
	base = buf->page_base & ~PAGE_MASK;
	while (csum_len) {
		p = page_address(buf->pages[idx]) + base;
		len = min_t(size_t, PAGE_SIZE - base, csum_len);
		csum = csum_partial(p, len, csum);
		csum_len -= len;
		base = 0;
		++idx;
	}
	return csum;
}

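/*
 * Return true if @rp matches the incoming request: same XID, same checksum
 * over the first RC_CSUMLEN bytes of the argument, and the same procedure,
 * protocol, version, argument length and client address/port.
 */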
static bool
nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
{
	/* Check RPC XID first */
	if (rqstp->rq_xid != rp->c_xid)
		return false;
	/* compare checksum of NFS data */
	if (csum != rp->c_csum) {
		++payload_misses;
		return false;
	}

	/* Other discriminators */
	if (rqstp->rq_proc != rp->c_proc ||
	    rqstp->rq_prot != rp->c_prot ||
	    rqstp->rq_vers != rp->c_vers ||
	    rqstp->rq_arg.len != rp->c_len ||
	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
		return false;

	return true;
}

/*
 * Search the request hash for an entry that matches the given rqstp.
 * Must be called with cache_lock held. Returns the found entry or
 * NULL on failure.
 */
static struct svc_cacherep *
nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
		__wsum csum)
{
	struct svc_cacherep	*rp, *ret = NULL;
	struct list_head	*rh = &b->lru_head;
	unsigned int		entries = 0;

	list_for_each_entry(rp, rh, c_lru) {
		++entries;
		if (nfsd_cache_match(rqstp, csum, rp)) {
			ret = rp;
			break;
		}
	}

	/* tally hash chain length stats */
	if (entries > longest_chain) {
		longest_chain = entries;
		longest_chain_cachesize = atomic_read(&num_drc_entries);
	} else if (entries == longest_chain) {
		/* prefer to keep the smallest cachesize possible here */
		longest_chain_cachesize = min_t(unsigned int,
				longest_chain_cachesize,
				atomic_read(&num_drc_entries));
	}

	return ret;
}

/*
 * Try to find an entry matching the current call in the cache. Since the
 * common case is a miss followed by an insert, an entry is preallocated
 * before the bucket lock is taken; when the search then finds a match,
 * the preallocated entry is released and the cached one is used instead.
 */
int
nfsd_cache_lookup(struct svc_rqst *rqstp)
{
	struct svc_cacherep	*rp, *found;
	__be32			xid = rqstp->rq_xid;
	u32			proto = rqstp->rq_prot,
				vers = rqstp->rq_vers,
				proc = rqstp->rq_proc;
	__wsum			csum;
	u32 hash = nfsd_cache_hash(xid);
	struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
	unsigned long		age;
	int type = rqstp->rq_cachetype;
	int rtn = RC_DOIT;

	rqstp->rq_cacherep = NULL;
	if (type == RC_NOCACHE) {
		nfsdstats.rcnocache++;
		return rtn;
	}

	csum = nfsd_cache_csum(rqstp);

	/*
	 * Since the common case is a cache miss followed by an insert,
	 * preallocate an entry.
	 */
	rp = nfsd_reply_cache_alloc();
	spin_lock(&b->cache_lock);
	if (likely(rp)) {
		atomic_inc(&num_drc_entries);
		drc_mem_usage += sizeof(*rp);
	}

	/* go ahead and prune the cache */
	prune_bucket(b);

	found = nfsd_cache_search(b, rqstp, csum);
	if (found) {
		if (likely(rp))
			nfsd_reply_cache_free_locked(rp);
		rp = found;
		goto found_entry;
	}

	if (!rp) {
		dprintk("nfsd: unable to allocate DRC entry!\n");
		goto out;
	}

	nfsdstats.rcmisses++;
	rqstp->rq_cacherep = rp;
	rp->c_state = RC_INPROG;
	rp->c_xid = xid;
	rp->c_proc = proc;
	rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
	rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
	rp->c_prot = proto;
	rp->c_vers = vers;
	rp->c_len = rqstp->rq_arg.len;
	rp->c_csum = csum;

	lru_put_end(b, rp);

	/* release any buffer */
	if (rp->c_type == RC_REPLBUFF) {
		drc_mem_usage -= rp->c_replvec.iov_len;
		kfree(rp->c_replvec.iov_base);
		rp->c_replvec.iov_base = NULL;
	}
	rp->c_type = RC_NOCACHE;
out:
	spin_unlock(&b->cache_lock);
	return rtn;

found_entry:
	nfsdstats.rchits++;
	/* We found a matching entry which is either in progress or done. */
	age = jiffies - rp->c_timestamp;
	lru_put_end(b, rp);

	rtn = RC_DROPIT;
	/* Request being processed or excessive rexmits */
	if (rp->c_state == RC_INPROG || age < RC_DELAY)
		goto out;

	/* From the hall of fame of impractical attacks:
	 * Is this a user who tries to snoop on the cache? */
	rtn = RC_DOIT;
	if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
		goto out;

	/* Compose RPC reply header */
	switch (rp->c_type) {
	case RC_NOCACHE:
		break;
	case RC_REPLSTAT:
		svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat);
		rtn = RC_REPLY;
		break;
	case RC_REPLBUFF:
		if (!nfsd_cache_append(rqstp, &rp->c_replvec))
			goto out;	/* should not happen */
		rtn = RC_REPLY;
		break;
	default:
		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
		nfsd_reply_cache_free_locked(rp);
	}

	goto out;
}

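/*
 * A simplified sketch (not the literal dispatcher code) of how the lookup
 * and update entry points are driven from nfsd_dispatch():
 *
 *	switch (nfsd_cache_lookup(rqstp)) {
 *	case RC_DROPIT:		// duplicate in progress or recent rexmit:
 *		return 0;	//   drop it, the client will retransmit
 *	case RC_REPLY:		// cached reply was copied into rq_res:
 *		return 1;	//   just send it
 *	case RC_DOIT:		// miss: run the procedure, then cache
 *		break;		//   the result below
 *	}
 *	... decode arguments, run the procedure, encode the reply ...
 *	nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp);
 */
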
/*
 * Update a cache entry. This is called from nfsd_dispatch when
 * the procedure has been executed and the complete reply is in
 * rqstp->rq_res.
 *
 * We're copying around data here rather than swapping buffers because
 * the toplevel loop requires max-sized buffers, which would be a waste
 * of memory for a cache with a max reply size of 100 bytes (diropokres).
 *
 * If we should start to use different types of cache entries tailored
 * specifically for attrstat and fh's, we may save even more space.
 *
 * Also note that a cachetype of RC_NOCACHE can legally be passed when
 * nfsd failed to encode a reply that otherwise would have been cached.
 * In this case, nfsd_cache_update is called with statp == NULL.
 */
void
nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
	struct svc_cacherep *rp = rqstp->rq_cacherep;
	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
	u32		hash;
	struct nfsd_drc_bucket *b;
	int		len;
	size_t		bufsize = 0;

	if (!rp)
		return;

	hash = nfsd_cache_hash(rp->c_xid);
	b = &drc_hashtbl[hash];

	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
	len >>= 2;

	/* Don't cache excessive amounts of data and XDR failures */
	if (!statp || len > (256 >> 2)) {
		nfsd_reply_cache_free(b, rp);
		return;
	}

	switch (cachetype) {
	case RC_REPLSTAT:
		if (len != 1)
			printk("nfsd: RC_REPLSTAT/reply len %d!\n", len);
		rp->c_replstat = *statp;
		break;
	case RC_REPLBUFF:
		cachv = &rp->c_replvec;
		bufsize = len << 2;
		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
		if (!cachv->iov_base) {
			nfsd_reply_cache_free(b, rp);
			return;
		}
		cachv->iov_len = bufsize;
		memcpy(cachv->iov_base, statp, bufsize);
		break;
	case RC_NOCACHE:
		nfsd_reply_cache_free(b, rp);
		return;
	}
	spin_lock(&b->cache_lock);
	drc_mem_usage += bufsize;
	lru_put_end(b, rp);
	rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
	rp->c_type = cachetype;
	rp->c_state = RC_DONE;
	spin_unlock(&b->cache_lock);
	return;
}

/*
 * Copy cached reply to current reply buffer. Should always fit.
 * FIXME as reply is in a page, we should just attach the page, and
 * keep a refcount....
 */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
	struct kvec	*vec = &rqstp->rq_res.head[0];

	if (vec->iov_len + data->iov_len > PAGE_SIZE) {
		printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n",
				data->iov_len);
		return 0;
	}
	memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
	vec->iov_len += data->iov_len;
	return 1;
}

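/*
 * The statistics below are exported read-only through nfsd's procfs
 * interface; the seq_file is wired up as the "reply_cache_stats" file
 * by nfsctl.c.
 */
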
/*
 * Note that fields may be added, removed or reordered in the future. Programs
 * scraping this file for info should test the labels to ensure they're
 * getting the correct field.
 */
static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
	seq_printf(m, "max entries: %u\n", max_drc_entries);
	seq_printf(m, "num entries: %u\n",
			atomic_read(&num_drc_entries));
	seq_printf(m, "hash buckets: %u\n", 1 << maskbits);
	seq_printf(m, "mem usage: %u\n", drc_mem_usage);
	seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
	seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses);
	seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache);
	seq_printf(m, "payload misses: %u\n", payload_misses);
	seq_printf(m, "longest chain len: %u\n", longest_chain);
	seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
	return 0;
}

int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, nfsd_reply_cache_stats_show, NULL);
}