/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file cong.c
 * Oracle elects to have and use the contents of cong.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */


/*
 * Copyright (c) 2007 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

/*
 * This file implements the receive side of the unconventional congestion
 * management in RDS.
 *
 * Messages waiting in the receive queue on the receiving socket are accounted
 * against the socket's SO_RCVBUF option value. Only the payload bytes in the
 * message are accounted for. If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested. All sends attempted to this socket's
 * address should block or return -EWOULDBLOCK.
 *
 * Applications are expected to be reasonably tuned such that this situation
 * very rarely occurs. An application encountering this "back-pressure" is
 * considered an application bug.
 *
 * This is implemented by having each node maintain bitmaps which indicate
 * which ports on bound addresses are congested. As the bitmap changes it is
 * sent through all the connections which terminate in the local address of the
 * bitmap which changed.
 *
 * The bitmaps are allocated as connections are brought up. This avoids
 * allocation in the interrupt handling path which queues messages on sockets.
 * The dense bitmaps let transports send the entire bitmap on any bitmap change
 * reasonably efficiently. This is much easier to implement than some
 * finer-grained communication of per-port congestion. The sender does a very
 * inexpensive bit test to check whether the port it's about to send to is
 * congested or not.
 */

/*
 * Interaction with poll is a tad tricky. We want all processes stuck in
 * poll to wake up and check whether a congested destination became uncongested.
 * The really sad thing is we have no idea which destinations the application
 * wants to send to - we don't even know which rdsv3_connections are involved.
 * So until we implement a more flexible rds poll interface, we have to make
 * do with this:
 * We maintain a global counter that is incremented each time a congestion map
 * update is received. Each rds socket tracks this value, and if rdsv3_poll
 * finds that the saved generation number is smaller than the global generation
 * number, it wakes up the process.
 */
static atomic_t rdsv3_cong_generation = ATOMIC_INIT(0);
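
/*
 * For illustration, a minimal sketch of the consumer side of this counter.
 * The real check lives in the poll path, not in this file; rs, mask and
 * the rs_cong_track field caching the last-seen generation are assumptions
 * about the surrounding code:
 */
#if 0
	if (rdsv3_cong_updated_since(&rs->rs_cong_track))
		mask |= (POLLIN | POLLRDNORM);	/* wake pollers to re-probe */
#endif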

/*
 * Congestion monitoring
 */
static struct list rdsv3_cong_monitor;
static krwlock_t rdsv3_cong_monitor_lock;

/*
 * Yes, a global lock. It's used so infrequently that it's worth keeping it
 * global to simplify the locking. It's only used in the following
 * circumstances:
 *
 *  - on connection buildup to associate a conn with its maps
 *  - on map changes to inform conns of a new map to send
 *
 * It's sadly ordered under the socket callback lock and the connection lock.
 * Receive paths can mark ports congested from interrupt context so the
 * lock masks interrupts.
 */
static kmutex_t rdsv3_cong_lock;
static struct avl_tree rdsv3_cong_tree;

static struct rdsv3_cong_map *
rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
{
	struct rdsv3_cong_map *map;
	avl_index_t where;

	if (insert) {
		map = avl_find(&rdsv3_cong_tree, insert, &where);
		if (map == NULL) {
			avl_insert(&rdsv3_cong_tree, insert, where);
			return (NULL);
		}
	} else {
		struct rdsv3_cong_map map1;

		map1.m_addr = addr;
		map = avl_find(&rdsv3_cong_tree, &map1, &where);
	}

	return (map);
}

/*
 * There is only ever one bitmap for any address. Connections try to allocate
 * these bitmaps, getting pointers to them in the process. The bitmaps are
 * only ever freed as the module is removed after all connections have been
 * freed.
 */
static struct rdsv3_cong_map *
rdsv3_cong_from_addr(uint32_be_t addr)
{
	struct rdsv3_cong_map *map;
	struct rdsv3_cong_map *ret = NULL;
	unsigned long zp;
	unsigned long i;

	RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));

	map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
	if (!map)
		return (NULL);

	map->m_addr = addr;
	rdsv3_init_waitqueue(&map->m_waitq);
	list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
	    offsetof(struct rdsv3_connection, c_map_item));

	for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
		zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
		if (zp == 0)
			goto out;
		map->m_page_addrs[i] = zp;
	}

	mutex_enter(&rdsv3_cong_lock);
	ret = rdsv3_cong_tree_walk(addr, map);
	mutex_exit(&rdsv3_cong_lock);

	if (!ret) {
		ret = map;
		map = NULL;
	}

out:
	if (map) {
		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
		    i++)
			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
		kmem_free(map, sizeof (*map));
	}

	RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
	    ret, ntohl(addr));

	return (ret);
}
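
/*
 * For reference, the sizing arithmetic behind the allocation loop above,
 * assuming the usual definitions (a 4KB PAGE_SIZE and one bit per port):
 *
 *	65536 ports / 8 bits per byte	= 8192 bytes per map
 *	8192 bytes / 4096-byte pages	= 2 pages (RDSV3_CONG_MAP_PAGES)
 *	4096 bytes * 8			= 32768 bits per page
 *					  (RDSV3_CONG_MAP_PAGE_BITS)
 *
 * A port then lands on page ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS at bit
 * ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS, which is exactly what the
 * set/clear/test helpers further below compute.
 */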

/*
 * Put the conn on its local map's list. This is called when the conn is
 * really added to the hash. It's nested under the rdsv3_conn_lock, sadly.
 */
void
rdsv3_cong_add_conn(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);

	RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
	    conn, conn->c_lcong);
	mutex_enter(&rdsv3_cong_lock);
	list_insert_tail(&conn->c_lcong->m_conn_list, conn);
	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
}

void
rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);

	RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
	    conn, conn->c_lcong);
	mutex_enter(&rdsv3_cong_lock);
	list_remove_node(&conn->c_map_item);
	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
}

int
rdsv3_cong_get_maps(struct rdsv3_connection *conn)
{
	conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
	conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);

	if (!(conn->c_lcong && conn->c_fcong))
		return (-ENOMEM);

	return (0);
}

void
rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
{
	struct rdsv3_connection *conn;

	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);

	mutex_enter(&rdsv3_cong_lock);

	RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
		if (!test_and_set_bit(0, &conn->c_map_queued)) {
			rdsv3_stats_inc(s_cong_update_queued);
			(void) rdsv3_send_xmit(conn);
		}
	}

	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
}

void
rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
{
	RDSV3_DPRINTF4("rdsv3_cong_map_updated",
	    "waking map %p for %u.%u.%u.%u",
	    map, NIPQUAD(map->m_addr));

	rdsv3_stats_inc(s_cong_update_received);
	atomic_add_32(&rdsv3_cong_generation, 1);
#if 0
XXX
	if (waitqueue_active(&map->m_waitq))
#endif
	rdsv3_wake_up(&map->m_waitq);

	if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
		struct rdsv3_sock *rs;

		rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
		RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
		    rs_cong_list) {
			mutex_enter(&rs->rs_lock);
			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
			rs->rs_cong_mask &= ~portmask;
			mutex_exit(&rs->rs_lock);
			if (rs->rs_cong_notify)
				rdsv3_wake_sk_sleep(rs);
		}
		rw_exit(&rdsv3_cong_monitor_lock);
	}

	RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
}

int
rdsv3_cong_updated_since(unsigned long *recent)
{
	unsigned long gen = atomic_get(&rdsv3_cong_generation);

	if (*recent == gen)
		return (0);
	*recent = gen;
	return (1);
}
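
/*
 * A worked example of the 64-bit monitor mask consumed by
 * rdsv3_cong_map_updated() above and armed in rdsv3_cong_wait() below,
 * assuming sys/rds.h defines RDS_CONG_MONITOR_MASK(port) as
 * (1ULL << ((port) % 64)): a socket monitoring port 15 has bit 15 set in
 * rs_cong_mask; an update whose portmask includes bit 15 (e.g. one
 * triggered by port 4047, since 4047 % 64 == 15) moves that bit into
 * rs_cong_notify and wakes the socket. Ports congruent modulo 64 thus
 * share a notification bit.
 */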

/*
 * We're called under the locking that protects the socket's receive buffer
 * consumption. This makes it a lot easier for the caller to only call us
 * when it knows that an existing set bit needs to be cleared, and vice versa.
 * We can't block and we need to deal with concurrent sockets working against
 * the same per-address map.
 */
void
rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
{
	unsigned long i;
	unsigned long off;

	RDSV3_DPRINTF4("rdsv3_cong_set_bit",
	    "setting congestion for %u.%u.%u.%u:%u in map %p",
	    NIPQUAD(map->m_addr), ntohs(port), map);

	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
	set_le_bit(off, (void *)map->m_page_addrs[i]);
}

void
rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
{
	unsigned long i;
	unsigned long off;

	RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
	    "clearing congestion for %u.%u.%u.%u:%u in map %p",
	    NIPQUAD(map->m_addr), ntohs(port), map);

	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
	clear_le_bit(off, (void *)map->m_page_addrs[i]);
}

static int
rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
{
	unsigned long i;
	unsigned long off;

	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;

	RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
	    ntohs(port), i, off);

	return (test_le_bit(off, (void *)map->m_page_addrs[i]));
}

void
rdsv3_cong_add_socket(struct rdsv3_sock *rs)
{
	RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);

	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
	if (!list_link_active(&rs->rs_cong_list))
		list_insert_head(&rdsv3_cong_monitor, rs);
	rw_exit(&rdsv3_cong_monitor_lock);
}

void
rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
{
	struct rdsv3_cong_map *map;

	RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);

	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
	list_remove_node(&rs->rs_cong_list);
	rw_exit(&rdsv3_cong_monitor_lock);

	/* update congestion map for now-closed port */
	mutex_enter(&rdsv3_cong_lock);
	map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
	mutex_exit(&rdsv3_cong_lock);

	if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
		rdsv3_cong_clear_bit(map, rs->rs_bound_port);
		rdsv3_cong_queue_updates(map);
	}
}

int
rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
    struct rdsv3_sock *rs)
{
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
	    rs, nonblock);

	if (!rdsv3_cong_test_bit(map, port))
		return (0);
	if (nonblock) {
		if (rs && rs->rs_cong_monitor) {
			/*
			 * It would have been nice to have an atomic set_bit on
			 * a uint64_t.
			 */
			mutex_enter(&rs->rs_lock);
			rs->rs_cong_mask |=
			    RDS_CONG_MONITOR_MASK(ntohs(port));
			mutex_exit(&rs->rs_lock);

			/*
			 * Test again - a congestion update may have arrived in
			 * the meantime.
			 */
			if (!rdsv3_cong_test_bit(map, port))
				return (0);
		}
		rdsv3_stats_inc(s_cong_send_error);
		return (-ENOBUFS);
	}

	rdsv3_stats_inc(s_cong_send_blocked);
	RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
	    map, ntohs(port));

#if 0
	ret = rdsv3_wait_sig(&map->m_waitq, !rdsv3_cong_test_bit(map, port));
	if (ret == 0)
		return (-ERESTART);
	return (0);
#else
	mutex_enter(&map->m_waitq.waitq_mutex);
	map->m_waitq.waitq_waiters++;
	while (rdsv3_cong_test_bit(map, port)) {
		ret = cv_wait_sig(&map->m_waitq.waitq_cv,
		    &map->m_waitq.waitq_mutex);
		if (ret == 0) {
			/* interrupted by a signal while sleeping */
			ret = -EINTR;
			break;
		}
		/*
		 * cv_wait_sig() returns > 0 on a normal wakeup; reset ret
		 * so that a successful wait returns 0 to the caller.
		 */
		ret = 0;
	}
	map->m_waitq.waitq_waiters--;
	mutex_exit(&map->m_waitq.waitq_mutex);
	return (ret);
#endif
}
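
/*
 * A sketch of the sender's side of rdsv3_cong_wait(), for illustration
 * only; the real call sits in the send path, and the local names (dport,
 * nonblock, ret) are assumptions about that code:
 */
#if 0
	ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs);
	if (ret)
		goto out;	/* congested: -ENOBUFS (nonblock) or -EINTR */
#endif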

void
rdsv3_cong_exit(void)
{
	struct rdsv3_cong_map *map;
	unsigned long i;

	RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");

	while ((map = avl_first(&rdsv3_cong_tree)) != NULL) {
		RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p", map);
		avl_remove(&rdsv3_cong_tree, map);
		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
		    i++)
			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
		kmem_free(map, sizeof (*map));
	}

	RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
}

/*
 * Allocate an RDS message containing a congestion update.
 */
struct rdsv3_message *
rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
{
	struct rdsv3_cong_map *map = conn->c_lcong;
	struct rdsv3_message *rm;

	rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
	if (!IS_ERR(rm))
		rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;

	return (rm);
}

static int
rdsv3_cong_compare(const void *map1, const void *map2)
{
#define	addr1	((struct rdsv3_cong_map *)map1)->m_addr
#define	addr2	((struct rdsv3_cong_map *)map2)->m_addr

	if (addr1 < addr2)
		return (-1);
	if (addr1 > addr2)
		return (1);
	return (0);
}

void
rdsv3_cong_init(void)
{
	list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
	    offsetof(struct rdsv3_sock, rs_cong_list));
	rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
	avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
	    sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
	    m_rb_node));
}