1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * 3 * vim: noexpandtab sw=8 ts=8 sts=0: 4 * 5 * Copyright (C) 2004 Oracle. All rights reserved. 6 * 7 * This program is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public 9 * License as published by the Free Software Foundation; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public 18 * License along with this program; if not, write to the 19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 20 * Boston, MA 021110-1307, USA. 21 * 22 * ---- 23 * 24 * Callers for this were originally written against a very simple synchronus 25 * API. This implementation reflects those simple callers. Some day I'm sure 26 * we'll need to move to a more robust posting/callback mechanism. 27 * 28 * Transmit calls pass in kernel virtual addresses and block copying this into 29 * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting 30 * for a failed socket to timeout. TX callers can also pass in a poniter to an 31 * 'int' which gets filled with an errno off the wire in response to the 32 * message they send. 33 * 34 * Handlers for unsolicited messages are registered. Each socket has a page 35 * that incoming data is copied into. First the header, then the data. 36 * Handlers are called from only one thread with a reference to this per-socket 37 * page. This page is destroyed after the handler call, so it can't be 38 * referenced beyond the call. Handlers may block but are discouraged from 39 * doing so. 40 * 41 * Any framing errors (bad magic, large payload lengths) close a connection. 42 * 43 * Our sock_container holds the state we associate with a socket. It's current 44 * framing state is held there as well as the refcounting we do around when it 45 * is safe to tear down the socket. The socket is only finally torn down from 46 * the container when the container loses all of its references -- so as long 47 * as you hold a ref on the container you can trust that the socket is valid 48 * for use with kernel socket APIs. 49 * 50 * Connections are initiated between a pair of nodes when the node with the 51 * higher node number gets a heartbeat callback which indicates that the lower 52 * numbered node has started heartbeating. The lower numbered node is passive 53 * and only accepts the connection if the higher numbered node is heartbeating. 54 */ 55 56 #include <linux/kernel.h> 57 #include <linux/sched/mm.h> 58 #include <linux/jiffies.h> 59 #include <linux/slab.h> 60 #include <linux/idr.h> 61 #include <linux/kref.h> 62 #include <linux/net.h> 63 #include <linux/export.h> 64 #include <net/tcp.h> 65 66 #include <linux/uaccess.h> 67 68 #include "heartbeat.h" 69 #include "tcp.h" 70 #include "nodemanager.h" 71 #define MLOG_MASK_PREFIX ML_TCP 72 #include "masklog.h" 73 #include "quorum.h" 74 75 #include "tcp_internal.h" 76 77 #define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" 78 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 79 &sc->sc_node->nd_ipv4_address, \ 80 ntohs(sc->sc_node->nd_ipv4_port) 81 82 /* 83 * In the following two log macros, the whitespace after the ',' just 84 * before ##args is intentional. Otherwise, gcc 2.95 will eat the 85 * previous token if args expands to nothing. 86 */ 87 #define msglog(hdr, fmt, args...) do { \ 88 typeof(hdr) __hdr = (hdr); \ 89 mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ 90 "key %08x num %u] " fmt, \ 91 be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ 92 be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ 93 be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ 94 be32_to_cpu(__hdr->msg_num) , ##args); \ 95 } while (0) 96 97 #define sclog(sc, fmt, args...) do { \ 98 typeof(sc) __sc = (sc); \ 99 mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ 100 "pg_off %zu] " fmt, __sc, \ 101 kref_read(&__sc->sc_kref), __sc->sc_sock, \ 102 __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ 103 ##args); \ 104 } while (0) 105 106 static DEFINE_RWLOCK(o2net_handler_lock); 107 static struct rb_root o2net_handler_tree = RB_ROOT; 108 109 static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; 110 111 /* XXX someday we'll need better accounting */ 112 static struct socket *o2net_listen_sock; 113 114 /* 115 * listen work is only queued by the listening socket callbacks on the 116 * o2net_wq. teardown detaches the callbacks before destroying the workqueue. 117 * quorum work is queued as sock containers are shutdown.. stop_listening 118 * tears down all the node's sock containers, preventing future shutdowns 119 * and queued quroum work, before canceling delayed quorum work and 120 * destroying the work queue. 121 */ 122 static struct workqueue_struct *o2net_wq; 123 static struct work_struct o2net_listen_work; 124 125 static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; 126 #define O2NET_HB_PRI 0x1 127 128 static struct o2net_handshake *o2net_hand; 129 static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; 130 131 static int o2net_sys_err_translations[O2NET_ERR_MAX] = 132 {[O2NET_ERR_NONE] = 0, 133 [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, 134 [O2NET_ERR_OVERFLOW] = -EOVERFLOW, 135 [O2NET_ERR_DIED] = -EHOSTDOWN,}; 136 137 /* can't quite avoid *all* internal declarations :/ */ 138 static void o2net_sc_connect_completed(struct work_struct *work); 139 static void o2net_rx_until_empty(struct work_struct *work); 140 static void o2net_shutdown_sc(struct work_struct *work); 141 static void o2net_listen_data_ready(struct sock *sk); 142 static void o2net_sc_send_keep_req(struct work_struct *work); 143 static void o2net_idle_timer(unsigned long data); 144 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); 145 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 146 147 #ifdef CONFIG_DEBUG_FS 148 static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, 149 u32 msgkey, struct task_struct *task, u8 node) 150 { 151 INIT_LIST_HEAD(&nst->st_net_debug_item); 152 nst->st_task = task; 153 nst->st_msg_type = msgtype; 154 nst->st_msg_key = msgkey; 155 nst->st_node = node; 156 } 157 158 static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 159 { 160 nst->st_sock_time = ktime_get(); 161 } 162 163 static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 164 { 165 nst->st_send_time = ktime_get(); 166 } 167 168 static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 169 { 170 nst->st_status_time = ktime_get(); 171 } 172 173 static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 174 struct o2net_sock_container *sc) 175 { 176 nst->st_sc = sc; 177 } 178 179 static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 180 u32 msg_id) 181 { 182 nst->st_id = msg_id; 183 } 184 185 static inline void o2net_set_sock_timer(struct o2net_sock_container *sc) 186 { 187 sc->sc_tv_timer = ktime_get(); 188 } 189 190 static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc) 191 { 192 sc->sc_tv_data_ready = ktime_get(); 193 } 194 195 static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc) 196 { 197 sc->sc_tv_advance_start = ktime_get(); 198 } 199 200 static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc) 201 { 202 sc->sc_tv_advance_stop = ktime_get(); 203 } 204 205 static inline void o2net_set_func_start_time(struct o2net_sock_container *sc) 206 { 207 sc->sc_tv_func_start = ktime_get(); 208 } 209 210 static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc) 211 { 212 sc->sc_tv_func_stop = ktime_get(); 213 } 214 215 #else /* CONFIG_DEBUG_FS */ 216 # define o2net_init_nst(a, b, c, d, e) 217 # define o2net_set_nst_sock_time(a) 218 # define o2net_set_nst_send_time(a) 219 # define o2net_set_nst_status_time(a) 220 # define o2net_set_nst_sock_container(a, b) 221 # define o2net_set_nst_msg_id(a, b) 222 # define o2net_set_sock_timer(a) 223 # define o2net_set_data_ready_time(a) 224 # define o2net_set_advance_start_time(a) 225 # define o2net_set_advance_stop_time(a) 226 # define o2net_set_func_start_time(a) 227 # define o2net_set_func_stop_time(a) 228 #endif /* CONFIG_DEBUG_FS */ 229 230 #ifdef CONFIG_OCFS2_FS_STATS 231 static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) 232 { 233 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); 234 } 235 236 static void o2net_update_send_stats(struct o2net_send_tracking *nst, 237 struct o2net_sock_container *sc) 238 { 239 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, 240 ktime_sub(ktime_get(), 241 nst->st_status_time)); 242 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, 243 ktime_sub(nst->st_status_time, 244 nst->st_send_time)); 245 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, 246 ktime_sub(nst->st_send_time, 247 nst->st_sock_time)); 248 sc->sc_send_count++; 249 } 250 251 static void o2net_update_recv_stats(struct o2net_sock_container *sc) 252 { 253 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, 254 o2net_get_func_run_time(sc)); 255 sc->sc_recv_count++; 256 } 257 258 #else 259 260 # define o2net_update_send_stats(a, b) 261 262 # define o2net_update_recv_stats(sc) 263 264 #endif /* CONFIG_OCFS2_FS_STATS */ 265 266 static inline unsigned int o2net_reconnect_delay(void) 267 { 268 return o2nm_single_cluster->cl_reconnect_delay_ms; 269 } 270 271 static inline unsigned int o2net_keepalive_delay(void) 272 { 273 return o2nm_single_cluster->cl_keepalive_delay_ms; 274 } 275 276 static inline unsigned int o2net_idle_timeout(void) 277 { 278 return o2nm_single_cluster->cl_idle_timeout_ms; 279 } 280 281 static inline int o2net_sys_err_to_errno(enum o2net_system_error err) 282 { 283 int trans; 284 BUG_ON(err >= O2NET_ERR_MAX); 285 trans = o2net_sys_err_translations[err]; 286 287 /* Just in case we mess up the translation table above */ 288 BUG_ON(err != O2NET_ERR_NONE && trans == 0); 289 return trans; 290 } 291 292 static struct o2net_node * o2net_nn_from_num(u8 node_num) 293 { 294 BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); 295 return &o2net_nodes[node_num]; 296 } 297 298 static u8 o2net_num_from_nn(struct o2net_node *nn) 299 { 300 BUG_ON(nn == NULL); 301 return nn - o2net_nodes; 302 } 303 304 /* ------------------------------------------------------------ */ 305 306 static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) 307 { 308 int ret; 309 310 spin_lock(&nn->nn_lock); 311 ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC); 312 if (ret >= 0) { 313 nsw->ns_id = ret; 314 list_add_tail(&nsw->ns_node_item, &nn->nn_status_list); 315 } 316 spin_unlock(&nn->nn_lock); 317 if (ret < 0) 318 return ret; 319 320 init_waitqueue_head(&nsw->ns_wq); 321 nsw->ns_sys_status = O2NET_ERR_NONE; 322 nsw->ns_status = 0; 323 return 0; 324 } 325 326 static void o2net_complete_nsw_locked(struct o2net_node *nn, 327 struct o2net_status_wait *nsw, 328 enum o2net_system_error sys_status, 329 s32 status) 330 { 331 assert_spin_locked(&nn->nn_lock); 332 333 if (!list_empty(&nsw->ns_node_item)) { 334 list_del_init(&nsw->ns_node_item); 335 nsw->ns_sys_status = sys_status; 336 nsw->ns_status = status; 337 idr_remove(&nn->nn_status_idr, nsw->ns_id); 338 wake_up(&nsw->ns_wq); 339 } 340 } 341 342 static void o2net_complete_nsw(struct o2net_node *nn, 343 struct o2net_status_wait *nsw, 344 u64 id, enum o2net_system_error sys_status, 345 s32 status) 346 { 347 spin_lock(&nn->nn_lock); 348 if (nsw == NULL) { 349 if (id > INT_MAX) 350 goto out; 351 352 nsw = idr_find(&nn->nn_status_idr, id); 353 if (nsw == NULL) 354 goto out; 355 } 356 357 o2net_complete_nsw_locked(nn, nsw, sys_status, status); 358 359 out: 360 spin_unlock(&nn->nn_lock); 361 return; 362 } 363 364 static void o2net_complete_nodes_nsw(struct o2net_node *nn) 365 { 366 struct o2net_status_wait *nsw, *tmp; 367 unsigned int num_kills = 0; 368 369 assert_spin_locked(&nn->nn_lock); 370 371 list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { 372 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); 373 num_kills++; 374 } 375 376 mlog(0, "completed %d messages for node %u\n", num_kills, 377 o2net_num_from_nn(nn)); 378 } 379 380 static int o2net_nsw_completed(struct o2net_node *nn, 381 struct o2net_status_wait *nsw) 382 { 383 int completed; 384 spin_lock(&nn->nn_lock); 385 completed = list_empty(&nsw->ns_node_item); 386 spin_unlock(&nn->nn_lock); 387 return completed; 388 } 389 390 /* ------------------------------------------------------------ */ 391 392 static void sc_kref_release(struct kref *kref) 393 { 394 struct o2net_sock_container *sc = container_of(kref, 395 struct o2net_sock_container, sc_kref); 396 BUG_ON(timer_pending(&sc->sc_idle_timeout)); 397 398 sclog(sc, "releasing\n"); 399 400 if (sc->sc_sock) { 401 sock_release(sc->sc_sock); 402 sc->sc_sock = NULL; 403 } 404 405 o2nm_undepend_item(&sc->sc_node->nd_item); 406 o2nm_node_put(sc->sc_node); 407 sc->sc_node = NULL; 408 409 o2net_debug_del_sc(sc); 410 411 if (sc->sc_page) 412 __free_page(sc->sc_page); 413 kfree(sc); 414 } 415 416 static void sc_put(struct o2net_sock_container *sc) 417 { 418 sclog(sc, "put\n"); 419 kref_put(&sc->sc_kref, sc_kref_release); 420 } 421 static void sc_get(struct o2net_sock_container *sc) 422 { 423 sclog(sc, "get\n"); 424 kref_get(&sc->sc_kref); 425 } 426 static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) 427 { 428 struct o2net_sock_container *sc, *ret = NULL; 429 struct page *page = NULL; 430 int status = 0; 431 432 page = alloc_page(GFP_NOFS); 433 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 if (sc == NULL || page == NULL) 435 goto out; 436 437 kref_init(&sc->sc_kref); 438 o2nm_node_get(node); 439 sc->sc_node = node; 440 441 /* pin the node item of the remote node */ 442 status = o2nm_depend_item(&node->nd_item); 443 if (status) { 444 mlog_errno(status); 445 o2nm_node_put(node); 446 goto out; 447 } 448 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); 452 453 init_timer(&sc->sc_idle_timeout); 454 sc->sc_idle_timeout.function = o2net_idle_timer; 455 sc->sc_idle_timeout.data = (unsigned long)sc; 456 457 sclog(sc, "alloced\n"); 458 459 ret = sc; 460 sc->sc_page = page; 461 o2net_debug_add_sc(sc); 462 sc = NULL; 463 page = NULL; 464 465 out: 466 if (page) 467 __free_page(page); 468 kfree(sc); 469 470 return ret; 471 } 472 473 /* ------------------------------------------------------------ */ 474 475 static void o2net_sc_queue_work(struct o2net_sock_container *sc, 476 struct work_struct *work) 477 { 478 sc_get(sc); 479 if (!queue_work(o2net_wq, work)) 480 sc_put(sc); 481 } 482 static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, 483 struct delayed_work *work, 484 int delay) 485 { 486 sc_get(sc); 487 if (!queue_delayed_work(o2net_wq, work, delay)) 488 sc_put(sc); 489 } 490 static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, 491 struct delayed_work *work) 492 { 493 if (cancel_delayed_work(work)) 494 sc_put(sc); 495 } 496 497 static atomic_t o2net_connected_peers = ATOMIC_INIT(0); 498 499 int o2net_num_connected_peers(void) 500 { 501 return atomic_read(&o2net_connected_peers); 502 } 503 504 static void o2net_set_nn_state(struct o2net_node *nn, 505 struct o2net_sock_container *sc, 506 unsigned valid, int err) 507 { 508 int was_valid = nn->nn_sc_valid; 509 int was_err = nn->nn_persistent_error; 510 struct o2net_sock_container *old_sc = nn->nn_sc; 511 512 assert_spin_locked(&nn->nn_lock); 513 514 if (old_sc && !sc) 515 atomic_dec(&o2net_connected_peers); 516 else if (!old_sc && sc) 517 atomic_inc(&o2net_connected_peers); 518 519 /* the node num comparison and single connect/accept path should stop 520 * an non-null sc from being overwritten with another */ 521 BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); 522 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); 523 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); 524 525 if (was_valid && !valid && err == 0) 526 err = -ENOTCONN; 527 528 mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", 529 o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, 530 nn->nn_persistent_error, err); 531 532 nn->nn_sc = sc; 533 nn->nn_sc_valid = valid ? 1 : 0; 534 nn->nn_persistent_error = err; 535 536 /* mirrors o2net_tx_can_proceed() */ 537 if (nn->nn_persistent_error || nn->nn_sc_valid) 538 wake_up(&nn->nn_sc_wq); 539 540 if (was_valid && !was_err && nn->nn_persistent_error) { 541 o2quo_conn_err(o2net_num_from_nn(nn)); 542 queue_delayed_work(o2net_wq, &nn->nn_still_up, 543 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); 544 } 545 546 if (was_valid && !valid) { 547 if (old_sc) 548 printk(KERN_NOTICE "o2net: No longer connected to " 549 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 550 o2net_complete_nodes_nsw(nn); 551 } 552 553 if (!was_valid && valid) { 554 o2quo_conn_up(o2net_num_from_nn(nn)); 555 cancel_delayed_work(&nn->nn_connect_expired); 556 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", 557 o2nm_this_node() > sc->sc_node->nd_num ? 558 "Connected to" : "Accepted connection from", 559 SC_NODEF_ARGS(sc)); 560 } 561 562 /* trigger the connecting worker func as long as we're not valid, 563 * it will back off if it shouldn't connect. This can be called 564 * from node config teardown and so needs to be careful about 565 * the work queue actually being up. */ 566 if (!valid && o2net_wq) { 567 unsigned long delay; 568 /* delay if we're within a RECONNECT_DELAY of the 569 * last attempt */ 570 delay = (nn->nn_last_connect_attempt + 571 msecs_to_jiffies(o2net_reconnect_delay())) 572 - jiffies; 573 if (delay > msecs_to_jiffies(o2net_reconnect_delay())) 574 delay = 0; 575 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); 576 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); 577 578 /* 579 * Delay the expired work after idle timeout. 580 * 581 * We might have lots of failed connection attempts that run 582 * through here but we only cancel the connect_expired work when 583 * a connection attempt succeeds. So only the first enqueue of 584 * the connect_expired work will do anything. The rest will see 585 * that it's already queued and do nothing. 586 */ 587 delay += msecs_to_jiffies(o2net_idle_timeout()); 588 queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); 589 } 590 591 /* keep track of the nn's sc ref for the caller */ 592 if ((old_sc == NULL) && sc) 593 sc_get(sc); 594 if (old_sc && (old_sc != sc)) { 595 o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); 596 sc_put(old_sc); 597 } 598 } 599 600 /* see o2net_register_callbacks() */ 601 static void o2net_data_ready(struct sock *sk) 602 { 603 void (*ready)(struct sock *sk); 604 struct o2net_sock_container *sc; 605 606 read_lock_bh(&sk->sk_callback_lock); 607 sc = sk->sk_user_data; 608 if (sc) { 609 sclog(sc, "data_ready hit\n"); 610 o2net_set_data_ready_time(sc); 611 o2net_sc_queue_work(sc, &sc->sc_rx_work); 612 ready = sc->sc_data_ready; 613 } else { 614 ready = sk->sk_data_ready; 615 } 616 read_unlock_bh(&sk->sk_callback_lock); 617 618 ready(sk); 619 } 620 621 /* see o2net_register_callbacks() */ 622 static void o2net_state_change(struct sock *sk) 623 { 624 void (*state_change)(struct sock *sk); 625 struct o2net_sock_container *sc; 626 627 read_lock_bh(&sk->sk_callback_lock); 628 sc = sk->sk_user_data; 629 if (sc == NULL) { 630 state_change = sk->sk_state_change; 631 goto out; 632 } 633 634 sclog(sc, "state_change to %d\n", sk->sk_state); 635 636 state_change = sc->sc_state_change; 637 638 switch(sk->sk_state) { 639 /* ignore connecting sockets as they make progress */ 640 case TCP_SYN_SENT: 641 case TCP_SYN_RECV: 642 break; 643 case TCP_ESTABLISHED: 644 o2net_sc_queue_work(sc, &sc->sc_connect_work); 645 break; 646 default: 647 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT 648 " shutdown, state %d\n", 649 SC_NODEF_ARGS(sc), sk->sk_state); 650 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 651 break; 652 } 653 out: 654 read_unlock_bh(&sk->sk_callback_lock); 655 state_change(sk); 656 } 657 658 /* 659 * we register callbacks so we can queue work on events before calling 660 * the original callbacks. our callbacks our careful to test user_data 661 * to discover when they've reaced with o2net_unregister_callbacks(). 662 */ 663 static void o2net_register_callbacks(struct sock *sk, 664 struct o2net_sock_container *sc) 665 { 666 write_lock_bh(&sk->sk_callback_lock); 667 668 /* accepted sockets inherit the old listen socket data ready */ 669 if (sk->sk_data_ready == o2net_listen_data_ready) { 670 sk->sk_data_ready = sk->sk_user_data; 671 sk->sk_user_data = NULL; 672 } 673 674 BUG_ON(sk->sk_user_data != NULL); 675 sk->sk_user_data = sc; 676 sc_get(sc); 677 678 sc->sc_data_ready = sk->sk_data_ready; 679 sc->sc_state_change = sk->sk_state_change; 680 sk->sk_data_ready = o2net_data_ready; 681 sk->sk_state_change = o2net_state_change; 682 683 mutex_init(&sc->sc_send_lock); 684 685 write_unlock_bh(&sk->sk_callback_lock); 686 } 687 688 static int o2net_unregister_callbacks(struct sock *sk, 689 struct o2net_sock_container *sc) 690 { 691 int ret = 0; 692 693 write_lock_bh(&sk->sk_callback_lock); 694 if (sk->sk_user_data == sc) { 695 ret = 1; 696 sk->sk_user_data = NULL; 697 sk->sk_data_ready = sc->sc_data_ready; 698 sk->sk_state_change = sc->sc_state_change; 699 } 700 write_unlock_bh(&sk->sk_callback_lock); 701 702 return ret; 703 } 704 705 /* 706 * this is a little helper that is called by callers who have seen a problem 707 * with an sc and want to detach it from the nn if someone already hasn't beat 708 * them to it. if an error is given then the shutdown will be persistent 709 * and pending transmits will be canceled. 710 */ 711 static void o2net_ensure_shutdown(struct o2net_node *nn, 712 struct o2net_sock_container *sc, 713 int err) 714 { 715 spin_lock(&nn->nn_lock); 716 if (nn->nn_sc == sc) 717 o2net_set_nn_state(nn, NULL, 0, err); 718 spin_unlock(&nn->nn_lock); 719 } 720 721 /* 722 * This work queue function performs the blocking parts of socket shutdown. A 723 * few paths lead here. set_nn_state will trigger this callback if it sees an 724 * sc detached from the nn. state_change will also trigger this callback 725 * directly when it sees errors. In that case we need to call set_nn_state 726 * ourselves as state_change couldn't get the nn_lock and call set_nn_state 727 * itself. 728 */ 729 static void o2net_shutdown_sc(struct work_struct *work) 730 { 731 struct o2net_sock_container *sc = 732 container_of(work, struct o2net_sock_container, 733 sc_shutdown_work); 734 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 735 736 sclog(sc, "shutting down\n"); 737 738 /* drop the callbacks ref and call shutdown only once */ 739 if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { 740 /* we shouldn't flush as we're in the thread, the 741 * races with pending sc work structs are harmless */ 742 del_timer_sync(&sc->sc_idle_timeout); 743 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 744 sc_put(sc); 745 kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); 746 } 747 748 /* not fatal so failed connects before the other guy has our 749 * heartbeat can be retried */ 750 o2net_ensure_shutdown(nn, sc, 0); 751 sc_put(sc); 752 } 753 754 /* ------------------------------------------------------------ */ 755 756 static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, 757 u32 key) 758 { 759 int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); 760 761 if (ret == 0) 762 ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); 763 764 return ret; 765 } 766 767 static struct o2net_msg_handler * 768 o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, 769 struct rb_node **ret_parent) 770 { 771 struct rb_node **p = &o2net_handler_tree.rb_node; 772 struct rb_node *parent = NULL; 773 struct o2net_msg_handler *nmh, *ret = NULL; 774 int cmp; 775 776 while (*p) { 777 parent = *p; 778 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); 779 cmp = o2net_handler_cmp(nmh, msg_type, key); 780 781 if (cmp < 0) 782 p = &(*p)->rb_left; 783 else if (cmp > 0) 784 p = &(*p)->rb_right; 785 else { 786 ret = nmh; 787 break; 788 } 789 } 790 791 if (ret_p != NULL) 792 *ret_p = p; 793 if (ret_parent != NULL) 794 *ret_parent = parent; 795 796 return ret; 797 } 798 799 static void o2net_handler_kref_release(struct kref *kref) 800 { 801 struct o2net_msg_handler *nmh; 802 nmh = container_of(kref, struct o2net_msg_handler, nh_kref); 803 804 kfree(nmh); 805 } 806 807 static void o2net_handler_put(struct o2net_msg_handler *nmh) 808 { 809 kref_put(&nmh->nh_kref, o2net_handler_kref_release); 810 } 811 812 /* max_len is protection for the handler func. incoming messages won't 813 * be given to the handler if their payload is longer than the max. */ 814 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, 815 o2net_msg_handler_func *func, void *data, 816 o2net_post_msg_handler_func *post_func, 817 struct list_head *unreg_list) 818 { 819 struct o2net_msg_handler *nmh = NULL; 820 struct rb_node **p, *parent; 821 int ret = 0; 822 823 if (max_len > O2NET_MAX_PAYLOAD_BYTES) { 824 mlog(0, "max_len for message handler out of range: %u\n", 825 max_len); 826 ret = -EINVAL; 827 goto out; 828 } 829 830 if (!msg_type) { 831 mlog(0, "no message type provided: %u, %p\n", msg_type, func); 832 ret = -EINVAL; 833 goto out; 834 835 } 836 if (!func) { 837 mlog(0, "no message handler provided: %u, %p\n", 838 msg_type, func); 839 ret = -EINVAL; 840 goto out; 841 } 842 843 nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS); 844 if (nmh == NULL) { 845 ret = -ENOMEM; 846 goto out; 847 } 848 849 nmh->nh_func = func; 850 nmh->nh_func_data = data; 851 nmh->nh_post_func = post_func; 852 nmh->nh_msg_type = msg_type; 853 nmh->nh_max_len = max_len; 854 nmh->nh_key = key; 855 /* the tree and list get this ref.. they're both removed in 856 * unregister when this ref is dropped */ 857 kref_init(&nmh->nh_kref); 858 INIT_LIST_HEAD(&nmh->nh_unregister_item); 859 860 write_lock(&o2net_handler_lock); 861 if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) 862 ret = -EEXIST; 863 else { 864 rb_link_node(&nmh->nh_node, parent, p); 865 rb_insert_color(&nmh->nh_node, &o2net_handler_tree); 866 list_add_tail(&nmh->nh_unregister_item, unreg_list); 867 868 mlog(ML_TCP, "registered handler func %p type %u key %08x\n", 869 func, msg_type, key); 870 /* we've had some trouble with handlers seemingly vanishing. */ 871 mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, 872 &parent) == NULL, 873 "couldn't find handler we *just* registered " 874 "for type %u key %08x\n", msg_type, key); 875 } 876 write_unlock(&o2net_handler_lock); 877 if (ret) 878 goto out; 879 880 out: 881 if (ret) 882 kfree(nmh); 883 884 return ret; 885 } 886 EXPORT_SYMBOL_GPL(o2net_register_handler); 887 888 void o2net_unregister_handler_list(struct list_head *list) 889 { 890 struct o2net_msg_handler *nmh, *n; 891 892 write_lock(&o2net_handler_lock); 893 list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { 894 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", 895 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); 896 rb_erase(&nmh->nh_node, &o2net_handler_tree); 897 list_del_init(&nmh->nh_unregister_item); 898 kref_put(&nmh->nh_kref, o2net_handler_kref_release); 899 } 900 write_unlock(&o2net_handler_lock); 901 } 902 EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); 903 904 static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) 905 { 906 struct o2net_msg_handler *nmh; 907 908 read_lock(&o2net_handler_lock); 909 nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); 910 if (nmh) 911 kref_get(&nmh->nh_kref); 912 read_unlock(&o2net_handler_lock); 913 914 return nmh; 915 } 916 917 /* ------------------------------------------------------------ */ 918 919 static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) 920 { 921 struct kvec vec = { .iov_len = len, .iov_base = data, }; 922 struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; 923 return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); 924 } 925 926 static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, 927 size_t veclen, size_t total) 928 { 929 int ret; 930 struct msghdr msg = {.msg_flags = 0,}; 931 932 if (sock == NULL) { 933 ret = -EINVAL; 934 goto out; 935 } 936 937 ret = kernel_sendmsg(sock, &msg, vec, veclen, total); 938 if (likely(ret == total)) 939 return 0; 940 mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); 941 if (ret >= 0) 942 ret = -EPIPE; /* should be smarter, I bet */ 943 out: 944 mlog(0, "returning error: %d\n", ret); 945 return ret; 946 } 947 948 static void o2net_sendpage(struct o2net_sock_container *sc, 949 void *kmalloced_virt, 950 size_t size) 951 { 952 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 953 ssize_t ret; 954 955 while (1) { 956 mutex_lock(&sc->sc_send_lock); 957 ret = sc->sc_sock->ops->sendpage(sc->sc_sock, 958 virt_to_page(kmalloced_virt), 959 (long)kmalloced_virt & ~PAGE_MASK, 960 size, MSG_DONTWAIT); 961 mutex_unlock(&sc->sc_send_lock); 962 if (ret == size) 963 break; 964 if (ret == (ssize_t)-EAGAIN) { 965 mlog(0, "sendpage of size %zu to " SC_NODEF_FMT 966 " returned EAGAIN\n", size, SC_NODEF_ARGS(sc)); 967 cond_resched(); 968 continue; 969 } 970 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 971 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); 972 o2net_ensure_shutdown(nn, sc, 0); 973 break; 974 } 975 } 976 977 static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key) 978 { 979 memset(msg, 0, sizeof(struct o2net_msg)); 980 msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); 981 msg->data_len = cpu_to_be16(data_len); 982 msg->msg_type = cpu_to_be16(msg_type); 983 msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); 984 msg->status = 0; 985 msg->key = cpu_to_be32(key); 986 } 987 988 static int o2net_tx_can_proceed(struct o2net_node *nn, 989 struct o2net_sock_container **sc_ret, 990 int *error) 991 { 992 int ret = 0; 993 994 spin_lock(&nn->nn_lock); 995 if (nn->nn_persistent_error) { 996 ret = 1; 997 *sc_ret = NULL; 998 *error = nn->nn_persistent_error; 999 } else if (nn->nn_sc_valid) { 1000 kref_get(&nn->nn_sc->sc_kref); 1001 1002 ret = 1; 1003 *sc_ret = nn->nn_sc; 1004 *error = 0; 1005 } 1006 spin_unlock(&nn->nn_lock); 1007 1008 return ret; 1009 } 1010 1011 /* Get a map of all nodes to which this node is currently connected to */ 1012 void o2net_fill_node_map(unsigned long *map, unsigned bytes) 1013 { 1014 struct o2net_sock_container *sc; 1015 int node, ret; 1016 1017 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); 1018 1019 memset(map, 0, bytes); 1020 for (node = 0; node < O2NM_MAX_NODES; ++node) { 1021 if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) 1022 continue; 1023 if (!ret) { 1024 set_bit(node, map); 1025 sc_put(sc); 1026 } 1027 } 1028 } 1029 EXPORT_SYMBOL_GPL(o2net_fill_node_map); 1030 1031 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 1032 size_t caller_veclen, u8 target_node, int *status) 1033 { 1034 int ret = 0; 1035 struct o2net_msg *msg = NULL; 1036 size_t veclen, caller_bytes = 0; 1037 struct kvec *vec = NULL; 1038 struct o2net_sock_container *sc = NULL; 1039 struct o2net_node *nn = o2net_nn_from_num(target_node); 1040 struct o2net_status_wait nsw = { 1041 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), 1042 }; 1043 struct o2net_send_tracking nst; 1044 1045 o2net_init_nst(&nst, msg_type, key, current, target_node); 1046 1047 if (o2net_wq == NULL) { 1048 mlog(0, "attempt to tx without o2netd running\n"); 1049 ret = -ESRCH; 1050 goto out; 1051 } 1052 1053 if (caller_veclen == 0) { 1054 mlog(0, "bad kvec array length\n"); 1055 ret = -EINVAL; 1056 goto out; 1057 } 1058 1059 caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); 1060 if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { 1061 mlog(0, "total payload len %zu too large\n", caller_bytes); 1062 ret = -EINVAL; 1063 goto out; 1064 } 1065 1066 if (target_node == o2nm_this_node()) { 1067 ret = -ELOOP; 1068 goto out; 1069 } 1070 1071 o2net_debug_add_nst(&nst); 1072 1073 o2net_set_nst_sock_time(&nst); 1074 1075 wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); 1076 if (ret) 1077 goto out; 1078 1079 o2net_set_nst_sock_container(&nst, sc); 1080 1081 veclen = caller_veclen + 1; 1082 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); 1083 if (vec == NULL) { 1084 mlog(0, "failed to %zu element kvec!\n", veclen); 1085 ret = -ENOMEM; 1086 goto out; 1087 } 1088 1089 msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); 1090 if (!msg) { 1091 mlog(0, "failed to allocate a o2net_msg!\n"); 1092 ret = -ENOMEM; 1093 goto out; 1094 } 1095 1096 o2net_init_msg(msg, caller_bytes, msg_type, key); 1097 1098 vec[0].iov_len = sizeof(struct o2net_msg); 1099 vec[0].iov_base = msg; 1100 memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); 1101 1102 ret = o2net_prep_nsw(nn, &nsw); 1103 if (ret) 1104 goto out; 1105 1106 msg->msg_num = cpu_to_be32(nsw.ns_id); 1107 o2net_set_nst_msg_id(&nst, nsw.ns_id); 1108 1109 o2net_set_nst_send_time(&nst); 1110 1111 /* finally, convert the message header to network byte-order 1112 * and send */ 1113 mutex_lock(&sc->sc_send_lock); 1114 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, 1115 sizeof(struct o2net_msg) + caller_bytes); 1116 mutex_unlock(&sc->sc_send_lock); 1117 msglog(msg, "sending returned %d\n", ret); 1118 if (ret < 0) { 1119 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); 1120 goto out; 1121 } 1122 1123 /* wait on other node's handler */ 1124 o2net_set_nst_status_time(&nst); 1125 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1126 1127 o2net_update_send_stats(&nst, sc); 1128 1129 /* Note that we avoid overwriting the callers status return 1130 * variable if a system error was reported on the other 1131 * side. Callers beware. */ 1132 ret = o2net_sys_err_to_errno(nsw.ns_sys_status); 1133 if (status && !ret) 1134 *status = nsw.ns_status; 1135 1136 mlog(0, "woken, returning system status %d, user status %d\n", 1137 ret, nsw.ns_status); 1138 out: 1139 o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ 1140 if (sc) 1141 sc_put(sc); 1142 kfree(vec); 1143 kfree(msg); 1144 o2net_complete_nsw(nn, &nsw, 0, 0, 0); 1145 return ret; 1146 } 1147 EXPORT_SYMBOL_GPL(o2net_send_message_vec); 1148 1149 int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, 1150 u8 target_node, int *status) 1151 { 1152 struct kvec vec = { 1153 .iov_base = data, 1154 .iov_len = len, 1155 }; 1156 return o2net_send_message_vec(msg_type, key, &vec, 1, 1157 target_node, status); 1158 } 1159 EXPORT_SYMBOL_GPL(o2net_send_message); 1160 1161 static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr, 1162 enum o2net_system_error syserr, int err) 1163 { 1164 struct kvec vec = { 1165 .iov_base = hdr, 1166 .iov_len = sizeof(struct o2net_msg), 1167 }; 1168 1169 BUG_ON(syserr >= O2NET_ERR_MAX); 1170 1171 /* leave other fields intact from the incoming message, msg_num 1172 * in particular */ 1173 hdr->sys_status = cpu_to_be32(syserr); 1174 hdr->status = cpu_to_be32(err); 1175 hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic 1176 hdr->data_len = 0; 1177 1178 msglog(hdr, "about to send status magic %d\n", err); 1179 /* hdr has been in host byteorder this whole time */ 1180 return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg)); 1181 } 1182 1183 /* this returns -errno if the header was unknown or too large, etc. 1184 * after this is called the buffer us reused for the next message */ 1185 static int o2net_process_message(struct o2net_sock_container *sc, 1186 struct o2net_msg *hdr) 1187 { 1188 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1189 int ret = 0, handler_status; 1190 enum o2net_system_error syserr; 1191 struct o2net_msg_handler *nmh = NULL; 1192 void *ret_data = NULL; 1193 1194 msglog(hdr, "processing message\n"); 1195 1196 o2net_sc_postpone_idle(sc); 1197 1198 switch(be16_to_cpu(hdr->magic)) { 1199 case O2NET_MSG_STATUS_MAGIC: 1200 /* special type for returning message status */ 1201 o2net_complete_nsw(nn, NULL, 1202 be32_to_cpu(hdr->msg_num), 1203 be32_to_cpu(hdr->sys_status), 1204 be32_to_cpu(hdr->status)); 1205 goto out; 1206 case O2NET_MSG_KEEP_REQ_MAGIC: 1207 o2net_sendpage(sc, o2net_keep_resp, 1208 sizeof(*o2net_keep_resp)); 1209 goto out; 1210 case O2NET_MSG_KEEP_RESP_MAGIC: 1211 goto out; 1212 case O2NET_MSG_MAGIC: 1213 break; 1214 default: 1215 msglog(hdr, "bad magic\n"); 1216 ret = -EINVAL; 1217 goto out; 1218 break; 1219 } 1220 1221 /* find a handler for it */ 1222 handler_status = 0; 1223 nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type), 1224 be32_to_cpu(hdr->key)); 1225 if (!nmh) { 1226 mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", 1227 be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); 1228 syserr = O2NET_ERR_NO_HNDLR; 1229 goto out_respond; 1230 } 1231 1232 syserr = O2NET_ERR_NONE; 1233 1234 if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) 1235 syserr = O2NET_ERR_OVERFLOW; 1236 1237 if (syserr != O2NET_ERR_NONE) 1238 goto out_respond; 1239 1240 o2net_set_func_start_time(sc); 1241 sc->sc_msg_key = be32_to_cpu(hdr->key); 1242 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1243 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1244 be16_to_cpu(hdr->data_len), 1245 nmh->nh_func_data, &ret_data); 1246 o2net_set_func_stop_time(sc); 1247 1248 o2net_update_recv_stats(sc); 1249 1250 out_respond: 1251 /* this destroys the hdr, so don't use it after this */ 1252 mutex_lock(&sc->sc_send_lock); 1253 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, 1254 handler_status); 1255 mutex_unlock(&sc->sc_send_lock); 1256 hdr = NULL; 1257 mlog(0, "sending handler status %d, syserr %d returned %d\n", 1258 handler_status, syserr, ret); 1259 1260 if (nmh) { 1261 BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); 1262 if (nmh->nh_post_func) 1263 (nmh->nh_post_func)(handler_status, nmh->nh_func_data, 1264 ret_data); 1265 } 1266 1267 out: 1268 if (nmh) 1269 o2net_handler_put(nmh); 1270 return ret; 1271 } 1272 1273 static int o2net_check_handshake(struct o2net_sock_container *sc) 1274 { 1275 struct o2net_handshake *hand = page_address(sc->sc_page); 1276 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1277 1278 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { 1279 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " 1280 "protocol version %llu but %llu is required. " 1281 "Disconnecting.\n", SC_NODEF_ARGS(sc), 1282 (unsigned long long)be64_to_cpu(hand->protocol_version), 1283 O2NET_PROTOCOL_VERSION); 1284 1285 /* don't bother reconnecting if its the wrong version. */ 1286 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1287 return -1; 1288 } 1289 1290 /* 1291 * Ensure timeouts are consistent with other nodes, otherwise 1292 * we can end up with one node thinking that the other must be down, 1293 * but isn't. This can ultimately cause corruption. 1294 */ 1295 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1296 o2net_idle_timeout()) { 1297 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " 1298 "idle timeout of %u ms, but we use %u ms locally. " 1299 "Disconnecting.\n", SC_NODEF_ARGS(sc), 1300 be32_to_cpu(hand->o2net_idle_timeout_ms), 1301 o2net_idle_timeout()); 1302 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1303 return -1; 1304 } 1305 1306 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1307 o2net_keepalive_delay()) { 1308 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " 1309 "delay of %u ms, but we use %u ms locally. " 1310 "Disconnecting.\n", SC_NODEF_ARGS(sc), 1311 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1312 o2net_keepalive_delay()); 1313 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1314 return -1; 1315 } 1316 1317 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1318 O2HB_MAX_WRITE_TIMEOUT_MS) { 1319 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " 1320 "timeout of %u ms, but we use %u ms locally. " 1321 "Disconnecting.\n", SC_NODEF_ARGS(sc), 1322 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1323 O2HB_MAX_WRITE_TIMEOUT_MS); 1324 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1325 return -1; 1326 } 1327 1328 sc->sc_handshake_ok = 1; 1329 1330 spin_lock(&nn->nn_lock); 1331 /* set valid and queue the idle timers only if it hasn't been 1332 * shut down already */ 1333 if (nn->nn_sc == sc) { 1334 o2net_sc_reset_idle_timer(sc); 1335 atomic_set(&nn->nn_timeout, 0); 1336 o2net_set_nn_state(nn, sc, 1, 0); 1337 } 1338 spin_unlock(&nn->nn_lock); 1339 1340 /* shift everything up as though it wasn't there */ 1341 sc->sc_page_off -= sizeof(struct o2net_handshake); 1342 if (sc->sc_page_off) 1343 memmove(hand, hand + 1, sc->sc_page_off); 1344 1345 return 0; 1346 } 1347 1348 /* this demuxes the queued rx bytes into header or payload bits and calls 1349 * handlers as each full message is read off the socket. it returns -error, 1350 * == 0 eof, or > 0 for progress made.*/ 1351 static int o2net_advance_rx(struct o2net_sock_container *sc) 1352 { 1353 struct o2net_msg *hdr; 1354 int ret = 0; 1355 void *data; 1356 size_t datalen; 1357 1358 sclog(sc, "receiving\n"); 1359 o2net_set_advance_start_time(sc); 1360 1361 if (unlikely(sc->sc_handshake_ok == 0)) { 1362 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1363 data = page_address(sc->sc_page) + sc->sc_page_off; 1364 datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; 1365 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1366 if (ret > 0) 1367 sc->sc_page_off += ret; 1368 } 1369 1370 if (sc->sc_page_off == sizeof(struct o2net_handshake)) { 1371 o2net_check_handshake(sc); 1372 if (unlikely(sc->sc_handshake_ok == 0)) 1373 ret = -EPROTO; 1374 } 1375 goto out; 1376 } 1377 1378 /* do we need more header? */ 1379 if (sc->sc_page_off < sizeof(struct o2net_msg)) { 1380 data = page_address(sc->sc_page) + sc->sc_page_off; 1381 datalen = sizeof(struct o2net_msg) - sc->sc_page_off; 1382 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1383 if (ret > 0) { 1384 sc->sc_page_off += ret; 1385 /* only swab incoming here.. we can 1386 * only get here once as we cross from 1387 * being under to over */ 1388 if (sc->sc_page_off == sizeof(struct o2net_msg)) { 1389 hdr = page_address(sc->sc_page); 1390 if (be16_to_cpu(hdr->data_len) > 1391 O2NET_MAX_PAYLOAD_BYTES) 1392 ret = -EOVERFLOW; 1393 } 1394 } 1395 if (ret <= 0) 1396 goto out; 1397 } 1398 1399 if (sc->sc_page_off < sizeof(struct o2net_msg)) { 1400 /* oof, still don't have a header */ 1401 goto out; 1402 } 1403 1404 /* this was swabbed above when we first read it */ 1405 hdr = page_address(sc->sc_page); 1406 1407 msglog(hdr, "at page_off %zu\n", sc->sc_page_off); 1408 1409 /* do we need more payload? */ 1410 if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) { 1411 /* need more payload */ 1412 data = page_address(sc->sc_page) + sc->sc_page_off; 1413 datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) - 1414 sc->sc_page_off; 1415 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1416 if (ret > 0) 1417 sc->sc_page_off += ret; 1418 if (ret <= 0) 1419 goto out; 1420 } 1421 1422 if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) { 1423 /* we can only get here once, the first time we read 1424 * the payload.. so set ret to progress if the handler 1425 * works out. after calling this the message is toast */ 1426 ret = o2net_process_message(sc, hdr); 1427 if (ret == 0) 1428 ret = 1; 1429 sc->sc_page_off = 0; 1430 } 1431 1432 out: 1433 sclog(sc, "ret = %d\n", ret); 1434 o2net_set_advance_stop_time(sc); 1435 return ret; 1436 } 1437 1438 /* this work func is triggerd by data ready. it reads until it can read no 1439 * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing 1440 * our work the work struct will be marked and we'll be called again. */ 1441 static void o2net_rx_until_empty(struct work_struct *work) 1442 { 1443 struct o2net_sock_container *sc = 1444 container_of(work, struct o2net_sock_container, sc_rx_work); 1445 int ret; 1446 1447 do { 1448 ret = o2net_advance_rx(sc); 1449 } while (ret > 0); 1450 1451 if (ret <= 0 && ret != -EAGAIN) { 1452 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1453 sclog(sc, "saw error %d, closing\n", ret); 1454 /* not permanent so read failed handshake can retry */ 1455 o2net_ensure_shutdown(nn, sc, 0); 1456 } 1457 1458 sc_put(sc); 1459 } 1460 1461 static int o2net_set_nodelay(struct socket *sock) 1462 { 1463 int ret, val = 1; 1464 mm_segment_t oldfs; 1465 1466 oldfs = get_fs(); 1467 set_fs(KERNEL_DS); 1468 1469 /* 1470 * Dear unsuspecting programmer, 1471 * 1472 * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level 1473 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will 1474 * silently turn into SO_DEBUG. 1475 * 1476 * Yours, 1477 * Keeper of hilariously fragile interfaces. 1478 */ 1479 ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, 1480 (char __user *)&val, sizeof(val)); 1481 1482 set_fs(oldfs); 1483 return ret; 1484 } 1485 1486 static int o2net_set_usertimeout(struct socket *sock) 1487 { 1488 int user_timeout = O2NET_TCP_USER_TIMEOUT; 1489 1490 return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, 1491 (char *)&user_timeout, sizeof(user_timeout)); 1492 } 1493 1494 static void o2net_initialize_handshake(void) 1495 { 1496 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1497 O2HB_MAX_WRITE_TIMEOUT_MS); 1498 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); 1499 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( 1500 o2net_keepalive_delay()); 1501 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( 1502 o2net_reconnect_delay()); 1503 } 1504 1505 /* ------------------------------------------------------------ */ 1506 1507 /* called when a connect completes and after a sock is accepted. the 1508 * rx path will see the response and mark the sc valid */ 1509 static void o2net_sc_connect_completed(struct work_struct *work) 1510 { 1511 struct o2net_sock_container *sc = 1512 container_of(work, struct o2net_sock_container, 1513 sc_connect_work); 1514 1515 mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", 1516 (unsigned long long)O2NET_PROTOCOL_VERSION, 1517 (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); 1518 1519 o2net_initialize_handshake(); 1520 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1521 sc_put(sc); 1522 } 1523 1524 /* this is called as a work_struct func. */ 1525 static void o2net_sc_send_keep_req(struct work_struct *work) 1526 { 1527 struct o2net_sock_container *sc = 1528 container_of(work, struct o2net_sock_container, 1529 sc_keepalive_work.work); 1530 1531 o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); 1532 sc_put(sc); 1533 } 1534 1535 /* socket shutdown does a del_timer_sync against this as it tears down. 1536 * we can't start this timer until we've got to the point in sc buildup 1537 * where shutdown is going to be involved */ 1538 static void o2net_idle_timer(unsigned long data) 1539 { 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1542 #ifdef CONFIG_DEBUG_FS 1543 unsigned long msecs = ktime_to_ms(ktime_get()) - 1544 ktime_to_ms(sc->sc_tv_timer); 1545 #else 1546 unsigned long msecs = o2net_idle_timeout(); 1547 #endif 1548 1549 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " 1550 "idle for %lu.%lu secs.\n", 1551 SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); 1552 1553 /* idle timerout happen, don't shutdown the connection, but 1554 * make fence decision. Maybe the connection can recover before 1555 * the decision is made. 1556 */ 1557 atomic_set(&nn->nn_timeout, 1); 1558 o2quo_conn_err(o2net_num_from_nn(nn)); 1559 queue_delayed_work(o2net_wq, &nn->nn_still_up, 1560 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); 1561 1562 o2net_sc_reset_idle_timer(sc); 1563 1564 } 1565 1566 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) 1567 { 1568 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1569 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1570 msecs_to_jiffies(o2net_keepalive_delay())); 1571 o2net_set_sock_timer(sc); 1572 mod_timer(&sc->sc_idle_timeout, 1573 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1574 } 1575 1576 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) 1577 { 1578 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1579 1580 /* clear fence decision since the connection recover from timeout*/ 1581 if (atomic_read(&nn->nn_timeout)) { 1582 o2quo_conn_up(o2net_num_from_nn(nn)); 1583 cancel_delayed_work(&nn->nn_still_up); 1584 atomic_set(&nn->nn_timeout, 0); 1585 } 1586 1587 /* Only push out an existing timer */ 1588 if (timer_pending(&sc->sc_idle_timeout)) 1589 o2net_sc_reset_idle_timer(sc); 1590 } 1591 1592 /* this work func is kicked whenever a path sets the nn state which doesn't 1593 * have valid set. This includes seeing hb come up, losing a connection, 1594 * having a connect attempt fail, etc. This centralizes the logic which decides 1595 * if a connect attempt should be made or if we should give up and all future 1596 * transmit attempts should fail */ 1597 static void o2net_start_connect(struct work_struct *work) 1598 { 1599 struct o2net_node *nn = 1600 container_of(work, struct o2net_node, nn_connect_work.work); 1601 struct o2net_sock_container *sc = NULL; 1602 struct o2nm_node *node = NULL, *mynode = NULL; 1603 struct socket *sock = NULL; 1604 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1605 int ret = 0, stop; 1606 unsigned int timeout; 1607 unsigned int noio_flag; 1608 1609 /* 1610 * sock_create allocates the sock with GFP_KERNEL. We must set 1611 * per-process flag PF_MEMALLOC_NOIO so that all allocations done 1612 * by this process are done as if GFP_NOIO was specified. So we 1613 * are not reentering filesystem while doing memory reclaim. 1614 */ 1615 noio_flag = memalloc_noio_save(); 1616 /* if we're greater we initiate tx, otherwise we accept */ 1617 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1618 goto out; 1619 1620 /* watch for racing with tearing a node down */ 1621 node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); 1622 if (node == NULL) 1623 goto out; 1624 1625 mynode = o2nm_get_node_by_num(o2nm_this_node()); 1626 if (mynode == NULL) 1627 goto out; 1628 1629 spin_lock(&nn->nn_lock); 1630 /* 1631 * see if we already have one pending or have given up. 1632 * For nn_timeout, it is set when we close the connection 1633 * because of the idle time out. So it means that we have 1634 * at least connected to that node successfully once, 1635 * now try to connect to it again. 1636 */ 1637 timeout = atomic_read(&nn->nn_timeout); 1638 stop = (nn->nn_sc || 1639 (nn->nn_persistent_error && 1640 (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); 1641 spin_unlock(&nn->nn_lock); 1642 if (stop) 1643 goto out; 1644 1645 nn->nn_last_connect_attempt = jiffies; 1646 1647 sc = sc_alloc(node); 1648 if (sc == NULL) { 1649 mlog(0, "couldn't allocate sc\n"); 1650 ret = -ENOMEM; 1651 goto out; 1652 } 1653 1654 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1655 if (ret < 0) { 1656 mlog(0, "can't create socket: %d\n", ret); 1657 goto out; 1658 } 1659 sc->sc_sock = sock; /* freed by sc_kref_release */ 1660 1661 sock->sk->sk_allocation = GFP_ATOMIC; 1662 1663 myaddr.sin_family = AF_INET; 1664 myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; 1665 myaddr.sin_port = htons(0); /* any port */ 1666 1667 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, 1668 sizeof(myaddr)); 1669 if (ret) { 1670 mlog(ML_ERROR, "bind failed with %d at address %pI4\n", 1671 ret, &mynode->nd_ipv4_address); 1672 goto out; 1673 } 1674 1675 ret = o2net_set_nodelay(sc->sc_sock); 1676 if (ret) { 1677 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); 1678 goto out; 1679 } 1680 1681 ret = o2net_set_usertimeout(sock); 1682 if (ret) { 1683 mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); 1684 goto out; 1685 } 1686 1687 o2net_register_callbacks(sc->sc_sock->sk, sc); 1688 1689 spin_lock(&nn->nn_lock); 1690 /* handshake completion will set nn->nn_sc_valid */ 1691 o2net_set_nn_state(nn, sc, 0, 0); 1692 spin_unlock(&nn->nn_lock); 1693 1694 remoteaddr.sin_family = AF_INET; 1695 remoteaddr.sin_addr.s_addr = node->nd_ipv4_address; 1696 remoteaddr.sin_port = node->nd_ipv4_port; 1697 1698 ret = sc->sc_sock->ops->connect(sc->sc_sock, 1699 (struct sockaddr *)&remoteaddr, 1700 sizeof(remoteaddr), 1701 O_NONBLOCK); 1702 if (ret == -EINPROGRESS) 1703 ret = 0; 1704 1705 out: 1706 if (ret && sc) { 1707 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT 1708 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); 1709 /* 0 err so that another will be queued and attempted 1710 * from set_nn_state */ 1711 o2net_ensure_shutdown(nn, sc, 0); 1712 } 1713 if (sc) 1714 sc_put(sc); 1715 if (node) 1716 o2nm_node_put(node); 1717 if (mynode) 1718 o2nm_node_put(mynode); 1719 1720 memalloc_noio_restore(noio_flag); 1721 return; 1722 } 1723 1724 static void o2net_connect_expired(struct work_struct *work) 1725 { 1726 struct o2net_node *nn = 1727 container_of(work, struct o2net_node, nn_connect_expired.work); 1728 1729 spin_lock(&nn->nn_lock); 1730 if (!nn->nn_sc_valid) { 1731 printk(KERN_NOTICE "o2net: No connection established with " 1732 "node %u after %u.%u seconds, check network and" 1733 " cluster configuration.\n", 1734 o2net_num_from_nn(nn), 1735 o2net_idle_timeout() / 1000, 1736 o2net_idle_timeout() % 1000); 1737 1738 o2net_set_nn_state(nn, NULL, 0, 0); 1739 } 1740 spin_unlock(&nn->nn_lock); 1741 } 1742 1743 static void o2net_still_up(struct work_struct *work) 1744 { 1745 struct o2net_node *nn = 1746 container_of(work, struct o2net_node, nn_still_up.work); 1747 1748 o2quo_hb_still_up(o2net_num_from_nn(nn)); 1749 } 1750 1751 /* ------------------------------------------------------------ */ 1752 1753 void o2net_disconnect_node(struct o2nm_node *node) 1754 { 1755 struct o2net_node *nn = o2net_nn_from_num(node->nd_num); 1756 1757 /* don't reconnect until it's heartbeating again */ 1758 spin_lock(&nn->nn_lock); 1759 atomic_set(&nn->nn_timeout, 0); 1760 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1761 spin_unlock(&nn->nn_lock); 1762 1763 if (o2net_wq) { 1764 cancel_delayed_work(&nn->nn_connect_expired); 1765 cancel_delayed_work(&nn->nn_connect_work); 1766 cancel_delayed_work(&nn->nn_still_up); 1767 flush_workqueue(o2net_wq); 1768 } 1769 } 1770 1771 static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, 1772 void *data) 1773 { 1774 o2quo_hb_down(node_num); 1775 1776 if (!node) 1777 return; 1778 1779 if (node_num != o2nm_this_node()) 1780 o2net_disconnect_node(node); 1781 1782 BUG_ON(atomic_read(&o2net_connected_peers) < 0); 1783 } 1784 1785 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, 1786 void *data) 1787 { 1788 struct o2net_node *nn = o2net_nn_from_num(node_num); 1789 1790 o2quo_hb_up(node_num); 1791 1792 BUG_ON(!node); 1793 1794 /* ensure an immediate connect attempt */ 1795 nn->nn_last_connect_attempt = jiffies - 1796 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1797 1798 if (node_num != o2nm_this_node()) { 1799 /* believe it or not, accept and node hearbeating testing 1800 * can succeed for this node before we got here.. so 1801 * only use set_nn_state to clear the persistent error 1802 * if that hasn't already happened */ 1803 spin_lock(&nn->nn_lock); 1804 atomic_set(&nn->nn_timeout, 0); 1805 if (nn->nn_persistent_error) 1806 o2net_set_nn_state(nn, NULL, 0, 0); 1807 spin_unlock(&nn->nn_lock); 1808 } 1809 } 1810 1811 void o2net_unregister_hb_callbacks(void) 1812 { 1813 o2hb_unregister_callback(NULL, &o2net_hb_up); 1814 o2hb_unregister_callback(NULL, &o2net_hb_down); 1815 } 1816 1817 int o2net_register_hb_callbacks(void) 1818 { 1819 int ret; 1820 1821 o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB, 1822 o2net_hb_node_down_cb, NULL, O2NET_HB_PRI); 1823 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, 1824 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); 1825 1826 ret = o2hb_register_callback(NULL, &o2net_hb_up); 1827 if (ret == 0) 1828 ret = o2hb_register_callback(NULL, &o2net_hb_down); 1829 1830 if (ret) 1831 o2net_unregister_hb_callbacks(); 1832 1833 return ret; 1834 } 1835 1836 /* ------------------------------------------------------------ */ 1837 1838 static int o2net_accept_one(struct socket *sock, int *more) 1839 { 1840 int ret, slen; 1841 struct sockaddr_in sin; 1842 struct socket *new_sock = NULL; 1843 struct o2nm_node *node = NULL; 1844 struct o2nm_node *local_node = NULL; 1845 struct o2net_sock_container *sc = NULL; 1846 struct o2net_node *nn; 1847 unsigned int noio_flag; 1848 1849 /* 1850 * sock_create_lite allocates the sock with GFP_KERNEL. We must set 1851 * per-process flag PF_MEMALLOC_NOIO so that all allocations done 1852 * by this process are done as if GFP_NOIO was specified. So we 1853 * are not reentering filesystem while doing memory reclaim. 1854 */ 1855 noio_flag = memalloc_noio_save(); 1856 1857 BUG_ON(sock == NULL); 1858 *more = 0; 1859 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, 1860 sock->sk->sk_protocol, &new_sock); 1861 if (ret) 1862 goto out; 1863 1864 new_sock->type = sock->type; 1865 new_sock->ops = sock->ops; 1866 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false); 1867 if (ret < 0) 1868 goto out; 1869 1870 *more = 1; 1871 new_sock->sk->sk_allocation = GFP_ATOMIC; 1872 1873 ret = o2net_set_nodelay(new_sock); 1874 if (ret) { 1875 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); 1876 goto out; 1877 } 1878 1879 ret = o2net_set_usertimeout(new_sock); 1880 if (ret) { 1881 mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); 1882 goto out; 1883 } 1884 1885 slen = sizeof(sin); 1886 ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1887 &slen, 1); 1888 if (ret < 0) 1889 goto out; 1890 1891 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1892 if (node == NULL) { 1893 printk(KERN_NOTICE "o2net: Attempt to connect from unknown " 1894 "node at %pI4:%d\n", &sin.sin_addr.s_addr, 1895 ntohs(sin.sin_port)); 1896 ret = -EINVAL; 1897 goto out; 1898 } 1899 1900 if (o2nm_this_node() >= node->nd_num) { 1901 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1902 if (local_node) 1903 printk(KERN_NOTICE "o2net: Unexpected connect attempt " 1904 "seen at node '%s' (%u, %pI4:%d) from " 1905 "node '%s' (%u, %pI4:%d)\n", 1906 local_node->nd_name, local_node->nd_num, 1907 &(local_node->nd_ipv4_address), 1908 ntohs(local_node->nd_ipv4_port), 1909 node->nd_name, 1910 node->nd_num, &sin.sin_addr.s_addr, 1911 ntohs(sin.sin_port)); 1912 ret = -EINVAL; 1913 goto out; 1914 } 1915 1916 /* this happens all the time when the other node sees our heartbeat 1917 * and tries to connect before we see their heartbeat */ 1918 if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { 1919 mlog(ML_CONN, "attempt to connect from node '%s' at " 1920 "%pI4:%d but it isn't heartbeating\n", 1921 node->nd_name, &sin.sin_addr.s_addr, 1922 ntohs(sin.sin_port)); 1923 ret = -EINVAL; 1924 goto out; 1925 } 1926 1927 nn = o2net_nn_from_num(node->nd_num); 1928 1929 spin_lock(&nn->nn_lock); 1930 if (nn->nn_sc) 1931 ret = -EBUSY; 1932 else 1933 ret = 0; 1934 spin_unlock(&nn->nn_lock); 1935 if (ret) { 1936 printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " 1937 "at %pI4:%d but it already has an open connection\n", 1938 node->nd_name, &sin.sin_addr.s_addr, 1939 ntohs(sin.sin_port)); 1940 goto out; 1941 } 1942 1943 sc = sc_alloc(node); 1944 if (sc == NULL) { 1945 ret = -ENOMEM; 1946 goto out; 1947 } 1948 1949 sc->sc_sock = new_sock; 1950 new_sock = NULL; 1951 1952 spin_lock(&nn->nn_lock); 1953 atomic_set(&nn->nn_timeout, 0); 1954 o2net_set_nn_state(nn, sc, 0, 0); 1955 spin_unlock(&nn->nn_lock); 1956 1957 o2net_register_callbacks(sc->sc_sock->sk, sc); 1958 o2net_sc_queue_work(sc, &sc->sc_rx_work); 1959 1960 o2net_initialize_handshake(); 1961 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1962 1963 out: 1964 if (new_sock) 1965 sock_release(new_sock); 1966 if (node) 1967 o2nm_node_put(node); 1968 if (local_node) 1969 o2nm_node_put(local_node); 1970 if (sc) 1971 sc_put(sc); 1972 1973 memalloc_noio_restore(noio_flag); 1974 return ret; 1975 } 1976 1977 /* 1978 * This function is invoked in response to one or more 1979 * pending accepts at softIRQ level. We must drain the 1980 * entire que before returning. 1981 */ 1982 1983 static void o2net_accept_many(struct work_struct *work) 1984 { 1985 struct socket *sock = o2net_listen_sock; 1986 int more; 1987 int err; 1988 1989 /* 1990 * It is critical to note that due to interrupt moderation 1991 * at the network driver level, we can't assume to get a 1992 * softIRQ for every single conn since tcp SYN packets 1993 * can arrive back-to-back, and therefore many pending 1994 * accepts may result in just 1 softIRQ. If we terminate 1995 * the o2net_accept_one() loop upon seeing an err, what happens 1996 * to the rest of the conns in the queue? If no new SYN 1997 * arrives for hours, no softIRQ will be delivered, 1998 * and the connections will just sit in the queue. 1999 */ 2000 2001 for (;;) { 2002 err = o2net_accept_one(sock, &more); 2003 if (!more) 2004 break; 2005 cond_resched(); 2006 } 2007 } 2008 2009 static void o2net_listen_data_ready(struct sock *sk) 2010 { 2011 void (*ready)(struct sock *sk); 2012 2013 read_lock_bh(&sk->sk_callback_lock); 2014 ready = sk->sk_user_data; 2015 if (ready == NULL) { /* check for teardown race */ 2016 ready = sk->sk_data_ready; 2017 goto out; 2018 } 2019 2020 /* This callback may called twice when a new connection 2021 * is being established as a child socket inherits everything 2022 * from a parent LISTEN socket, including the data_ready cb of 2023 * the parent. This leads to a hazard. In o2net_accept_one() 2024 * we are still initializing the child socket but have not 2025 * changed the inherited data_ready callback yet when 2026 * data starts arriving. 2027 * We avoid this hazard by checking the state. 2028 * For the listening socket, the state will be TCP_LISTEN; for the new 2029 * socket, will be TCP_ESTABLISHED. Also, in this case, 2030 * sk->sk_user_data is not a valid function pointer. 2031 */ 2032 2033 if (sk->sk_state == TCP_LISTEN) { 2034 queue_work(o2net_wq, &o2net_listen_work); 2035 } else { 2036 ready = NULL; 2037 } 2038 2039 out: 2040 read_unlock_bh(&sk->sk_callback_lock); 2041 if (ready != NULL) 2042 ready(sk); 2043 } 2044 2045 static int o2net_open_listening_sock(__be32 addr, __be16 port) 2046 { 2047 struct socket *sock = NULL; 2048 int ret; 2049 struct sockaddr_in sin = { 2050 .sin_family = PF_INET, 2051 .sin_addr = { .s_addr = addr }, 2052 .sin_port = port, 2053 }; 2054 2055 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 2056 if (ret < 0) { 2057 printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); 2058 goto out; 2059 } 2060 2061 sock->sk->sk_allocation = GFP_ATOMIC; 2062 2063 write_lock_bh(&sock->sk->sk_callback_lock); 2064 sock->sk->sk_user_data = sock->sk->sk_data_ready; 2065 sock->sk->sk_data_ready = o2net_listen_data_ready; 2066 write_unlock_bh(&sock->sk->sk_callback_lock); 2067 2068 o2net_listen_sock = sock; 2069 INIT_WORK(&o2net_listen_work, o2net_accept_many); 2070 2071 sock->sk->sk_reuse = SK_CAN_REUSE; 2072 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 2073 if (ret < 0) { 2074 printk(KERN_ERR "o2net: Error %d while binding socket at " 2075 "%pI4:%u\n", ret, &addr, ntohs(port)); 2076 goto out; 2077 } 2078 2079 ret = sock->ops->listen(sock, 64); 2080 if (ret < 0) 2081 printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", 2082 ret, &addr, ntohs(port)); 2083 2084 out: 2085 if (ret) { 2086 o2net_listen_sock = NULL; 2087 if (sock) 2088 sock_release(sock); 2089 } 2090 return ret; 2091 } 2092 2093 /* 2094 * called from node manager when we should bring up our network listening 2095 * socket. node manager handles all the serialization to only call this 2096 * once and to match it with o2net_stop_listening(). note, 2097 * o2nm_this_node() doesn't work yet as we're being called while it 2098 * is being set up. 2099 */ 2100 int o2net_start_listening(struct o2nm_node *node) 2101 { 2102 int ret = 0; 2103 2104 BUG_ON(o2net_wq != NULL); 2105 BUG_ON(o2net_listen_sock != NULL); 2106 2107 mlog(ML_KTHREAD, "starting o2net thread...\n"); 2108 o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM); 2109 if (o2net_wq == NULL) { 2110 mlog(ML_ERROR, "unable to launch o2net thread\n"); 2111 return -ENOMEM; /* ? */ 2112 } 2113 2114 ret = o2net_open_listening_sock(node->nd_ipv4_address, 2115 node->nd_ipv4_port); 2116 if (ret) { 2117 destroy_workqueue(o2net_wq); 2118 o2net_wq = NULL; 2119 } else 2120 o2quo_conn_up(node->nd_num); 2121 2122 return ret; 2123 } 2124 2125 /* again, o2nm_this_node() doesn't work here as we're involved in 2126 * tearing it down */ 2127 void o2net_stop_listening(struct o2nm_node *node) 2128 { 2129 struct socket *sock = o2net_listen_sock; 2130 size_t i; 2131 2132 BUG_ON(o2net_wq == NULL); 2133 BUG_ON(o2net_listen_sock == NULL); 2134 2135 /* stop the listening socket from generating work */ 2136 write_lock_bh(&sock->sk->sk_callback_lock); 2137 sock->sk->sk_data_ready = sock->sk->sk_user_data; 2138 sock->sk->sk_user_data = NULL; 2139 write_unlock_bh(&sock->sk->sk_callback_lock); 2140 2141 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { 2142 struct o2nm_node *node = o2nm_get_node_by_num(i); 2143 if (node) { 2144 o2net_disconnect_node(node); 2145 o2nm_node_put(node); 2146 } 2147 } 2148 2149 /* finish all work and tear down the work queue */ 2150 mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); 2151 destroy_workqueue(o2net_wq); 2152 o2net_wq = NULL; 2153 2154 sock_release(o2net_listen_sock); 2155 o2net_listen_sock = NULL; 2156 2157 o2quo_conn_err(node->nd_num); 2158 } 2159 2160 /* ------------------------------------------------------------ */ 2161 2162 int o2net_init(void) 2163 { 2164 unsigned long i; 2165 2166 o2quo_init(); 2167 2168 if (o2net_debugfs_init()) 2169 goto out; 2170 2171 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); 2172 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2173 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2174 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) 2175 goto out; 2176 2177 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); 2178 o2net_hand->connector_id = cpu_to_be64(1); 2179 2180 o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); 2181 o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); 2182 2183 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { 2184 struct o2net_node *nn = o2net_nn_from_num(i); 2185 2186 atomic_set(&nn->nn_timeout, 0); 2187 spin_lock_init(&nn->nn_lock); 2188 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); 2189 INIT_DELAYED_WORK(&nn->nn_connect_expired, 2190 o2net_connect_expired); 2191 INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up); 2192 /* until we see hb from a node we'll return einval */ 2193 nn->nn_persistent_error = -ENOTCONN; 2194 init_waitqueue_head(&nn->nn_sc_wq); 2195 idr_init(&nn->nn_status_idr); 2196 INIT_LIST_HEAD(&nn->nn_status_list); 2197 } 2198 2199 return 0; 2200 2201 out: 2202 kfree(o2net_hand); 2203 kfree(o2net_keep_req); 2204 kfree(o2net_keep_resp); 2205 o2net_debugfs_exit(); 2206 o2quo_exit(); 2207 return -ENOMEM; 2208 } 2209 2210 void o2net_exit(void) 2211 { 2212 o2quo_exit(); 2213 kfree(o2net_hand); 2214 kfree(o2net_keep_req); 2215 kfree(o2net_keep_resp); 2216 o2net_debugfs_exit(); 2217 } 2218