/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>

#include "vmci_transport_notify.h"

#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)

static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	bool retval;
	u64 notify_limit;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	/* When the sender blocks, we take that as a sign that the sender is
	 * faster than the receiver. To reduce the transmit rate of the sender,
	 * we delay the sending of the read notification by decreasing the
	 * write_notify_window. The notification is delayed until the number of
	 * bytes used in the queue drops below the write_notify_window.
	 */

	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
			PKT_FIELD(vsk, write_notify_window) =
				PKT_FIELD(vsk, write_notify_min_window);
		} else {
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
			if (PKT_FIELD(vsk, write_notify_window) <
			    PKT_FIELD(vsk, write_notify_min_window))
				PKT_FIELD(vsk, write_notify_window) =
					PKT_FIELD(vsk, write_notify_min_window);
		}
	}
	notify_limit = vmci_trans(vsk)->consume_size -
		PKT_FIELD(vsk, write_notify_window);
#else
	notify_limit = 0;
#endif

	/* For now we ignore the wait information and just see if the free
	 * space exceeds the notify limit. Note that improving this function
	 * to be more intelligent will not require a protocol change and will
	 * retain compatibility between endpoints with mixed versions of this
	 * function.
	 *
	 * The notify_limit is used to delay notifications in the case where
	 * flow control is enabled. Below, the test is expressed in terms of
	 * free space in the queue: if free_space > ConsumeSize -
	 * write_notify_window, then notify. An alternate way of expressing
	 * this is to rewrite the expression in terms of the data ready in the
	 * receive queue: if write_notify_window > bufferReady, then notify,
	 * since free_space == ConsumeSize - bufferReady.
	 */
	retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		notify_limit;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (retval) {
		/* Once we notify the peer, we reset the detected flag so the
		 * next wait will again cause a decrease in the window size.
		 */
		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	}
#endif
	return retval;
#else
	return true;
#endif
}
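
/* Worked example of the notify_limit test above (all numbers illustrative,
 * not taken from the driver): with consume_size = 262144 and a
 * write_notify_window shrunk to 16384, notify_limit = 262144 - 16384 =
 * 245760. A READ notification is then sent only once more than 245760 bytes
 * of the consume queue are free, i.e. once fewer than 16384 bytes remain
 * unread, which is exactly the "write_notify_window > bufferReady" form of
 * the same test.
 */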

static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	if (!PKT_FIELD(vsk, peer_waiting_read))
		return false;

	/* For now we ignore the wait information and just see if there is any
	 * data for our peer to read. Note that improving this function to be
	 * more intelligent will not require a protocol change and will retain
	 * compatibility between endpoints with mixed versions of this
	 * function.
	 */
	return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
#else
	return true;
#endif
}

static void
vmci_transport_handle_waiting_read(struct sock *sk,
				   struct vmci_transport_packet *pkt,
				   bool bottom_half,
				   struct sockaddr_vm *dst,
				   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_read) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

	if (vmci_transport_notify_waiting_read(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_wrote_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_wrote(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_read) = false;
	}
#endif
}

static void
vmci_transport_handle_waiting_write(struct sock *sk,
				    struct vmci_transport_packet *pkt,
				    bool bottom_half,
				    struct sockaddr_vm *dst,
				    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_write) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	if (vmci_transport_notify_waiting_write(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_read_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_read(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_write) = false;
	}
#endif
}

static void
vmci_transport_handle_read(struct sock *sk,
			   struct vmci_transport_packet *pkt,
			   bool bottom_half,
			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);
	PKT_FIELD(vsk, sent_waiting_write) = false;
#endif

	sk->sk_write_space(sk);
}
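
/* The helpers below emit the peer-waiting side of the handshake handled
 * above: a reader that runs dry sends WAITING_READ and is later woken by the
 * peer's WROTE; a writer that runs out of room sends WAITING_WRITE and is
 * later woken by the peer's READ.
 */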

static bool send_waiting_read(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_read))
		return true;

	if (PKT_FIELD(vsk, write_notify_window) <
	    vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_window) =
			min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
			    vmci_trans(vsk)->consume_size);

	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->consume_size - head;
	if (room_needed >= room_left) {
		waiting_info.offset = room_needed - room_left;
		waiting_info.generation =
			PKT_FIELD(vsk, consume_q_generation) + 1;
	} else {
		waiting_info.offset = head + room_needed;
		waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
	}

	ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_read) = true;

	return ret;
#else
	return true;
#endif
}

static bool send_waiting_write(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_write))
		return true;

	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->produce_size - tail;
	if (room_needed + 1 >= room_left) {
		/* Wraps around to current generation. */
		waiting_info.offset = room_needed + 1 - room_left;
		waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
	} else {
		waiting_info.offset = tail + room_needed + 1;
		waiting_info.generation =
			PKT_FIELD(vsk, produce_q_generation) - 1;
	}

	ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_write) = true;

	return ret;
#else
	return true;
#endif
}
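
/* Example of the wrap-around arithmetic in send_waiting_write() above
 * (illustrative numbers): with produce_size = 65536, tail = 65534 and
 * room_needed = 1, room_left = 2, so room_needed + 1 >= room_left and the
 * waiting offset wraps to 2 - 2 = 0 in the current generation. With
 * tail = 100 instead, the offset is 100 + 1 + 1 = 102 and the generation
 * recorded is the previous one.
 */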

static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk;
	bool sent_read;
	unsigned int retries;
	int err;

	vsk = vsock_sk(sk);
	sent_read = false;
	retries = 0;
	err = 0;

	if (vmci_transport_notify_waiting_write(vsk)) {
		/* Notify the peer that we have read, retrying the send on
		 * failure up to our maximum value. XXX For now we just log
		 * the failure, but later we should schedule a work item to
		 * handle the resend until it succeeds. That would require
		 * keeping track of work items in the vsk and cleaning them up
		 * upon socket close.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_read &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_read(sk);
			if (err >= 0)
				sent_read = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
			pr_err("%p unable to send read notify to peer\n", sk);
		else
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_write) = false;
#endif

	}
	return err;
}

static void
vmci_transport_handle_wrote(struct sock *sk,
			    struct vmci_transport_packet *pkt,
			    bool bottom_half,
			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);
	PKT_FIELD(vsk, sent_waiting_read) = false;
#endif
	sk->sk_data_ready(sk);
}

static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, peer_waiting_read) = false;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	PKT_FIELD(vsk, sent_waiting_read) = false;
	PKT_FIELD(vsk, sent_waiting_write) = false;
	PKT_FIELD(vsk, produce_q_generation) = 0;
	PKT_FIELD(vsk, consume_q_generation) = 0;

	memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
	memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
}

static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
}

static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
				  size_t target, bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk)) {
		*data_ready_now = true;
	} else {
		/* We can't read right now because there is nothing in the
		 * queue. Ask for notifications when there is something to
		 * read.
		 */
		if (sk->sk_state == SS_CONNECTED) {
			if (!send_waiting_read(sk, 1))
				return -1;

		}
		*data_ready_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 produce_q_free_space;
	struct vsock_sock *vsk = vsock_sk(sk);

	produce_q_free_space = vsock_stream_has_space(vsk);
	if (produce_q_free_space > 0) {
		*space_avail_now = true;
		return 0;
	} else if (produce_q_free_space == 0) {
		/* This is a connected socket but we can't currently send data.
		 * Notify the peer that we are waiting if the queue is full. We
		 * only send a waiting write if the queue is full because
		 * otherwise we end up in an infinite WAITING_WRITE, READ,
		 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
		 * notification as a socket error, passing that back through
		 * the mask.
		 */
		if (!send_waiting_write(sk, 1))
			return -1;

		*space_avail_now = false;
	}

	return 0;
}
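
/* Illustrative sequence tying poll_out() above to the earlier handlers: the
 * produce queue fills and poll_out() sends WAITING_WRITE; the peer drains
 * past the recorded (generation, offset) and answers with READ; our
 * vmci_transport_handle_read() then wakes writers via sk_write_space().
 */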

static int
vmci_transport_notify_pkt_recv_init(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	data->notify_on_block = false;

	if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
		PKT_FIELD(vsk, write_notify_min_window) = target + 1;
		if (PKT_FIELD(vsk, write_notify_window) <
		    PKT_FIELD(vsk, write_notify_min_window)) {
			/* If the current window is smaller than the new
			 * minimal window size, we need to reevaluate whether
			 * we need to notify the sender. If the number of ready
			 * bytes is smaller than the new window, we need to
			 * send a notification to the sender before we block.
			 */
			PKT_FIELD(vsk, write_notify_window) =
				PKT_FIELD(vsk, write_notify_min_window);
			data->notify_on_block = true;
		}
	}
#endif
#endif

	return 0;
}
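
/* Illustrative values for the clamping above: with the initial PAGE_SIZE
 * (4096) window and a recv target of 8192 bytes, write_notify_min_window and
 * write_notify_window both become 8193, and notify_on_block is set so that
 * recv_pre_block() below sends the sender a READ notification before we
 * sleep.
 */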

static int
vmci_transport_notify_pkt_recv_pre_block(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	int err = 0;

	/* Notify our peer that we are waiting for data to read. */
	if (!send_waiting_read(sk, target)) {
		err = -EHOSTUNREACH;
		return err;
	}
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (data->notify_on_block) {
		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

		data->notify_on_block = false;
	}
#endif

	return err;
}

static int
vmci_transport_notify_pkt_recv_pre_dequeue(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Now consume up to len bytes from the queue. Note that since we
	 * have the socket locked, we should copy at least the bytes we saw
	 * ready.
	 */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_post_dequeue(
			struct sock *sk,
			size_t target,
			ssize_t copied,
			bool data_read,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;
	int err;

	vsk = vsock_sk(sk);
	err = 0;

	if (data_read) {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
		/* Detect a wrap-around to maintain queue generation. Note
		 * that this is safe since we hold the socket lock across the
		 * two queue pair operations.
		 */
		if (copied >=
		    vmci_trans(vsk)->consume_size - data->consume_head)
			PKT_FIELD(vsk, consume_q_generation)++;
#endif

		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

	}
	return err;
}
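
/* Example of the generation bookkeeping in recv_post_dequeue() above
 * (illustrative numbers): with consume_size = 65536 and consume_head = 61440
 * at dequeue time, reading copied = 4096 bytes consumes exactly the bytes up
 * to the end of the ring, so the head wraps and consume_q_generation is
 * incremented. send_waiting_read() pairs offsets with this generation so the
 * peer can tell apart positions from successive trips around the ring.
 */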

static int
vmci_transport_notify_pkt_send_init(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_block(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
	/* Notify our peer that we are waiting for room to write. */
	if (!send_waiting_write(sk, 1))
		return -EHOSTUNREACH;

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_enqueue(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}
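
/* The produce_tail snapshot taken in send_pre_enqueue() above is what
 * send_post_enqueue() below compares against: a write of
 * written >= produce_size - produce_tail bytes must have wrapped the ring,
 * so produce_q_generation is incremented.
 */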

static int
vmci_transport_notify_pkt_send_post_enqueue(
			struct sock *sk,
			ssize_t written,
			struct vmci_transport_send_notify_data *data)
{
	int err = 0;
	struct vsock_sock *vsk;
	bool sent_wrote = false;
	int retries = 0;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* Detect a wrap-around to maintain queue generation. Note that this
	 * is safe since we hold the socket lock across the two queue pair
	 * operations.
	 */
	if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
		PKT_FIELD(vsk, produce_q_generation)++;

#endif

	if (vmci_transport_notify_waiting_read(vsk)) {
		/* Notify the peer that we have written, retrying the send on
		 * failure up to our maximum value. See the XXX comment in
		 * vmci_transport_send_read_notification() for potential
		 * improvements.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_wrote &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			if (err >= 0)
				sent_wrote = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			pr_err("%p unable to send wrote notify to peer\n", sk);
			return err;
		} else {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
		}
	}
	return err;
}

static void
vmci_transport_notify_pkt_handle_pkt(
			struct sock *sk,
			struct vmci_transport_packet *pkt,
			bool bottom_half,
			struct sockaddr_vm *dst,
			struct sockaddr_vm *src, bool *pkt_processed)
{
	bool processed = false;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
		vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
						    dst, src);
		processed = true;
		break;

	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
		vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
						   dst, src);
		processed = true;
		break;
	}

	if (pkt_processed)
		*pkt_processed = processed;
}

static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
	    PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
	    PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
	.socket_init = vmci_transport_notify_pkt_socket_init,
	.socket_destruct = vmci_transport_notify_pkt_socket_destruct,
	.poll_in = vmci_transport_notify_pkt_poll_in,
	.poll_out = vmci_transport_notify_pkt_poll_out,
	.handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
	.recv_init = vmci_transport_notify_pkt_recv_init,
	.recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
	.recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
	.recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
	.send_init = vmci_transport_notify_pkt_send_init,
	.send_pre_block = vmci_transport_notify_pkt_send_pre_block,
	.send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
	.send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
	.process_request = vmci_transport_notify_pkt_process_request,
	.process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};
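
/* How a table like this is consumed (sketch only; the actual selection logic
 * lives in vmci_transport.c and may differ in detail): the transport is
 * expected to pick the packet-based ops when the peer negotiates the original
 * notification protocol and then drive the socket through the hooks, e.g.:
 *
 *	vmci_trans(vsk)->notify_ops = &vmci_transport_notify_pkt_ops;
 *	vmci_trans(vsk)->notify_ops->socket_init(sk);
 */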