// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */

#include <linux/skmsg.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/tls.h>

/* True when the element at @elem_first_coalesce may be grown in place
 * instead of starting a new sg element. Both cases handle the ring
 * wrap-around of the msg->sg circular scatterlist.
 */
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
{
	if (msg->sg.end > msg->sg.start &&
	    elem_first_coalesce < msg->sg.end)
		return true;

	if (msg->sg.end < msg->sg.start &&
	    (elem_first_coalesce > msg->sg.start ||
	     elem_first_coalesce < msg->sg.end))
		return true;

	return false;
}

/* Grow @msg to @len bytes total using the socket's page frag allocator.
 * Coalesces with the last sg element when the new frag is contiguous with
 * it; on allocation/accounting failure the msg is trimmed back to its
 * original size and -ENOMEM is returned. Returns -ENOSPC when the sg ring
 * is full.
 */
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
		 int elem_first_coalesce)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	u32 osize = msg->sg.size;
	int ret = 0;

	len -= msg->sg.size;
	while (len > 0) {
		struct scatterlist *sge;
		u32 orig_offset;
		int use, i;

		if (!sk_page_frag_refill(sk, pfrag)) {
			ret = -ENOMEM;
			goto msg_trim;
		}

		orig_offset = pfrag->offset;
		use = min_t(int, len, pfrag->size - orig_offset);
		if (!sk_wmem_schedule(sk, use)) {
			ret = -ENOMEM;
			goto msg_trim;
		}

		i = msg->sg.end;
		sk_msg_iter_var_prev(i);
		sge = &msg->sg.data[i];

		if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
		    sg_page(sge) == pfrag->page &&
		    sge->offset + sge->length == orig_offset) {
			sge->length += use;
		} else {
			if (sk_msg_full(msg)) {
				ret = -ENOSPC;
				break;
			}

			sge = &msg->sg.data[msg->sg.end];
			sg_unmark_end(sge);
			sg_set_page(sge, pfrag->page, use, orig_offset);
			get_page(pfrag->page);
			sk_msg_iter_next(msg, end);
		}

		sk_mem_charge(sk, use);
		msg->sg.size += use;
		pfrag->offset += use;
		len -= use;
	}

	return ret;

msg_trim:
	sk_msg_trim(sk, msg, osize);
	return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_alloc);

/* Share @len bytes starting at @off from @src into @dst without copying
 * data: page references are added (via sk_msg_page_add) or the last @dst
 * element is extended when the source range is virtually contiguous with
 * it. Memory is charged to @sk for each chunk. Returns -ENOSPC when @src
 * runs out before @off/@len are satisfied or @dst is full.
 */
int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
		 u32 off, u32 len)
{
	int i = src->sg.start;
	struct scatterlist *sge = sk_msg_elem(src, i);
	struct scatterlist *sgd = NULL;
	u32 sge_len, sge_off;

	/* Skip whole elements that lie before @off. */
	while (off) {
		if (sge->length > off)
			break;
		off -= sge->length;
		sk_msg_iter_var_next(i);
		if (i == src->sg.end && off)
			return -ENOSPC;
		sge = sk_msg_elem(src, i);
	}

	while (len) {
		sge_len = sge->length - off;
		if (sge_len > len)
			sge_len = len;

		if (dst->sg.end)
			sgd = sk_msg_elem(dst, dst->sg.end - 1);

		if (sgd &&
		    (sg_page(sge) == sg_page(sgd)) &&
		    (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) {
			/* Contiguous with the last dst element: coalesce. */
			sgd->length += sge_len;
			dst->sg.size += sge_len;
		} else if (!sk_msg_full(dst)) {
			sge_off = sge->offset + off;
			sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
		} else {
			return -ENOSPC;
		}

		off = 0;
		len -= sge_len;
		sk_mem_charge(sk, sge_len);
		sk_msg_iter_var_next(i);
		if (i == src->sg.end && len)
			return -ENOSPC;
		sge = sk_msg_elem(src, i);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_msg_clone);

/* Uncharge @bytes from @sk and consume them from the front of @msg,
 * zeroing fully-consumed elements and advancing msg->sg.start. Unlike
 * sk_msg_return(), partially-consumed elements have their offset/length
 * adjusted so the remaining data stays valid.
 */
void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
{
	int i = msg->sg.start;

	do {
		struct scatterlist *sge = sk_msg_elem(msg, i);

		if (bytes < sge->length) {
			sge->length -= bytes;
			sge->offset += bytes;
			sk_mem_uncharge(sk, bytes);
			break;
		}

		sk_mem_uncharge(sk, sge->length);
		bytes -= sge->length;
		sge->length = 0;
		sge->offset = 0;
		sk_msg_iter_var_next(i);
	} while (bytes && i != msg->sg.end);
	msg->sg.start = i;
}
EXPORT_SYMBOL_GPL(sk_msg_return_zero);

/* Uncharge up to @bytes of @msg from @sk's memory accounting without
 * modifying the scatterlist itself (elements are left intact).
 */
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
{
	int i = msg->sg.start;

	do {
		struct scatterlist *sge = &msg->sg.data[i];
		int uncharge = (bytes < sge->length) ? bytes : sge->length;

		sk_mem_uncharge(sk, uncharge);
		bytes -= uncharge;
		sk_msg_iter_var_next(i);
	} while (i != msg->sg.end);
}
EXPORT_SYMBOL_GPL(sk_msg_return);

/* Release element @i of @msg: uncharge (if @charge) and drop the page
 * reference, then clear the element. Returns the element's length.
 */
static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
			    bool charge)
{
	struct scatterlist *sge = sk_msg_elem(msg, i);
	u32 len = sge->length;

	/* When the skb owns the memory we free it from consume_skb path. */
	if (!msg->skb) {
		if (charge)
			sk_mem_uncharge(sk, len);
		put_page(sg_page(sge));
	}
	memset(sge, 0, sizeof(*sge));
	return len;
}

/* Free every element of @msg starting at @i, consume the backing skb if
 * any, and reinitialize @msg. Returns the total bytes freed.
 */
static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
			 bool charge)
{
	struct scatterlist *sge = sk_msg_elem(msg, i);
	int freed = 0;

	while (msg->sg.size) {
		msg->sg.size -= sge->length;
		freed += sk_msg_free_elem(sk, msg, i, charge);
		sk_msg_iter_var_next(i);
		sk_msg_check_to_free(msg, i, msg->sg.size);
		sge = sk_msg_elem(msg, i);
	}
	consume_skb(msg->skb);
	sk_msg_init(msg);
	return freed;
}

int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
{
	return __sk_msg_free(sk, msg, msg->sg.start, false);
}
EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);

int sk_msg_free(struct sock *sk, struct sk_msg *msg)
{
	return __sk_msg_free(sk, msg, msg->sg.start, true);
}
EXPORT_SYMBOL_GPL(sk_msg_free);

/* Free @bytes from the front of @msg; the last touched element may be
 * shrunk in place rather than freed. Stops early at a zero-length element.
 */
static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
				  u32 bytes, bool charge)
{
	struct scatterlist *sge;
	u32 i = msg->sg.start;

	while (bytes) {
		sge = sk_msg_elem(msg, i);
		if (!sge->length)
			break;
		if (bytes < sge->length) {
			if (charge)
				sk_mem_uncharge(sk, bytes);
			sge->length -= bytes;
			sge->offset += bytes;
			msg->sg.size -= bytes;
			break;
		}

		msg->sg.size -= sge->length;
		bytes -= sge->length;
		sk_msg_free_elem(sk, msg, i, charge);
		sk_msg_iter_var_next(i);
		sk_msg_check_to_free(msg, i, bytes);
	}
	msg->sg.start = i;
}

void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
{
	__sk_msg_free_partial(sk, msg, bytes, true);
}
EXPORT_SYMBOL_GPL(sk_msg_free_partial);

void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
				  u32 bytes)
{
	__sk_msg_free_partial(sk, msg, bytes, false);
}

/* Shrink @msg from the tail down to @len bytes total, freeing whole
 * elements and trimming the last remaining one, then fix up end/curr/
 * copybreak so future copy operations start at a valid location.
 */
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
{
	int trim = msg->sg.size - len;
	u32 i = msg->sg.end;

	if (trim <= 0) {
		WARN_ON(trim < 0);
		return;
	}

	sk_msg_iter_var_prev(i);
	msg->sg.size = len;
	while (msg->sg.data[i].length &&
	       trim >= msg->sg.data[i].length) {
		trim -= msg->sg.data[i].length;
		sk_msg_free_elem(sk, msg, i, true);
		sk_msg_iter_var_prev(i);
		if (!trim)
			goto out;
	}

	msg->sg.data[i].length -= trim;
	sk_mem_uncharge(sk, trim);
	/* Adjust copybreak if it falls into the trimmed part of last buf */
	if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
		msg->sg.copybreak = msg->sg.data[i].length;
out:
	sk_msg_iter_var_next(i);
	msg->sg.end = i;

	/* If we trim data a full sg elem before curr pointer update
	 * copybreak and current so that any future copy operations
	 * start at new copy location.
	 * However trimmed data that has not yet been used in a copy op
	 * does not require an update.
	 */
	if (!msg->sg.size) {
		msg->sg.curr = msg->sg.start;
		msg->sg.copybreak = 0;
	} else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
		   sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
		sk_msg_iter_var_prev(i);
		msg->sg.curr = i;
		msg->sg.copybreak = msg->sg.data[i].length;
	}
}
EXPORT_SYMBOL_GPL(sk_msg_trim);

/* Pin user pages from @from directly into @msg (zerocopy) until @bytes
 * are mapped or the sg ring fills. On error the iov_iter is reverted by
 * however much this call advanced it; the caller is expected to trim the
 * msg if it also needs the sg entries undone.
 */
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
			      struct sk_msg *msg, u32 bytes)
{
	int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
	const int to_max_pages = MAX_MSG_FRAGS;
	struct page *pages[MAX_MSG_FRAGS];
	ssize_t orig, copied, use, offset;

	orig = msg->sg.size;
	while (bytes > 0) {
		i = 0;
		maxpages = to_max_pages - num_elems;
		if (maxpages == 0) {
			ret = -EFAULT;
			goto out;
		}

		copied = iov_iter_get_pages(from, pages, bytes, maxpages,
					    &offset);
		if (copied <= 0) {
			ret = -EFAULT;
			goto out;
		}

		iov_iter_advance(from, copied);
		bytes -= copied;
		msg->sg.size += copied;

		while (copied) {
			use = min_t(int, copied, PAGE_SIZE - offset);
			sg_set_page(&msg->sg.data[msg->sg.end],
				    pages[i], use, offset);
			sg_unmark_end(&msg->sg.data[msg->sg.end]);
			sk_mem_charge(sk, use);

			offset = 0;
			copied -= use;
			sk_msg_iter_next(msg, end);
			num_elems++;
			i++;
		}
		/* When zerocopy is mixed with sk_msg_*copy* operations we
		 * may have a copybreak set in this case clear and prefer
		 * zerocopy remainder when possible.
		 */
		msg->sg.copybreak = 0;
		msg->sg.curr = msg->sg.end;
	}
out:
	/* Revert iov_iter updates, msg will need to use 'trim' later if it
	 * also needs to be cleared.
	 */
	if (ret)
		iov_iter_revert(from, msg->sg.size - orig);
	return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);

/* Copy @bytes from @from into the already-allocated elements of @msg,
 * resuming at sg.curr/copybreak. Returns -EFAULT on a short copy, and the
 * last copy_from_iter* result is left in @ret otherwise (-ENOSPC when no
 * element could accept data). sg.curr is updated to the resume point.
 */
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
			     struct sk_msg *msg, u32 bytes)
{
	int ret = -ENOSPC, i = msg->sg.curr;
	struct scatterlist *sge;
	u32 copy, buf_size;
	void *to;

	do {
		sge = sk_msg_elem(msg, i);
		/* This is possible if a trim operation shrunk the buffer */
		if (msg->sg.copybreak >= sge->length) {
			msg->sg.copybreak = 0;
			sk_msg_iter_var_next(i);
			if (i == msg->sg.end)
				break;
			sge = sk_msg_elem(msg, i);
		}

		buf_size = sge->length - msg->sg.copybreak;
		copy = (buf_size > bytes) ? bytes : buf_size;
		to = sg_virt(sge) + msg->sg.copybreak;
		msg->sg.copybreak += copy;
		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
			ret = copy_from_iter_nocache(to, copy, from);
		else
			ret = copy_from_iter(to, copy, from);
		if (ret != copy) {
			ret = -EFAULT;
			goto out;
		}
		bytes -= copy;
		if (!bytes)
			break;
		msg->sg.copybreak = 0;
		sk_msg_iter_var_next(i);
	} while (i != msg->sg.end);
out:
	msg->sg.curr = i;
	return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);

/* Receive sk_msg from psock->ingress_msg to @msg. */
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
		   int len, int flags)
{
	struct iov_iter *iter = &msg->msg_iter;
	int peek = flags & MSG_PEEK;
	struct sk_msg *msg_rx;
	int i, copied = 0;

	msg_rx = sk_psock_peek_msg(psock);
	while (copied != len) {
		struct scatterlist *sge;

		if (unlikely(!msg_rx))
			break;

		i = msg_rx->sg.start;
		do {
			struct page *page;
			int copy;

			sge = sk_msg_elem(msg_rx, i);
			copy = sge->length;
			page = sg_page(sge);
			if (copied + copy > len)
				copy = len - copied;
			copy = copy_page_to_iter(page, sge->offset, copy, iter);
			if (!copy)
				return copied ? copied : -EFAULT;

			copied += copy;
			if (likely(!peek)) {
				sge->offset += copy;
				sge->length -= copy;
				/* Memory is uncharged here only when the msg
				 * doesn't own an skb; skb-backed msgs release
				 * via consume_skb.
				 */
				if (!msg_rx->skb)
					sk_mem_uncharge(sk, copy);
				msg_rx->sg.size -= copy;

				if (!sge->length) {
					sk_msg_iter_var_next(i);
					if (!msg_rx->skb)
						put_page(page);
				}
			} else {
				/* Lets not optimize peek case if copy_page_to_iter
				 * didn't copy the entire length lets just break.
				 */
				if (copy != sge->length)
					return copied;
				sk_msg_iter_var_next(i);
			}

			if (copied == len)
				break;
		} while (i != msg_rx->sg.end);

		if (unlikely(peek)) {
			msg_rx = sk_psock_next_msg(psock, msg_rx);
			if (!msg_rx)
				break;
			continue;
		}

		msg_rx->sg.start = i;
		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
			msg_rx = sk_psock_dequeue_msg(psock);
			kfree_sk_msg(msg_rx);
		}
		msg_rx = sk_psock_peek_msg(psock);
	}

	return copied;
}
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);

/* True when the socket's psock has queued ingress messages. */
bool sk_msg_is_readable(struct sock *sk)
{
	struct sk_psock *psock;
	bool empty = true;

	rcu_read_lock();
	psock = sk_psock(sk);
	if (likely(psock))
		empty = list_empty(&psock->ingress_msg);
	rcu_read_unlock();
	return !empty;
}
EXPORT_SYMBOL_GPL(sk_msg_is_readable);

/* Allocate an initialized sk_msg for queueing @skb on @sk's ingress,
 * returning NULL when the receive buffer or memory accounting would be
 * exceeded (caller treats NULL as a transient failure).
 */
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
						  struct sk_buff *skb)
{
	struct sk_msg *msg;

	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		return NULL;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return NULL;

	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
	if (unlikely(!msg))
		return NULL;

	sk_msg_init(msg);
	return msg;
}

/* Map @len bytes of @skb starting at @off into @msg's scatterlist, queue
 * the msg on @psock's ingress list, and wake the socket. Returns bytes
 * queued, -EAGAIN when linearize fails under memory pressure, or the
 * skb_to_sgvec error.
 */
static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
					u32 off, u32 len,
					struct sk_psock *psock,
					struct sock *sk,
					struct sk_msg *msg)
{
	int num_sge, copied;

	num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
	if (num_sge < 0) {
		/* skb linearize may fail with ENOMEM, but lets simply try again
		 * later if this happens. Under memory pressure we don't want to
		 * drop the skb. We need to linearize the skb so that the mapping
		 * in skb_to_sgvec can not error.
		 */
		if (skb_linearize(skb))
			return -EAGAIN;

		num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
		if (unlikely(num_sge < 0))
			return num_sge;
	}

	copied = len;
	msg->sg.start = 0;
	msg->sg.size = copied;
	msg->sg.end = num_sge;
	msg->skb = skb;

	sk_psock_queue_msg(psock, msg);
	sk_psock_data_ready(sk, psock);
	return copied;
}

static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
				     u32 off, u32 len);

/* Redirect @skb onto @psock's ingress queue, transferring memory
 * ownership to psock->sk. Falls through to the _self variant when the skb
 * already belongs to this socket.
 */
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
				u32 off, u32 len)
{
	struct sock *sk = psock->sk;
	struct sk_msg *msg;
	int err;

	/* If we are receiving on the same sock skb->sk is already assigned,
	 * skip memory accounting and owner transition seeing it already set
	 * correctly.
	 */
	if (unlikely(skb->sk == sk))
		return sk_psock_skb_ingress_self(psock, skb, off, len);
	msg = sk_psock_create_ingress_msg(sk, skb);
	if (!msg)
		return -EAGAIN;

	/* This will transition ownership of the data from the socket where
	 * the BPF program was run initiating the redirect to the socket
	 * we will eventually receive this data on. The data will be released
	 * from skb_consume found in __tcp_bpf_recvmsg() after its been copied
	 * into user buffers.
	 */
	skb_set_owner_r(skb, sk);
	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
	if (err < 0)
		kfree(msg);
	return err;
}

/* Puts an skb on the ingress queue of the socket already assigned to the
 * skb. In this case we do not need to check memory limits or skb_set_owner_r
 * because the skb is already accounted for here.
 */
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
				     u32 off, u32 len)
{
	struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
	struct sock *sk = psock->sk;
	int err;

	if (unlikely(!msg))
		return -EAGAIN;
	sk_msg_init(msg);
	skb_set_owner_r(skb, sk);
	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
	if (err < 0)
		kfree(msg);
	return err;
}

/* Dispatch a verdict: queue on our own ingress, or transmit the skb out
 * of psock->sk (egress), returning -EAGAIN when the socket is not
 * currently writeable.
 */
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
			       u32 off, u32 len, bool ingress)
{
	if (!ingress) {
		if (!sock_writeable(psock->sk))
			return -EAGAIN;
		return skb_send_sock(psock->sk, skb, off, len);
	}
	return sk_psock_skb_ingress(psock, skb, off, len);
}

/* Save partial-progress state (@skb/@len/@off) for the backlog worker to
 * resume later; if TX has been disabled meanwhile, drop the skb instead.
 * ingress_lock serializes against teardown.
 */
static void sk_psock_skb_state(struct sk_psock *psock,
			       struct sk_psock_work_state *state,
			       struct sk_buff *skb,
			       int len, int off)
{
	spin_lock_bh(&psock->ingress_lock);
	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
		state->skb = skb;
		state->len = len;
		state->off = off;
	} else {
		sock_drop(psock->sk, skb);
	}
	spin_unlock_bh(&psock->ingress_lock);
}

/* Workqueue handler draining psock->ingress_skb: resumes any previously
 * saved partial skb first, then processes queued skbs, retrying on
 * -EAGAIN (state saved, work exits) and reporting hard errors.
 */
static void sk_psock_backlog(struct work_struct *work)
{
	struct sk_psock *psock = container_of(work, struct sk_psock, work);
	struct sk_psock_work_state *state = &psock->work_state;
	struct sk_buff *skb = NULL;
	bool ingress;
	u32 len, off;
	int ret;

	mutex_lock(&psock->work_mutex);
	if (unlikely(state->skb)) {
		spin_lock_bh(&psock->ingress_lock);
		skb = state->skb;
		len = state->len;
		off = state->off;
		state->skb = NULL;
		spin_unlock_bh(&psock->ingress_lock);
	}
	if (skb)
		goto start;

	while ((skb = skb_dequeue(&psock->ingress_skb))) {
		len = skb->len;
		off = 0;
		if (skb_bpf_strparser(skb)) {
			/* strparser messages carry their own offset/length. */
			struct strp_msg *stm = strp_msg(skb);

			off = stm->offset;
			len = stm->full_len;
		}
start:
		ingress = skb_bpf_ingress(skb);
		skb_bpf_redirect_clear(skb);
		do {
			ret = -EIO;
			if (!sock_flag(psock->sk, SOCK_DEAD))
				ret = sk_psock_handle_skb(psock, skb, off,
							  len, ingress);
			if (ret <= 0) {
				if (ret == -EAGAIN) {
					sk_psock_skb_state(psock, state, skb,
							   len, off);
					goto end;
				}
				/* Hard errors break pipe and stop xmit. */
				sk_psock_report_error(psock, ret ? -ret : EPIPE);
				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
				sock_drop(psock->sk, skb);
				goto end;
			}
			off += ret;
			len -= ret;
		} while (len);

		/* Ingress skbs are now owned by the queued msg; egress skbs
		 * were fully sent and can be freed here.
		 */
		if (!ingress)
			kfree_skb(skb);
	}
end:
	mutex_unlock(&psock->work_mutex);
}

/* Allocate and attach a psock to @sk, saving the original proto callbacks
 * so they can be restored on teardown. Returns ERR_PTR(-EBUSY) when
 * sk_user_data is already in use, ERR_PTR(-ENOMEM) on allocation failure.
 */
struct sk_psock *sk_psock_init(struct sock *sk, int node)
{
	struct sk_psock *psock;
	struct proto *prot;

	write_lock_bh(&sk->sk_callback_lock);

	if (sk->sk_user_data) {
		psock = ERR_PTR(-EBUSY);
		goto out;
	}

	psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
	if (!psock) {
		psock = ERR_PTR(-ENOMEM);
		goto out;
	}

	prot = READ_ONCE(sk->sk_prot);
	psock->sk = sk;
	psock->eval = __SK_NONE;
	psock->sk_proto = prot;
	psock->saved_unhash = prot->unhash;
	psock->saved_destroy = prot->destroy;
	psock->saved_close = prot->close;
	psock->saved_write_space = sk->sk_write_space;

	INIT_LIST_HEAD(&psock->link);
	spin_lock_init(&psock->link_lock);

	INIT_WORK(&psock->work, sk_psock_backlog);
	mutex_init(&psock->work_mutex);
	INIT_LIST_HEAD(&psock->ingress_msg);
	spin_lock_init(&psock->ingress_lock);
	skb_queue_head_init(&psock->ingress_skb);

	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
	refcount_set(&psock->refcnt, 1);

	rcu_assign_sk_user_data_nocopy(sk, psock);
	sock_hold(sk);

out:
	write_unlock_bh(&sk->sk_callback_lock);
	return psock;
}
EXPORT_SYMBOL_GPL(sk_psock_init);

/* Pop the first link from psock->link under link_lock.
 * NOTE(review): function continues beyond this chunk's visible end.
 */
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
{
	struct sk_psock_link *link;

	spin_lock_bh(&psock->link_lock);
	link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
					list);
	if (link)
		list_del(&link->list);
	spin_unlock_bh(&psock->link_lock);
753604326b4SDaniel Borkmann return link; 754604326b4SDaniel Borkmann } 755604326b4SDaniel Borkmann 756cd81cefbSCong Wang static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) 757604326b4SDaniel Borkmann { 758604326b4SDaniel Borkmann struct sk_msg *msg, *tmp; 759604326b4SDaniel Borkmann 760604326b4SDaniel Borkmann list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { 761604326b4SDaniel Borkmann list_del(&msg->list); 762604326b4SDaniel Borkmann sk_msg_free(psock->sk, msg); 763604326b4SDaniel Borkmann kfree(msg); 764604326b4SDaniel Borkmann } 765604326b4SDaniel Borkmann } 766604326b4SDaniel Borkmann 767799aa7f9SCong Wang static void __sk_psock_zap_ingress(struct sk_psock *psock) 768604326b4SDaniel Borkmann { 769e3526bb9SCong Wang struct sk_buff *skb; 770e3526bb9SCong Wang 77137f0e514SCong Wang while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { 772e3526bb9SCong Wang skb_bpf_redirect_clear(skb); 773781dd043SCong Wang sock_drop(psock->sk, skb); 774e3526bb9SCong Wang } 775476d9801SJohn Fastabend kfree_skb(psock->work_state.skb); 776476d9801SJohn Fastabend /* We null the skb here to ensure that calls to sk_psock_backlog 777476d9801SJohn Fastabend * do not pick up the free'd skb. 
778476d9801SJohn Fastabend */ 779476d9801SJohn Fastabend psock->work_state.skb = NULL; 780604326b4SDaniel Borkmann __sk_psock_purge_ingress_msg(psock); 781604326b4SDaniel Borkmann } 782604326b4SDaniel Borkmann 783604326b4SDaniel Borkmann static void sk_psock_link_destroy(struct sk_psock *psock) 784604326b4SDaniel Borkmann { 785604326b4SDaniel Borkmann struct sk_psock_link *link, *tmp; 786604326b4SDaniel Borkmann 787604326b4SDaniel Borkmann list_for_each_entry_safe(link, tmp, &psock->link, list) { 788604326b4SDaniel Borkmann list_del(&link->list); 789604326b4SDaniel Borkmann sk_psock_free_link(link); 790604326b4SDaniel Borkmann } 791604326b4SDaniel Borkmann } 792604326b4SDaniel Borkmann 793799aa7f9SCong Wang void sk_psock_stop(struct sk_psock *psock, bool wait) 794799aa7f9SCong Wang { 795799aa7f9SCong Wang spin_lock_bh(&psock->ingress_lock); 796799aa7f9SCong Wang sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); 797799aa7f9SCong Wang sk_psock_cork_free(psock); 798799aa7f9SCong Wang __sk_psock_zap_ingress(psock); 799799aa7f9SCong Wang spin_unlock_bh(&psock->ingress_lock); 800799aa7f9SCong Wang 801799aa7f9SCong Wang if (wait) 802799aa7f9SCong Wang cancel_work_sync(&psock->work); 803799aa7f9SCong Wang } 804799aa7f9SCong Wang 80588759609SCong Wang static void sk_psock_done_strp(struct sk_psock *psock); 80688759609SCong Wang 8077786dfc4SCong Wang static void sk_psock_destroy(struct work_struct *work) 808604326b4SDaniel Borkmann { 8097786dfc4SCong Wang struct sk_psock *psock = container_of(to_rcu_work(work), 8107786dfc4SCong Wang struct sk_psock, rwork); 811604326b4SDaniel Borkmann /* No sk_callback_lock since already detached. 
*/ 81201489436SJohn Fastabend 81388759609SCong Wang sk_psock_done_strp(psock); 814604326b4SDaniel Borkmann 815604326b4SDaniel Borkmann cancel_work_sync(&psock->work); 816799aa7f9SCong Wang mutex_destroy(&psock->work_mutex); 817604326b4SDaniel Borkmann 818604326b4SDaniel Borkmann psock_progs_drop(&psock->progs); 819604326b4SDaniel Borkmann 820604326b4SDaniel Borkmann sk_psock_link_destroy(psock); 821604326b4SDaniel Borkmann sk_psock_cork_free(psock); 822604326b4SDaniel Borkmann 823604326b4SDaniel Borkmann if (psock->sk_redir) 824604326b4SDaniel Borkmann sock_put(psock->sk_redir); 825604326b4SDaniel Borkmann sock_put(psock->sk); 826604326b4SDaniel Borkmann kfree(psock); 827604326b4SDaniel Borkmann } 828604326b4SDaniel Borkmann 829604326b4SDaniel Borkmann void sk_psock_drop(struct sock *sk, struct sk_psock *psock) 830604326b4SDaniel Borkmann { 831604326b4SDaniel Borkmann write_lock_bh(&sk->sk_callback_lock); 83295fa1454SJohn Fastabend sk_psock_restore_proto(sk, psock); 83395fa1454SJohn Fastabend rcu_assign_sk_user_data(sk, NULL); 834ae8b8332SCong Wang if (psock->progs.stream_parser) 835604326b4SDaniel Borkmann sk_psock_stop_strp(sk, psock); 836a7ba4558SCong Wang else if (psock->progs.stream_verdict || psock->progs.skb_verdict) 837ef565928SJohn Fastabend sk_psock_stop_verdict(sk, psock); 838604326b4SDaniel Borkmann write_unlock_bh(&sk->sk_callback_lock); 839604326b4SDaniel Borkmann 840343597d5SJohn Fastabend sk_psock_stop(psock, false); 841343597d5SJohn Fastabend 8427786dfc4SCong Wang INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); 8437786dfc4SCong Wang queue_rcu_work(system_wq, &psock->rwork); 844604326b4SDaniel Borkmann } 845604326b4SDaniel Borkmann EXPORT_SYMBOL_GPL(sk_psock_drop); 846604326b4SDaniel Borkmann 847604326b4SDaniel Borkmann static int sk_psock_map_verd(int verdict, bool redir) 848604326b4SDaniel Borkmann { 849604326b4SDaniel Borkmann switch (verdict) { 850604326b4SDaniel Borkmann case SK_PASS: 851604326b4SDaniel Borkmann return redir ? 
__SK_REDIRECT : __SK_PASS; 852604326b4SDaniel Borkmann case SK_DROP: 853604326b4SDaniel Borkmann default: 854604326b4SDaniel Borkmann break; 855604326b4SDaniel Borkmann } 856604326b4SDaniel Borkmann 857604326b4SDaniel Borkmann return __SK_DROP; 858604326b4SDaniel Borkmann } 859604326b4SDaniel Borkmann 860604326b4SDaniel Borkmann int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, 861604326b4SDaniel Borkmann struct sk_msg *msg) 862604326b4SDaniel Borkmann { 863604326b4SDaniel Borkmann struct bpf_prog *prog; 864604326b4SDaniel Borkmann int ret; 865604326b4SDaniel Borkmann 866604326b4SDaniel Borkmann rcu_read_lock(); 867604326b4SDaniel Borkmann prog = READ_ONCE(psock->progs.msg_parser); 868604326b4SDaniel Borkmann if (unlikely(!prog)) { 869604326b4SDaniel Borkmann ret = __SK_PASS; 870604326b4SDaniel Borkmann goto out; 871604326b4SDaniel Borkmann } 872604326b4SDaniel Borkmann 873604326b4SDaniel Borkmann sk_msg_compute_data_pointers(msg); 874604326b4SDaniel Borkmann msg->sk = sk; 8753d9f773cSDavid Miller ret = bpf_prog_run_pin_on_cpu(prog, msg); 876604326b4SDaniel Borkmann ret = sk_psock_map_verd(ret, msg->sk_redir); 877604326b4SDaniel Borkmann psock->apply_bytes = msg->apply_bytes; 878604326b4SDaniel Borkmann if (ret == __SK_REDIRECT) { 879604326b4SDaniel Borkmann if (psock->sk_redir) 880604326b4SDaniel Borkmann sock_put(psock->sk_redir); 881604326b4SDaniel Borkmann psock->sk_redir = msg->sk_redir; 882604326b4SDaniel Borkmann if (!psock->sk_redir) { 883604326b4SDaniel Borkmann ret = __SK_DROP; 884604326b4SDaniel Borkmann goto out; 885604326b4SDaniel Borkmann } 886604326b4SDaniel Borkmann sock_hold(psock->sk_redir); 887604326b4SDaniel Borkmann } 888604326b4SDaniel Borkmann out: 889604326b4SDaniel Borkmann rcu_read_unlock(); 890604326b4SDaniel Borkmann return ret; 891604326b4SDaniel Borkmann } 892604326b4SDaniel Borkmann EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); 893604326b4SDaniel Borkmann 89442830571SCong Wang static int sk_psock_skb_redirect(struct 
sk_psock *from, struct sk_buff *skb) 895604326b4SDaniel Borkmann { 896604326b4SDaniel Borkmann struct sk_psock *psock_other; 897604326b4SDaniel Borkmann struct sock *sk_other; 898604326b4SDaniel Borkmann 899e3526bb9SCong Wang sk_other = skb_bpf_redirect_fetch(skb); 9009047f19eSJohn Fastabend /* This error is a buggy BPF program, it returned a redirect 9019047f19eSJohn Fastabend * return code, but then didn't set a redirect interface. 9029047f19eSJohn Fastabend */ 903ca2f5f21SJohn Fastabend if (unlikely(!sk_other)) { 9047303524eSLiu Jian skb_bpf_redirect_clear(skb); 905781dd043SCong Wang sock_drop(from->sk, skb); 9061581a6c1SCong Wang return -EIO; 907ca2f5f21SJohn Fastabend } 908ca2f5f21SJohn Fastabend psock_other = sk_psock(sk_other); 9099047f19eSJohn Fastabend /* This error indicates the socket is being torn down or had another 9109047f19eSJohn Fastabend * error that caused the pipe to break. We can't send a packet on 9119047f19eSJohn Fastabend * a socket that is in this state so we drop the skb. 
9129047f19eSJohn Fastabend */ 913799aa7f9SCong Wang if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { 91430b9c54aSCong Wang skb_bpf_redirect_clear(skb); 915781dd043SCong Wang sock_drop(from->sk, skb); 9161581a6c1SCong Wang return -EIO; 917799aa7f9SCong Wang } 918799aa7f9SCong Wang spin_lock_bh(&psock_other->ingress_lock); 919799aa7f9SCong Wang if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { 920799aa7f9SCong Wang spin_unlock_bh(&psock_other->ingress_lock); 92130b9c54aSCong Wang skb_bpf_redirect_clear(skb); 922781dd043SCong Wang sock_drop(from->sk, skb); 9231581a6c1SCong Wang return -EIO; 924ca2f5f21SJohn Fastabend } 925ca2f5f21SJohn Fastabend 926ca2f5f21SJohn Fastabend skb_queue_tail(&psock_other->ingress_skb, skb); 927ca2f5f21SJohn Fastabend schedule_work(&psock_other->work); 928799aa7f9SCong Wang spin_unlock_bh(&psock_other->ingress_lock); 9291581a6c1SCong Wang return 0; 930ca2f5f21SJohn Fastabend } 931ca2f5f21SJohn Fastabend 93242830571SCong Wang static void sk_psock_tls_verdict_apply(struct sk_buff *skb, 93342830571SCong Wang struct sk_psock *from, int verdict) 934e91de6afSJohn Fastabend { 935e91de6afSJohn Fastabend switch (verdict) { 936e91de6afSJohn Fastabend case __SK_REDIRECT: 93742830571SCong Wang sk_psock_skb_redirect(from, skb); 938e91de6afSJohn Fastabend break; 939e91de6afSJohn Fastabend case __SK_PASS: 940e91de6afSJohn Fastabend case __SK_DROP: 941e91de6afSJohn Fastabend default: 942e91de6afSJohn Fastabend break; 943e91de6afSJohn Fastabend } 944e91de6afSJohn Fastabend } 945e91de6afSJohn Fastabend 946e91de6afSJohn Fastabend int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) 947e91de6afSJohn Fastabend { 948e91de6afSJohn Fastabend struct bpf_prog *prog; 949e91de6afSJohn Fastabend int ret = __SK_PASS; 950e91de6afSJohn Fastabend 951e91de6afSJohn Fastabend rcu_read_lock(); 952ae8b8332SCong Wang prog = READ_ONCE(psock->progs.stream_verdict); 953e91de6afSJohn Fastabend if (likely(prog)) { 9540b17ad25SJohn Fastabend skb->sk 
= psock->sk; 955e3526bb9SCong Wang skb_dst_drop(skb); 956e3526bb9SCong Wang skb_bpf_redirect_clear(skb); 95753334232SCong Wang ret = bpf_prog_run_pin_on_cpu(prog, skb); 958e3526bb9SCong Wang ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); 9590b17ad25SJohn Fastabend skb->sk = NULL; 960e91de6afSJohn Fastabend } 96142830571SCong Wang sk_psock_tls_verdict_apply(skb, psock, ret); 962e91de6afSJohn Fastabend rcu_read_unlock(); 963e91de6afSJohn Fastabend return ret; 964e91de6afSJohn Fastabend } 965e91de6afSJohn Fastabend EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); 966e91de6afSJohn Fastabend 9671581a6c1SCong Wang static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, 9681581a6c1SCong Wang int verdict) 969ca2f5f21SJohn Fastabend { 970ca2f5f21SJohn Fastabend struct sock *sk_other; 9711581a6c1SCong Wang int err = 0; 9727303524eSLiu Jian u32 len, off; 973ca2f5f21SJohn Fastabend 974604326b4SDaniel Borkmann switch (verdict) { 97551199405SJohn Fastabend case __SK_PASS: 9761581a6c1SCong Wang err = -EIO; 97751199405SJohn Fastabend sk_other = psock->sk; 97851199405SJohn Fastabend if (sock_flag(sk_other, SOCK_DEAD) || 97951199405SJohn Fastabend !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { 9807303524eSLiu Jian skb_bpf_redirect_clear(skb); 98151199405SJohn Fastabend goto out_free; 98251199405SJohn Fastabend } 98351199405SJohn Fastabend 984e3526bb9SCong Wang skb_bpf_set_ingress(skb); 9859ecbfb06SJohn Fastabend 9869ecbfb06SJohn Fastabend /* If the queue is empty then we can submit directly 9879ecbfb06SJohn Fastabend * into the msg queue. If its not empty we have to 9889ecbfb06SJohn Fastabend * queue work otherwise we may get OOO data. Otherwise, 9899ecbfb06SJohn Fastabend * if sk_psock_skb_ingress errors will be handled by 9909ecbfb06SJohn Fastabend * retrying later from workqueue. 
9919ecbfb06SJohn Fastabend */ 9929ecbfb06SJohn Fastabend if (skb_queue_empty(&psock->ingress_skb)) { 9937303524eSLiu Jian len = skb->len; 9947303524eSLiu Jian off = 0; 9957303524eSLiu Jian if (skb_bpf_strparser(skb)) { 9967303524eSLiu Jian struct strp_msg *stm = strp_msg(skb); 9977303524eSLiu Jian 9987303524eSLiu Jian off = stm->offset; 9997303524eSLiu Jian len = stm->full_len; 10007303524eSLiu Jian } 10017303524eSLiu Jian err = sk_psock_skb_ingress_self(psock, skb, off, len); 10029ecbfb06SJohn Fastabend } 10039ecbfb06SJohn Fastabend if (err < 0) { 1004799aa7f9SCong Wang spin_lock_bh(&psock->ingress_lock); 1005799aa7f9SCong Wang if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { 100651199405SJohn Fastabend skb_queue_tail(&psock->ingress_skb, skb); 100751199405SJohn Fastabend schedule_work(&psock->work); 10080cf6672bSCong Wang err = 0; 10099ecbfb06SJohn Fastabend } 1010799aa7f9SCong Wang spin_unlock_bh(&psock->ingress_lock); 10110cf6672bSCong Wang if (err < 0) { 10120cf6672bSCong Wang skb_bpf_redirect_clear(skb); 10130cf6672bSCong Wang goto out_free; 10140cf6672bSCong Wang } 1015799aa7f9SCong Wang } 101651199405SJohn Fastabend break; 1017604326b4SDaniel Borkmann case __SK_REDIRECT: 101842830571SCong Wang err = sk_psock_skb_redirect(psock, skb); 1019604326b4SDaniel Borkmann break; 1020604326b4SDaniel Borkmann case __SK_DROP: 1021604326b4SDaniel Borkmann default: 1022604326b4SDaniel Borkmann out_free: 1023781dd043SCong Wang sock_drop(psock->sk, skb); 1024604326b4SDaniel Borkmann } 10251581a6c1SCong Wang 10261581a6c1SCong Wang return err; 1027604326b4SDaniel Borkmann } 1028604326b4SDaniel Borkmann 102988759609SCong Wang static void sk_psock_write_space(struct sock *sk) 103088759609SCong Wang { 103188759609SCong Wang struct sk_psock *psock; 103288759609SCong Wang void (*write_space)(struct sock *sk) = NULL; 103388759609SCong Wang 103488759609SCong Wang rcu_read_lock(); 103588759609SCong Wang psock = sk_psock(sk); 103688759609SCong Wang if (likely(psock)) { 
103788759609SCong Wang if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) 103888759609SCong Wang schedule_work(&psock->work); 103988759609SCong Wang write_space = psock->saved_write_space; 104088759609SCong Wang } 104188759609SCong Wang rcu_read_unlock(); 104288759609SCong Wang if (write_space) 104388759609SCong Wang write_space(sk); 104488759609SCong Wang } 104588759609SCong Wang 104688759609SCong Wang #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) 1047604326b4SDaniel Borkmann static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) 1048604326b4SDaniel Borkmann { 10498025751dSJohn Fastabend struct sk_psock *psock; 1050604326b4SDaniel Borkmann struct bpf_prog *prog; 1051604326b4SDaniel Borkmann int ret = __SK_DROP; 10528025751dSJohn Fastabend struct sock *sk; 1053604326b4SDaniel Borkmann 1054604326b4SDaniel Borkmann rcu_read_lock(); 10558025751dSJohn Fastabend sk = strp->sk; 10568025751dSJohn Fastabend psock = sk_psock(sk); 10578025751dSJohn Fastabend if (unlikely(!psock)) { 1058781dd043SCong Wang sock_drop(sk, skb); 10598025751dSJohn Fastabend goto out; 10608025751dSJohn Fastabend } 1061ae8b8332SCong Wang prog = READ_ONCE(psock->progs.stream_verdict); 1062604326b4SDaniel Borkmann if (likely(prog)) { 1063144748ebSJohn Fastabend skb->sk = sk; 1064e3526bb9SCong Wang skb_dst_drop(skb); 1065e3526bb9SCong Wang skb_bpf_redirect_clear(skb); 106653334232SCong Wang ret = bpf_prog_run_pin_on_cpu(prog, skb); 10677303524eSLiu Jian if (ret == SK_PASS) 10687303524eSLiu Jian skb_bpf_set_strparser(skb); 1069e3526bb9SCong Wang ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); 1070144748ebSJohn Fastabend skb->sk = NULL; 1071604326b4SDaniel Borkmann } 1072604326b4SDaniel Borkmann sk_psock_verdict_apply(psock, skb, ret); 10738025751dSJohn Fastabend out: 107493dd5f18SJohn Fastabend rcu_read_unlock(); 1075604326b4SDaniel Borkmann } 1076604326b4SDaniel Borkmann 1077604326b4SDaniel Borkmann static int sk_psock_strp_read_done(struct strparser *strp, int err) 
1078604326b4SDaniel Borkmann { 1079604326b4SDaniel Borkmann return err; 1080604326b4SDaniel Borkmann } 1081604326b4SDaniel Borkmann 1082604326b4SDaniel Borkmann static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) 1083604326b4SDaniel Borkmann { 10845a685cd9SCong Wang struct sk_psock *psock = container_of(strp, struct sk_psock, strp); 1085604326b4SDaniel Borkmann struct bpf_prog *prog; 1086604326b4SDaniel Borkmann int ret = skb->len; 1087604326b4SDaniel Borkmann 1088604326b4SDaniel Borkmann rcu_read_lock(); 1089ae8b8332SCong Wang prog = READ_ONCE(psock->progs.stream_parser); 10900b17ad25SJohn Fastabend if (likely(prog)) { 10910b17ad25SJohn Fastabend skb->sk = psock->sk; 109253334232SCong Wang ret = bpf_prog_run_pin_on_cpu(prog, skb); 10930b17ad25SJohn Fastabend skb->sk = NULL; 10940b17ad25SJohn Fastabend } 1095604326b4SDaniel Borkmann rcu_read_unlock(); 1096604326b4SDaniel Borkmann return ret; 1097604326b4SDaniel Borkmann } 1098604326b4SDaniel Borkmann 1099604326b4SDaniel Borkmann /* Called with socket lock held. 
*/ 1100552de910SJohn Fastabend static void sk_psock_strp_data_ready(struct sock *sk) 1101604326b4SDaniel Borkmann { 1102604326b4SDaniel Borkmann struct sk_psock *psock; 1103604326b4SDaniel Borkmann 1104604326b4SDaniel Borkmann rcu_read_lock(); 1105604326b4SDaniel Borkmann psock = sk_psock(sk); 1106604326b4SDaniel Borkmann if (likely(psock)) { 1107e91de6afSJohn Fastabend if (tls_sw_has_ctx_rx(sk)) { 11085a685cd9SCong Wang psock->saved_data_ready(sk); 1109e91de6afSJohn Fastabend } else { 1110604326b4SDaniel Borkmann write_lock_bh(&sk->sk_callback_lock); 11115a685cd9SCong Wang strp_data_ready(&psock->strp); 1112604326b4SDaniel Borkmann write_unlock_bh(&sk->sk_callback_lock); 1113604326b4SDaniel Borkmann } 1114e91de6afSJohn Fastabend } 1115604326b4SDaniel Borkmann rcu_read_unlock(); 1116604326b4SDaniel Borkmann } 1117604326b4SDaniel Borkmann 111888759609SCong Wang int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) 111988759609SCong Wang { 112088759609SCong Wang static const struct strp_callbacks cb = { 112188759609SCong Wang .rcv_msg = sk_psock_strp_read, 112288759609SCong Wang .read_sock_done = sk_psock_strp_read_done, 112388759609SCong Wang .parse_msg = sk_psock_strp_parse, 112488759609SCong Wang }; 112588759609SCong Wang 11265a685cd9SCong Wang return strp_init(&psock->strp, sk, &cb); 112788759609SCong Wang } 112888759609SCong Wang 112988759609SCong Wang void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) 113088759609SCong Wang { 11315a685cd9SCong Wang if (psock->saved_data_ready) 113288759609SCong Wang return; 113388759609SCong Wang 11345a685cd9SCong Wang psock->saved_data_ready = sk->sk_data_ready; 113588759609SCong Wang sk->sk_data_ready = sk_psock_strp_data_ready; 113688759609SCong Wang sk->sk_write_space = sk_psock_write_space; 113788759609SCong Wang } 113888759609SCong Wang 113988759609SCong Wang void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) 114088759609SCong Wang { 1141c0d95d33SJohn Fastabend 
psock_set_prog(&psock->progs.stream_parser, NULL); 1142c0d95d33SJohn Fastabend 11435a685cd9SCong Wang if (!psock->saved_data_ready) 114488759609SCong Wang return; 114588759609SCong Wang 11465a685cd9SCong Wang sk->sk_data_ready = psock->saved_data_ready; 11475a685cd9SCong Wang psock->saved_data_ready = NULL; 11485a685cd9SCong Wang strp_stop(&psock->strp); 114988759609SCong Wang } 115088759609SCong Wang 115188759609SCong Wang static void sk_psock_done_strp(struct sk_psock *psock) 115288759609SCong Wang { 115388759609SCong Wang /* Parser has been stopped */ 1154ae8b8332SCong Wang if (psock->progs.stream_parser) 11555a685cd9SCong Wang strp_done(&psock->strp); 115688759609SCong Wang } 115788759609SCong Wang #else 115888759609SCong Wang static void sk_psock_done_strp(struct sk_psock *psock) 115988759609SCong Wang { 116088759609SCong Wang } 116188759609SCong Wang #endif /* CONFIG_BPF_STREAM_PARSER */ 116288759609SCong Wang 1163ef565928SJohn Fastabend static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, 1164ef565928SJohn Fastabend unsigned int offset, size_t orig_len) 1165ef565928SJohn Fastabend { 1166ef565928SJohn Fastabend struct sock *sk = (struct sock *)desc->arg.data; 1167ef565928SJohn Fastabend struct sk_psock *psock; 1168ef565928SJohn Fastabend struct bpf_prog *prog; 1169ef565928SJohn Fastabend int ret = __SK_DROP; 117060ce37b0SEric Dumazet int len = orig_len; 1171ef565928SJohn Fastabend 1172ef565928SJohn Fastabend /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ 1173ef565928SJohn Fastabend skb = skb_clone(skb, GFP_ATOMIC); 1174ef565928SJohn Fastabend if (!skb) { 1175ef565928SJohn Fastabend desc->error = -ENOMEM; 1176ef565928SJohn Fastabend return 0; 1177ef565928SJohn Fastabend } 1178ef565928SJohn Fastabend 1179ef565928SJohn Fastabend rcu_read_lock(); 1180ef565928SJohn Fastabend psock = sk_psock(sk); 1181ef565928SJohn Fastabend if (unlikely(!psock)) { 1182ef565928SJohn Fastabend len = 0; 1183781dd043SCong Wang 
sock_drop(sk, skb); 1184ef565928SJohn Fastabend goto out; 1185ef565928SJohn Fastabend } 1186ae8b8332SCong Wang prog = READ_ONCE(psock->progs.stream_verdict); 1187a7ba4558SCong Wang if (!prog) 1188a7ba4558SCong Wang prog = READ_ONCE(psock->progs.skb_verdict); 1189ef565928SJohn Fastabend if (likely(prog)) { 1190144748ebSJohn Fastabend skb->sk = sk; 1191e3526bb9SCong Wang skb_dst_drop(skb); 1192e3526bb9SCong Wang skb_bpf_redirect_clear(skb); 119353334232SCong Wang ret = bpf_prog_run_pin_on_cpu(prog, skb); 1194e3526bb9SCong Wang ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); 1195144748ebSJohn Fastabend skb->sk = NULL; 1196ef565928SJohn Fastabend } 11971581a6c1SCong Wang if (sk_psock_verdict_apply(psock, skb, ret) < 0) 11981581a6c1SCong Wang len = 0; 1199ef565928SJohn Fastabend out: 1200ef565928SJohn Fastabend rcu_read_unlock(); 1201ef565928SJohn Fastabend return len; 1202ef565928SJohn Fastabend } 1203ef565928SJohn Fastabend 1204ef565928SJohn Fastabend static void sk_psock_verdict_data_ready(struct sock *sk) 1205ef565928SJohn Fastabend { 1206ef565928SJohn Fastabend struct socket *sock = sk->sk_socket; 1207ef565928SJohn Fastabend read_descriptor_t desc; 1208ef565928SJohn Fastabend 1209ef565928SJohn Fastabend if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) 1210ef565928SJohn Fastabend return; 1211ef565928SJohn Fastabend 1212ef565928SJohn Fastabend desc.arg.data = sk; 1213ef565928SJohn Fastabend desc.error = 0; 1214ef565928SJohn Fastabend desc.count = 1; 1215ef565928SJohn Fastabend 1216ef565928SJohn Fastabend sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); 1217ef565928SJohn Fastabend } 1218ef565928SJohn Fastabend 1219ef565928SJohn Fastabend void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) 1220ef565928SJohn Fastabend { 12215a685cd9SCong Wang if (psock->saved_data_ready) 1222ef565928SJohn Fastabend return; 1223ef565928SJohn Fastabend 12245a685cd9SCong Wang psock->saved_data_ready = sk->sk_data_ready; 1225ef565928SJohn 
Fastabend sk->sk_data_ready = sk_psock_verdict_data_ready; 1226ef565928SJohn Fastabend sk->sk_write_space = sk_psock_write_space; 1227ef565928SJohn Fastabend } 1228ef565928SJohn Fastabend 1229ef565928SJohn Fastabend void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) 1230ef565928SJohn Fastabend { 1231c0d95d33SJohn Fastabend psock_set_prog(&psock->progs.stream_verdict, NULL); 1232c0d95d33SJohn Fastabend psock_set_prog(&psock->progs.skb_verdict, NULL); 1233c0d95d33SJohn Fastabend 12345a685cd9SCong Wang if (!psock->saved_data_ready) 1235ef565928SJohn Fastabend return; 1236ef565928SJohn Fastabend 12375a685cd9SCong Wang sk->sk_data_ready = psock->saved_data_ready; 12385a685cd9SCong Wang psock->saved_data_ready = NULL; 1239ef565928SJohn Fastabend } 1240