1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/ck.h> 31 #include <sys/lock.h> 32 #include <sys/malloc.h> 33 #include <sys/mbuf.h> 34 #include <sys/mutex.h> 35 #include <sys/socket.h> 36 #include <sys/socketvar.h> 37 #include <sys/syslog.h> 38 39 #include <netlink/netlink.h> 40 #include <netlink/netlink_ctl.h> 41 #include <netlink/netlink_linux.h> 42 #include <netlink/netlink_var.h> 43 44 #define DEBUG_MOD_NAME nl_io 45 #define DEBUG_MAX_LEVEL LOG_DEBUG3 46 #include <netlink/netlink_debug.h> 47 _DECLARE_DEBUG(LOG_INFO); 48 49 /* 50 * The logic below provide a p2p interface for receiving and 51 * sending netlink data between the kernel and userland. 52 */ 53 54 static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp); 55 56 struct nl_buf * 57 nl_buf_alloc(size_t len, int mflag) 58 { 59 struct nl_buf *nb; 60 61 nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag); 62 if (__predict_true(nb != NULL)) { 63 nb->buflen = len; 64 nb->datalen = nb->offset = 0; 65 } 66 67 return (nb); 68 } 69 70 void 71 nl_buf_free(struct nl_buf *nb) 72 { 73 74 free(nb, M_NETLINK); 75 } 76 77 void 78 nl_schedule_taskqueue(struct nlpcb *nlp) 79 { 80 if (!nlp->nl_task_pending) { 81 nlp->nl_task_pending = true; 82 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); 83 NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); 84 } else { 85 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); 86 } 87 } 88 89 static bool 90 nl_process_received_one(struct nlpcb *nlp) 91 { 92 struct socket *so = nlp->nl_socket; 93 struct sockbuf *sb; 94 struct nl_buf *nb; 95 bool reschedule = false; 96 97 NLP_LOCK(nlp); 98 nlp->nl_task_pending = false; 99 NLP_UNLOCK(nlp); 100 101 /* 102 * Do not process queued up requests if there is no space to queue 103 * replies. 104 */ 105 sb = &so->so_rcv; 106 SOCK_RECVBUF_LOCK(so); 107 if (sb->sb_hiwat <= sb->sb_ccc) { 108 SOCK_RECVBUF_UNLOCK(so); 109 return (false); 110 } 111 SOCK_RECVBUF_UNLOCK(so); 112 113 sb = &so->so_snd; 114 SOCK_SENDBUF_LOCK(so); 115 while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) { 116 TAILQ_REMOVE(&sb->nl_queue, nb, tailq); 117 SOCK_SENDBUF_UNLOCK(so); 118 reschedule = nl_process_nbuf(nb, nlp); 119 SOCK_SENDBUF_LOCK(so); 120 if (reschedule) { 121 sb->sb_acc -= nb->datalen; 122 sb->sb_ccc -= nb->datalen; 123 /* XXXGL: potentially can reduce lock&unlock count. */ 124 sowwakeup_locked(so); 125 nl_buf_free(nb); 126 SOCK_SENDBUF_LOCK(so); 127 } else { 128 TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq); 129 break; 130 } 131 } 132 SOCK_SENDBUF_UNLOCK(so); 133 134 return (reschedule); 135 } 136 137 static void 138 nl_process_received(struct nlpcb *nlp) 139 { 140 NL_LOG(LOG_DEBUG3, "taskqueue called"); 141 142 if (__predict_false(nlp->nl_need_thread_setup)) { 143 nl_set_thread_nlp(curthread, nlp); 144 NLP_LOCK(nlp); 145 nlp->nl_need_thread_setup = false; 146 NLP_UNLOCK(nlp); 147 } 148 149 while (nl_process_received_one(nlp)) 150 ; 151 } 152 153 /* 154 * Called after some data have been read from the socket. 155 */ 156 void 157 nl_on_transmit(struct nlpcb *nlp) 158 { 159 NLP_LOCK(nlp); 160 161 struct socket *so = nlp->nl_socket; 162 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { 163 unsigned long dropped_bytes = nlp->nl_dropped_bytes; 164 unsigned long dropped_messages = nlp->nl_dropped_messages; 165 nlp->nl_dropped_bytes = 0; 166 nlp->nl_dropped_messages = 0; 167 168 struct sockbuf *sb = &so->so_rcv; 169 NLP_LOG(LOG_DEBUG, nlp, 170 "socket RX overflowed, %lu messages (%lu bytes) dropped. " 171 "bytes: [%u/%u]", dropped_messages, dropped_bytes, 172 sb->sb_ccc, sb->sb_hiwat); 173 /* TODO: send netlink message */ 174 } 175 176 nl_schedule_taskqueue(nlp); 177 NLP_UNLOCK(nlp); 178 } 179 180 void 181 nl_taskqueue_handler(void *_arg, int pending) 182 { 183 struct nlpcb *nlp = (struct nlpcb *)_arg; 184 185 CURVNET_SET(nlp->nl_socket->so_vnet); 186 nl_process_received(nlp); 187 CURVNET_RESTORE(); 188 } 189 190 /* 191 * Tries to send current data buffer from writer. 192 * 193 * Returns true on success. 194 * If no queue overrunes happened, wakes up socket owner. 195 */ 196 bool 197 nl_send(struct nl_writer *nw, struct nlpcb *nlp) 198 { 199 struct socket *so = nlp->nl_socket; 200 struct sockbuf *sb = &so->so_rcv; 201 struct nl_buf *nb; 202 203 MPASS(nw->hdr == NULL); 204 MPASS(nw->buf != NULL); 205 MPASS(nw->buf->datalen > 0); 206 207 IF_DEBUG_LEVEL(LOG_DEBUG2) { 208 struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data; 209 NLP_LOG(LOG_DEBUG2, nlp, 210 "TX len %u msgs %u msg type %d first hdrlen %u", 211 nw->buf->datalen, nw->num_messages, hdr->nlmsg_type, 212 hdr->nlmsg_len); 213 } 214 215 if (nlp->nl_linux && linux_netlink_p != NULL && 216 __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) { 217 nl_buf_free(nw->buf); 218 nw->buf = NULL; 219 return (false); 220 } 221 222 nb = nw->buf; 223 nw->buf = NULL; 224 225 SOCK_RECVBUF_LOCK(so); 226 if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) { 227 SOCK_RECVBUF_UNLOCK(so); 228 NLP_LOCK(nlp); 229 nlp->nl_dropped_bytes += nb->datalen; 230 nlp->nl_dropped_messages += nw->num_messages; 231 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", 232 (unsigned long)nlp->nl_dropped_messages, nw->num_messages, 233 (unsigned long)nlp->nl_dropped_bytes, nb->datalen); 234 NLP_UNLOCK(nlp); 235 nl_buf_free(nb); 236 return (false); 237 } else { 238 bool full; 239 240 TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); 241 sb->sb_acc += nb->datalen; 242 sb->sb_ccc += nb->datalen; 243 full = sb->sb_hiwat <= sb->sb_ccc; 244 sorwakeup_locked(so); 245 if (full) { 246 NLP_LOCK(nlp); 247 nlp->nl_tx_blocked = true; 248 NLP_UNLOCK(nlp); 249 } 250 return (true); 251 } 252 } 253 254 static int 255 nl_receive_message(struct nlmsghdr *hdr, int remaining_length, 256 struct nlpcb *nlp, struct nl_pstate *npt) 257 { 258 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; 259 int error = 0; 260 261 NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", 262 hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, 263 hdr->nlmsg_pid); 264 265 if (__predict_false(hdr->nlmsg_len > remaining_length)) { 266 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", 267 hdr->nlmsg_len, remaining_length); 268 return (EINVAL); 269 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { 270 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); 271 return (EINVAL); 272 } 273 /* Stamp each message with sender pid */ 274 hdr->nlmsg_pid = nlp->nl_port; 275 276 npt->hdr = hdr; 277 278 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { 279 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", 280 hdr->nlmsg_type); 281 282 if (nlp->nl_linux && linux_netlink_p != NULL) { 283 struct nlmsghdr *hdr_orig = hdr; 284 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); 285 if (hdr == NULL) { 286 /* Failed to translate to kernel format. Report an error back */ 287 hdr = hdr_orig; 288 npt->hdr = hdr; 289 if (hdr->nlmsg_flags & NLM_F_ACK) 290 nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); 291 return (0); 292 } 293 } 294 error = handler(hdr, npt); 295 NL_LOG(LOG_DEBUG2, "retcode: %d", error); 296 } 297 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { 298 if (!npt->nw->suppress_ack) { 299 NL_LOG(LOG_DEBUG3, "ack"); 300 nlmsg_ack(nlp, error, hdr, npt); 301 } 302 } 303 304 return (0); 305 } 306 307 static void 308 npt_clear(struct nl_pstate *npt) 309 { 310 lb_clear(&npt->lb); 311 npt->error = 0; 312 npt->err_msg = NULL; 313 npt->err_off = 0; 314 npt->hdr = NULL; 315 npt->nw->suppress_ack = false; 316 } 317 318 /* 319 * Processes an incoming packet, which can contain multiple netlink messages 320 */ 321 static bool 322 nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp) 323 { 324 struct nlmsghdr *hdr; 325 int error; 326 327 NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket); 328 329 struct nl_writer nw = {}; 330 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { 331 NL_LOG(LOG_DEBUG, "error allocating socket writer"); 332 return (true); 333 } 334 335 nlmsg_ignore_limit(&nw); 336 337 struct nl_pstate npt = { 338 .nlp = nlp, 339 .lb.base = &nb->data[roundup2(nb->datalen, 8)], 340 .lb.size = nb->buflen - roundup2(nb->datalen, 8), 341 .nw = &nw, 342 .strict = nlp->nl_flags & NLF_STRICT, 343 }; 344 345 for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) { 346 hdr = (struct nlmsghdr *)&nb->data[nb->offset]; 347 /* Save length prior to calling handler */ 348 int msglen = NLMSG_ALIGN(hdr->nlmsg_len); 349 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", 350 nb->offset, nb->datalen); 351 npt_clear(&npt); 352 error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp, 353 &npt); 354 nb->offset += msglen; 355 if (__predict_false(error != 0 || nlp->nl_tx_blocked)) 356 break; 357 } 358 NL_LOG(LOG_DEBUG3, "packet parsing done"); 359 nlmsg_flush(&nw); 360 361 if (nlp->nl_tx_blocked) { 362 NLP_LOCK(nlp); 363 nlp->nl_tx_blocked = false; 364 NLP_UNLOCK(nlp); 365 return (false); 366 } else 367 return (true); 368 } 369