1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2021 Ng Peng Nam Sean 5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/ck.h> 31 #include <sys/lock.h> 32 #include <sys/malloc.h> 33 #include <sys/mbuf.h> 34 #include <sys/mutex.h> 35 #include <sys/socket.h> 36 #include <sys/socketvar.h> 37 #include <sys/syslog.h> 38 39 #include <netlink/netlink.h> 40 #include <netlink/netlink_ctl.h> 41 #include <netlink/netlink_linux.h> 42 #include <netlink/netlink_var.h> 43 44 #define DEBUG_MOD_NAME nl_io 45 #define DEBUG_MAX_LEVEL LOG_DEBUG3 46 #include <netlink/netlink_debug.h> 47 _DECLARE_DEBUG(LOG_INFO); 48 49 /* 50 * The logic below provide a p2p interface for receiving and 51 * sending netlink data between the kernel and userland. 52 */ 53 54 static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp); 55 56 struct nl_buf * 57 nl_buf_alloc(size_t len, int mflag) 58 { 59 struct nl_buf *nb; 60 61 nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag); 62 if (__predict_true(nb != NULL)) { 63 nb->buflen = len; 64 nb->datalen = nb->offset = 0; 65 nb->control = NULL; 66 } 67 68 return (nb); 69 } 70 71 void 72 nl_buf_free(struct nl_buf *nb) 73 { 74 75 if (nb->control) 76 m_freem(nb->control); 77 free(nb, M_NETLINK); 78 } 79 80 void 81 nl_add_msg_info(struct nl_buf *nb) 82 { 83 /* XXXGL pass nlp as arg? */ 84 struct nlpcb *nlp = nl_get_thread_nlp(curthread); 85 NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p", 86 curthread, nlp); 87 88 if (nlp == NULL) 89 return; 90 91 /* Prepare what we want to encode - PID, socket PID & msg seq */ 92 struct { 93 struct nlattr nla; 94 uint32_t val; 95 } data[] = { 96 { 97 .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), 98 .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, 99 .val = nlp->nl_process_id, 100 }, 101 { 102 .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), 103 .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, 104 .val = nlp->nl_port, 105 }, 106 }; 107 108 109 nb->control = sbcreatecontrol(data, sizeof(data), 110 NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT); 111 112 if (__predict_true(nb->control != NULL)) 113 NL_LOG(LOG_DEBUG2, "Storing %u bytes of control data, ctl: %p", 114 (unsigned)sizeof(data), nb->control); 115 else 116 NL_LOG(LOG_DEBUG2, "Failed to allocate %u bytes of control", 117 (unsigned)sizeof(data)); 118 } 119 120 void 121 nl_schedule_taskqueue(struct nlpcb *nlp) 122 { 123 if (!nlp->nl_task_pending) { 124 nlp->nl_task_pending = true; 125 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); 126 NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); 127 } else { 128 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); 129 } 130 } 131 132 static bool 133 nl_process_received_one(struct nlpcb *nlp) 134 { 135 struct socket *so = nlp->nl_socket; 136 struct sockbuf *sb; 137 struct nl_buf *nb; 138 bool reschedule = false; 139 140 NLP_LOCK(nlp); 141 nlp->nl_task_pending = false; 142 NLP_UNLOCK(nlp); 143 144 /* 145 * Do not process queued up requests if there is no space to queue 146 * replies. 147 */ 148 sb = &so->so_rcv; 149 SOCK_RECVBUF_LOCK(so); 150 if (sb->sb_hiwat <= sb->sb_ccc) { 151 SOCK_RECVBUF_UNLOCK(so); 152 return (false); 153 } 154 SOCK_RECVBUF_UNLOCK(so); 155 156 sb = &so->so_snd; 157 SOCK_SENDBUF_LOCK(so); 158 while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) { 159 TAILQ_REMOVE(&sb->nl_queue, nb, tailq); 160 SOCK_SENDBUF_UNLOCK(so); 161 reschedule = nl_process_nbuf(nb, nlp); 162 SOCK_SENDBUF_LOCK(so); 163 if (reschedule) { 164 sb->sb_acc -= nb->datalen; 165 sb->sb_ccc -= nb->datalen; 166 /* XXXGL: potentially can reduce lock&unlock count. */ 167 sowwakeup_locked(so); 168 nl_buf_free(nb); 169 SOCK_SENDBUF_LOCK(so); 170 } else { 171 TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq); 172 break; 173 } 174 } 175 SOCK_SENDBUF_UNLOCK(so); 176 177 return (reschedule); 178 } 179 180 static void 181 nl_process_received(struct nlpcb *nlp) 182 { 183 NL_LOG(LOG_DEBUG3, "taskqueue called"); 184 185 if (__predict_false(nlp->nl_need_thread_setup)) { 186 nl_set_thread_nlp(curthread, nlp); 187 NLP_LOCK(nlp); 188 nlp->nl_need_thread_setup = false; 189 NLP_UNLOCK(nlp); 190 } 191 192 while (nl_process_received_one(nlp)) 193 ; 194 } 195 196 /* 197 * Called after some data have been read from the socket. 198 */ 199 void 200 nl_on_transmit(struct nlpcb *nlp) 201 { 202 NLP_LOCK(nlp); 203 204 struct socket *so = nlp->nl_socket; 205 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { 206 unsigned long dropped_bytes = nlp->nl_dropped_bytes; 207 unsigned long dropped_messages = nlp->nl_dropped_messages; 208 nlp->nl_dropped_bytes = 0; 209 nlp->nl_dropped_messages = 0; 210 211 struct sockbuf *sb = &so->so_rcv; 212 NLP_LOG(LOG_DEBUG, nlp, 213 "socket RX overflowed, %lu messages (%lu bytes) dropped. " 214 "bytes: [%u/%u]", dropped_messages, dropped_bytes, 215 sb->sb_ccc, sb->sb_hiwat); 216 /* TODO: send netlink message */ 217 } 218 219 nl_schedule_taskqueue(nlp); 220 NLP_UNLOCK(nlp); 221 } 222 223 void 224 nl_taskqueue_handler(void *_arg, int pending) 225 { 226 struct nlpcb *nlp = (struct nlpcb *)_arg; 227 228 CURVNET_SET(nlp->nl_socket->so_vnet); 229 nl_process_received(nlp); 230 CURVNET_RESTORE(); 231 } 232 233 /* 234 * Tries to send current data buffer from writer. 235 * 236 * Returns true on success. 237 * If no queue overrunes happened, wakes up socket owner. 238 */ 239 bool 240 nl_send_one(struct nl_writer *nw) 241 { 242 struct nlpcb *nlp = nw->nlp; 243 struct socket *so = nlp->nl_socket; 244 struct sockbuf *sb = &so->so_rcv; 245 struct nl_buf *nb; 246 247 MPASS(nw->hdr == NULL); 248 249 IF_DEBUG_LEVEL(LOG_DEBUG2) { 250 struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data; 251 NLP_LOG(LOG_DEBUG2, nlp, 252 "TX len %u msgs %u msg type %d first hdrlen %u", 253 nw->buf->datalen, nw->num_messages, hdr->nlmsg_type, 254 hdr->nlmsg_len); 255 } 256 257 if (nlp->nl_linux && linux_netlink_p != NULL && 258 __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) { 259 nl_buf_free(nw->buf); 260 nw->buf = NULL; 261 return (false); 262 } 263 264 nb = nw->buf; 265 nw->buf = NULL; 266 267 SOCK_RECVBUF_LOCK(so); 268 if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) { 269 SOCK_RECVBUF_UNLOCK(so); 270 NLP_LOCK(nlp); 271 nlp->nl_dropped_bytes += nb->datalen; 272 nlp->nl_dropped_messages += nw->num_messages; 273 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", 274 (unsigned long)nlp->nl_dropped_messages, nw->num_messages, 275 (unsigned long)nlp->nl_dropped_bytes, nb->datalen); 276 NLP_UNLOCK(nlp); 277 nl_buf_free(nb); 278 return (false); 279 } else { 280 bool full; 281 282 TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); 283 sb->sb_acc += nb->datalen; 284 sb->sb_ccc += nb->datalen; 285 full = sb->sb_hiwat <= sb->sb_ccc; 286 sorwakeup_locked(so); 287 if (full) { 288 NLP_LOCK(nlp); 289 nlp->nl_tx_blocked = true; 290 NLP_UNLOCK(nlp); 291 } 292 return (true); 293 } 294 } 295 296 static int 297 nl_receive_message(struct nlmsghdr *hdr, int remaining_length, 298 struct nlpcb *nlp, struct nl_pstate *npt) 299 { 300 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; 301 int error = 0; 302 303 NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", 304 hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, 305 hdr->nlmsg_pid); 306 307 if (__predict_false(hdr->nlmsg_len > remaining_length)) { 308 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", 309 hdr->nlmsg_len, remaining_length); 310 return (EINVAL); 311 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { 312 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); 313 return (EINVAL); 314 } 315 /* Stamp each message with sender pid */ 316 hdr->nlmsg_pid = nlp->nl_port; 317 318 npt->hdr = hdr; 319 320 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { 321 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", 322 hdr->nlmsg_type); 323 324 if (nlp->nl_linux && linux_netlink_p != NULL) { 325 struct nlmsghdr *hdr_orig = hdr; 326 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); 327 if (hdr == NULL) { 328 /* Failed to translate to kernel format. Report an error back */ 329 hdr = hdr_orig; 330 npt->hdr = hdr; 331 if (hdr->nlmsg_flags & NLM_F_ACK) 332 nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); 333 return (0); 334 } 335 } 336 error = handler(hdr, npt); 337 NL_LOG(LOG_DEBUG2, "retcode: %d", error); 338 } 339 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { 340 if (!npt->nw->suppress_ack) { 341 NL_LOG(LOG_DEBUG3, "ack"); 342 nlmsg_ack(nlp, error, hdr, npt); 343 } 344 } 345 346 return (0); 347 } 348 349 static void 350 npt_clear(struct nl_pstate *npt) 351 { 352 lb_clear(&npt->lb); 353 npt->error = 0; 354 npt->err_msg = NULL; 355 npt->err_off = 0; 356 npt->hdr = NULL; 357 npt->nw->suppress_ack = false; 358 } 359 360 /* 361 * Processes an incoming packet, which can contain multiple netlink messages 362 */ 363 static bool 364 nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp) 365 { 366 struct nlmsghdr *hdr; 367 int error; 368 369 NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket); 370 371 struct nl_writer nw = {}; 372 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { 373 NL_LOG(LOG_DEBUG, "error allocating socket writer"); 374 return (true); 375 } 376 377 nlmsg_ignore_limit(&nw); 378 379 struct nl_pstate npt = { 380 .nlp = nlp, 381 .lb.base = &nb->data[roundup2(nb->datalen, 8)], 382 .lb.size = nb->buflen - roundup2(nb->datalen, 8), 383 .nw = &nw, 384 .strict = nlp->nl_flags & NLF_STRICT, 385 }; 386 387 for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) { 388 hdr = (struct nlmsghdr *)&nb->data[nb->offset]; 389 /* Save length prior to calling handler */ 390 int msglen = NLMSG_ALIGN(hdr->nlmsg_len); 391 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", 392 nb->offset, nb->datalen); 393 npt_clear(&npt); 394 error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp, 395 &npt); 396 nb->offset += msglen; 397 if (__predict_false(error != 0 || nlp->nl_tx_blocked)) 398 break; 399 } 400 NL_LOG(LOG_DEBUG3, "packet parsing done"); 401 nlmsg_flush(&nw); 402 403 if (nlp->nl_tx_blocked) { 404 NLP_LOCK(nlp); 405 nlp->nl_tx_blocked = false; 406 NLP_UNLOCK(nlp); 407 return (false); 408 } else 409 return (true); 410 } 411