diff --git a/src/libsystemd/sd-netlink/netlink-internal.h b/src/libsystemd/sd-netlink/netlink-internal.h index 1b8f4afcb0..51e523c1ba 100644 --- a/src/libsystemd/sd-netlink/netlink-internal.h +++ b/src/libsystemd/sd-netlink/netlink-internal.h @@ -55,6 +55,11 @@ typedef struct sd_netlink_slot { }; } sd_netlink_slot; +typedef struct NetlinkIgnoredSerial { + uint32_t serial; + usec_t timeout_usec; /* timestamp in CLOCK_MONOTONIC */ +} NetlinkIgnoredSerial; + typedef struct sd_netlink { unsigned n_ref; @@ -78,6 +83,7 @@ typedef struct sd_netlink { bool processing:1; uint32_t serial; + Hashmap *ignored_serials; struct Prioq *reply_callbacks_prioq; Hashmap *reply_callbacks; @@ -181,8 +187,7 @@ int sd_nfnl_call_batch( sd_netlink *nfnl, sd_netlink_message **messages, size_t n_messages, - uint64_t usec, - sd_netlink_message ***ret_messages); + uint64_t usec); int sd_nfnl_message_new( sd_netlink *nfnl, sd_netlink_message **ret, diff --git a/src/libsystemd/sd-netlink/netlink-message-nfnl.c b/src/libsystemd/sd-netlink/netlink-message-nfnl.c index 2b6e1dd82b..a485fd096f 100644 --- a/src/libsystemd/sd-netlink/netlink-message-nfnl.c +++ b/src/libsystemd/sd-netlink/netlink-message-nfnl.c @@ -7,7 +7,6 @@ #include "sd-netlink.h" #include "alloc-util.h" -#include "errno-util.h" #include "iovec-util.h" #include "netlink-internal.h" #include "netlink-util.h" @@ -119,7 +118,7 @@ int sd_nfnl_send_batch( return -ENOMEM; if (ret_serials) { - serials = new(uint32_t, n_messages); + serials = new(uint32_t, n_messages + 2); if (!serials) return -ENOMEM; } @@ -133,6 +132,9 @@ int sd_nfnl_send_batch( return r; netlink_seal_message(nfnl, batch_begin); + if (serials) + serials[c] = message_get_serial(batch_begin); + iovs[c++] = IOVEC_MAKE(batch_begin->hdr, batch_begin->hdr->nlmsg_len); for (size_t i = 0; i < n_messages; i++) { @@ -147,7 +149,7 @@ int sd_nfnl_send_batch( netlink_seal_message(nfnl, messages[i]); if (serials) - serials[i] = message_get_serial(messages[i]); + serials[c] = message_get_serial(messages[i]); /* It seems that the kernel accepts an arbitrary number. Let's set the lower 16 bits of the * serial of the first message. */ @@ -161,6 +163,9 @@ int sd_nfnl_send_batch( return r; netlink_seal_message(nfnl, batch_end); + if (serials) + serials[c] = message_get_serial(batch_end); + iovs[c++] = IOVEC_MAKE(batch_end->hdr, batch_end->hdr->nlmsg_len); assert(c == n_messages + 2); @@ -178,10 +183,8 @@ int sd_nfnl_call_batch( sd_netlink *nfnl, sd_netlink_message **messages, size_t n_messages, - uint64_t usec, - sd_netlink_message ***ret_messages) { + uint64_t usec) { - _cleanup_free_ sd_netlink_message **replies = NULL; _cleanup_free_ uint32_t *serials = NULL; int r; @@ -190,26 +193,40 @@ int sd_nfnl_call_batch( assert_return(messages, -EINVAL); assert_return(n_messages > 0, -EINVAL); - if (ret_messages) { - replies = new0(sd_netlink_message*, n_messages); - if (!replies) - return -ENOMEM; - } - r = sd_nfnl_send_batch(nfnl, messages, n_messages, &serials); if (r < 0) return r; - for (size_t i = 0; i < n_messages; i++) - RET_GATHER(r, - sd_netlink_read(nfnl, serials[i], usec, ret_messages ? replies + i : NULL)); - if (r < 0) - return r; + for (size_t i = 1; i <= n_messages; i++) { + /* If we have received an error, kernel may not send replies for later messages. Let's ignore + * remaining replies. */ + if (r < 0) { + (void) sd_netlink_ignore_serial(nfnl, serials[i], usec); + continue; + } - if (ret_messages) - *ret_messages = TAKE_PTR(replies); + r = sd_netlink_read(nfnl, serials[i], usec, /* ret= */ NULL); + if (r != -ETIMEDOUT) + continue; - return 0; + /* The kernel returns some errors, e.g. unprivileged, to the BATCH_BEGIN. Hence, if we have + * not received any replies for the batch body, try to read an error in the reply for the + * batch begin. Note, since v6.10 (bf2ac490d28c21a349e9eef81edc45320fca4a3c), we can expect + * that the kernel always replies the batch begin and end. When we bump the kernel baseline, + * we can read the reply for the batch begin at first. */ + int k = sd_netlink_read(nfnl, serials[0], usec, /* ret= */ NULL); + if (k < 0) + r = k; + + serials[0] = 0; /* indicates that we have read the reply. */ + } + + /* Ignore replies for batch begin and end if we have not read them. */ + if (serials[0] != 0) + (void) sd_netlink_ignore_serial(nfnl, serials[0], usec); + (void) sd_netlink_ignore_serial(nfnl, serials[n_messages + 1], usec); + + return r; } int sd_nfnl_nft_message_new_basechain( diff --git a/src/libsystemd/sd-netlink/netlink-socket.c b/src/libsystemd/sd-netlink/netlink-socket.c index 28fd2bc3d9..385d92a64f 100644 --- a/src/libsystemd/sd-netlink/netlink-socket.c +++ b/src/libsystemd/sd-netlink/netlink-socket.c @@ -222,6 +222,16 @@ static int netlink_queue_received_message(sd_netlink *nl, sd_netlink_message *m) assert(nl); assert(m); + serial = message_get_serial(m); + if (serial != 0) { + NetlinkIgnoredSerial *s = hashmap_remove(nl->ignored_serials, UINT32_TO_PTR(serial)); + if (s) { + /* We are not interested in the message anymore. */ + free(s); + return 0; + } + } + if (ordered_set_size(nl->rqueue) >= NETLINK_RQUEUE_MAX) return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS), "sd-netlink: exhausted the read queue size (%d)", NETLINK_RQUEUE_MAX); @@ -235,7 +245,6 @@ static int netlink_queue_received_message(sd_netlink *nl, sd_netlink_message *m) if (sd_netlink_message_is_broadcast(m)) return 0; - serial = message_get_serial(m); if (serial == 0) return 0; diff --git a/src/libsystemd/sd-netlink/sd-netlink.c b/src/libsystemd/sd-netlink/sd-netlink.c index cd66b2e832..2deb49557d 100644 --- a/src/libsystemd/sd-netlink/sd-netlink.c +++ b/src/libsystemd/sd-netlink/sd-netlink.c @@ -126,6 +126,8 @@ static sd_netlink *netlink_free(sd_netlink *nl) { assert(nl); + hashmap_free(nl->ignored_serials); + ordered_set_free(nl->rqueue); hashmap_free(nl->rqueue_by_serial); hashmap_free(nl->rqueue_partial_by_serial); @@ -152,6 +154,96 @@ static sd_netlink *netlink_free(sd_netlink *nl) { DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free); +static usec_t netlink_now(sd_netlink *nl, clock_t clock) { + assert(nl); + + usec_t now_usec; + if (nl->event && sd_event_now(nl->event, clock, &now_usec) > 0) + return now_usec; + + return now(clock); +} + +static usec_t timespan_to_timestamp(sd_netlink *nl, usec_t usec) { + static bool default_timeout_set = false; + static usec_t default_timeout; + int r; + + assert(nl); + + if (usec == 0) { + if (!default_timeout_set) { + const char *e; + + default_timeout_set = true; + default_timeout = NETLINK_DEFAULT_TIMEOUT_USEC; + + e = secure_getenv("SYSTEMD_NETLINK_DEFAULT_TIMEOUT"); + if (e) { + r = parse_sec(e, &default_timeout); + if (r < 0) + log_debug_errno(r, "sd-netlink: Failed to parse $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable, ignoring: %m"); + } + } + + usec = default_timeout; + } + + return usec_add(netlink_now(nl, CLOCK_MONOTONIC), usec); +} + +static void netlink_trim_ignored_serials(sd_netlink *nl) { + NetlinkIgnoredSerial *s; + usec_t now_usec = 0; + + assert(nl); + + HASHMAP_FOREACH(s, nl->ignored_serials) { + if (s->timeout_usec == USEC_INFINITY) + continue; + + if (now_usec == 0) + now_usec = netlink_now(nl, CLOCK_MONOTONIC); + + if (s->timeout_usec < now_usec) + free(hashmap_remove(nl->ignored_serials, UINT32_TO_PTR(s->serial))); + } +} + +int sd_netlink_ignore_serial(sd_netlink *nl, uint32_t serial, uint64_t timeout_usec) { + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + assert_return(serial != 0, -EINVAL); + + timeout_usec = timespan_to_timestamp(nl, timeout_usec); + + NetlinkIgnoredSerial *existing = hashmap_get(nl->ignored_serials, UINT32_TO_PTR(serial)); + if (existing) { + existing->timeout_usec = timeout_usec; + return 0; + } + + netlink_trim_ignored_serials(nl); + + _cleanup_free_ NetlinkIgnoredSerial *s = new(NetlinkIgnoredSerial, 1); + if (!s) + return -ENOMEM; + + *s = (NetlinkIgnoredSerial) { + .serial = serial, + .timeout_usec = timeout_usec, + }; + + r = hashmap_ensure_put(&nl->ignored_serials, &trivial_hash_ops_value_free, UINT32_TO_PTR(s->serial), s); + if (r < 0) + return r; + + TAKE_PTR(s); + return 0; +} + int sd_netlink_send( sd_netlink *nl, sd_netlink_message *message, @@ -204,7 +296,6 @@ static int process_timeout(sd_netlink *nl) { _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; struct reply_callback *c; sd_netlink_slot *slot; - usec_t n; int r; assert(nl); @@ -213,8 +304,7 @@ static int process_timeout(sd_netlink *nl) { if (!c) return 0; - n = now(CLOCK_MONOTONIC); - if (c->timeout > n) + if (c->timeout > netlink_now(nl, CLOCK_MONOTONIC)) return 0; r = message_new_synthetic_error(nl, -ETIMEDOUT, c->serial, &m); @@ -337,6 +427,8 @@ static int process_running(sd_netlink *nl, sd_netlink_message **ret) { assert(nl); + netlink_trim_ignored_serials(nl); + r = process_timeout(nl); if (r != 0) goto null_message; @@ -384,32 +476,6 @@ int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret) { return r; } -static usec_t timespan_to_timestamp(usec_t usec) { - static bool default_timeout_set = false; - static usec_t default_timeout; - int r; - - if (usec == 0) { - if (!default_timeout_set) { - const char *e; - - default_timeout_set = true; - default_timeout = NETLINK_DEFAULT_TIMEOUT_USEC; - - e = secure_getenv("SYSTEMD_NETLINK_DEFAULT_TIMEOUT"); - if (e) { - r = parse_sec(e, &default_timeout); - if (r < 0) - log_debug_errno(r, "sd-netlink: Failed to parse $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable, ignoring: %m"); - } - } - - usec = default_timeout; - } - - return usec_add(now(CLOCK_MONOTONIC), usec); -} - static int netlink_poll(sd_netlink *nl, bool need_more, usec_t timeout_usec) { usec_t m = USEC_INFINITY; int r, e; @@ -434,7 +500,7 @@ static int netlink_poll(sd_netlink *nl, bool need_more, usec_t timeout_usec) { if (r < 0) return r; - m = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + m = usec_sub_unsigned(until, netlink_now(nl, CLOCK_MONOTONIC)); } r = fd_wait_for_event(nl->fd, e, MIN(m, timeout_usec)); @@ -508,7 +574,7 @@ int sd_netlink_call_async( return r; slot->reply_callback.callback = callback; - slot->reply_callback.timeout = timespan_to_timestamp(usec); + slot->reply_callback.timeout = timespan_to_timestamp(nl, usec); k = sd_netlink_send(nl, m, &slot->reply_callback.serial); if (k < 0) @@ -549,7 +615,7 @@ int sd_netlink_read( assert_return(nl, -EINVAL); assert_return(!netlink_pid_changed(nl), -ECHILD); - usec = timespan_to_timestamp(timeout); + usec = timespan_to_timestamp(nl, timeout); for (;;) { _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; @@ -591,7 +657,7 @@ int sd_netlink_read( if (usec != USEC_INFINITY) { usec_t n; - n = now(CLOCK_MONOTONIC); + n = netlink_now(nl, CLOCK_MONOTONIC); if (n >= usec) return -ETIMEDOUT; diff --git a/src/shared/firewall-util.c b/src/shared/firewall-util.c index b59740166e..0f4073a085 100644 --- a/src/shared/firewall-util.c +++ b/src/shared/firewall-util.c @@ -807,7 +807,7 @@ static int fw_nftables_init_family(sd_netlink *nfnl, int family) { return r; assert(msgcnt < ELEMENTSOF(messages)); - r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS); if (r < 0 && r != -EEXIST) return r; @@ -919,7 +919,7 @@ int nft_set_element_modify_iprange( if (r < 0) return r; - return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS); } int nft_set_element_modify_ip( @@ -959,7 +959,7 @@ int nft_set_element_modify_ip( if (r < 0) return r; - return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS); } int nft_set_element_modify_any( @@ -987,7 +987,7 @@ int nft_set_element_modify_any( if (r < 0) return r; - return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS); } static int af_to_nfproto(int af) { @@ -1124,7 +1124,7 @@ static int fw_nftables_add_local_dnat_internal( return r; assert(msgcnt < ELEMENTSOF(messages)); - r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS); if (r == -EOVERFLOW && af == AF_INET6) { /* The current implementation of DNAT in systemd requires kernel's * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns diff --git a/src/systemd/sd-netlink.h b/src/systemd/sd-netlink.h index b8f0481d49..10705d3fe2 100644 --- a/src/systemd/sd-netlink.h +++ b/src/systemd/sd-netlink.h @@ -56,6 +56,8 @@ int sd_netlink_call_async(sd_netlink *nl, sd_netlink_slot **ret_slot, sd_netlink int sd_netlink_call(sd_netlink *nl, sd_netlink_message *message, uint64_t timeout, sd_netlink_message **ret); int sd_netlink_read(sd_netlink *nl, uint32_t serial, uint64_t timeout, sd_netlink_message **ret); +int sd_netlink_ignore_serial(sd_netlink *nl, uint32_t serial, uint64_t timeout_usec); + int sd_netlink_get_events(sd_netlink *nl); int sd_netlink_get_timeout(sd_netlink *nl, uint64_t *ret); int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret); diff --git a/src/test/test-firewall-util.c b/src/test/test-firewall-util.c index d7080fa349..aea6cded5a 100644 --- a/src/test/test-firewall-util.c +++ b/src/test/test-firewall-util.c @@ -1,8 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ -#include -#include - #include "sd-netlink.h" #include "firewall-util.h" @@ -79,9 +76,6 @@ TEST(v4) { static int intro(void) { int r; - if (getuid() != 0) - return log_tests_skipped("not root"); - ASSERT_OK_ERRNO(setenv("SYSTEMD_FIREWALL_UTIL_NFT_TABLE_NAME", "io.systemd-test.nat", /* overwrite = */ true)); ASSERT_OK_ERRNO(setenv("SYSTEMD_FIREWALL_UTIL_DNAT_MAP_NAME", "test_map_port_ipport", /* overwrite = */ true));