diff --git a/doc/userguide/capture-hardware/dpdk.rst b/doc/userguide/capture-hardware/dpdk.rst index 91ae1c876ca9..c68193a38d38 100644 --- a/doc/userguide/capture-hardware/dpdk.rst +++ b/doc/userguide/capture-hardware/dpdk.rst @@ -95,3 +95,41 @@ management and worker CPU set. - worker-cpu-set: cpu: [ 2,4,6,8 ] ... + +Interrupt (power-saving) mode +----------------------------- + +The DPDK is traditionally recognized for its polling mode operation. +In this mode, CPU cores are continuously querying for packets from +the Network Interface Card (NIC). While this approach offers benefits like +reduced latency and improved performance, it might not be the most efficient +in scenarios with sporadic or low traffic. +The constant polling can lead to unnecessary CPU consumption. +To address this, DPDK offers an `interrupt` mode. + +The obvious advantage that interrupt mode brings is power efficiency. +So far in our tests, we haven't observed a decrease in performance. Suricata's +performance has actually seen a slight improvement. +The (IPS runmode) users should be aware that interrupts can +introduce non-deterministic latency. However, the latency should never be +higher than in other (e.g. AF_PACKET/AF_XDP/...) capture methods. + +Interrupt mode in DPDK can be configured on a per-interface basis. +This allows for a hybrid setup where some workers operate in polling mode, +while others utilize the interrupt mode. +The configuration for the interrupt mode can be found and modified in the +DPDK section of the suricata.yaml file. + +Below is a sample configuration that demonstrates how to enable the interrupt mode for a specific interface: + +:: + + ... + dpdk: + eal-params: + proc-type: primary + + interfaces: + - interface: 0000:3b:00.0 + interrupt-mode: true + threads: 4 diff --git a/src/runmode-dpdk.c b/src/runmode-dpdk.c index 2cdf5cb32505..8a7643b250e6 100644 --- a/src/runmode-dpdk.c +++ b/src/runmode-dpdk.c @@ -111,6 +111,7 @@ static void *ParseDpdkConfigAndConfigureDevice(const char *iface); static void DPDKDerefConfig(void *conf); #define DPDK_CONFIG_DEFAULT_THREADS "auto" +#define DPDK_CONFIG_DEFAULT_INTERRUPT_MODE false #define DPDK_CONFIG_DEFAULT_MEMPOOL_SIZE 65535 #define DPDK_CONFIG_DEFAULT_MEMPOOL_CACHE_SIZE "auto" #define DPDK_CONFIG_DEFAULT_RX_DESCRIPTORS 1024 @@ -126,6 +127,7 @@ static void DPDKDerefConfig(void *conf); DPDKIfaceConfigAttributes dpdk_yaml = { .threads = "threads", + .irq_mode = "interrupt-mode", .promisc = "promisc", .multicast = "multicast", .checksum_checks = "checksum-checks", @@ -434,6 +436,15 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str) SCReturnInt(0); } +static bool ConfigSetInterruptMode(DPDKIfaceConfig *iconf, bool enable) +{ + SCEnter(); + if (enable) + iconf->flags |= DPDK_IRQ_MODE; + + SCReturnBool(true); +} + static int ConfigSetRxQueues(DPDKIfaceConfig *iconf, uint16_t nb_queues) { SCEnter(); @@ -695,6 +706,17 @@ static int ConfigLoad(DPDKIfaceConfig *iconf, const char *iface) if (retval < 0) SCReturnInt(retval); + bool irq_enable; + retval = ConfGetChildValueBoolWithDefault(if_root, if_default, dpdk_yaml.irq_mode, &entry_bool); + if (retval != 1) { + irq_enable = DPDK_CONFIG_DEFAULT_INTERRUPT_MODE; + } else { + irq_enable = entry_bool ? true : false; + } + retval = ConfigSetInterruptMode(iconf, irq_enable); + if (retval != true) + SCReturnInt(-EINVAL); + // currently only mapping "1 thread == 1 RX (and 1 TX queue in IPS mode)" is supported retval = ConfigSetRxQueues(iconf, (uint16_t)iconf->threads); if (retval < 0) @@ -1106,6 +1128,11 @@ static void DeviceInitPortConf(const DPDKIfaceConfig *iconf, }, }; + SCLogConfig("%s: interrupt mode is %s", iconf->iface, + iconf->flags & DPDK_IRQ_MODE ? "enabled" : "disabled"); + if (iconf->flags & DPDK_IRQ_MODE) + port_conf->intr_conf.rxq = 1; + // configure RX offloads if (dev_info->rx_offload_capa & RTE_ETH_RX_OFFLOAD_RSS_HASH) { if (iconf->nb_rx_queues > 1) { diff --git a/src/runmode-dpdk.h b/src/runmode-dpdk.h index a00327ba9e24..152c1d687893 100644 --- a/src/runmode-dpdk.h +++ b/src/runmode-dpdk.h @@ -25,6 +25,7 @@ typedef struct DPDKIfaceConfigAttributes_ { const char *threads; + const char *irq_mode; const char *promisc; const char *multicast; const char *checksum_checks; diff --git a/src/source-dpdk.c b/src/source-dpdk.c index 54503e212271..480bb6c63676 100644 --- a/src/source-dpdk.c +++ b/src/source-dpdk.c @@ -93,6 +93,13 @@ TmEcode NoDPDKSupportExit(ThreadVars *tv, const void *initdata, void **data) #define BURST_SIZE 32 static struct timeval machine_start_time = { 0, 0 }; +// interrupt mode constants +#define MIN_ZERO_POLL_COUNT 10U +#define MIN_ZERO_POLL_COUNT_TO_SLEEP 10U +#define MINIMUM_SLEEP_TIME_US 1U +#define STANDARD_SLEEP_TIME_US 100U +#define MAX_EPOLL_TIMEOUT_MS 500U +static rte_spinlock_t intr_lock[RTE_MAX_ETHPORTS]; /** * \brief Structure to hold thread specific variables. @@ -104,6 +111,7 @@ typedef struct DPDKThreadVars_ { TmSlot *slot; LiveDevice *livedev; ChecksumValidationMode checksum_mode; + bool intr_enabled; /* references to packet and drop counters */ uint16_t capture_dpdk_packets; uint16_t capture_dpdk_rx_errs; @@ -142,6 +150,40 @@ static uint64_t CyclesToSeconds(uint64_t cycles); static void DPDKFreeMbufArray(struct rte_mbuf **mbuf_array, uint16_t mbuf_cnt, uint16_t offset); static uint64_t DPDKGetSeconds(void); +static bool InterruptsRXEnable(uint16_t port_id, uint16_t queue_id) +{ + uint32_t event_data = port_id << UINT16_WIDTH | queue_id; + int32_t ret = rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD, + RTE_INTR_EVENT_ADD, (void *)((uintptr_t)event_data)); + + if (ret != 0) { + SCLogError("%s-Q%d: failed to enable interrupt mode: %s", DPDKGetPortNameByPortID(port_id), + queue_id, rte_strerror(-ret)); + return false; + } + return true; +} + +static inline uint32_t InterruptsSleepHeuristic(uint32_t no_pkt_polls_count) +{ + if (no_pkt_polls_count < MIN_ZERO_POLL_COUNT_TO_SLEEP) + return MINIMUM_SLEEP_TIME_US; + + return STANDARD_SLEEP_TIME_US; +} + +static inline void InterruptsTurnOnOff(uint16_t port_id, uint16_t queue_id, bool on) +{ + rte_spinlock_lock(&(intr_lock[port_id])); + + if (on) + rte_eth_dev_rx_intr_enable(port_id, queue_id); + else + rte_eth_dev_rx_intr_disable(port_id, queue_id); + + rte_spinlock_unlock(&(intr_lock[port_id])); +} + static void DPDKFreeMbufArray(struct rte_mbuf **mbuf_array, uint16_t mbuf_cnt, uint16_t offset) { for (int i = offset; i < mbuf_cnt; i++) { @@ -377,6 +419,11 @@ static TmEcode ReceiveDPDKLoop(ThreadVars *tv, void *data, void *slot) rte_eth_stats_reset(ptv->port_id); rte_eth_xstats_reset(ptv->port_id); + + uint32_t pwd_zero_rx_packet_polls_count = 0; + if (ptv->intr_enabled && !InterruptsRXEnable(ptv->port_id, ptv->queue_id)) + SCReturnInt(TM_ECODE_FAILED); + while (1) { if (unlikely(suricata_ctl_flags != 0)) { SCLogDebug("Stopping Suricata!"); @@ -398,7 +445,27 @@ static TmEcode ReceiveDPDKLoop(ThreadVars *tv, void *data, void *slot) TmThreadsCaptureHandleTimeout(tv, NULL); last_timeout_msec = msecs; } - continue; + + if (!ptv->intr_enabled) + continue; + + pwd_zero_rx_packet_polls_count++; + if (pwd_zero_rx_packet_polls_count <= MIN_ZERO_POLL_COUNT) + continue; + + uint32_t pwd_idle_hint = InterruptsSleepHeuristic(pwd_zero_rx_packet_polls_count); + + if (pwd_idle_hint < STANDARD_SLEEP_TIME_US) { + rte_delay_us(pwd_idle_hint); + } else { + InterruptsTurnOnOff(ptv->port_id, ptv->queue_id, true); + struct rte_epoll_event event; + rte_epoll_wait(RTE_EPOLL_PER_THREAD, &event, 1, MAX_EPOLL_TIMEOUT_MS); + InterruptsTurnOnOff(ptv->port_id, ptv->queue_id, false); + continue; + } + } else if (ptv->intr_enabled && pwd_zero_rx_packet_polls_count) { + pwd_zero_rx_packet_polls_count = 0; } ptv->pkts += (uint64_t)nb_rx; @@ -522,6 +589,7 @@ static TmEcode ReceiveDPDKThreadInit(ThreadVars *tv, const void *initdata, void ptv->checksum_mode = dpdk_config->checksum_mode; ptv->threads = dpdk_config->threads; + ptv->intr_enabled = (dpdk_config->flags & DPDK_IRQ_MODE) ? true : false; ptv->port_id = dpdk_config->port_id; ptv->out_port_id = dpdk_config->out_port_id; ptv->port_socket_id = dpdk_config->socket_id; @@ -569,6 +637,9 @@ static TmEcode ReceiveDPDKThreadInit(ThreadVars *tv, const void *initdata, void "%s: unable to determine NIC's NUMA node, degraded performance can be expected", dpdk_config->iface); } + if (ptv->intr_enabled) { + rte_spinlock_init(&intr_lock[ptv->port_id]); + } } *data = (void *)ptv; diff --git a/src/source-dpdk.h b/src/source-dpdk.h index 3fdb63cb35d9..b962d866d4bd 100644 --- a/src/source-dpdk.h +++ b/src/source-dpdk.h @@ -38,6 +38,7 @@ typedef enum { DPDK_COPY_MODE_NONE, DPDK_COPY_MODE_TAP, DPDK_COPY_MODE_IPS } Dpd // General flags #define DPDK_PROMISC (1 << 0) /**< Promiscuous mode */ #define DPDK_MULTICAST (1 << 1) /**< Enable multicast packets */ +#define DPDK_IRQ_MODE (1 << 2) /**< Interrupt mode */ // Offloads #define DPDK_RX_CHECKSUM_OFFLOAD (1 << 4) /**< Enable chsum offload */ diff --git a/suricata.yaml.in b/suricata.yaml.in index 630399126dbe..e02d229a2d82 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -753,6 +753,7 @@ dpdk: # - auto takes all cores # in IPS mode it is required to specify the number of cores and the numbers on both interfaces must match threads: auto + # interrupt-mode: false # true to switch to interrupt mode promisc: true # promiscuous mode - capture all packets multicast: true # enables also detection on multicast packets checksum-checks: true # if Suricata should validate checksums