[Intel-wired-lan] [PATCH 09/10] igc: Enable RX via AF_XDP zero-copy

Maciej Fijalkowski maciej.fijalkowski at intel.com
Tue Dec 22 13:14:28 UTC 2020


On Thu, Dec 17, 2020 at 12:24:14PM -0800, Andre Guedes wrote:
> This patch adds support for receiving packets via AF_XDP zero-copy
> mechanism.
> 
> A new flag is added to 'enum igc_ring_flags_t' to indicate the ring has
> AF_XDP zero-copy enabled so proper ring setup is carried out during ring
> configuration in igc_configure_rx_ring().
> 
> RX buffers can now be allocated via the shared pages mechanism (default
> behavior of the driver) or via xsk pool (when AF_XDP zero-copy is
> enabled) so a union is added to the 'struct igc_rx_buffer' to cover both
> cases.
> 
> When AF_XDP zero-copy is enabled, rx buffers are allocated from the xsk
> pool using the new helper igc_alloc_rx_buffers_zc() which is the
> counterpart of igc_alloc_rx_buffers().
> 
> Likewise other Intel drivers that support AF_XDP zero-copy, in igc we
> have a dedicated path for cleaning up rx irqs when zero-copy is enabled.
> This avoids adding too many checks within igc_clean_rx_irq(), resulting
> in a more readable and efficient code since this function is called from
> the hot-path of the driver.
> 
> Signed-off-by: Andre Guedes <andre.guedes at intel.com>
> ---
>  drivers/net/ethernet/intel/igc/igc.h      |  22 +-
>  drivers/net/ethernet/intel/igc/igc_base.h |   1 +
>  drivers/net/ethernet/intel/igc/igc_main.c | 334 +++++++++++++++++++++-
>  drivers/net/ethernet/intel/igc/igc_xdp.c  |  98 +++++++
>  drivers/net/ethernet/intel/igc/igc_xdp.h  |   2 +
>  5 files changed, 438 insertions(+), 19 deletions(-)
> 

[...]

> +
> +static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
> +{
> +	struct igc_adapter *adapter = q_vector->adapter;
> +	struct igc_ring *ring = q_vector->rx.ring;
> +	u16 cleaned_count = igc_desc_unused(ring);
> +	int total_bytes = 0, total_packets = 0;
> +	struct bpf_prog *prog;
> +	bool failure = false;
> +	int xdp_status = 0;
> +
> +	rcu_read_lock();
> +
> +	prog = READ_ONCE(adapter->xdp_prog);
> +
> +	while (likely(total_packets < budget)) {
> +		union igc_adv_rx_desc *desc;
> +		struct igc_rx_buffer *bi;
> +		unsigned int size;
> +		int res;
> +
> +		desc = IGC_RX_DESC(ring, ring->next_to_clean);
> +		size = le16_to_cpu(desc->wb.upper.length);
> +		if (!size)
> +			break;
> +
> +		/* This memory barrier is needed to keep us from reading
> +		 * any other fields out of the rx_desc until we know the
> +		 * descriptor has been written back
> +		 */
> +		dma_rmb();
> +
> +		bi = &ring->rx_buffer_info[ring->next_to_clean];
> +
> +		if (igc_test_staterr(desc, IGC_RXDADV_STAT_TSIP)) {
> +			/* FIXME: For now, if packet buffer contains timestamp
> +			 * information, we discard it. Once XDP infrastructe

nit: s/infrastructe/infrastructure

> +			 * provides a way to report hardware timestamps, the
> +			 * code below should be fixed.
> +			 */
> +			bi->xdp->data += IGC_TS_HDR_LEN;
> +			size -= IGC_TS_HDR_LEN;
> +		}
> +
> +		bi->xdp->data_end = bi->xdp->data + size;
> +		xsk_buff_dma_sync_for_cpu(bi->xdp, ring->xsk_pool);
> +
> +		res = igc_xdp_do_run_prog(adapter, prog, bi->xdp);
> +		switch (res) {
> +		case IGC_XDP_PASS:
> +			igc_dispatch_skb_zc(q_vector, desc, bi->xdp);
> +			fallthrough;
> +		case IGC_XDP_CONSUMED:
> +			xsk_buff_free(bi->xdp);
> +			break;
> +		case IGC_XDP_TX:
> +		case IGC_XDP_REDIRECT:
> +			xdp_status |= res;
> +			break;
> +		}
> +
> +		bi->xdp = NULL;
> +		total_bytes += size;
> +		total_packets++;
> +		cleaned_count++;
> +		ring->next_to_clean++;
> +		if (ring->next_to_clean == ring->count)
> +			ring->next_to_clean = 0;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	if (cleaned_count >= IGC_RX_BUFFER_WRITE)
> +		failure = !igc_alloc_rx_buffers_zc(ring, cleaned_count);
> +
> +	if (xdp_status)
> +		igc_finalize_xdp(adapter, xdp_status);
> +
> +	igc_update_rx_stats(q_vector, total_packets, total_bytes);
> +
> +	if (xsk_uses_need_wakeup(ring->xsk_pool)) {
> +		if (failure || ring->next_to_clean == ring->next_to_use)
> +			xsk_set_rx_need_wakeup(ring->xsk_pool);
> +		else
> +			xsk_clear_rx_need_wakeup(ring->xsk_pool);
> +		return total_packets;
> +	}
> +
> +	return failure ? budget : total_packets;
> +}
> +
>  static void igc_update_tx_stats(struct igc_q_vector *q_vector,
>  				unsigned int packets, unsigned int bytes)
>  {
> @@ -2958,7 +3188,10 @@ static void igc_configure(struct igc_adapter *adapter)
>  	for (i = 0; i < adapter->num_rx_queues; i++) {
>  		struct igc_ring *ring = adapter->rx_ring[i];
>  
> -		igc_alloc_rx_buffers(ring, igc_desc_unused(ring));
> +		if (ring->xsk_pool)
> +			igc_alloc_rx_buffers_zc(ring, igc_desc_unused(ring));
> +		else
> +			igc_alloc_rx_buffers(ring, igc_desc_unused(ring));
>  	}
>  }
>  
> @@ -3573,14 +3806,18 @@ static int igc_poll(struct napi_struct *napi, int budget)
>  	struct igc_q_vector *q_vector = container_of(napi,
>  						     struct igc_q_vector,
>  						     napi);
> +	struct igc_ring *rx_ring = q_vector->rx.ring;
> +
>  	bool clean_complete = true;
>  	int work_done = 0;
>  
>  	if (q_vector->tx.ring)
>  		clean_complete = igc_clean_tx_irq(q_vector, budget);
>  
> -	if (q_vector->rx.ring) {
> -		int cleaned = igc_clean_rx_irq(q_vector, budget);
> +	if (rx_ring) {
> +		int cleaned = rx_ring->xsk_pool ?
> +			      igc_clean_rx_irq_zc(q_vector, budget) :
> +			      igc_clean_rx_irq(q_vector, budget);
>  
>  		work_done += cleaned;
>  		if (cleaned >= budget)
> @@ -5161,6 +5398,9 @@ static int igc_bpf(struct net_device *dev, struct netdev_bpf *bpf)
>  	switch (bpf->command) {
>  	case XDP_SETUP_PROG:
>  		return igc_xdp_set_prog(adapter, bpf->prog, bpf->extack);
> +	case XDP_SETUP_XSK_POOL:
> +		return igc_xdp_setup_pool(adapter, bpf->xsk.pool,
> +					  bpf->xsk.queue_id);
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> @@ -5206,6 +5446,43 @@ static int igc_xdp_xmit(struct net_device *dev, int num_frames,
>  	return num_frames - drops;
>  }
>  
> +static void igc_trigger_rxtxq_interrupt(struct igc_adapter *adapter,
> +					struct igc_q_vector *q_vector)
> +{
> +	struct igc_hw *hw = &adapter->hw;
> +	u32 eics = 0;
> +
> +	eics |= q_vector->eims_value;
> +	wr32(IGC_EICS, eics);
> +}
> +
> +int igc_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
> +{
> +	struct igc_adapter *adapter = netdev_priv(dev);
> +	struct igc_q_vector *q_vector;
> +	struct igc_ring *ring;
> +
> +	if (test_bit(__IGC_DOWN, &adapter->state))
> +		return -ENETDOWN;
> +
> +	if (!igc_xdp_is_enabled(adapter))
> +		return -ENXIO;
> +
> +	if (queue_id >= adapter->num_rx_queues)
> +		return -EINVAL;
> +
> +	ring = adapter->rx_ring[queue_id];
> +
> +	if (!ring->xsk_pool)
> +		return -ENXIO;

ring local variable feels a bit unnecessary.

> +
> +	q_vector = adapter->q_vector[queue_id];
> +	if (!napi_if_scheduled_mark_missed(&q_vector->napi))
> +		igc_trigger_rxtxq_interrupt(adapter, q_vector);
> +
> +	return 0;
> +}
> +
>  static const struct net_device_ops igc_netdev_ops = {
>  	.ndo_open		= igc_open,
>  	.ndo_stop		= igc_close,
> @@ -5221,6 +5498,7 @@ static const struct net_device_ops igc_netdev_ops = {
>  	.ndo_setup_tc		= igc_setup_tc,
>  	.ndo_bpf		= igc_bpf,
>  	.ndo_xdp_xmit		= igc_xdp_xmit,
> +	.ndo_xsk_wakeup		= igc_xsk_wakeup,
>  };
>  
>  /* PCIe configuration access */
> @@ -5974,6 +6252,36 @@ struct net_device *igc_get_hw_dev(struct igc_hw *hw)
>  	return adapter->netdev;
>  }
>  
> +static void igc_disable_rx_ring_hw(struct igc_ring *ring)
> +{
> +	struct igc_hw *hw = &ring->q_vector->adapter->hw;
> +	u8 idx = ring->reg_idx;
> +	u32 rxdctl;
> +
> +	rxdctl = rd32(IGC_RXDCTL(idx));
> +	rxdctl &= ~IGC_RXDCTL_QUEUE_ENABLE;
> +	rxdctl |= IGC_RXDCTL_SWFLUSH;
> +	wr32(IGC_RXDCTL(idx), rxdctl);
> +}
> +
> +void igc_disable_rx_ring(struct igc_ring *ring)
> +{
> +	igc_disable_rx_ring_hw(ring);
> +	igc_clean_rx_ring(ring);
> +}
> +
> +void igc_enable_rx_ring(struct igc_ring *ring)
> +{
> +	struct igc_adapter *adapter = ring->q_vector->adapter;
> +
> +	igc_configure_rx_ring(adapter, ring);
> +
> +	if (ring->xsk_pool)
> +		igc_alloc_rx_buffers_zc(ring, igc_desc_unused(ring));
> +	else
> +		igc_alloc_rx_buffers(ring, igc_desc_unused(ring));
> +}
> +
>  /**
>   * igc_init_module - Driver Registration Routine
>   *
> diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c
> index 27c886a254f1..9515906d54e0 100644
> --- a/drivers/net/ethernet/intel/igc/igc_xdp.c
> +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c
> @@ -1,6 +1,8 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /* Copyright (c) 2020, Intel Corporation. */
>  
> +#include <net/xdp_sock_drv.h>
> +
>  #include "igc.h"
>  #include "igc_xdp.h"
>  
> @@ -31,3 +33,99 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog,
>  
>  	return 0;
>  }
> +
> +static int igc_xdp_enable_pool(struct igc_adapter *adapter,
> +			       struct xsk_buff_pool *pool, u16 queue_id)
> +{
> +	struct net_device *ndev = adapter->netdev;
> +	struct device *dev = &adapter->pdev->dev;
> +	struct igc_ring *rx_ring;
> +	struct napi_struct *napi;
> +	bool needs_reset;
> +	u32 frame_size;
> +	int err;
> +
> +	if (queue_id >= adapter->num_rx_queues)
> +		return -EINVAL;
> +
> +	frame_size = xsk_pool_get_rx_frame_size(pool);
> +	if (frame_size < ETH_FRAME_LEN + VLAN_HLEN * 2) {
> +		/* For now, when XDP is enabled, the driver doesn't support
> +		 * frames that span over multiple buffers. The max frame size
> +		 * considered here is the ethernet frame size + vlan double
> +		 * tagging.
> +		 */

I don't really follow that. You check if chunk size is less than
ETH_FRAME_LEN + VLAN_HLEN * 2. chunk size will be at least 2k if I recall
correctly. How is that related to fragmented buffers?

> +		return -EOPNOTSUPP;
> +	}
> +
> +	err = xsk_pool_dma_map(pool, dev, IGC_RX_DMA_ATTR);
> +	if (err) {
> +		netdev_err(ndev, "Failed to map xsk pool\n");
> +		return err;
> +	}
> +
> +	needs_reset = netif_running(adapter->netdev) && igc_xdp_is_enabled(adapter);
> +
> +	rx_ring = adapter->rx_ring[queue_id];
> +	napi = &rx_ring->q_vector->napi;
> +
> +	if (needs_reset) {
> +		igc_disable_rx_ring(rx_ring);
> +		napi_disable(napi);
> +	}
> +
> +	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
> +
> +	if (needs_reset) {
> +		napi_enable(napi);
> +		igc_enable_rx_ring(rx_ring);
> +
> +		err = igc_xsk_wakeup(ndev, queue_id, XDP_WAKEUP_RX);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}


More information about the Intel-wired-lan mailing list