[Intel-wired-lan] [net-next PATCH =v2] e1000: add initial XDP support

Fri Sep 2 06:50:31 UTC 2016

On Thu, 01 Sep 2016 14:39:44 -0700
John Fastabend <john.fastabend at gmail.com> wrote:

> From: Alexei Starovoitov <ast at fb.com>
> 
> This patch adds initial support for XDP on e1000 driver. Note e1000
> driver does not support page recycling in general which could be
> added as a further improvement. However XDP_DROP case will recycle.
> XDP_TX and XDP_PASS do not support recycling yet.
> 
> This patch includes the rcu_read_lock/rcu_read_unlock pair noted by
> Brenden Blanco in another pending patch.
> 
>   net/mlx4_en: protect ring->xdp_prog with rcu_read_lock
> 
> I tested this patch running e1000 in a VM using KVM over a tap
> device.
> 
> CC: William Tu <u9012063 at gmail.com>
> Signed-off-by: Alexei Starovoitov <ast at kernel.org>
> Signed-off-by: John Fastabend <john.r.fastabend at intel.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |    2 
>  drivers/net/ethernet/intel/e1000/e1000_main.c |  170 +++++++++++++++++++++++++
>  2 files changed, 169 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
> index d7bdea7..5cf8a0a 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000/e1000.h
> @@ -150,6 +150,7 @@ struct e1000_adapter;
>   */
>  struct e1000_tx_buffer {
>  	struct sk_buff *skb;
> +	struct page *page;
>  	dma_addr_t dma;
>  	unsigned long time_stamp;
>  	u16 length;
> @@ -279,6 +280,7 @@ struct e1000_adapter {
>  			     struct e1000_rx_ring *rx_ring,
>  			     int cleaned_count);
>  	struct e1000_rx_ring *rx_ring;      /* One per active queue */
> +	struct bpf_prog *prog;

The bpf_prog should be in the rx_ring structure.

>  	struct napi_struct napi;
>  
>  	int num_tx_queues;
> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
> index f42129d..141e32b 100644
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -32,6 +32,7 @@
>  #include <linux/prefetch.h>
>  #include <linux/bitops.h>
>  #include <linux/if_vlan.h>
> +#include <linux/bpf.h>
>  
>  char e1000_driver_name[] = "e1000";
>  static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
> @@ -842,6 +843,44 @@ static int e1000_set_features(struct net_device *netdev,
>  	return 0;
>  }
>  
> +static int e1000_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
> +{
> +	struct e1000_adapter *adapter = netdev_priv(netdev);
> +	struct bpf_prog *old_prog;
> +
> +	old_prog = xchg(&adapter->prog, prog);
> +	if (old_prog) {
> +		synchronize_net();
> +		bpf_prog_put(old_prog);
> +	}
> +
> +	if (netif_running(netdev))
> +		e1000_reinit_locked(adapter);
> +	else
> +		e1000_reset(adapter);
> +	return 0;
> +}
> +
> +static bool e1000_xdp_attached(struct net_device *dev)
> +{
> +	struct e1000_adapter *priv = netdev_priv(dev);
> +
> +	return !!priv->prog;
> +}
> +
> +static int e1000_xdp(struct net_device *dev, struct netdev_xdp *xdp)
> +{
> +	switch (xdp->command) {
> +	case XDP_SETUP_PROG:
> +		return e1000_xdp_set(dev, xdp->prog);
> +	case XDP_QUERY_PROG:
> +		xdp->prog_attached = e1000_xdp_attached(dev);
> +		return 0;
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  static const struct net_device_ops e1000_netdev_ops = {
>  	.ndo_open		= e1000_open,
>  	.ndo_stop		= e1000_close,
> @@ -860,6 +899,7 @@ static const struct net_device_ops e1000_netdev_ops = {
>  #endif
>  	.ndo_fix_features	= e1000_fix_features,
>  	.ndo_set_features	= e1000_set_features,
> +	.ndo_xdp		= e1000_xdp,
>  };
>  
>  /**
> @@ -1276,6 +1316,9 @@ static void e1000_remove(struct pci_dev *pdev)
>  	e1000_down_and_stop(adapter);
>  	e1000_release_manageability(adapter);
>  
> +	if (adapter->prog)
> +		bpf_prog_put(adapter->prog);
> +
>  	unregister_netdev(netdev);
>  
>  	e1000_phy_hw_reset(hw);
> @@ -1859,7 +1902,7 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
>  	struct e1000_hw *hw = &adapter->hw;
>  	u32 rdlen, rctl, rxcsum;
>  
> -	if (adapter->netdev->mtu > ETH_DATA_LEN) {
> +	if (adapter->netdev->mtu > ETH_DATA_LEN || adapter->prog) {
>  		rdlen = adapter->rx_ring[0].count *
>  			sizeof(struct e1000_rx_desc);
>  		adapter->clean_rx = e1000_clean_jumbo_rx_irq;
> @@ -1973,6 +2016,11 @@ e1000_unmap_and_free_tx_resource(struct e1000_adapter *adapter,
>  		dev_kfree_skb_any(buffer_info->skb);
>  		buffer_info->skb = NULL;
>  	}
> +	if (buffer_info->page) {
> +		put_page(buffer_info->page);
> +		buffer_info->page = NULL;
> +	}
> +
>  	buffer_info->time_stamp = 0;
>  	/* buffer_info must be completely set up in the transmit path */
>  }
> @@ -3298,6 +3346,62 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
>  	return NETDEV_TX_OK;
>  }
>  
> +static void e1000_tx_map_rxpage(struct e1000_tx_ring *tx_ring,
> +				struct e1000_rx_buffer *rx_buffer_info,
> +				unsigned int len)
> +{
> +	struct e1000_tx_buffer *buffer_info;
> +	unsigned int i = tx_ring->next_to_use;
> +
> +	buffer_info = &tx_ring->buffer_info[i];
> +
> +	buffer_info->length = len;
> +	buffer_info->time_stamp = jiffies;
> +	buffer_info->mapped_as_page = false;
> +	buffer_info->dma = rx_buffer_info->dma;
> +	buffer_info->next_to_watch = i;
> +	buffer_info->page = rx_buffer_info->rxbuf.page;
> +
> +	tx_ring->buffer_info[i].skb = NULL;
> +	tx_ring->buffer_info[i].segs = 1;
> +	tx_ring->buffer_info[i].bytecount = len;
> +	tx_ring->buffer_info[i].next_to_watch = i;
> +}
> +
> +static void e1000_xmit_raw_frame(struct e1000_rx_buffer *rx_buffer_info,
> +				 unsigned int len,
> +				 struct net_device *netdev,
> +				 struct e1000_adapter *adapter)
> +{
> +	struct netdev_queue *txq = netdev_get_tx_queue(netdev, 0);
> +	struct e1000_hw *hw = &adapter->hw;
> +	struct e1000_tx_ring *tx_ring;
> +
> +	if (len > E1000_MAX_DATA_PER_TXD)
> +		return;
> +
> +	/* e1000 only support a single txq at the moment so the queue is being
> +	 * shared with stack. To support this requires locking to ensure the
> +	 * stack and XDP are not running at the same time. Devices with
> +	 * multiple queues should allocate a separate queue space.
> +	 */
> +	HARD_TX_LOCK(netdev, txq, smp_processor_id());
> +
> +	tx_ring = adapter->tx_ring;
> +
> +	if (E1000_DESC_UNUSED(tx_ring) < 2)
> +		return;
> +
> +	e1000_tx_map_rxpage(tx_ring, rx_buffer_info, len);
> +
> +	e1000_tx_queue(adapter, tx_ring, 0/*tx_flags*/, 1);
> +
> +	writel(tx_ring->next_to_use, hw->hw_addr + tx_ring->tdt);
> +	mmiowb();
> +
> +	HARD_TX_UNLOCK(netdev, txq);
> +}

Above is going to give really bad XDP_TX performance. Both locking and
a HW TX tailptr pointer per TX packet, that is as bad as it gets.

You might say this is just for testing my eBPF-XDP program. BUT people
wanting to try XDP is going to start with this driver, and they will be
disappointed and never return (and no they will not read the comment in
the code).

It should be fairly easy to introduce a bulking/bundling XDP_TX
facility into the TX-ring (taking HARD_TX_LOCK a single time), and then
flush the TX-ring at the end of the loop (in e1000_clean_jumbo_rx_irq).
All you need is an array/stack of RX *buffer_info ptrs being build up
in the XDP_TX case. (Experiments show minimum bulking/array size should
be 8).

If you want to get fancy, and save space in the bulking structure,
then you can even just use the RX ring index "i" to describe which RX
packets need to be XDP_TX'ed. (as the driver code "owns" this part of
the ring, until updating rx_ring->next_to_clean).

>  #define NUM_REGS 38 /* 1 based count */
>  static void e1000_regdump(struct e1000_adapter *adapter)
>  {
> @@ -4142,6 +4246,19 @@ static struct sk_buff *e1000_alloc_rx_skb(struct e1000_adapter *adapter,
>  	return skb;
>  }
>  
> +static inline int e1000_call_bpf(struct bpf_prog *prog, void *data,
> +				 unsigned int length)
> +{
> +	struct xdp_buff xdp;
> +	int ret;
> +
> +	xdp.data = data;
> +	xdp.data_end = data + length;
> +	ret = BPF_PROG_RUN(prog, (void *)&xdp);
> +
> +	return ret;
> +}
> +
>  /**
>   * e1000_clean_jumbo_rx_irq - Send received data up the network stack; legacy
>   * @adapter: board private structure
> @@ -4160,12 +4277,15 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>  	struct pci_dev *pdev = adapter->pdev;
>  	struct e1000_rx_desc *rx_desc, *next_rxd;
>  	struct e1000_rx_buffer *buffer_info, *next_buffer;
> +	struct bpf_prog *prog;
>  	u32 length;
>  	unsigned int i;
>  	int cleaned_count = 0;
>  	bool cleaned = false;
>  	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
>  
> +	rcu_read_lock(); /* rcu lock needed here to protect xdp programs */
> +	prog = READ_ONCE(adapter->prog);
>  	i = rx_ring->next_to_clean;
>  	rx_desc = E1000_RX_DESC(*rx_ring, i);
>  	buffer_info = &rx_ring->buffer_info[i];
> @@ -4191,12 +4311,55 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter,
>  
>  		cleaned = true;
>  		cleaned_count++;
> +		length = le16_to_cpu(rx_desc->length);
> +
> +		if (prog) {
> +			struct page *p = buffer_info->rxbuf.page;
> +			dma_addr_t dma = buffer_info->dma;
> +			int act;
> +
> +			if (unlikely(!(status & E1000_RXD_STAT_EOP))) {
> +				/* attached bpf disallows larger than page
> +				 * packets, so this is hw error or corruption
> +				 */
> +				pr_info_once("%s buggy !eop\n", netdev->name);
> +				break;
> +			}
> +			if (unlikely(rx_ring->rx_skb_top)) {
> +				pr_info_once("%s ring resizing bug\n",
> +					     netdev->name);
> +				break;
> +			}
> +			dma_sync_single_for_cpu(&pdev->dev, dma,
> +						length, DMA_FROM_DEVICE);
> +			act = e1000_call_bpf(prog, page_address(p), length);
> +			switch (act) {
> +			case XDP_PASS:
> +				break;
> +			case XDP_TX:
> +				dma_sync_single_for_device(&pdev->dev,
> +							   dma,
> +							   length,
> +							   DMA_TO_DEVICE);
> +				e1000_xmit_raw_frame(buffer_info, length,
> +						     netdev, adapter);
> +				buffer_info->rxbuf.page = NULL;
> +			case XDP_DROP:
> +			default:
> +				/* re-use mapped page. keep buffer_info->dma
> +				 * as-is, so that e1000_alloc_jumbo_rx_buffers
> +				 * only needs to put it back into rx ring
> +				 */
> +				total_rx_bytes += length;
> +				total_rx_packets++;
> +				goto next_desc;
> +			}
> +		}
> +
>  		dma_unmap_page(&pdev->dev, buffer_info->dma,
>  			       adapter->rx_buffer_len, DMA_FROM_DEVICE);
>  		buffer_info->dma = 0;
>  
> -		length = le16_to_cpu(rx_desc->length);
> -
>  		/* errors is only valid for DD + EOP descriptors */
>  		if (unlikely((status & E1000_RXD_STAT_EOP) &&
>  		    (rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK))) {
> @@ -4330,6 +4493,7 @@ next_desc:
>  		rx_desc = next_rxd;
>  		buffer_info = next_buffer;
>  	}
> +	rcu_read_unlock();
>  	rx_ring->next_to_clean = i;
>  
>  	cleaned_count = E1000_DESC_UNUSED(rx_ring);
> 

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer