[Intel-wired-lan] [net-next PATCH v2 1/3] ixgbe: add XDP support for pass and drop actions

Alexander Duyck alexander.duyck at gmail.com
Thu Mar 2 16:22:42 UTC 2017


On Wed, Mar 1, 2017 at 4:54 PM, John Fastabend <john.fastabend at gmail.com> wrote:
> Basic XDP drop support for ixgbe. Uses READ_ONCE/xchg semantics on XDP
> programs instead of rcu primitives as suggested by Daniel Borkmann and
> Alex Duyck.
>
> Signed-off-by: John Fastabend <john.r.fastabend at intel.com>

So it still looks like this is doing a bunch of hacking in the Rx
path.  I suggested a few items below to streamline this.

> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    2
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  114 ++++++++++++++++++++++++-
>  2 files changed, 113 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> index b812913..2d12c24 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> @@ -273,6 +273,7 @@ struct ixgbe_ring {
>         struct ixgbe_ring *next;        /* pointer to next ring in q_vector */
>         struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
>         struct net_device *netdev;      /* netdev ring belongs to */
> +       struct bpf_prog *xdp_prog;
>         struct device *dev;             /* device for DMA mapping */
>         struct ixgbe_fwd_adapter *l2_accel_priv;
>         void *desc;                     /* descriptor ring memory */
> @@ -510,6 +511,7 @@ struct ixgbe_adapter {
>         unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
>         /* OS defined structs */
>         struct net_device *netdev;
> +       struct bpf_prog *xdp_prog;
>         struct pci_dev *pdev;
>
>         unsigned long state;
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index e3da397..0b802b5 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -49,6 +49,9 @@
>  #include <linux/if_macvlan.h>
>  #include <linux/if_bridge.h>
>  #include <linux/prefetch.h>
> +#include <linux/bpf.h>
> +#include <linux/bpf_trace.h>
> +#include <linux/atomic.h>
>  #include <scsi/fc/fc_fcoe.h>
>  #include <net/udp_tunnel.h>
>  #include <net/pkt_cls.h>
> @@ -2051,7 +2054,7 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
>                 /* hand second half of page back to the ring */
>                 ixgbe_reuse_rx_page(rx_ring, rx_buffer);
>         } else {
> -               if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
> +               if (skb && IXGBE_CB(skb)->dma == rx_buffer->dma) {
>                         /* the page has been released from the ring */
>                         IXGBE_CB(skb)->page_released = true;
>                 } else {

So you can probably change this to "if (!IS_ERR(skb) &&
IXGBE_CB(skb)->dma == rx_buffer->dma) {" more on why below.

> @@ -2157,6 +2160,50 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
>         return skb;
>  }
>
> +#define IXGBE_XDP_PASS 0
> +#define IXGBE_XDP_CONSUMED 1
> +
> +static int ixgbe_run_xdp(struct ixgbe_ring  *rx_ring,
> +                        struct ixgbe_rx_buffer *rx_buffer,
> +                        unsigned int size)
> +{

Change the return type here to an sk_buff pointer.

> +       int result = IXGBE_XDP_PASS;
> +       struct bpf_prog *xdp_prog;
> +       struct xdp_buff xdp;
> +       void *addr;
> +       u32 act;
> +
> +       rcu_read_lock();
> +       xdp_prog = READ_ONCE(rx_ring->xdp_prog);
> +
> +       if (!xdp_prog)
> +               goto xdp_out;
> +
> +       addr = page_address(rx_buffer->page) + rx_buffer->page_offset;
> +       xdp.data_hard_start = addr;
> +       xdp.data = addr;
> +       xdp.data_end = addr + size;
> +
> +       act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +       switch (act) {
> +       case XDP_PASS:
> +               goto xdp_out;
> +       default:
> +               bpf_warn_invalid_xdp_action(act);
> +       case XDP_TX:
> +       case XDP_ABORTED:
> +               trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
> +               /* fallthrough -- handle aborts by dropping packet */
> +       case XDP_DROP:
> +               rx_buffer->pagecnt_bias++; /* give page back */
> +               result = IXGBE_XDP_CONSUMED;
> +               break;
> +       }
> +xdp_out:
> +       rcu_read_unlock();
> +       return result;
> +}
> +

Instead of returning result directly I would suggest returning
"ERR_PTR(-result)", or you can cast IXGBE_XDP_CONSUMED as - 1 and just
return "ERR_PTR(result)".

>  /**
>   * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
>   * @q_vector: structure containing interrupt and ring information
> @@ -2187,6 +2234,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
>                 struct ixgbe_rx_buffer *rx_buffer;
>                 struct sk_buff *skb;
>                 unsigned int size;
> +               int consumed;
>
>                 /* return some buffers to hardware, one at a time is too slow */
>                 if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {

With the changes above this bit becomes unnecessary.

> @@ -2207,6 +2255,18 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
>
>                 rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);
>
> +               consumed = ixgbe_run_xdp(rx_ring, rx_buffer, size);
> +               rcu_read_unlock();
> +

Did you leave an rcu_read_unlock here?  I'm assuming you meant to drop this.

> +               if (consumed) {
> +                       ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb);
> +                       cleaned_count++;
> +                       ixgbe_is_non_eop(rx_ring, rx_desc, skb);
> +                       total_rx_packets++;
> +                       total_rx_bytes += size;
> +                       continue;
> +               }
> +

Looks like this wasn't adding one back to the pagecnt_bias.  That
would trigger a memory leak.

This bit will need to change a bit.  Basically it should become:
    /* retrieve a buffer from the ring */
    if (!skb)
        skb = ixgbe_run_xdp(rx_ring, rx_buffer, size);

    if (IS_ERR(skb)) {
        total_rx_packets++;
        total_rx_bytes += size;
        rx_buffer->pagecnt_bias++;
    } else if (skb)

Then the only other change that should be needed is a minor tweak to
ixgbe_cleanup_headers() by adding the following to the start of the
function:
    if (IS_ERR(skb))
        return true;

>                 /* retrieve a buffer from the ring */
>                 if (skb)
>                         ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
> @@ -6121,9 +6181,13 @@ static int ixgbe_setup_all_rx_resources(struct ixgbe_adapter *adapter)
>         int i, err = 0;
>
>         for (i = 0; i < adapter->num_rx_queues; i++) {
> -               err = ixgbe_setup_rx_resources(adapter->rx_ring[i]);
> -               if (!err)
> +               struct ixgbe_ring *rx_ring = adapter->rx_ring[i];
> +
> +               err = ixgbe_setup_rx_resources(rx_ring);
> +               if (!err) {
> +                       xchg(&rx_ring->xdp_prog, adapter->xdp_prog);
>                         continue;
> +               }
>
>                 e_err(probe, "Allocation for Rx Queue %u failed\n", i);
>                 goto err_setup_rx;

I still think it makes more sense to just place this in
ixgbe_setup_rx_resources.  No point in putting here as it just adds
more complication.

If I am not mistaken all you would have to add is the xchg line and
that would be it.

> @@ -6191,6 +6255,7 @@ void ixgbe_free_rx_resources(struct ixgbe_ring *rx_ring)
>
>         vfree(rx_ring->rx_buffer_info);
>         rx_ring->rx_buffer_info = NULL;
> +       xchg(&rx_ring->xdp_prog, NULL);
>
>         /* if not set, then don't free */
>         if (!rx_ring->desc)

Just for the sake of symmetry I would suggest placing the xchg in
front of the vfree.  Also this code being here makes a stronger
argument for us setting up xdp in setup_rx_resources.

> @@ -9455,6 +9520,48 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
>         return features;
>  }
>
> +static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
> +{
> +       int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
> +       struct ixgbe_adapter *adapter = netdev_priv(dev);
> +       struct bpf_prog *old_adapter_prog;
> +
> +       /* verify ixgbe ring attributes are sufficient for XDP */
> +       for (i = 0; i < adapter->num_rx_queues; i++) {
> +               struct ixgbe_ring *ring = adapter->rx_ring[i];
> +
> +               if (ring_is_rsc_enabled(ring))
> +                       return -EINVAL;
> +
> +               if (frame_size > ixgbe_rx_bufsz(ring))
> +                       return -EINVAL;
> +       }
> +
> +       old_adapter_prog = xchg(&adapter->xdp_prog, prog);
> +       for (i = 0; i < adapter->num_rx_queues; i++)
> +               xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
> +
> +       if (old_adapter_prog)
> +               bpf_prog_put(old_adapter_prog);
> +
> +       return 0;
> +}
> +
> +static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp)
> +{
> +       struct ixgbe_adapter *adapter = netdev_priv(dev);
> +
> +       switch (xdp->command) {
> +       case XDP_SETUP_PROG:
> +               return ixgbe_xdp_setup(dev, xdp->prog);
> +       case XDP_QUERY_PROG:
> +               xdp->prog_attached = !!(adapter->rx_ring[0]->xdp_prog);
> +               return 0;
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
>  static const struct net_device_ops ixgbe_netdev_ops = {
>         .ndo_open               = ixgbe_open,
>         .ndo_stop               = ixgbe_close,
> @@ -9500,6 +9607,7 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
>         .ndo_udp_tunnel_add     = ixgbe_add_udp_tunnel_port,
>         .ndo_udp_tunnel_del     = ixgbe_del_udp_tunnel_port,
>         .ndo_features_check     = ixgbe_features_check,
> +       .ndo_xdp                = ixgbe_xdp,
>  };
>
>  /**
>


More information about the Intel-wired-lan mailing list