[Intel-wired-lan] [PATCH net-next v4 12/12] igc: Add support for Frame Preemption verification
Vinicius Costa Gomes
vinicius.gomes at intel.com
Tue Apr 12 00:13:21 UTC 2022
Vladimir Oltean <vladimir.oltean at nxp.com> writes:
> On Fri, Jun 25, 2021 at 05:33:14PM -0700, Vinicius Costa Gomes wrote:
>> Add support for sending/receiving Frame Preemption verification
>> frames.
>>
>> The i225 hardware doesn't implement the process of verification
>> internally, this is left to the driver.
>>
>> Add a simple implementation of the state machine defined in IEEE
>> 802.3-2018, Section 99.4.7.
>>
>> For now, the state machine is started manually by the user, when
>> enabling verification. Example:
>>
>> $ ethtool --set-frame-preemption IFACE disable-verify off
>>
>> The "verified" condition is set to true when the SMD-V frame is sent,
>> and the SMD-R frame is received. So, it only tracks the transmission
>> side. This seems to be what's expected from IEEE 802.3-2018.
>>
>> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes at intel.com>
>> ---
>> drivers/net/ethernet/intel/igc/igc.h | 15 ++
>> drivers/net/ethernet/intel/igc/igc_defines.h | 13 ++
>> drivers/net/ethernet/intel/igc/igc_ethtool.c | 20 +-
>> drivers/net/ethernet/intel/igc/igc_main.c | 216 +++++++++++++++++++
>> 4 files changed, 261 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
>> index 9b2ddcbf65fb..84234efed781 100644
>> --- a/drivers/net/ethernet/intel/igc/igc.h
>> +++ b/drivers/net/ethernet/intel/igc/igc.h
>> @@ -122,6 +122,13 @@ struct igc_ring {
>> struct xsk_buff_pool *xsk_pool;
>> } ____cacheline_internodealigned_in_smp;
>>
>> +enum frame_preemption_state {
>> + FRAME_PREEMPTION_STATE_FAILED,
>> + FRAME_PREEMPTION_STATE_DONE,
>> + FRAME_PREEMPTION_STATE_START,
>> + FRAME_PREEMPTION_STATE_SENT,
>> +};
>> +
>> /* Board specific private data structure */
>> struct igc_adapter {
>> struct net_device *netdev;
>> @@ -240,6 +247,14 @@ struct igc_adapter {
>> struct timespec64 start;
>> struct timespec64 period;
>> } perout[IGC_N_PEROUT];
>> +
>> + struct delayed_work fp_verification_work;
>> + unsigned long fp_start;
>> + bool fp_received_smd_v;
>> + bool fp_received_smd_r;
>> + unsigned int fp_verify_cnt;
>> + enum frame_preemption_state fp_tx_state;
>> + bool fp_disable_verify;
>> };
>>
>> void igc_up(struct igc_adapter *adapter);
>> diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
>> index a2ea057d8e6e..cf46f5d5a505 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_defines.h
>> +++ b/drivers/net/ethernet/intel/igc/igc_defines.h
>> @@ -268,6 +268,8 @@
>> #define IGC_TXD_DTYP_C 0x00000000 /* Context Descriptor */
>> #define IGC_TXD_POPTS_IXSM 0x01 /* Insert IP checksum */
>> #define IGC_TXD_POPTS_TXSM 0x02 /* Insert TCP/UDP checksum */
>> +#define IGC_TXD_POPTS_SMD_V 0x10 /* Transmitted packet is a SMD-Verify */
>> +#define IGC_TXD_POPTS_SMD_R 0x20 /* Transmitted packet is a SMD-Response */
>> #define IGC_TXD_CMD_EOP 0x01000000 /* End of Packet */
>> #define IGC_TXD_CMD_IC 0x04000000 /* Insert Checksum */
>> #define IGC_TXD_CMD_DEXT 0x20000000 /* Desc extension (0 = legacy) */
>> @@ -327,9 +329,20 @@
>>
>> #define IGC_RXDEXT_STATERR_LB 0x00040000
>>
>> +#define IGC_RXD_STAT_SMD_V 0x2000 /* Received packet is SMD-Verify packet */
>> +#define IGC_RXD_STAT_SMD_R 0x4000 /* Received packet is SMD-Response packet */
>> +
>
> So the i225 gives you the ability to select from multiple
> Start-of-mPacket-Delimiter values on a per-TX descriptor basis?
> And this is in addition to configuring that TX ring as preemptable I
> guess? Because I notice that you're sending on the TX ring affine to the
> current CPU that the verification work item is running on (which you
> don't check anywhere that it is configured as going to the pMAC or
> not).
Yeah, talking to the hardware folks, those descriptors are handled
differently by the hardware.
> And on RX, it always gives you the kind of SMD that the packet had
> (including the classic SFD for express packets)?
> Cool.
I would use another word, but yeah :-)
>
> It would be nice if I could connect back to back an i225 board with an
> NXP LS1028A to see if the verification state machines pass both ways (on
> LS1028A it is 100% hardware based, we just enable/disable the feature
> and we can monitor the state changes via an interrupt).
>
My life would be easier if that were the case here.
>> /* Advanced Receive Descriptor bit definitions */
>> #define IGC_RXDADV_STAT_TSIP 0x08000 /* timestamp in packet */
>>
>> +#define IGC_RXDADV_STAT_SMD_TYPE_MASK 0x06000
>> +#define IGC_RXDADV_STAT_SMD_TYPE_SHIFT 13
>> +
>> +#define IGC_SMD_TYPE_SFD 0x0
>> +#define IGC_SMD_TYPE_SMD_V 0x1
>> +#define IGC_SMD_TYPE_SMD_R 0x2
>> +#define IGC_SMD_TYPE_COMPLETE 0x3
>> +
>> #define IGC_RXDEXT_STATERR_L4E 0x20000000
>> #define IGC_RXDEXT_STATERR_IPE 0x40000000
>> #define IGC_RXDEXT_STATERR_RXE 0x80000000
>> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> index 84d5afe92154..f52a7be3af66 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> @@ -1649,6 +1649,8 @@ static int igc_ethtool_get_preempt(struct net_device *netdev,
>>
>> fpcmd->enabled = adapter->frame_preemption_active;
>> fpcmd->add_frag_size = adapter->add_frag_size;
>> + fpcmd->verified = adapter->fp_tx_state == FRAME_PREEMPTION_STATE_DONE;
>> + fpcmd->disable_verify = adapter->fp_disable_verify;
>>
>> return 0;
>> }
>> @@ -1664,10 +1666,22 @@ static int igc_ethtool_set_preempt(struct net_device *netdev,
>> return -EINVAL;
>> }
>>
>> - adapter->frame_preemption_active = fpcmd->enabled;
>> - adapter->add_frag_size = fpcmd->add_frag_size;
>> + if (!fpcmd->disable_verify && adapter->fp_disable_verify) {
>> + adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
>> + schedule_delayed_work(&adapter->fp_verification_work, msecs_to_jiffies(10));
>
> Not sure how much you'd like to tune this, but the spec has a
> configurable verifyTime between 1 ms and 128 ms. You chose the default
> value, so we should be ok for now.
We can add a configurable for that later, via ethtool for example.
>
>> + }
>>
>> - return igc_tsn_offload_apply(adapter);
>> + adapter->fp_disable_verify = fpcmd->disable_verify;
>> +
>> + if (adapter->frame_preemption_active != fpcmd->enabled ||
>> + adapter->add_frag_size != fpcmd->add_frag_size) {
>> + adapter->frame_preemption_active = fpcmd->enabled;
>> + adapter->add_frag_size = fpcmd->add_frag_size;
>> +
>> + return igc_tsn_offload_apply(adapter);
>> + }
>> +
>> + return 0;
>> }
>>
>> static int igc_ethtool_begin(struct net_device *netdev)
>> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
>> index 20dac04a02f2..ed55bd13e4a1 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_main.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
>> @@ -28,6 +28,11 @@
>> #define IGC_XDP_TX BIT(1)
>> #define IGC_XDP_REDIRECT BIT(2)
>>
>> +#define IGC_FP_TIMEOUT msecs_to_jiffies(100)
>> +#define IGC_MAX_VERIFY_CNT 3
>> +
>> +#define IGC_FP_SMD_FRAME_SIZE 60
>> +
>> static int debug = -1;
>>
>> MODULE_AUTHOR("Intel Corporation, <linux.nics at intel.com>");
>> @@ -2169,6 +2174,79 @@ static int igc_xdp_init_tx_descriptor(struct igc_ring *ring,
>> return 0;
>> }
>>
>> +static int igc_fp_init_smd_frame(struct igc_ring *ring, struct igc_tx_buffer *buffer,
>> + struct sk_buff *skb)
>> +{
>> + dma_addr_t dma;
>> + unsigned int size;
>> +
>> + size = skb_headlen(skb);
>> +
>> + dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
>> + if (dma_mapping_error(ring->dev, dma)) {
>> + netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
>> + return -ENOMEM;
>> + }
>> +
>> + buffer->skb = skb;
>> + buffer->protocol = 0;
>> + buffer->bytecount = skb->len;
>> + buffer->gso_segs = 1;
>> + buffer->time_stamp = jiffies;
>> + dma_unmap_len_set(buffer, len, skb->len);
>> + dma_unmap_addr_set(buffer, dma, dma);
>> +
>> + return 0;
>> +}
>> +
>> +static int igc_fp_init_tx_descriptor(struct igc_ring *ring,
>> + struct sk_buff *skb, int type)
>> +{
>> + struct igc_tx_buffer *buffer;
>> + union igc_adv_tx_desc *desc;
>> + u32 cmd_type, olinfo_status;
>> + int err;
>> +
>> + if (!igc_desc_unused(ring))
>> + return -EBUSY;
>> +
>> + buffer = &ring->tx_buffer_info[ring->next_to_use];
>> + err = igc_fp_init_smd_frame(ring, buffer, skb);
>> + if (err)
>> + return err;
>> +
>> + cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
>> + IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
>> + buffer->bytecount;
>> + olinfo_status = buffer->bytecount << IGC_ADVTXD_PAYLEN_SHIFT;
>> +
>> + switch (type) {
>> + case IGC_SMD_TYPE_SMD_V:
>> + olinfo_status |= (IGC_TXD_POPTS_SMD_V << 8);
>> + break;
>> + case IGC_SMD_TYPE_SMD_R:
>> + olinfo_status |= (IGC_TXD_POPTS_SMD_R << 8);
>> + break;
>> + default:
>> + return -EINVAL;
>> + }
>> +
>> + desc = IGC_TX_DESC(ring, ring->next_to_use);
>> + desc->read.cmd_type_len = cpu_to_le32(cmd_type);
>> + desc->read.olinfo_status = cpu_to_le32(olinfo_status);
>> + desc->read.buffer_addr = cpu_to_le64(dma_unmap_addr(buffer, dma));
>> +
>> + netdev_tx_sent_queue(txring_txq(ring), skb->len);
>> +
>> + buffer->next_to_watch = desc;
>> +
>> + ring->next_to_use++;
>> + if (ring->next_to_use == ring->count)
>> + ring->next_to_use = 0;
>> +
>> + return 0;
>> +}
>> +
>> static struct igc_ring *igc_xdp_get_tx_ring(struct igc_adapter *adapter,
>> int cpu)
>> {
>> @@ -2299,6 +2377,19 @@ static void igc_update_rx_stats(struct igc_q_vector *q_vector,
>> q_vector->rx.total_bytes += bytes;
>> }
>>
>> +static int igc_rx_desc_smd_type(union igc_adv_rx_desc *rx_desc)
>> +{
>> + u32 status = le32_to_cpu(rx_desc->wb.upper.status_error);
>> +
>> + return (status & IGC_RXDADV_STAT_SMD_TYPE_MASK)
>> + >> IGC_RXDADV_STAT_SMD_TYPE_SHIFT;
>> +}
>> +
>> +static bool igc_check_smd_frame(struct igc_rx_buffer *rx_buffer, unsigned int size)
>> +{
>> + return size == 60;
>
> You should probably also verify that the contents is 60 octets of zeroes (sans the mCRC)?
>
Yeah, I will add some checks for that.
>> +}
>> +
>> static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>> {
>> unsigned int total_bytes = 0, total_packets = 0;
>> @@ -2315,6 +2406,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>> ktime_t timestamp = 0;
>> struct xdp_buff xdp;
>> int pkt_offset = 0;
>> + int smd_type;
>> void *pktbuf;
>>
>> /* return some buffers to hardware, one at a time is too slow */
>> @@ -2346,6 +2438,22 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>> size -= IGC_TS_HDR_LEN;
>> }
>>
>> + smd_type = igc_rx_desc_smd_type(rx_desc);
>> +
>> + if (smd_type == IGC_SMD_TYPE_SMD_V || smd_type == IGC_SMD_TYPE_SMD_R) {
>
> I guess the performance people will love you for this change. You should
> probably guard it by an "if (unlikely(disableVerify == false))" condition.
>
Will add the unlikely().
>> + if (igc_check_smd_frame(rx_buffer, size)) {
>> + adapter->fp_received_smd_v = smd_type == IGC_SMD_TYPE_SMD_V;
>> + adapter->fp_received_smd_r = smd_type == IGC_SMD_TYPE_SMD_R;
>> + schedule_delayed_work(&adapter->fp_verification_work, 0);
>> + }
>> +
>> + /* Advance the ring next-to-clean */
>> + igc_is_non_eop(rx_ring, rx_desc);
>> +
>> + cleaned_count++;
>> + continue;
>> + }
>> +
>> if (!skb) {
>> xdp_init_buff(&xdp, truesize, &rx_ring->xdp_rxq);
>> xdp_prepare_buff(&xdp, pktbuf - igc_rx_offset(rx_ring),
>> @@ -5607,6 +5715,107 @@ static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter,
>> return igc_tsn_offload_apply(adapter);
>> }
>>
>> +/* I225 doesn't send the SMD frames automatically, we need to handle
>> + * them ourselves.
>> + */
>> +static int igc_xmit_smd_frame(struct igc_adapter *adapter, int type)
>> +{
>> + int cpu = smp_processor_id();
>> + struct netdev_queue *nq;
>> + struct igc_ring *ring;
>> + struct sk_buff *skb;
>> + void *data;
>> + int err;
>> +
>> + if (!netif_running(adapter->netdev))
>> + return -ENOTCONN;
>> +
>> + /* FIXME: rename this function to something less specific, as
>> + * it can be used outside XDP.
>> + */
>> + ring = igc_xdp_get_tx_ring(adapter, cpu);
>> + nq = txring_txq(ring);
>> +
>> + skb = alloc_skb(IGC_FP_SMD_FRAME_SIZE, GFP_KERNEL);
>> + if (!skb)
>> + return -ENOMEM;
>> +
>> + data = skb_put(skb, IGC_FP_SMD_FRAME_SIZE);
>> + memset(data, 0, IGC_FP_SMD_FRAME_SIZE);
>> +
>> + __netif_tx_lock(nq, cpu);
>> +
>> + err = igc_fp_init_tx_descriptor(ring, skb, type);
>> +
>> + igc_flush_tx_descriptors(ring);
>> +
>> + __netif_tx_unlock(nq);
>> +
>> + return err;
>> +}
>> +
>> +static void igc_fp_verification_work(struct work_struct *work)
>> +{
>> + struct delayed_work *dwork = to_delayed_work(work);
>> + struct igc_adapter *adapter;
>> + int err;
>> +
>> + adapter = container_of(dwork, struct igc_adapter, fp_verification_work);
>> +
>> + if (adapter->fp_disable_verify)
>> + goto done;
>> +
>> + switch (adapter->fp_tx_state) {
>> + case FRAME_PREEMPTION_STATE_START:
>> + adapter->fp_received_smd_r = false;
>> + err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_V);
>> + if (err < 0)
>> + netdev_err(adapter->netdev, "Error sending SMD-V frame\n");
>
> On TX error should you really advance to the STATE_SENT?
>
We tried to send a SMD-V frame and it failed, the error was probably
transient (unable to allocate memory) and it's going to be retried later.
>> +
>> + adapter->fp_tx_state = FRAME_PREEMPTION_STATE_SENT;
>> + adapter->fp_start = jiffies;
>> + schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
>> + break;
>> +
>> + case FRAME_PREEMPTION_STATE_SENT:
>> + if (adapter->fp_received_smd_r) {
>> + adapter->fp_tx_state = FRAME_PREEMPTION_STATE_DONE;
>> + adapter->fp_received_smd_r = false;
>> + break;
>> + }
>> +
>> + if (time_is_before_jiffies(adapter->fp_start + IGC_FP_TIMEOUT)) {
>> + adapter->fp_verify_cnt++;
>> + netdev_warn(adapter->netdev, "Timeout waiting for SMD-R frame\n");
>> +
>> + if (adapter->fp_verify_cnt > IGC_MAX_VERIFY_CNT) {
>> + adapter->fp_verify_cnt = 0;
>> + adapter->fp_tx_state = FRAME_PREEMPTION_STATE_FAILED;
>> + netdev_err(adapter->netdev,
>> + "Exceeded number of attempts for frame preemption verification\n");
>> + } else {
>> + adapter->fp_tx_state = FRAME_PREEMPTION_STATE_START;
>> + }
>> + schedule_delayed_work(&adapter->fp_verification_work, IGC_FP_TIMEOUT);
>> + }
>> +
>> + break;
>> +
>> + case FRAME_PREEMPTION_STATE_FAILED:
>> + case FRAME_PREEMPTION_STATE_DONE:
>> + break;
>> + }
>> +
>> +done:
>> + if (adapter->fp_received_smd_v) {
>> + err = igc_xmit_smd_frame(adapter, IGC_SMD_TYPE_SMD_R);
>> + if (err < 0)
>> + netdev_err(adapter->netdev, "Error sending SMD-R frame\n");
>> +
>> + adapter->fp_received_smd_v = false;
>> + }
>> +}
>> +
>> static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type,
>> void *type_data)
>> {
>> @@ -6023,6 +6232,7 @@ static int igc_probe(struct pci_dev *pdev,
>>
>> INIT_WORK(&adapter->reset_task, igc_reset_task);
>> INIT_WORK(&adapter->watchdog_task, igc_watchdog_task);
>> + INIT_DELAYED_WORK(&adapter->fp_verification_work, igc_fp_verification_work);
>>
>> /* Initialize link properties that are user-changeable */
>> adapter->fc_autoneg = true;
>> @@ -6044,6 +6254,12 @@ static int igc_probe(struct pci_dev *pdev,
>>
>> igc_ptp_init(adapter);
>>
>> + /* FIXME: This sets the default to not do the verification
>> + * automatically, when we have support in multiple
>> + * controllers, this default can be changed.
>> + */
>> + adapter->fp_disable_verify = true;
>> +
>
> Hmmmmm. So we need to instruct our users to explicitly enable
> verification in their ethtool-based scripts, since the default values
> will vary wildly from one vendor to another. On LS1028A I see no reason
> why verification would be disabled by default.
>
Reading 99.4.3 (IEEE 802.3-2018) again, that "Verification may be disabled"
seems to imply that it should be enabled by default.
I will change this.
>> /* reset the hardware with the new settings */
>> igc_reset(adapter);
>>
>> --
>> 2.32.0
>>
--
Vinicius
More information about the Intel-wired-lan
mailing list