[Intel-wired-lan] [PATCH] igc: Add PCIe link recovery for I225/I226

Paul Menzel pmenzel at molgen.mpg.de
Wed Feb 11 15:00:17 UTC 2026


[Cc: +Linux PCI folks]


Dear Harshank,


Thank you for your patch. Just a note to please version your patches. 
`git format-patch` has the switch `--reroll-count` or `-v` to do this.

Am 10.02.26 um 21:34 schrieb Harshank Matkar:
> From: Harshank Matkar <harshankmatkar1304 at outlook.com>
> 
> When ASPM L0s transitions occur on Intel I225/I226 controllers,
> transient PCIe link instability can cause register read failures
> (0xFFFFFFFF responses). Implement a multi-layer recovery strategy:
> 
> 1. Immediate retries: 3 attempts with 100-200μs delays

Why were these delays chosen?

> 2. Link retraining: Trigger PCIe link retraining via capabilities
> 3. Device detachment: Only as last resort after max attempts
> 
> The recovery mechanism includes rate limiting, maximum attempt
> tracking, and device presence validation to prevent false detaches
> on transient ASPM glitches while maintaining safety through
> bounded retry limits.

It’d be great if you added the test setup and case.

> Signed-off-by: Harshank Matkar <harshankmatkar1304 at outlook.com>
> ---
>   drivers/net/ethernet/intel/igc/igc.h      |   6 +-
>   drivers/net/ethernet/intel/igc/igc_main.c | 155 ++++++++++++++++++++--
>   2 files changed, 149 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> index a427f05814c1..2ef488b279b9 100644
> --- a/drivers/net/ethernet/intel/igc/igc.h
> +++ b/drivers/net/ethernet/intel/igc/igc.h
> @@ -346,6 +346,10 @@ struct igc_adapter {
>   	struct mutex led_mutex;
>   	struct igc_led_classdev *leds;
>   	bool leds_available;
> +
> +	/* PCIe recovery tracking */
> +	unsigned int pcie_recovery_attempts;
> +	unsigned long last_recovery_time;
>   };
>   
>   void igc_up(struct igc_adapter *adapter);
> @@ -422,7 +426,7 @@ enum igc_rss_type_num {
>   	IGC_RSS_TYPE_MAX		= 10,
>   };
>   #define IGC_RSS_TYPE_MAX_TABLE		16
> -#define IGC_RSS_TYPE_MASK		GENMASK(3,0) /* 4-bits (3:0) = mask 0x0F */
> +#define IGC_RSS_TYPE_MASK		GENMASK(3, 0) /* 4-bits (3:0) = mask 0x0F */
>   
>   /* igc_rss_type - Rx descriptor RSS type field */
>   static inline u32 igc_rss_type(const union igc_adv_rx_desc *rx_desc)
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index 89a321a344d2..f0daa3edbb79 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -1411,9 +1411,8 @@ static int igc_tx_map(struct igc_ring *tx_ring,
>   	/* Make sure there is space in the ring for the next send. */
>   	igc_maybe_stop_tx(tx_ring, DESC_NEEDED);
>   
> -	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) {
> +	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more())
>   		writel(i, tx_ring->tail);
> -	}
>   
>   	return 0;
>   dma_error:
> @@ -1613,8 +1612,7 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
>   	 * otherwise try next time
>   	 */
>   	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
> -		count += TXD_USE_COUNT(skb_frag_size(
> -						&skb_shinfo(skb)->frags[f]));
> +		count += TXD_USE_COUNT(skb_frag_size(&skb_shinfo(skb)->frags[f]));

Unrelated.

>   
>   	if (igc_maybe_stop_tx(tx_ring, count + 5)) {
>   		/* this is a hard error */
> @@ -2524,7 +2522,6 @@ static int __igc_xdp_run_prog(struct igc_adapter *adapter,
>   		if (xdp_do_redirect(adapter->netdev, xdp, prog) < 0)
>   			goto out_failure;
>   		return IGC_XDP_REDIRECT;
> -		break;
>   	default:
>   		bpf_warn_invalid_xdp_action(adapter->netdev, prog, act);
>   		fallthrough;
> @@ -2791,7 +2788,7 @@ static struct igc_xdp_buff *xsk_buff_to_igc_ctx(struct xdp_buff *xdp)
>   	 * igc_xdp_buff shares its layout with xdp_buff_xsk and private
>   	 * igc_xdp_buff fields fall into xdp_buff_xsk->cb
>   	 */
> -       return (struct igc_xdp_buff *)xdp;
> +	return (struct igc_xdp_buff *)xdp;

Correct, but should be a separate patch.

>   }
>   
>   static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
> @@ -3895,9 +3892,8 @@ static int igc_enable_nfc_rule(struct igc_adapter *adapter,
>   {
>   	int err;
>   
> -	if (rule->flex) {
> +	if (rule->flex)
>   		return igc_add_flex_filter(adapter, rule);
> -	}

Correct, but unrelated and I think cosmetic changes are not wanted.

>   
>   	if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
>   		err = igc_add_etype_filter(adapter, rule->filter.etype,
> @@ -6984,11 +6980,112 @@ static const struct net_device_ops igc_netdev_ops = {
>   	.ndo_hwtstamp_set	= igc_ptp_hwtstamp_set,
>   };
>   
> +#define IGC_REGISTER_READ_RETRIES	3
> +#define IGC_PCIE_RECOVERY_MAX_ATTEMPTS	10
> +#define IGC_PCIE_RECOVERY_INTERVAL_MS	1000
> +
> +/**
> + * igc_pcie_link_recover - Attempt PCIe link recovery
> + * @adapter: board private structure
> + *
> + * Attempts to recover a failed PCIe link by triggering a link retrain.
> + * Rate-limited to 1 attempt per second, maximum 10 attempts.
> + *
> + * Returns true if recovery was successful, false otherwise.
> + */
> +static bool igc_pcie_link_recover(struct igc_adapter *adapter)
> +{
> +	struct pci_dev *pdev = adapter->pdev;
> +	unsigned long now = jiffies;
> +	u16 lnksta, lnkctl;
> +	int ret;
> +	int i;
> +
> +	/* Rate limiting: no more than 1 attempt per second */
> +	if (time_before(now, adapter->last_recovery_time +
> +			msecs_to_jiffies(IGC_PCIE_RECOVERY_INTERVAL_MS)))
> +		return false;
> +
> +	/* Maximum attempt limit */
> +	if (adapter->pcie_recovery_attempts >= IGC_PCIE_RECOVERY_MAX_ATTEMPTS) {
> +		netdev_err(adapter->netdev,
> +			   "Exceeded maximum PCIe recovery attempts (%d)\n",
> +			   IGC_PCIE_RECOVERY_MAX_ATTEMPTS);
> +		return false;
> +	}
> +
> +	adapter->last_recovery_time = now;
> +	adapter->pcie_recovery_attempts++;
> +
> +	netdev_warn(adapter->netdev,
> +		    "Attempting PCIe link recovery (attempt %d/%d)\n",
> +		    adapter->pcie_recovery_attempts,
> +		    IGC_PCIE_RECOVERY_MAX_ATTEMPTS);
> +
> +	/* Check if device is still present on the bus */
> +	if (!pci_device_is_present(pdev)) {
> +		netdev_err(adapter->netdev, "Device not present on PCIe bus\n");
> +		return false;
> +	}
> +
> +	/* Check link status */
> +	ret = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
> +	if (ret) {
> +		netdev_err(adapter->netdev, "Failed to read link status\n");
> +		return false;
> +	}
> +
> +	/* Trigger link retrain if link appears down */
> +	if (!(lnksta & PCI_EXP_LNKSTA_DLLLA)) {
> +		netdev_info(adapter->netdev,
> +			    "Link down, attempting retrain\n");
> +
> +		/* Read link control register */
> +		ret = pcie_capability_read_word(pdev, PCI_EXP_LNKCTL, &lnkctl);
> +		if (ret == 0) {
> +			/* Trigger retrain by setting RL bit */
> +			pcie_capability_write_word(pdev, PCI_EXP_LNKCTL,
> +						   lnkctl | PCI_EXP_LNKCTL_RL);
> +
> +			/* Wait for retrain to complete (up to 1 second) */
> +			for (i = 0; i < 100; i++) {
> +				usleep_range(10000, 20000);
> +				ret = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA,
> +								&lnksta);
> +				if (ret == 0 && (lnksta & PCI_EXP_LNKSTA_DLLLA) &&
> +				    !(lnksta & PCI_EXP_LNKSTA_LT)) {
> +					netdev_info(adapter->netdev,
> +						    "PCIe link recovery successful\n");
> +					return true;
> +				}
> +			}
> +		}
> +	}
> +
> +	/* Give the link some additional time to recover on its own */
> +	msleep(100);

That’s quite a long delay. Is that according to some standard?

> +
> +	/* Check if device is responding now */
> +	if (pci_device_is_present(pdev)) {
> +		ret = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
> +		if (ret == 0 && (lnksta & PCI_EXP_LNKSTA_DLLLA)) {
> +			netdev_info(adapter->netdev,
> +				    "PCIe link recovered after delay\n");
> +			return true;
> +		}
> +	}
> +
> +	netdev_warn(adapter->netdev, "PCIe link recovery failed\n");
> +	return false;
> +}
> +
>   u32 igc_rd32(struct igc_hw *hw, u32 reg)
>   {
>   	struct igc_adapter *igc = container_of(hw, struct igc_adapter, hw);
>   	u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr);
> +	struct net_device *netdev = igc->netdev;
>   	u32 value = 0;
> +	int retry;
>   
>   	if (IGC_REMOVED(hw_addr))
>   		return ~value;
> @@ -6997,13 +7094,49 @@ u32 igc_rd32(struct igc_hw *hw, u32 reg)
>   
>   	/* reads should not return all F's */
>   	if (!(~value) && (!reg || !(~readl(hw_addr)))) {
> -		struct net_device *netdev = igc->netdev;
> +		/* Layer 1: Immediate retries with short delays (100-200μs) */
> +		for (retry = 0; retry < IGC_REGISTER_READ_RETRIES; retry++) {
> +			usleep_range(100, 200);
> +			value = readl(&hw_addr[reg]);
> +
> +			/* If we got a valid read, return immediately */
> +			if (~value || (reg && ~readl(hw_addr))) {
> +				netdev_dbg(netdev,
> +					   "Register read recovered after %d retries\n",
> +					   retry + 1);
> +				return value;
> +			}
> +		}
> +
> +		/* Layer 2: Attempt full PCIe link recovery */
> +		netdev_warn(netdev,
> +			    "All immediate retries failed, attempting PCIe link recovery\n");
> +
> +		if (igc_pcie_link_recover(igc)) {
> +			/* Recovery succeeded, re-read the register */
> +			hw_addr = READ_ONCE(hw->hw_addr);
> +			if (hw_addr && !IGC_REMOVED(hw_addr)) {
> +				value = readl(&hw_addr[reg]);
> +
> +				/* Verify the read is valid */
> +				if (~value || (reg && ~readl(hw_addr))) {
> +					netdev_info(netdev,
> +						    "Register read successful after link recovery\n");
> +					return value;
> +				}
> +			}
> +		}
> +
> +		/* Layer 3: All recovery attempts failed, detach device */
> +		netdev_err(netdev,
> +			   "PCIe link lost after %d retries and recovery attempts, device now detached\n",
> +			   IGC_REGISTER_READ_RETRIES);
>   
>   		hw->hw_addr = NULL;
>   		netif_device_detach(netdev);
> -		netdev_err(netdev, "PCIe link lost, device now detached\n");
> +
>   		WARN(pci_device_is_present(igc->pdev),
> -		     "igc: Failed to read reg 0x%x!\n", reg);
> +		     "igc: Failed to read reg 0x%x after all recovery attempts!\n", reg);
>   	}
>   
>   	return value;

The idea looks good, but maybe there is something in Linux’ PCIe core to 
achieve something similar?


Kind regards,

Paul


More information about the Intel-wired-lan mailing list