[Intel-wired-lan] [PATCH net-next] ice-: Add MDD logging via devlink health

Michal Swiatkowski michal.swiatkowski at linux.intel.com
Tue Dec 20 09:46:38 UTC 2022


On Mon, Dec 19, 2022 at 03:07:00PM +0100, Kalyan Kodamagula wrote:
Hi,

Thanks for the patch. Please remove the '-' from the title (ice-:).

> From: Ben Shelton <benjamin.h.shelton at intel.com>
> 
> - Enable DEVLINK_SUPPORT for ice_sw build.

What is the ice_sw build?

> 
> - Add a devlink health reporter for MDD events. The 'dump' handler will
>   return the information captured in each call to
>   ice_handle_mdd_event(). A device reset (CORER/PFR) will put the
>   reporter back in healthy state.
> 
> Signed-off-by: Ben Shelton <benjamin.h.shelton at intel.com>
> Signed-off-by: Kalyan Kodamagula <kalyan.kodamagula at intel.com>
> ---
>  drivers/net/ethernet/intel/ice/ice.h         |  24 +++
>  drivers/net/ethernet/intel/ice/ice_devlink.c | 189 +++++++++++++++++++
>  drivers/net/ethernet/intel/ice/ice_devlink.h |   6 +
>  drivers/net/ethernet/intel/ice/ice_main.c    |  10 +
>  4 files changed, 229 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> index 001500afc4a6..433c514e73fb 100644
> --- a/drivers/net/ethernet/intel/ice/ice.h
> +++ b/drivers/net/ethernet/intel/ice/ice.h
> @@ -503,6 +503,29 @@ struct ice_agg_node {
>  	u8 valid;
>  };
>  
> +enum ice_mdd_src {
> +	ICE_MDD_SRC_NONE = 0,
> +	ICE_MDD_SRC_TX_PQM,
> +	ICE_MDD_SRC_TX_TCLAN,
> +	ICE_MDD_SRC_TX_TDPU,
> +	ICE_MDD_SRC_RX
> +};
> +
> +struct ice_mdd_event {
> +	struct list_head list;
> +	enum ice_mdd_src src;
> +	u8 pf_num;
> +	u16 vf_num;
> +	u8 event;
> +	u16 queue;
> +};
> +
> +struct ice_mdd_reporter {
> +	struct devlink_health_reporter *reporter;
> +	u16 count;
> +	struct list_head event_list;
> +};
> +
>  struct ice_pf {
>  	struct pci_dev *pdev;
>  
> @@ -512,6 +535,7 @@ struct ice_pf {
>  
>  	/* devlink port data */
>  	struct devlink_port devlink_port;
> +	struct ice_mdd_reporter mdd_reporter;
>  
>  	/* OS reserved IRQ details */
>  	struct msix_entry *msix_entries;
> diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c
> index e6ec20079ced..79a12cd94110 100644
> --- a/drivers/net/ethernet/intel/ice/ice_devlink.c
> +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c
> @@ -1364,3 +1364,192 @@ void ice_devlink_destroy_regions(struct ice_pf *pf)
>  	if (pf->devcaps_region)
>  		devlink_region_destroy(pf->devcaps_region);
>  }
> +
> +#define ICE_MDD_SRC_TO_STR(_src) \
> +	((_src) == ICE_MDD_SRC_NONE ? "none"            \
> +	: (_src) == ICE_MDD_SRC_TX_PQM ? "tx_pqm"      \
> +	: (_src) == ICE_MDD_SRC_TX_TCLAN ? "tx_tclan"  \
> +	: (_src) == ICE_MDD_SRC_TX_TDPU ? "tx_tdpu"    \
> +	: (_src) == ICE_MDD_SRC_RX ? "rx"              \
> +	: "invalid")
> +
> +static int
> +ice_mdd_reporter_dump(struct devlink_health_reporter *reporter,
> +		      struct devlink_fmsg *fmsg, void *priv_ctx,
> +		      struct netlink_ext_ack __always_unused *extack)
> +{
> +	struct ice_pf *pf = devlink_health_reporter_priv(reporter);
> +	struct ice_mdd_reporter *mdd_reporter = &pf->mdd_reporter;
> +	struct ice_mdd_event *mdd_event;
> +	int err;
> +
> +	err = devlink_fmsg_u32_pair_put(fmsg, "count",
> +					mdd_reporter->count);
> +	if (err)
> +		return err;
> +
> +	list_for_each_entry(mdd_event, &mdd_reporter->event_list, list) {
> +		char *src;
> +
> +		err = devlink_fmsg_obj_nest_start(fmsg);
> +		if (err)
> +			return err;
> +
> +		src = ICE_MDD_SRC_TO_STR(mdd_event->src);
> +
> +		err = devlink_fmsg_string_pair_put(fmsg, "src", src);
> +		if (err)
> +			return err;
> +
> +		err = devlink_fmsg_u8_pair_put(fmsg, "pf_num",
> +					       mdd_event->pf_num);
> +		if (err)
> +			return err;
> +
> +		err = devlink_fmsg_u32_pair_put(fmsg, "mdd_vf_num",
> +						mdd_event->vf_num);
> +		if (err)
> +			return err;
> +
> +		err = devlink_fmsg_u8_pair_put(fmsg, "mdd_event",
> +					       mdd_event->event);
> +		if (err)
> +			return err;
> +
> +		err = devlink_fmsg_u32_pair_put(fmsg, "mdd_queue",
> +						mdd_event->queue);
> +		if (err)
> +			return err;
> +
> +		err = devlink_fmsg_obj_nest_end(fmsg);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct devlink_health_reporter_ops ice_mdd_reporter_ops = {
> +	.name = "mdd",
> +	.dump = ice_mdd_reporter_dump,
> +};
> +
> +/**
> + * ice_devlink_init_mdd_reporter - Initialize MDD devlink health reporter
> + * @pf: the PF device structure
> + *
> + * Create devlink health reporter used to handle MDD events.
> + */
> +void ice_devlink_init_mdd_reporter(struct ice_pf *pf)
> +{
> +	struct devlink *devlink = priv_to_devlink(pf);
> +	struct device *dev = ice_pf_to_dev(pf);
> +
> +	INIT_LIST_HEAD(&pf->mdd_reporter.event_list);
> +
> +	pf->mdd_reporter.reporter =
> +		devlink_health_reporter_create(devlink,
> +					       &ice_mdd_reporter_ops,
> +					       0, /* graceful period */
> +					       pf); /* private data */
> +
> +	if (IS_ERR(pf->mdd_reporter.reporter)) {
> +		dev_err(dev, "failed to create devlink MDD health reporter");
> +	}
> +}
> +
> +/**
> + * ice_devlink_destroy_mdd_reporter - Destroy MDD devlink health reporter
> + * @pf: the PF device structure
> + *
> + * Remove previously created MDD health reporter for this PF.
> + */
> +void ice_devlink_destroy_mdd_reporter(struct ice_pf *pf)
> +{
> +	if (pf->mdd_reporter.reporter)
> +		devlink_health_reporter_destroy(pf->mdd_reporter.reporter);
I wonder whether the list of mdd_event entries should also be cleared here.

> +}
> +
> +/**
> + * ice_devlink_report_mdd_event - Report an MDD event through devlink health
> + * @pf: the PF device structure
> + * @src: the HW block that was the source of this MDD event
> + * @pf_num: the pf_num on which the MDD event occurred
> + * @vf_num: the vf_num on which the MDD event occurred
> + * @event: the event type of the MDD event
> + * @queue: the queue on which the MDD event occurred
> + *
> + * Report an MDD event that has occurred on this PF.
> + */
> +void
> +ice_devlink_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src,
> +			     u8 pf_num, u16 vf_num, u8 event, u16 queue)
> +{
> +	struct ice_mdd_reporter *mdd_reporter = &pf->mdd_reporter;
> +	struct ice_mdd_event *mdd_event;
> +	int err;
> +
> +	if (!mdd_reporter->reporter)
> +		return;
> +
> +	mdd_reporter->count++;
> +
> +	mdd_event = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*mdd_event),
> +				 GFP_KERNEL);
> +	if (!mdd_event)
> +		return;
> +
> +	mdd_event->src = src;
> +	mdd_event->pf_num = pf_num;
> +	mdd_event->vf_num = vf_num;
> +	mdd_event->event = event;
> +	mdd_event->queue = queue;
> +
> +	list_add_tail(&mdd_event->list, &mdd_reporter->event_list);
> +
> +	mdd_event = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*mdd_event),
> +				 GFP_KERNEL);
If it doesn't need to be freed in the clear routine, this is fine, but if
it does, please use plain kzalloc instead of devm_kzalloc.

> +	if (!mdd_event)
> +		return;
> +
> +	mdd_event->src = src;
> +	mdd_event->pf_num = pf_num;
> +	mdd_event->vf_num = vf_num;
> +	mdd_event->event = event;
> +	mdd_event->queue = queue;
Why is the mdd_event created twice?

> +
> +	list_add_tail(&mdd_event->list, &mdd_reporter->event_list);
> +
> +	err = devlink_health_report(mdd_reporter->reporter,
> +				    "Malicious Driver Detection event\n",
> +				    pf);
> +	if (err)
> +		dev_err(ice_pf_to_dev(pf),
> +			"failed to report MDD via devlink health\n");
Shouldn't mdd_event be removed from the list (and freed) in case of an error?

> +}
> +
> +/**
> + * ice_devlink_clear_after_reset - clear devlink health issues after a reset
> + * @pf: the PF device structure
> + *
> + * Mark the PF in healthy state again after a reset has completed.
> + */
> +void ice_devlink_clear_after_reset(struct ice_pf *pf)
> +{
> +	struct ice_mdd_reporter *mdd_reporter = &pf->mdd_reporter;
> +	enum devlink_health_reporter_state new_state =
> +		DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
> +	struct ice_mdd_event *mdd_event, *tmp;
> +
> +	if (!mdd_reporter->reporter)
> +		return;
> +
> +	devlink_health_reporter_state_update(mdd_reporter->reporter,
> +					     new_state);
> +	pf->mdd_reporter.count = 0;
> +
> +	list_for_each_entry_safe(mdd_event, tmp, &mdd_reporter->event_list,
> +				 list) {
> +	list_del(&mdd_event->list);
This line needs another tab of indentation, and mdd_event should probably be freed here.

> +	}
> +}
> diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.h b/drivers/net/ethernet/intel/ice/ice_devlink.h
> index fe006d9946f8..5632d23b6518 100644
> --- a/drivers/net/ethernet/intel/ice/ice_devlink.h
> +++ b/drivers/net/ethernet/intel/ice/ice_devlink.h
> @@ -18,4 +18,10 @@ void ice_devlink_destroy_vf_port(struct ice_vf *vf);
>  void ice_devlink_init_regions(struct ice_pf *pf);
>  void ice_devlink_destroy_regions(struct ice_pf *pf);
>  
> +void ice_devlink_init_mdd_reporter(struct ice_pf *pf);
> +void ice_devlink_destroy_mdd_reporter(struct ice_pf *pf);
> +void ice_devlink_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src,
> +				  u8 pf_num, u16 vf_num, u8 event, u16 queue);
> +void ice_devlink_clear_after_reset(struct ice_pf *pf);
> +
>  #endif /* _ICE_DEVLINK_H_ */
> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> index 0f6718719453..a55ce7887c1b 100644
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -1720,6 +1720,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
>  		if (netif_msg_tx_err(pf))
>  			dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
>  				 event, queue, pf_num, vf_num);
> +		ice_devlink_report_mdd_event(pf, ICE_MDD_SRC_TX_PQM, pf_num,
> +					     vf_num, event, queue);
>  		wr32(hw, GL_MDET_TX_PQM, 0xffffffff);
>  	}
>  
> @@ -1737,6 +1739,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
>  		if (netif_msg_tx_err(pf))
>  			dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
>  				 event, queue, pf_num, vf_num);
> +		ice_devlink_report_mdd_event(pf, ICE_MDD_SRC_TX_TCLAN, pf_num,
> +					     vf_num, event, queue);
>  		wr32(hw, GL_MDET_TX_TCLAN, 0xffffffff);
>  	}
>  
> @@ -1754,6 +1758,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
>  		if (netif_msg_rx_err(pf))
>  			dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n",
>  				 event, queue, pf_num, vf_num);
> +		ice_devlink_report_mdd_event(pf, ICE_MDD_SRC_RX, pf_num,
> +					     vf_num, event, queue);
>  		wr32(hw, GL_MDET_RX, 0xffffffff);
>  	}
>  
> @@ -4731,6 +4737,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
>  	}
>  
>  	ice_devlink_init_regions(pf);
> +	ice_devlink_init_mdd_reporter(pf);
>  
>  	pf->hw.udp_tunnel_nic.set_port = ice_udp_tunnel_set_port;
>  	pf->hw.udp_tunnel_nic.unset_port = ice_udp_tunnel_unset_port;
> @@ -4960,6 +4967,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
>  	devm_kfree(dev, pf->vsi);
>  err_init_pf_unroll:
>  	ice_deinit_pf(pf);
> +	ice_devlink_destroy_mdd_reporter(pf);
>  	ice_devlink_destroy_regions(pf);
>  	ice_deinit_hw(hw);
>  err_exit_unroll:
> @@ -5079,6 +5087,7 @@ static void ice_remove(struct pci_dev *pdev)
>  		ice_vsi_free_q_vectors(pf->vsi[i]);
>  	}
>  	ice_deinit_pf(pf);
> +	ice_devlink_destroy_mdd_reporter(pf);
>  	ice_devlink_destroy_regions(pf);
>  	ice_deinit_hw(&pf->hw);
>  
> @@ -7265,6 +7274,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type)
>  
>  	/* if we get here, reset flow is successful */
>  	clear_bit(ICE_RESET_FAILED, pf->state);
> +	ice_devlink_clear_after_reset(pf);
>  
>  	ice_plug_aux_dev(pf);
>  	return;
> -- 
> 2.38.1
> 
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan at osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan


More information about the Intel-wired-lan mailing list