[Intel-wired-lan] [PATCH net-next] ice-: Add MDD logging via devlink health
Michal Swiatkowski
michal.swiatkowski at linux.intel.com
Tue Dec 20 09:46:38 UTC 2022
On Mon, Dec 19, 2022 at 03:07:00PM +0100, Kalyan Kodamagula wrote:
Hi,
Thanks for the patch. Please remove the stray '-' from the title ("ice-:" should be "ice:").
> From: Ben Shelton <benjamin.h.shelton at intel.com>
>
> - Enable DEVLINK_SUPPORT for ice_sw build.
What is the ice_sw build?
>
> - Add a devlink health reporter for MDD events. The 'dump' handler will
> return the information captured in each call to
> ice_handle_mdd_event(). A device reset (CORER/PFR) will put the
> reporter back in healthy state.
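A side note for testing: with this in place, the captured events should be
retrievable through the standard devlink health interface, something along
these lines (the PCI address is just an example):

  # devlink health show pci/0000:18:00.0 reporter mdd
  # devlink health dump show pci/0000:18:00.0 reporter mdd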
>
> Signed-off-by: Ben Shelton <benjamin.h.shelton at intel.com>
> Signed-off-by: Kalyan Kodamagula <kalyan.kodamagula at intel.com>
> ---
> drivers/net/ethernet/intel/ice/ice.h | 24 +++
> drivers/net/ethernet/intel/ice/ice_devlink.c | 189 +++++++++++++++++++
> drivers/net/ethernet/intel/ice/ice_devlink.h | 6 +
> drivers/net/ethernet/intel/ice/ice_main.c | 10 +
> 4 files changed, 229 insertions(+)
>
> diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> index 001500afc4a6..433c514e73fb 100644
> --- a/drivers/net/ethernet/intel/ice/ice.h
> +++ b/drivers/net/ethernet/intel/ice/ice.h
> @@ -503,6 +503,29 @@ struct ice_agg_node {
> u8 valid;
> };
>
> +enum ice_mdd_src {
> + ICE_MDD_SRC_NONE = 0,
> + ICE_MDD_SRC_TX_PQM,
> + ICE_MDD_SRC_TX_TCLAN,
> + ICE_MDD_SRC_TX_TDPU,
> + ICE_MDD_SRC_RX
> +};
> +
> +struct ice_mdd_event {
> + struct list_head list;
> + enum ice_mdd_src src;
> + u8 pf_num;
> + u16 vf_num;
> + u8 event;
> + u16 queue;
> +};
> +
> +struct ice_mdd_reporter {
> + struct devlink_health_reporter *reporter;
> + u16 count;
> + struct list_head event_list;
> +};
> +
> struct ice_pf {
> struct pci_dev *pdev;
>
> @@ -512,6 +535,7 @@ struct ice_pf {
>
> /* devlink port data */
> struct devlink_port devlink_port;
> + struct ice_mdd_reporter mdd_reporter;
>
> /* OS reserved IRQ details */
> struct msix_entry *msix_entries;
> diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c
> index e6ec20079ced..79a12cd94110 100644
> --- a/drivers/net/ethernet/intel/ice/ice_devlink.c
> +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c
> @@ -1364,3 +1364,192 @@ void ice_devlink_destroy_regions(struct ice_pf *pf)
> if (pf->devcaps_region)
> devlink_region_destroy(pf->devcaps_region);
> }
> +
> +#define ICE_MDD_SRC_TO_STR(_src) \
> + ((_src) == ICE_MDD_SRC_NONE ? "none" \
> + : (_src) == ICE_MDD_SRC_TX_PQM ? "tx_pqm" \
> + : (_src) == ICE_MDD_SRC_TX_TCLAN ? "tx_tclan" \
> + : (_src) == ICE_MDD_SRC_TX_TDPU ? "tx_tdpu" \
> + : (_src) == ICE_MDD_SRC_RX ? "rx" \
> + : "invalid")
> +
> +static int
> +ice_mdd_reporter_dump(struct devlink_health_reporter *reporter,
> + struct devlink_fmsg *fmsg, void *priv_ctx,
> + struct netlink_ext_ack __always_unused *extack)
> +{
> + struct ice_pf *pf = devlink_health_reporter_priv(reporter);
> + struct ice_mdd_reporter *mdd_reporter = &pf->mdd_reporter;
> + struct ice_mdd_event *mdd_event;
> + int err;
> +
> + err = devlink_fmsg_u32_pair_put(fmsg, "count",
> + mdd_reporter->count);
> + if (err)
> + return err;
> +
> + list_for_each_entry(mdd_event, &mdd_reporter->event_list, list) {
> + char *src;
> +
> + err = devlink_fmsg_obj_nest_start(fmsg);
> + if (err)
> + return err;
> +
> + src = ICE_MDD_SRC_TO_STR(mdd_event->src);
> +
> + err = devlink_fmsg_string_pair_put(fmsg, "src", src);
> + if (err)
> + return err;
> +
> + err = devlink_fmsg_u8_pair_put(fmsg, "pf_num",
> + mdd_event->pf_num);
> + if (err)
> + return err;
> +
> + err = devlink_fmsg_u32_pair_put(fmsg, "mdd_vf_num",
> + mdd_event->vf_num);
> + if (err)
> + return err;
> +
> + err = devlink_fmsg_u8_pair_put(fmsg, "mdd_event",
> + mdd_event->event);
> + if (err)
> + return err;
> +
> + err = devlink_fmsg_u32_pair_put(fmsg, "mdd_queue",
> + mdd_event->queue);
> + if (err)
> + return err;
> +
> + err = devlink_fmsg_obj_nest_end(fmsg);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> +static const struct devlink_health_reporter_ops ice_mdd_reporter_ops = {
> + .name = "mdd",
> + .dump = ice_mdd_reporter_dump,
> +};
> +
> +/**
> + * ice_devlink_init_mdd_reporter - Initialize MDD devlink health reporter
> + * @pf: the PF device structure
> + *
> + * Create devlink health reporter used to handle MDD events.
> + */
> +void ice_devlink_init_mdd_reporter(struct ice_pf *pf)
> +{
> + struct devlink *devlink = priv_to_devlink(pf);
> + struct device *dev = ice_pf_to_dev(pf);
> +
> + INIT_LIST_HEAD(&pf->mdd_reporter.event_list);
> +
> + pf->mdd_reporter.reporter =
> + devlink_health_reporter_create(devlink,
> + &ice_mdd_reporter_ops,
> + 0, /* graceful period */
> + pf); /* private data */
> +
> + if (IS_ERR(pf->mdd_reporter.reporter)) {
> + dev_err(dev, "failed to create devlink MDD health reporter");
> + }
> +}
> +
> +/**
> + * ice_devlink_destroy_mdd_reporter - Destroy MDD devlink health reporter
> + * @pf: the PF device structure
> + *
> + * Remove previously created MDD health reporter for this PF.
> + */
> +void ice_devlink_destroy_mdd_reporter(struct ice_pf *pf)
> +{
> + if (pf->mdd_reporter.reporter)
> + devlink_health_reporter_destroy(pf->mdd_reporter.reporter);
I wonder if the mdd_event list shouldn't be drained (and the entries freed) here as well.
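Something like this untested sketch (assuming the events are switched to
plain kzalloc() as suggested below, so they can simply be freed):

	struct ice_mdd_event *mdd_event, *tmp;

	/* drain any events still queued on the reporter */
	list_for_each_entry_safe(mdd_event, tmp,
				 &pf->mdd_reporter.event_list, list) {
		list_del(&mdd_event->list);
		kfree(mdd_event);
	}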
> +}
> +
> +/**
> + * ice_devlink_report_mdd_event - Report an MDD event through devlink health
> + * @pf: the PF device structure
> + * @src: the HW block that was the source of this MDD event
> + * @pf_num: the pf_num on which the MDD event occurred
> + * @vf_num: the vf_num on which the MDD event occurred
> + * @event: the event type of the MDD event
> + * @queue: the queue on which the MDD event occurred
> + *
> + * Report an MDD event that has occurred on this PF.
> + */
> +void
> +ice_devlink_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src,
> + u8 pf_num, u16 vf_num, u8 event, u16 queue)
> +{
> + struct ice_mdd_reporter *mdd_reporter = &pf->mdd_reporter;
> + struct ice_mdd_event *mdd_event;
> + int err;
> +
> + if (!mdd_reporter->reporter)
> + return;
> +
> + mdd_reporter->count++;
> +
> + mdd_event = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*mdd_event),
> + GFP_KERNEL);
> + if (!mdd_event)
> + return;
> +
> + mdd_event->src = src;
> + mdd_event->pf_num = pf_num;
> + mdd_event->vf_num = vf_num;
> + mdd_event->event = event;
> + mdd_event->queue = queue;
> +
> + list_add_tail(&mdd_event->list, &mdd_reporter->event_list);
> +
> + mdd_event = devm_kzalloc(ice_pf_to_dev(pf), sizeof(*mdd_event),
> + GFP_KERNEL);
If it doesn't need to be freed in the clear routine it is fine, but if it
does, please use plain kzalloc() instead of devm_kzalloc(). devm memory is
only released when the driver is unbound, so per-event allocations would
otherwise accumulate across resets.
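i.e. something like (untested):

	mdd_event = kzalloc(sizeof(*mdd_event), GFP_KERNEL);
	if (!mdd_event)
		return;

paired with a kfree() wherever an entry is dropped from the list.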
> + if (!mdd_event)
> + return;
> +
> + mdd_event->src = src;
> + mdd_event->pf_num = pf_num;
> + mdd_event->vf_num = vf_num;
> + mdd_event->event = event;
> + mdd_event->queue = queue;
Why is mdd_event allocated and added to the list twice? The second block looks like an accidental duplicate of the first.
> +
> + list_add_tail(&mdd_event->list, &mdd_reporter->event_list);
> +
> + err = devlink_health_report(mdd_reporter->reporter,
> + "Malicious Driver Detection event\n",
> + pf);
> + if (err)
> + dev_err(ice_pf_to_dev(pf),
> + "failed to report MDD via devlink health\n");
Shouldn't mdd_event be removed from the list (and freed) if devlink_health_report() fails?
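Something like this (untested, again assuming plain kzalloc() is used):

	if (err) {
		dev_err(ice_pf_to_dev(pf),
			"failed to report MDD via devlink health\n");
		list_del(&mdd_event->list);
		kfree(mdd_event);
	}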
> +}
> +
> +/**
> + * ice_devlink_clear_after_reset - clear devlink health issues after a reset
> + * @pf: the PF device structure
> + *
> + * Mark the PF in healthy state again after a reset has completed.
> + */
> +void ice_devlink_clear_after_reset(struct ice_pf *pf)
> +{
> + struct ice_mdd_reporter *mdd_reporter = &pf->mdd_reporter;
> + enum devlink_health_reporter_state new_state =
> + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
> + struct ice_mdd_event *mdd_event, *tmp;
> +
> + if (!mdd_reporter->reporter)
> + return;
> +
> + devlink_health_reporter_state_update(mdd_reporter->reporter,
> + new_state);
> + pf->mdd_reporter.count = 0;
> +
> + list_for_each_entry_safe(mdd_event, tmp, &mdd_reporter->event_list,
> + list) {
> + list_del(&mdd_event->list);
The list_del() here is missing a tab, and mdd_event should probably be freed as well (the same list_del() + kfree() drain sketched for the destroy path above would work here).
> + }
> +}
> diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.h b/drivers/net/ethernet/intel/ice/ice_devlink.h
> index fe006d9946f8..5632d23b6518 100644
> --- a/drivers/net/ethernet/intel/ice/ice_devlink.h
> +++ b/drivers/net/ethernet/intel/ice/ice_devlink.h
> @@ -18,4 +18,10 @@ void ice_devlink_destroy_vf_port(struct ice_vf *vf);
> void ice_devlink_init_regions(struct ice_pf *pf);
> void ice_devlink_destroy_regions(struct ice_pf *pf);
>
> +void ice_devlink_init_mdd_reporter(struct ice_pf *pf);
> +void ice_devlink_destroy_mdd_reporter(struct ice_pf *pf);
> +void ice_devlink_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src,
> + u8 pf_num, u16 vf_num, u8 event, u16 queue);
> +void ice_devlink_clear_after_reset(struct ice_pf *pf);
> +
> #endif /* _ICE_DEVLINK_H_ */
> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> index 0f6718719453..a55ce7887c1b 100644
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -1720,6 +1720,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
> if (netif_msg_tx_err(pf))
> dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
> event, queue, pf_num, vf_num);
> + ice_devlink_report_mdd_event(pf, ICE_MDD_SRC_TX_PQM, pf_num,
> + vf_num, event, queue);
> wr32(hw, GL_MDET_TX_PQM, 0xffffffff);
> }
>
> @@ -1737,6 +1739,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
> if (netif_msg_tx_err(pf))
> dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
> event, queue, pf_num, vf_num);
> + ice_devlink_report_mdd_event(pf, ICE_MDD_SRC_TX_TCLAN, pf_num,
> + vf_num, event, queue);
> wr32(hw, GL_MDET_TX_TCLAN, 0xffffffff);
> }
>
> @@ -1754,6 +1758,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
> if (netif_msg_rx_err(pf))
> dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n",
> event, queue, pf_num, vf_num);
> + ice_devlink_report_mdd_event(pf, ICE_MDD_SRC_RX, pf_num,
> + vf_num, event, queue);
> wr32(hw, GL_MDET_RX, 0xffffffff);
> }
>
> @@ -4731,6 +4737,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
> }
>
> ice_devlink_init_regions(pf);
> + ice_devlink_init_mdd_reporter(pf);
>
> pf->hw.udp_tunnel_nic.set_port = ice_udp_tunnel_set_port;
> pf->hw.udp_tunnel_nic.unset_port = ice_udp_tunnel_unset_port;
> @@ -4960,6 +4967,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
> devm_kfree(dev, pf->vsi);
> err_init_pf_unroll:
> ice_deinit_pf(pf);
> + ice_devlink_destroy_mdd_reporter(pf);
> ice_devlink_destroy_regions(pf);
> ice_deinit_hw(hw);
> err_exit_unroll:
> @@ -5079,6 +5087,7 @@ static void ice_remove(struct pci_dev *pdev)
> ice_vsi_free_q_vectors(pf->vsi[i]);
> }
> ice_deinit_pf(pf);
> + ice_devlink_destroy_mdd_reporter(pf);
> ice_devlink_destroy_regions(pf);
> ice_deinit_hw(&pf->hw);
>
> @@ -7265,6 +7274,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type)
>
> /* if we get here, reset flow is successful */
> clear_bit(ICE_RESET_FAILED, pf->state);
> + ice_devlink_clear_after_reset(pf);
>
> ice_plug_aux_dev(pf);
> return;
> --
> 2.38.1
>