Re: [PATCH V3 5/5] perf/amd/iommu: Enable support for multiple IOMMUs

From: Borislav Petkov <bp@alien8.de>
To: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Cc: joro@8bytes.org, peterz@infradead.org, mingo@redhat.com,
	acme@kernel.org, andihartmann@freenet.de,
	linux-kernel@vger.kernel.org, iommu@lists.linux-foundation.org
Subject: Re: [PATCH V3 5/5] perf/amd/iommu: Enable support for multiple IOMMUs
Date: Wed, 10 Feb 2016 18:14:01 +0100	[thread overview]
Message-ID: <20160210171401.GI23914@pd.tnic> (raw)
In-Reply-To: <1455058435-8716-6-git-send-email-Suravee.Suthikulpanit@amd.com>

On Tue, Feb 09, 2016 at 04:53:55PM -0600, Suravee Suthikulpanit wrote:
> The current amd_iommu_pc_get_set_reg_val() does not support muli-IOMMU
> system. This patch replace amd_iommu_pc_get_set_reg_val() with
> amd_iommu_pc_set_reg_val() and amd_iommu_pc_[set|get]_cnt_vals().
> 
> Also, the current struct hw_perf_event.prev_count can only store the
> previous counter value only from one IOMMU. So, this patch introduces
> a new data structure, perf_amd_iommu.prev_cnts, to track previous value
> of each performance counter of each bank of each IOMMU.
> 
> Last, this patch introduce perf_iommu_cnts to temporary hold counter
> values from each IOMMU for internal use when manages counters among
> multiple IOMMUs.
> 
> Note that this implementation makes an assumption that the counters
> on all IOMMUs will be programmed the same way (i.e with the same events).
> 
> Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
> ---
>  arch/x86/kernel/cpu/perf_event_amd_iommu.c | 125 +++++++++++++++++++++--------
>  drivers/iommu/amd_iommu_init.c             |  87 +++++++++++++++++---
>  include/linux/perf/perf_event_amd_iommu.h  |   8 +-
>  3 files changed, 174 insertions(+), 46 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
> index 791bbcf..ce6ba3f 100644
> --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
> +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
> @@ -42,6 +42,12 @@ struct perf_amd_iommu {
>  	u64 cntr_assign_mask;
>  	raw_spinlock_t lock;
>  	const struct attribute_group *attr_groups[4];
> +
> +	/* This is a 3D array used to store the previous count values
> +	 * from each performance counter of each bank of each IOMMU.
> +	 * I.E. size of array = (num iommus * num banks * num counters)
> +	 */
> +	local64_t *prev_cnts;
>  };
>  
>  #define format_group	attr_groups[0]
> @@ -121,6 +127,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
>  	{ /* end: all zeroes */ },
>  };
>  
> +/* This is an array used to temporary hold the current values
> + * read from a particular perf counter from each IOMMU.
> + */
> +static u64 *perf_iommu_cnts;
> +
>  /*---------------------------------------------
>   * sysfs cpumask attributes
>   *---------------------------------------------*/
> @@ -256,44 +267,46 @@ static void perf_iommu_enable_event(struct perf_event *ev)
>  	u64 reg = 0ULL;
>  
>  	reg = csource;
> -	amd_iommu_pc_get_set_reg_val(devid,
> +	amd_iommu_pc_set_reg_val(devid,
>  			_GET_BANK(ev), _GET_CNTR(ev) ,
> -			 IOMMU_PC_COUNTER_SRC_REG, &reg, true);
> +			 IOMMU_PC_COUNTER_SRC_REG, &reg);

Read-impairing indentation. It should be aligned at the opening brace:

	function_name(param1, param2,
		      param3, ...

Ditto for the calls below.

>  
>  	reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);

0ULL?

Is this supposed to clear the old value from the previous read above?

What's wrong with

	reg = devid | (_GET_DEVID_MASK(ev) << 32);

?

Same for the rest.

>  	if (reg)
>  		reg |= (1UL << 31);

All those can be turned into

		reg |= BIT(31);

> -	amd_iommu_pc_get_set_reg_val(devid,
> +	amd_iommu_pc_set_reg_val(devid,
>  			_GET_BANK(ev), _GET_CNTR(ev) ,
> -			 IOMMU_PC_DEVID_MATCH_REG, &reg, true);
> +			 IOMMU_PC_DEVID_MATCH_REG, &reg);
>  
>  	reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
>  	if (reg)
>  		reg |= (1UL << 31);
> -	amd_iommu_pc_get_set_reg_val(devid,
> +	amd_iommu_pc_set_reg_val(devid,
>  			_GET_BANK(ev), _GET_CNTR(ev) ,
> -			 IOMMU_PC_PASID_MATCH_REG, &reg, true);
> +			 IOMMU_PC_PASID_MATCH_REG, &reg);
>  
>  	reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
>  	if (reg)
>  		reg |= (1UL << 31);
> -	amd_iommu_pc_get_set_reg_val(devid,
> +	amd_iommu_pc_set_reg_val(devid,
>  			_GET_BANK(ev), _GET_CNTR(ev) ,
> -			 IOMMU_PC_DOMID_MATCH_REG, &reg, true);
> +			 IOMMU_PC_DOMID_MATCH_REG, &reg);
>  }
>  
>  static void perf_iommu_disable_event(struct perf_event *event)
>  {
>  	u64 reg = 0ULL;
>  
> -	amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
> +	amd_iommu_pc_set_reg_val(_GET_DEVID(event),
>  			_GET_BANK(event), _GET_CNTR(event),
> -			IOMMU_PC_COUNTER_SRC_REG, &reg, true);
> +			IOMMU_PC_COUNTER_SRC_REG, &reg);
>  }
>  
>  static void perf_iommu_start(struct perf_event *event, int flags)
>  {
>  	struct hw_perf_event *hwc = &event->hw;
> +	struct perf_amd_iommu *perf_iommu =
> +			container_of(event->pmu, struct perf_amd_iommu, pmu);

There's no need to make it less readable and still fit within the 80
cols rule. Feel free to relax it a little.

>  	pr_debug("perf: amd_iommu:perf_iommu_start\n");
>  	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
> @@ -303,10 +316,19 @@ static void perf_iommu_start(struct perf_event *event, int flags)
>  	hwc->state = 0;
>  
>  	if (flags & PERF_EF_RELOAD) {
> -		u64 prev_raw_count =  local64_read(&hwc->prev_count);
> -		amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
> -				_GET_BANK(event), _GET_CNTR(event),
> -				IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
> +		int i;
> +
> +		for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
> +			int index = get_iommu_bnk_cnt_evt_idx(perf_iommu, i,
> +					  _GET_BANK(event), _GET_CNTR(event));

No need for the line break.

> +
> +			perf_iommu_cnts[i] = local64_read(
> +						&perf_iommu->prev_cnts[index]);

Yuck. What's wrong with:

			perf_iommu_cnts[i] = local64_read(&perf_iommu->prev_cnts[index]);

?

It looks much better to me.

> +		}
> +
> +		amd_iommu_pc_set_cnt_vals(_GET_BANK(event), _GET_CNTR(event),
> +					  amd_iommu_get_num_iommus(),
> +					  perf_iommu_cnts);

This one looks good.

>  	}
>  
>  	perf_iommu_enable_event(event);
> @@ -316,29 +338,47 @@ static void perf_iommu_start(struct perf_event *event, int flags)
>  
>  static void perf_iommu_read(struct perf_event *event)
>  {
> -	u64 count = 0ULL;
> -	u64 prev_raw_count = 0ULL;
> +	int i;
>  	u64 delta = 0ULL;
>  	struct hw_perf_event *hwc = &event->hw;
> -	pr_debug("perf: amd_iommu:perf_iommu_read\n");
> -
> -	amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
> -				_GET_BANK(event), _GET_CNTR(event),
> -				IOMMU_PC_COUNTER_REG, &count, false);
> +	struct perf_amd_iommu *perf_iommu =
> +			container_of(event->pmu, struct perf_amd_iommu, pmu);

Indentation.

>  
> -	/* IOMMU pc counter register is only 48 bits */
> -	count &= 0xFFFFFFFFFFFFULL;
> +	pr_debug("perf: amd_iommu:perf_iommu_read\n");

What is that debug statement good for? Can it go?

>From looking at that file, it has a bunch more which are completely
useless and can be deleted. In a separate patch.

> -	prev_raw_count =  local64_read(&hwc->prev_count);
> -	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
> -					count) != prev_raw_count)
> +	if (amd_iommu_pc_get_cnt_vals(_GET_BANK(event), _GET_CNTR(event),
> +				      amd_iommu_get_num_iommus(),
> +				      perf_iommu_cnts))
>  		return;
>  
> -	/* Handling 48-bit counter overflowing */
> -	delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
> -	delta >>= COUNTER_SHIFT;
> -	local64_add(delta, &event->count);
> -
> +	/* Now we re-aggregating event counts and prev-counts
> +	 * from all IOMMUs
> +	 */

Kernel comment style:

	/*
	 * Start of first sentence...
	 * Second sentence...
	 * ...
	 */

> +	local64_set(&hwc->prev_count, 0);
> +
> +	for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
> +		int indx = get_iommu_bnk_cnt_evt_idx(perf_iommu, i,
> +					  _GET_BANK(event), _GET_CNTR(event));

Indentation.

> +		u64 prev_raw_count = local64_read(&perf_iommu->prev_cnts[indx]);
> +
> +		/* IOMMU pc counter register is only 48 bits */
> +		perf_iommu_cnts[i] &= 0xFFFFFFFFFFFFULL;

				   &= GENMASK_ULL(48, 0)

> +
> +		/*
> +		 * Since we do not enable counter overflow interrupts,
> +		 * we do not have to worry about prev_count changing on us
> +		 */
> +		local64_set(&perf_iommu->prev_cnts[indx],
> +				perf_iommu_cnts[i]);

No need to break it.

> +
> +		local64_add(prev_raw_count, &hwc->prev_count);
> +
> +		/* Handling 48-bit counter overflowing */

		/* Handle 48-bit counter overflow: */

reads better.

> +		delta = (perf_iommu_cnts[i] << COUNTER_SHIFT) -
> +				(prev_raw_count << COUNTER_SHIFT);

No need for the linebreak.

> +		delta >>= COUNTER_SHIFT;
> +		local64_add(delta, &event->count);
> +	}
>  }
>  
>  static void perf_iommu_stop(struct perf_event *event, int flags)
> @@ -428,10 +468,14 @@ static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
>  
>  static __init void amd_iommu_pc_exit(void)
>  {
> -	if (__perf_iommu.events_group != NULL) {
> -		kfree(__perf_iommu.events_group);
> -		__perf_iommu.events_group = NULL;
> -	}
> +	kfree(__perf_iommu.events_group);
> +	__perf_iommu.events_group = NULL;
> +
> +	kfree(__perf_iommu.prev_cnts);
> +	__perf_iommu.prev_cnts = NULL;
> +
> +	kfree(perf_iommu_cnts);
> +	perf_iommu_cnts = NULL;
>  }
>  
>  static __init int _init_perf_amd_iommu(
> @@ -461,6 +505,17 @@ static __init int _init_perf_amd_iommu(
>  	perf_iommu->null_group = NULL;
>  	perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
>  
> +	perf_iommu->prev_cnts = kzalloc(sizeof(*perf_iommu->prev_cnts) *
> +		(amd_iommu_get_num_iommus() * perf_iommu->max_banks *
> +		perf_iommu->max_counters), GFP_KERNEL);

Indentation.

	perf_iommu->prev_cnts = kzalloc(sizeof(*perf_iommu->prev_cnts) *
					amd_iommu_get_num_iommus() *
					perf_iommu->max_banks *
					perf_iommu->max_counters), GFP_KERNEL);

looks more readable to me.

> +	if (!perf_iommu->prev_cnts)
> +		return -ENOMEM;
> +
> +	perf_iommu_cnts = kzalloc(sizeof(*perf_iommu_cnts) *
> +				  amd_iommu_get_num_iommus(), GFP_KERNEL);

No need for the linebreak.

> +	if (!perf_iommu_cnts)
> +		return -ENOMEM;
> +
>  	ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
>  	if (ret) {
>  		pr_err("perf: amd_iommu: Failed to initialized.\n");

You can define pr_fmt to "perf/amd_iommu: " at the beginning of this
file and then you don't need to supply that prefix each time you call
pr_*

And that sentence above it wrong. It should be:

	pr_err("Error initializing AMD IOMMU perf counters.\n");

or something like that.

> diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
> index 531b2e1..7b1b561 100644
> --- a/drivers/iommu/amd_iommu_init.c
> +++ b/drivers/iommu/amd_iommu_init.c
> @@ -1133,6 +1133,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
>  	return 0;
>  }
>  
> +static int _amd_iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
> +					 u8 bank, u8 cntr, u8 fxn,
> +					 u64 *value, bool is_write);

static int _amd_iommu_this_func_name_is_def_too_long

>  
>  static void init_iommu_perf_ctr(struct amd_iommu *iommu)
>  {
> @@ -1144,8 +1147,8 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu)
>  	amd_iommu_pc_present = true;
>  
>  	/* Check if the performance counters can be written to */
> -	if ((0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val, true)) ||
> -	    (0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val2, false)) ||
> +	if ((_amd_iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val, true)) ||
> +	    (_amd_iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val2, false)) ||
>  	    (val != val2)) {
>  		pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n");
>  		amd_iommu_pc_present = false;
> @@ -2294,10 +2297,10 @@ u8 amd_iommu_pc_get_max_counters(void)
>  }
>  EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
>  
> -int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
> -				    u64 *value, bool is_write)
> +static int _amd_iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
> +					 u8 bank, u8 cntr, u8 fxn,
> +					 u64 *value, bool is_write)

Ditto.

>  {
> -	struct amd_iommu *iommu;
>  	u32 offset;
>  	u32 max_offset_lim;
>  
> @@ -2305,9 +2308,6 @@ int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
>  	if (!amd_iommu_pc_present)
>  		return -ENODEV;
>  
> -	/* Locate the iommu associated with the device ID */
> -	iommu = amd_iommu_rlookup_table[devid];
> -
>  	/* Check for valid iommu and pc register indexing */
>  	if (WARN_ON((iommu == NULL) || (fxn > 0x28) || (fxn & 7)))
>  		return -ENODEV;
> @@ -2332,4 +2332,73 @@ int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
>  
>  	return 0;
>  }
> -EXPORT_SYMBOL(amd_iommu_pc_get_set_reg_val);
> +
> +int amd_iommu_pc_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn, u64 *value)
> +{
> +	struct amd_iommu *iommu;
> +
> +	for_each_iommu(iommu) {
> +		int ret = _amd_iommu_pc_get_set_reg_val(iommu, bank, cntr,
> +							fxn, value, true);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(amd_iommu_pc_set_reg_val);

Why isn't this EXPORT_SYMBOL_GPL?

> +
> +int amd_iommu_pc_set_cnt_vals(u8 bank, u8 cntr, int num, u64 *value)

That whole naming is hard to read. What's wrong with

amd_iommu_set_counters()

?

> +{
> +	struct amd_iommu *iommu;
> +	int i = 0;
> +
> +	if (num > amd_iommus_present)
> +		return -EINVAL;
> +
> +	for_each_iommu(iommu) {
> +		int ret = _amd_iommu_pc_get_set_reg_val(iommu, bank, cntr,
> +							IOMMU_PC_COUNTER_REG,
> +							&value[i], true);
> +		if (ret)
> +			return ret;
> +		if (i++ == amd_iommus_present)
> +			break;
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(amd_iommu_pc_set_cnt_vals);

Ditto, why not EXPORT_SYMBOL_GPL ?

> +
> +int amd_iommu_pc_get_cnt_vals(u8 bank, u8 cntr, int num, u64 *value)
> +{
> +	struct amd_iommu *iommu;
> +	int i = 0, ret;
> +
> +	if (!num)
> +		return -EINVAL;
> +
> +	/*
> +	 * Here, we read the specified counters on all IOMMU,

Plural is IOMMUs ?

> +	 * which should have been programmed the same way.
> +	 * and aggregate the counter values.

That comment is a sentence with a fullstop in the middle.

> +	 */
> +	for_each_iommu(iommu) {
> +		u64 tmp;
> +
> +		if (i >= num)
> +			return -EINVAL;
> +
> +		ret = _amd_iommu_pc_get_set_reg_val(iommu, bank, cntr,
> +							IOMMU_PC_COUNTER_REG,
> +							&tmp, false);
> +		if (ret)
> +			return ret;

So if one of those intermediary calls of _amd_iommu_pc_get_set_reg_val()
fails, we return here not having read all counters. Unwind and handle
that error properly maybe?

Same issue above.

> +
> +		/* IOMMU pc counter register is only 48 bits */
> +		value[i] = tmp & 0xFFFFFFFFFFFFULL;
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(amd_iommu_pc_get_cnt_vals);
> diff --git a/include/linux/perf/perf_event_amd_iommu.h b/include/linux/perf/perf_event_amd_iommu.h
> index cb820c2..be1a17d 100644
> --- a/include/linux/perf/perf_event_amd_iommu.h
> +++ b/include/linux/perf/perf_event_amd_iommu.h
> @@ -33,7 +33,11 @@ extern u8 amd_iommu_pc_get_max_banks(void);
>  
>  extern u8 amd_iommu_pc_get_max_counters(void);
>  
> -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
> -			u8 fxn, u64 *value, bool is_write);
> +extern int amd_iommu_pc_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
> +				    u64 *value);
> +
> +extern int amd_iommu_pc_set_cnt_vals(u8 bank, u8 cntr, int num, u64 *value);
> +
> +extern int amd_iommu_pc_get_cnt_vals(u8 bank, u8 cntr, int num, u64 *value);

No need for that "extern"

>  #endif /*_PERF_EVENT_AMD_IOMMU_H_*/
> -- 
> 2.5.0
> 
> 

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.