linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes
@ 2021-01-08 15:35 Steve Wahl
  2021-01-08 15:35 ` [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id Steve Wahl
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Steve Wahl @ 2021-01-08 15:35 UTC (permalink / raw)
  To: Steve Wahl, rja_direct, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Borislav Petkov, x86,
	H. Peter Anvin, linux-kernel
  Cc: Liang, Kan

For Intel uncore, the registers being used to identify the die don't
contain enough bits to uniquely identify more than 8 dies.  On
systems with more than 8 dies, this results in error messages of the
form "skx_uncore: probe of XXXX:XX:XX.X failed with error -22", and
some perf counters showing up as "<not supported>".

On such systems, use NUMA information to determine die id.

Continue to use the register information with 8 or fewer numa nodes to
cover cases like NUMA not being enabled.

The first patch moves translation from physical to logical die id
earlier in the code, and stores only the logical id.  The logical id
is the only one that is really used.  Without this change the second
patch would have to store both physical and logical id, which was much
more complicated.

The second patch adds the alternative of deriving the logical die id
from the NUMA information when there are more than 8 nodes.

Steve Wahl (2):
  perf/x86/intel/uncore: Store the logical die id instead of the
    physical die id.
  perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA
    info

 arch/x86/events/intel/uncore.c       |  58 +++++---------
 arch/x86/events/intel/uncore.h       |   5 +-
 arch/x86/events/intel/uncore_snb.c   |   2 +-
 arch/x86/events/intel/uncore_snbep.c | 114 ++++++++++++++++++---------
 4 files changed, 99 insertions(+), 80 deletions(-)

-- 
2.26.2


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id.
  2021-01-08 15:35 [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Steve Wahl
@ 2021-01-08 15:35 ` Steve Wahl
  2021-01-14 11:29   ` [tip: perf/core] " tip-bot2 for Steve Wahl
  2021-01-08 15:35 ` [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info Steve Wahl
  2021-01-11 16:39 ` [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Liang, Kan
  2 siblings, 1 reply; 10+ messages in thread
From: Steve Wahl @ 2021-01-08 15:35 UTC (permalink / raw)
  To: Steve Wahl, rja_direct, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Borislav Petkov, x86,
	H. Peter Anvin, linux-kernel
  Cc: Liang, Kan

The phys_id isn't really used other than to map to a logical die id.
Calculate the logical die id earlier, and store that instead of the
phys_id.

Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
---
 arch/x86/events/intel/uncore.c       | 58 ++++++++++------------------
 arch/x86/events/intel/uncore.h       |  5 +--
 arch/x86/events/intel/uncore_snb.c   |  2 +-
 arch/x86/events/intel/uncore_snbep.c | 31 +++++++--------
 4 files changed, 39 insertions(+), 57 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f82dc8..33c8180d5a87 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =
 
 MODULE_LICENSE("GPL");
 
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
 {
 	struct pci2phy_map *map;
-	int phys_id = -1;
+	int die_id = -1;
 
 	raw_spin_lock(&pci2phy_map_lock);
 	list_for_each_entry(map, &pci2phy_map_head, list) {
 		if (map->segment == pci_domain_nr(bus)) {
-			phys_id = map->pbus_to_physid[bus->number];
+			die_id = map->pbus_to_dieid[bus->number];
 			break;
 		}
 	}
 	raw_spin_unlock(&pci2phy_map_lock);
 
-	return phys_id;
+	return die_id;
 }
 
 static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ struct pci2phy_map *__find_pci2phy_map(int segment)
 	alloc = NULL;
 	map->segment = segment;
 	for (i = 0; i < 256; i++)
-		map->pbus_to_physid[i] = -1;
+		map->pbus_to_dieid[i] = -1;
 	list_add_tail(&map->list, &pci2phy_map_head);
 
 end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
 
 	uncore_pmu_init_hrtimer(box);
 	box->cpu = -1;
-	box->pci_phys_id = -1;
 	box->dieid = -1;
 
 	/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
 /*
  * Get the die information of a PCI device.
  * @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
  * @die: The die id which the device maps to.
  */
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
-				       int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
 {
-	*phys_id = uncore_pcibus_to_physid(pdev->bus);
-	if (*phys_id < 0)
-		return -ENODEV;
-
-	*die = (topology_max_die_per_package() > 1) ? *phys_id :
-				topology_phys_to_logical_pkg(*phys_id);
+	*die = uncore_pcibus_to_dieid(pdev->bus);
 	if (*die < 0)
 		return -EINVAL;
 
@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
  * @pdev: The PCI device.
  * @type: The corresponding PMU type of the device.
  * @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
  * @die: The die id which the device maps to.
  */
 static int uncore_pci_pmu_register(struct pci_dev *pdev,
 				   struct intel_uncore_type *type,
 				   struct intel_uncore_pmu *pmu,
-				   int phys_id, int die)
+				   int die)
 {
 	struct intel_uncore_box *box;
 	int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
 		WARN_ON_ONCE(pmu->func_id != pdev->devfn);
 
 	atomic_inc(&box->refcnt);
-	box->pci_phys_id = phys_id;
 	box->dieid = die;
 	box->pci_dev = pdev;
 	box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 {
 	struct intel_uncore_type *type;
 	struct intel_uncore_pmu *pmu = NULL;
-	int phys_id, die, ret;
+	int die, ret;
 
-	ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+	ret = uncore_pci_get_dev_die_info(pdev, &die);
 	if (ret)
 		return ret;
 
@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 		pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
 	}
 
-	ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+	ret = uncore_pci_pmu_register(pdev, type, pmu, die);
 
 	pci_set_drvdata(pdev, pmu->boxes[die]);
 
@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 /*
  * Unregister the PMU of a PCI device
  * @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
  * @die: The die id which the device maps to.
  */
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
-				      int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
 {
 	struct intel_uncore_box *box = pmu->boxes[die];
 
-	if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
-		return;
-
 	pmu->boxes[die] = NULL;
 	if (atomic_dec_return(&pmu->activeboxes) == 0)
 		uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
 	struct intel_uncore_box *box;
 	struct intel_uncore_pmu *pmu;
-	int i, phys_id, die;
+	int i, die;
 
-	if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+	if (uncore_pci_get_dev_die_info(pdev, &die))
 		return;
 
 	box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 
 	pci_set_drvdata(pdev, NULL);
 
-	uncore_pci_pmu_unregister(pmu, phys_id, die);
+	uncore_pci_pmu_unregister(pmu, die);
 }
 
 static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
 	struct device *dev = data;
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct intel_uncore_pmu *pmu;
-	int phys_id, die;
+	int die;
 
 	/* Unregister the PMU when the device is going to be deleted. */
 	if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
 	if (!pmu)
 		return NOTIFY_DONE;
 
-	if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+	if (uncore_pci_get_dev_die_info(pdev, &die))
 		return NOTIFY_DONE;
 
-	uncore_pci_pmu_unregister(pmu, phys_id, die);
+	uncore_pci_pmu_unregister(pmu, die);
 
 	return NOTIFY_OK;
 }
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
 	struct pci_dev *pci_sub_dev;
 	bool notify = false;
 	unsigned int devfn;
-	int phys_id, die;
+	int die;
 
 	while (ids && ids->vendor) {
 		pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
 			if (!pmu)
 				continue;
 
-			if (uncore_pci_get_dev_die_info(pci_sub_dev,
-							&phys_id, &die))
+			if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
 				continue;
 
 			if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
-						     phys_id, die))
+						     die))
 				notify = true;
 		}
 		ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea154349d..a3c6e1643ad2 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
 };
 
 struct intel_uncore_box {
-	int pci_phys_id;
 	int dieid;	/* Logical die ID */
 	int n_active;	/* number of active events */
 	int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
 struct pci2phy_map {
 	struct list_head list;
 	int segment;
-	int pbus_to_physid[256];
+	int pbus_to_dieid[256];
 };
 
 struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
 
 ssize_t uncore_event_show(struct device *dev,
 			  struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893e2e22..51271288499e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
 		pci_dev_put(dev);
 		return -ENOMEM;
 	}
-	map->pbus_to_physid[bus] = 0;
+	map->pbus_to_dieid[bus] = 0;
 	raw_spin_unlock(&pci2phy_map_lock);
 
 	pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb1821215d..2d7014dc46f6 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
 static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
 {
 	struct pci_dev *ubox_dev = NULL;
-	int i, bus, nodeid, segment;
+	int i, bus, nodeid, segment, die_id;
 	struct pci2phy_map *map;
 	int err = 0;
 	u32 config = 0;
@@ -1395,7 +1395,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
 		 */
 		for (i = 0; i < 8; i++) {
 			if (nodeid == ((config >> (3 * i)) & 0x7)) {
-				map->pbus_to_physid[bus] = i;
+				if (topology_max_die_per_package() > 1)
+					die_id = i;
+				else
+					die_id = topology_phys_to_logical_pkg(i);
+				map->pbus_to_dieid[bus] = die_id;
 				break;
 			}
 		}
@@ -1412,17 +1416,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
 			i = -1;
 			if (reverse) {
 				for (bus = 255; bus >= 0; bus--) {
-					if (map->pbus_to_physid[bus] >= 0)
-						i = map->pbus_to_physid[bus];
+					if (map->pbus_to_dieid[bus] >= 0)
+						i = map->pbus_to_dieid[bus];
 					else
-						map->pbus_to_physid[bus] = i;
+						map->pbus_to_dieid[bus] = i;
 				}
 			} else {
 				for (bus = 0; bus <= 255; bus++) {
-					if (map->pbus_to_physid[bus] >= 0)
-						i = map->pbus_to_physid[bus];
+					if (map->pbus_to_dieid[bus] >= 0)
+						i = map->pbus_to_dieid[bus];
 					else
-						map->pbus_to_physid[bus] = i;
+						map->pbus_to_dieid[bus] = i;
 				}
 			}
 		}
@@ -4646,19 +4650,14 @@ int snr_uncore_pci_init(void)
 static struct pci_dev *snr_uncore_get_mc_dev(int id)
 {
 	struct pci_dev *mc_dev = NULL;
-	int phys_id, pkg;
+	int pkg;
 
 	while (1) {
 		mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
 		if (!mc_dev)
 			break;
-		phys_id = uncore_pcibus_to_physid(mc_dev->bus);
-		if (phys_id < 0)
-			continue;
-		pkg = topology_phys_to_logical_pkg(phys_id);
-		if (pkg < 0)
-			continue;
-		else if (pkg == id)
+		pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+		if (pkg == id)
 			break;
 	}
 	return mc_dev;
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info
  2021-01-08 15:35 [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Steve Wahl
  2021-01-08 15:35 ` [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id Steve Wahl
@ 2021-01-08 15:35 ` Steve Wahl
  2021-01-11 13:00   ` Peter Zijlstra
  2021-01-14 11:29   ` [tip: perf/core] " tip-bot2 for Steve Wahl
  2021-01-11 16:39 ` [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Liang, Kan
  2 siblings, 2 replies; 10+ messages in thread
From: Steve Wahl @ 2021-01-08 15:35 UTC (permalink / raw)
  To: Steve Wahl, rja_direct, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Borislav Petkov, x86,
	H. Peter Anvin, linux-kernel
  Cc: Liang, Kan

The registers used to determine which die a pci bus belongs to don't
contain enough information to uniquely specify more than 8 dies, so
when more than 8 dies are present, use NUMA information instead.

Continue to use the previous method for 8 or fewer because it
works there, and covers cases of NUMA being disabled.

Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
---
 arch/x86/events/intel/uncore_snbep.c | 93 +++++++++++++++++++---------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2d7014dc46f6..b79951d0707c 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1370,40 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
 		if (!ubox_dev)
 			break;
 		bus = ubox_dev->bus->number;
-		/* get the Node ID of the local register */
-		err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
-		if (err)
-			break;
-		nodeid = config & NODE_ID_MASK;
-		/* get the Node ID mapping */
-		err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
-		if (err)
-			break;
+		/*
+		 * The nodeid and idmap registers only contain enough
+		 * information to handle 8 nodes.  On systems with more
+		 * than 8 nodes, we need to rely on NUMA information,
+		 * filled in from BIOS supplied information, to determine
+		 * the topology.
+		 */
+		if (nr_node_ids <= 8) {
+			/* get the Node ID of the local register */
+			err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+			if (err)
+				break;
+			nodeid = config & NODE_ID_MASK;
+			/* get the Node ID mapping */
+			err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+			if (err)
+				break;
 
-		segment = pci_domain_nr(ubox_dev->bus);
-		raw_spin_lock(&pci2phy_map_lock);
-		map = __find_pci2phy_map(segment);
-		if (!map) {
+			segment = pci_domain_nr(ubox_dev->bus);
+			raw_spin_lock(&pci2phy_map_lock);
+			map = __find_pci2phy_map(segment);
+			if (!map) {
+				raw_spin_unlock(&pci2phy_map_lock);
+				err = -ENOMEM;
+				break;
+			}
+
+			/*
+			 * every three bits in the Node ID mapping register maps
+			 * to a particular node.
+			 */
+			for (i = 0; i < 8; i++) {
+				if (nodeid == ((config >> (3 * i)) & 0x7)) {
+					if (topology_max_die_per_package() > 1)
+						die_id = i;
+					else
+						die_id = topology_phys_to_logical_pkg(i);
+					map->pbus_to_dieid[bus] = die_id;
+					break;
+				}
+			}
 			raw_spin_unlock(&pci2phy_map_lock);
-			err = -ENOMEM;
-			break;
-		}
+		} else {
+			int node = pcibus_to_node(ubox_dev->bus);
+			int cpu;
+
+			segment = pci_domain_nr(ubox_dev->bus);
+			raw_spin_lock(&pci2phy_map_lock);
+			map = __find_pci2phy_map(segment);
+			if (!map) {
+				raw_spin_unlock(&pci2phy_map_lock);
+				err = -ENOMEM;
+				break;
+			}
 
-		/*
-		 * every three bits in the Node ID mapping register maps
-		 * to a particular node.
-		 */
-		for (i = 0; i < 8; i++) {
-			if (nodeid == ((config >> (3 * i)) & 0x7)) {
-				if (topology_max_die_per_package() > 1)
-					die_id = i;
-				else
-					die_id = topology_phys_to_logical_pkg(i);
-				map->pbus_to_dieid[bus] = die_id;
+			die_id = -1;
+			for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+				struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+				if (c->initialized && cpu_to_node(cpu) == node) {
+					map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+					break;
+				}
+			}
+			raw_spin_unlock(&pci2phy_map_lock);
+
+			if (WARN_ON_ONCE(die_id == -1)) {
+				err = -EINVAL;
 				break;
 			}
 		}
-		raw_spin_unlock(&pci2phy_map_lock);
 	}
 
 	if (!err) {
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info
  2021-01-08 15:35 ` [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info Steve Wahl
@ 2021-01-11 13:00   ` Peter Zijlstra
  2021-01-11 15:45     ` Steve Wahl
  2021-01-14 11:29   ` [tip: perf/core] " tip-bot2 for Steve Wahl
  1 sibling, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-01-11 13:00 UTC (permalink / raw)
  To: Steve Wahl
  Cc: rja_direct, Ingo Molnar, Arnaldo Carvalho de Melo, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, Thomas Gleixner,
	Borislav Petkov, x86, H. Peter Anvin, linux-kernel, Liang, Kan

On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:


> +		/*
> +		 * The nodeid and idmap registers only contain enough
> +		 * information to handle 8 nodes.  On systems with more
> +		 * than 8 nodes, we need to rely on NUMA information,
> +		 * filled in from BIOS supplied information, to determine
> +		 * the topology.
> +		 */

Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
bonghits qualitee :/

> +		if (nr_node_ids <= 8) {
> +			/* get the Node ID of the local register */
> +			err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
> +			if (err)
> +				break;
> +			nodeid = config & NODE_ID_MASK;
> +			/* get the Node ID mapping */
> +			err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
> +			if (err)
> +				break;
>  
> +			segment = pci_domain_nr(ubox_dev->bus);
> +			raw_spin_lock(&pci2phy_map_lock);
> +			map = __find_pci2phy_map(segment);
> +			if (!map) {
> +				raw_spin_unlock(&pci2phy_map_lock);
> +				err = -ENOMEM;
> +				break;
> +			}
> +
> +			/*
> +			 * every three bits in the Node ID mapping register maps
> +			 * to a particular node.
> +			 */
> +			for (i = 0; i < 8; i++) {
> +				if (nodeid == ((config >> (3 * i)) & 0x7)) {
> +					if (topology_max_die_per_package() > 1)
> +						die_id = i;
> +					else
> +						die_id = topology_phys_to_logical_pkg(i);
> +					map->pbus_to_dieid[bus] = die_id;
> +					break;
> +				}
> +			}
>  			raw_spin_unlock(&pci2phy_map_lock);
> +		} else {
> +			int node = pcibus_to_node(ubox_dev->bus);
> +			int cpu;
> +
> +			segment = pci_domain_nr(ubox_dev->bus);
> +			raw_spin_lock(&pci2phy_map_lock);
> +			map = __find_pci2phy_map(segment);
> +			if (!map) {
> +				raw_spin_unlock(&pci2phy_map_lock);
> +				err = -ENOMEM;
> +				break;
> +			}
> +			die_id = -1;
> +			for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
> +				struct cpuinfo_x86 *c = &cpu_data(cpu);
> +
> +				if (c->initialized && cpu_to_node(cpu) == node) {
> +					map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
> +					break;
> +				}
> +			}
> +			raw_spin_unlock(&pci2phy_map_lock);
> +
> +			if (WARN_ON_ONCE(die_id == -1)) {
> +				err = -EINVAL;
>  				break;
>  			}

This seems to assume a single die per node; is that fundemantally
correct?

Did you consider malicious BIOS data? I think we're good, but I didn't
look too hard.

>  		}
>  	}
>  
>  	if (!err) {
> -- 
> 2.26.2
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info
  2021-01-11 13:00   ` Peter Zijlstra
@ 2021-01-11 15:45     ` Steve Wahl
  2021-01-12 15:07       ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Steve Wahl @ 2021-01-11 15:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steve Wahl, rja_direct, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
	Thomas Gleixner, Borislav Petkov, x86, H. Peter Anvin,
	linux-kernel, Liang, Kan

On Mon, Jan 11, 2021 at 02:00:33PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:
> 
> 
> > +		/*
> > +		 * The nodeid and idmap registers only contain enough
> > +		 * information to handle 8 nodes.  On systems with more
> > +		 * than 8 nodes, we need to rely on NUMA information,
> > +		 * filled in from BIOS supplied information, to determine
> > +		 * the topology.
> > +		 */
> 
> Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
> bonghits qualitee :/

I work too close to BIOS people (virtually, at least for the moment)
to safely make disparaging remarks. :-) While the origin is the BIOS,
I'm using pieces that were already being pulled from the BIOS tables
for NUMA purposes.  

> > +		if (nr_node_ids <= 8) {
> > +			/* get the Node ID of the local register */
> > +			err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
> > +			if (err)
> > +				break;
> > +			nodeid = config & NODE_ID_MASK;
> > +			/* get the Node ID mapping */
> > +			err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
> > +			if (err)
> > +				break;
> >  
> > +			segment = pci_domain_nr(ubox_dev->bus);
> > +			raw_spin_lock(&pci2phy_map_lock);
> > +			map = __find_pci2phy_map(segment);
> > +			if (!map) {
> > +				raw_spin_unlock(&pci2phy_map_lock);
> > +				err = -ENOMEM;
> > +				break;
> > +			}
> > +
> > +			/*
> > +			 * every three bits in the Node ID mapping register maps
> > +			 * to a particular node.
> > +			 */
> > +			for (i = 0; i < 8; i++) {
> > +				if (nodeid == ((config >> (3 * i)) & 0x7)) {
> > +					if (topology_max_die_per_package() > 1)
> > +						die_id = i;
> > +					else
> > +						die_id = topology_phys_to_logical_pkg(i);
> > +					map->pbus_to_dieid[bus] = die_id;
> > +					break;
> > +				}
> > +			}
> >  			raw_spin_unlock(&pci2phy_map_lock);
> > +		} else {
> > +			int node = pcibus_to_node(ubox_dev->bus);
> > +			int cpu;
> > +
> > +			segment = pci_domain_nr(ubox_dev->bus);
> > +			raw_spin_lock(&pci2phy_map_lock);
> > +			map = __find_pci2phy_map(segment);
> > +			if (!map) {
> > +				raw_spin_unlock(&pci2phy_map_lock);
> > +				err = -ENOMEM;
> > +				break;
> > +			}
> > +			die_id = -1;
> > +			for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
> > +				struct cpuinfo_x86 *c = &cpu_data(cpu);
> > +
> > +				if (c->initialized && cpu_to_node(cpu) == node) {
> > +					map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
> > +					break;
> > +				}
> > +			}
> > +			raw_spin_unlock(&pci2phy_map_lock);
> > +
> > +			if (WARN_ON_ONCE(die_id == -1)) {
> > +				err = -EINVAL;
> >  				break;
> >  			}
> 
> This seems to assume a single die per node; is that fundemantally
> correct?

It should work for one or more nodes per die; i.e. sub-NUMA clustering
should work.  If there are any systems with fewer nodes than dies
(more than one die in a NUMA node) it will likely fail.  It's not
clear to me whether nodes < dies is a possibility or not; however,
note that this situation would be broken with or without my changes.

> Did you consider malicious BIOS data? I think we're good, but I didn't
> look too hard.

I did not consider malicious BIOS data.  With quick thought toward it,
I believe the worst that could happen is the counters get associated
with the wrong die, and only under circumstances where the previous
code would have aborted mapping the counters to dies completely (which
it does when there are more than 8 dies).

Thank you for taking the time to look at this!

--> Steve

-- 
Steve Wahl, Hewlett Packard Enterprise

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes
  2021-01-08 15:35 [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Steve Wahl
  2021-01-08 15:35 ` [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id Steve Wahl
  2021-01-08 15:35 ` [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info Steve Wahl
@ 2021-01-11 16:39 ` Liang, Kan
  2 siblings, 0 replies; 10+ messages in thread
From: Liang, Kan @ 2021-01-11 16:39 UTC (permalink / raw)
  To: Steve Wahl, rja_direct, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Thomas Gleixner, Borislav Petkov, x86,
	H. Peter Anvin, linux-kernel
  Cc: Liang, Kan



On 1/8/2021 10:35 AM, Steve Wahl wrote:
> For Intel uncore, the registers being used to identify the die don't
> contain enough bits to uniquely identify more than 8 dies.  On
> systems with more than 8 dies, this results in error messages of the
> form "skx_uncore: probe of XXXX:XX:XX.X failed with error -22", and
> some perf counters showing up as "<not supported>".
> 
> On such systems, use NUMA information to determine die id.
> 
> Continue to use the register information with 8 or fewer numa nodes to
> cover cases like NUMA not being enabled.
> 
> The first patch moves translation from physical to logical die id
> earlier in the code, and stores only the logical id.  The logical id
> is the only one that is really used.  Without this change the second
> patch would have to store both physical and logical id, which was much
> more complicated.
> 
> The second patch adds the alternative of deriving the logical die id
> from the NUMA information when there are more than 8 nodes.
> 
> Steve Wahl (2):
>    perf/x86/intel/uncore: Store the logical die id instead of the
>      physical die id.
>    perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA
>      info
> 
>   arch/x86/events/intel/uncore.c       |  58 +++++---------
>   arch/x86/events/intel/uncore.h       |   5 +-
>   arch/x86/events/intel/uncore_snb.c   |   2 +-
>   arch/x86/events/intel/uncore_snbep.c | 114 ++++++++++++++++++---------
>   4 files changed, 99 insertions(+), 80 deletions(-)
> 

Thanks Steve for working on the issue. The patch set looks good to me.

Reviewed-by: Kan Liang <kan.liang@linux.intel.com>

Thanks,
Kan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info
  2021-01-11 15:45     ` Steve Wahl
@ 2021-01-12 15:07       ` Peter Zijlstra
  2021-01-12 19:42         ` Steve Wahl
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-01-12 15:07 UTC (permalink / raw)
  To: Steve Wahl
  Cc: rja_direct, Ingo Molnar, Arnaldo Carvalho de Melo, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, Thomas Gleixner,
	Borislav Petkov, x86, H. Peter Anvin, linux-kernel, Liang, Kan

On Mon, Jan 11, 2021 at 09:45:16AM -0600, Steve Wahl wrote:
> On Mon, Jan 11, 2021 at 02:00:33PM +0100, Peter Zijlstra wrote:
> > On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:
> > 
> > 
> > > +		/*
> > > +		 * The nodeid and idmap registers only contain enough
> > > +		 * information to handle 8 nodes.  On systems with more
> > > +		 * than 8 nodes, we need to rely on NUMA information,
> > > +		 * filled in from BIOS supplied information, to determine
> > > +		 * the topology.
> > > +		 */
> > 
> > Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
> > bonghits qualitee :/
> 
> I work too close to BIOS people (virtually, at least for the moment)
> to safely make disparaging remarks. :-) While the origin is the BIOS,
> I'm using pieces that were already being pulled from the BIOS tables
> for NUMA purposes.

:-) It's just that we've had too much 'fun' with PCI node bindings in
the past.




^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info
  2021-01-12 15:07       ` Peter Zijlstra
@ 2021-01-12 19:42         ` Steve Wahl
  0 siblings, 0 replies; 10+ messages in thread
From: Steve Wahl @ 2021-01-12 19:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steve Wahl, rja_direct, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
	Thomas Gleixner, Borislav Petkov, x86, H. Peter Anvin,
	linux-kernel, Liang, Kan

On Tue, Jan 12, 2021 at 04:07:15PM +0100, Peter Zijlstra wrote:
> On Mon, Jan 11, 2021 at 09:45:16AM -0600, Steve Wahl wrote:
> > On Mon, Jan 11, 2021 at 02:00:33PM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 08, 2021 at 09:35:49AM -0600, Steve Wahl wrote:
> > > 
> > > 
> > > > +		/*
> > > > +		 * The nodeid and idmap registers only contain enough
> > > > +		 * information to handle 8 nodes.  On systems with more
> > > > +		 * than 8 nodes, we need to rely on NUMA information,
> > > > +		 * filled in from BIOS supplied information, to determine
> > > > +		 * the topology.
> > > > +		 */
> > > 
> > > Egads.. do we realy have to trust BIOS data? BIOS crud tends to be
> > > bonghits qualitee :/
> > 
> > I work too close to BIOS people (virtually, at least for the moment)
> > to safely make disparaging remarks. :-) While the origin is the BIOS,
> > I'm using pieces that were already being pulled from the BIOS tables
> > for NUMA purposes.
> 
> :-) It's just that we've had too much 'fun' with PCI node bindings in
> the past.

I wasn't aware of that, but I understand.  Fortunately, this patch
should't touch cases that aren't already broken (>8 nodes); working
cases continue to use the existing methods.

Thanks!

--> Steve Wahl

-- 
Steve Wahl, Hewlett Packard Enterprise

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [tip: perf/core] perf/x86/intel/uncore: Store the logical die id instead of the physical die id.
  2021-01-08 15:35 ` [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id Steve Wahl
@ 2021-01-14 11:29   ` tip-bot2 for Steve Wahl
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot2 for Steve Wahl @ 2021-01-14 11:29 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Steve Wahl, Peter Zijlstra (Intel), Kan Liang, x86, linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     ba9506be4e402ee597b8f41204008b97989b5eef
Gitweb:        https://git.kernel.org/tip/ba9506be4e402ee597b8f41204008b97989b5eef
Author:        Steve Wahl <steve.wahl@hpe.com>
AuthorDate:    Fri, 08 Jan 2021 09:35:48 -06:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 14 Jan 2021 11:20:13 +01:00

perf/x86/intel/uncore: Store the logical die id instead of the physical die id.

The phys_id isn't really used other than to map to a logical die id.
Calculate the logical die id earlier, and store that instead of the
phys_id.

Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Link: https://lkml.kernel.org/r/20210108153549.108989-2-steve.wahl@hpe.com
---
 arch/x86/events/intel/uncore.c       | 58 +++++++++------------------
 arch/x86/events/intel/uncore.h       |  5 +--
 arch/x86/events/intel/uncore_snb.c   |  2 +-
 arch/x86/events/intel/uncore_snbep.c | 31 ++++++--------
 4 files changed, 39 insertions(+), 57 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f..33c8180 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =
 
 MODULE_LICENSE("GPL");
 
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
 {
 	struct pci2phy_map *map;
-	int phys_id = -1;
+	int die_id = -1;
 
 	raw_spin_lock(&pci2phy_map_lock);
 	list_for_each_entry(map, &pci2phy_map_head, list) {
 		if (map->segment == pci_domain_nr(bus)) {
-			phys_id = map->pbus_to_physid[bus->number];
+			die_id = map->pbus_to_dieid[bus->number];
 			break;
 		}
 	}
 	raw_spin_unlock(&pci2phy_map_lock);
 
-	return phys_id;
+	return die_id;
 }
 
 static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ lookup:
 	alloc = NULL;
 	map->segment = segment;
 	for (i = 0; i < 256; i++)
-		map->pbus_to_physid[i] = -1;
+		map->pbus_to_dieid[i] = -1;
 	list_add_tail(&map->list, &pci2phy_map_head);
 
 end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
 
 	uncore_pmu_init_hrtimer(box);
 	box->cpu = -1;
-	box->pci_phys_id = -1;
 	box->dieid = -1;
 
 	/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
 /*
  * Get the die information of a PCI device.
  * @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
  * @die: The die id which the device maps to.
  */
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
-				       int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
 {
-	*phys_id = uncore_pcibus_to_physid(pdev->bus);
-	if (*phys_id < 0)
-		return -ENODEV;
-
-	*die = (topology_max_die_per_package() > 1) ? *phys_id :
-				topology_phys_to_logical_pkg(*phys_id);
+	*die = uncore_pcibus_to_dieid(pdev->bus);
 	if (*die < 0)
 		return -EINVAL;
 
@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
  * @pdev: The PCI device.
  * @type: The corresponding PMU type of the device.
  * @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
  * @die: The die id which the device maps to.
  */
 static int uncore_pci_pmu_register(struct pci_dev *pdev,
 				   struct intel_uncore_type *type,
 				   struct intel_uncore_pmu *pmu,
-				   int phys_id, int die)
+				   int die)
 {
 	struct intel_uncore_box *box;
 	int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
 		WARN_ON_ONCE(pmu->func_id != pdev->devfn);
 
 	atomic_inc(&box->refcnt);
-	box->pci_phys_id = phys_id;
 	box->dieid = die;
 	box->pci_dev = pdev;
 	box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 {
 	struct intel_uncore_type *type;
 	struct intel_uncore_pmu *pmu = NULL;
-	int phys_id, die, ret;
+	int die, ret;
 
-	ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+	ret = uncore_pci_get_dev_die_info(pdev, &die);
 	if (ret)
 		return ret;
 
@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 		pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
 	}
 
-	ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+	ret = uncore_pci_pmu_register(pdev, type, pmu, die);
 
 	pci_set_drvdata(pdev, pmu->boxes[die]);
 
@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 /*
  * Unregister the PMU of a PCI device
  * @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
  * @die: The die id which the device maps to.
  */
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
-				      int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
 {
 	struct intel_uncore_box *box = pmu->boxes[die];
 
-	if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
-		return;
-
 	pmu->boxes[die] = NULL;
 	if (atomic_dec_return(&pmu->activeboxes) == 0)
 		uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
 	struct intel_uncore_box *box;
 	struct intel_uncore_pmu *pmu;
-	int i, phys_id, die;
+	int i, die;
 
-	if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+	if (uncore_pci_get_dev_die_info(pdev, &die))
 		return;
 
 	box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 
 	pci_set_drvdata(pdev, NULL);
 
-	uncore_pci_pmu_unregister(pmu, phys_id, die);
+	uncore_pci_pmu_unregister(pmu, die);
 }
 
 static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
 	struct device *dev = data;
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct intel_uncore_pmu *pmu;
-	int phys_id, die;
+	int die;
 
 	/* Unregister the PMU when the device is going to be deleted. */
 	if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
 	if (!pmu)
 		return NOTIFY_DONE;
 
-	if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+	if (uncore_pci_get_dev_die_info(pdev, &die))
 		return NOTIFY_DONE;
 
-	uncore_pci_pmu_unregister(pmu, phys_id, die);
+	uncore_pci_pmu_unregister(pmu, die);
 
 	return NOTIFY_OK;
 }
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
 	struct pci_dev *pci_sub_dev;
 	bool notify = false;
 	unsigned int devfn;
-	int phys_id, die;
+	int die;
 
 	while (ids && ids->vendor) {
 		pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
 			if (!pmu)
 				continue;
 
-			if (uncore_pci_get_dev_die_info(pci_sub_dev,
-							&phys_id, &die))
+			if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
 				continue;
 
 			if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
-						     phys_id, die))
+						     die))
 				notify = true;
 		}
 		ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea15..a3c6e16 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
 };
 
 struct intel_uncore_box {
-	int pci_phys_id;
 	int dieid;	/* Logical die ID */
 	int n_active;	/* number of active events */
 	int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
 struct pci2phy_map {
 	struct list_head list;
 	int segment;
-	int pbus_to_physid[256];
+	int pbus_to_dieid[256];
 };
 
 struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
 
 ssize_t uncore_event_show(struct device *dev,
 			  struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893..5127128 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
 		pci_dev_put(dev);
 		return -ENOMEM;
 	}
-	map->pbus_to_physid[bus] = 0;
+	map->pbus_to_dieid[bus] = 0;
 	raw_spin_unlock(&pci2phy_map_lock);
 
 	pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb182..2d7014d 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
 static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
 {
 	struct pci_dev *ubox_dev = NULL;
-	int i, bus, nodeid, segment;
+	int i, bus, nodeid, segment, die_id;
 	struct pci2phy_map *map;
 	int err = 0;
 	u32 config = 0;
@@ -1395,7 +1395,11 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
 		 */
 		for (i = 0; i < 8; i++) {
 			if (nodeid == ((config >> (3 * i)) & 0x7)) {
-				map->pbus_to_physid[bus] = i;
+				if (topology_max_die_per_package() > 1)
+					die_id = i;
+				else
+					die_id = topology_phys_to_logical_pkg(i);
+				map->pbus_to_dieid[bus] = die_id;
 				break;
 			}
 		}
@@ -1412,17 +1416,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
 			i = -1;
 			if (reverse) {
 				for (bus = 255; bus >= 0; bus--) {
-					if (map->pbus_to_physid[bus] >= 0)
-						i = map->pbus_to_physid[bus];
+					if (map->pbus_to_dieid[bus] >= 0)
+						i = map->pbus_to_dieid[bus];
 					else
-						map->pbus_to_physid[bus] = i;
+						map->pbus_to_dieid[bus] = i;
 				}
 			} else {
 				for (bus = 0; bus <= 255; bus++) {
-					if (map->pbus_to_physid[bus] >= 0)
-						i = map->pbus_to_physid[bus];
+					if (map->pbus_to_dieid[bus] >= 0)
+						i = map->pbus_to_dieid[bus];
 					else
-						map->pbus_to_physid[bus] = i;
+						map->pbus_to_dieid[bus] = i;
 				}
 			}
 		}
@@ -4646,19 +4650,14 @@ int snr_uncore_pci_init(void)
 static struct pci_dev *snr_uncore_get_mc_dev(int id)
 {
 	struct pci_dev *mc_dev = NULL;
-	int phys_id, pkg;
+	int pkg;
 
 	while (1) {
 		mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
 		if (!mc_dev)
 			break;
-		phys_id = uncore_pcibus_to_physid(mc_dev->bus);
-		if (phys_id < 0)
-			continue;
-		pkg = topology_phys_to_logical_pkg(phys_id);
-		if (pkg < 0)
-			continue;
-		else if (pkg == id)
+		pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+		if (pkg == id)
 			break;
 	}
 	return mc_dev;

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [tip: perf/core] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info
  2021-01-08 15:35 ` [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info Steve Wahl
  2021-01-11 13:00   ` Peter Zijlstra
@ 2021-01-14 11:29   ` tip-bot2 for Steve Wahl
  1 sibling, 0 replies; 10+ messages in thread
From: tip-bot2 for Steve Wahl @ 2021-01-14 11:29 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Steve Wahl, Peter Zijlstra (Intel), Kan Liang, x86, linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     9a7832ce3d920426a36cdd78eda4b3568d4d09e3
Gitweb:        https://git.kernel.org/tip/9a7832ce3d920426a36cdd78eda4b3568d4d09e3
Author:        Steve Wahl <steve.wahl@hpe.com>
AuthorDate:    Fri, 08 Jan 2021 09:35:49 -06:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 14 Jan 2021 11:20:14 +01:00

perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info

The registers used to determine which die a pci bus belongs to don't
contain enough information to uniquely specify more than 8 dies, so
when more than 8 dies are present, use NUMA information instead.

Continue to use the previous method for 8 or fewer because it
works there, and covers cases of NUMA being disabled.

Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Link: https://lkml.kernel.org/r/20210108153549.108989-3-steve.wahl@hpe.com
---
 arch/x86/events/intel/uncore_snbep.c | 93 ++++++++++++++++++---------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2d7014d..b79951d 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1370,40 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
 		if (!ubox_dev)
 			break;
 		bus = ubox_dev->bus->number;
-		/* get the Node ID of the local register */
-		err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
-		if (err)
-			break;
-		nodeid = config & NODE_ID_MASK;
-		/* get the Node ID mapping */
-		err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
-		if (err)
-			break;
+		/*
+		 * The nodeid and idmap registers only contain enough
+		 * information to handle 8 nodes.  On systems with more
+		 * than 8 nodes, we need to rely on NUMA information,
+		 * filled in from BIOS supplied information, to determine
+		 * the topology.
+		 */
+		if (nr_node_ids <= 8) {
+			/* get the Node ID of the local register */
+			err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+			if (err)
+				break;
+			nodeid = config & NODE_ID_MASK;
+			/* get the Node ID mapping */
+			err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+			if (err)
+				break;
 
-		segment = pci_domain_nr(ubox_dev->bus);
-		raw_spin_lock(&pci2phy_map_lock);
-		map = __find_pci2phy_map(segment);
-		if (!map) {
+			segment = pci_domain_nr(ubox_dev->bus);
+			raw_spin_lock(&pci2phy_map_lock);
+			map = __find_pci2phy_map(segment);
+			if (!map) {
+				raw_spin_unlock(&pci2phy_map_lock);
+				err = -ENOMEM;
+				break;
+			}
+
+			/*
+			 * every three bits in the Node ID mapping register maps
+			 * to a particular node.
+			 */
+			for (i = 0; i < 8; i++) {
+				if (nodeid == ((config >> (3 * i)) & 0x7)) {
+					if (topology_max_die_per_package() > 1)
+						die_id = i;
+					else
+						die_id = topology_phys_to_logical_pkg(i);
+					map->pbus_to_dieid[bus] = die_id;
+					break;
+				}
+			}
 			raw_spin_unlock(&pci2phy_map_lock);
-			err = -ENOMEM;
-			break;
-		}
+		} else {
+			int node = pcibus_to_node(ubox_dev->bus);
+			int cpu;
+
+			segment = pci_domain_nr(ubox_dev->bus);
+			raw_spin_lock(&pci2phy_map_lock);
+			map = __find_pci2phy_map(segment);
+			if (!map) {
+				raw_spin_unlock(&pci2phy_map_lock);
+				err = -ENOMEM;
+				break;
+			}
 
-		/*
-		 * every three bits in the Node ID mapping register maps
-		 * to a particular node.
-		 */
-		for (i = 0; i < 8; i++) {
-			if (nodeid == ((config >> (3 * i)) & 0x7)) {
-				if (topology_max_die_per_package() > 1)
-					die_id = i;
-				else
-					die_id = topology_phys_to_logical_pkg(i);
-				map->pbus_to_dieid[bus] = die_id;
+			die_id = -1;
+			for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+				struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+				if (c->initialized && cpu_to_node(cpu) == node) {
+					map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+					break;
+				}
+			}
+			raw_spin_unlock(&pci2phy_map_lock);
+
+			if (WARN_ON_ONCE(die_id == -1)) {
+				err = -EINVAL;
 				break;
 			}
 		}
-		raw_spin_unlock(&pci2phy_map_lock);
 	}
 
 	if (!err) {

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-01-14 11:30 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-08 15:35 [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Steve Wahl
2021-01-08 15:35 ` [PATCH 1/2] perf/x86/intel/uncore: Store the logical die id instead of the physical die id Steve Wahl
2021-01-14 11:29   ` [tip: perf/core] " tip-bot2 for Steve Wahl
2021-01-08 15:35 ` [PATCH 2/2] perf/x86/intel/uncore: With > 8 nodes, get pci bus die id from NUMA info Steve Wahl
2021-01-11 13:00   ` Peter Zijlstra
2021-01-11 15:45     ` Steve Wahl
2021-01-12 15:07       ` Peter Zijlstra
2021-01-12 19:42         ` Steve Wahl
2021-01-14 11:29   ` [tip: perf/core] " tip-bot2 for Steve Wahl
2021-01-11 16:39 ` [PATCH 0/2] perf/x86/intel/uncore: Derive die id from NUMA info with more than 8 nodes Liang, Kan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).