linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC/RFT] amd64_edac: avoid doing post-probe setup
@ 2015-03-19  0:49 Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 1/3] EDAC: amd64: stop allocating ecc settings separately Dmitry Torokhov
                   ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19  0:49 UTC (permalink / raw)
  To: Borislav Petkov, Doug Thompson, Tejun Heo
  Cc: linux-kernel, linux-edac, Mauro Carvalho Chehab, Tetsuo Handa,
	Olof Johansson, Arjan van de Ven, Greg Kroah-Hartman,
	Luis R . Rodriguez

While testing asynchronous probing Luis noticed that this driver was
failing to initialize. It is happening because the driver tries to check
the result of binding, assuming that it is done synchronously, and either
abort the loading, or perform some additional setup steps.  This series
tried to work around it by moving these post-binding steps into probe() to
be executed when binding node 0.

Untested as I do not have AMD boxes.

Thanks!

-- 
Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/3] EDAC: amd64: stop allocating ecc settings separately
  2015-03-19  0:49 [RFC/RFT] amd64_edac: avoid doing post-probe setup Dmitry Torokhov
@ 2015-03-19  0:49 ` Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 2/3] EDAC: amd64_edac: clean up remove_one_instance() Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early Dmitry Torokhov
  2 siblings, 0 replies; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19  0:49 UTC (permalink / raw)
  To: Borislav Petkov, Doug Thompson, Tejun Heo
  Cc: linux-kernel, linux-edac, Mauro Carvalho Chehab, Tetsuo Handa,
	Olof Johansson, Arjan van de Ven, Greg Kroah-Hartman,
	Luis R . Rodriguez

There is no reason for allocating and managing ECC settings separately
when we can merge them into the dirver-private structure.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/edac/amd64_edac.c | 77 ++++++++++++++---------------------------------
 drivers/edac/amd64_edac.h | 27 +++++++++--------
 2 files changed, 37 insertions(+), 67 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 92772ff..c0ebc06 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -20,9 +20,6 @@ static struct msr __percpu *msrs;
  */
 static atomic_t drv_instances = ATOMIC_INIT(0);
 
-/* Per-node stuff */
-static struct ecc_settings **ecc_stngs;
-
 /*
  * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
  * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
@@ -2790,6 +2787,18 @@ static int init_one_instance(struct pci_dev *F2)
 	if (err)
 		goto err_free;
 
+	if (!ecc_enabled(pvt->F3, nid)) {
+		ret = -ENODEV;
+
+		if (!ecc_enable_override)
+			goto err_siblings;
+
+		amd64_warn("Forcing ECC on!\n");
+
+		if (!enable_ecc_error_reporting(&pvt->ecc, nid, pvt->F3))
+			goto err_siblings;
+	}
+
 	read_mc_regs(pvt);
 
 	/*
@@ -2818,7 +2827,7 @@ static int init_one_instance(struct pci_dev *F2)
 
 	mci = edac_mc_alloc(nid, ARRAY_SIZE(layers), layers, 0);
 	if (!mci)
-		goto err_siblings;
+		goto err_restore_ecc;
 
 	mci->pvt_info = pvt;
 	mci->pdev = &pvt->F2->dev;
@@ -2847,6 +2856,9 @@ static int init_one_instance(struct pci_dev *F2)
 err_add_mc:
 	edac_mc_free(mci);
 
+err_restore_ecc:
+	restore_ecc_error_reporting(&pvt->ecc, nid, pvt->F3);
+
 err_siblings:
 	free_mc_sibling_devs(pvt);
 
@@ -2860,10 +2872,7 @@ err_ret:
 static int probe_one_instance(struct pci_dev *pdev,
 			      const struct pci_device_id *mc_type)
 {
-	u16 nid = amd_get_node_id(pdev);
-	struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
-	struct ecc_settings *s;
-	int ret = 0;
+	int ret;
 
 	ret = pci_enable_device(pdev);
 	if (ret < 0) {
@@ -2871,39 +2880,12 @@ static int probe_one_instance(struct pci_dev *pdev,
 		return -EIO;
 	}
 
-	ret = -ENOMEM;
-	s = kzalloc(sizeof(struct ecc_settings), GFP_KERNEL);
-	if (!s)
-		goto err_out;
-
-	ecc_stngs[nid] = s;
-
-	if (!ecc_enabled(F3, nid)) {
-		ret = -ENODEV;
-
-		if (!ecc_enable_override)
-			goto err_enable;
-
-		amd64_warn("Forcing ECC on!\n");
-
-		if (!enable_ecc_error_reporting(s, nid, F3))
-			goto err_enable;
-	}
-
 	ret = init_one_instance(pdev);
-	if (ret < 0) {
-		amd64_err("Error probing instance: %d\n", nid);
-		restore_ecc_error_reporting(s, nid, F3);
-	}
+	if (ret < 0)
+		amd64_err("Error probing instance: %d\n",
+			  amd_get_node_id(pdev));
 
 	return ret;
-
-err_enable:
-	kfree(s);
-	ecc_stngs[nid] = NULL;
-
-err_out:
-	return ret;
 }
 
 static void remove_one_instance(struct pci_dev *pdev)
@@ -2912,7 +2894,6 @@ static void remove_one_instance(struct pci_dev *pdev)
 	struct amd64_pvt *pvt;
 	u16 nid = amd_get_node_id(pdev);
 	struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
-	struct ecc_settings *s = ecc_stngs[nid];
 
 	mci = find_mci_by_dev(&pdev->dev);
 	WARN_ON(!mci);
@@ -2924,7 +2905,7 @@ static void remove_one_instance(struct pci_dev *pdev)
 
 	pvt = mci->pvt_info;
 
-	restore_ecc_error_reporting(s, nid, F3);
+	restore_ecc_error_reporting(&pvt->ecc, nid, F3);
 
 	free_mc_sibling_devs(pvt);
 
@@ -2932,9 +2913,6 @@ static void remove_one_instance(struct pci_dev *pdev)
 	amd_report_gart_errors(false);
 	amd_unregister_ecc_decoder(decode_bus_error);
 
-	kfree(ecc_stngs[nid]);
-	ecc_stngs[nid] = NULL;
-
 	/* Free the EDAC CORE resources */
 	mci->pvt_info = NULL;
 
@@ -2998,13 +2976,9 @@ static int __init amd64_edac_init(void)
 		goto err_ret;
 
 	err = -ENOMEM;
-	ecc_stngs = kzalloc(amd_nb_num() * sizeof(ecc_stngs[0]), GFP_KERNEL);
-	if (!ecc_stngs)
-		goto err_free;
-
 	msrs = msrs_alloc();
 	if (!msrs)
-		goto err_free;
+		goto err_ret;
 
 	err = pci_register_driver(&amd64_pci_driver);
 	if (err)
@@ -3029,10 +3003,6 @@ err_pci:
 	msrs_free(msrs);
 	msrs = NULL;
 
-err_free:
-	kfree(ecc_stngs);
-	ecc_stngs = NULL;
-
 err_ret:
 	return err;
 }
@@ -3044,9 +3014,6 @@ static void __exit amd64_edac_exit(void)
 
 	pci_unregister_driver(&amd64_pci_driver);
 
-	kfree(ecc_stngs);
-	ecc_stngs = NULL;
-
 	msrs_free(msrs);
 	msrs = NULL;
 }
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 4bdec75..a62ccba 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -340,6 +340,19 @@ struct chip_select {
 	u8 m_cnt;
 };
 
+/*
+ * per-node ECC settings descriptor
+ */
+struct ecc_settings {
+	u32 old_nbctl;
+	bool nbctl_valid;
+
+	struct flags {
+		unsigned long nb_mce_enable:1;
+		unsigned long nb_ecc_prev:1;
+	} flags;
+};
+
 struct amd64_pvt {
 	struct low_ops *ops;
 
@@ -385,6 +398,8 @@ struct amd64_pvt {
 	/* place to store error injection parameters prior to issue */
 	struct error_injection injection;
 
+	struct ecc_settings ecc;
+
 	/* cache the dram_type */
 	enum mem_type dram_type;
 };
@@ -439,18 +454,6 @@ static inline u8 dct_sel_interleave_addr(struct amd64_pvt *pvt)
 
 	return	((pvt)->dct_sel_lo >> 6) & 0x3;
 }
-/*
- * per-node ECC settings descriptor
- */
-struct ecc_settings {
-	u32 old_nbctl;
-	bool nbctl_valid;
-
-	struct flags {
-		unsigned long nb_mce_enable:1;
-		unsigned long nb_ecc_prev:1;
-	} flags;
-};
 
 #ifdef CONFIG_EDAC_DEBUG
 extern const struct attribute_group amd64_edac_dbg_group;
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 2/3] EDAC: amd64_edac: clean up remove_one_instance()
  2015-03-19  0:49 [RFC/RFT] amd64_edac: avoid doing post-probe setup Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 1/3] EDAC: amd64: stop allocating ecc settings separately Dmitry Torokhov
@ 2015-03-19  0:49 ` Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early Dmitry Torokhov
  2 siblings, 0 replies; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19  0:49 UTC (permalink / raw)
  To: Borislav Petkov, Doug Thompson, Tejun Heo
  Cc: linux-kernel, linux-edac, Mauro Carvalho Chehab, Tetsuo Handa,
	Olof Johansson, Arjan van de Ven, Greg Kroah-Hartman,
	Luis R . Rodriguez

We can get much of data from driver-proivate structure so make use of
it.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/edac/amd64_edac.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index c0ebc06..d23dad9 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2774,7 +2774,7 @@ static int init_one_instance(struct pci_dev *F2)
 	if (!pvt)
 		goto err_ret;
 
-	pvt->mc_node_id	= nid;
+	pvt->mc_node_id = nid;
 	pvt->F2 = F2;
 
 	ret = -EINVAL;
@@ -2849,6 +2849,8 @@ static int init_one_instance(struct pci_dev *F2)
 
 	amd_register_ecc_decoder(decode_bus_error);
 
+	pci_set_drvdata(F2, pvt);
+
 	atomic_inc(&drv_instances);
 
 	return 0;
@@ -2890,22 +2892,15 @@ static int probe_one_instance(struct pci_dev *pdev,
 
 static void remove_one_instance(struct pci_dev *pdev)
 {
+	struct amd64_pvt *pvt = pci_get_drvdata(pdev);
 	struct mem_ctl_info *mci;
-	struct amd64_pvt *pvt;
-	u16 nid = amd_get_node_id(pdev);
-	struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
-
-	mci = find_mci_by_dev(&pdev->dev);
-	WARN_ON(!mci);
 
 	/* Remove from EDAC CORE tracking list */
 	mci = edac_mc_del_mc(&pdev->dev);
-	if (!mci)
+	if (WARN_ON(!mci))
 		return;
 
-	pvt = mci->pvt_info;
-
-	restore_ecc_error_reporting(&pvt->ecc, nid, F3);
+	restore_ecc_error_reporting(&pvt->ecc, pvt->mc_node_id, pvt->F3);
 
 	free_mc_sibling_devs(pvt);
 
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19  0:49 [RFC/RFT] amd64_edac: avoid doing post-probe setup Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 1/3] EDAC: amd64: stop allocating ecc settings separately Dmitry Torokhov
  2015-03-19  0:49 ` [PATCH 2/3] EDAC: amd64_edac: clean up remove_one_instance() Dmitry Torokhov
@ 2015-03-19  0:49 ` Dmitry Torokhov
  2015-03-19  9:40   ` Borislav Petkov
  2 siblings, 1 reply; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19  0:49 UTC (permalink / raw)
  To: Borislav Petkov, Doug Thompson, Tejun Heo
  Cc: linux-kernel, linux-edac, Mauro Carvalho Chehab, Tetsuo Handa,
	Olof Johansson, Arjan van de Ven, Greg Kroah-Hartman,
	Luis R . Rodriguez

This change moves setting up PCI control that used to be done after
driver tried to bind to present PCI devices into the probe() method,
thus allowing probing to be done asynchronously, if needed.

To keep as close as possible to the previous behavior we explicitly
check for presence of supported PCI devices; still we may end up with
driver loaded even if we did not bind to any of them (for example if ECC
is disabled).

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/edac/amd64_edac.c | 49 ++++++++++++++---------------------------------
 1 file changed, 14 insertions(+), 35 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index d23dad9..0b9e14f 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -16,11 +16,6 @@ module_param(ecc_enable_override, int, 0644);
 static struct msr __percpu *msrs;
 
 /*
- * count successfully initialized driver instances for setup_pci_device()
- */
-static atomic_t drv_instances = ATOMIC_INIT(0);
-
-/*
  * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
  * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
  * or higher value'.
@@ -2851,7 +2846,14 @@ static int init_one_instance(struct pci_dev *F2)
 
 	pci_set_drvdata(F2, pvt);
 
-	atomic_inc(&drv_instances);
+	if (nid == 0 && !pci_ctl) {
+		pci_ctl = edac_pci_create_generic_ctl(&pvt->F2->dev,
+						      EDAC_MOD_STR);
+		if (!pci_ctl) {
+			pr_warn("%s(): Unable to create PCI control\n", __func__);
+			pr_warn("%s(): PCI error report via EDAC not set\n", __func__);
+		}
+	}
 
 	return 0;
 
@@ -2895,6 +2897,9 @@ static void remove_one_instance(struct pci_dev *pdev)
 	struct amd64_pvt *pvt = pci_get_drvdata(pdev);
 	struct mem_ctl_info *mci;
 
+	if (pvt->mc_node_id == 0 && pci_ctl)
+		edac_pci_release_generic_ctl(pci_ctl);
+
 	/* Remove from EDAC CORE tracking list */
 	mci = edac_mc_del_mc(&pdev->dev);
 	if (WARN_ON(!mci))
@@ -2939,26 +2944,6 @@ static struct pci_driver amd64_pci_driver = {
 	.id_table	= amd64_pci_table,
 };
 
-static void setup_pci_device(void)
-{
-	struct mem_ctl_info *mci;
-	struct amd64_pvt *pvt;
-
-	if (pci_ctl)
-		return;
-
-	mci = edac_mc_find(0);
-	if (!mci)
-		return;
-
-	pvt = mci->pvt_info;
-	pci_ctl = edac_pci_create_generic_ctl(&pvt->F2->dev, EDAC_MOD_STR);
-	if (!pci_ctl) {
-		pr_warn("%s(): Unable to create PCI control\n", __func__);
-		pr_warn("%s(): PCI error report via EDAC not set\n", __func__);
-	}
-}
-
 static int __init amd64_edac_init(void)
 {
 	int err = -ENODEV;
@@ -2967,6 +2952,9 @@ static int __init amd64_edac_init(void)
 
 	opstate_init();
 
+	if (!pci_dev_present(amd64_pci_table))
+		goto err_ret;
+
 	if (amd_cache_northbridges() < 0)
 		goto err_ret;
 
@@ -2979,21 +2967,12 @@ static int __init amd64_edac_init(void)
 	if (err)
 		goto err_pci;
 
-	err = -ENODEV;
-	if (!atomic_read(&drv_instances))
-		goto err_no_instances;
-
-	setup_pci_device();
-
 #ifdef CONFIG_X86_32
 	amd64_err("%s on 32-bit is unsupported. USE AT YOUR OWN RISK!\n", EDAC_MOD_STR);
 #endif
 
 	return 0;
 
-err_no_instances:
-	pci_unregister_driver(&amd64_pci_driver);
-
 err_pci:
 	msrs_free(msrs);
 	msrs = NULL;
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19  0:49 ` [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early Dmitry Torokhov
@ 2015-03-19  9:40   ` Borislav Petkov
  2015-03-19 15:29     ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19  9:40 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Doug Thompson, Tejun Heo, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Wed, Mar 18, 2015 at 05:49:10PM -0700, Dmitry Torokhov wrote:
> This change moves setting up PCI control that used to be done after
> driver tried to bind to present PCI devices into the probe() method,
> thus allowing probing to be done asynchronously, if needed.
> 
> To keep as close as possible to the previous behavior we explicitly
> check for presence of supported PCI devices; still we may end up with
> driver loaded even if we did not bind to any of them (for example if ECC
> is disabled).

This is why I did the drv_instances hack - I don't want it to load if
there's no ECC support on the system as it is confusing to people and
tools.

It'd need to be able for the async probing to unload the driver if not a
single node instance loads successfully.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19  9:40   ` Borislav Petkov
@ 2015-03-19 15:29     ` Tejun Heo
  2015-03-19 15:35       ` Borislav Petkov
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 15:29 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

Hello, Borislav.

On Thu, Mar 19, 2015 at 10:40:54AM +0100, Borislav Petkov wrote:
> This is why I did the drv_instances hack - I don't want it to load if
> there's no ECC support on the system as it is confusing to people and
> tools.
> 
> It'd need to be able for the async probing to unload the driver if not a
> single node instance loads successfully.

I don't really think this is the type of things we want to be doing in
the specific drivers.  It makes the driver behave differnetly from
everything else.  If a feature like this is actually necessary, please
implement a proper abstraction at the driver layer (e.g. a flag to
indicate that if the initial probe fails, there's no point in keeping
the driver around) but frankly I don't think this matters enough to
warrant such extra complexities.

This is a gloss layering violation.  Please don't do things like this.
I'm all for ripping out the hack even w/o considering the async probe
issue.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 15:29     ` Tejun Heo
@ 2015-03-19 15:35       ` Borislav Petkov
  2015-03-19 15:52         ` Dmitry Torokhov
  2015-03-19 15:55         ` Tejun Heo
  0 siblings, 2 replies; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 15:35 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 11:29:57AM -0400, Tejun Heo wrote:
> This is a gloss layering violation.  Please don't do things like this.
> I'm all for ripping out the hack even w/o considering the async probe
> issue.

And I don't want to leave the driver loaded when there's nothing to
be loaded for. One instance in this driver's specific case is one
northbridge and there are numascale boxes with hundreds of northbridges.

If you have a better idea about how to unload the driver, asynchronously
or not, after all probe() calls have failed, I'm all ears.

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 15:35       ` Borislav Petkov
@ 2015-03-19 15:52         ` Dmitry Torokhov
  2015-03-19 15:59           ` Borislav Petkov
  2015-03-19 15:55         ` Tejun Heo
  1 sibling, 1 reply; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19 15:52 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tejun Heo, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 04:35:06PM +0100, Borislav Petkov wrote:
> On Thu, Mar 19, 2015 at 11:29:57AM -0400, Tejun Heo wrote:
> > This is a gloss layering violation.  Please don't do things like this.
> > I'm all for ripping out the hack even w/o considering the async probe
> > issue.
> 
> And I don't want to leave the driver loaded when there's nothing to
> be loaded for. One instance in this driver's specific case is one
> northbridge and there are numascale boxes with hundreds of northbridges.

Why does the number of bridges matter? Yo can have bazillion bridges, it
doe snot mean you'll have more than one copy of driver code. Note that
even with the changes we do not leave the driver bound to the devices if
there is no ECC.

> 
> If you have a better idea about how to unload the driver, asynchronously
> or not, after all probe() calls have failed, I'm all ears.

Given that PCI is hot pluggable you can never know when PCI done
enumerating (in  a broad sense).

Thanks.

-- 
Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 15:35       ` Borislav Petkov
  2015-03-19 15:52         ` Dmitry Torokhov
@ 2015-03-19 15:55         ` Tejun Heo
  2015-03-19 16:01           ` Borislav Petkov
  1 sibling, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 15:55 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 04:35:06PM +0100, Borislav Petkov wrote:
> And I don't want to leave the driver loaded when there's nothing to
> be loaded for. One instance in this driver's specific case is one
> northbridge and there are numascale boxes with hundreds of northbridges.
> 
> If you have a better idea about how to unload the driver, asynchronously
> or not, after all probe() calls have failed, I'm all ears.

We don't go around and implement random hacks ignoring layering and
conventions even if that one off case seems to benefit whatever corner
case minutely, because those kind of hacks accumulate and hinder with
improvements at much larger scale and the benefit here is minute.

The driver model has been moving onto separating module load and
probing because that makes far more sense to most drivers that we use
nowadays and also gives userland a cleaner way to manage modules by
separating the two operations - loading and probing.

If you think this is a big enough problem, please go ahead and build
proper infrastructure for it.  Be it a module attribute or even just a
flag telling userland to indicate that the module can be
auto-unloaded.  Sure, it might look like a smart thing from a very
confined viewpoint but you're seeing the problem unfolding right now.
This code is standing in the way of a much more impactful generic
driver layer improvement.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 15:52         ` Dmitry Torokhov
@ 2015-03-19 15:59           ` Borislav Petkov
  2015-03-19 16:12             ` Dmitry Torokhov
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 15:59 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Tejun Heo, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 08:52:53AM -0700, Dmitry Torokhov wrote:
> Why does the number of bridges matter? Yo can have bazillion bridges, it

Because it only makes sense to leave the driver loaded when at least one
northbridge supports and has ECC enabled.

> doe snot mean you'll have more than one copy of driver code. Note that
> even with the changes we do not leave the driver bound to the devices if
> there is no ECC.

I know, but the driver remains loaded and it could mislead people that
they actually have error decoding enabled.

Btw, why are you guys jumping on this driver?

I don't really believe it would matter if it loaded asynchronously on
the type of boxes it is supposed to be running. So why not mark it as
sync and we all forget about the whole issue?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 15:55         ` Tejun Heo
@ 2015-03-19 16:01           ` Borislav Petkov
  2015-03-19 16:12             ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 16:01 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 11:55:53AM -0400, Tejun Heo wrote:
<snip needless schooling on driver core>

> This code is standing in the way of a much more impactful generic
> driver layer improvement.

So why not mark it as synchronously loading and forget about it?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 15:59           ` Borislav Petkov
@ 2015-03-19 16:12             ` Dmitry Torokhov
  2015-03-19 16:23               ` Borislav Petkov
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19 16:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tejun Heo, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 04:59:53PM +0100, Borislav Petkov wrote:
> On Thu, Mar 19, 2015 at 08:52:53AM -0700, Dmitry Torokhov wrote:
> > Why does the number of bridges matter? Yo can have bazillion bridges, it
> 
> Because it only makes sense to leave the driver loaded when at least one
> northbridge supports and has ECC enabled.

By the same token it only makes sense to load e1000e when I have e1000e
device loaded, but we allow it to load anyway. Or psmouse. Or pretty
much any other drivers (sans some platform code). The fact is that we
for long time have separated module loading and driver binding. Loading
driver even without the devices is standard behavior.

> 
> > doe snot mean you'll have more than one copy of driver code. Note that
> > even with the changes we do not leave the driver bound to the devices if
> > there is no ECC.
> 
> I know, but the driver remains loaded and it could mislead people that
> they actually have error decoding enabled.

Does anyone look at the loaded modules to see if functionality is
available? They should look if any devices are actually bound to the
driver (in sysfs). I mean, even without the changes I can unbind the
edac driver from northbridge via sysfs thus disabling error decoding and
"mislead" people.

What is worse is that if I unbind node 0 device the pci_ctl still stays
around, it is not disabled. And it is not reenabled after I rebind that
node 0 device. So actually, right now the driver is broken WRT the
dynamic binding/unbinding.

> 
> Btw, why are you guys jumping on this driver?
> 
> I don't really believe it would matter if it loaded asynchronously on
> the type of boxes it is supposed to be running. So why not mark it as
> sync and we all forget about the whole issue?

Yes, we can. We have to keep the FORCE_SYNCHRONOUS flag for platform
drivers registered with platform_driver_probe() anyways (or maybe not if
we decide to kill platform_driver_probe too).

Still most of the patch 3 is needed for correct operation and I think
patches 1 and 2 improve the code as well.

Thanks.

-- 
Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:01           ` Borislav Petkov
@ 2015-03-19 16:12             ` Tejun Heo
  2015-03-19 16:57               ` Dmitry Torokhov
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 16:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 05:01:32PM +0100, Borislav Petkov wrote:
> On Thu, Mar 19, 2015 at 11:55:53AM -0400, Tejun Heo wrote:
> <snip needless schooling on driver core>
> 
> > This code is standing in the way of a much more impactful generic
> > driver layer improvement.
> 
> So why not mark it as synchronously loading and forget about it?

Isn't that obvious?  Because hacks like this are likely to cause other
problems down the road and set bad precedences.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:12             ` Dmitry Torokhov
@ 2015-03-19 16:23               ` Borislav Petkov
  2015-03-19 16:33                 ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 16:23 UTC (permalink / raw)
  To: Dmitry Torokhov, Tejun Heo
  Cc: Doug Thompson, linux-kernel, linux-edac, Mauro Carvalho Chehab,
	Tetsuo Handa, Olof Johansson, Arjan van de Ven,
	Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 09:12:26AM -0700, Dmitry Torokhov wrote:
> By the same token it only makes sense to load e1000e when I have e1000e
> device loaded, but we allow it to load anyway. Or psmouse. Or pretty
> much any other drivers (sans some platform code). The fact is that we
> for long time have separated module loading and driver binding. Loading
> driver even without the devices is standard behavior.

FWIW, I always hated that.

> Does anyone look at the loaded modules to see if functionality is
> available? They should look if any devices are actually bound to the
> driver (in sysfs). I mean, even without the changes I can unbind the
> edac driver from northbridge via sysfs thus disabling error decoding and
> "mislead" people.

Yeah yeah, do people listen to what we say how something should be used
properly and not use it the way they feel like? Of course not!

But enough wasting time fruitlessly, I think I have an idea:

How about I go and iterate over all NBs (northbridges) on the system and
check whether at least one has ECC enabled so that the driver can load?
And do that in the init function.

If I detect at least one NB which is ok, I can then continue and do
pci_register_driver(). If there are no suitable NBs, I return an error
and don't even touch PCI.

Would that be something which would work for what you're trying to
achieve?

If yes, I could give it a try but it won't happen like immediately. For
the time being, we could use the PROBE_FORCE_SYNCHRONOUS thing.

While doing that, I could take a look at Dmitry's cleanups too.

Yes, no?

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:23               ` Borislav Petkov
@ 2015-03-19 16:33                 ` Tejun Heo
  2015-03-19 16:45                   ` Borislav Petkov
  2015-03-19 16:52                   ` Dmitry Torokhov
  0 siblings, 2 replies; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 16:33 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

Hello, Borislav.

On Thu, Mar 19, 2015 at 05:23:02PM +0100, Borislav Petkov wrote:
> On Thu, Mar 19, 2015 at 09:12:26AM -0700, Dmitry Torokhov wrote:
> > By the same token it only makes sense to load e1000e when I have e1000e
> > device loaded, but we allow it to load anyway. Or psmouse. Or pretty
> > much any other drivers (sans some platform code). The fact is that we
> > for long time have separated module loading and driver binding. Loading
> > driver even without the devices is standard behavior.
> 
> FWIW, I always hated that.

You understand that there are technical reasons behind the current
behavior?  This is not something people just did on a whim.  We used
to have autounload and all that but over time moved away from it
because the trade-offs around the behavior shifted.

I don't get why you don't understand this.  As a general rule, we
don't go and implement one-off behaviors like this because it's well
understood that things like this are more costly in the longer term.
As said multiple times before, if you think this is a class of problem
worth solving, do so properly.  Please stop shell scripting in your
->probe() in kernel.

> If I detect at least one NB which is ok, I can then continue and do
> pci_register_driver(). If there are no suitable NBs, I return an error
> and don't even touch PCI.

Please don't.  Consider it nacked preemptively.  If you want to solve
this and can justify the added complexity, solve it in a general way -
teach it to the driver model.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:33                 ` Tejun Heo
@ 2015-03-19 16:45                   ` Borislav Petkov
  2015-03-19 16:49                     ` Tejun Heo
  2015-03-19 16:52                   ` Dmitry Torokhov
  1 sibling, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 16:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 12:33:30PM -0400, Tejun Heo wrote:
> Please stop shell scripting in your ->probe() in kernel.

I don't think you understand me: I'm not going to shell script in my
->probe() function. This is what I'm going to do:

init():

	for each northbridge {
		check if northbridge has ecc;
	}

	if no northbridge has ecc {
		return;
	}

	pci_register_driver();

Now this is clean and doesn't violate any models. It is basically a
check whether the driver should load or not. What's wrong with that, for
chrissake?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:45                   ` Borislav Petkov
@ 2015-03-19 16:49                     ` Tejun Heo
  2015-03-19 16:56                       ` Borislav Petkov
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 16:49 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 05:45:06PM +0100, Borislav Petkov wrote:
> Now this is clean and doesn't violate any models. It is basically a
> check whether the driver should load or not. What's wrong with that, for
> chrissake?

That it's repeating iteration performed at bus layer?  How is that not
a layering violation?  Just stop.  Don't do it.  Nothing needs to be
done.  Drop it.  You're solving a non-existing problem.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:33                 ` Tejun Heo
  2015-03-19 16:45                   ` Borislav Petkov
@ 2015-03-19 16:52                   ` Dmitry Torokhov
  2015-03-19 17:09                     ` Tejun Heo
  1 sibling, 1 reply; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19 16:52 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Borislav Petkov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 12:33:30PM -0400, Tejun Heo wrote:
> Hello, Borislav.
> 
> On Thu, Mar 19, 2015 at 05:23:02PM +0100, Borislav Petkov wrote:
> > On Thu, Mar 19, 2015 at 09:12:26AM -0700, Dmitry Torokhov wrote:
> > > By the same token it only makes sense to load e1000e when I have e1000e
> > > device loaded, but we allow it to load anyway. Or psmouse. Or pretty
> > > much any other drivers (sans some platform code). The fact is that we
> > > for long time have separated module loading and driver binding. Loading
> > > driver even without the devices is standard behavior.
> > 
> > FWIW, I always hated that.
> 
> You understand that there are technical reasons behind the current
> behavior?  This is not something people just did on a whim.  We used
> to have autounload and all that but over time moved away from it
> because the trade-offs around the behavior shifted.
> 
> I don't get why you don't understand this.  As a general rule, we
> don't go and implement one-off behaviors like this because it's well
> understood that things like this are more costly in the longer term.
> As said multiple times before, if you think this is a class of problem
> worth solving, do so properly.  Please stop shell scripting in your
> ->probe() in kernel.
> 
> > If I detect at least one NB which is ok, I can then continue and do
> > pci_register_driver(). If there are no suitable NBs, I return an error
> > and don't even touch PCI.
> 
> Please don't.  Consider it nacked preemptively.  If you want to solve
> this and can justify the added complexity, solve it in a general way -
> teach it to the driver model.

I do not think you can teach it to the driver model in general: there is
no definite point when probing is "done". The drivers and devices come
and go, at random moments in time.

So here the driver just assumes that all devices will be enumerated
before the driver is loaded and acts upon this knowledge. The whole
schema is fragile (I mean can I compile it in the kernel and see
breaking because of  link order changes and driver is now initialized
before PCI devices are scanned? Possibly...).

Thanks.

-- 
Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:49                     ` Tejun Heo
@ 2015-03-19 16:56                       ` Borislav Petkov
  2015-03-19 17:03                         ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 16:56 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 12:49:08PM -0400, Tejun Heo wrote:
> That it's repeating iteration performed at bus layer?

I'm afraid I don't understand. We do cache all AMD nothbridges in
arch/x86/kernel/amd_nb.c and use them to query different things about hw
configuration. And we do that in different places already.

So I'm proposing to use that already cached info to query the ECC status
too.

I don't see anything wrong with that and I don't see how that would
repeat any iteration...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:12             ` Tejun Heo
@ 2015-03-19 16:57               ` Dmitry Torokhov
  2015-03-19 17:22                 ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Dmitry Torokhov @ 2015-03-19 16:57 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Borislav Petkov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 12:12:41PM -0400, Tejun Heo wrote:
> On Thu, Mar 19, 2015 at 05:01:32PM +0100, Borislav Petkov wrote:
> > On Thu, Mar 19, 2015 at 11:55:53AM -0400, Tejun Heo wrote:
> > <snip needless schooling on driver core>
> > 
> > > This code is standing in the way of a much more impactful generic
> > > driver layer improvement.
> > 
> > So why not mark it as synchronously loading and forget about it?
> 
> Isn't that obvious?  Because hacks like this are likely to cause other
> problems down the road and set bad precedences.

In all fairness platform_driver_probe() does pretty much the same and
forces us to mark thus drivers with PROBE_FORCE_SYNCHRONOUS as well. And
there are quite a few of them:

dtor@dtor-ws:~/kernel/work$ grep -r "platform_driver_probe" drivers/ |
wc -l
186

What makes edac unusual is that PCI bus is hot-pluggable and thus PCI
drivers are rarely one-shot.

Thanks.

-- 
Dmitry

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:56                       ` Borislav Petkov
@ 2015-03-19 17:03                         ` Tejun Heo
  2015-03-19 17:04                           ` Borislav Petkov
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 17:03 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 05:56:20PM +0100, Borislav Petkov wrote:
> I don't see anything wrong with that and I don't see how that would
> repeat any iteration...

Just don't do it.  It makes the driver behave differently from others
for no good reason.  Driver being loaded or not doesn't indicate
anything anymore other than the code is loaded and you're adding
complexity to just make it more confusing and you don't have good
enough technical reasons to justify the deviation.  What are you even
fighting for?

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:03                         ` Tejun Heo
@ 2015-03-19 17:04                           ` Borislav Petkov
  2015-03-19 17:10                             ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 17:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 01:03:39PM -0400, Tejun Heo wrote:
> Just don't do it. It makes the driver behave differently from others
> for no good reason. Driver being loaded or not doesn't indicate
> anything anymore other than the code is loaded and you're adding
> complexity to just make it more confusing and you don't have good
> enough technical reasons to justify the deviation. What are you even
> fighting for?

Scroll back a couple of messages.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:52                   ` Dmitry Torokhov
@ 2015-03-19 17:09                     ` Tejun Heo
  0 siblings, 0 replies; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 17:09 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Borislav Petkov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

Hello, Dmitry.

On Thu, Mar 19, 2015 at 09:52:31AM -0700, Dmitry Torokhov wrote:
> I do not think you can teach it to the driver model in general: there is
> no definite point when probing is "done". The drivers and devices come
> and go, at random moments in time.

I mean, it can check right after the initial iteration is complete.
More below.

> So here the driver just assumes that all devices will be enumerated
> before the driver is loaded and acts upon this knowledge. The whole
> schema is fragile (I mean can I compile it in the kernel and see
> breaking because of  link order changes and driver is now initialized
> before PCI devices are scanned? Possibly...).

So yeah, I agree with you.  This is a totally unnecessary and fragile
hack which shouldn't exist.  The point of pushing "do it in a generic
manner" is forcing people to step away from their myopic views.  It's
kinda easy to get trapped in one's own bubble and trying to apply the
solution (or even the problem itself) at larger scale often helps
actually understanding the larger picture.  And there are off chances
that the person sees a clear use case that other people fail to see.
If the person can come up with an acceptable generic mechanism which
can justify its complexity, it's a win-win situation.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:04                           ` Borislav Petkov
@ 2015-03-19 17:10                             ` Tejun Heo
  2015-03-19 17:15                               ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 17:10 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 06:04:17PM +0100, Borislav Petkov wrote:
> On Thu, Mar 19, 2015 at 01:03:39PM -0400, Tejun Heo wrote:
> > Just don't do it. It makes the driver behave differently from others
> > for no good reason. Driver being loaded or not doesn't indicate
> > anything anymore other than the code is loaded and you're adding
> > complexity to just make it more confusing and you don't have good
> > enough technical reasons to justify the deviation. What are you even
> > fighting for?
> 
> Scroll back a couple of messages.

lsmod indicating hardware capability?  Really?  You're just adding
confusion to the mix.  Stop.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:10                             ` Tejun Heo
@ 2015-03-19 17:15                               ` Tejun Heo
  2015-03-19 17:27                                 ` Borislav Petkov
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 17:15 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 01:10:47PM -0400, Tejun Heo wrote:
> > Scroll back a couple of messages.
> 
> lsmod indicating hardware capability?  Really?  You're just adding
> confusion to the mix.  Stop.

To add a bit, seriously, try to take a step back from your one driver
and look at the larger picture.  The association between module being
loaded or not and hardware capability has been long broken.  It's not
a useable way to communicate anything to userland.  What if the module
is built-in?  What's the difference between your precious one driver
and all others?  How is userland supposed to tell?  This is really you
implementing what should have been in your /etc/rc.local in kernel
driver and is totally unacceptable.  Please don't pull stunts like
this.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 16:57               ` Dmitry Torokhov
@ 2015-03-19 17:22                 ` Tejun Heo
  0 siblings, 0 replies; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 17:22 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Borislav Petkov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 09:57:41AM -0700, Dmitry Torokhov wrote:
> In all fairness platform_driver_probe() does pretty much the same and
> forces us to mark thus drivers with PROBE_FORCE_SYNCHRONOUS as well. And
> there are quite a few of them:
> 
> dtor@dtor-ws:~/kernel/work$ grep -r "platform_driver_probe" drivers/ |
> wc -l
> 186
> 
> What makes edac unusual is that PCI bus is hot-pluggable and thus PCI
> drivers are rarely one-shot.

As long as it's not buried in each driver and has some generic model,
it's okay.  They're at least annotated and digging them out and
handling them as a class of drivers is okay but we really should stay
away from one-off hacks in individual drivers.  Things like that add a
lot of overhead in the long term.  I'm just kinda baffled that
Borislav's response to "that's hacky, let's not do that or do that in
a generic manner" is coming back with more hacks.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:15                               ` Tejun Heo
@ 2015-03-19 17:27                                 ` Borislav Petkov
  2015-03-19 17:47                                   ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 17:27 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 01:15:57PM -0400, Tejun Heo wrote:
> To add a bit, seriously, try to take a step back from your one driver
> and look at the larger picture.  The association between module being
> loaded or not and hardware capability has been long broken.  It's not
> a useable way to communicate anything to userland.  What if the module
> is built-in?  What's the difference between your precious one driver
> and all others?  How is userland supposed to tell?  This is really you
> implementing what should have been in your /etc/rc.local in kernel
> driver and is totally unacceptable.  Please don't pull stunts like
> this.

Ok, since when is a driver returning !0 from its init routine and thus
not registering, wrong? And my "precious" driver, as you put it, is by
far not the only one.

And since when am I the bad guy for wanting to not confuse my users by
simply not loading the driver if there's no need for it?

 [ And yes, I have had bug reports of people saying amd64_edac is loaded
 but why am I not getting any errors reported. ]

And yes, it is dumb to load drivers and leave them loaded even when
there's no need for them/no use. Not from some layering/driver
model/whatever technical perspective but from a simple usability
standpoint.

If I do lsmod and see a bunch of drivers but don't know what is used or
not, then we have failed.

And why are you wasting so much time with debating this?

If the driver init routine would do

	if (!ecc_supported)
		return -EINVAL;

would be fine for you but if it did the same thing in a more involved
manner, then that's a problem?

Geez.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:27                                 ` Borislav Petkov
@ 2015-03-19 17:47                                   ` Tejun Heo
  2015-03-19 17:54                                     ` Borislav Petkov
  0 siblings, 1 reply; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 17:47 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

Hello, Borislav.

On Thu, Mar 19, 2015 at 06:27:10PM +0100, Borislav Petkov wrote:
> Ok, since when is a driver returning !0 from its init routine and thus
> not registering, wrong? And my "precious" driver, as you put it, is by
> far not the only one.

The synchronicity actually caused problems with certain storage
drivers which may take a very long time probing and userland timing
out.  We've never been really clear about the two.  Traditionally most
drivers have been synchronous and that worked fine as the hw
configurations have been mostly static.  As things get more dynamic,
decoupling module loading and probing becomes necessary.  Some
interactions aren't immediately obvious.  e.g. because userland is now
more involved in handling hotplug event and thus driver / device
management, they get also more involved with managing modules which in
turn mmakes things like insmod taking five minutes problematic.

> And since when am I the bad guy for wanting to not confuse my users by
> simply not loading the driver if there's no need for it?
>
>  [ And yes, I have had bug reports of people saying amd64_edac is loaded
>  but why am I not getting any errors reported. ]

The same reason that loading e1000 doesn't give the user the matching
ethernet port unless the hardware is actually available.  Please read
on.

> And yes, it is dumb to load drivers and leave them loaded even when
> there's no need for them/no use. Not from some layering/driver
> model/whatever technical perspective but from a simple usability
> standpoint.

That could be a valid point but you can't just go off and implement
that behavior specifically for your driver in a custom way.  If you
think the current overall behavior is bad, please justify your
reasoning and work on a generic solution.  The specific behavior isn't
the core problem here.  The way your driver deviates in an one-off way
is.

> If I do lsmod and see a bunch of drivers but don't know what is used or
> not, then we have failed.

Please see above.

> And why are you wasting so much time with debating this?

Because I've spent so much time and effort hunting down one-off cases
like this all over the place while working on the driver model,
workqueue, freezer and so on.  These things do add up and shut off
possibilities of a lot more meaningful possibilities.  This is the
wrong trade off to make.

> If the driver init routine would do
> 
> 	if (!ecc_supported)
> 		return -EINVAL;
> 
> would be fine for you but if it did the same thing in a more involved
> manner, then that's a problem?

The way -probe() behaves across different drivers is kind of a
scattershot right now but at least the above would have been easy to
spot and understand unlike the current hack.  Again, if you think this
is a problem worth solving and it might as well be, please do so in a
generic manner.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:47                                   ` Tejun Heo
@ 2015-03-19 17:54                                     ` Borislav Petkov
  2015-03-19 18:05                                       ` Tejun Heo
  0 siblings, 1 reply; 30+ messages in thread
From: Borislav Petkov @ 2015-03-19 17:54 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 01:47:38PM -0400, Tejun Heo wrote:
> The way -probe() behaves across different drivers is kind of a
> scattershot right now but at least the above would have been easy to
> spot and understand unlike the current hack.  Again, if you think this
> is a problem worth solving and it might as well be, please do so in a
> generic manner.

Ok, let's mark the driver synchronous for now and I can take a look when
there's more time.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early.
  2015-03-19 17:54                                     ` Borislav Petkov
@ 2015-03-19 18:05                                       ` Tejun Heo
  0 siblings, 0 replies; 30+ messages in thread
From: Tejun Heo @ 2015-03-19 18:05 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dmitry Torokhov, Doug Thompson, linux-kernel, linux-edac,
	Mauro Carvalho Chehab, Tetsuo Handa, Olof Johansson,
	Arjan van de Ven, Greg Kroah-Hartman, Luis R . Rodriguez

On Thu, Mar 19, 2015 at 06:54:22PM +0100, Borislav Petkov wrote:
> Ok, let's mark the driver synchronous for now and I can take a look when
> there's more time.

Yeah, it might come down to just marking drivers which can't see
hotplugged devices with a flag and let the generic driver model fail
probe if the initial scan doesn't give an attached device.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2015-03-19 18:05 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-19  0:49 [RFC/RFT] amd64_edac: avoid doing post-probe setup Dmitry Torokhov
2015-03-19  0:49 ` [PATCH 1/3] EDAC: amd64: stop allocating ecc settings separately Dmitry Torokhov
2015-03-19  0:49 ` [PATCH 2/3] EDAC: amd64_edac: clean up remove_one_instance() Dmitry Torokhov
2015-03-19  0:49 ` [PATCH 3/3] EDAC: amd64_edac: decide if driver can load successfully early Dmitry Torokhov
2015-03-19  9:40   ` Borislav Petkov
2015-03-19 15:29     ` Tejun Heo
2015-03-19 15:35       ` Borislav Petkov
2015-03-19 15:52         ` Dmitry Torokhov
2015-03-19 15:59           ` Borislav Petkov
2015-03-19 16:12             ` Dmitry Torokhov
2015-03-19 16:23               ` Borislav Petkov
2015-03-19 16:33                 ` Tejun Heo
2015-03-19 16:45                   ` Borislav Petkov
2015-03-19 16:49                     ` Tejun Heo
2015-03-19 16:56                       ` Borislav Petkov
2015-03-19 17:03                         ` Tejun Heo
2015-03-19 17:04                           ` Borislav Petkov
2015-03-19 17:10                             ` Tejun Heo
2015-03-19 17:15                               ` Tejun Heo
2015-03-19 17:27                                 ` Borislav Petkov
2015-03-19 17:47                                   ` Tejun Heo
2015-03-19 17:54                                     ` Borislav Petkov
2015-03-19 18:05                                       ` Tejun Heo
2015-03-19 16:52                   ` Dmitry Torokhov
2015-03-19 17:09                     ` Tejun Heo
2015-03-19 15:55         ` Tejun Heo
2015-03-19 16:01           ` Borislav Petkov
2015-03-19 16:12             ` Tejun Heo
2015-03-19 16:57               ` Dmitry Torokhov
2015-03-19 17:22                 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).