All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mlx4: Add support for EEH error recovery
@ 2012-07-20 19:55 Kleber Sacilotto de Souza
  2012-07-22 10:29 ` Or Gerlitz
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-20 19:55 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Jack Morgenstein, Yevgeny Petrilin, Or Gerlitz, cascardo,
	brking, Kleber Sacilotto de Souza

Currently the mlx4 drivers don't have the necessary callbacks to
implement EEH errors detection and recovery, so the PCI layer uses the
probe and remove callbacks to try to recover the device after an error on
the bus. However, these callbacks have race conditions with the internal
catastrophic error recovery functions, which will also detect the error
and this can cause the system to crash if both EEH and catas functions
try to reset the device.

This patch adds the necessary error recovery callbacks and makes sure
that the internal catastrophic error functions will not try to reset the
device in such scenarios. It also adds some calls to
pci_channel_offline() to suppress reads/writes on the bus when the slot
cannot accept I/O operations so we prevent unnecessary accesses to the
bus and speed up the device removal.

Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
---
 drivers/net/ethernet/mellanox/mlx4/catas.c |   25 ++++++++++----
 drivers/net/ethernet/mellanox/mlx4/cmd.c   |   49 ++++++++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/main.c  |   30 ++++++++++++++++-
 3 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
index 915e947..9c656fe 100644
--- a/drivers/net/ethernet/mellanox/mlx4/catas.c
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -69,16 +69,21 @@ static void poll_catas(unsigned long dev_ptr)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	if (readl(priv->catas_err.map)) {
-		dump_err_buf(dev);
-
-		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+		/* If the device is off-line, we cannot try to recover it */
+		if (pci_channel_offline(dev->pdev))
+			mod_timer(&priv->catas_err.timer,
+				  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+		else {
+			dump_err_buf(dev);
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
 
-		if (internal_err_reset) {
-			spin_lock(&catas_lock);
-			list_add(&priv->catas_err.list, &catas_list);
-			spin_unlock(&catas_lock);
+			if (internal_err_reset) {
+				spin_lock(&catas_lock);
+				list_add(&priv->catas_err.list, &catas_list);
+				spin_unlock(&catas_lock);
 
-			queue_work(mlx4_wq, &catas_work);
+				queue_work(mlx4_wq, &catas_work);
+			}
 		}
 	} else
 		mod_timer(&priv->catas_err.timer,
@@ -100,6 +105,10 @@ static void catas_reset(struct work_struct *work)
 	list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
 		struct pci_dev *pdev = priv->dev.pdev;
 
+		/* If the device is off-line, we cannot reset it */
+		if (pci_channel_offline(pdev))
+			continue;
+
 		ret = mlx4_restart_one(priv->dev.pdev);
 		/* 'priv' now is not valid */
 		if (ret)
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 7e94987..c8fef43 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -296,7 +296,12 @@ int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
 
 static int cmd_pending(struct mlx4_dev *dev)
 {
-	u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+	u32 status;
+
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
 
 	return (status & swab32(1 << HCR_GO_BIT)) ||
 		(mlx4_priv(dev)->cmd.toggle ==
@@ -314,11 +319,29 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
 
 	mutex_lock(&cmd->hcr_mutex);
 
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		ret = -EIO;
+		goto out;
+	}
+
 	end = jiffies;
 	if (event)
 		end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
 
 	while (cmd_pending(dev)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			ret = -EIO;
+			goto out;
+		}
+
 		if (time_after_eq(jiffies, end)) {
 			mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
 			goto out;
@@ -431,14 +454,33 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
 
 	down(&priv->cmd.poll_sem);
 
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		err = -EIO;
+		goto out;
+	}
+
 	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
 			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
 	if (err)
 		goto out;
 
 	end = msecs_to_jiffies(timeout) + jiffies;
-	while (cmd_pending(dev) && time_before(jiffies, end))
+	while (cmd_pending(dev) && time_before(jiffies, end)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			err = -EIO;
+			goto out;
+		}
+
 		cond_resched();
+	}
 
 	if (cmd_pending(dev)) {
 		err = -ETIMEDOUT;
@@ -532,6 +574,9 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
 	       int out_is_imm, u32 in_modifier, u8 op_modifier,
 	       u16 op, unsigned long timeout, int native)
 {
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
 	if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
 		if (mlx4_priv(dev)->cmd.use_events)
 			return mlx4_cmd_wait(dev, in_param, out_param,
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 4264516..e717091 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1775,6 +1775,9 @@ static int mlx4_get_ownership(struct mlx4_dev *dev)
 	void __iomem *owner;
 	u32 ret;
 
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
 	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
 			MLX4_OWNER_SIZE);
 	if (!owner) {
@@ -1791,6 +1794,9 @@ static void mlx4_free_ownership(struct mlx4_dev *dev)
 {
 	void __iomem *owner;
 
+	if (pci_channel_offline(dev->pdev))
+		return;
+
 	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
 			MLX4_OWNER_SIZE);
 	if (!owner) {
@@ -2237,11 +2243,33 @@ static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = {
 
 MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
 
+static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	mlx4_remove_one(pdev);
+
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
+{
+	int ret = __mlx4_init_one(pdev, NULL);
+
+	return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+static struct pci_error_handlers mlx4_err_handler = {
+	.error_detected = mlx4_pci_err_detected,
+	.slot_reset     = mlx4_pci_slot_reset,
+};
+
 static struct pci_driver mlx4_driver = {
 	.name		= DRV_NAME,
 	.id_table	= mlx4_pci_table,
 	.probe		= mlx4_init_one,
-	.remove		= __devexit_p(mlx4_remove_one)
+	.remove		= __devexit_p(mlx4_remove_one),
+	.err_handler    = &mlx4_err_handler,
 };
 
 static int __init mlx4_verify_params(void)
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-20 19:55 [PATCH] mlx4: Add support for EEH error recovery Kleber Sacilotto de Souza
@ 2012-07-22 10:29 ` Or Gerlitz
       [not found] ` <500BD558.2060803@mellanox.com>
  2012-07-24 21:03 ` David Miller
  2 siblings, 0 replies; 24+ messages in thread
From: Or Gerlitz @ 2012-07-22 10:29 UTC (permalink / raw)
  To: Kleber Sacilotto de Souza
  Cc: David S. Miller, netdev, Jack Morgenstein, Yevgeny Petrilin,
	cascardo, brking, Shlomo Pongratz

On 7/20/2012 10:55 PM, Kleber Sacilotto de Souza wrote:
> Currently the mlx4 drivers don't have the necessary callbacks to
> implement EEH errors detection and recovery [...]

Hi,

Shlomo Pongratz  from Mellanox took a look on the patch, and we'd like 
to clarify with you something - OTOH you mention EEH (Extended Error 
Handling) error recovery which is a PPC related term mentioned in 
Documentation/powerpc/eeh-pci-error-recovery.txt, but OTHO you've 
implemented callbacks mentioned in the AER (Advanced Error Reporting) 
doc of Documentation/PCI/pcieaer-howto.txt, is there anything in the 
code you added which maybe implicitly assumes PPC arch?


Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
       [not found] ` <500BD558.2060803@mellanox.com>
@ 2012-07-23  0:15   ` David Miller
  2012-07-23 13:18     ` Kleber Sacilotto de Souza
  0 siblings, 1 reply; 24+ messages in thread
From: David Miller @ 2012-07-23  0:15 UTC (permalink / raw)
  To: ogerlitz; +Cc: klebers, netdev, jackm, yevgenyp, cascardo, brking, shlomop

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun, 22 Jul 2012 13:26:32 +0300

> is there anything in the code you added which maybe implicitly
> assumes PPC arch?

He implemented support for a standard PCI API in the kernel, he
happened to test it on a particular platform, and I think that's
the long and short of it.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23  0:15   ` David Miller
@ 2012-07-23 13:18     ` Kleber Sacilotto de Souza
  2012-07-23 13:45       ` Or Gerlitz
  0 siblings, 1 reply; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-23 13:18 UTC (permalink / raw)
  To: David Miller; +Cc: ogerlitz, netdev, jackm, yevgenyp, cascardo, brking, shlomop

On 07/22/2012 09:15 PM, David Miller wrote:

> From: Or Gerlitz <ogerlitz@mellanox.com>
> Date: Sun, 22 Jul 2012 13:26:32 +0300
> 
>> is there anything in the code you added which maybe implicitly
>> assumes PPC arch?
> 
> He implemented support for a standard PCI API in the kernel, he
> happened to test it on a particular platform, and I think that's
> the long and short of it.
> 


Exactly. The callbacks implemented are from standard PCI error recovery
(Documentation/PCI/pci-error-recovery.txt) and the changes doesn't
assume any platform in specific. The code was tested only on powerpc
systems since I don't have any mlx4 card on other platforms, however,
these changes shouldn't make the error recover any worse than the
current state.

-- 
Kleber Sacilotto de Souza
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 13:18     ` Kleber Sacilotto de Souza
@ 2012-07-23 13:45       ` Or Gerlitz
  2012-07-23 18:12         ` Kleber Sacilotto de Souza
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2012-07-23 13:45 UTC (permalink / raw)
  To: Kleber Sacilotto de Souza
  Cc: David Miller, netdev, jackm, yevgenyp, cascardo, brking, shlomop

On 7/23/2012 4:18 PM, Kleber Sacilotto de Souza wrote:
> Exactly. The callbacks implemented are from standard PCI error recovery
> (Documentation/PCI/pci-error-recovery.txt) and the changes doesn't
> assume any platform in specific. The code was tested only on powerpc
> systems [...]

So how did you test that? using the kernel provided error injection 
support and user space tool (which?) or in another way? we've trying 
quickly here to inject errors using /sbin/ear-inject from 
ras-utils-6.1-1.el6.x86_64 on a kernel built with

CONFIG_PCIEAER=y
CONFIG_PCIEAER_INJECT=m

and it failed to inject errors, SB details.

Or.
> since I don't have any mlx4 card on other platforms, however,
> these changes shouldn't make the error recover any worse than the
> current state.

> # lspci | grep 08.00.1
> 08:00.1 Ethernet controller: Intel Corporation 82575EB Gigabit Network 
> Connection (rev 02)

> # cat /tmp/intel.aer
> AER
> BUS 8 DEV 0 FN 1
> COR_STATUS BAD_TLP
> HEADER_LOG 0 1 2 3

> # /sbin/aer-inject < /tmp/intel.aer
> Error: Failed to write, Invalid argument



> # strace -F -f /sbin/aer-inject < /tmp/intel.aer
> [...]

> open("/dev/aer_inject", O_WRONLY)       = 3
> write(3, "\10\0\1\0\0\0\0\0@\0\0\0\0\0\0\0\1\0\0\0\2\0\0\0\3\0\0\0", 
> 28) = -1 EINVAL (Invalid argument)
> write(2, "Error: ", 7Error: )                  = 7
> write(2, "Failed to write", 15Failed to write)         = 15
> write(2, ", Invalid argument\n", 19, Invalid argument
> )    = 19
> exit_group(-1)                          = ?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 13:45       ` Or Gerlitz
@ 2012-07-23 18:12         ` Kleber Sacilotto de Souza
  2012-07-23 20:53           ` Kleber Sacilotto de Souza
  0 siblings, 1 reply; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-23 18:12 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, netdev, jackm, yevgenyp, cascardo, brking, shlomop

On 07/23/2012 10:45 AM, Or Gerlitz wrote:

> On 7/23/2012 4:18 PM, Kleber Sacilotto de Souza wrote:
>> Exactly. The callbacks implemented are from standard PCI error recovery
>> (Documentation/PCI/pci-error-recovery.txt) and the changes doesn't
>> assume any platform in specific. The code was tested only on powerpc
>> systems [...]
> 
> So how did you test that? using the kernel provided error injection
> support and user space tool (which?) or in another way? we've trying
> quickly here to inject errors using /sbin/ear-inject from
> ras-utils-6.1-1.el6.x86_64 on a kernel built with
> 
> CONFIG_PCIEAER=y
> CONFIG_PCIEAER_INJECT=m


For powerpc we have an IBM internal user space tool that injects the
error on the bus with the aid of the system firmware. The kernel used
was built with the option:

CONFIG_EEH=y

and without the AER options. I will run some more tests with the AER
options activated.

> 
> and it failed to inject errors, SB details.
> 
> Or.
>> since I don't have any mlx4 card on other platforms, however,
>> these changes shouldn't make the error recover any worse than the
>> current state.
> 
>> # lspci | grep 08.00.1
>> 08:00.1 Ethernet controller: Intel Corporation 82575EB Gigabit Network
>> Connection (rev 02)
> 
>> # cat /tmp/intel.aer
>> AER
>> BUS 8 DEV 0 FN 1
>> COR_STATUS BAD_TLP
>> HEADER_LOG 0 1 2 3
> 
>> # /sbin/aer-inject < /tmp/intel.aer
>> Error: Failed to write, Invalid argument
> 
> 
> 
>> # strace -F -f /sbin/aer-inject < /tmp/intel.aer
>> [...]
> 
>> open("/dev/aer_inject", O_WRONLY)       = 3
>> write(3, "\10\0\1\0\0\0\0\0@\0\0\0\0\0\0\0\1\0\0\0\2\0\0\0\3\0\0\0",
>> 28) = -1 EINVAL (Invalid argument)
>> write(2, "Error: ", 7Error: )                  = 7
>> write(2, "Failed to write", 15Failed to write)         = 15
>> write(2, ", Invalid argument\n", 19, Invalid argument
>> )    = 19
>> exit_group(-1)                          = ?
> 
> 
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



-- 
Kleber Sacilotto de Souza
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 18:12         ` Kleber Sacilotto de Souza
@ 2012-07-23 20:53           ` Kleber Sacilotto de Souza
  2012-07-23 21:26             ` Or Gerlitz
  0 siblings, 1 reply; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-23 20:53 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, netdev, jackm, yevgenyp, cascardo, brking, shlomop

On 07/23/2012 03:12 PM, Kleber Sacilotto de Souza wrote:

> On 07/23/2012 10:45 AM, Or Gerlitz wrote:
> 
>> On 7/23/2012 4:18 PM, Kleber Sacilotto de Souza wrote:
>>> Exactly. The callbacks implemented are from standard PCI error recovery
>>> (Documentation/PCI/pci-error-recovery.txt) and the changes doesn't
>>> assume any platform in specific. The code was tested only on powerpc
>>> systems [...]
>>
>> So how did you test that? using the kernel provided error injection
>> support and user space tool (which?) or in another way? we've trying
>> quickly here to inject errors using /sbin/ear-inject from
>> ras-utils-6.1-1.el6.x86_64 on a kernel built with
>>
>> CONFIG_PCIEAER=y
>> CONFIG_PCIEAER_INJECT=m
> 
> 
> For powerpc we have an IBM internal user space tool that injects the
> error on the bus with the aid of the system firmware. The kernel used
> was built with the option:
> 
> CONFIG_EEH=y
> 
> and without the AER options. I will run some more tests with the AER
> options activated.


I tested the powerpc error injection with

CONFIG_EEH=y
CONFIG_PCIEAER=y
CONFIG_PCIEAER_INJECT=m

and with the aer_inject module loaded and it didn't affect the EEH
recovery, the adapter recovered as expected.

> 
>>
>> and it failed to inject errors, SB details.
>>
>> Or.
>>> since I don't have any mlx4 card on other platforms, however,
>>> these changes shouldn't make the error recover any worse than the
>>> current state.
>>
>>> # lspci | grep 08.00.1
>>> 08:00.1 Ethernet controller: Intel Corporation 82575EB Gigabit Network
>>> Connection (rev 02)
>>
>>> # cat /tmp/intel.aer
>>> AER
>>> BUS 8 DEV 0 FN 1
>>> COR_STATUS BAD_TLP
>>> HEADER_LOG 0 1 2 3
>>
>>> # /sbin/aer-inject < /tmp/intel.aer
>>> Error: Failed to write, Invalid argument
>>
>>
>>
>>> # strace -F -f /sbin/aer-inject < /tmp/intel.aer
>>> [...]
>>
>>> open("/dev/aer_inject", O_WRONLY)       = 3
>>> write(3, "\10\0\1\0\0\0\0\0@\0\0\0\0\0\0\0\1\0\0\0\2\0\0\0\3\0\0\0",
>>> 28) = -1 EINVAL (Invalid argument)
>>> write(2, "Error: ", 7Error: )                  = 7
>>> write(2, "Failed to write", 15Failed to write)         = 15
>>> write(2, ", Invalid argument\n", 19, Invalid argument
>>> )    = 19
>>> exit_group(-1)                          = ?
>>
>>
>>
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> 
> 
> 



-- 
Kleber Sacilotto de Souza
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 20:53           ` Kleber Sacilotto de Souza
@ 2012-07-23 21:26             ` Or Gerlitz
  2012-07-23 21:34               ` David Miller
  2012-07-24 13:12               ` Kleber Sacilotto de Souza
  0 siblings, 2 replies; 24+ messages in thread
From: Or Gerlitz @ 2012-07-23 21:26 UTC (permalink / raw)
  To: Kleber Sacilotto de Souza
  Cc: Or Gerlitz, David Miller, netdev, jackm, yevgenyp, cascardo,
	brking, shlomop

Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
>> For powerpc we have an IBM internal user space tool that injects the
>> error on the bus with the aid of the system firmware. The kernel used
>> was built with the option:
>> CONFIG_EEH=y
>> and without the AER options. I will run some more tests with the AER
>> options activated.

> I tested the powerpc error injection with
>
> CONFIG_EEH=y
> CONFIG_PCIEAER=y
> CONFIG_PCIEAER_INJECT=m
>
> and with the aer_inject module loaded and it didn't affect the EEH
> recovery, the adapter recovered as expected.

I wasn't sure to follow what did you mean by "it didn't affect the EEH
recovery", how did you use the aer_inject module, is that through
user-space tool which is available for us?

Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 21:26             ` Or Gerlitz
@ 2012-07-23 21:34               ` David Miller
  2012-07-23 21:42                 ` Or Gerlitz
  2012-07-24 13:12               ` Kleber Sacilotto de Souza
  1 sibling, 1 reply; 24+ messages in thread
From: David Miller @ 2012-07-23 21:34 UTC (permalink / raw)
  To: or.gerlitz
  Cc: klebers, ogerlitz, netdev, jackm, yevgenyp, cascardo, brking, shlomop

From: Or Gerlitz <or.gerlitz@gmail.com>
Date: Tue, 24 Jul 2012 00:26:51 +0300

> I wasn't sure to follow what did you mean by "it didn't affect the EEH
> recovery", how did you use the aer_inject module, is that through
> user-space tool which is available for us?

Can we please move forward, if he implemented the feature properly
and he tested it successfully, unless you can find a logic or
stylistic flaw in his patch please ACK it.

You can't hold his changes back while you work out how _YOU_ can
test it to your liking.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 21:34               ` David Miller
@ 2012-07-23 21:42                 ` Or Gerlitz
  2012-07-23 21:44                   ` David Miller
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2012-07-23 21:42 UTC (permalink / raw)
  To: David Miller
  Cc: klebers, ogerlitz, netdev, jackm, yevgenyp, cascardo, brking, shlomop

On Tue, Jul 24, 2012 at 12:34 AM, David Miller <davem@davemloft.net> wrote:

> Can we please move forward, if he implemented the feature properly
> and he tested it successfully, unless you can find a logic or
> stylistic flaw in his patch please ACK it.
>
> You can't hold his changes back while you work out how _YOU_ can
> test it to your liking.

Hi Dave,

We're trying to act in  R/R (Responsive and Responsible) manner -
namely Shlomo did code review of the patches and we want to further
evaluate them by testing, I think its fully legitimate to test a patch
before ACK-ing.  Doing these types of tests isn't around my personal
typical daily menu and I'm asking for some directives from the author
on how to issue that testing, I don't see what wrong here. We're
planning anyway to go deeper around this area and enhance the PCI
hotplug /error handling related code in the driver, so there's an
initial learing curve here, makes sense? we can move the Q&A for the
testing to be off-list if you prefer it to go that way.

Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 21:42                 ` Or Gerlitz
@ 2012-07-23 21:44                   ` David Miller
  2012-07-23 22:02                     ` Or Gerlitz
  0 siblings, 1 reply; 24+ messages in thread
From: David Miller @ 2012-07-23 21:44 UTC (permalink / raw)
  To: or.gerlitz
  Cc: klebers, ogerlitz, netdev, jackm, yevgenyp, cascardo, brking, shlomop

From: Or Gerlitz <or.gerlitz@gmail.com>
Date: Tue, 24 Jul 2012 00:42:08 +0300

> On Tue, Jul 24, 2012 at 12:34 AM, David Miller <davem@davemloft.net> wrote:
> 
>> Can we please move forward, if he implemented the feature properly
>> and he tested it successfully, unless you can find a logic or
>> stylistic flaw in his patch please ACK it.
>>
>> You can't hold his changes back while you work out how _YOU_ can
>> test it to your liking.
> 
> Hi Dave,
> 
> We're trying to act in  R/R (Responsive and Responsible) manner -
> namely Shlomo did code review of the patches and we want to further
> evaluate them by testing, I think its fully legitimate to test a patch
> before ACK-ing.  Doing these types of tests isn't around my personal
> typical daily menu and I'm asking for some directives from the author
> on how to issue that testing, I don't see what wrong here. We're
> planning anyway to go deeper around this area and enhance the PCI
> hotplug /error handling related code in the driver, so there's an
> initial learing curve here, makes sense? we can move the Q&A for the
> testing to be off-list if you prefer it to go that way.

But ACK his patch, because you have not found any problems with it.

This is taking days, and you're stalling further progress.

I never let patches rot in patchwork more than a few days, as this
patch has already.

Either ACK or provide a legitimate reason to reject it now.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 21:44                   ` David Miller
@ 2012-07-23 22:02                     ` Or Gerlitz
  2012-07-23 22:21                       ` David Miller
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2012-07-23 22:02 UTC (permalink / raw)
  To: David Miller
  Cc: klebers, ogerlitz, netdev, jackm, yevgenyp, cascardo, brking, shlomop

On Tue, Jul 24, 2012 at 12:44 AM, David Miller <davem@davemloft.net> wrote:
> But ACK his patch, because you have not found any problems with it.

Again, we wanted to test the patch before providing ACK, and it takes us a
bit to catch up on testing of this area, happens.


> This is taking days, and you're stalling further progress.
> I never let patches rot in patchwork more than a few days, as this patch has already.
> Either ACK or provide a legitimate reason to reject it now.

understood. As its a bit off working hours here now, lets see what
tomorrow yield testing wise - but by tomorrow we either ACK or provide
reason to reject/change it.

Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 22:02                     ` Or Gerlitz
@ 2012-07-23 22:21                       ` David Miller
  0 siblings, 0 replies; 24+ messages in thread
From: David Miller @ 2012-07-23 22:21 UTC (permalink / raw)
  To: or.gerlitz
  Cc: klebers, ogerlitz, netdev, jackm, yevgenyp, cascardo, brking, shlomop

From: Or Gerlitz <or.gerlitz@gmail.com>
Date: Tue, 24 Jul 2012 01:02:10 +0300

> As its a bit off working hours here now, lets see what tomorrow
> yield testing wise - but by tomorrow we either ACK or provide reason
> to reject/change it.

Thank you.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-23 21:26             ` Or Gerlitz
  2012-07-23 21:34               ` David Miller
@ 2012-07-24 13:12               ` Kleber Sacilotto de Souza
  2012-07-24 17:09                 ` Shlomo Pongartz
  1 sibling, 1 reply; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-24 13:12 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Or Gerlitz, David Miller, netdev, jackm, yevgenyp, cascardo,
	brking, shlomop

On 07/23/2012 06:26 PM, Or Gerlitz wrote:

> Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
>>> For powerpc we have an IBM internal user space tool that injects the
>>> error on the bus with the aid of the system firmware. The kernel used
>>> was built with the option:
>>> CONFIG_EEH=y
>>> and without the AER options. I will run some more tests with the AER
>>> options activated.
> 
>> I tested the powerpc error injection with
>>
>> CONFIG_EEH=y
>> CONFIG_PCIEAER=y
>> CONFIG_PCIEAER_INJECT=m
>>
>> and with the aer_inject module loaded and it didn't affect the EEH
>> recovery, the adapter recovered as expected.
> 
> I wasn't sure to follow what did you mean by "it didn't affect the EEH
> recovery", how did you use the aer_inject module, is that through
> user-space tool which is available for us?


I wanted to say that I was testing before only with the EEH option
activated, then I activated the AER options on my powerpc system just to
make sure these options when activate wouldn't affect the EEH recovery.
I haven't injected and AER error since I don't have a system with
hardware support for it.


Thanks,
-- 
Kleber Sacilotto de Souza
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 13:12               ` Kleber Sacilotto de Souza
@ 2012-07-24 17:09                 ` Shlomo Pongartz
  2012-07-24 17:35                   ` Kleber Sacilotto de Souza
  0 siblings, 1 reply; 24+ messages in thread
From: Shlomo Pongartz @ 2012-07-24 17:09 UTC (permalink / raw)
  To: Kleber Sacilotto de Souza
  Cc: Or Gerlitz, Or Gerlitz, David Miller, netdev, jackm, yevgenyp,
	cascardo, brking

On 7/24/2012 4:12 PM, Kleber Sacilotto de Souza wrote:
> On 07/23/2012 06:26 PM, Or Gerlitz wrote:
>
>> Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
>>>> For powerpc we have an IBM internal user space tool that injects the
>>>> error on the bus with the aid of the system firmware. The kernel used
>>>> was built with the option:
>>>> CONFIG_EEH=y
>>>> and without the AER options. I will run some more tests with the AER
>>>> options activated.
>>> I tested the powerpc error injection with
>>>
>>> CONFIG_EEH=y
>>> CONFIG_PCIEAER=y
>>> CONFIG_PCIEAER_INJECT=m
>>>
>>> and with the aer_inject module loaded and it didn't affect the EEH
>>> recovery, the adapter recovered as expected.
>> I wasn't sure to follow what did you mean by "it didn't affect the EEH
>> recovery", how did you use the aer_inject module, is that through
>> user-space tool which is available for us?
>
> I wanted to say that I was testing before only with the EEH option
> activated, then I activated the AER options on my powerpc system just to
> make sure these options when activate wouldn't affect the EEH recovery.
> I haven't injected and AER error since I don't have a system with
> hardware support for it.
>
>
> Thanks,

Hi

Using a special extender card I've powered down the card.
None of the callbacks were called (I added printks to be sure).
Shouldn't one on the callbacks be called?

Shlomo Pongratz.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 17:09                 ` Shlomo Pongartz
@ 2012-07-24 17:35                   ` Kleber Sacilotto de Souza
  2012-07-24 18:08                     ` Thadeu Lima de Souza Cascardo
  0 siblings, 1 reply; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-24 17:35 UTC (permalink / raw)
  To: Shlomo Pongartz
  Cc: Or Gerlitz, Or Gerlitz, David Miller, netdev, jackm, yevgenyp,
	cascardo, brking

On 07/24/2012 02:09 PM, Shlomo Pongartz wrote:

> On 7/24/2012 4:12 PM, Kleber Sacilotto de Souza wrote:
>> On 07/23/2012 06:26 PM, Or Gerlitz wrote:
>>
>>> Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
>>>>> For powerpc we have an IBM internal user space tool that injects the
>>>>> error on the bus with the aid of the system firmware. The kernel used
>>>>> was built with the option:
>>>>> CONFIG_EEH=y
>>>>> and without the AER options. I will run some more tests with the AER
>>>>> options activated.
>>>> I tested the powerpc error injection with
>>>>
>>>> CONFIG_EEH=y
>>>> CONFIG_PCIEAER=y
>>>> CONFIG_PCIEAER_INJECT=m
>>>>
>>>> and with the aer_inject module loaded and it didn't affect the EEH
>>>> recovery, the adapter recovered as expected.
>>> I wasn't sure to follow what did you mean by "it didn't affect the EEH
>>> recovery", how did you use the aer_inject module, is that through
>>> user-space tool which is available for us?
>>
>> I wanted to say that I was testing before only with the EEH option
>> activated, then I activated the AER options on my powerpc system just to
>> make sure these options when activate wouldn't affect the EEH recovery.
>> I haven't injected and AER error since I don't have a system with
>> hardware support for it.
>>
>>
>> Thanks,
> 
> Hi
> 
> Using a special extender card I've powered down the card.
> None of the callbacks were called (I added printks to be sure).
> Shouldn't one on the callbacks be called?
> 
> Shlomo Pongratz.
> 


What does this extender card do exactly? If it does hot plugging, it
will call the remove and probe callbacks.


-- 
Kleber Sacilotto de Souza
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 17:35                   ` Kleber Sacilotto de Souza
@ 2012-07-24 18:08                     ` Thadeu Lima de Souza Cascardo
  2012-07-24 18:35                       ` Shlomo Pongratz
  2012-07-24 18:39                       ` Shlomo Pongratz
  0 siblings, 2 replies; 24+ messages in thread
From: Thadeu Lima de Souza Cascardo @ 2012-07-24 18:08 UTC (permalink / raw)
  To: Kleber Sacilotto de Souza
  Cc: Shlomo Pongartz, Or Gerlitz, Or Gerlitz, David Miller, netdev,
	jackm, yevgenyp, brking

On Tue, Jul 24, 2012 at 02:35:51PM -0300, Kleber Sacilotto de Souza wrote:
> On 07/24/2012 02:09 PM, Shlomo Pongartz wrote:
> 
> > On 7/24/2012 4:12 PM, Kleber Sacilotto de Souza wrote:
> >> On 07/23/2012 06:26 PM, Or Gerlitz wrote:
> >>
> >>> Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
> >>>>> For powerpc we have an IBM internal user space tool that injects the
> >>>>> error on the bus with the aid of the system firmware. The kernel used
> >>>>> was built with the option:
> >>>>> CONFIG_EEH=y
> >>>>> and without the AER options. I will run some more tests with the AER
> >>>>> options activated.
> >>>> I tested the powerpc error injection with
> >>>>
> >>>> CONFIG_EEH=y
> >>>> CONFIG_PCIEAER=y
> >>>> CONFIG_PCIEAER_INJECT=m
> >>>>
> >>>> and with the aer_inject module loaded and it didn't affect the EEH
> >>>> recovery, the adapter recovered as expected.
> >>> I wasn't sure to follow what did you mean by "it didn't affect the EEH
> >>> recovery", how did you use the aer_inject module, is that through
> >>> user-space tool which is available for us?
> >>
> >> I wanted to say that I was testing before only with the EEH option
> >> activated, then I activated the AER options on my powerpc system just to
> >> make sure these options when activate wouldn't affect the EEH recovery.
> >> I haven't injected and AER error since I don't have a system with
> >> hardware support for it.
> >>
> >>
> >> Thanks,
> > 
> > Hi
> > 
> > Using a special extender card I've powered down the card.
> > None of the callbacks were called (I added printks to be sure).
> > Shouldn't one on the callbacks be called?
> > 
> > Shlomo Pongratz.
> > 
> 
> 
> What does this extender card do exactly? If it does hot plugging, it
> will call the remove and probe callbacks.
> 
> 
> -- 
> Kleber Sacilotto de Souza
> IBM Linux Technology Center

I assume it just powers down the card, ie, it will not respond anymore
to any bus messages, which may cause an error report.

Shlomo, is it the system you are using AER-capable? From what I see from
code and documentation, you must have root ports which support AER. Is
that the case?

Regards.
Cascardo.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 18:08                     ` Thadeu Lima de Souza Cascardo
@ 2012-07-24 18:35                       ` Shlomo Pongratz
  2012-07-24 18:39                       ` Shlomo Pongratz
  1 sibling, 0 replies; 24+ messages in thread
From: Shlomo Pongratz @ 2012-07-24 18:35 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo, Kleber Sacilotto de Souza
  Cc: Or Gerlitz, Or Gerlitz, David Miller, netdev, jackm,
	Yevgeny Petrilin, brking

On Tue, Jul 24, 2012 at 02:35:51PM -0300, Kleber Sacilotto de Souza wrote:
> On 07/24/2012 02:09 PM, Shlomo Pongartz wrote:
>
> > On 7/24/2012 4:12 PM, Kleber Sacilotto de Souza wrote:
> >> On 07/23/2012 06:26 PM, Or Gerlitz wrote:
> >>
> >>> Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
> >>>>> For powerpc we have an IBM internal user space tool that injects the
> >>>>> error on the bus with the aid of the system firmware. The kernel used
> >>>>> was built with the option:
> >>>>> CONFIG_EEH=y
> >>>>> and without the AER options. I will run some more tests with the AER
> >>>>> options activated.
> >>>> I tested the powerpc error injection with
> >>>>
> >>>> CONFIG_EEH=y
> >>>> CONFIG_PCIEAER=y
> >>>> CONFIG_PCIEAER_INJECT=m
> >>>>
> >>>> and with the aer_inject module loaded and it didn't affect the EEH
> >>>> recovery, the adapter recovered as expected.
> >>> I wasn't sure to follow what did you mean by "it didn't affect the EEH
> >>> recovery", how did you use the aer_inject module, is that through
> >>> user-space tool which is available for us?
> >>
> >> I wanted to say that I was testing before only with the EEH option
> >> activated, then I activated the AER options on my powerpc system just to
> >> make sure these options when activate wouldn't affect the EEH recovery.
> >> I haven't injected and AER error since I don't have a system with
> >> hardware support for it.
> >>
> >>
> >> Thanks,
> >
> > Hi
> >
> > Using a special extender card I've powered down the card.
> > None of the callbacks were called (I added printks to be sure).
> > Shouldn't one on the callbacks be called?
> >
> > Shlomo Pongratz.
> >
>
>
> What does this extender card do exactly? If it does hot plugging, it
> will call the remove and probe callbacks.
>
>
> --
> Kleber Sacilotto de Souza
> IBM Linux Technology Center

I assume it just powers down the card, ie, it will not respond anymore
to any bus messages, which may cause an error report.

Shlomo, is it the system you are using AER-capable? From what I see from
code and documentation, you must have root ports which support AER. Is
that the case?

Regards.
Cascardo.


The extender card is a passive card that only enables me to power on/off the PCI card stacked on it.
I'll check the mother board and BIOS ASAP.

Regards,
Shlomo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 18:08                     ` Thadeu Lima de Souza Cascardo
  2012-07-24 18:35                       ` Shlomo Pongratz
@ 2012-07-24 18:39                       ` Shlomo Pongratz
  1 sibling, 0 replies; 24+ messages in thread
From: Shlomo Pongratz @ 2012-07-24 18:39 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo, Kleber Sacilotto de Souza
  Cc: Or Gerlitz, Or Gerlitz, David Miller, netdev, jackm,
	Yevgeny Petrilin, brking


________________________________________
From: Thadeu Lima de Souza Cascardo [cascardo@linux.vnet.ibm.com]
Sent: Tuesday, July 24, 2012 9:08 PM
To: Kleber Sacilotto de Souza
Cc: Shlomo Pongratz; Or Gerlitz; Or Gerlitz; David Miller; netdev@vger.kernel.org; jackm@dev.mellanox.co.il; Yevgeny Petrilin; brking@linux.vnet.ibm.com
Subject: Re: [PATCH] mlx4: Add support for EEH error recovery

On Tue, Jul 24, 2012 at 02:35:51PM -0300, Kleber Sacilotto de Souza wrote:
> On 07/24/2012 02:09 PM, Shlomo Pongartz wrote:
>
> > On 7/24/2012 4:12 PM, Kleber Sacilotto de Souza wrote:
> >> On 07/23/2012 06:26 PM, Or Gerlitz wrote:
> >>
> >>> Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> wrote:
> >>>>> For powerpc we have an IBM internal user space tool that injects the
> >>>>> error on the bus with the aid of the system firmware. The kernel used
> >>>>> was built with the option:
> >>>>> CONFIG_EEH=y
> >>>>> and without the AER options. I will run some more tests with the AER
> >>>>> options activated.
> >>>> I tested the powerpc error injection with
> >>>>
> >>>> CONFIG_EEH=y
> >>>> CONFIG_PCIEAER=y
> >>>> CONFIG_PCIEAER_INJECT=m
> >>>>
> >>>> and with the aer_inject module loaded and it didn't affect the EEH
> >>>> recovery, the adapter recovered as expected.
> >>> I wasn't sure to follow what did you mean by "it didn't affect the EEH
> >>> recovery", how did you use the aer_inject module, is that through
> >>> user-space tool which is available for us?
> >>
> >> I wanted to say that I was testing before only with the EEH option
> >> activated, then I activated the AER options on my powerpc system just to
> >> make sure these options when activate wouldn't affect the EEH recovery.
> >> I haven't injected and AER error since I don't have a system with
> >> hardware support for it.
> >>
> >>
> >> Thanks,
> >
> > Hi
> >
> > Using a special extender card I've powered down the card.
> > None of the callbacks were called (I added printks to be sure).
> > Shouldn't one on the callbacks be called?
> >
> > Shlomo Pongratz.
> >
>
>
> What does this extender card do exactly? If it does hot plugging, it
> will call the remove and probe callbacks.
>
>
> --
> Kleber Sacilotto de Souza
> IBM Linux Technology Center

I assume it just powers down the card, ie, it will not respond anymore
to any bus messages, which may cause an error report.

Shlomo, is it the system you are using AER-capable? From what I see from
code and documentation, you must have root ports which support AER. Is
that the case?

Regards.
Cascardo.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-20 19:55 [PATCH] mlx4: Add support for EEH error recovery Kleber Sacilotto de Souza
  2012-07-22 10:29 ` Or Gerlitz
       [not found] ` <500BD558.2060803@mellanox.com>
@ 2012-07-24 21:03 ` David Miller
  2012-07-24 22:30   ` Or Gerlitz
  2 siblings, 1 reply; 24+ messages in thread
From: David Miller @ 2012-07-24 21:03 UTC (permalink / raw)
  To: klebers; +Cc: netdev, jackm, yevgenyp, ogerlitz, cascardo, brking

From: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Date: Fri, 20 Jul 2012 16:55:43 -0300

> Currently the mlx4 drivers don't have the necessary callbacks to
> implement EEH errors detection and recovery, so the PCI layer uses the
> probe and remove callbacks to try to recover the device after an error on
> the bus. However, these callbacks have race conditions with the internal
> catastrophic error recovery functions, which will also detect the error
> and this can cause the system to crash if both EEH and catas functions
> try to reset the device.
> 
> This patch adds the necessary error recovery callbacks and makes sure
> that the internal catastrophic error functions will not try to reset the
> device in such scenarios. It also adds some calls to
> pci_channel_offline() to suppress reads/writes on the bus when the slot
> cannot accept I/O operations so we prevent unnecessary accesses to the
> bus and speed up the device removal.
> 
> Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>

Or, you promised an ACK today, I still haven't seen it.

There is no reason Kleber should be penalized and have his work
miss the merge window just because you guys can't be bothered
to approve this patch in a reasonable amount of time.

Therefore I'm just going to apply it later today, and don't do this
with someone's submission ever again, it impedes progress and
frustrates contributors.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 21:03 ` David Miller
@ 2012-07-24 22:30   ` Or Gerlitz
  2012-07-25 14:38     ` Shlomo Pongartz
       [not found]     ` <5010070B.5040405@mellanox.com>
  0 siblings, 2 replies; 24+ messages in thread
From: Or Gerlitz @ 2012-07-24 22:30 UTC (permalink / raw)
  To: David Miller
  Cc: klebers, netdev, jackm, yevgenyp, ogerlitz, cascardo, brking,
	Shlomo Pongratz

On Wed, Jul 25, 2012 at 12:03 AM, David Miller <davem@davemloft.net> wrote:

> Or, you promised an ACK today, I still haven't seen it.

It turned out that reacted we did, but not the ACK way.

Again, code review wise, we intended to ack it, but Shlomo has set
testing environment, under which he had some issues with the patch, as
such he preferred not to ACK it but rather bring up the issues on the
list and sort them out 1st. I thought it would be wrong to over-rule
this preference of him, and this way is fair-enough with the author
and your guide-lines, maybe I had to be more aggressive with ACKing
this, as of the merge window closing coming. So tomorrow.

Or.

> There is no reason Kleber should be penalized and have his work
> miss the merge window just because you guys can't be bothered
> to approve this patch in a reasonable amount of time.
>
> Therefore I'm just going to apply it later today, and don't do this
> with someone's submission ever again, it impedes progress and
> frustrates contributors.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
  2012-07-24 22:30   ` Or Gerlitz
@ 2012-07-25 14:38     ` Shlomo Pongartz
       [not found]     ` <5010070B.5040405@mellanox.com>
  1 sibling, 0 replies; 24+ messages in thread
From: Shlomo Pongartz @ 2012-07-25 14:38 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, klebers, netdev, jackm, yevgenyp, ogerlitz,
	cascardo, brking

On 7/25/2012 1:30 AM, Or Gerlitz wrote:
> On Wed, Jul 25, 2012 at 12:03 AM, David Miller <davem@davemloft.net> wrote:
>
>> Or, you promised an ACK today, I still haven't seen it.
> It turned out that reacted we did, but not the ACK way.
>
> Again, code review wise, we intended to ack it, but Shlomo has set
> testing environment, under which he had some issues with the patch, as
> such he preferred not to ACK it but rather bring up the issues on the
> list and sort them out 1st. I thought it would be wrong to over-rule
> this preference of him, and this way is fair-enough with the author
> and your guide-lines, maybe I had to be more aggressive with ACKing
> this, as of the merge window closing coming. So tomorrow.
>
> Or.
>
>> There is no reason Kleber should be penalized and have his work
>> miss the merge window just because you guys can't be bothered
>> to approve this patch in a reasonable amount of time.
>>
>> Therefore I'm just going to apply it later today, and don't do this
>> with someone's submission ever again, it impedes progress and
>> frustrates contributors.
> .
>
The code looks fine and can be merged.
We will continue to test it under various conditions.
Sorry for the delay.
Thank you.
Shlomo Pongratz.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
       [not found]     ` <5010070B.5040405@mellanox.com>
@ 2012-07-25 15:02       ` Kleber Sacilotto de Souza
  2012-07-25 22:19       ` David Miller
  1 sibling, 0 replies; 24+ messages in thread
From: Kleber Sacilotto de Souza @ 2012-07-25 15:02 UTC (permalink / raw)
  To: Shlomo Pongartz
  Cc: Or Gerlitz, David Miller, netdev, jackm, yevgenyp, ogerlitz,
	cascardo, brking

On 07/25/2012 11:47 AM, Shlomo Pongartz wrote:

> On 7/25/2012 1:30 AM, Or Gerlitz wrote:
>> On Wed, Jul 25, 2012 at 12:03 AM, David Miller <davem@davemloft.net> wrote:
>>
>>> Or, you promised an ACK today, I still haven't seen it.
>> It turned out that reacted we did, but not the ACK way.
>>
>> Again, code review wise, we intended to ack it, but Shlomo has set
>> testing environment, under which he had some issues with the patch, as
>> such he preferred not to ACK it but rather bring up the issues on the
>> list and sort them out 1st. I thought it would be wrong to over-rule
>> this preference of him, and this way is fair-enough with the author
>> and your guide-lines, maybe I had to be more aggressive with ACKing
>> this, as of the merge window closing coming. So tomorrow.
>>
>> Or.
>>
>>> There is no reason Kleber should be penalized and have his work
>>> miss the merge window just because you guys can't be bothered
>>> to approve this patch in a reasonable amount of time.
>>>
>>> Therefore I'm just going to apply it later today, and don't do this
>>> with someone's submission ever again, it impedes progress and
>>> frustrates contributors.
>> .
>>
> 
> Hi Kleber,
> 
>  
> 
> I reviewed the patch and it seems fine, here's my ACK
> 
>  
> 
> Acked-by: Shlomo Pongratz <shlomop@mellanox.com
> <mailto:shlomop@mellanox.com>>
> 
> 


Hi Shlomo,

Thank you for reviewing my patch.

-- 
Kleber Sacilotto de Souza
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mlx4: Add support for EEH error recovery
       [not found]     ` <5010070B.5040405@mellanox.com>
  2012-07-25 15:02       ` Kleber Sacilotto de Souza
@ 2012-07-25 22:19       ` David Miller
  1 sibling, 0 replies; 24+ messages in thread
From: David Miller @ 2012-07-25 22:19 UTC (permalink / raw)
  To: shlomop
  Cc: or.gerlitz, klebers, netdev, jackm, yevgenyp, ogerlitz, cascardo, brking

From: Shlomo Pongartz <shlomop@mellanox.com>
Date: Wed, 25 Jul 2012 17:47:39 +0300

> Acked-by: Shlomo Pongratz <shlomop@mellanox.com
> <mailto:shlomop@mellanox.com>>

This really isn't how an Acked-by: tag should look, there should
simply be a closing ">" at the end of the first line, but it's somehow
been replaced with a newline and that "<mailto:shlomop@mellanox.com>"
URL instead.

I'll fix it up this time, but in the future I will not and your
ACK will simply get lost.

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2012-07-25 22:19 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-20 19:55 [PATCH] mlx4: Add support for EEH error recovery Kleber Sacilotto de Souza
2012-07-22 10:29 ` Or Gerlitz
     [not found] ` <500BD558.2060803@mellanox.com>
2012-07-23  0:15   ` David Miller
2012-07-23 13:18     ` Kleber Sacilotto de Souza
2012-07-23 13:45       ` Or Gerlitz
2012-07-23 18:12         ` Kleber Sacilotto de Souza
2012-07-23 20:53           ` Kleber Sacilotto de Souza
2012-07-23 21:26             ` Or Gerlitz
2012-07-23 21:34               ` David Miller
2012-07-23 21:42                 ` Or Gerlitz
2012-07-23 21:44                   ` David Miller
2012-07-23 22:02                     ` Or Gerlitz
2012-07-23 22:21                       ` David Miller
2012-07-24 13:12               ` Kleber Sacilotto de Souza
2012-07-24 17:09                 ` Shlomo Pongartz
2012-07-24 17:35                   ` Kleber Sacilotto de Souza
2012-07-24 18:08                     ` Thadeu Lima de Souza Cascardo
2012-07-24 18:35                       ` Shlomo Pongratz
2012-07-24 18:39                       ` Shlomo Pongratz
2012-07-24 21:03 ` David Miller
2012-07-24 22:30   ` Or Gerlitz
2012-07-25 14:38     ` Shlomo Pongartz
     [not found]     ` <5010070B.5040405@mellanox.com>
2012-07-25 15:02       ` Kleber Sacilotto de Souza
2012-07-25 22:19       ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.