All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] pNFS: make DS availability problems visible in log
@ 2021-03-03 15:33 Roberto Bergantinos Corpas
  2021-03-04 20:20 ` Chuck Lever
  0 siblings, 1 reply; 6+ messages in thread
From: Roberto Bergantinos Corpas @ 2021-03-03 15:33 UTC (permalink / raw)
  To: trond.myklebust; +Cc: anna.schumaker, linux-nfs

Would be interesting to promote DS availability logging outside debug
so that we are more aware that I/O is diverted to MDS and some part
of the infraestructure failed.

Also added logging for failed DS connection attempts.

Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
---
 fs/nfs/filelayout/filelayout.c         | 4 ++--
 fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++---
 fs/nfs/pnfs_nfs.c                      | 6 +++++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7f5aa0403e16..fef2d31a501a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -181,7 +181,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	case -EIO:
 	case -ETIMEDOUT:
 	case -EPIPE:
-		dprintk("%s DS connection error %d\n", __func__,
+		pr_warn("%s DS connection error %d\n", __func__,
 			task->tk_status);
 		nfs4_mark_deviceid_unavailable(devid);
 		pnfs_error_mark_layout_for_return(inode, lseg);
@@ -190,7 +190,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		fallthrough;
 	default:
 reset:
-		dprintk("%s Retry through MDS. Error %d\n", __func__,
+		pr_warn("%s Retry through MDS. Error %d\n", __func__,
 			task->tk_status);
 		return -NFS4ERR_RESET_TO_MDS;
 	}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index a163533446fa..7150d94e80e6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1129,7 +1129,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 	case -EIO:
 	case -ETIMEDOUT:
 	case -EPIPE:
-		dprintk("%s DS connection error %d\n", __func__,
+		pr_warn("%s DS connection error %d\n", __func__,
 			task->tk_status);
 		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
 				&devid->deviceid);
@@ -1139,7 +1139,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 		if (ff_layout_avoid_mds_available_ds(lseg))
 			return -NFS4ERR_RESET_TO_PNFS;
 reset:
-		dprintk("%s Retry through MDS. Error %d\n", __func__,
+		pr_warn("%s Retry through MDS. Error %d\n", __func__,
 			task->tk_status);
 		return -NFS4ERR_RESET_TO_MDS;
 	}
@@ -1167,7 +1167,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
 		goto out_retry;
 	default:
-		dprintk("%s DS connection error %d\n", __func__,
+		pr_warn("%s DS connection error %d\n", __func__,
 			task->tk_status);
 		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
 				&devid->deviceid);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 679767ac258d..322661a48348 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -934,8 +934,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 						(struct sockaddr *)&da->da_addr,
 						da->da_addrlen, IPPROTO_TCP,
 						timeo, retrans, minor_version);
-			if (IS_ERR(clp))
+			if (IS_ERR(clp)) {
+				pr_warn("%s: DS: %s unable to connect with address %s, error: %ld\n",
+					__func__, ds->ds_remotestr, da->da_remotestr, PTR_ERR(clp));
 				continue;
+			}
 
 			status = nfs4_init_ds_session(clp,
 					mds_srv->nfs_client->cl_lease_time);
@@ -949,6 +952,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 	}
 
 	if (IS_ERR(clp)) {
+		pr_warn("%s: no DS available\n", __func__);
 		status = PTR_ERR(clp);
 		goto out;
 	}
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] pNFS: make DS availability problems visible in log
  2021-03-03 15:33 [PATCH] pNFS: make DS availability problems visible in log Roberto Bergantinos Corpas
@ 2021-03-04 20:20 ` Chuck Lever
  2021-03-04 20:35   ` Trond Myklebust
  0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2021-03-04 20:20 UTC (permalink / raw)
  To: Roberto Bergantinos Corpas
  Cc: Trond Myklebust, Anna Schumaker, Linux NFS Mailing List

Hello Roberto!

> On Mar 3, 2021, at 10:33 AM, Roberto Bergantinos Corpas <rbergant@redhat.com> wrote:
> 
> Would be interesting to promote DS availability logging outside debug
> so that we are more aware that I/O is diverted to MDS and some part
> of the infraestructure failed.
> 
> Also added logging for failed DS connection attempts.

Given that this enables remote system behavior to generate
kernel log traffic that can fill the local root partition,
I'd like to see either:

- the explicit use of rate limiting, or

- these dprintks replaced with tracepoints


> Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
> ---
> fs/nfs/filelayout/filelayout.c         | 4 ++--
> fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++---
> fs/nfs/pnfs_nfs.c                      | 6 +++++-
> 3 files changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
> index 7f5aa0403e16..fef2d31a501a 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -181,7 +181,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
> 	case -EIO:
> 	case -ETIMEDOUT:
> 	case -EPIPE:
> -		dprintk("%s DS connection error %d\n", __func__,
> +		pr_warn("%s DS connection error %d\n", __func__,
> 			task->tk_status);
> 		nfs4_mark_deviceid_unavailable(devid);
> 		pnfs_error_mark_layout_for_return(inode, lseg);
> @@ -190,7 +190,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
> 		fallthrough;
> 	default:
> reset:
> -		dprintk("%s Retry through MDS. Error %d\n", __func__,
> +		pr_warn("%s Retry through MDS. Error %d\n", __func__,
> 			task->tk_status);
> 		return -NFS4ERR_RESET_TO_MDS;
> 	}
> diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
> index a163533446fa..7150d94e80e6 100644
> --- a/fs/nfs/flexfilelayout/flexfilelayout.c
> +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
> @@ -1129,7 +1129,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
> 	case -EIO:
> 	case -ETIMEDOUT:
> 	case -EPIPE:
> -		dprintk("%s DS connection error %d\n", __func__,
> +		pr_warn("%s DS connection error %d\n", __func__,
> 			task->tk_status);
> 		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> 				&devid->deviceid);
> @@ -1139,7 +1139,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
> 		if (ff_layout_avoid_mds_available_ds(lseg))
> 			return -NFS4ERR_RESET_TO_PNFS;
> reset:
> -		dprintk("%s Retry through MDS. Error %d\n", __func__,
> +		pr_warn("%s Retry through MDS. Error %d\n", __func__,
> 			task->tk_status);
> 		return -NFS4ERR_RESET_TO_MDS;
> 	}
> @@ -1167,7 +1167,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
> 		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
> 		goto out_retry;
> 	default:
> -		dprintk("%s DS connection error %d\n", __func__,
> +		pr_warn("%s DS connection error %d\n", __func__,
> 			task->tk_status);
> 		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> 				&devid->deviceid);
> diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
> index 679767ac258d..322661a48348 100644
> --- a/fs/nfs/pnfs_nfs.c
> +++ b/fs/nfs/pnfs_nfs.c
> @@ -934,8 +934,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
> 						(struct sockaddr *)&da->da_addr,
> 						da->da_addrlen, IPPROTO_TCP,
> 						timeo, retrans, minor_version);
> -			if (IS_ERR(clp))
> +			if (IS_ERR(clp)) {
> +				pr_warn("%s: DS: %s unable to connect with address %s, error: %ld\n",
> +					__func__, ds->ds_remotestr, da->da_remotestr, PTR_ERR(clp));
> 				continue;
> +			}
> 
> 			status = nfs4_init_ds_session(clp,
> 					mds_srv->nfs_client->cl_lease_time);
> @@ -949,6 +952,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
> 	}
> 
> 	if (IS_ERR(clp)) {
> +		pr_warn("%s: no DS available\n", __func__);
> 		status = PTR_ERR(clp);
> 		goto out;
> 	}
> -- 
> 2.21.0
> 

--
Chuck Lever




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] pNFS: make DS availability problems visible in log
  2021-03-04 20:20 ` Chuck Lever
@ 2021-03-04 20:35   ` Trond Myklebust
  2021-03-05 12:29     ` Roberto Bergantinos Corpas
  0 siblings, 1 reply; 6+ messages in thread
From: Trond Myklebust @ 2021-03-04 20:35 UTC (permalink / raw)
  To: rbergant, chuck.lever; +Cc: linux-nfs, anna.schumaker

On Thu, 2021-03-04 at 20:20 +0000, Chuck Lever wrote:
> Hello Roberto!
> 
> > On Mar 3, 2021, at 10:33 AM, Roberto Bergantinos Corpas <
> > rbergant@redhat.com> wrote:
> > 
> > Would be interesting to promote DS availability logging outside
> > debug
> > so that we are more aware that I/O is diverted to MDS and some part
> > of the infraestructure failed.
> > 
> > Also added logging for failed DS connection attempts.
> 
> Given that this enables remote system behavior to generate
> kernel log traffic that can fill the local root partition,
> I'd like to see either:
> 
> - the explicit use of rate limiting, or
> 
> - these dprintks replaced with tracepoints

I cannot accept a pr_warn(), even a rate limited one, for a timeout
error or for a connection error in the data path. Those are just too
nasty to deal with in a syslog.

Tracepoints would be acceptable.

> 
> 
> > Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
> > ---
> > fs/nfs/filelayout/filelayout.c         | 4 ++--
> > fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++---
> > fs/nfs/pnfs_nfs.c                      | 6 +++++-
> > 3 files changed, 10 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/nfs/filelayout/filelayout.c
> > b/fs/nfs/filelayout/filelayout.c
> > index 7f5aa0403e16..fef2d31a501a 100644
> > --- a/fs/nfs/filelayout/filelayout.c
> > +++ b/fs/nfs/filelayout/filelayout.c
> > @@ -181,7 +181,7 @@ static int filelayout_async_handle_error(struct
> > rpc_task *task,
> >         case -EIO:
> >         case -ETIMEDOUT:
> >         case -EPIPE:
> > -               dprintk("%s DS connection error %d\n", __func__,
> > +               pr_warn("%s DS connection error %d\n", __func__,
> >                         task->tk_status);
> >                 nfs4_mark_deviceid_unavailable(devid);
> >                 pnfs_error_mark_layout_for_return(inode, lseg);
> > @@ -190,7 +190,7 @@ static int filelayout_async_handle_error(struct
> > rpc_task *task,
> >                 fallthrough;
> >         default:
> > reset:
> > -               dprintk("%s Retry through MDS. Error %d\n",
> > __func__,
> > +               pr_warn("%s Retry through MDS. Error %d\n",
> > __func__,
> >                         task->tk_status);
> >                 return -NFS4ERR_RESET_TO_MDS;
> >         }
> > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c
> > b/fs/nfs/flexfilelayout/flexfilelayout.c
> > index a163533446fa..7150d94e80e6 100644
> > --- a/fs/nfs/flexfilelayout/flexfilelayout.c
> > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
> > @@ -1129,7 +1129,7 @@ static int
> > ff_layout_async_handle_error_v4(struct rpc_task *task,
> >         case -EIO:
> >         case -ETIMEDOUT:
> >         case -EPIPE:
> > -               dprintk("%s DS connection error %d\n", __func__,
> > +               pr_warn("%s DS connection error %d\n", __func__,
> >                         task->tk_status);
> >                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> >                                 &devid->deviceid);
> > @@ -1139,7 +1139,7 @@ static int
> > ff_layout_async_handle_error_v4(struct rpc_task *task,
> >                 if (ff_layout_avoid_mds_available_ds(lseg))
> >                         return -NFS4ERR_RESET_TO_PNFS;
> > reset:
> > -               dprintk("%s Retry through MDS. Error %d\n",
> > __func__,
> > +               pr_warn("%s Retry through MDS. Error %d\n",
> > __func__,
> >                         task->tk_status);
> >                 return -NFS4ERR_RESET_TO_MDS;
> >         }
> > @@ -1167,7 +1167,7 @@ static int
> > ff_layout_async_handle_error_v3(struct rpc_task *task,
> >                 nfs_inc_stats(lseg->pls_layout->plh_inode,
> > NFSIOS_DELAY);
> >                 goto out_retry;
> >         default:
> > -               dprintk("%s DS connection error %d\n", __func__,
> > +               pr_warn("%s DS connection error %d\n", __func__,
> >                         task->tk_status);
> >                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> >                                 &devid->deviceid);
> > diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
> > index 679767ac258d..322661a48348 100644
> > --- a/fs/nfs/pnfs_nfs.c
> > +++ b/fs/nfs/pnfs_nfs.c
> > @@ -934,8 +934,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct
> > nfs_server *mds_srv,
> >                                                 (struct sockaddr
> > *)&da->da_addr,
> >                                                 da->da_addrlen,
> > IPPROTO_TCP,
> >                                                 timeo, retrans,
> > minor_version);
> > -                       if (IS_ERR(clp))
> > +                       if (IS_ERR(clp)) {
> > +                               pr_warn("%s: DS: %s unable to
> > connect with address %s, error: %ld\n",
> > +                                       __func__, ds->ds_remotestr,
> > da->da_remotestr, PTR_ERR(clp));
> >                                 continue;
> > +                       }
> > 
> >                         status = nfs4_init_ds_session(clp,
> >                                         mds_srv->nfs_client-
> > >cl_lease_time);
> > @@ -949,6 +952,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct
> > nfs_server *mds_srv,
> >         }
> > 
> >         if (IS_ERR(clp)) {
> > +               pr_warn("%s: no DS available\n", __func__);
> >                 status = PTR_ERR(clp);
> >                 goto out;
> >         }
> > -- 
> > 2.21.0
> > 
> 
> --
> Chuck Lever
> 
> 
> 

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] pNFS: make DS availability problems visible in log
  2021-03-04 20:35   ` Trond Myklebust
@ 2021-03-05 12:29     ` Roberto Bergantinos Corpas
  2021-03-05 13:42       ` Benjamin Coddington
  0 siblings, 1 reply; 6+ messages in thread
From: Roberto Bergantinos Corpas @ 2021-03-05 12:29 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: chuck.lever, linux-nfs, anna.schumaker

Trond, Chuck

thanks for feedback. I understand the point

Sure we can turn those printks into tracepoints, done. However
original point was flag somehow DS issues
outside debug, i.e. user intervention. What about just a pr_warn_once
just for the case when at _nfs4_pnfs_v4_ds_connect
we could not reach any DS ?

On Thu, Mar 4, 2021 at 9:35 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Thu, 2021-03-04 at 20:20 +0000, Chuck Lever wrote:
> > Hello Roberto!
> >
> > > On Mar 3, 2021, at 10:33 AM, Roberto Bergantinos Corpas <
> > > rbergant@redhat.com> wrote:
> > >
> > > Would be interesting to promote DS availability logging outside
> > > debug
> > > so that we are more aware that I/O is diverted to MDS and some part
> > > of the infraestructure failed.
> > >
> > > Also added logging for failed DS connection attempts.
> >
> > Given that this enables remote system behavior to generate
> > kernel log traffic that can fill the local root partition,
> > I'd like to see either:
> >
> > - the explicit use of rate limiting, or
> >
> > - these dprintks replaced with tracepoints
>
> I cannot accept a pr_warn(), even a rate limited one, for a timeout
> error or for a connection error in the data path. Those are just too
> nasty to deal with in a syslog.
>
> Tracepoints would be acceptable.
>
> >
> >
> > > Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
> > > ---
> > > fs/nfs/filelayout/filelayout.c         | 4 ++--
> > > fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++---
> > > fs/nfs/pnfs_nfs.c                      | 6 +++++-
> > > 3 files changed, 10 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/fs/nfs/filelayout/filelayout.c
> > > b/fs/nfs/filelayout/filelayout.c
> > > index 7f5aa0403e16..fef2d31a501a 100644
> > > --- a/fs/nfs/filelayout/filelayout.c
> > > +++ b/fs/nfs/filelayout/filelayout.c
> > > @@ -181,7 +181,7 @@ static int filelayout_async_handle_error(struct
> > > rpc_task *task,
> > >         case -EIO:
> > >         case -ETIMEDOUT:
> > >         case -EPIPE:
> > > -               dprintk("%s DS connection error %d\n", __func__,
> > > +               pr_warn("%s DS connection error %d\n", __func__,
> > >                         task->tk_status);
> > >                 nfs4_mark_deviceid_unavailable(devid);
> > >                 pnfs_error_mark_layout_for_return(inode, lseg);
> > > @@ -190,7 +190,7 @@ static int filelayout_async_handle_error(struct
> > > rpc_task *task,
> > >                 fallthrough;
> > >         default:
> > > reset:
> > > -               dprintk("%s Retry through MDS. Error %d\n",
> > > __func__,
> > > +               pr_warn("%s Retry through MDS. Error %d\n",
> > > __func__,
> > >                         task->tk_status);
> > >                 return -NFS4ERR_RESET_TO_MDS;
> > >         }
> > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c
> > > b/fs/nfs/flexfilelayout/flexfilelayout.c
> > > index a163533446fa..7150d94e80e6 100644
> > > --- a/fs/nfs/flexfilelayout/flexfilelayout.c
> > > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
> > > @@ -1129,7 +1129,7 @@ static int
> > > ff_layout_async_handle_error_v4(struct rpc_task *task,
> > >         case -EIO:
> > >         case -ETIMEDOUT:
> > >         case -EPIPE:
> > > -               dprintk("%s DS connection error %d\n", __func__,
> > > +               pr_warn("%s DS connection error %d\n", __func__,
> > >                         task->tk_status);
> > >                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> > >                                 &devid->deviceid);
> > > @@ -1139,7 +1139,7 @@ static int
> > > ff_layout_async_handle_error_v4(struct rpc_task *task,
> > >                 if (ff_layout_avoid_mds_available_ds(lseg))
> > >                         return -NFS4ERR_RESET_TO_PNFS;
> > > reset:
> > > -               dprintk("%s Retry through MDS. Error %d\n",
> > > __func__,
> > > +               pr_warn("%s Retry through MDS. Error %d\n",
> > > __func__,
> > >                         task->tk_status);
> > >                 return -NFS4ERR_RESET_TO_MDS;
> > >         }
> > > @@ -1167,7 +1167,7 @@ static int
> > > ff_layout_async_handle_error_v3(struct rpc_task *task,
> > >                 nfs_inc_stats(lseg->pls_layout->plh_inode,
> > > NFSIOS_DELAY);
> > >                 goto out_retry;
> > >         default:
> > > -               dprintk("%s DS connection error %d\n", __func__,
> > > +               pr_warn("%s DS connection error %d\n", __func__,
> > >                         task->tk_status);
> > >                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> > >                                 &devid->deviceid);
> > > diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
> > > index 679767ac258d..322661a48348 100644
> > > --- a/fs/nfs/pnfs_nfs.c
> > > +++ b/fs/nfs/pnfs_nfs.c
> > > @@ -934,8 +934,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct
> > > nfs_server *mds_srv,
> > >                                                 (struct sockaddr
> > > *)&da->da_addr,
> > >                                                 da->da_addrlen,
> > > IPPROTO_TCP,
> > >                                                 timeo, retrans,
> > > minor_version);
> > > -                       if (IS_ERR(clp))
> > > +                       if (IS_ERR(clp)) {
> > > +                               pr_warn("%s: DS: %s unable to
> > > connect with address %s, error: %ld\n",
> > > +                                       __func__, ds->ds_remotestr,
> > > da->da_remotestr, PTR_ERR(clp));
> > >                                 continue;
> > > +                       }
> > >
> > >                         status = nfs4_init_ds_session(clp,
> > >                                         mds_srv->nfs_client-
> > > >cl_lease_time);
> > > @@ -949,6 +952,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct
> > > nfs_server *mds_srv,
> > >         }
> > >
> > >         if (IS_ERR(clp)) {
> > > +               pr_warn("%s: no DS available\n", __func__);
> > >                 status = PTR_ERR(clp);
> > >                 goto out;
> > >         }
> > > --
> > > 2.21.0
> > >
> >
> > --
> > Chuck Lever
> >
> >
> >
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] pNFS: make DS availability problems visible in log
  2021-03-05 12:29     ` Roberto Bergantinos Corpas
@ 2021-03-05 13:42       ` Benjamin Coddington
  2021-03-05 15:24         ` Roberto Bergantinos Corpas
  0 siblings, 1 reply; 6+ messages in thread
From: Benjamin Coddington @ 2021-03-05 13:42 UTC (permalink / raw)
  To: Roberto Bergantinos Corpas
  Cc: Trond Myklebust, chuck.lever, linux-nfs, anna.schumaker

There's always going to be some sort of user intervention, even if the 
intervention is to read the system's logs.  What does the solution to 
notice DS problems ideally look like to the user?  Instead of "check the 
logs for DS problems", couldn't you replace that with "run this script 
to check for DS problems"?

Ben

On 5 Mar 2021, at 7:29, Roberto Bergantinos Corpas wrote:

> Trond, Chuck
>
> thanks for feedback. I understand the point
>
> Sure we can turn those printks into tracepoints, done. However
> original point was flag somehow DS issues
> outside debug, i.e. user intervention. What about just a pr_warn_once
> just for the case when at _nfs4_pnfs_v4_ds_connect
> we could not reach any DS ?
>
> On Thu, Mar 4, 2021 at 9:35 PM Trond Myklebust 
> <trondmy@hammerspace.com> wrote:
>>
>> On Thu, 2021-03-04 at 20:20 +0000, Chuck Lever wrote:
>>> Hello Roberto!
>>>
>>>> On Mar 3, 2021, at 10:33 AM, Roberto Bergantinos Corpas <
>>>> rbergant@redhat.com> wrote:
>>>>
>>>> Would be interesting to promote DS availability logging outside
>>>> debug
>>>> so that we are more aware that I/O is diverted to MDS and some part
>>>> of the infraestructure failed.
>>>>
>>>> Also added logging for failed DS connection attempts.
>>>
>>> Given that this enables remote system behavior to generate
>>> kernel log traffic that can fill the local root partition,
>>> I'd like to see either:
>>>
>>> - the explicit use of rate limiting, or
>>>
>>> - these dprintks replaced with tracepoints
>>
>> I cannot accept a pr_warn(), even a rate limited one, for a timeout
>> error or for a connection error in the data path. Those are just too
>> nasty to deal with in a syslog.
>>
>> Tracepoints would be acceptable.
>>
>>>
>>>
>>>> Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
>>>> ---
>>>> fs/nfs/filelayout/filelayout.c         | 4 ++--
>>>> fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++---
>>>> fs/nfs/pnfs_nfs.c                      | 6 +++++-
>>>> 3 files changed, 10 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/filelayout/filelayout.c
>>>> b/fs/nfs/filelayout/filelayout.c
>>>> index 7f5aa0403e16..fef2d31a501a 100644
>>>> --- a/fs/nfs/filelayout/filelayout.c
>>>> +++ b/fs/nfs/filelayout/filelayout.c
>>>> @@ -181,7 +181,7 @@ static int filelayout_async_handle_error(struct
>>>> rpc_task *task,
>>>>         case -EIO:
>>>>         case -ETIMEDOUT:
>>>>         case -EPIPE:
>>>> -               dprintk("%s DS connection error %d\n", __func__,
>>>> +               pr_warn("%s DS connection error %d\n", __func__,
>>>>                         task->tk_status);
>>>>                 nfs4_mark_deviceid_unavailable(devid);
>>>>                 pnfs_error_mark_layout_for_return(inode, lseg);
>>>> @@ -190,7 +190,7 @@ static int filelayout_async_handle_error(struct
>>>> rpc_task *task,
>>>>                 fallthrough;
>>>>         default:
>>>> reset:
>>>> -               dprintk("%s Retry through MDS. Error %d\n",
>>>> __func__,
>>>> +               pr_warn("%s Retry through MDS. Error %d\n",
>>>> __func__,
>>>>                         task->tk_status);
>>>>                 return -NFS4ERR_RESET_TO_MDS;
>>>>         }
>>>> diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c
>>>> b/fs/nfs/flexfilelayout/flexfilelayout.c
>>>> index a163533446fa..7150d94e80e6 100644
>>>> --- a/fs/nfs/flexfilelayout/flexfilelayout.c
>>>> +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
>>>> @@ -1129,7 +1129,7 @@ static int
>>>> ff_layout_async_handle_error_v4(struct rpc_task *task,
>>>>         case -EIO:
>>>>         case -ETIMEDOUT:
>>>>         case -EPIPE:
>>>> -               dprintk("%s DS connection error %d\n", __func__,
>>>> +               pr_warn("%s DS connection error %d\n", __func__,
>>>>                         task->tk_status);
>>>>                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
>>>>                                 &devid->deviceid);
>>>> @@ -1139,7 +1139,7 @@ static int
>>>> ff_layout_async_handle_error_v4(struct rpc_task *task,
>>>>                 if (ff_layout_avoid_mds_available_ds(lseg))
>>>>                         return -NFS4ERR_RESET_TO_PNFS;
>>>> reset:
>>>> -               dprintk("%s Retry through MDS. Error %d\n",
>>>> __func__,
>>>> +               pr_warn("%s Retry through MDS. Error %d\n",
>>>> __func__,
>>>>                         task->tk_status);
>>>>                 return -NFS4ERR_RESET_TO_MDS;
>>>>         }
>>>> @@ -1167,7 +1167,7 @@ static int
>>>> ff_layout_async_handle_error_v3(struct rpc_task *task,
>>>>                 nfs_inc_stats(lseg->pls_layout->plh_inode,
>>>> NFSIOS_DELAY);
>>>>                 goto out_retry;
>>>>         default:
>>>> -               dprintk("%s DS connection error %d\n", __func__,
>>>> +               pr_warn("%s DS connection error %d\n", __func__,
>>>>                         task->tk_status);
>>>>                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
>>>>                                 &devid->deviceid);
>>>> diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
>>>> index 679767ac258d..322661a48348 100644
>>>> --- a/fs/nfs/pnfs_nfs.c
>>>> +++ b/fs/nfs/pnfs_nfs.c
>>>> @@ -934,8 +934,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct
>>>> nfs_server *mds_srv,
>>>>                                                 (struct sockaddr
>>>> *)&da->da_addr,
>>>>                                                 da->da_addrlen,
>>>> IPPROTO_TCP,
>>>>                                                 timeo, retrans,
>>>> minor_version);
>>>> -                       if (IS_ERR(clp))
>>>> +                       if (IS_ERR(clp)) {
>>>> +                               pr_warn("%s: DS: %s unable to
>>>> connect with address %s, error: %ld\n",
>>>> +                                       __func__, ds->ds_remotestr,
>>>> da->da_remotestr, PTR_ERR(clp));
>>>>                                 continue;
>>>> +                       }
>>>>
>>>>                         status = nfs4_init_ds_session(clp,
>>>>                                         mds_srv->nfs_client-
>>>>> cl_lease_time);
>>>> @@ -949,6 +952,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct
>>>> nfs_server *mds_srv,
>>>>         }
>>>>
>>>>         if (IS_ERR(clp)) {
>>>> +               pr_warn("%s: no DS available\n", __func__);
>>>>                 status = PTR_ERR(clp);
>>>>                 goto out;
>>>>         }
>>>> --
>>>> 2.21.0
>>>>
>>>
>>> --
>>> Chuck Lever
>>>
>>>
>>>
>>
>> --
>> Trond Myklebust
>> Linux NFS client maintainer, Hammerspace
>> trond.myklebust@hammerspace.com
>>
>>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] pNFS: make DS availability problems visible in log
  2021-03-05 13:42       ` Benjamin Coddington
@ 2021-03-05 15:24         ` Roberto Bergantinos Corpas
  0 siblings, 0 replies; 6+ messages in thread
From: Roberto Bergantinos Corpas @ 2021-03-05 15:24 UTC (permalink / raw)
  To: Benjamin Coddington
  Cc: Trond Myklebust, Chuck Lever, Linux NFS Mailing List, anna.schumaker

Hi Ben

  Yes you're right always some intervention is required but I guess
something on logs is a bit more exposed to log scan/monitoring tools
from the start,
and in general quicker to notice. I understand patch is way too chatty though.

  But  in the event of i.e. a mount, and no DS is available and we
divert I/O to MDS, i don't think its a bad idea notify
that scenario on logs, at least once, can give user a hint some part
of the pNFS infrastructure went away. That was my original point, but
since life
goes on through MDS maybe is not worth a more alarming log message, i
can go just for tracepoints.

rgds
roberto

On Fri, Mar 5, 2021 at 2:42 PM Benjamin Coddington <bcodding@redhat.com> wrote:
>
> There's always going to be some sort of user intervention, even if the
> intervention is to read the system's logs.  What does the solution to
> notice DS problems ideally look like to the user?  Instead of "check the
> logs for DS problems", couldn't you replace that with "run this script
> to check for DS problems"?
>
> Ben
>
> On 5 Mar 2021, at 7:29, Roberto Bergantinos Corpas wrote:
>
> > Trond, Chuck
> >
> > thanks for feedback. I understand the point
> >
> > Sure we can turn those printks into tracepoints, done. However
> > original point was flag somehow DS issues
> > outside debug, i.e. user intervention. What about just a pr_warn_once
> > just for the case when at _nfs4_pnfs_v4_ds_connect
> > we could not reach any DS ?
> >
> > On Thu, Mar 4, 2021 at 9:35 PM Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> >>
> >> On Thu, 2021-03-04 at 20:20 +0000, Chuck Lever wrote:
> >>> Hello Roberto!
> >>>
> >>>> On Mar 3, 2021, at 10:33 AM, Roberto Bergantinos Corpas <
> >>>> rbergant@redhat.com> wrote:
> >>>>
> >>>> Would be interesting to promote DS availability logging outside
> >>>> debug
> >>>> so that we are more aware that I/O is diverted to MDS and some part
> >>>> of the infraestructure failed.
> >>>>
> >>>> Also added logging for failed DS connection attempts.
> >>>
> >>> Given that this enables remote system behavior to generate
> >>> kernel log traffic that can fill the local root partition,
> >>> I'd like to see either:
> >>>
> >>> - the explicit use of rate limiting, or
> >>>
> >>> - these dprintks replaced with tracepoints
> >>
> >> I cannot accept a pr_warn(), even a rate limited one, for a timeout
> >> error or for a connection error in the data path. Those are just too
> >> nasty to deal with in a syslog.
> >>
> >> Tracepoints would be acceptable.
> >>
> >>>
> >>>
> >>>> Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
> >>>> ---
> >>>> fs/nfs/filelayout/filelayout.c         | 4 ++--
> >>>> fs/nfs/flexfilelayout/flexfilelayout.c | 6 +++---
> >>>> fs/nfs/pnfs_nfs.c                      | 6 +++++-
> >>>> 3 files changed, 10 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/fs/nfs/filelayout/filelayout.c
> >>>> b/fs/nfs/filelayout/filelayout.c
> >>>> index 7f5aa0403e16..fef2d31a501a 100644
> >>>> --- a/fs/nfs/filelayout/filelayout.c
> >>>> +++ b/fs/nfs/filelayout/filelayout.c
> >>>> @@ -181,7 +181,7 @@ static int filelayout_async_handle_error(struct
> >>>> rpc_task *task,
> >>>>         case -EIO:
> >>>>         case -ETIMEDOUT:
> >>>>         case -EPIPE:
> >>>> -               dprintk("%s DS connection error %d\n", __func__,
> >>>> +               pr_warn("%s DS connection error %d\n", __func__,
> >>>>                         task->tk_status);
> >>>>                 nfs4_mark_deviceid_unavailable(devid);
> >>>>                 pnfs_error_mark_layout_for_return(inode, lseg);
> >>>> @@ -190,7 +190,7 @@ static int filelayout_async_handle_error(struct
> >>>> rpc_task *task,
> >>>>                 fallthrough;
> >>>>         default:
> >>>> reset:
> >>>> -               dprintk("%s Retry through MDS. Error %d\n",
> >>>> __func__,
> >>>> +               pr_warn("%s Retry through MDS. Error %d\n",
> >>>> __func__,
> >>>>                         task->tk_status);
> >>>>                 return -NFS4ERR_RESET_TO_MDS;
> >>>>         }
> >>>> diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c
> >>>> b/fs/nfs/flexfilelayout/flexfilelayout.c
> >>>> index a163533446fa..7150d94e80e6 100644
> >>>> --- a/fs/nfs/flexfilelayout/flexfilelayout.c
> >>>> +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
> >>>> @@ -1129,7 +1129,7 @@ static int
> >>>> ff_layout_async_handle_error_v4(struct rpc_task *task,
> >>>>         case -EIO:
> >>>>         case -ETIMEDOUT:
> >>>>         case -EPIPE:
> >>>> -               dprintk("%s DS connection error %d\n", __func__,
> >>>> +               pr_warn("%s DS connection error %d\n", __func__,
> >>>>                         task->tk_status);
> >>>>                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> >>>>                                 &devid->deviceid);
> >>>> @@ -1139,7 +1139,7 @@ static int
> >>>> ff_layout_async_handle_error_v4(struct rpc_task *task,
> >>>>                 if (ff_layout_avoid_mds_available_ds(lseg))
> >>>>                         return -NFS4ERR_RESET_TO_PNFS;
> >>>> reset:
> >>>> -               dprintk("%s Retry through MDS. Error %d\n",
> >>>> __func__,
> >>>> +               pr_warn("%s Retry through MDS. Error %d\n",
> >>>> __func__,
> >>>>                         task->tk_status);
> >>>>                 return -NFS4ERR_RESET_TO_MDS;
> >>>>         }
> >>>> @@ -1167,7 +1167,7 @@ static int
> >>>> ff_layout_async_handle_error_v3(struct rpc_task *task,
> >>>>                 nfs_inc_stats(lseg->pls_layout->plh_inode,
> >>>> NFSIOS_DELAY);
> >>>>                 goto out_retry;
> >>>>         default:
> >>>> -               dprintk("%s DS connection error %d\n", __func__,
> >>>> +               pr_warn("%s DS connection error %d\n", __func__,
> >>>>                         task->tk_status);
> >>>>                 nfs4_delete_deviceid(devid->ld, devid->nfs_client,
> >>>>                                 &devid->deviceid);
> >>>> diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
> >>>> index 679767ac258d..322661a48348 100644
> >>>> --- a/fs/nfs/pnfs_nfs.c
> >>>> +++ b/fs/nfs/pnfs_nfs.c
> >>>> @@ -934,8 +934,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct
> >>>> nfs_server *mds_srv,
> >>>>                                                 (struct sockaddr
> >>>> *)&da->da_addr,
> >>>>                                                 da->da_addrlen,
> >>>> IPPROTO_TCP,
> >>>>                                                 timeo, retrans,
> >>>> minor_version);
> >>>> -                       if (IS_ERR(clp))
> >>>> +                       if (IS_ERR(clp)) {
> >>>> +                               pr_warn("%s: DS: %s unable to
> >>>> connect with address %s, error: %ld\n",
> >>>> +                                       __func__, ds->ds_remotestr,
> >>>> da->da_remotestr, PTR_ERR(clp));
> >>>>                                 continue;
> >>>> +                       }
> >>>>
> >>>>                         status = nfs4_init_ds_session(clp,
> >>>>                                         mds_srv->nfs_client-
> >>>>> cl_lease_time);
> >>>> @@ -949,6 +952,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct
> >>>> nfs_server *mds_srv,
> >>>>         }
> >>>>
> >>>>         if (IS_ERR(clp)) {
> >>>> +               pr_warn("%s: no DS available\n", __func__);
> >>>>                 status = PTR_ERR(clp);
> >>>>                 goto out;
> >>>>         }
> >>>> --
> >>>> 2.21.0
> >>>>
> >>>
> >>> --
> >>> Chuck Lever
> >>>
> >>>
> >>>
> >>
> >> --
> >> Trond Myklebust
> >> Linux NFS client maintainer, Hammerspace
> >> trond.myklebust@hammerspace.com
> >>
> >>
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-03-05 15:25 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-03 15:33 [PATCH] pNFS: make DS availability problems visible in log Roberto Bergantinos Corpas
2021-03-04 20:20 ` Chuck Lever
2021-03-04 20:35   ` Trond Myklebust
2021-03-05 12:29     ` Roberto Bergantinos Corpas
2021-03-05 13:42       ` Benjamin Coddington
2021-03-05 15:24         ` Roberto Bergantinos Corpas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.