All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfsdump: fix race condition between lseek() and read()/write()
@ 2016-04-21 13:06 Eryu Guan
  2017-03-27 20:20 ` Darrick J. Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Eryu Guan @ 2016-04-21 13:06 UTC (permalink / raw)
  To: xfs; +Cc: Eryu Guan

There's a race condition in the [get|put]_invtrecord() routines, because
a lseek() followed by a read()/write() is not atmoic, the file offset
might be changed before read()/write().

xfs/302 catches this failure as:
xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.

And it can be reproduced by running multi-stream dump in a tight loop
  mount /dev/<dev> /mnt/xfs
  mkdir /mnt/xfs/dumpdir
  # populate dumpdir here
  while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
  	:
  done

Fix it by replacing the "lseek(); read()/write()" sequence by
pread()/pwrite(), which make the seek and I/O an atomic operation.

Also convert and remove all *_SEEKCUR routines to "SEEK_SET" variants,
because they depend on the maintenance of current file offset, but
pread()/pwrite() don't change file offset.

Signed-off-by: Eryu Guan <eguan@redhat.com>
---

Tested via the reproducer and xfstests "-g dump" run, with both v4 and v5 XFS.

I'm not sure if this is the right fix, perhaps what should be fixed is the
"INVLOCK()", which is now implemented by flock(2), and doesn't work in
multi-thread env, if what it's meant to protect is concurrent accesses from
different threads, not processes.

If so, it seems to me that making INVLOCK() a pthread rw lock could fix the
race condition as well. But the INVLOCK calls are almost everywhere, I didn't
find a simple way to try it.

 common/inventory.c   |  4 ++--
 inventory/inv_api.c  |  5 ++---
 inventory/inv_core.c | 24 ++++--------------------
 inventory/inv_idx.c  |  4 ++--
 inventory/inv_priv.h |  9 ---------
 5 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/common/inventory.c b/common/inventory.c
index d1b810c..0e9c256 100644
--- a/common/inventory.c
+++ b/common/inventory.c
@@ -471,8 +471,8 @@ inv_stream_close(
 	}
 			
 	if (dowrite) {
-		rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, sizeof( invt_stream_t ),
-					       (off64_t) -(sizeof( invt_stream_t )) );
+		rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
+				      tok->md_stream_off);
 	}
  end:
 	INVLOCK( fd, LOCK_UN );
diff --git a/inventory/inv_api.c b/inventory/inv_api.c
index acca40b..46fdde8 100644
--- a/inventory/inv_api.c
+++ b/inventory/inv_api.c
@@ -409,9 +409,8 @@ inv_stream_close(
 		}
 			
 		if (dowrite) {
-			rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, 
-			             sizeof( invt_stream_t ),
-				     -(off64_t)(sizeof( invt_stream_t )) );
+			rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
+					      tok->md_stream_off);
 		}
 	}
 
diff --git a/inventory/inv_core.c b/inventory/inv_core.c
index a17c2c9..42d0ac4 100644
--- a/inventory/inv_core.c
+++ b/inventory/inv_core.c
@@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
 	if ( dolock ) 
 		INVLOCK( fd, LOCK_SH );
 
-	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
-		INV_PERROR( _("Error in reading inventory record "
-			      "(lseek failed): ") );
-		if ( dolock ) 
-			INVLOCK( fd, LOCK_UN );
-		return -1;
-	}
-	
-	nread = read( fd, buf, bufsz );
-
+	nread = pread(fd, buf, bufsz, (off_t)off);
 	if (  nread != (int) bufsz ) {
 		INV_PERROR( _("Error in reading inventory record :") );
-		if ( dolock ) 
+		if ( dolock )
 			INVLOCK( fd, LOCK_UN );
 		return -1;
 	}
@@ -162,15 +153,8 @@ put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
 	if ( dolock )
 		INVLOCK( fd, LOCK_EX );
 	
-	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
-		INV_PERROR( _("Error in writing inventory record "
-			      "(lseek failed): ") );
-		if ( dolock ) 
-			INVLOCK( fd, LOCK_UN );
-		return -1;
-	}
-	
-	if (( nwritten = write( fd, buf, bufsz ) ) != (int) bufsz ) {
+	nwritten = pwrite(fd, buf, bufsz, (off_t)off);
+	if (nwritten != (int) bufsz ) {
 		INV_PERROR( _("Error in writing inventory record :") );
 		if ( dolock )
 			INVLOCK( fd, LOCK_UN );
diff --git a/inventory/inv_idx.c b/inventory/inv_idx.c
index 95529e8..cd9b9cb 100644
--- a/inventory/inv_idx.c
+++ b/inventory/inv_idx.c
@@ -341,8 +341,8 @@ idx_put_sesstime( inv_sestoken_t tok, bool_t whichtime)
 			      ent.ie_timeperiod.tp_start,
 			      ent.ie_timeperiod.tp_end );
 #endif
-	rval = PUT_REC_NOLOCK_SEEKCUR( fd, &ent, sizeof( invt_entry_t ),
-				      -(off64_t)(sizeof( invt_entry_t )));
+	rval = PUT_REC_NOLOCK(fd, &ent, sizeof(invt_entry_t),
+			      tok->sd_invtok->d_invindex_off);
 	
 #ifdef INVT_DEBUG
 	{
diff --git a/inventory/inv_priv.h b/inventory/inv_priv.h
index 1690271..cd1b527 100644
--- a/inventory/inv_priv.h
+++ b/inventory/inv_priv.h
@@ -303,9 +303,6 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
 #define GET_REC_NOLOCK( fd, buf, sz, off )  \
                  get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
 
-#define GET_REC_SEEKCUR( fd, buf, sz, off )  \
-                 get_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
-
 #define GET_ALLHDRS_N_CNTS( fd, h, c, hsz, csz ) \
                  get_headerinfo( fd, h, c, hsz, csz, INVT_DOLOCK )
 
@@ -318,12 +315,6 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
 #define PUT_REC_NOLOCK( fd, buf, sz, off )  \
                  put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
 
-#define PUT_REC_SEEKCUR( fd, buf, sz, off )  \
-                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
-
-#define PUT_REC_NOLOCK_SEEKCUR( fd, buf, sz, off )  \
-                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DONTLOCK )
-
 
 #define GET_COUNTERS( fd, cnt ) get_counters( fd, (void **)(cnt), \
 					      sizeof(invt_counter_t) )
-- 
2.5.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2016-04-21 13:06 [PATCH] xfsdump: fix race condition between lseek() and read()/write() Eryu Guan
@ 2017-03-27 20:20 ` Darrick J. Wong
  2017-03-28  3:41   ` Eryu Guan
  2017-07-12 18:46 ` Eric Sandeen
  2017-07-12 20:56 ` Eric Sandeen
  2 siblings, 1 reply; 10+ messages in thread
From: Darrick J. Wong @ 2017-03-27 20:20 UTC (permalink / raw)
  To: Eryu Guan; +Cc: xfs, Eric Sandeen

[move to new list]

On Thu, Apr 21, 2016 at 09:06:56PM +0800, Eryu Guan wrote:
> There's a race condition in the [get|put]_invtrecord() routines, because
> a lseek() followed by a read()/write() is not atmoic, the file offset
> might be changed before read()/write().
> 
> xfs/302 catches this failure as:
> xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
> xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.
> 
> And it can be reproduced by running multi-stream dump in a tight loop
>   mount /dev/<dev> /mnt/xfs
>   mkdir /mnt/xfs/dumpdir
>   # populate dumpdir here
>   while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
>   	:
>   done
> 
> Fix it by replacing the "lseek(); read()/write()" sequence by
> pread()/pwrite(), which make the seek and I/O an atomic operation.
> 
> Also convert and remove all *_SEEKCUR routines to "SEEK_SET" variants,
> because they depend on the maintenance of current file offset, but
> pread()/pwrite() don't change file offset.
> 
> Signed-off-by: Eryu Guan <eguan@redhat.com>
> ---
> 
> Tested via the reproducer and xfstests "-g dump" run, with both v4 and v5 XFS.
> 
> I'm not sure if this is the right fix, perhaps what should be fixed is the
> "INVLOCK()", which is now implemented by flock(2), and doesn't work in
> multi-thread env, if what it's meant to protect is concurrent accesses from
> different threads, not processes.
> 
> If so, it seems to me that making INVLOCK() a pthread rw lock could fix the
> race condition as well. But the INVLOCK calls are almost everywhere, I didn't
> find a simple way to try it.

I wonder, did this ever make any progress?  Offhand it looks ok, but then
I'm no xfsdump expert.

(Yes, our QA is bugging me about xfs/302 failures too...)

--D

> 
>  common/inventory.c   |  4 ++--
>  inventory/inv_api.c  |  5 ++---
>  inventory/inv_core.c | 24 ++++--------------------
>  inventory/inv_idx.c  |  4 ++--
>  inventory/inv_priv.h |  9 ---------
>  5 files changed, 10 insertions(+), 36 deletions(-)
> 
> diff --git a/common/inventory.c b/common/inventory.c
> index d1b810c..0e9c256 100644
> --- a/common/inventory.c
> +++ b/common/inventory.c
> @@ -471,8 +471,8 @@ inv_stream_close(
>  	}
>  			
>  	if (dowrite) {
> -		rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, sizeof( invt_stream_t ),
> -					       (off64_t) -(sizeof( invt_stream_t )) );
> +		rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
> +				      tok->md_stream_off);
>  	}
>   end:
>  	INVLOCK( fd, LOCK_UN );
> diff --git a/inventory/inv_api.c b/inventory/inv_api.c
> index acca40b..46fdde8 100644
> --- a/inventory/inv_api.c
> +++ b/inventory/inv_api.c
> @@ -409,9 +409,8 @@ inv_stream_close(
>  		}
>  			
>  		if (dowrite) {
> -			rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, 
> -			             sizeof( invt_stream_t ),
> -				     -(off64_t)(sizeof( invt_stream_t )) );
> +			rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
> +					      tok->md_stream_off);
>  		}
>  	}
>  
> diff --git a/inventory/inv_core.c b/inventory/inv_core.c
> index a17c2c9..42d0ac4 100644
> --- a/inventory/inv_core.c
> +++ b/inventory/inv_core.c
> @@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  	if ( dolock ) 
>  		INVLOCK( fd, LOCK_SH );
>  
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> -		INV_PERROR( _("Error in reading inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	nread = read( fd, buf, bufsz );
> -
> +	nread = pread(fd, buf, bufsz, (off_t)off);
>  	if (  nread != (int) bufsz ) {
>  		INV_PERROR( _("Error in reading inventory record :") );
> -		if ( dolock ) 
> +		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
>  		return -1;
>  	}
> @@ -162,15 +153,8 @@ put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  	if ( dolock )
>  		INVLOCK( fd, LOCK_EX );
>  	
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> -		INV_PERROR( _("Error in writing inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	if (( nwritten = write( fd, buf, bufsz ) ) != (int) bufsz ) {
> +	nwritten = pwrite(fd, buf, bufsz, (off_t)off);
> +	if (nwritten != (int) bufsz ) {
>  		INV_PERROR( _("Error in writing inventory record :") );
>  		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
> diff --git a/inventory/inv_idx.c b/inventory/inv_idx.c
> index 95529e8..cd9b9cb 100644
> --- a/inventory/inv_idx.c
> +++ b/inventory/inv_idx.c
> @@ -341,8 +341,8 @@ idx_put_sesstime( inv_sestoken_t tok, bool_t whichtime)
>  			      ent.ie_timeperiod.tp_start,
>  			      ent.ie_timeperiod.tp_end );
>  #endif
> -	rval = PUT_REC_NOLOCK_SEEKCUR( fd, &ent, sizeof( invt_entry_t ),
> -				      -(off64_t)(sizeof( invt_entry_t )));
> +	rval = PUT_REC_NOLOCK(fd, &ent, sizeof(invt_entry_t),
> +			      tok->sd_invtok->d_invindex_off);
>  	
>  #ifdef INVT_DEBUG
>  	{
> diff --git a/inventory/inv_priv.h b/inventory/inv_priv.h
> index 1690271..cd1b527 100644
> --- a/inventory/inv_priv.h
> +++ b/inventory/inv_priv.h
> @@ -303,9 +303,6 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
>  #define GET_REC_NOLOCK( fd, buf, sz, off )  \
>                   get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
>  
> -#define GET_REC_SEEKCUR( fd, buf, sz, off )  \
> -                 get_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
> -
>  #define GET_ALLHDRS_N_CNTS( fd, h, c, hsz, csz ) \
>                   get_headerinfo( fd, h, c, hsz, csz, INVT_DOLOCK )
>  
> @@ -318,12 +315,6 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
>  #define PUT_REC_NOLOCK( fd, buf, sz, off )  \
>                   put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
>  
> -#define PUT_REC_SEEKCUR( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
> -
> -#define PUT_REC_NOLOCK_SEEKCUR( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DONTLOCK )
> -
>  
>  #define GET_COUNTERS( fd, cnt ) get_counters( fd, (void **)(cnt), \
>  					      sizeof(invt_counter_t) )
> -- 
> 2.5.5
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2017-03-27 20:20 ` Darrick J. Wong
@ 2017-03-28  3:41   ` Eryu Guan
  2017-07-12 18:26     ` Darrick J. Wong
  0 siblings, 1 reply; 10+ messages in thread
From: Eryu Guan @ 2017-03-28  3:41 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs, Eric Sandeen

On Mon, Mar 27, 2017 at 01:20:30PM -0700, Darrick J. Wong wrote:
> [move to new list]
> 
> On Thu, Apr 21, 2016 at 09:06:56PM +0800, Eryu Guan wrote:
> > There's a race condition in the [get|put]_invtrecord() routines, because
> > a lseek() followed by a read()/write() is not atmoic, the file offset
> > might be changed before read()/write().
> > 
> > xfs/302 catches this failure as:
> > xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
> > xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.
> > 
> > And it can be reproduced by running multi-stream dump in a tight loop
> >   mount /dev/<dev> /mnt/xfs
> >   mkdir /mnt/xfs/dumpdir
> >   # populate dumpdir here
> >   while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
> >   	:
> >   done
> > 
> > Fix it by replacing the "lseek(); read()/write()" sequence by
> > pread()/pwrite(), which make the seek and I/O an atomic operation.
> > 
> > Also convert and remove all *_SEEKCUR routines to "SEEK_SET" variants,
> > because they depend on the maintenance of current file offset, but
> > pread()/pwrite() don't change file offset.
> > 
> > Signed-off-by: Eryu Guan <eguan@redhat.com>
> > ---
> > 
> > Tested via the reproducer and xfstests "-g dump" run, with both v4 and v5 XFS.
> > 
> > I'm not sure if this is the right fix, perhaps what should be fixed is the
> > "INVLOCK()", which is now implemented by flock(2), and doesn't work in
> > multi-thread env, if what it's meant to protect is concurrent accesses from
> > different threads, not processes.
> > 
> > If so, it seems to me that making INVLOCK() a pthread rw lock could fix the
> > race condition as well. But the INVLOCK calls are almost everywhere, I didn't
> > find a simple way to try it.
> 
> I wonder, did this ever make any progress?  Offhand it looks ok, but then
> I'm no xfsdump expert.

No, you're the first one to comment on this patch :)

> 
> (Yes, our QA is bugging me about xfs/302 failures too...)

JFYI, xfs/059 and xfs/301 also fail due to this bug, just that xfs/059
failure rarely happens.

Thanks,
Eryu

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2017-03-28  3:41   ` Eryu Guan
@ 2017-07-12 18:26     ` Darrick J. Wong
  0 siblings, 0 replies; 10+ messages in thread
From: Darrick J. Wong @ 2017-07-12 18:26 UTC (permalink / raw)
  To: Eryu Guan; +Cc: xfs, Eric Sandeen

On Tue, Mar 28, 2017 at 11:41:30AM +0800, Eryu Guan wrote:
> On Mon, Mar 27, 2017 at 01:20:30PM -0700, Darrick J. Wong wrote:
> > [move to new list]
> > 
> > On Thu, Apr 21, 2016 at 09:06:56PM +0800, Eryu Guan wrote:
> > > There's a race condition in the [get|put]_invtrecord() routines, because
> > > a lseek() followed by a read()/write() is not atmoic, the file offset
> > > might be changed before read()/write().
> > > 
> > > xfs/302 catches this failure as:
> > > xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
> > > xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.
> > > 
> > > And it can be reproduced by running multi-stream dump in a tight loop
> > >   mount /dev/<dev> /mnt/xfs
> > >   mkdir /mnt/xfs/dumpdir
> > >   # populate dumpdir here
> > >   while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
> > >   	:
> > >   done
> > > 
> > > Fix it by replacing the "lseek(); read()/write()" sequence by
> > > pread()/pwrite(), which make the seek and I/O an atomic operation.
> > > 
> > > Also convert and remove all *_SEEKCUR routines to "SEEK_SET" variants,
> > > because they depend on the maintenance of current file offset, but
> > > pread()/pwrite() don't change file offset.
> > > 
> > > Signed-off-by: Eryu Guan <eguan@redhat.com>
> > > ---
> > > 
> > > Tested via the reproducer and xfstests "-g dump" run, with both v4 and v5 XFS.
> > > 
> > > I'm not sure if this is the right fix, perhaps what should be fixed is the
> > > "INVLOCK()", which is now implemented by flock(2), and doesn't work in
> > > multi-thread env, if what it's meant to protect is concurrent accesses from
> > > different threads, not processes.
> > > 
> > > If so, it seems to me that making INVLOCK() a pthread rw lock could fix the
> > > race condition as well. But the INVLOCK calls are almost everywhere, I didn't
> > > find a simple way to try it.
> > 
> > I wonder, did this ever make any progress?  Offhand it looks ok, but then
> > I'm no xfsdump expert.
> 
> No, you're the first one to comment on this patch :)
> 
> > 
> > (Yes, our QA is bugging me about xfs/302 failures too...)
> 
> JFYI, xfs/059 and xfs/301 also fail due to this bug, just that xfs/059
> failure rarely happens.

Looks fine to me,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> 
> Thanks,
> Eryu
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2016-04-21 13:06 [PATCH] xfsdump: fix race condition between lseek() and read()/write() Eryu Guan
  2017-03-27 20:20 ` Darrick J. Wong
@ 2017-07-12 18:46 ` Eric Sandeen
  2017-07-12 19:33   ` Eric Sandeen
  2017-07-12 20:56 ` Eric Sandeen
  2 siblings, 1 reply; 10+ messages in thread
From: Eric Sandeen @ 2017-07-12 18:46 UTC (permalink / raw)
  To: linux-xfs



On 04/21/2016 08:06 AM, Eryu Guan wrote:
> There's a race condition in the [get|put]_invtrecord() routines, because
> a lseek() followed by a read()/write() is not atmoic, the file offset
> might be changed before read()/write().
> 
> xfs/302 catches this failure as:
> xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
> xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.
> 
> And it can be reproduced by running multi-stream dump in a tight loop
>   mount /dev/<dev> /mnt/xfs
>   mkdir /mnt/xfs/dumpdir
>   # populate dumpdir here
>   while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
>   	:
>   done
> 
> Fix it by replacing the "lseek(); read()/write()" sequence by
> pread()/pwrite(), which make the seek and I/O an atomic operation.

This seems ok to me; it seems safer and more obvious than changing up
the locking to use pthread locks ... I guess my only handwavy concern is
whether the offsets PUT_REC_NOLOCK are correct?  See below

> 
> Also convert and remove all *_SEEKCUR routines to "SEEK_SET" variants,
> because they depend on the maintenance of current file offset, but
> pread()/pwrite() don't change file offset.
> 
> Signed-off-by: Eryu Guan <eguan@redhat.com>
> ---
> 
> Tested via the reproducer and xfstests "-g dump" run, with both v4 and v5 XFS.
> 
> I'm not sure if this is the right fix, perhaps what should be fixed is the
> "INVLOCK()", which is now implemented by flock(2), and doesn't work in
> multi-thread env, if what it's meant to protect is concurrent accesses from
> different threads, not processes.
> 
> If so, it seems to me that making INVLOCK() a pthread rw lock could fix the
> race condition as well. But the INVLOCK calls are almost everywhere, I didn't
> find a simple way to try it.
> 
>  common/inventory.c   |  4 ++--
>  inventory/inv_api.c  |  5 ++---
>  inventory/inv_core.c | 24 ++++--------------------
>  inventory/inv_idx.c  |  4 ++--
>  inventory/inv_priv.h |  9 ---------
>  5 files changed, 10 insertions(+), 36 deletions(-)
> 
> diff --git a/common/inventory.c b/common/inventory.c
> index d1b810c..0e9c256 100644
> --- a/common/inventory.c
> +++ b/common/inventory.c
> @@ -471,8 +471,8 @@ inv_stream_close(
>  	}
>  			
>  	if (dowrite) {
> -		rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, sizeof( invt_stream_t ),
> -					       (off64_t) -(sizeof( invt_stream_t )) );
> +		rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
> +				      tok->md_stream_off);
>  	}
>   end:
>  	INVLOCK( fd, LOCK_UN );
> diff --git a/inventory/inv_api.c b/inventory/inv_api.c
> index acca40b..46fdde8 100644
> --- a/inventory/inv_api.c
> +++ b/inventory/inv_api.c
> @@ -409,9 +409,8 @@ inv_stream_close(
>  		}
>  			
>  		if (dowrite) {
> -			rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, 
> -			             sizeof( invt_stream_t ),
> -				     -(off64_t)(sizeof( invt_stream_t )) );
> +			rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
> +					      tok->md_stream_off);
>  		}
>  	}
>  
> diff --git a/inventory/inv_core.c b/inventory/inv_core.c
> index a17c2c9..42d0ac4 100644
> --- a/inventory/inv_core.c
> +++ b/inventory/inv_core.c
> @@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  	if ( dolock ) 
>  		INVLOCK( fd, LOCK_SH );
>  
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> -		INV_PERROR( _("Error in reading inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	nread = read( fd, buf, bufsz );
> -
> +	nread = pread(fd, buf, bufsz, (off_t)off);
>  	if (  nread != (int) bufsz ) {
>  		INV_PERROR( _("Error in reading inventory record :") );
> -		if ( dolock ) 
> +		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
>  		return -1;
>  	}
> @@ -162,15 +153,8 @@ put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  	if ( dolock )
>  		INVLOCK( fd, LOCK_EX );
>  	
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> -		INV_PERROR( _("Error in writing inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	if (( nwritten = write( fd, buf, bufsz ) ) != (int) bufsz ) {
> +	nwritten = pwrite(fd, buf, bufsz, (off_t)off);
> +	if (nwritten != (int) bufsz ) {
>  		INV_PERROR( _("Error in writing inventory record :") );
>  		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
> diff --git a/inventory/inv_idx.c b/inventory/inv_idx.c
> index 95529e8..cd9b9cb 100644
> --- a/inventory/inv_idx.c
> +++ b/inventory/inv_idx.c
> @@ -341,8 +341,8 @@ idx_put_sesstime( inv_sestoken_t tok, bool_t whichtime)

        rval = GET_REC_NOLOCK( fd, &ent, sizeof( invt_entry_t ),
                                tok->sd_invtok->d_invindex_off);

This sets the offset to d_invindex_off


>  			      ent.ie_timeperiod.tp_start,
>  			      ent.ie_timeperiod.tp_end );
>  #endif
> -	rval = PUT_REC_NOLOCK_SEEKCUR( fd, &ent, sizeof( invt_entry_t ),
> -				      -(off64_t)(sizeof( invt_entry_t )));

This backed up one entry from the current offset...

> +	rval = PUT_REC_NOLOCK(fd, &ent, sizeof(invt_entry_t),
> +			      tok->sd_invtok->d_invindex_off);

but this keeps it at the current offset, no?  Isn't this a different behavior?

>  	
>  #ifdef INVT_DEBUG
>  	{
> diff --git a/inventory/inv_priv.h b/inventory/inv_priv.h
> index 1690271..cd1b527 100644
> --- a/inventory/inv_priv.h
> +++ b/inventory/inv_priv.h
> @@ -303,9 +303,6 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
>  #define GET_REC_NOLOCK( fd, buf, sz, off )  \
>                   get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
>  
> -#define GET_REC_SEEKCUR( fd, buf, sz, off )  \
> -                 get_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
> -
>  #define GET_ALLHDRS_N_CNTS( fd, h, c, hsz, csz ) \
>                   get_headerinfo( fd, h, c, hsz, csz, INVT_DOLOCK )
>  
> @@ -318,12 +315,6 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
>  #define PUT_REC_NOLOCK( fd, buf, sz, off )  \
>                   put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
>  
> -#define PUT_REC_SEEKCUR( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
> -
> -#define PUT_REC_NOLOCK_SEEKCUR( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DONTLOCK )
> -
>  
>  #define GET_COUNTERS( fd, cnt ) get_counters( fd, (void **)(cnt), \
>  					      sizeof(invt_counter_t) )
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2017-07-12 18:46 ` Eric Sandeen
@ 2017-07-12 19:33   ` Eric Sandeen
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Sandeen @ 2017-07-12 19:33 UTC (permalink / raw)
  To: linux-xfs


On 07/12/2017 01:46 PM, Eric Sandeen wrote:
> 
> 

Sorry, I was wrong ...


>> diff --git a/inventory/inv_idx.c b/inventory/inv_idx.c
>> index 95529e8..cd9b9cb 100644
>> --- a/inventory/inv_idx.c
>> +++ b/inventory/inv_idx.c
>> @@ -341,8 +341,8 @@ idx_put_sesstime( inv_sestoken_t tok, bool_t whichtime)
> 
>         rval = GET_REC_NOLOCK( fd, &ent, sizeof( invt_entry_t ),
>                                 tok->sd_invtok->d_invindex_off);
> 
> This sets the offset to d_invindex_off

... and reads sizeof(invt_entry_t), advancing by that much.

>>  			      ent.ie_timeperiod.tp_start,
>>  			      ent.ie_timeperiod.tp_end );
>>  #endif
>> -	rval = PUT_REC_NOLOCK_SEEKCUR( fd, &ent, sizeof( invt_entry_t ),
>> -				      -(off64_t)(sizeof( invt_entry_t )));
> 
> This backed up one entry from the current offset...
which goes back to...

> 
>> +	rval = PUT_REC_NOLOCK(fd, &ent, sizeof(invt_entry_t),
>> +			      tok->sd_invtok->d_invindex_off);

...d_invindex_off.  Sorry for the false alarm.

-eric

> 
> but this keeps it at the current offset, no?  Isn't this a different behavior?
> 
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2016-04-21 13:06 [PATCH] xfsdump: fix race condition between lseek() and read()/write() Eryu Guan
  2017-03-27 20:20 ` Darrick J. Wong
  2017-07-12 18:46 ` Eric Sandeen
@ 2017-07-12 20:56 ` Eric Sandeen
  2017-07-13  7:28   ` Eryu Guan
  2017-07-13  8:41   ` [PATCH v2] " Eryu Guan
  2 siblings, 2 replies; 10+ messages in thread
From: Eric Sandeen @ 2017-07-12 20:56 UTC (permalink / raw)
  To: linux-xfs, Eryu Guan

Hm, but isn't this a problem?

On 04/21/2016 08:06 AM, Eryu Guan wrote:
> diff --git a/inventory/inv_core.c b/inventory/inv_core.c
> index a17c2c9..42d0ac4 100644
> --- a/inventory/inv_core.c
> +++ b/inventory/inv_core.c
> @@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  	if ( dolock ) 
>  		INVLOCK( fd, LOCK_SH );
>  
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {

whence can be _SET or _CUR depending on the caller, but with this
patch whence is no longer used, and -

> -		INV_PERROR( _("Error in reading inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	nread = read( fd, buf, bufsz );
> -
> +	nread = pread(fd, buf, bufsz, (off_t)off);

This is only equivalent to SET, no?  With the other changes, perhaps we're
no longer called via GET_REC_SEEKCUR - but in that case, the whence arg should
be removed?  Except for that pesky test code which explicitly passes SEEK_CUR ...


-Eric

>  	if (  nread != (int) bufsz ) {
>  		INV_PERROR( _("Error in reading inventory record :") );
> -		if ( dolock ) 
> +		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
>  		return -1;
>  	}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfsdump: fix race condition between lseek() and read()/write()
  2017-07-12 20:56 ` Eric Sandeen
@ 2017-07-13  7:28   ` Eryu Guan
  2017-07-13  8:41   ` [PATCH v2] " Eryu Guan
  1 sibling, 0 replies; 10+ messages in thread
From: Eryu Guan @ 2017-07-13  7:28 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: linux-xfs

On Wed, Jul 12, 2017 at 03:56:00PM -0500, Eric Sandeen wrote:
> Hm, but isn't this a problem?

/me rewinds my mind by 15 months and confirms it is a problem :)

> 
> On 04/21/2016 08:06 AM, Eryu Guan wrote:
> > diff --git a/inventory/inv_core.c b/inventory/inv_core.c
> > index a17c2c9..42d0ac4 100644
> > --- a/inventory/inv_core.c
> > +++ b/inventory/inv_core.c
> > @@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
> >  	if ( dolock ) 
> >  		INVLOCK( fd, LOCK_SH );
> >  
> > -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> 
> whence can be _SET or _CUR depending on the caller, but with this
> patch whence is no longer used, and -
> 
> > -		INV_PERROR( _("Error in reading inventory record "
> > -			      "(lseek failed): ") );
> > -		if ( dolock ) 
> > -			INVLOCK( fd, LOCK_UN );
> > -		return -1;
> > -	}
> > -	
> > -	nread = read( fd, buf, bufsz );
> > -
> > +	nread = pread(fd, buf, bufsz, (off_t)off);
> 
> This is only equivalent to SET, no?  With the other changes, perhaps we're

Yes, it's only equivalent to SET.

> no longer called via GET_REC_SEEKCUR - but in that case, the whence arg should
> be removed?  Except for that pesky test code which explicitly passes SEEK_CUR ...

Agreed, the whence arg can be removed, the test code requires some
updates though.

I'm now testing a new version, I'll post it once it passes '-g dump'
tests.

Thanks for the review!

Eryu

> 
> 
> -Eric
> 
> >  	if (  nread != (int) bufsz ) {
> >  		INV_PERROR( _("Error in reading inventory record :") );
> > -		if ( dolock ) 
> > +		if ( dolock )
> >  			INVLOCK( fd, LOCK_UN );
> >  		return -1;
> >  	}
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2] xfsdump: fix race condition between lseek() and read()/write()
  2017-07-12 20:56 ` Eric Sandeen
  2017-07-13  7:28   ` Eryu Guan
@ 2017-07-13  8:41   ` Eryu Guan
  2017-07-13 20:10     ` Eric Sandeen
  1 sibling, 1 reply; 10+ messages in thread
From: Eryu Guan @ 2017-07-13  8:41 UTC (permalink / raw)
  To: linux-xfs; +Cc: Eryu Guan

There's a race condition in the [get|put]_invtrecord() routines, because
a lseek() followed by a read()/write() is not atmoic, the file offset
might be changed before read()/write().

xfs/302 catches this failure as:
xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.

And it can be reproduced by running multi-stream dump in a tight loop
  mount /dev/<dev> /mnt/xfs
  mkdir /mnt/xfs/dumpdir
  # populate dumpdir here
  while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
  	:
  done

Fix it by replacing the "lseek(); read()/write()" sequence by
pread()/pwrite(), which make the seek and I/O an atomic operation.

Also convert all *_SEEKCUR routines to "SEEK_SET" variants and
remove the *_SEEKCUR macros, because they depend on the maintenance
of current file offset, but pread()/pwrite() don't change file
offset.

And in inventory/testmain.c, get|put_invtrecord() are called
directly, not from the GET|PUT_REC_* macros, so maintain the offset
explicitly there.

Signed-off-by: Eryu Guan <eguan@redhat.com>
---
v2 passed xfstests -g dump group tests too, with different block size fs(512,
1k, 4k), with different feature enabled (v4, v5, reflink).

But I didn't really test testmain.c, it's not compiled by default, and I
suspect it won't be compiled anyway, the usage of inv_put_sessioninfo() doesn't
match its declaration, and there're might be other problems as well.

v2:
- remove whence argument from get|pub_invtrecord()
- update testmain.c to use get|pub_invtrecord() correctly

 common/inventory.c   |  4 ++--
 inventory/inv_api.c  |  5 ++---
 inventory/inv_core.c | 29 ++++++-----------------------
 inventory/inv_idx.c  |  4 ++--
 inventory/inv_priv.h | 21 ++++++---------------
 inventory/testmain.c | 34 +++++++++++++++++++++++-----------
 6 files changed, 41 insertions(+), 56 deletions(-)

diff --git a/common/inventory.c b/common/inventory.c
index d1b810c5bd10..0e9c2562ec25 100644
--- a/common/inventory.c
+++ b/common/inventory.c
@@ -471,8 +471,8 @@ inv_stream_close(
 	}
 			
 	if (dowrite) {
-		rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, sizeof( invt_stream_t ),
-					       (off64_t) -(sizeof( invt_stream_t )) );
+		rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
+				      tok->md_stream_off);
 	}
  end:
 	INVLOCK( fd, LOCK_UN );
diff --git a/inventory/inv_api.c b/inventory/inv_api.c
index acca40b00fff..46fdde84740b 100644
--- a/inventory/inv_api.c
+++ b/inventory/inv_api.c
@@ -409,9 +409,8 @@ inv_stream_close(
 		}
 			
 		if (dowrite) {
-			rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, 
-			             sizeof( invt_stream_t ),
-				     -(off64_t)(sizeof( invt_stream_t )) );
+			rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
+					      tok->md_stream_off);
 		}
 	}
 
diff --git a/inventory/inv_core.c b/inventory/inv_core.c
index a17c2c9c5603..5ef519c91347 100644
--- a/inventory/inv_core.c
+++ b/inventory/inv_core.c
@@ -112,7 +112,7 @@ get_headers( int fd, void **hdrs, size_t bufsz, size_t off )
 
 int
 get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, 
-	        int whence, bool_t dolock )
+		bool_t dolock )
 {
 	int  nread;
 	
@@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
 	if ( dolock ) 
 		INVLOCK( fd, LOCK_SH );
 
-	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
-		INV_PERROR( _("Error in reading inventory record "
-			      "(lseek failed): ") );
-		if ( dolock ) 
-			INVLOCK( fd, LOCK_UN );
-		return -1;
-	}
-	
-	nread = read( fd, buf, bufsz );
-
+	nread = pread(fd, buf, bufsz, (off_t)off);
 	if (  nread != (int) bufsz ) {
 		INV_PERROR( _("Error in reading inventory record :") );
-		if ( dolock ) 
+		if ( dolock )
 			INVLOCK( fd, LOCK_UN );
 		return -1;
 	}
@@ -154,23 +145,15 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
 /*----------------------------------------------------------------------*/
 
 int
-put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, 
-	        int whence, bool_t dolock )
+put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, bool_t dolock )
 {
 	int nwritten;
 	
 	if ( dolock )
 		INVLOCK( fd, LOCK_EX );
 	
-	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
-		INV_PERROR( _("Error in writing inventory record "
-			      "(lseek failed): ") );
-		if ( dolock ) 
-			INVLOCK( fd, LOCK_UN );
-		return -1;
-	}
-	
-	if (( nwritten = write( fd, buf, bufsz ) ) != (int) bufsz ) {
+	nwritten = pwrite(fd, buf, bufsz, (off_t)off);
+	if (nwritten != (int) bufsz ) {
 		INV_PERROR( _("Error in writing inventory record :") );
 		if ( dolock )
 			INVLOCK( fd, LOCK_UN );
diff --git a/inventory/inv_idx.c b/inventory/inv_idx.c
index 95529e8135af..cd9b9cbe7a79 100644
--- a/inventory/inv_idx.c
+++ b/inventory/inv_idx.c
@@ -341,8 +341,8 @@ idx_put_sesstime( inv_sestoken_t tok, bool_t whichtime)
 			      ent.ie_timeperiod.tp_start,
 			      ent.ie_timeperiod.tp_end );
 #endif
-	rval = PUT_REC_NOLOCK_SEEKCUR( fd, &ent, sizeof( invt_entry_t ),
-				      -(off64_t)(sizeof( invt_entry_t )));
+	rval = PUT_REC_NOLOCK(fd, &ent, sizeof(invt_entry_t),
+			      tok->sd_invtok->d_invindex_off);
 	
 #ifdef INVT_DEBUG
 	{
diff --git a/inventory/inv_priv.h b/inventory/inv_priv.h
index 1690271dd3e3..aa94a3349d79 100644
--- a/inventory/inv_priv.h
+++ b/inventory/inv_priv.h
@@ -298,13 +298,10 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
 
 
 #define GET_REC( fd, buf, sz, off )  \
-                 get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DOLOCK )
+                 get_invtrecord( fd, buf, sz, off, INVT_DOLOCK )
 
 #define GET_REC_NOLOCK( fd, buf, sz, off )  \
-                 get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
-
-#define GET_REC_SEEKCUR( fd, buf, sz, off )  \
-                 get_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
+                 get_invtrecord( fd, buf, sz, off, INVT_DONTLOCK )
 
 #define GET_ALLHDRS_N_CNTS( fd, h, c, hsz, csz ) \
                  get_headerinfo( fd, h, c, hsz, csz, INVT_DOLOCK )
@@ -313,16 +310,10 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
                  get_headerinfo( fd, h, c, hsz, csz, INVT_DONTLOCK )
 
 #define PUT_REC( fd, buf, sz, off )  \
-                 put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DOLOCK )
+                 put_invtrecord( fd, buf, sz, off, INVT_DOLOCK )
 
 #define PUT_REC_NOLOCK( fd, buf, sz, off )  \
-                 put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
-
-#define PUT_REC_SEEKCUR( fd, buf, sz, off )  \
-                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
-
-#define PUT_REC_NOLOCK_SEEKCUR( fd, buf, sz, off )  \
-                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DONTLOCK )
+                 put_invtrecord( fd, buf, sz, off, INVT_DONTLOCK )
 
 
 #define GET_COUNTERS( fd, cnt ) get_counters( fd, (void **)(cnt), \
@@ -515,10 +506,10 @@ int
 get_invtentry( char *fname, time32_t tm, invt_entry_t *buf, size_t bufsz );
 
 int
-get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, int, bool_t dolock );
+get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, bool_t dolock );
 
 int
-put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, int, bool_t dolock );
+put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, bool_t dolock );
 
 inv_idbtoken_t
 get_token( int fd, int objfd );
diff --git a/inventory/testmain.c b/inventory/testmain.c
index ecddf54c4f90..05e3c02af792 100644
--- a/inventory/testmain.c
+++ b/inventory/testmain.c
@@ -89,6 +89,7 @@ int
 recons_test( int howmany )
 {
 	int fd, i, rval = 1;
+	off64_t off = 0;
 	
 	ses sarr[ SESLIM];
 	
@@ -96,14 +97,16 @@ recons_test( int howmany )
 	
 	for ( i=0; i<howmany && i < SESLIM; i++ ){
 		rval = get_invtrecord( fd, &sarr[i], 
-				       sizeof( uuid_t ) + sizeof( size_t ), 0,
-				       SEEK_CUR, BOOL_FALSE );
+				       sizeof( uuid_t ) + sizeof( size_t ), off,
+				       BOOL_FALSE );
 		assert( rval > 0 );
 		assert( sarr[i].sz > 0 );
 		sarr[i].buf = calloc( 1,  sarr[i].sz );
-		rval = get_invtrecord( fd, sarr[i].buf,  sarr[i].sz, 0, SEEK_CUR,
+		off += (off64_t)(sizeof(uuid_t) + sizeof(size_t));
+		rval = get_invtrecord( fd, sarr[i].buf,  sarr[i].sz, off,
 				       BOOL_FALSE );
 		assert( rval > 0 );
+		off += sarr[i].sz;
 	}
 	
 	
@@ -132,8 +135,7 @@ delete_test( int n )
 	fd = open( "moids", O_RDONLY );
 	if ( fd < 0 ) return -1;
 	
-	get_invtrecord( fd, &moid, sizeof(uuid_t), (n-1)* sizeof( uuid_t),
-		        SEEK_SET, 0 );
+	get_invtrecord( fd, &moid, sizeof(uuid_t), (n-1)* sizeof( uuid_t), 0 );
 	uuid_to_string( &moid, &str, &stat );
 	printf("Searching for Moid = %s\n", str );
 	free( str );
@@ -263,7 +265,11 @@ write_test( int nsess, int nstreams, int nmedia, int dumplevel )
 	char strbuf[128];
 	void *bufp;
 	size_t sz;
+#ifdef RECONS
 	int rfd;
+	off64_t off;
+	struct stat64 statbuf;
+#endif
 
 #ifdef FIRSTTIME
 	printf("first time!\n");
@@ -285,6 +291,11 @@ write_test( int nsess, int nstreams, int nmedia, int dumplevel )
 #ifdef RECONS
 	rfd = open( sesfile, O_RDWR | O_CREAT );
 	fchmod( rfd, INV_PERMS );
+	if (fstat64(fd, &statbuf) < 0) {
+		perror("fstat64 session file");
+		return -1;
+	}
+	off = (off64_t)statbuf.st_size;
 #endif
 
 	for ( i = 0; i < nsess; i++ ) {
@@ -325,12 +336,13 @@ write_test( int nsess, int nstreams, int nmedia, int dumplevel )
 	
 #ifdef RECONS
 		if (inv_get_sessioninfo( tok2, &bufp, &sz ) == BOOL_TRUE ) {
-		      put_invtrecord( rfd, fsidp, sizeof( uuid_t ), 0, 
-				        SEEK_END, BOOL_FALSE );
-			
-			put_invtrecord( rfd, &sz, sizeof( size_t ), 0,
-				        SEEK_END, BOOL_FALSE); 
-			put_invtrecord( rfd, bufp, sz, 0, SEEK_END, BOOL_FALSE );
+			put_invtrecord( rfd, fsidp, sizeof( uuid_t ), off,
+					BOOL_FALSE );
+			off += (off64_t)sizeof(uuid_t);
+			put_invtrecord( rfd, &sz, sizeof( size_t ), off,
+					BOOL_FALSE);
+			off += (off64_t)sizeof(size_t);
+			put_invtrecord( rfd, bufp, sz, off, BOOL_FALSE );
 		}
 #endif
 #ifdef NOTDEF
-- 
2.13.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] xfsdump: fix race condition between lseek() and read()/write()
  2017-07-13  8:41   ` [PATCH v2] " Eryu Guan
@ 2017-07-13 20:10     ` Eric Sandeen
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Sandeen @ 2017-07-13 20:10 UTC (permalink / raw)
  To: Eryu Guan, linux-xfs

On 07/13/2017 03:41 AM, Eryu Guan wrote:
> There's a race condition in the [get|put]_invtrecord() routines, because
> a lseek() followed by a read()/write() is not atmoic, the file offset
> might be changed before read()/write().
> 
> xfs/302 catches this failure as:
> xfsdump: drive 1: INV : Unknown version 0 - Expected version 1
> xfsdump: inv_core.c:66: get_counters: Assertion `((invt_counter_t *)(*cntpp))->ic_vernum == (inv_version_t) 1' failed.
> 
> And it can be reproduced by running multi-stream dump in a tight loop
>   mount /dev/<dev> /mnt/xfs
>   mkdir /mnt/xfs/dumpdir
>   # populate dumpdir here
>   while xfsdump -M l1 -M l2 -f d1 -f d2 -L ses /mnt/xfs -s dumpdir; do
>   	:
>   done
> 
> Fix it by replacing the "lseek(); read()/write()" sequence by
> pread()/pwrite(), which make the seek and I/O an atomic operation.

Thanks, let's go with it :)

Reviewed-by: Eric Sandeen <sandeen@redhat.com>

> Also convert all *_SEEKCUR routines to "SEEK_SET" variants and
> remove the *_SEEKCUR macros, because they depend on the maintenance
> of current file offset, but pread()/pwrite() don't change file
> offset.
> 
> And in inventory/testmain.c, get|put_invtrecord() are called
> directly, not from the GET|PUT_REC_* macros, so maintain the offset
> explicitly there.
> 
> Signed-off-by: Eryu Guan <eguan@redhat.com>
> ---
> v2 passed xfstests -g dump group tests too, with different block size fs(512,
> 1k, 4k), with different feature enabled (v4, v5, reflink).
> 
> But I didn't really test testmain.c, it's not compiled by default, and I
> suspect it won't be compiled anyway, the usage of inv_put_sessioninfo() doesn't
> match its declaration, and there're might be other problems as well.
> 
> v2:
> - remove whence argument from get|pub_invtrecord()
> - update testmain.c to use get|pub_invtrecord() correctly
> 
>  common/inventory.c   |  4 ++--
>  inventory/inv_api.c  |  5 ++---
>  inventory/inv_core.c | 29 ++++++-----------------------
>  inventory/inv_idx.c  |  4 ++--
>  inventory/inv_priv.h | 21 ++++++---------------
>  inventory/testmain.c | 34 +++++++++++++++++++++++-----------
>  6 files changed, 41 insertions(+), 56 deletions(-)
> 
> diff --git a/common/inventory.c b/common/inventory.c
> index d1b810c5bd10..0e9c2562ec25 100644
> --- a/common/inventory.c
> +++ b/common/inventory.c
> @@ -471,8 +471,8 @@ inv_stream_close(
>  	}
>  			
>  	if (dowrite) {
> -		rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, sizeof( invt_stream_t ),
> -					       (off64_t) -(sizeof( invt_stream_t )) );
> +		rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
> +				      tok->md_stream_off);
>  	}
>   end:
>  	INVLOCK( fd, LOCK_UN );
> diff --git a/inventory/inv_api.c b/inventory/inv_api.c
> index acca40b00fff..46fdde84740b 100644
> --- a/inventory/inv_api.c
> +++ b/inventory/inv_api.c
> @@ -409,9 +409,8 @@ inv_stream_close(
>  		}
>  			
>  		if (dowrite) {
> -			rval = PUT_REC_NOLOCK_SEEKCUR( fd, &strm, 
> -			             sizeof( invt_stream_t ),
> -				     -(off64_t)(sizeof( invt_stream_t )) );
> +			rval = PUT_REC_NOLOCK(fd, &strm, sizeof(invt_stream_t),
> +					      tok->md_stream_off);
>  		}
>  	}
>  
> diff --git a/inventory/inv_core.c b/inventory/inv_core.c
> index a17c2c9c5603..5ef519c91347 100644
> --- a/inventory/inv_core.c
> +++ b/inventory/inv_core.c
> @@ -112,7 +112,7 @@ get_headers( int fd, void **hdrs, size_t bufsz, size_t off )
>  
>  int
>  get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, 
> -	        int whence, bool_t dolock )
> +		bool_t dolock )
>  {
>  	int  nread;
>  	
> @@ -121,19 +121,10 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  	if ( dolock ) 
>  		INVLOCK( fd, LOCK_SH );
>  
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> -		INV_PERROR( _("Error in reading inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	nread = read( fd, buf, bufsz );
> -
> +	nread = pread(fd, buf, bufsz, (off_t)off);
>  	if (  nread != (int) bufsz ) {
>  		INV_PERROR( _("Error in reading inventory record :") );
> -		if ( dolock ) 
> +		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
>  		return -1;
>  	}
> @@ -154,23 +145,15 @@ get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off,
>  /*----------------------------------------------------------------------*/
>  
>  int
> -put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, 
> -	        int whence, bool_t dolock )
> +put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, bool_t dolock )
>  {
>  	int nwritten;
>  	
>  	if ( dolock )
>  		INVLOCK( fd, LOCK_EX );
>  	
> -	if ( lseek( fd, (off_t)off, whence ) < 0 ) {
> -		INV_PERROR( _("Error in writing inventory record "
> -			      "(lseek failed): ") );
> -		if ( dolock ) 
> -			INVLOCK( fd, LOCK_UN );
> -		return -1;
> -	}
> -	
> -	if (( nwritten = write( fd, buf, bufsz ) ) != (int) bufsz ) {
> +	nwritten = pwrite(fd, buf, bufsz, (off_t)off);
> +	if (nwritten != (int) bufsz ) {
>  		INV_PERROR( _("Error in writing inventory record :") );
>  		if ( dolock )
>  			INVLOCK( fd, LOCK_UN );
> diff --git a/inventory/inv_idx.c b/inventory/inv_idx.c
> index 95529e8135af..cd9b9cbe7a79 100644
> --- a/inventory/inv_idx.c
> +++ b/inventory/inv_idx.c
> @@ -341,8 +341,8 @@ idx_put_sesstime( inv_sestoken_t tok, bool_t whichtime)
>  			      ent.ie_timeperiod.tp_start,
>  			      ent.ie_timeperiod.tp_end );
>  #endif
> -	rval = PUT_REC_NOLOCK_SEEKCUR( fd, &ent, sizeof( invt_entry_t ),
> -				      -(off64_t)(sizeof( invt_entry_t )));
> +	rval = PUT_REC_NOLOCK(fd, &ent, sizeof(invt_entry_t),
> +			      tok->sd_invtok->d_invindex_off);
>  	
>  #ifdef INVT_DEBUG
>  	{
> diff --git a/inventory/inv_priv.h b/inventory/inv_priv.h
> index 1690271dd3e3..aa94a3349d79 100644
> --- a/inventory/inv_priv.h
> +++ b/inventory/inv_priv.h
> @@ -298,13 +298,10 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
>  
>  
>  #define GET_REC( fd, buf, sz, off )  \
> -                 get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DOLOCK )
> +                 get_invtrecord( fd, buf, sz, off, INVT_DOLOCK )
>  
>  #define GET_REC_NOLOCK( fd, buf, sz, off )  \
> -                 get_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
> -
> -#define GET_REC_SEEKCUR( fd, buf, sz, off )  \
> -                 get_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
> +                 get_invtrecord( fd, buf, sz, off, INVT_DONTLOCK )
>  
>  #define GET_ALLHDRS_N_CNTS( fd, h, c, hsz, csz ) \
>                   get_headerinfo( fd, h, c, hsz, csz, INVT_DOLOCK )
> @@ -313,16 +310,10 @@ typedef bool_t (*search_callback_t) (int, invt_seshdr_t *, void *, void *);
>                   get_headerinfo( fd, h, c, hsz, csz, INVT_DONTLOCK )
>  
>  #define PUT_REC( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DOLOCK )
> +                 put_invtrecord( fd, buf, sz, off, INVT_DOLOCK )
>  
>  #define PUT_REC_NOLOCK( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_SET, INVT_DONTLOCK )
> -
> -#define PUT_REC_SEEKCUR( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DOLOCK )
> -
> -#define PUT_REC_NOLOCK_SEEKCUR( fd, buf, sz, off )  \
> -                 put_invtrecord( fd, buf, sz, off, SEEK_CUR, INVT_DONTLOCK )
> +                 put_invtrecord( fd, buf, sz, off, INVT_DONTLOCK )
>  
>  
>  #define GET_COUNTERS( fd, cnt ) get_counters( fd, (void **)(cnt), \
> @@ -515,10 +506,10 @@ int
>  get_invtentry( char *fname, time32_t tm, invt_entry_t *buf, size_t bufsz );
>  
>  int
> -get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, int, bool_t dolock );
> +get_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, bool_t dolock );
>  
>  int
> -put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, int, bool_t dolock );
> +put_invtrecord( int fd, void *buf, size_t bufsz, off64_t off, bool_t dolock );
>  
>  inv_idbtoken_t
>  get_token( int fd, int objfd );
> diff --git a/inventory/testmain.c b/inventory/testmain.c
> index ecddf54c4f90..05e3c02af792 100644
> --- a/inventory/testmain.c
> +++ b/inventory/testmain.c
> @@ -89,6 +89,7 @@ int
>  recons_test( int howmany )
>  {
>  	int fd, i, rval = 1;
> +	off64_t off = 0;
>  	
>  	ses sarr[ SESLIM];
>  	
> @@ -96,14 +97,16 @@ recons_test( int howmany )
>  	
>  	for ( i=0; i<howmany && i < SESLIM; i++ ){
>  		rval = get_invtrecord( fd, &sarr[i], 
> -				       sizeof( uuid_t ) + sizeof( size_t ), 0,
> -				       SEEK_CUR, BOOL_FALSE );
> +				       sizeof( uuid_t ) + sizeof( size_t ), off,
> +				       BOOL_FALSE );
>  		assert( rval > 0 );
>  		assert( sarr[i].sz > 0 );
>  		sarr[i].buf = calloc( 1,  sarr[i].sz );
> -		rval = get_invtrecord( fd, sarr[i].buf,  sarr[i].sz, 0, SEEK_CUR,
> +		off += (off64_t)(sizeof(uuid_t) + sizeof(size_t));
> +		rval = get_invtrecord( fd, sarr[i].buf,  sarr[i].sz, off,
>  				       BOOL_FALSE );
>  		assert( rval > 0 );
> +		off += sarr[i].sz;
>  	}
>  	
>  	
> @@ -132,8 +135,7 @@ delete_test( int n )
>  	fd = open( "moids", O_RDONLY );
>  	if ( fd < 0 ) return -1;
>  	
> -	get_invtrecord( fd, &moid, sizeof(uuid_t), (n-1)* sizeof( uuid_t),
> -		        SEEK_SET, 0 );
> +	get_invtrecord( fd, &moid, sizeof(uuid_t), (n-1)* sizeof( uuid_t), 0 );
>  	uuid_to_string( &moid, &str, &stat );
>  	printf("Searching for Moid = %s\n", str );
>  	free( str );
> @@ -263,7 +265,11 @@ write_test( int nsess, int nstreams, int nmedia, int dumplevel )
>  	char strbuf[128];
>  	void *bufp;
>  	size_t sz;
> +#ifdef RECONS
>  	int rfd;
> +	off64_t off;
> +	struct stat64 statbuf;
> +#endif
>  
>  #ifdef FIRSTTIME
>  	printf("first time!\n");
> @@ -285,6 +291,11 @@ write_test( int nsess, int nstreams, int nmedia, int dumplevel )
>  #ifdef RECONS
>  	rfd = open( sesfile, O_RDWR | O_CREAT );
>  	fchmod( rfd, INV_PERMS );
> +	if (fstat64(fd, &statbuf) < 0) {
> +		perror("fstat64 session file");
> +		return -1;
> +	}
> +	off = (off64_t)statbuf.st_size;
>  #endif
>  
>  	for ( i = 0; i < nsess; i++ ) {
> @@ -325,12 +336,13 @@ write_test( int nsess, int nstreams, int nmedia, int dumplevel )
>  	
>  #ifdef RECONS
>  		if (inv_get_sessioninfo( tok2, &bufp, &sz ) == BOOL_TRUE ) {
> -		      put_invtrecord( rfd, fsidp, sizeof( uuid_t ), 0, 
> -				        SEEK_END, BOOL_FALSE );
> -			
> -			put_invtrecord( rfd, &sz, sizeof( size_t ), 0,
> -				        SEEK_END, BOOL_FALSE); 
> -			put_invtrecord( rfd, bufp, sz, 0, SEEK_END, BOOL_FALSE );
> +			put_invtrecord( rfd, fsidp, sizeof( uuid_t ), off,
> +					BOOL_FALSE );
> +			off += (off64_t)sizeof(uuid_t);
> +			put_invtrecord( rfd, &sz, sizeof( size_t ), off,
> +					BOOL_FALSE);
> +			off += (off64_t)sizeof(size_t);
> +			put_invtrecord( rfd, bufp, sz, off, BOOL_FALSE );
>  		}
>  #endif
>  #ifdef NOTDEF
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-07-13 20:10 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-21 13:06 [PATCH] xfsdump: fix race condition between lseek() and read()/write() Eryu Guan
2017-03-27 20:20 ` Darrick J. Wong
2017-03-28  3:41   ` Eryu Guan
2017-07-12 18:26     ` Darrick J. Wong
2017-07-12 18:46 ` Eric Sandeen
2017-07-12 19:33   ` Eric Sandeen
2017-07-12 20:56 ` Eric Sandeen
2017-07-13  7:28   ` Eryu Guan
2017-07-13  8:41   ` [PATCH v2] " Eryu Guan
2017-07-13 20:10     ` Eric Sandeen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.