All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch@lst.de>
To: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: hch@lst.de, snitzer@redhat.com, axboe@kernel.dk,
	dm-devel@redhat.com, James.Bottomley@suse.de,
	linux-kernel@vger.kernel.org, martin.petersen@oracle.com,
	akpm@linux-foundation.org, linux-scsi@vger.kernel.org
Subject: Re: [PATCH 1/2] block: fix leaks associated with discard request payload
Date: Mon, 28 Jun 2010 17:25:36 +0200	[thread overview]
Message-ID: <20100628152536.GA16013@lst.de> (raw)
In-Reply-To: <20100628171218Q.fujita.tomonori@lab.ntt.co.jp>

On Mon, Jun 28, 2010 at 05:14:28PM +0900, FUJITA Tomonori wrote:
> > While I see the problems with leaking ressources in that case I still
> > can't quite explain the hang I see.
> 
> Any way to reproduce the hang without ssd drives?

Actually the SSDs don't fully hang, they just causes lots of I/O errors
and hit the error handler hard.  The hard hang is when running under
qemu.  Apply the patch below, then create an if=scsi drive that resides
on an XFS filesystem, and you'll have scsi TP support in the guest:


Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2010-06-26 14:40:42.580004436 +0200
+++ qemu/hw/scsi-disk.c	2010-06-26 14:59:20.338020011 +0200
@@ -397,6 +397,7 @@ static int scsi_disk_emulate_inquiry(SCS
         }
         case 0xb0: /* block device characteristics */
         {
+            static const int trim_sectors = (2 * 1024 * 1024) / 512;
             unsigned int min_io_size =
                     s->qdev.conf.min_io_size / s->qdev.blocksize;
             unsigned int opt_io_size =
@@ -416,6 +417,12 @@ static int scsi_disk_emulate_inquiry(SCS
             outbuf[13] = (opt_io_size >> 16) & 0xff;
             outbuf[14] = (opt_io_size >> 8) & 0xff;
             outbuf[15] = opt_io_size & 0xff;
+
+            /* optimal unmap granularity */
+            outbuf[28] = (trim_sectors >> 24) & 0xff;
+            outbuf[29] = (trim_sectors >> 16) & 0xff;
+            outbuf[30] = (trim_sectors >> 8) & 0xff;
+            outbuf[31] = trim_sectors & 0xff;
             break;
         }
         default:
@@ -824,6 +831,11 @@ static int scsi_disk_emulate_command(SCS
             outbuf[11] = 0;
             outbuf[12] = 0;
             outbuf[13] = get_physical_block_exp(&s->qdev.conf);
+
+            /* set TPE and TPRZ bits if the format supports discard */
+            if (bdrv_is_discardable(s->bs))
+                outbuf[14] = 0x80 | 0x40;
+
             /* Protection, exponent and lowest lba field left blank. */
             buflen = req->cmd.xfer;
             break;
@@ -988,6 +1000,28 @@ static int32_t scsi_send_command(SCSIDev
         r->sector_count = len * s->cluster_size;
         is_write = 1;
         break;
+    case WRITE_SAME_16:
+//        printf("WRITE SAME(16) (sector %" PRId64 ", count %d)\n", lba, len);
+
+//        if (lba + len > s->max_lba)
+        if (lba > s->max_lba)
+            goto illegal_lba; // check the condition code
+        /*
+         * We only support WRITE SAME with the unmap bit set for now.
+         */
+        if (!(buf[1] & 0x8))
+            goto fail;
+
+        rc = bdrv_discard(s->bs, lba * s->cluster_size, len * s->cluster_size);
+        if (rc < 0) {
+            /* XXX: better error code ?*/
+            goto fail;
+        }
+
+        scsi_req_set_status(&r->req, GOOD, NO_SENSE);
+        scsi_req_complete(&r->req);
+        scsi_remove_request(r);
+        return 0;
     default:
 	DPRINTF("Unknown SCSI command (%2.2x)\n", buf[0]);
     fail:
Index: qemu/hw/scsi-defs.h
===================================================================
--- qemu.orig/hw/scsi-defs.h	2010-06-26 14:40:42.589025947 +0200
+++ qemu/hw/scsi-defs.h	2010-06-26 14:40:43.820253983 +0200
@@ -84,6 +84,7 @@
 #define MODE_SENSE_10         0x5a
 #define PERSISTENT_RESERVE_IN 0x5e
 #define PERSISTENT_RESERVE_OUT 0x5f
+#define WRITE_SAME_16         0x93
 #define MAINTENANCE_IN        0xa3
 #define MAINTENANCE_OUT       0xa4
 #define MOVE_MEDIUM           0xa5
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2010-06-26 14:40:42.597004296 +0200
+++ qemu/block.c	2010-06-26 14:40:43.824253703 +0200
@@ -1286,6 +1286,11 @@ int bdrv_is_sg(BlockDriverState *bs)
     return bs->sg;
 }
 
+int bdrv_is_discardable(BlockDriverState *bs)
+{
+    return bs->discardable;
+}
+
 int bdrv_enable_write_cache(BlockDriverState *bs)
 {
     return bs->enable_write_cache;
@@ -1431,6 +1436,13 @@ int bdrv_has_zero_init(BlockDriverState
     return 1;
 }
 
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    if (!bs->drv || !bs->drv->bdrv_discard)
+        return -ENOTSUP;
+    return bs->drv->bdrv_discard(bs, sector_num, nb_sectors);
+}
+
 /*
  * Returns true iff the specified sector is present in the disk image. Drivers
  * not implementing the functionality are assumed to not support backing files,
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2010-06-26 14:40:42.606004157 +0200
+++ qemu/block.h	2010-06-26 14:40:43.831254122 +0200
@@ -135,6 +135,7 @@ void bdrv_flush(BlockDriverState *bs);
 void bdrv_flush_all(void);
 void bdrv_close_all(void);
 
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_has_zero_init(BlockDriverState *bs);
 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 	int *pnum);
@@ -162,6 +163,7 @@ BlockErrorAction bdrv_get_on_error(Block
 int bdrv_is_removable(BlockDriverState *bs);
 int bdrv_is_read_only(BlockDriverState *bs);
 int bdrv_is_sg(BlockDriverState *bs);
+int bdrv_is_discardable(BlockDriverState *bs);
 int bdrv_enable_write_cache(BlockDriverState *bs);
 int bdrv_is_inserted(BlockDriverState *bs);
 int bdrv_media_changed(BlockDriverState *bs);
Index: qemu/block/raw-posix.c
===================================================================
--- qemu.orig/block/raw-posix.c	2010-06-26 14:40:42.614025947 +0200
+++ qemu/block/raw-posix.c	2010-06-26 14:42:33.449255585 +0200
@@ -68,6 +68,10 @@
 #include <sys/diskslice.h>
 #endif
 
+#ifdef CONFIG_XFS
+#include <xfs/xfs.h>
+#endif
+
 //#define DEBUG_FLOPPY
 
 //#define DEBUG_BLOCK
@@ -117,6 +121,9 @@ typedef struct BDRVRawState {
     int use_aio;
     void *aio_ctx;
 #endif
+#ifdef CONFIG_XFS
+    int is_xfs : 1;
+#endif
     uint8_t* aligned_buf;
 } BDRVRawState;
 
@@ -189,6 +196,13 @@ static int raw_open_common(BlockDriverSt
 #endif
     }
 
+#ifdef CONFIG_XFS
+    if (platform_test_xfs_fd(s->fd)) {
+        s->is_xfs = 1;
+        bs->discardable = 1;
+    }
+#endif
+
     return 0;
 
 out_free_buf:
@@ -731,6 +745,36 @@ static void raw_flush(BlockDriverState *
     qemu_fdatasync(s->fd);
 }
 
+#ifdef CONFIG_XFS
+static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
+{
+    struct xfs_flock64 fl;
+
+    memset(&fl, 0, sizeof(fl));
+    fl.l_whence = SEEK_SET;
+    fl.l_start = sector_num << 9;
+    fl.l_len = (int64_t)nb_sectors << 9;
+
+    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
+        printf("cannot punch hole (%s)\n", strerror(errno));
+        return -errno;
+    }
+
+    return 0;
+}
+#endif
+
+static int raw_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+
+#ifdef CONFIG_XFS
+    if (s->is_xfs)
+        return xfs_discard(s, sector_num, nb_sectors);
+#endif
+
+    return -EOPNOTSUPP;
+}
 
 static QEMUOptionParameter raw_create_options[] = {
     {
@@ -752,6 +796,7 @@ static BlockDriver bdrv_file = {
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
     .bdrv_flush = raw_flush,
+    .bdrv_discard = raw_discard,
 
     .bdrv_aio_readv = raw_aio_readv,
     .bdrv_aio_writev = raw_aio_writev,
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2010-06-26 14:40:42.636003738 +0200
+++ qemu/block_int.h	2010-06-26 14:40:43.843033002 +0200
@@ -73,6 +73,8 @@ struct BlockDriver {
         BlockDriverCompletionFunc *cb, void *opaque);
     BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque);
+    int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num,
+                        int nb_sectors);
 
     int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
         int num_reqs);
@@ -141,6 +143,7 @@ struct BlockDriverState {
     int encrypted; /* if true, the media is encrypted */
     int valid_key; /* if true, a valid encryption key has been set */
     int sg;        /* if true, the device is a /dev/sg* */
+    int discardable;/* if true the device supports TRIM/UNMAP */
     /* event callback when inserting/removing */
     void (*change_cb)(void *opaque);
     void *change_opaque;
Index: qemu/configure
===================================================================
--- qemu.orig/configure	2010-06-26 14:40:42.644003947 +0200
+++ qemu/configure	2010-06-26 14:40:43.850005973 +0200
@@ -272,6 +272,7 @@ xen=""
 linux_aio=""
 attr=""
 vhost_net=""
+xfs=""
 
 gprof="no"
 debug_tcg="no"
@@ -1284,6 +1285,27 @@ EOF
 fi
 
 ##########################################
+# xfsctl() probe, used for raw-posix
+if test "$xfs" != "no" ; then
+  cat > $TMPC << EOF
+#include <xfs/xfs.h>
+int main(void)
+{
+    xfsctl(NULL, 0, 0, NULL);
+    return 0;
+}
+EOF
+  if compile_prog "" "" ; then
+    xfs="yes"
+  else
+    if test "$xfs" = "yes" ; then
+      feature_not_found "uuid"
+    fi
+    xfs=no
+  fi
+fi
+
+##########################################
 # vde libraries probe
 if test "$vde" != "no" ; then
   vde_libs="-lvdeplug"
@@ -2115,6 +2137,7 @@ echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "uuid support      $uuid"
 echo "vhost-net support $vhost_net"
+echo "xfsctl support    $xfs"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -2235,6 +2258,9 @@ fi
 if test "$uuid" = "yes" ; then
   echo "CONFIG_UUID=y" >> $config_host_mak
 fi
+if test "$xfs" = "yes" ; then
+  echo "CONFIG_XFS=y" >> $config_host_mak
+fi
 qemu_version=`head $source_path/VERSION`
 echo "VERSION=$qemu_version" >>$config_host_mak
 echo "PKGVERSION=$pkgversion" >>$config_host_mak
Index: qemu/block/raw.c
===================================================================
--- qemu.orig/block/raw.c	2010-06-26 14:40:42.628004296 +0200
+++ qemu/block/raw.c	2010-06-26 14:40:43.852014913 +0200
@@ -6,6 +6,7 @@
 static int raw_open(BlockDriverState *bs, int flags)
 {
     bs->sg = bs->file->sg;
+    bs->discardable = bs->file->discardable;
     return 0;
 }
 
@@ -65,6 +66,11 @@ static int raw_probe(const uint8_t *buf,
    return 1; /* everything can be opened as raw image */
 }
 
+static int raw_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    return bdrv_discard(bs->file, sector_num, nb_sectors);
+}
+
 static int raw_is_inserted(BlockDriverState *bs)
 {
     return bdrv_is_inserted(bs->file);
@@ -125,6 +131,7 @@ static BlockDriver bdrv_raw = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush     = raw_aio_flush,
+    .bdrv_discard	= raw_discard,
 
     .bdrv_is_inserted   = raw_is_inserted,
     .bdrv_eject         = raw_eject,

  parent reply	other threads:[~2010-06-28 15:26 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-06-18 14:59 [PATCH, RFC] block: don't allocate a payload for discard request Christoph Hellwig
2010-06-19  4:25 ` Mike Snitzer
2010-06-22 18:00   ` Mike Snitzer
2010-06-26 19:56     ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer
2010-06-27  8:49       ` FUJITA Tomonori
2010-06-27  9:26         ` Christoph Hellwig
2010-06-27 10:01           ` FUJITA Tomonori
2010-06-27 10:35             ` FUJITA Tomonori
2010-06-27 11:07               ` Christoph Hellwig
2010-06-27 12:32                 ` FUJITA Tomonori
2010-06-27 14:16                   ` Mike Snitzer
2010-06-27 15:35                     ` Christoph Hellwig
2010-06-27 16:23                       ` FUJITA Tomonori
2010-06-27 15:33                   ` Christoph Hellwig
2010-06-28  7:57                   ` Christoph Hellwig
2010-06-28  8:14                     ` FUJITA Tomonori
2010-06-28  8:18                       ` Jens Axboe
2010-06-28  8:45                         ` FUJITA Tomonori
2010-06-28  8:45                           ` FUJITA Tomonori
2010-06-28 15:25                       ` Christoph Hellwig [this message]
2010-06-30 11:55                         ` FUJITA Tomonori
2010-07-01  4:21                           ` FUJITA Tomonori
2010-06-27  9:38       ` Christoph Hellwig
2010-06-27 15:29       ` James Bottomley
2010-06-28 17:16         ` Martin K. Petersen
2010-06-29  8:00           ` Boaz Harrosh
2010-06-29 22:28         ` [dm-devel] " Mikulas Patocka
2010-06-29 23:03           ` James Bottomley
2010-06-29 23:51             ` Mike Snitzer
2010-06-29 23:51               ` Mike Snitzer
2010-06-30  0:11             ` [dm-devel] " Mikulas Patocka
2010-06-30 14:22               ` James Bottomley
2010-06-30 15:36                 ` Mike Snitzer
2010-06-30 16:26                   ` James Bottomley
2010-07-01 12:28                 ` [dm-devel] " Mikulas Patocka
2010-07-01 12:46                   ` Mike Snitzer
2010-07-01 14:03                     ` Mikulas Patocka
2010-07-01 14:03                       ` Mikulas Patocka
2010-07-01 12:49                   ` [dm-devel] " Alasdair G Kergon
2010-06-30  8:32         ` Boaz Harrosh
2010-06-30  8:42           ` Christoph Hellwig
2010-06-30 10:25             ` Boaz Harrosh
2010-06-30 10:41               ` Christoph Hellwig
2010-06-30 10:57                 ` Boaz Harrosh
2010-06-30 12:18                   ` Mike Snitzer
2010-06-26 19:56     ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer
2010-06-27  9:39       ` Christoph Hellwig
2010-06-27 14:00         ` Mike Snitzer
2010-06-27 14:55       ` [PATCH 2/2 v2] " Mike Snitzer
2010-06-27 15:33         ` Christoph Hellwig
2010-06-28 10:33       ` [PATCH 2/2] " FUJITA Tomonori
2010-06-28 12:29         ` Mike Snitzer
2010-06-28 15:15           ` FUJITA Tomonori
2010-06-28 15:31             ` Mike Snitzer
2010-06-28 12:34       ` Jens Axboe
2010-06-28 12:37         ` Mike Snitzer
2010-06-28 12:41           ` Jens Axboe
2010-06-28 12:44             ` Christoph Hellwig
2010-06-28 12:49               ` Jens Axboe
2010-06-28 12:45             ` Mike Snitzer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100628152536.GA16013@lst.de \
    --to=hch@lst.de \
    --cc=James.Bottomley@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=axboe@kernel.dk \
    --cc=dm-devel@redhat.com \
    --cc=fujita.tomonori@lab.ntt.co.jp \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=snitzer@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.