linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] v2: block subsystem refcounter conversions
@ 2017-04-20 11:27 Elena Reshetova
  2017-04-20 11:27 ` [PATCH 1/5] block: convert bio.__bi_cnt from atomic_t to refcount_t Elena Reshetova
                   ` (5 more replies)
  0 siblings, 6 replies; 20+ messages in thread
From: Elena Reshetova @ 2017-04-20 11:27 UTC (permalink / raw)
  To: axboe
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba, Elena Reshetova

changes in v2:
Unneeded WARNs are removed since refcount_t warns by itself.
BUG_ONs are left as they are, since refcount_t doesn't BUG by default.

This series, for block subsystem, replaces atomic_t reference
counters with the new refcount_t type and API (see include/linux/refcount.h).
By doing this we prevent intentional or accidental
underflows or overflows that can lead to use-after-free vulnerabilities.


Elena Reshetova (5):
  block: convert bio.__bi_cnt from atomic_t to refcount_t
  block: convert blk_queue_tag.refcnt from atomic_t to refcount_t
  block: convert blkcg_gq.refcnt from atomic_t to refcount_t
  block: convert io_context.active_ref from atomic_t to refcount_t
  block: convert bsg_device.ref_count from atomic_t to refcount_t

 block/bio.c                | 6 +++---
 block/blk-cgroup.c         | 2 +-
 block/blk-ioc.c            | 4 ++--
 block/blk-tag.c            | 8 ++++----
 block/bsg.c                | 9 +++++----
 block/cfq-iosched.c        | 4 ++--
 fs/btrfs/volumes.c         | 2 +-
 include/linux/bio.h        | 4 ++--
 include/linux/blk-cgroup.h | 9 ++++-----
 include/linux/blk_types.h  | 3 ++-
 include/linux/blkdev.h     | 3 ++-
 include/linux/iocontext.h  | 6 +++---
 12 files changed, 31 insertions(+), 29 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/5] block: convert bio.__bi_cnt from atomic_t to refcount_t
  2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
@ 2017-04-20 11:27 ` Elena Reshetova
  2017-04-20 11:27 ` [PATCH 2/5] block: convert blk_queue_tag.refcnt " Elena Reshetova
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Elena Reshetova @ 2017-04-20 11:27 UTC (permalink / raw)
  To: axboe
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba, Elena Reshetova, Hans Liljestrand, Kees Cook,
	David Windsor

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows us to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 block/bio.c               | 6 +++---
 fs/btrfs/volumes.c        | 2 +-
 include/linux/bio.h       | 4 ++--
 include/linux/blk_types.h | 3 ++-
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e0..3dffc17 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -275,7 +275,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 {
 	memset(bio, 0, sizeof(*bio));
 	atomic_set(&bio->__bi_remaining, 1);
-	atomic_set(&bio->__bi_cnt, 1);
+	refcount_set(&bio->__bi_cnt, 1);
 
 	bio->bi_io_vec = table;
 	bio->bi_max_vecs = max_vecs;
@@ -543,12 +543,12 @@ void bio_put(struct bio *bio)
 	if (!bio_flagged(bio, BIO_REFFED))
 		bio_free(bio);
 	else {
-		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+		BIO_BUG_ON(!refcount_read(&bio->__bi_cnt));
 
 		/*
 		 * last put frees it
 		 */
-		if (atomic_dec_and_test(&bio->__bi_cnt))
+		if (refcount_dec_and_test(&bio->__bi_cnt))
 			bio_free(bio);
 	}
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 13e55d1..ff1fe9d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -441,7 +441,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
 
-		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
+		BUG_ON(refcount_read(&cur->__bi_cnt) == 0);
 
 		/*
 		 * if we're doing the sync list, record that our
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e52119..44ac678 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -234,7 +234,7 @@ static inline void bio_get(struct bio *bio)
 {
 	bio->bi_flags |= (1 << BIO_REFFED);
 	smp_mb__before_atomic();
-	atomic_inc(&bio->__bi_cnt);
+	refcount_inc(&bio->__bi_cnt);
 }
 
 static inline void bio_cnt_set(struct bio *bio, unsigned int count)
@@ -243,7 +243,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 		bio->bi_flags |= (1 << BIO_REFFED);
 		smp_mb__before_atomic();
 	}
-	atomic_set(&bio->__bi_cnt, count);
+	refcount_set(&bio->__bi_cnt, count);
 }
 
 static inline bool bio_flagged(struct bio *bio, unsigned int bit)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb..c41407f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -7,6 +7,7 @@
 
 #include <linux/types.h>
 #include <linux/bvec.h>
+#include <linux/refcount.h>
 
 struct bio_set;
 struct bio;
@@ -73,7 +74,7 @@ struct bio {
 
 	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
 
-	atomic_t		__bi_cnt;	/* pin count */
+	refcount_t		__bi_cnt;	/* pin count */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/5] block: convert blk_queue_tag.refcnt from atomic_t to refcount_t
  2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
  2017-04-20 11:27 ` [PATCH 1/5] block: convert bio.__bi_cnt from atomic_t to refcount_t Elena Reshetova
@ 2017-04-20 11:27 ` Elena Reshetova
  2017-04-20 11:27 ` [PATCH 3/5] block: convert blkcg_gq.refcnt " Elena Reshetova
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Elena Reshetova @ 2017-04-20 11:27 UTC (permalink / raw)
  To: axboe
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba, Elena Reshetova, Hans Liljestrand, Kees Cook,
	David Windsor

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows us to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 block/blk-tag.c        | 8 ++++----
 include/linux/blkdev.h | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/blk-tag.c b/block/blk-tag.c
index 07cc329..d83555e 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL(blk_queue_find_tag);
  */
 void blk_free_tags(struct blk_queue_tag *bqt)
 {
-	if (atomic_dec_and_test(&bqt->refcnt)) {
+	if (refcount_dec_and_test(&bqt->refcnt)) {
 		BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
 							bqt->max_depth);
 
@@ -130,7 +130,7 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
 	if (init_tag_map(q, tags, depth))
 		goto fail;
 
-	atomic_set(&tags->refcnt, 1);
+	refcount_set(&tags->refcnt, 1);
 	tags->alloc_policy = alloc_policy;
 	tags->next_tag = 0;
 	return tags;
@@ -180,7 +180,7 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
 		queue_flag_set(QUEUE_FLAG_QUEUED, q);
 		return 0;
 	} else
-		atomic_inc(&tags->refcnt);
+		refcount_inc(&tags->refcnt);
 
 	/*
 	 * assign it, all done
@@ -225,7 +225,7 @@ int blk_queue_resize_tags(struct request_queue *q, int new_depth)
 	 * Currently cannot replace a shared tag map with a new
 	 * one, so error out if this is the case
 	 */
-	if (atomic_read(&bqt->refcnt) != 1)
+	if (refcount_read(&bqt->refcnt) != 1)
 		return -EBUSY;
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aecca0e..95ef11c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
+#include <linux/refcount.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -288,7 +289,7 @@ struct blk_queue_tag {
 	unsigned long *tag_map;		/* bit map of free/busy tags */
 	int max_depth;			/* what we will send to device */
 	int real_max_depth;		/* what the array can hold */
-	atomic_t refcnt;		/* map can be shared */
+	refcount_t refcnt;		/* map can be shared */
 	int alloc_policy;		/* tag allocation policy */
 	int next_tag;			/* next tag */
 };
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/5] block: convert blkcg_gq.refcnt from atomic_t to refcount_t
  2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
  2017-04-20 11:27 ` [PATCH 1/5] block: convert bio.__bi_cnt from atomic_t to refcount_t Elena Reshetova
  2017-04-20 11:27 ` [PATCH 2/5] block: convert blk_queue_tag.refcnt " Elena Reshetova
@ 2017-04-20 11:27 ` Elena Reshetova
  2017-04-20 11:27 ` [PATCH 4/5] block: convert io_context.active_ref " Elena Reshetova
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Elena Reshetova @ 2017-04-20 11:27 UTC (permalink / raw)
  To: axboe
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba, Elena Reshetova, Hans Liljestrand, Kees Cook,
	David Windsor

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows us to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 block/blk-cgroup.c         | 2 +-
 include/linux/blk-cgroup.h | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 295e98c2..75de844 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -106,7 +106,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
-	atomic_set(&blkg->refcnt, 1);
+	refcount_set(&blkg->refcnt, 1);
 
 	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
 	if (blkcg != &blkcg_root) {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 01b62e7..e54f048 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -19,6 +19,7 @@
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH	(INT_MAX / 2)
@@ -122,7 +123,7 @@ struct blkcg_gq {
 	struct request_list		rl;
 
 	/* reference count */
-	atomic_t			refcnt;
+	refcount_t			refcnt;
 
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
@@ -354,8 +355,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
  */
 static inline void blkg_get(struct blkcg_gq *blkg)
 {
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	atomic_inc(&blkg->refcnt);
+	refcount_inc(&blkg->refcnt);
 }
 
 void __blkg_release_rcu(struct rcu_head *rcu);
@@ -366,8 +366,7 @@ void __blkg_release_rcu(struct rcu_head *rcu);
  */
 static inline void blkg_put(struct blkcg_gq *blkg)
 {
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	if (atomic_dec_and_test(&blkg->refcnt))
+	if (refcount_dec_and_test(&blkg->refcnt))
 		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 4/5] block: convert io_context.active_ref from atomic_t to refcount_t
  2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
                   ` (2 preceding siblings ...)
  2017-04-20 11:27 ` [PATCH 3/5] block: convert blkcg_gq.refcnt " Elena Reshetova
@ 2017-04-20 11:27 ` Elena Reshetova
  2017-04-20 11:27 ` [PATCH 5/5] block: convert bsg_device.ref_count " Elena Reshetova
  2017-04-20 13:56 ` [PATCH 0/5] v2: block subsystem refcounter conversions Christoph Hellwig
  5 siblings, 0 replies; 20+ messages in thread
From: Elena Reshetova @ 2017-04-20 11:27 UTC (permalink / raw)
  To: axboe
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba, Elena Reshetova, Hans Liljestrand, Kees Cook,
	David Windsor

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows us to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 block/blk-ioc.c           | 4 ++--
 block/cfq-iosched.c       | 4 ++--
 include/linux/iocontext.h | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b12f9c8..130dc23 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -173,7 +173,7 @@ void put_io_context_active(struct io_context *ioc)
 	unsigned long flags;
 	struct io_cq *icq;
 
-	if (!atomic_dec_and_test(&ioc->active_ref)) {
+	if (!refcount_dec_and_test(&ioc->active_ref)) {
 		put_io_context(ioc);
 		return;
 	}
@@ -256,7 +256,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 	/* initialize */
 	atomic_long_set(&ioc->refcount, 1);
 	atomic_set(&ioc->nr_tasks, 1);
-	atomic_set(&ioc->active_ref, 1);
+	refcount_set(&ioc->active_ref, 1);
 	spin_lock_init(&ioc->lock);
 	INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->icq_list);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9212627..2871bb9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2959,7 +2959,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
-	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
+	if (!cic || !refcount_read(&cic->icq.ioc->active_ref))
 		return;
 
 	/*
@@ -3959,7 +3959,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	if (cfqq->next_rq && req_noidle(cfqq->next_rq))
 		enable_idle = 0;
-	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
+	else if (!refcount_read(&cic->icq.ioc->active_ref) ||
 		 !cfqd->cfq_slice_idle ||
 		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index df38db2..e47b907 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -3,6 +3,7 @@
 
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
+#include <linux/refcount.h>
 #include <linux/workqueue.h>
 
 enum {
@@ -96,7 +97,7 @@ struct io_cq {
  */
 struct io_context {
 	atomic_long_t refcount;
-	atomic_t active_ref;
+	refcount_t active_ref;
 	atomic_t nr_tasks;
 
 	/* all the fields below are protected by this lock */
@@ -128,9 +129,8 @@ struct io_context {
 static inline void get_io_context_active(struct io_context *ioc)
 {
 	WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
-	WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0);
 	atomic_long_inc(&ioc->refcount);
-	atomic_inc(&ioc->active_ref);
+	refcount_inc(&ioc->active_ref);
 }
 
 static inline void ioc_task_link(struct io_context *ioc)
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 5/5] block: convert bsg_device.ref_count from atomic_t to refcount_t
  2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
                   ` (3 preceding siblings ...)
  2017-04-20 11:27 ` [PATCH 4/5] block: convert io_context.active_ref " Elena Reshetova
@ 2017-04-20 11:27 ` Elena Reshetova
  2017-04-20 13:56 ` [PATCH 0/5] v2: block subsystem refcounter conversions Christoph Hellwig
  5 siblings, 0 replies; 20+ messages in thread
From: Elena Reshetova @ 2017-04-20 11:27 UTC (permalink / raw)
  To: axboe
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba, Elena Reshetova, Hans Liljestrand, Kees Cook,
	David Windsor

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows us to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 block/bsg.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index 74835db..6d0ceb9 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -21,6 +21,7 @@
 #include <linux/idr.h>
 #include <linux/bsg.h>
 #include <linux/slab.h>
+#include <linux/refcount.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
@@ -38,7 +39,7 @@ struct bsg_device {
 	struct list_head busy_list;
 	struct list_head done_list;
 	struct hlist_node dev_list;
-	atomic_t ref_count;
+	refcount_t ref_count;
 	int queued_cmds;
 	int done_cmds;
 	wait_queue_head_t wq_done;
@@ -711,7 +712,7 @@ static int bsg_put_device(struct bsg_device *bd)
 
 	mutex_lock(&bsg_mutex);
 
-	do_free = atomic_dec_and_test(&bd->ref_count);
+	do_free = refcount_dec_and_test(&bd->ref_count);
 	if (!do_free) {
 		mutex_unlock(&bsg_mutex);
 		goto out;
@@ -763,7 +764,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 
 	bsg_set_block(bd, file);
 
-	atomic_set(&bd->ref_count, 1);
+	refcount_set(&bd->ref_count, 1);
 	mutex_lock(&bsg_mutex);
 	hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
 
@@ -783,7 +784,7 @@ static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q)
 
 	hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) {
 		if (bd->queue == q) {
-			atomic_inc(&bd->ref_count);
+			refcount_inc(&bd->ref_count);
 			goto found;
 		}
 	}
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
                   ` (4 preceding siblings ...)
  2017-04-20 11:27 ` [PATCH 5/5] block: convert bsg_device.ref_count " Elena Reshetova
@ 2017-04-20 13:56 ` Christoph Hellwig
  2017-04-20 16:10   ` Reshetova, Elena
  5 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2017-04-20 13:56 UTC (permalink / raw)
  To: Elena Reshetova
  Cc: axboe, james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba


All the objections from DaveM on the amount of cycles spent on the
new refcount_t apply to the block layer fast path operations as well.

Please don't send any more conversions until those have been resolved.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-20 13:56 ` [PATCH 0/5] v2: block subsystem refcounter conversions Christoph Hellwig
@ 2017-04-20 16:10   ` Reshetova, Elena
  2017-04-20 18:33     ` Eric Biggers
  0 siblings, 1 reply; 20+ messages in thread
From: Reshetova, Elena @ 2017-04-20 16:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, peterz, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba


> All the objections from DaveM on the amount of cycles spent on the
> new refcount_t apply to the block layer fast path operations as well.

Ok, could you please indicate the correct way to measure the impact for the block layer? 
We can do the measurements. 

Best Regards,
Elena.

> 
> Please don't send any more conversions until those have been resolved.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-20 16:10   ` Reshetova, Elena
@ 2017-04-20 18:33     ` Eric Biggers
  2017-04-21 10:55       ` Reshetova, Elena
  2017-04-21 10:56       ` Christoph Hellwig
  0 siblings, 2 replies; 20+ messages in thread
From: Eric Biggers @ 2017-04-20 18:33 UTC (permalink / raw)
  To: Reshetova, Elena
  Cc: Christoph Hellwig, axboe, james.bottomley, linux-kernel,
	linux-block, linux-scsi, linux-btrfs, peterz, gregkh,
	fujita.tomonori, mingo, clm, jbacik, dsterba

Hi Elena,

On Thu, Apr 20, 2017 at 04:10:16PM +0000, Reshetova, Elena wrote:
> 
> > All the objections from DaveM on the amount of cycles spent on the
> > new refcount_t apply to the block layer fast path operations as well.
> 
> Ok, could you please indicate the correct way to measure the impact for the block layer? 
> We can do the measurements. 
> 
> Best Regards,
> Elena.
> 
> > 
> > Please don't send any more conversions until those have been resolved.

Like I suggested months ago, how about doing an efficient implementation of
refcount_t which doesn't use the bloated cmpxchg loop?  Then there would be no
need for endless performance arguments.  In fact, in PaX there are already
example implementations for several architectures.  It's unfortunate that
they're still being ignored for some reason.

At the very least, what is there now could probably be made about twice as fast
by removing the checks that don't actually help mitigate refcount overflow bugs,
specifically all the checks in refcount_dec(), and the part of refcount_inc()
where it doesn't allow incrementing a 0 refcount.  Hint: if a refcount is 0, the
object has already been freed, so the attacker will have had the opportunity to
allocate it with contents they control already.

Of course, having extra checks behind a debug option is fine.  But they should
not be part of the base feature; the base feature should just be mitigation of
reference count *overflows*.  It would be nice to do more, of course; but when
the extra stuff prevents people from using refcount_t for performance reasons,
it defeats the point of the feature in the first place.

I strongly encourage anyone who has been involved in refcount_t to experiment
with removing a reference count decrement somewhere in their kernel, then trying
to exploit it to gain code execution.  If you don't know what you're trying to
protect against, you will not know which defences work and which don't.

- Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-20 18:33     ` Eric Biggers
@ 2017-04-21 10:55       ` Reshetova, Elena
  2017-04-21 14:03         ` Jens Axboe
                           ` (2 more replies)
  2017-04-21 10:56       ` Christoph Hellwig
  1 sibling, 3 replies; 20+ messages in thread
From: Reshetova, Elena @ 2017-04-21 10:55 UTC (permalink / raw)
  To: Eric Biggers, Kees Cook
  Cc: Christoph Hellwig, axboe, james.bottomley, linux-kernel,
	linux-block, linux-scsi, linux-btrfs, peterz, gregkh,
	fujita.tomonori, mingo, clm, jbacik, dsterba

> Hi Elena,
> 
> On Thu, Apr 20, 2017 at 04:10:16PM +0000, Reshetova, Elena wrote:
> >
> > > All the objections from DaveM on the amount of cycles spent on the
> > > new refcount_t apply to the block layer fast path operations as well.
> >
> > Ok, could you please indicate the correct way to measure the impact for the
> block layer?
> > We can do the measurements.
> >
> > Best Regards,
> > Elena.
> >
> > >
> > > Please don't send any more conversions until those have been resolved.
> 
> Like I suggested months ago, how about doing an efficient implementation of
> refcount_t which doesn't use the bloated cmpxchg loop?  Then there would be
> no
> need for endless performance arguments.  In fact, in PaX there are already
> example implementations for several architectures.  It's unfortunate that
> they're still being ignored for some reason.

Providing efficient arch. specific implementation for refcount_t is likely the next step
to make performance-sensitive places happy. However, in order to do it, we will need
to measure for each subsystem a) current atomic_t usage - base measurement 
b) pure refcount_t usage impact
c) arch. specific refcount_t impact 

Otherwise I have a chicken and egg problem here: people want better performance
and want to see arch. specific code for this, but in order to convince maintainer
in need of arch. specific code, I need to show that we do have indeed a performance issue. 

So, that's why I was endlessly asking each subsystem that pointed out
performance concerns to give hints on what should be measured. 
This way we can come back with real numbers (including for arch. specific implement) and
then decide what makes the most sense. 

> 
> At the very least, what is there now could probably be made about twice as fast
> by removing the checks that don't actually help mitigate refcount overflow bugs,
> specifically all the checks in refcount_dec(), and the part of refcount_inc()
> where it doesn't allow incrementing a 0 refcount.  Hint: if a refcount is 0, the
> object has already been freed, so the attacker will have had the opportunity to
> allocate it with contents they control already.

refcount_dec() is used very little through the code actually, it is more like an exception
case since in order to use it one must really be sure that refcounter doesn't drop to zero.
Removing the WARN around it likely wouldn't affect much overall, and thus it is better
to keep it, to discourage people from using this API at all :)

refcount_inc() is of course a different story, it is extensively used. I guess the perf issue
on checking increment from zero might only come from WARNs being printed,
but not really from an additional check here for zero since it is trivial and part of
the needed loop anyway. So, I think only removing the
WARNs might have any visible impact, but even this is probably not really that big. 

So, I think these changes won't really help adoption of interface if arguments against
is performance. If we do have a performance issue, I think arch. specific implementation
is likely the only way to considerably speed it up. 

> 
> Of course, having extra checks behind a debug option is fine.  But they should
> not be part of the base feature; the base feature should just be mitigation of
> reference count *overflows*.  It would be nice to do more, of course; but when
> the extra stuff prevents people from using refcount_t for performance reasons,
> it defeats the point of the feature in the first place.

Sure, but as I said above, I think the smaller tricks and fixes won't be convincing enough,
so their value is questionable. 

> 
> I strongly encourage anyone who has been involved in refcount_t to experiment
> with removing a reference count decrement somewhere in their kernel, then
> trying
> to exploit it to gain code execution.  If you don't know what you're trying to
> protect against, you will not know which defences work and which don't.

Well, we had successful CVEs and even exploits on this in the past. 
@Kees, do you store a list of them in the project?

What do you actually want to verify? The fact that if you manage to create use-after-free
situation, you are able to take advantage of it? 

Best Regards,
Elena.

> 
> - Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-20 18:33     ` Eric Biggers
  2017-04-21 10:55       ` Reshetova, Elena
@ 2017-04-21 10:56       ` Christoph Hellwig
  1 sibling, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2017-04-21 10:56 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Reshetova, Elena, Christoph Hellwig, axboe, james.bottomley,
	linux-kernel, linux-block, linux-scsi, linux-btrfs, peterz,
	gregkh, fujita.tomonori, mingo, clm, jbacik, dsterba

On Thu, Apr 20, 2017 at 11:33:19AM -0700, Eric Biggers wrote:
> Like I suggested months ago, how about doing an efficient implementation of
> refcount_t which doesn't use the bloated cmpxchg loop?  Then there would be no
> need for endless performance arguments.  In fact, in PaX there are already
> example implementations for several architectures.  It's unfortunate that
> they're still being ignored for some reason.

Yes.  I will also NAK other conversions until this is done, which is a
bit sad as the refcount_t API itself is very useful.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 10:55       ` Reshetova, Elena
@ 2017-04-21 14:03         ` Jens Axboe
  2017-04-21 15:22           ` Peter Zijlstra
  2017-04-21 17:11         ` Kees Cook
  2017-04-21 19:55         ` Eric Biggers
  2 siblings, 1 reply; 20+ messages in thread
From: Jens Axboe @ 2017-04-21 14:03 UTC (permalink / raw)
  To: Reshetova, Elena, Eric Biggers, Kees Cook
  Cc: Christoph Hellwig, james.bottomley, linux-kernel, linux-block,
	linux-scsi, linux-btrfs, peterz, gregkh, fujita.tomonori, mingo,
	clm, jbacik, dsterba

On 04/21/2017 04:55 AM, Reshetova, Elena wrote:
>>>> Please don't send any more conversions until those have been resolved.
>>
>> Like I suggested months ago, how about doing an efficient implementation of
>> refcount_t which doesn't use the bloated cmpxchg loop?  Then there would be
>> no
>> need for endless performance arguments.  In fact, in PaX there are already
>> example implementations for several architectures.  It's unfortunate that
>> they're still being ignored for some reason.
> 
> Providing efficient arch. specific implementation for refcount_t is
> likely the next step to make performance-sensitive places happy.
> However, in order to do it, we will need to measure for each subsystem
> a) current atomic_t usage - base measurement 
> b) pure refcount_t usage impact
> c) arch. specific refcount_t impact 

> Otherwise I have a chicken and egg problem here: people want better
> performance and want to see arch. specific code for this, but in order
> to convince maintainer in need of arch. specific code, I need to show
> that we do have indeed a performance issue. 

Sorry, but that's a big load of...

You have it so easy - the code is completely standalone, building a
small test framework around it and measuring performance in _user space_
is trivial. Don't go around benchmarking conversions in specific
subsystems. Generate numbers showing how refcount_t compares to atomic_t
for various (valid) use cases.

Once you have that, convincing people to adopt it would be so much
easier. So stop talking about excuses, and start producing some numbers.
If you can't do that, my level of interest in converting anything in
block is zero.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 14:03         ` Jens Axboe
@ 2017-04-21 15:22           ` Peter Zijlstra
  2017-04-21 16:29             ` Jens Axboe
  0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2017-04-21 15:22 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Reshetova, Elena, Eric Biggers, Kees Cook, Christoph Hellwig,
	james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba

On Fri, Apr 21, 2017 at 08:03:13AM -0600, Jens Axboe wrote:
> You have it so easy - the code is completely standalone, building a
> small test framework around it and measuring performance in _user space_
> is trivial.

Something like this you mean:

  https://lkml.kernel.org/r/20170322165144.dtidvvbxey7w5pbd@hirez.programming.kicks-ass.net

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 15:22           ` Peter Zijlstra
@ 2017-04-21 16:29             ` Jens Axboe
  0 siblings, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2017-04-21 16:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Reshetova, Elena, Eric Biggers, Kees Cook, Christoph Hellwig,
	james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba

On 04/21/2017 09:22 AM, Peter Zijlstra wrote:
> On Fri, Apr 21, 2017 at 08:03:13AM -0600, Jens Axboe wrote:
>> You have it so easy - the code is completely standalone, building a
>> small test framework around it and measuring performance in _user space_
>> is trivial.
> 
> Something like this you mean:
> 
>   https://lkml.kernel.org/r/20170322165144.dtidvvbxey7w5pbd@hirez.programming.kicks-ass.net

Yes, exactly. But it'd be great to see better coverage of the API, and
various cases of X threads running operations on the same refcount,
so we get a full view of that side too. The above looks like just
counting cycles for a single op of foo_inc().

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 10:55       ` Reshetova, Elena
  2017-04-21 14:03         ` Jens Axboe
@ 2017-04-21 17:11         ` Kees Cook
  2017-04-21 19:55         ` Eric Biggers
  2 siblings, 0 replies; 20+ messages in thread
From: Kees Cook @ 2017-04-21 17:11 UTC (permalink / raw)
  To: Reshetova, Elena, Eric Biggers, Christoph Hellwig, axboe, peterz
  Cc: james.bottomley, linux-kernel, linux-block, linux-scsi,
	linux-btrfs, gregkh, fujita.tomonori, mingo, clm, jbacik,
	dsterba

On Fri, Apr 21, 2017 at 3:55 AM, Reshetova, Elena
<elena.reshetova@intel.com> wrote:
> On Thu, Apr 20, 2017 at 11:33 AM, Eric Biggers <ebiggers3@gmail.com> wrote:
>> Of course, having extra checks behind a debug option is fine.  But they should
>> not be part of the base feature; the base feature should just be mitigation of
>> reference count *overflows*.  It would be nice to do more, of course; but when
>> the extra stuff prevents people from using refcount_t for performance reasons,
>> it defeats the point of the feature in the first place.
>
> Sure, but as I said above, I think the smaller tricks and fixes won't be convincing enough,
> so their value is questionable.
>
>> I strongly encourage anyone who has been involved in refcount_t to experiment
>> with removing a reference count decrement somewhere in their kernel, then trying
>> to exploit it to gain code execution.  If you don't know what you're trying to
>> protect against, you will not know which defences work and which don't.
>
> Well, we had successful CVEs and even exploits on this in the past.
> @Kees, do you store a list of them in the project?

Here are two from last year:
http://perception-point.io/2016/01/14/analysis-and-exploitation-of-a-linux-kernel-vulnerability-cve-2016-0728/
http://cyseclabs.com/page?n=02012016

I don't disagree with Eric on the threat model: the primary concern
for reference count protection is the overflow condition. Detecting a
prior use-after-free is certainly nice to have, but the more important
case is the initial overflow.

How about we introduce something like CONFIG_HAVE_ARCH_FAST_REFCOUNT_T
for per-arch implementations and CONFIG_FAST_REFCOUNT_T that trades
coverage for speed, and checks only the overflow condition. This gets
us the critical coverage without the changes in performance. This is
basically what PaX/grsecurity already did: there is a tiny change to
the atomic inc functions to detect the wrap.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 10:55       ` Reshetova, Elena
  2017-04-21 14:03         ` Jens Axboe
  2017-04-21 17:11         ` Kees Cook
@ 2017-04-21 19:55         ` Eric Biggers
  2017-04-21 20:22           ` Kees Cook
  2 siblings, 1 reply; 20+ messages in thread
From: Eric Biggers @ 2017-04-21 19:55 UTC (permalink / raw)
  To: Reshetova, Elena
  Cc: Kees Cook, Christoph Hellwig, axboe, james.bottomley,
	linux-kernel, linux-block, linux-scsi, linux-btrfs, peterz,
	gregkh, fujita.tomonori, mingo, clm, jbacik, dsterba

Hi Elena,

On Fri, Apr 21, 2017 at 10:55:29AM +0000, Reshetova, Elena wrote:
> > 
> > At the very least, what is there now could probably be made about twice as fast
> > by removing the checks that don't actually help mitigate refcount overflow bugs,
> > specifically all the checks in refcount_dec(), and the part of refcount_inc()
> > where it doesn't allow incrementing a 0 refcount.  Hint: if a refcount is 0, the
> > object has already been freed, so the attacker will have had the opportunity to
> > allocate it with contents they control already.
> 
> refcount_dec() is actually used very little throughout the code; it is more
> like an exception case, since in order to use it one must really be sure that
> the refcounter doesn't drop to zero. Removing the warn around it likely
> wouldn't affect much overall, and thus it is better for it to stay, to
> discourage people from the API itself :)
> 
> refcount_inc() is of course a different story, it is extensively used. I guess the perf issue
> on checking increment from zero might only come from WARNs being printed,
> but not really from an additional check here for zero since it is trivial and part of
> the needed loop anyway. So, I think only removing the
> WARNs might have any visible impact, but even this is probably not really that big. 
> 
> So, I think these changes won't really help adoption of the interface if the
> argument against it is performance. If we do have a performance issue, I think
> an arch-specific implementation is likely the only way to considerably speed
> it up.

I should have used refcount_dec_and_test() as the example, as the same applies
to both refcount_dec() and refcount_dec_and_test().  There is negligible
security benefit to have these refcount release operations checked vs. just
calling through to atomic_dec() and atomic_dec_and_test().  It's unfortunate,
but there is no known way to detect ahead of time (i.e. before exploitation) if
there are too many refcount releases, only too many refcount acquires.

The WARN is only executed if there is a bug, so naturally it's only a problem if
the functions are to be inlined, creating bloat.  The actual performance problem
is the overhead associated with using comparisons and cmpxchg's to avoid
changing a refcount that is 0 or UINT_MAX.  The more efficient approach would be
to (a) drop the check for 0, and (b) don't require full operation to be atomic,
but rather do something like "lock incl %[counter]; js <handle_error>".  Yes
it's not "atomic", and people have complained about this, but there is no
technical reason why it needs to be atomic.  Attackers do *not* care whether
your exploit mitigation is theoretically "atomic" or not, they only care whether
it works or not.  And besides, it's not even "atomic_t" anymore, it's
"refcount_t".  

> > Of course, having extra checks behind a debug option is fine.  But they should
> > not be part of the base feature; the base feature should just be mitigation of
> > reference count *overflows*.  It would be nice to do more, of course; but when
> > the extra stuff prevents people from using refcount_t for performance reasons,
> > it defeats the point of the feature in the first place.
> 
> Sure, but as I said above, I think the smaller tricks and fixes won't be convincing enough,
> so their value is questionable. 

This makes no sense, as the main point of the feature is supposed to be the
security improvement.  As-is, the extra debugging stuff is actually preventing
the security improvement from being adopted, which is unfortunate.

- Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 19:55         ` Eric Biggers
@ 2017-04-21 20:22           ` Kees Cook
  2017-04-21 21:27             ` James Bottomley
  0 siblings, 1 reply; 20+ messages in thread
From: Kees Cook @ 2017-04-21 20:22 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Reshetova, Elena, Christoph Hellwig, axboe, james.bottomley,
	linux-kernel, linux-block, linux-scsi, linux-btrfs, peterz,
	gregkh, fujita.tomonori, mingo, clm, jbacik, dsterba

On Fri, Apr 21, 2017 at 12:55 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> Hi Elena,
>
> On Fri, Apr 21, 2017 at 10:55:29AM +0000, Reshetova, Elena wrote:
>> >
>> > At the very least, what is there now could probably be made about twice as fast
>> > by removing the checks that don't actually help mitigate refcount overflow bugs,
>> > specifically all the checks in refcount_dec(), and the part of refcount_inc()
>> > where it doesn't allow incrementing a 0 refcount.  Hint: if a refcount is 0, the
>> > object has already been freed, so the attacker will have had the opportunity to
>> > allocate it with contents they control already.
>>
>> refcount_dec() is used very little through the code actually, it is more like an exception
>> case since in order to use it one must really be sure that refcounter doesn't drop to zero.
>> Removing the warn around it wouldn't likely affect much overall and thus it is better to
>> stay to discourage people of API itself :)
>>
>> refcount_inc() is of course a different story, it is extensively used. I guess the perf issue
>> on checking increment from zero might only come from WARNs being printed,
>> but not really from an additional check here for zero since it is trivial and part of
>> the needed loop anyway. So, I think only removing the
>> WARNs might have any visible impact, but even this is probably not really that big.
>>
>> So, I think these changes won't really help adoption of interface if arguments against
>> is performance. If we do have a performance issue, I think arch. specific implementation
>> is likely the only way to considerably speed it up.
>
> I should have used refcount_dec_and_test() as the example, as the same applies
> to both refcount_dec() and refcount_dec_and_test().  There is negligible
> security benefit to have these refcount release operations checked vs. just
> calling through to atomic_dec() and atomic_dec_and_test().  It's unfortunate,
> but there is no known way to detect ahead of time (i.e. before exploitation) if
> there are too many refcount releases, only too many refcount acquires.
>
> The WARN is only executed if there is a bug, so naturally it's only a problem if
> the functions are to be inlined, creating bloat.  The actual performance problem
> is the overhead associated with using comparisons and cmpxchg's to avoid
> changing a refcount that is 0 or UINT_MAX.  The more efficient approach would be
> to (a) drop the check for 0, and (b) don't require full operation to be atomic,
> but rather do something like "lock incl %[counter]; js <handle_error>".  Yes
> it's not "atomic", and people have complained about this, but there is no
> technical reason why it needs to be atomic.  Attackers do *not* care whether
> your exploit mitigation is theoretically "atomic" or not, they only care whether
> it works or not.  And besides, it's not even "atomic_t" anymore, it's
> "refcount_t".
>
>> > Of course, having extra checks behind a debug option is fine.  But they should
>> > not be part of the base feature; the base feature should just be mitigation of
>> > reference count *overflows*.  It would be nice to do more, of course; but when
>> > the extra stuff prevents people from using refcount_t for performance reasons,
>> > it defeats the point of the feature in the first place.
>>
>> Sure, but as I said above, I think the smaller tricks and fixes won't be convincing enough,
>> so their value is questionable.
>
> This makes no sense, as the main point of the feature is supposed to be the
> security improvement.  As-is, the extra debugging stuff is actually preventing
> the security improvement from being adopted, which is unfortunate.

We've been trying to handle the conflicting desires of those wanting
very precise refcounting implementation and gaining the security
protections. Ultimately, the best way forward seemed to be to first
land the precise refcounting implementation, and start conversion
until we ran into concerns over performance. Now, since we're here, we
can move forward with getting a fast implementation that provides the
desired security protections without too greatly messing with the
refcount API.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 20:22           ` Kees Cook
@ 2017-04-21 21:27             ` James Bottomley
  2017-04-21 21:30               ` Kees Cook
  0 siblings, 1 reply; 20+ messages in thread
From: James Bottomley @ 2017-04-21 21:27 UTC (permalink / raw)
  To: Kees Cook, Eric Biggers
  Cc: Reshetova, Elena, Christoph Hellwig, axboe, linux-kernel,
	linux-block, linux-scsi, linux-btrfs, peterz, gregkh,
	fujita.tomonori, mingo, clm, jbacik, dsterba

On Fri, 2017-04-21 at 13:22 -0700, Kees Cook wrote:
> On Fri, Apr 21, 2017 at 12:55 PM, Eric Biggers <ebiggers3@gmail.com> 
> wrote:
> > > > Of course, having extra checks behind a debug option is fine. 
> > > >  But they should not be part of the base feature; the base 
> > > > feature should just be mitigation of reference count
> > > > *overflows*.  It would be nice to do more, of
> > > > course; but when the extra stuff prevents people from using 
> > > > refcount_t for performance reasons, it defeats the point of the
> > > > feature in the first place.
> > > 
> > > Sure, but as I said above, I think the smaller tricks and fixes 
> > > won't be convincing enough, so their value is questionable.
> > 
> > This makes no sense, as the main point of the feature is supposed 
> > to be the security improvement.  As-is, the extra debugging stuff 
> > is actually preventing the security improvement from being adopted,
> > which is unfortunate.
> 
> We've been trying to handle the conflicting desires of those wanting
> very precise refcounting implementation and gaining the security
> protections. Ultimately, the best way forward seemed to be to first
> land the precise refcounting implementation, and start conversion
> until we ran into concerns over performance. Now, since we're here, 
> we can move forward with getting a fast implementation that provides 
> the desired security protections without too greatly messing with the
> refcount API.

But that's not what it really looks like.  What it looks like is
someone came up with a new API and is now intent on forcing it through
the kernel in about 500 patches using security as the hammer.

If we were really concerned about security first, we'd have fixed the
atomic overflow problem in the atomics and then built the precise
refcounting on that and tried to persuade people of the merits.

Why can't we still do this?  It looks like the overflow protection will
add only a couple of cycles to the atomics.  We can move the current
version to an unchecked variant which can be used either in truly
performance critical regions with no security implications or if
someone really needs the atomic to overflow.  From my point of view it
would give us the 90% case (security) and stop this endless argument
over the fast paths.  Subsystems which have already moved to refcount
would stay there and the rest get to evaluate a migration on the merits
of the operational utility.

James

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 21:27             ` James Bottomley
@ 2017-04-21 21:30               ` Kees Cook
  2017-04-21 22:01                 ` James Bottomley
  0 siblings, 1 reply; 20+ messages in thread
From: Kees Cook @ 2017-04-21 21:30 UTC (permalink / raw)
  To: James Bottomley
  Cc: Eric Biggers, Reshetova, Elena, Christoph Hellwig, axboe,
	linux-kernel, linux-block, linux-scsi, linux-btrfs, peterz,
	gregkh, fujita.tomonori, mingo, clm, jbacik, dsterba

On Fri, Apr 21, 2017 at 2:27 PM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Fri, 2017-04-21 at 13:22 -0700, Kees Cook wrote:
>> On Fri, Apr 21, 2017 at 12:55 PM, Eric Biggers <ebiggers3@gmail.com>
>> wrote:
>> > > > Of course, having extra checks behind a debug option is fine.
>> > > >  But they should not be part of the base feature; the base
>> > > > feature should just be mitigation of reference count
>> > > > *overflows*.  It would be nice to do more, of
>> > > > course; but when the extra stuff prevents people from using
>> > > > refcount_t for performance reasons, it defeats the point of the
>> > > > feature in the first place.
>> > >
>> > > Sure, but as I said above, I think the smaller tricks and fixes
>> > > won't be convincing enough, so their value is questionable.
>> >
>> > This makes no sense, as the main point of the feature is supposed
>> > to be the security improvement.  As-is, the extra debugging stuff
>> > is actually preventing the security improvement from being adopted,
>> > which is unfortunate.
>>
>> We've been trying to handle the conflicting desires of those wanting
>> very precise refcounting implementation and gaining the security
>> protections. Ultimately, the best way forward seemed to be to first
>> land the precise refcounting implementation, and start conversion
>> until we ran into concerns over performance. Now, since we're here,
>> we can move forward with getting a fast implementation that provides
>> the desired security protections without too greatly messing with the
>> refcount API.
>
> But that's not what it really looks like.  What it looks like is
> someone came up with a new API and is now intent on forcing it through
> the kernel in about 500 patches using security as the hammer.

The intent is to split refcounting and statistical counters away from
atomics, since they have distinct APIs. This will let us audit the
remaining atomic uses much more easily.

> If we were really concerned about security first, we'd have fixed the
> atomic overflow problem in the atomics and then built the precise
> refcounting on that and tried to persuade people of the merits.

I agree, but this approach was NAKed by multiple atomics maintainers.

> Why can't we still do this?  It looks like the overflow protection will
> add only a couple of cycles to the atomics.  We can move the current
> version to an unchecked variant which can be used either in truly
> performance critical regions with no security implications or if
> someone really needs the atomic to overflow.  From my point of view it
> would give us the 90% case (security) and stop this endless argument
> over the fast paths.  Subsystems which have already moved to refcount
> would stay there and the rest get to evaluate a migration on the merits
> of the operational utility.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 0/5] v2: block subsystem refcounter conversions
  2017-04-21 21:30               ` Kees Cook
@ 2017-04-21 22:01                 ` James Bottomley
  0 siblings, 0 replies; 20+ messages in thread
From: James Bottomley @ 2017-04-21 22:01 UTC (permalink / raw)
  To: Kees Cook
  Cc: Eric Biggers, Reshetova, Elena, Christoph Hellwig, axboe,
	linux-kernel, linux-block, linux-scsi, linux-btrfs, peterz,
	gregkh, fujita.tomonori, mingo, clm, jbacik, dsterba

On Fri, 2017-04-21 at 14:30 -0700, Kees Cook wrote:
> On Fri, Apr 21, 2017 at 2:27 PM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Fri, 2017-04-21 at 13:22 -0700, Kees Cook wrote:
> > > On Fri, Apr 21, 2017 at 12:55 PM, Eric Biggers <
> > > ebiggers3@gmail.com>
> > > wrote:
> > > > > > Of course, having extra checks behind a debug option is 
> > > > > > fine.  But they should not be part of the base feature; the 
> > > > > > base feature should just be mitigation of reference count
> > > > > > *overflows*.  It would be nice to do more, of course; but 
> > > > > > when the extra stuff prevents people from using refcount_t 
> > > > > > for performance reasons, it defeats the point of the
> > > > > > feature in the first place.
> > > > > 
> > > > > Sure, but as I said above, I think the smaller tricks and 
> > > > > fixes won't be convincing enough, so their value is
> > > > > questionable.
> > > > 
> > > > This makes no sense, as the main point of the feature is 
> > > > supposed to be the security improvement.  As-is, the extra 
> > > > debugging stuff is actually preventing the security improvement 
> > > > from being adopted, which is unfortunate.
> > > 
> > > We've been trying to handle the conflicting desires of those 
> > > wanting very precise refcounting implementation and gaining the 
> > > security protections. Ultimately, the best way forward seemed to 
> > > be to first land the precise refcounting implementation, and 
> > > start conversion until we ran into concerns over performance. 
> > > Now, since we're here, we can move forward with getting a fast 
> > > implementation that provides the desired security protections 
> > > without too greatly messing with the refcount API.
> > 
> > But that's not what it really looks like.  What it looks like is
> > someone came up with a new API and is now intent on forcing it 
> > through the kernel in about 500 patches using security as the
> > hammer.
> 
> The intent is to split refcounting and statistical counters away from
> atomics, since they have distinct APIs. This will let us audit the
> remaining atomic uses much more easily.

But the security problem is counter overflow, right?  That problem, as
far as I can see exists in the atomics as well.  I'm sure there might
be one or two corner cases depending on overflow actually working, but
I can't think of them.

The refcount vs atomic comes on the precise meaning of decrement to
zero.  I'm not saying there's no benefit to detecting the condition,
but the security problem looks to be much more pressing which is why I
think this can be argued on the merits later.

> > If we were really concerned about security first, we'd have fixed 
> > the atomic overflow problem in the atomics and then built the 
> > precise refcounting on that and tried to persuade people of the
> > merits.
> 
> I agree, but this approach was NAKed by multiple atomics maintainers.

Overriding that decision by trying to convince all the consumers to
move to a new API doesn't seem to be going so well either.  Perhaps we
could assist you in changing the minds of the atomics maintainers ...
what was the primary problem?  The additional couple of cycles or the
fact that some use cases want overflow (or something else)?

James

> > Why can't we still do this?  It looks like the overflow protection 
> > will add only a couple of cycles to the atomics.  We can move the
> > current version to an unchecked variant which can be used either in 
> > truly performance critical regions with no security implications or 
> > if someone really needs the atomic to overflow.  From my point of 
> > view it would give us the 90% case (security) and stop this endless
> > argument over the fast paths.  Subsystems which have already moved 
> > to refcount would stay there and the rest get to evaluate a 
> > migration on the merits of the operational utility.
> 
> -Kees
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2017-04-21 22:01 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-20 11:27 [PATCH 0/5] v2: block subsystem refcounter conversions Elena Reshetova
2017-04-20 11:27 ` [PATCH 1/5] block: convert bio.__bi_cnt from atomic_t to refcount_t Elena Reshetova
2017-04-20 11:27 ` [PATCH 2/5] block: convert blk_queue_tag.refcnt " Elena Reshetova
2017-04-20 11:27 ` [PATCH 3/5] block: convert blkcg_gq.refcnt " Elena Reshetova
2017-04-20 11:27 ` [PATCH 4/5] block: convert io_context.active_ref " Elena Reshetova
2017-04-20 11:27 ` [PATCH 5/5] block: convert bsg_device.ref_count " Elena Reshetova
2017-04-20 13:56 ` [PATCH 0/5] v2: block subsystem refcounter conversions Christoph Hellwig
2017-04-20 16:10   ` Reshetova, Elena
2017-04-20 18:33     ` Eric Biggers
2017-04-21 10:55       ` Reshetova, Elena
2017-04-21 14:03         ` Jens Axboe
2017-04-21 15:22           ` Peter Zijlstra
2017-04-21 16:29             ` Jens Axboe
2017-04-21 17:11         ` Kees Cook
2017-04-21 19:55         ` Eric Biggers
2017-04-21 20:22           ` Kees Cook
2017-04-21 21:27             ` James Bottomley
2017-04-21 21:30               ` Kees Cook
2017-04-21 22:01                 ` James Bottomley
2017-04-21 10:56       ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).