* [PATCH] service: fix race in service on app lcore function
@ 2017-10-31 11:49 Harry van Haaren
  2017-11-01 17:09 ` Bruce Richardson
  2017-11-01 18:48 ` [PATCH v2] " Harry van Haaren
  0 siblings, 2 replies; 7+ messages in thread
From: Harry van Haaren @ 2017-10-31 11:49 UTC (permalink / raw)
  To: dev; +Cc: pbhagavatula, thomas, Harry van Haaren

This commit fixes a possible race condition if an application
uses the service-cores infrastructure and the function to run
a service on an application lcore at the same time.

The fix is to change the num_mapped_cores variable to be an
atomic variable. This allows concurrent accesses by multiple
threads to a service using rte_service_run_iter_on_app_lcore()
to detect whether another core is currently mapped to the
service, and to refuse to run it if the service is not
multi-thread safe.

No performance impact is expected, as the service-core
mappings change at control-path frequency; hence the change
from an ordinary write to an atomic write will not have any
significant impact.

Two unit tests were added to verify the behaviour of the
function to run a service on an application core, testing both
a multi-thread safe service, and a multi-thread unsafe service.

The doxygen API documentation for the function has been updated
to reflect the current and correct behaviour.

Fixes: e9139a32f6e8 ("service: add function to run on app lcore")

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

 lib/librte_eal/common/include/rte_service.h |  19 +++--
 lib/librte_eal/common/rte_service.c         |  37 ++++++++--
 test/test/test_service_cores.c              | 109 ++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+), 14 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_service.h b/lib/librte_eal/common/include/rte_service.h
index d9de5ad..0183013 100644
--- a/lib/librte_eal/common/include/rte_service.h
+++ b/lib/librte_eal/common/include/rte_service.h
@@ -232,11 +232,18 @@ int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable);
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice
  *
- * This function runs a service callback from a non-service lcore context.
- * The *id* of the service to be run is passed in, and the service-callback
- * is executed on the calling lcore immediately if possible. If the service is
- * not multi-thread capable and another thread is currently executing it, this
- * function returns without running the callback.
+ * This function runs a service callback from a non-service lcore.
+ *
+ * This function is designed to enable gradual porting to service cores, and
+ * to enable unit tests to verify a service behaves as expected.
+ *
+ * When called, this function ensures that the service identified by *id* is
+ * safe to run on this lcore. Multi-thread safe services are invoked even if
+ * other cores are running them as they are multi-thread safe. Multi-thread
+ * unsafe services are not invoked if a service-lcore is mapped to the service,
+ * and this function will return -EBUSY to indicate this to the user. If no
+ * other lcore is running a multi-thread unsafe service, it will be run by the
+ * calling lcore.
  *
  * Note that any thread calling this function MUST be a DPDK EAL thread, as
  * the *rte_lcore_id* function is used to access internal data structures.
@@ -244,7 +251,7 @@ int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable);
  * @retval 0 Service was run on the calling thread successfully
  * @retval -EBUSY Another lcore is executing the service, and it is not a
  *         multi-thread safe service, so the service was not run on this lcore
- * @retval -ENOEXEC Service is not in a runnable state
+ * @retval -ENOEXEC Service is not in a run-able state
  * @retval -EINVAL Invalid service id
  */
 int32_t rte_service_run_iter_on_app_lcore(uint32_t id);
diff --git a/lib/librte_eal/common/rte_service.c b/lib/librte_eal/common/rte_service.c
index f17bf4b..ca5a01d 100644
--- a/lib/librte_eal/common/rte_service.c
+++ b/lib/librte_eal/common/rte_service.c
@@ -77,7 +77,7 @@ struct rte_service_spec_impl {
 	uint8_t internal_flags;
 
 	/* per service statistics */
-	uint32_t num_mapped_cores;
+	rte_atomic32_t num_mapped_cores;
 	uint64_t calls;
 	uint64_t cycles_spent;
 } __rte_cache_aligned;
@@ -325,7 +325,7 @@ rte_service_runstate_get(uint32_t id)
 	rte_smp_rmb();
 
 	int check_disabled = !(s->internal_flags & SERVICE_F_START_CHECK);
-	int lcore_mapped = (s->num_mapped_cores > 0);
+	int lcore_mapped = (rte_atomic32_read(&s->num_mapped_cores) > 0);
 
 	return (s->app_runstate == RUNSTATE_RUNNING) &&
 		(s->comp_runstate == RUNSTATE_RUNNING) &&
@@ -365,7 +365,7 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
 	 * mapped, atomic ops are not required.
 	 */
 	const int use_atomics = (service_mt_safe(s) == 0) &&
-				(s->num_mapped_cores > 1);
+				(rte_atomic32_read(&s->num_mapped_cores) > 1);
 	if (use_atomics) {
 		if (!rte_atomic32_cmpset((uint32_t *)&s->execute_lock, 0, 1))
 			return -EBUSY;
@@ -381,8 +381,28 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
 int32_t rte_service_run_iter_on_app_lcore(uint32_t id)
 {
 	/* run service on calling core, using all-ones as the service mask */
+	if (!service_valid(id))
+		return -EINVAL;
+
 	struct core_state *cs = &lcore_states[rte_lcore_id()];
-	return service_run(id, cs, UINT64_MAX);
+	struct rte_service_spec_impl *s = &rte_services[id];
+
+	/* Atomically add this core to the mapped cores first, then examine if
+	 * we can run the service. This avoids a race condition between
+	 * checking the value, and atomically adding to the mapped count.
+	 */
+	rte_atomic32_inc(&s->num_mapped_cores);
+
+	if (service_mt_safe(s) == 0 &&
+			rte_atomic32_read(&s->num_mapped_cores) > 1) {
+		rte_atomic32_dec(&s->num_mapped_cores);
+		return -EBUSY;
+	}
+
+	int ret = service_run(id, cs, UINT64_MAX);
+	rte_atomic32_dec(&s->num_mapped_cores);
+
+	return ret;
 }
 
 static int32_t
@@ -522,10 +542,10 @@ service_update(struct rte_service_spec *service, uint32_t lcore,
 	if (set) {
 		if (*set) {
 			lcore_states[lcore].service_mask |= sid_mask;
-			rte_services[sid].num_mapped_cores++;
+			rte_atomic32_inc(&rte_services[sid].num_mapped_cores);
 		} else {
 			lcore_states[lcore].service_mask &= ~(sid_mask);
-			rte_services[sid].num_mapped_cores--;
+			rte_atomic32_dec(&rte_services[sid].num_mapped_cores);
 		}
 	}
 
@@ -568,7 +588,7 @@ int32_t rte_service_lcore_reset_all(void)
 		lcore_states[i].runstate = RUNSTATE_STOPPED;
 	}
 	for (i = 0; i < RTE_SERVICE_NUM_MAX; i++)
-		rte_services[i].num_mapped_cores = 0;
+		rte_atomic32_set(&rte_services[i].num_mapped_cores, 0);
 
 	rte_smp_wmb();
 
@@ -664,7 +684,8 @@ rte_service_lcore_stop(uint32_t lcore)
 	for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
 		int32_t enabled = service_mask & (UINT64_C(1) << i);
 		int32_t service_running = rte_service_runstate_get(i);
-		int32_t only_core = rte_services[i].num_mapped_cores == 1;
+		int32_t only_core = (1 ==
+			rte_atomic32_read(&rte_services[i].num_mapped_cores));
 
 		/* if the core is mapped, and the service is running, and this
 		 * is the only core that is mapped, the service would cease to
diff --git a/test/test/test_service_cores.c b/test/test/test_service_cores.c
index ee8313e..6d7542b 100644
--- a/test/test/test_service_cores.c
+++ b/test/test/test_service_cores.c
@@ -548,6 +548,113 @@ service_mt_unsafe_poll(void)
 	return TEST_SUCCESS;
 }
 
+static int32_t
+delay_as_a_mt_safe_service(void *args)
+{
+	RTE_SET_USED(args);
+	uint32_t *params = args;
+
+	/* retrieve done flag and atomic lock to inc/dec */
+	uint32_t *done = &params[0];
+	rte_atomic32_t *lock = (rte_atomic32_t *)&params[1];
+
+	while (!*done) {
+		rte_atomic32_inc(lock);
+		rte_delay_us(500);
+		if (rte_atomic32_read(lock) > 1)
+			/* pass: second core has simultaneously incremented */
+			*done = 1;
+		rte_atomic32_dec(lock);
+	}
+
+	return 0;
+}
+
+static int32_t
+delay_as_a_service(void *args)
+{
+	uint32_t *done = (uint32_t *)args;
+	while (!*done)
+		rte_delay_ms(5);
+	return 0;
+}
+
+static int
+service_run_on_app_core_func(void *arg)
+{
+	uint32_t *delay_service_id = (uint32_t *)arg;
+	return rte_service_run_iter_on_app_lcore(*delay_service_id);
+}
+
+static int
+service_app_lcore_poll_impl(const int mt_safe)
+{
+	uint32_t params[2] = {0};
+
+	struct rte_service_spec service;
+	memset(&service, 0, sizeof(struct rte_service_spec));
+	snprintf(service.name, sizeof(service.name), MT_SAFE_SERVICE_NAME);
+	if (mt_safe) {
+		service.callback = delay_as_a_mt_safe_service;
+		service.callback_userdata = params;
+		service.capabilities |= RTE_SERVICE_CAP_MT_SAFE;
+	} else {
+		service.callback = delay_as_a_service;
+		service.callback_userdata = &params;
+	}
+
+	uint32_t id;
+	TEST_ASSERT_EQUAL(0, rte_service_component_register(&service, &id),
+			"Register of app lcore delay service failed");
+
+	rte_service_component_runstate_set(id, 1);
+	rte_service_runstate_set(id, 1);
+
+	uint32_t app_core2 = rte_get_next_lcore(slcore_id, 1, 1);
+	int app_core2_ret = rte_eal_remote_launch(service_run_on_app_core_func,
+						  &id, app_core2);
+
+	rte_delay_ms(100);
+
+	int app_core1_ret = service_run_on_app_core_func(&id);
+
+	/* flag done, then wait for the spawned 2nd core to return */
+	params[0] = 1;
+	rte_eal_mp_wait_lcore();
+
+	/* core two gets launched first - and should hold the service lock */
+	TEST_ASSERT_EQUAL(0, app_core2_ret,
+			"App core2 : run service didn't return zero");
+
+	if (mt_safe) {
+		/* mt safe should have both cores return 0 for success */
+		TEST_ASSERT_EQUAL(0, app_core1_ret,
+				"MT Safe: App core1 didn't return 0");
+	} else {
+		/* core one attempts to run later - should be blocked */
+		TEST_ASSERT_EQUAL(-EBUSY, app_core1_ret,
+				"MT Unsafe: App core1 didn't return -EBUSY");
+	}
+
+	unregister_all();
+
+	return TEST_SUCCESS;
+}
+
+static int
+service_app_lcore_mt_safe(void)
+{
+	const int mt_safe = 1;
+	return service_app_lcore_poll_impl(mt_safe);
+}
+
+static int
+service_app_lcore_mt_unsafe(void)
+{
+	const int mt_safe = 0;
+	return service_app_lcore_poll_impl(mt_safe);
+}
+
 /* start and stop a service core - ensuring it goes back to sleep */
 static int
 service_lcore_start_stop(void)
@@ -613,6 +720,8 @@ static struct unit_test_suite service_tests  = {
 		TEST_CASE_ST(dummy_register, NULL, service_lcore_en_dis_able),
 		TEST_CASE_ST(dummy_register, NULL, service_mt_unsafe_poll),
 		TEST_CASE_ST(dummy_register, NULL, service_mt_safe_poll),
+		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
+		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };
-- 
2.7.4


* Re: [PATCH] service: fix race in service on app lcore function
  2017-10-31 11:49 [PATCH] service: fix race in service on app lcore function Harry van Haaren
@ 2017-11-01 17:09 ` Bruce Richardson
  2017-11-01 17:59   ` Van Haaren, Harry
  2017-11-01 18:48 ` [PATCH v2] " Harry van Haaren
  1 sibling, 1 reply; 7+ messages in thread
From: Bruce Richardson @ 2017-11-01 17:09 UTC (permalink / raw)
  To: Harry van Haaren; +Cc: dev, pbhagavatula, thomas

On Tue, Oct 31, 2017 at 11:49:02AM +0000, Harry van Haaren wrote:
> This commit fixes a possible race condition if an application
> uses the service-cores infrastructure and the function to run
> a service on an application lcore at the same time.
> 
> The fix is to change the num_mapped_cores variable to be an
> atomic variable. This allows concurrent accesses by multiple
> threads to a service using rte_service_run_iter_on_app_lcore()
> to detect whether another core is currently mapped to the
> service, and to refuse to run it if the service is not
> multi-thread safe.
> 
> No performance impact is expected, as the service-core
> mappings change at control-path frequency; hence the change
> from an ordinary write to an atomic write will not have any
> significant impact.
> 
> Two unit tests were added to verify the behaviour of the
> function to run a service on an application core, testing both
> a multi-thread safe service, and a multi-thread unsafe service.
> 
> The doxygen API documentation for the function has been updated
> to reflect the current and correct behaviour.
> 
> Fixes: e9139a32f6e8 ("service: add function to run on app lcore")
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> 
> ---
> 
<snip>
> @@ -381,8 +381,28 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
>  int32_t rte_service_run_iter_on_app_lcore(uint32_t id)
>  {
>  	/* run service on calling core, using all-ones as the service mask */
> +	if (!service_valid(id))
> +		return -EINVAL;
> +
>  	struct core_state *cs = &lcore_states[rte_lcore_id()];
> -	return service_run(id, cs, UINT64_MAX);
> +	struct rte_service_spec_impl *s = &rte_services[id];
> +
> +	/* Atomically add this core to the mapped cores first, then examine if
> +	 * we can run the service. This avoids a race condition between
> +	 * checking the value, and atomically adding to the mapped count.
> +	 */
> +	rte_atomic32_inc(&s->num_mapped_cores);
> +
> +	if (service_mt_safe(s) == 0 &&
> +			rte_atomic32_read(&s->num_mapped_cores) > 1) {
> +		rte_atomic32_dec(&s->num_mapped_cores);
> +		return -EBUSY;
> +	}
> +
> +	int ret = service_run(id, cs, UINT64_MAX);
> +	rte_atomic32_dec(&s->num_mapped_cores);
> +
> +	return ret;
>  }

Do we really need to do an atomic inc and dec in this function? If we
check that there are no service cores mapped, would that not be enough
for safety? If an app core is calling a service, the control plane is
unlikely to decide to start spawning off a service core for that
service simultaneously. Manipulating the count is safer, yes, but unlike
the other changes in this patch, this one will affect performance, so I
think we can go without. Similarly, for multiple data plane threads
calling the same service simultaneously: everything else on the data
plane is done without locks, so I think this should be too.
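
Roughly what I have in mind - an illustrative sketch only, untested,
reusing the internal names from the patch above:

    int32_t rte_service_run_iter_on_app_lcore(uint32_t id)
    {
            if (!service_valid(id))
                    return -EINVAL;

            struct core_state *cs = &lcore_states[rte_lcore_id()];
            struct rte_service_spec_impl *s = &rte_services[id];

            /* check-only: refuse if any core is mapped to an MT-unsafe
             * service, but take no lock and touch no shared counter */
            if (service_mt_safe(s) == 0 &&
                            rte_atomic32_read(&s->num_mapped_cores) > 0)
                    return -EBUSY;

            return service_run(id, cs, UINT64_MAX);
    }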

/Bruce


* Re: [PATCH] service: fix race in service on app lcore function
  2017-11-01 17:09 ` Bruce Richardson
@ 2017-11-01 17:59   ` Van Haaren, Harry
  2017-11-02  9:10     ` Bruce Richardson
  0 siblings, 1 reply; 7+ messages in thread
From: Van Haaren, Harry @ 2017-11-01 17:59 UTC (permalink / raw)
  To: Richardson, Bruce; +Cc: dev, pbhagavatula, thomas

> From: Richardson, Bruce
> Sent: Wednesday, November 1, 2017 5:09 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: dev@dpdk.org; pbhagavatula@caviumnetworks.com; thomas@monjalon.net
> Subject: Re: [dpdk-dev] [PATCH] service: fix race in service on app lcore
> function
> 
> On Tue, Oct 31, 2017 at 11:49:02AM +0000, Harry van Haaren wrote:
> > This commit fixes a possible race condition if an application
> > uses the service-cores infrastructure and the function to run
> > a service on an application lcore at the same time.
> >
> > The fix is to change the num_mapped_cores variable to be an
> > atomic variable. This allows concurrent accesses by multiple
> > threads to a service using rte_service_run_iter_on_app_lcore()
> > to detect whether another core is currently mapped to the
> > service, and to refuse to run it if the service is not
> > multi-thread safe.
> >
> > No performance impact is expected, as the service-core
> > mappings change at control-path frequency; hence the change
> > from an ordinary write to an atomic write will not have any
> > significant impact.
> >
> > Two unit tests were added to verify the behaviour of the
> > function to run a service on an application core, testing both
> > a multi-thread safe service, and a multi-thread unsafe service.
> >
> > The doxygen API documentation for the function has been updated
> > to reflect the current and correct behaviour.
> >
> > Fixes: e9139a32f6e8 ("service: add function to run on app lcore")
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> >
> > ---
> >
> <snip>
> > @@ -381,8 +381,28 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
> >  int32_t rte_service_run_iter_on_app_lcore(uint32_t id)
> >  {
> >  	/* run service on calling core, using all-ones as the service mask */
> > +	if (!service_valid(id))
> > +		return -EINVAL;
> > +
> >  	struct core_state *cs = &lcore_states[rte_lcore_id()];
> > -	return service_run(id, cs, UINT64_MAX);
> > +	struct rte_service_spec_impl *s = &rte_services[id];
> > +
> > +	/* Atomically add this core to the mapped cores first, then examine if
> > +	 * we can run the service. This avoids a race condition between
> > +	 * checking the value, and atomically adding to the mapped count.
> > +	 */
> > +	rte_atomic32_inc(&s->num_mapped_cores);
> > +
> > +	if (service_mt_safe(s) == 0 &&
> > +			rte_atomic32_read(&s->num_mapped_cores) > 1) {
> > +		rte_atomic32_dec(&s->num_mapped_cores);
> > +		return -EBUSY;
> > +	}
> > +
> > +	int ret = service_run(id, cs, UINT64_MAX);
> > +	rte_atomic32_dec(&s->num_mapped_cores);
> > +
> > +	return ret;
> >  }
> 
> Do we really need to do an atomic inc and dec in this function? If we
> check that there are no service cores mapped, would that not be enough
> for safety? If an app core is calling a service, the control plane is
> unlikely to decide to start spawning off a service core for that
> service simultaneously. Manipulating the count is safer, yes, but unlike
> the other changes in this patch, this one will affect performance, so I
> think we can go without. Similarly, for multiple data plane threads
> calling the same service simultaneously: everything else on the data
> plane is done without locks, so I think this should be too.


I think the inc-dec is required with the current code; what if *two* application
cores are calling the same service? If we don't have the atomic inc-dec of
num_mapped_cores, both app cores could simultaneously run a multi-thread unsafe service.
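
To make that concrete, here is the racy check-then-act ordering as a
sketch (illustrative only - not the patch code, which increments first):

    /* both app lcores can pass this check before either increments */
    if (service_mt_safe(s) == 0 &&
                    rte_atomic32_read(&s->num_mapped_cores) > 0)
            return -EBUSY;
    /* <-- window: a second app lcore reads the same count here */
    rte_atomic32_inc(&s->num_mapped_cores);
    ret = service_run(id, cs, UINT64_MAX); /* both cores may reach this */
    rte_atomic32_dec(&s->num_mapped_cores);

Incrementing num_mapped_cores *before* the check, as the patch does,
closes that window: the second lcore's increment pushes the count above
one, so one of the two callers backs off with -EBUSY.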

The other option is to demand that the *app* must serialize access to a particular service
using this function - but in my eyes that limits the usefulness of this function, because
it pushes the problem up a level instead of solving it.

I guess an application "in the know" of how its cores are acting would prefer not
to have the atomics at the service-cores library level. As a compromise, the use of
these atomics could be a parameter to the rte_service_run_iter_on_app_lcore() function.

That would allow the application to choose whether it wants to avail itself of the
serialization in the service-cores library, or take responsibility for serializing
access to multi-thread unsafe services itself.
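
Roughly the shape I'm thinking of - a sketch, the final name and exact
semantics may differ:

    /* serialize_mt_unsafe != 0: the library uses atomics to guard
     * multi-thread unsafe services; 0: the application guarantees
     * serialized access itself and the atomics are skipped */
    int32_t rte_service_run_iter_on_app_lcore(uint32_t id,
                    uint32_t serialize_mt_unsafe);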


I'll spin up a v2 with a parameter to the function.


Thanks for review, -Harry


* [PATCH v2] service: fix race in service on app lcore function
  2017-10-31 11:49 [PATCH] service: fix race in service on app lcore function Harry van Haaren
  2017-11-01 17:09 ` Bruce Richardson
@ 2017-11-01 18:48 ` Harry van Haaren
  2017-11-02  9:41   ` Bruce Richardson
  1 sibling, 1 reply; 7+ messages in thread
From: Harry van Haaren @ 2017-11-01 18:48 UTC (permalink / raw)
  To: dev; +Cc: thomas, bruce.richardson, pbhagavatula, Harry van Haaren

This commit fixes a possible race condition if an application
uses the service-cores infrastructure and the function to run
a service on an application lcore at the same time.

The fix is to change the num_mapped_cores variable to be an
atomic variable. This allows concurrent accesses by multiple
threads to a service using rte_service_run_iter_on_app_lcore()
to detect whether another core is currently mapped to the
service, and to refuse to run it if the service is not
multi-thread safe.

The run iteration on app lcore function takes two arguments: the
service id to run, and whether atomics should be used to serialize
access to multi-thread unsafe services. This allows applications to
choose if they wish to use the service-cores feature, or if they
take responsibility themselves for serializing invocation of a
service. See the doxygen documentation for more details.
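
For example (illustrative usage only, not part of the patch):

    /* let the library serialize access to an MT-unsafe service */
    int ret = rte_service_run_iter_on_app_lcore(id, 1);

    /* the application guarantees that only one lcore invokes this
     * service, so the atomic accounting can be skipped */
    ret = rte_service_run_iter_on_app_lcore(id, 0);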

Two unit tests were added to verify the behaviour of the
function to run a service on an application core, testing both
a multi-thread safe service, and a multi-thread unsafe service.

The doxygen API documentation for the function has been updated
to reflect the current and correct behaviour.

Fixes: e9139a32f6e8 ("service: add function to run on app lcore")

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

v2:
- Rework function to take a parameter that allows the application
  to select if it requires atomics to be used, or if the application
  wishes to take responsibility for serializing instead.

---
 examples/eventdev_pipeline_sw_pmd/main.c    |   2 +-
 lib/librte_eal/common/include/rte_service.h |  32 ++++++--
 lib/librte_eal/common/rte_service.c         |  44 ++++++++---
 test/test/test_eventdev_sw.c                |  56 +++++++-------
 test/test/test_service_cores.c              | 109 ++++++++++++++++++++++++++++
 5 files changed, 198 insertions(+), 45 deletions(-)

diff --git a/examples/eventdev_pipeline_sw_pmd/main.c b/examples/eventdev_pipeline_sw_pmd/main.c
index 948be67..5f431d8 100644
--- a/examples/eventdev_pipeline_sw_pmd/main.c
+++ b/examples/eventdev_pipeline_sw_pmd/main.c
@@ -245,7 +245,7 @@ schedule_devices(unsigned int lcore_id)
 
 	if (fdata->sched_core[lcore_id] && (fdata->sched_single ||
 	    rte_atomic32_cmpset(&(fdata->sched_lock), 0, 1))) {
-		rte_service_run_iter_on_app_lcore(fdata->evdev_service_id);
+		rte_service_run_iter_on_app_lcore(fdata->evdev_service_id, 1);
 		if (cdata.dump_dev_signal) {
 			rte_event_dev_dump(0, stdout);
 			cdata.dump_dev_signal = 0;
diff --git a/lib/librte_eal/common/include/rte_service.h b/lib/librte_eal/common/include/rte_service.h
index d9de5ad..9272440 100644
--- a/lib/librte_eal/common/include/rte_service.h
+++ b/lib/librte_eal/common/include/rte_service.h
@@ -232,11 +232,28 @@ int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable);
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice
  *
- * This function runs a service callback from a non-service lcore context.
- * The *id* of the service to be run is passed in, and the service-callback
- * is executed on the calling lcore immediately if possible. If the service is
- * not multi-thread capable and another thread is currently executing it, this
- * function returns without running the callback.
+ * This function runs a service callback from a non-service lcore.
+ *
+ * This function is designed to enable gradual porting to service cores, and
+ * to enable unit tests to verify a service behaves as expected.
+ *
+ * When called, this function ensures that the service identified by *id* is
+ * safe to run on this lcore. Multi-thread safe services are invoked even if
+ * other cores are simultaneously running them as they are multi-thread safe.
+ *
+ * Multi-thread unsafe services are handled depending on the variable
+ * *serialize_multithread_unsafe*:
+ * - When set, the function will check if a service is already being invoked
+ *   on another lcore, refusing to run it and returning -EBUSY.
+ * - When zero, the application takes responsibility to ensure that the service
+ *   indicated by *id* is not going to be invoked by another lcore. This setting
+ *   avoids atomic operations, so is likely to be more performant.
+ *
+ * @param id The ID of the service to run
+ * @param serialize_multithread_unsafe This parameter indicates to the service
+ *           cores library if it is required to use atomics to serialize access
+ *           to multi-thread unsafe services. As there is an overhead in using
+ *           atomics, applications can choose to enable or disable this feature
  *
  * Note that any thread calling this function MUST be a DPDK EAL thread, as
  * the *rte_lcore_id* function is used to access internal data structures.
@@ -244,10 +261,11 @@ int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable);
  * @retval 0 Service was run on the calling thread successfully
  * @retval -EBUSY Another lcore is executing the service, and it is not a
  *         multi-thread safe service, so the service was not run on this lcore
- * @retval -ENOEXEC Service is not in a runnable state
+ * @retval -ENOEXEC Service is not in a run-able state
  * @retval -EINVAL Invalid service id
  */
-int32_t rte_service_run_iter_on_app_lcore(uint32_t id);
+int32_t rte_service_run_iter_on_app_lcore(uint32_t id,
+		uint32_t serialize_multithread_unsafe);
 
 /**
  * @warning
diff --git a/lib/librte_eal/common/rte_service.c b/lib/librte_eal/common/rte_service.c
index f17bf4b..09b758c 100644
--- a/lib/librte_eal/common/rte_service.c
+++ b/lib/librte_eal/common/rte_service.c
@@ -77,7 +77,7 @@ struct rte_service_spec_impl {
 	uint8_t internal_flags;
 
 	/* per service statistics */
-	uint32_t num_mapped_cores;
+	rte_atomic32_t num_mapped_cores;
 	uint64_t calls;
 	uint64_t cycles_spent;
 } __rte_cache_aligned;
@@ -325,7 +325,7 @@ rte_service_runstate_get(uint32_t id)
 	rte_smp_rmb();
 
 	int check_disabled = !(s->internal_flags & SERVICE_F_START_CHECK);
-	int lcore_mapped = (s->num_mapped_cores > 0);
+	int lcore_mapped = (rte_atomic32_read(&s->num_mapped_cores) > 0);
 
 	return (s->app_runstate == RUNSTATE_RUNNING) &&
 		(s->comp_runstate == RUNSTATE_RUNNING) &&
@@ -365,7 +365,7 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
 	 * mapped, atomic ops are not required.
 	 */
 	const int use_atomics = (service_mt_safe(s) == 0) &&
-				(s->num_mapped_cores > 1);
+				(rte_atomic32_read(&s->num_mapped_cores) > 1);
 	if (use_atomics) {
 		if (!rte_atomic32_cmpset((uint32_t *)&s->execute_lock, 0, 1))
 			return -EBUSY;
@@ -378,11 +378,36 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
 	return 0;
 }
 
-int32_t rte_service_run_iter_on_app_lcore(uint32_t id)
+int32_t rte_service_run_iter_on_app_lcore(uint32_t id,
+		uint32_t serialize_mt_unsafe)
 {
 	/* run service on calling core, using all-ones as the service mask */
+	if (!service_valid(id))
+		return -EINVAL;
+
 	struct core_state *cs = &lcore_states[rte_lcore_id()];
-	return service_run(id, cs, UINT64_MAX);
+	struct rte_service_spec_impl *s = &rte_services[id];
+
+	/* Atomically add this core to the mapped cores first, then examine if
+	 * we can run the service. This avoids a race condition between
+	 * checking the value, and atomically adding to the mapped count.
+	 */
+	if (serialize_mt_unsafe)
+		rte_atomic32_inc(&s->num_mapped_cores);
+
+	if (service_mt_safe(s) == 0 &&
+			rte_atomic32_read(&s->num_mapped_cores) > 1) {
+		if (serialize_mt_unsafe)
+			rte_atomic32_dec(&s->num_mapped_cores);
+		return -EBUSY;
+	}
+
+	int ret = service_run(id, cs, UINT64_MAX);
+
+	if (serialize_mt_unsafe)
+		rte_atomic32_dec(&s->num_mapped_cores);
+
+	return ret;
 }
 
 static int32_t
@@ -522,10 +547,10 @@ service_update(struct rte_service_spec *service, uint32_t lcore,
 	if (set) {
 		if (*set) {
 			lcore_states[lcore].service_mask |= sid_mask;
-			rte_services[sid].num_mapped_cores++;
+			rte_atomic32_inc(&rte_services[sid].num_mapped_cores);
 		} else {
 			lcore_states[lcore].service_mask &= ~(sid_mask);
-			rte_services[sid].num_mapped_cores--;
+			rte_atomic32_dec(&rte_services[sid].num_mapped_cores);
 		}
 	}
 
@@ -568,7 +593,7 @@ int32_t rte_service_lcore_reset_all(void)
 		lcore_states[i].runstate = RUNSTATE_STOPPED;
 	}
 	for (i = 0; i < RTE_SERVICE_NUM_MAX; i++)
-		rte_services[i].num_mapped_cores = 0;
+		rte_atomic32_set(&rte_services[i].num_mapped_cores, 0);
 
 	rte_smp_wmb();
 
@@ -664,7 +689,8 @@ rte_service_lcore_stop(uint32_t lcore)
 	for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
 		int32_t enabled = service_mask & (UINT64_C(1) << i);
 		int32_t service_running = rte_service_runstate_get(i);
-		int32_t only_core = rte_services[i].num_mapped_cores == 1;
+		int32_t only_core = (1 ==
+			rte_atomic32_read(&rte_services[i].num_mapped_cores));
 
 		/* if the core is mapped, and the service is running, and this
 		 * is the only core that is mapped, the service would cease to
diff --git a/test/test/test_eventdev_sw.c b/test/test/test_eventdev_sw.c
index 5c7751b..0859f1b 100644
--- a/test/test/test_eventdev_sw.c
+++ b/test/test/test_eventdev_sw.c
@@ -418,7 +418,7 @@ run_prio_packet_test(struct test *t)
 		}
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	struct test_event_dev_stats stats;
 	err = test_event_dev_stats_get(evdev, &stats);
@@ -510,7 +510,7 @@ test_single_directed_packet(struct test *t)
 	}
 
 	/* Run schedule() as dir packets may need to be re-ordered */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	struct test_event_dev_stats stats;
 	err = test_event_dev_stats_get(evdev, &stats);
@@ -577,7 +577,7 @@ test_directed_forward_credits(struct test *t)
 			printf("%d: error failed to enqueue\n", __LINE__);
 			return -1;
 		}
-		rte_service_run_iter_on_app_lcore(t->service_id);
+		rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 		uint32_t deq_pkts;
 		deq_pkts = rte_event_dequeue_burst(evdev, 0, &ev, 1, 0);
@@ -739,7 +739,7 @@ burst_packets(struct test *t)
 			return -1;
 		}
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* Check stats for all NUM_PKTS arrived to sched core */
 	struct test_event_dev_stats stats;
@@ -828,7 +828,7 @@ abuse_inflights(struct test *t)
 	}
 
 	/* schedule */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	struct test_event_dev_stats stats;
 
@@ -966,7 +966,7 @@ xstats_tests(struct test *t)
 		}
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* Device names / values */
 	int num_stats = rte_event_dev_xstats_names_get(evdev,
@@ -1293,7 +1293,7 @@ port_reconfig_credits(struct test *t)
 			}
 		}
 
-		rte_service_run_iter_on_app_lcore(t->service_id);
+		rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 		struct rte_event ev[NPKTS];
 		int deq = rte_event_dequeue_burst(evdev, t->port[0], ev,
@@ -1519,7 +1519,7 @@ xstats_id_reset_tests(struct test *t)
 		}
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	static const char * const dev_names[] = {
 		"dev_rx", "dev_tx", "dev_drop", "dev_sched_calls",
@@ -1910,7 +1910,7 @@ qid_priorities(struct test *t)
 		}
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* dequeue packets, verify priority was upheld */
 	struct rte_event ev[32];
@@ -1991,7 +1991,7 @@ load_balancing(struct test *t)
 		}
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	struct test_event_dev_stats stats;
 	err = test_event_dev_stats_get(evdev, &stats);
@@ -2091,7 +2091,7 @@ load_balancing_history(struct test *t)
 	}
 
 	/* call the scheduler */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* Dequeue the flow 0 packet from port 1, so that we can then drop */
 	struct rte_event ev;
@@ -2108,7 +2108,7 @@ load_balancing_history(struct test *t)
 	rte_event_enqueue_burst(evdev, t->port[1], &release_ev, 1);
 
 	/* call the scheduler */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/*
 	 * Set up the next set of flows, first a new flow to fill up
@@ -2141,7 +2141,7 @@ load_balancing_history(struct test *t)
 	}
 
 	/* schedule */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (err) {
@@ -2185,7 +2185,7 @@ load_balancing_history(struct test *t)
 		while (rte_event_dequeue_burst(evdev, i, &ev, 1, 0))
 			rte_event_enqueue_burst(evdev, i, &release_ev, 1);
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	cleanup(t);
 	return 0;
@@ -2251,7 +2251,7 @@ invalid_qid(struct test *t)
 	}
 
 	/* call the scheduler */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (err) {
@@ -2336,7 +2336,7 @@ single_packet(struct test *t)
 		return -1;
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (err) {
@@ -2379,7 +2379,7 @@ single_packet(struct test *t)
 		printf("%d: Failed to enqueue\n", __LINE__);
 		return -1;
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (stats.port_inflight[wrk_enq] != 0) {
@@ -2467,7 +2467,7 @@ inflight_counts(struct test *t)
 	}
 
 	/* schedule */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (err) {
@@ -2523,7 +2523,7 @@ inflight_counts(struct test *t)
 	 * As the scheduler core decrements inflights, it needs to run to
 	 * process packets to act on the drop messages
 	 */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (stats.port_inflight[p1] != 0) {
@@ -2558,7 +2558,7 @@ inflight_counts(struct test *t)
 	 * As the scheduler core decrements inflights, it needs to run to
 	 * process packets to act on the drop messages
 	 */
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	err = test_event_dev_stats_get(evdev, &stats);
 	if (stats.port_inflight[p2] != 0) {
@@ -2652,7 +2652,7 @@ parallel_basic(struct test *t, int check_order)
 		}
 	}
 
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* use extra slot to make logic in loops easier */
 	struct rte_event deq_ev[w3_port + 1];
@@ -2679,7 +2679,7 @@ parallel_basic(struct test *t, int check_order)
 			return -1;
 		}
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* dequeue from the tx ports, we should get 3 packets */
 	deq_pkts = rte_event_dequeue_burst(evdev, t->port[tx_port], deq_ev,
@@ -2757,7 +2757,7 @@ holb(struct test *t) /* test to check we avoid basic head-of-line blocking */
 		printf("%d: Error doing first enqueue\n", __LINE__);
 		goto err;
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	if (rte_event_dev_xstats_by_name_get(evdev, "port_0_cq_ring_used", NULL)
 			!= 1)
@@ -2782,7 +2782,7 @@ holb(struct test *t) /* test to check we avoid basic head-of-line blocking */
 			printf("%d: Error with enqueue\n", __LINE__);
 			goto err;
 		}
-		rte_service_run_iter_on_app_lcore(t->service_id);
+		rte_service_run_iter_on_app_lcore(t->service_id, 1);
 	} while (rte_event_dev_xstats_by_name_get(evdev,
 				rx_port_free_stat, NULL) != 0);
 
@@ -2792,7 +2792,7 @@ holb(struct test *t) /* test to check we avoid basic head-of-line blocking */
 		printf("%d: Error with enqueue\n", __LINE__);
 		goto err;
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	/* check that the other port still has an empty CQ */
 	if (rte_event_dev_xstats_by_name_get(evdev, other_port_used_stat, NULL)
@@ -2815,7 +2815,7 @@ holb(struct test *t) /* test to check we avoid basic head-of-line blocking */
 		printf("%d: Error with enqueue\n", __LINE__);
 		goto err;
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 	if (rte_event_dev_xstats_by_name_get(evdev, other_port_used_stat, NULL)
 			!= 1) {
@@ -3005,7 +3005,7 @@ worker_loopback(struct test *t)
 	while (rte_eal_get_lcore_state(p_lcore) != FINISHED ||
 			rte_eal_get_lcore_state(w_lcore) != FINISHED) {
 
-		rte_service_run_iter_on_app_lcore(t->service_id);
+		rte_service_run_iter_on_app_lcore(t->service_id, 1);
 
 		uint64_t new_cycles = rte_get_timer_cycles();
 
@@ -3032,7 +3032,7 @@ worker_loopback(struct test *t)
 			cycles = new_cycles;
 		}
 	}
-	rte_service_run_iter_on_app_lcore(t->service_id);
+	rte_service_run_iter_on_app_lcore(t->service_id, 1);
 	/* ensure all completions are flushed */
 
 	rte_eal_mp_wait_lcore();
diff --git a/test/test/test_service_cores.c b/test/test/test_service_cores.c
index ee8313e..311c704 100644
--- a/test/test/test_service_cores.c
+++ b/test/test/test_service_cores.c
@@ -548,6 +548,113 @@ service_mt_unsafe_poll(void)
 	return TEST_SUCCESS;
 }
 
+static int32_t
+delay_as_a_mt_safe_service(void *args)
+{
+	RTE_SET_USED(args);
+	uint32_t *params = args;
+
+	/* retrieve done flag and atomic lock to inc/dec */
+	uint32_t *done = &params[0];
+	rte_atomic32_t *lock = (rte_atomic32_t *)&params[1];
+
+	while (!*done) {
+		rte_atomic32_inc(lock);
+		rte_delay_us(500);
+		if (rte_atomic32_read(lock) > 1)
+			/* pass: second core has simultaneously incremented */
+			*done = 1;
+		rte_atomic32_dec(lock);
+	}
+
+	return 0;
+}
+
+static int32_t
+delay_as_a_service(void *args)
+{
+	uint32_t *done = (uint32_t *)args;
+	while (!*done)
+		rte_delay_ms(5);
+	return 0;
+}
+
+static int
+service_run_on_app_core_func(void *arg)
+{
+	uint32_t *delay_service_id = (uint32_t *)arg;
+	return rte_service_run_iter_on_app_lcore(*delay_service_id, 1);
+}
+
+static int
+service_app_lcore_poll_impl(const int mt_safe)
+{
+	uint32_t params[2] = {0};
+
+	struct rte_service_spec service;
+	memset(&service, 0, sizeof(struct rte_service_spec));
+	snprintf(service.name, sizeof(service.name), MT_SAFE_SERVICE_NAME);
+	if (mt_safe) {
+		service.callback = delay_as_a_mt_safe_service;
+		service.callback_userdata = params;
+		service.capabilities |= RTE_SERVICE_CAP_MT_SAFE;
+	} else {
+		service.callback = delay_as_a_service;
+		service.callback_userdata = &params;
+	}
+
+	uint32_t id;
+	TEST_ASSERT_EQUAL(0, rte_service_component_register(&service, &id),
+			"Register of app lcore delay service failed");
+
+	rte_service_component_runstate_set(id, 1);
+	rte_service_runstate_set(id, 1);
+
+	uint32_t app_core2 = rte_get_next_lcore(slcore_id, 1, 1);
+	int app_core2_ret = rte_eal_remote_launch(service_run_on_app_core_func,
+						  &id, app_core2);
+
+	rte_delay_ms(100);
+
+	int app_core1_ret = service_run_on_app_core_func(&id);
+
+	/* flag done, then wait for the spawned 2nd core to return */
+	params[0] = 1;
+	rte_eal_mp_wait_lcore();
+
+	/* core two gets launched first - and should hold the service lock */
+	TEST_ASSERT_EQUAL(0, app_core2_ret,
+			"App core2 : run service didn't return zero");
+
+	if (mt_safe) {
+		/* mt safe should have both cores return 0 for success */
+		TEST_ASSERT_EQUAL(0, app_core1_ret,
+				"MT Safe: App core1 didn't return 0");
+	} else {
+		/* core one attempts to run later - should be blocked */
+		TEST_ASSERT_EQUAL(-EBUSY, app_core1_ret,
+				"MT Unsafe: App core1 didn't return -EBUSY");
+	}
+
+	unregister_all();
+
+	return TEST_SUCCESS;
+}
+
+static int
+service_app_lcore_mt_safe(void)
+{
+	const int mt_safe = 1;
+	return service_app_lcore_poll_impl(mt_safe);
+}
+
+static int
+service_app_lcore_mt_unsafe(void)
+{
+	const int mt_safe = 0;
+	return service_app_lcore_poll_impl(mt_safe);
+}
+
 /* start and stop a service core - ensuring it goes back to sleep */
 static int
 service_lcore_start_stop(void)
@@ -613,6 +720,8 @@ static struct unit_test_suite service_tests  = {
 		TEST_CASE_ST(dummy_register, NULL, service_lcore_en_dis_able),
 		TEST_CASE_ST(dummy_register, NULL, service_mt_unsafe_poll),
 		TEST_CASE_ST(dummy_register, NULL, service_mt_safe_poll),
+		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
+		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };
-- 
2.7.4


* Re: [PATCH] service: fix race in service on app lcore function
  2017-11-01 17:59   ` Van Haaren, Harry
@ 2017-11-02  9:10     ` Bruce Richardson
  0 siblings, 0 replies; 7+ messages in thread
From: Bruce Richardson @ 2017-11-02  9:10 UTC (permalink / raw)
  To: Van Haaren, Harry; +Cc: dev, pbhagavatula, thomas

On Wed, Nov 01, 2017 at 05:59:51PM +0000, Van Haaren, Harry wrote:
> > From: Richardson, Bruce
> > Sent: Wednesday, November 1, 2017 5:09 PM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: dev@dpdk.org; pbhagavatula@caviumnetworks.com; thomas@monjalon.net
> > Subject: Re: [dpdk-dev] [PATCH] service: fix race in service on app lcore
> > function
> > 
> > On Tue, Oct 31, 2017 at 11:49:02AM +0000, Harry van Haaren wrote:
> > > This commit fixes a possible race condition if an application
> > > uses the service-cores infrastructure and the function to run
> > > a service on an application lcore at the same time.
> > >
> > > The fix is to change the num_mapped_cores variable to be an
> > > atomic variable. This allows concurrent accesses by multiple
> > > threads to a service using rte_service_run_iter_on_app_lcore()
> > > to detect whether another core is currently mapped to the
> > > service, and to refuse to run it if the service is not
> > > multi-thread safe.
> > >
> > > No performance impact is expected, as the service-core
> > > mappings change at control-path frequency; hence the change
> > > from an ordinary write to an atomic write will not have any
> > > significant impact.
> > >
> > > Two unit tests were added to verify the behaviour of the
> > > function to run a service on an application core, testing both
> > > a multi-thread safe service, and a multi-thread unsafe service.
> > >
> > > The doxygen API documentation for the function has been updated
> > > to reflect the current and correct behaviour.
> > >
> > > Fixes: e9139a32f6e8 ("service: add function to run on app lcore")
> > >
> > > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > >
> > > ---
> > >
> > <snip>
> > > @@ -381,8 +381,28 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
> > >  int32_t rte_service_run_iter_on_app_lcore(uint32_t id)
> > >  {
> > >  	/* run service on calling core, using all-ones as the service mask */
> > > +	if (!service_valid(id))
> > > +		return -EINVAL;
> > > +
> > >  	struct core_state *cs = &lcore_states[rte_lcore_id()];
> > > -	return service_run(id, cs, UINT64_MAX);
> > > +	struct rte_service_spec_impl *s = &rte_services[id];
> > > +
> > > +	/* Atomically add this core to the mapped cores first, then examine if
> > > +	 * we can run the service. This avoids a race condition between
> > > +	 * checking the value, and atomically adding to the mapped count.
> > > +	 */
> > > +	rte_atomic32_inc(&s->num_mapped_cores);
> > > +
> > > +	if (service_mt_safe(s) == 0 &&
> > > +			rte_atomic32_read(&s->num_mapped_cores) > 1) {
> > > +		rte_atomic32_dec(&s->num_mapped_cores);
> > > +		return -EBUSY;
> > > +	}
> > > +
> > > +	int ret = service_run(id, cs, UINT64_MAX);
> > > +	rte_atomic32_dec(&s->num_mapped_cores);
> > > +
> > > +	return ret;
> > >  }
> > 
> > Do we really need to do an atomic inc and dec in this function? If we
> > check that there are no service cores mapped, would that not be enough
> > for safety? If an app core is calling a service, the control plane is
> > unlikely to decide to start spawning off a service core for that
> > service simultaneously. Manipulating the count is safer, yes, but unlike
> > the other changes in this patch, this one will affect performance, so I
> > think we can go without. Similarly, for multiple data plane threads
> > calling the same service simultaneously: everything else on the data
> > plane is done without locks, so I think this should be too.
> 
> 
> I think the inc-dec is required with the current code; what if *two* application
> cores are calling the same service? If we don't have the atomic inc-dec of
> num_mapped_cores, both app cores could simultaneously run a multi-thread unsafe service.
> 
> The other option is to demand that the *app* must serialize access to a particular service
> using this function - but in my eyes that limits the usefulness of this function, because
> it pushes the problem up a level instead of solving it.
> 
> I guess an application "in the know" of how its cores are acting would prefer not
> to have the atomics at the service-cores library level. As a compromise, the use of
> these atomics could be a parameter to the rte_service_run_iter_on_app_lcore() function.
> 
> That would allow the application to choose whether it wants to avail itself of the
> serialization in the service-cores library, or take responsibility for serializing
> access to multi-thread unsafe services itself.
> 
> 
> I'll spin up a v2 with a parameter to the function.
> 
Ok, that sounds reasonable enough. Just so long as apps that don't need
the lock don't have to use it.

/Bruce


* Re: [PATCH v2] service: fix race in service on app lcore function
  2017-11-01 18:48 ` [PATCH v2] " Harry van Haaren
@ 2017-11-02  9:41   ` Bruce Richardson
  2017-11-07  0:26     ` Thomas Monjalon
  0 siblings, 1 reply; 7+ messages in thread
From: Bruce Richardson @ 2017-11-02  9:41 UTC (permalink / raw)
  To: Harry van Haaren; +Cc: dev, thomas, pbhagavatula

On Wed, Nov 01, 2017 at 06:48:01PM +0000, Harry van Haaren wrote:
> This commit fixes a possible race condition if an application
> uses the service-cores infrastructure and the function to run
> a service on an application lcore at the same time.
> 
> The fix is to change the num_mapped_cores variable to be an
> atomic variable. This allows concurrent accesses by multiple
> threads to a service using rte_service_run_iter_on_app_lcore()
> to detect whether another core is currently mapped to the
> service, and to refuse to run it if the service is not
> multi-thread safe.
> 
> The run iteration on app lcore function takes two arguments: the
> service id to run, and whether atomics should be used to serialize
> access to multi-thread unsafe services. This allows applications to
> choose if they wish to use the service-cores feature, or if they
> take responsibility themselves for serializing invocation of a
> service. See the doxygen documentation for more details.
> 
> Two unit tests were added to verify the behaviour of the
> function to run a service on an application core, testing both
> a multi-thread safe service, and a multi-thread unsafe service.
> 
> The doxygen API documentation for the function has been updated
> to reflect the current and correct behaviour.
> 
> Fixes: e9139a32f6e8 ("service: add function to run on app lcore")
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> 
> ---
> 
> v2:
> - Rework function to take a parameter that allows the application
>   to select if it requires atomics to be used, or if the application
>   wishes to take responsibility for serializing instead.
> 
This approach seems better to me. I think the function itself might be
better if you just duplicated a bit more code to reduce the number of
checks on the serialization variable, roughly as in the sketch below,
but it's readable enough as it is now, so
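
For reference, the kind of duplication I mean - a rough sketch only,
keeping the v2 semantics:

    int32_t rte_service_run_iter_on_app_lcore(uint32_t id,
                    uint32_t serialize_mt_unsafe)
    {
            if (!service_valid(id))
                    return -EINVAL;

            struct core_state *cs = &lcore_states[rte_lcore_id()];
            struct rte_service_spec_impl *s = &rte_services[id];

            /* unserialized path: serialize_mt_unsafe checked once */
            if (!serialize_mt_unsafe) {
                    if (service_mt_safe(s) == 0 &&
                            rte_atomic32_read(&s->num_mapped_cores) > 1)
                            return -EBUSY;
                    return service_run(id, cs, UINT64_MAX);
            }

            /* serialized path: all atomic accounting kept together */
            rte_atomic32_inc(&s->num_mapped_cores);
            if (service_mt_safe(s) == 0 &&
                            rte_atomic32_read(&s->num_mapped_cores) > 1) {
                    rte_atomic32_dec(&s->num_mapped_cores);
                    return -EBUSY;
            }
            int ret = service_run(id, cs, UINT64_MAX);
            rte_atomic32_dec(&s->num_mapped_cores);
            return ret;
    }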

Acked-by: Bruce Richardson <bruce.richardson@intel.com>


* Re: [PATCH v2] service: fix race in service on app lcore function
  2017-11-02  9:41   ` Bruce Richardson
@ 2017-11-07  0:26     ` Thomas Monjalon
  0 siblings, 0 replies; 7+ messages in thread
From: Thomas Monjalon @ 2017-11-07  0:26 UTC (permalink / raw)
  To: Harry van Haaren; +Cc: dev, Bruce Richardson, pbhagavatula

02/11/2017 10:41, Bruce Richardson:
> On Wed, Nov 01, 2017 at 06:48:01PM +0000, Harry van Haaren wrote:
> > This commit fixes a possible race condition if an application
> > uses the service-cores infrastructure and the function to run
> > a service on an application lcore at the same time.
> > 
> > The fix is to change the num_mapped_cores variable to be an
> > atomic variable. This allows concurrent accesses by multiple
> > threads to a service using rte_service_run_iter_on_app_lcore()
> > to detect whether another core is currently mapped to the
> > service, and to refuse to run it if the service is not
> > multi-thread safe.
> > 
> > The run iteration on app lcore function takes two arguments: the
> > service id to run, and whether atomics should be used to serialize
> > access to multi-thread unsafe services. This allows applications to
> > choose if they wish to use the service-cores feature, or if they
> > take responsibility themselves for serializing invocation of a
> > service. See the doxygen documentation for more details.
> > 
> > Two unit tests were added to verify the behaviour of the
> > function to run a service on an application core, testing both
> > a multi-thread safe service, and a multi-thread unsafe service.
> > 
> > The doxygen API documentation for the function has been updated
> > to reflect the current and correct behaviour.
> > 
> > Fixes: e9139a32f6e8 ("service: add function to run on app lcore")
> > 
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > 
> > ---
> > 
> > v2:
> > - Rework function to take a parameter that allows the application
> >   to select if it requires atomics to be used, or if the application
> >   wishes to take responsibility for serializing instead.
> > 
> This approach seems better to me. I think the function itself might be
> better if you just duplicated a bit more code to reduce the number of
> checks on the serialization variable, but it's readable enough as it is
> now, so
> 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Applied, thanks

