All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
@ 2017-05-08  3:58 Yang Feng
  2017-05-10 22:36 ` Xose Vazquez Perez
  2017-05-11 11:14 ` Martin Wilck
  0 siblings, 2 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-08  3:58 UTC (permalink / raw)
  To: dm-devel; +Cc: guanjunxiong, philip.yang, hege09, zouming.zouming, shenhong09

Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a time-delay algorithm. And the
time-delay algorithm is dependent on the following arguments(delay_interval,
cons_num).
The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path
   continuously, the IOs' average delay can be calculated.
2. According to the average delay of each path and the weight value
   "delay_interval", the priority "rc" of each path can be provided.

     delay_interval  delay_interval  delay_interval       delay_interval
    |---------------|---------------|---------------|	 |---------------|
    |priority rank1 |priority rank2 |priority rank3 |... |priority rank4 |
    |---------------|---------------|---------------|    |---------------|
                       Priority Rank Partitioning
---
 libmultipath/Makefile                   |   2 +-
 libmultipath/checkers/Makefile          |   7 +-
 libmultipath/checkers/emc_clariion.c    |   2 +-
 libmultipath/checkers/libsg.c           |  94 ------------
 libmultipath/checkers/libsg.h           |   9 --
 libmultipath/checkers/readsector0.c     |   2 +-
 libmultipath/libsg.c                    |  94 ++++++++++++
 libmultipath/libsg.h                    |   9 ++
 libmultipath/prioritizers/Makefile      |   6 +-
 libmultipath/prioritizers/delayedpath.c | 246 ++++++++++++++++++++++++++++++++
 libmultipath/prioritizers/delayedpath.h |  14 ++
 11 files changed, 373 insertions(+), 112 deletions(-)
 delete mode 100644 libmultipath/checkers/libsg.c
 delete mode 100644 libmultipath/checkers/libsg.h
 create mode 100644 libmultipath/libsg.c
 create mode 100644 libmultipath/libsg.h
 create mode 100644 libmultipath/prioritizers/delayedpath.c
 create mode 100644 libmultipath/prioritizers/delayedpath.h

diff --git a/libmultipath/Makefile b/libmultipath/Makefile
index 1f5ec25..a4d725a 100644
--- a/libmultipath/Makefile
+++ b/libmultipath/Makefile
@@ -41,7 +41,7 @@ OBJS = memory.o parser.o vector.o devmapper.o callout.o \
 	structs.o discovery.o propsel.o dict.o \
 	pgpolicies.o debug.o defaults.o uevent.o time-util.o \
 	switchgroup.o uxsock.o print.o alias.o log_pthread.o \
-	log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
+	log.o configure.o structs_vec.o sysfs.o libsg.o prio.o checkers.o \
 	lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o
 
 all: $(LIBS)
diff --git a/libmultipath/checkers/Makefile b/libmultipath/checkers/Makefile
index 4970fc0..7e433ca 100644
--- a/libmultipath/checkers/Makefile
+++ b/libmultipath/checkers/Makefile
@@ -14,19 +14,16 @@ LIBS= \
 	libcheckemc_clariion.so \
 	libcheckhp_sw.so \
 	libcheckrdac.so
-ifneq ($(ENABLE_RADOS),0)
-LIBS += libcheckrbd.so
-endif
 
 all: $(LIBS)
 
 libcheckrbd.so: rbd.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lrados -ludev
 
-libcheckdirectio.so: libsg.o directio.o
+libcheckdirectio.so: ../libsg.o directio.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -laio
 
-libcheck%.so: libsg.o %.o
+libcheck%.so: ../libsg.o %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
 
 install:
diff --git a/libmultipath/checkers/emc_clariion.c b/libmultipath/checkers/emc_clariion.c
index 9c1ffed..e4ba757 100644
--- a/libmultipath/checkers/emc_clariion.c
+++ b/libmultipath/checkers/emc_clariion.c
@@ -12,7 +12,7 @@
 #include <errno.h>
 
 #include "../libmultipath/sg_include.h"
-#include "libsg.h"
+#include "../libmultipath/libsg.h"
 #include "checkers.h"
 #include "debug.h"
 #include "memory.h"
diff --git a/libmultipath/checkers/libsg.c b/libmultipath/checkers/libsg.c
deleted file mode 100644
index 958ea92..0000000
--- a/libmultipath/checkers/libsg.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Christophe Varoqui
- */
-#include <string.h>
-#include <sys/ioctl.h>
-#include <errno.h>
-#include <sys/stat.h>
-
-#include "checkers.h"
-#include "libsg.h"
-#include "../libmultipath/sg_include.h"
-
-int
-sg_read (int sg_fd, unsigned char * buff, int buff_len,
-	 unsigned char * sense, int sense_len, unsigned int timeout)
-{
-	/* defaults */
-	int blocks;
-	long long start_block = 0;
-	int bs = 512;
-	int cdbsz = 10;
-
-	unsigned char rdCmd[cdbsz];
-	unsigned char *sbb = sense;
-	struct sg_io_hdr io_hdr;
-	int res;
-	int rd_opcode[] = {0x8, 0x28, 0xa8, 0x88};
-	int sz_ind;
-	struct stat filestatus;
-	int retry_count = 3;
-
-	if (fstat(sg_fd, &filestatus) != 0)
-		return PATH_DOWN;
-	bs = (filestatus.st_blksize > 4096)? 4096: filestatus.st_blksize;
-	blocks = buff_len / bs;
-	memset(rdCmd, 0, cdbsz);
-	sz_ind = 1;
-	rdCmd[0] = rd_opcode[sz_ind];
-	rdCmd[2] = (unsigned char)((start_block >> 24) & 0xff);
-	rdCmd[3] = (unsigned char)((start_block >> 16) & 0xff);
-	rdCmd[4] = (unsigned char)((start_block >> 8) & 0xff);
-	rdCmd[5] = (unsigned char)(start_block & 0xff);
-	rdCmd[7] = (unsigned char)((blocks >> 8) & 0xff);
-	rdCmd[8] = (unsigned char)(blocks & 0xff);
-
-	memset(&io_hdr, 0, sizeof(struct sg_io_hdr));
-	io_hdr.interface_id = 'S';
-	io_hdr.cmd_len = cdbsz;
-	io_hdr.cmdp = rdCmd;
-	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
-	io_hdr.dxfer_len = bs * blocks;
-	io_hdr.dxferp = buff;
-	io_hdr.mx_sb_len = sense_len;
-	io_hdr.sbp = sense;
-	io_hdr.timeout = timeout * 1000;
-	io_hdr.pack_id = (int)start_block;
-
-retry:
-	memset(sense, 0, sense_len);
-	while (((res = ioctl(sg_fd, SG_IO, &io_hdr)) < 0) && (EINTR == errno));
-
-	if (res < 0) {
-		if (ENOMEM == errno) {
-			return PATH_UP;
-		}
-		return PATH_DOWN;
-	}
-
-	if ((0 == io_hdr.status) &&
-	    (0 == io_hdr.host_status) &&
-	    (0 == io_hdr.driver_status)) {
-		return PATH_UP;
-	} else {
-		int key = 0;
-
-		if (io_hdr.sb_len_wr > 3) {
-			if (sbb[0] == 0x72 || sbb[0] == 0x73)
-				key = sbb[1] & 0x0f;
-			else if (io_hdr.sb_len_wr > 13 &&
-				 ((sbb[0] & 0x7f) == 0x70 ||
-				  (sbb[0] & 0x7f) == 0x71))
-				key = sbb[2] & 0x0f;
-		}
-
-		/*
-		 * Retry if UNIT_ATTENTION check condition.
-		 */
-		if (key == 0x6) {
-			if (--retry_count)
-				goto retry;
-		}
-		return PATH_DOWN;
-	}
-}
diff --git a/libmultipath/checkers/libsg.h b/libmultipath/checkers/libsg.h
deleted file mode 100644
index 3994f45..0000000
--- a/libmultipath/checkers/libsg.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _LIBSG_H
-#define _LIBSG_H
-
-#define SENSE_BUFF_LEN 32
-
-int sg_read (int sg_fd, unsigned char * buff, int buff_len,
-	     unsigned char * sense, int sense_len, unsigned int timeout);
-
-#endif /* _LIBSG_H */
diff --git a/libmultipath/checkers/readsector0.c b/libmultipath/checkers/readsector0.c
index 8fccb46..d70c5c5 100644
--- a/libmultipath/checkers/readsector0.c
+++ b/libmultipath/checkers/readsector0.c
@@ -4,7 +4,7 @@
 #include <stdio.h>
 
 #include "checkers.h"
-#include "libsg.h"
+#include "../libmultipath/libsg.h"
 
 #define MSG_READSECTOR0_UP	"readsector0 checker reports path is up"
 #define MSG_READSECTOR0_DOWN	"readsector0 checker reports path is down"
diff --git a/libmultipath/libsg.c b/libmultipath/libsg.c
new file mode 100644
index 0000000..99c91a4
--- /dev/null
+++ b/libmultipath/libsg.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2004, 2005 Christophe Varoqui
+ */
+#include <string.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include "checkers.h"
+#include "libsg.h"
+#include "sg_include.h"
+
+int
+sg_read (int sg_fd, unsigned char * buff, int buff_len,
+	 unsigned char * sense, int sense_len, unsigned int timeout)
+{
+	/* defaults */
+	int blocks;
+	long long start_block = 0;
+	int bs = 512;
+	int cdbsz = 10;
+
+	unsigned char rdCmd[cdbsz];
+	unsigned char *sbb = sense;
+	struct sg_io_hdr io_hdr;
+	int res;
+	int rd_opcode[] = {0x8, 0x28, 0xa8, 0x88};
+	int sz_ind;
+	struct stat filestatus;
+	int retry_count = 3;
+
+	if (fstat(sg_fd, &filestatus) != 0)
+		return PATH_DOWN;
+	bs = (filestatus.st_blksize > 4096)? 4096: filestatus.st_blksize;
+	blocks = buff_len / bs;
+	memset(rdCmd, 0, cdbsz);
+	sz_ind = 1;
+	rdCmd[0] = rd_opcode[sz_ind];
+	rdCmd[2] = (unsigned char)((start_block >> 24) & 0xff);
+	rdCmd[3] = (unsigned char)((start_block >> 16) & 0xff);
+	rdCmd[4] = (unsigned char)((start_block >> 8) & 0xff);
+	rdCmd[5] = (unsigned char)(start_block & 0xff);
+	rdCmd[7] = (unsigned char)((blocks >> 8) & 0xff);
+	rdCmd[8] = (unsigned char)(blocks & 0xff);
+
+	memset(&io_hdr, 0, sizeof(struct sg_io_hdr));
+	io_hdr.interface_id = 'S';
+	io_hdr.cmd_len = cdbsz;
+	io_hdr.cmdp = rdCmd;
+	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	io_hdr.dxfer_len = bs * blocks;
+	io_hdr.dxferp = buff;
+	io_hdr.mx_sb_len = sense_len;
+	io_hdr.sbp = sense;
+	io_hdr.timeout = timeout * 1000;
+	io_hdr.pack_id = (int)start_block;
+
+retry:
+	memset(sense, 0, sense_len);
+	while (((res = ioctl(sg_fd, SG_IO, &io_hdr)) < 0) && (EINTR == errno));
+
+	if (res < 0) {
+		if (ENOMEM == errno) {
+			return PATH_UP;
+		}
+		return PATH_DOWN;
+	}
+
+	if ((0 == io_hdr.status) &&
+	    (0 == io_hdr.host_status) &&
+	    (0 == io_hdr.driver_status)) {
+		return PATH_UP;
+	} else {
+		int key = 0;
+
+		if (io_hdr.sb_len_wr > 3) {
+			if (sbb[0] == 0x72 || sbb[0] == 0x73)
+				key = sbb[1] & 0x0f;
+			else if (io_hdr.sb_len_wr > 13 &&
+				 ((sbb[0] & 0x7f) == 0x70 ||
+				  (sbb[0] & 0x7f) == 0x71))
+				key = sbb[2] & 0x0f;
+		}
+
+		/*
+		 * Retry if UNIT_ATTENTION check condition.
+		 */
+		if (key == 0x6) {
+			if (--retry_count)
+				goto retry;
+		}
+		return PATH_DOWN;
+	}
+}
diff --git a/libmultipath/libsg.h b/libmultipath/libsg.h
new file mode 100644
index 0000000..3994f45
--- /dev/null
+++ b/libmultipath/libsg.h
@@ -0,0 +1,9 @@
+#ifndef _LIBSG_H
+#define _LIBSG_H
+
+#define SENSE_BUFF_LEN 32
+
+int sg_read (int sg_fd, unsigned char * buff, int buff_len,
+	     unsigned char * sense, int sense_len, unsigned int timeout);
+
+#endif /* _LIBSG_H */
diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..7e3da51 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriodelayedpath.so \
+	libpriosysfs.so 
 
 all: $(LIBS)
 
 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
 
+libpriodelayedpath.so: delayedpath.o  ../libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
 
diff --git a/libmultipath/prioritizers/delayedpath.c b/libmultipath/prioritizers/delayedpath.c
new file mode 100644
index 0000000..4c1cfea
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.c
@@ -0,0 +1,246 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority 
+ * values of specific paths are provided by a time-delay algorithm. And the
+ * time-delay algorithm is dependent on arguments.
+ * 
+ * The principle of the algorithm as follows: 
+ * 1. By sending a certain number "cons_num" of read IOs to the current path 
+ *    continuously, the IOs' average delay can be calculated. 
+ * 2. According to the average delay of each path and the weight value 
+ *    "delay_interval", the priority "rc" of each path can be provided. 
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../libmultipath/libsg.h"
+
+#include "delayedpath.h"
+
+#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
+#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
+#define DEFAULT_CONS_NUM        20    
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_SEC                "SEC"
+#define CHAR_MSEC               "MSEC"
+#define CHAR_USEC               "USEC"
+
+enum interval_type {
+    INTERVAL_SEC,
+    INTERVAL_MSEC,
+    INTERVAL_USEC,
+    INTERVAL_INVALID
+};
+
+static int conversion_ratio[] = {
+	[INTERVAL_SEC]		= USEC_PER_SEC,
+	[INTERVAL_MSEC]	    = USEC_PER_MSEC,
+	[INTERVAL_USEC]		= USEC_PER_USEC,
+	[INTERVAL_INVALID]	= 0,
+};
+
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+    
+	return ret;
+}
+
+static int get_interval_type(char *source, char *type)
+{  
+    /*is USEC*/
+    if ((strstr(source, CHAR_USEC) != NULL)
+        && (strstr(source, CHAR_USEC)[4] == '_'))
+    {
+        memcpy(type, CHAR_USEC, strlen(CHAR_USEC)+1);        
+        return INTERVAL_USEC;
+    }
+
+    /*is MSEC*/
+    if ((strstr(source, CHAR_MSEC) != NULL) 
+        && (strstr(source, CHAR_MSEC)[4] == '_'))
+    {
+        memcpy(type, CHAR_MSEC, strlen(CHAR_MSEC)+1);
+        return INTERVAL_MSEC;
+    }
+
+    /*is SEC*/
+	if ((strstr(source, CHAR_SEC) != NULL)
+        && (strstr(source, CHAR_SEC)[4] == '_'))
+    {
+        memcpy(type, CHAR_SEC, strlen(CHAR_SEC)+1);
+        return INTERVAL_SEC;
+    }
+
+    return INTERVAL_INVALID;
+}
+
+static int get_string_from_under(char *args,
+                                        char *beforestring,
+                                        char *afterstring,
+                                        int *type)
+{
+    char source[MAX_CHAR_SIZE];
+    char char_type[MAX_CHAR_SIZE];
+    char under[] = "_";
+    char *token  = NULL;
+    char *tmp = NULL;
+    char *saveptr = NULL;
+    unsigned int size = strlen(args);
+
+    if ((args == NULL) || (beforestring == NULL) 
+        || (afterstring == NULL) || (type == NULL))
+        return 0;
+
+    /* int type */
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+        return 0;
+    
+    memcpy(source, args, size+1);
+    if (strstr(source, under) == NULL)
+        return 0;
+
+    *type = get_interval_type(source, char_type);
+    if (*type == INTERVAL_INVALID)
+        return 0;
+
+    token = strtok_r(source, under, &saveptr);
+    token = strtok(token, char_type);
+    if ((token == NULL) || (saveptr == NULL))
+        return 0;
+
+    tmp = token;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+            return 0;
+
+    tmp = saveptr;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+            return 0;
+
+    strncpy(beforestring, token, strlen(token) + 1);
+    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
+    return 1;
+}
+
+int checkargvalid(int delay_interval, int cons_num, int type)
+{
+    if (type == INTERVAL_SEC)
+    {
+        if ((delay_interval < 1) || (delay_interval > 60))
+            return 0;
+    }
+    else if (type != INTERVAL_INVALID)
+    {
+        if ((delay_interval < 1) || (delay_interval >= 1000))
+            return 0;
+    }
+    
+    if ((cons_num < 3) || (cons_num > 1000))
+        return 0;
+
+    return 1;
+}
+
+int get_delay_pref_arg(char *args, int *delay_interval, int *cons_num, int *type)
+{
+    char delayintervalstr[MAX_CHAR_SIZE];
+    char consnumstr[MAX_CHAR_SIZE];
+
+    if (get_string_from_under(args, delayintervalstr, consnumstr, type) == 0)
+        return 0;
+
+    *delay_interval = atoi(delayintervalstr);
+    *cons_num = atoi(consnumstr);
+
+    if (checkargvalid(*delay_interval, *cons_num, *type) == 0)
+        return 0;
+    
+    return 1;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+    
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, delay_interval, cons_num, type, temp;
+    long long delay, avgdelay, ratio;
+    long long min = THRES_USEC_VALUE;
+    long long max = 0;
+    long long toldelay = 0;
+    long long before, after;
+    struct timeval tv;
+
+    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
+    {
+        condlog(3, "%s: get delay arg fail", pp->dev);
+        delay_interval = DEFAULT_DELAY_INTERVAL;
+        cons_num = DEFAULT_CONS_NUM;
+        type = INTERVAL_MSEC;
+    }
+
+    temp = cons_num;
+    while (temp-- > 0)
+    {
+        (void)gettimeofday(&tv, NULL);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return 1;
+        }
+        
+        (void)gettimeofday(&tv, NULL);
+        after = timeval_to_us(&tv);
+
+        delay = after - before;
+        if (delay < 0)
+        {
+            condlog(0, "%s: delay calc error", pp->dev);
+            return 1;
+        }
+    	
+        min = (min <= delay) ? min : delay;
+        max = (max >= delay) ? max : delay;
+                
+        toldelay += delay;
+    }
+
+    toldelay -= min + max;
+    avgdelay = toldelay/(long long)(cons_num - 2);
+    if (avgdelay > THRES_USEC_VALUE) 
+    {           
+        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
+        return 1;
+    }
+    
+	ratio = get_conversion_ratio(type);
+	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long long)delay_interval) * ratio)));
+
+    return rc;
+}
diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
new file mode 100644
index 0000000..ca89702
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.h
@@ -0,0 +1,14 @@
+#ifndef _DELAYEDPATH_H
+#define _DELAYEDPATH_H
+
+#define PRIO_DELAYED_PATH "delayedpath"
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static inline long long timeval_to_us(const struct timeval *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + tv->tv_usec;
+}
+
+#endif
-- 
2.6.4.windows.1

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-08  3:58 [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Yang Feng
@ 2017-05-10 22:36 ` Xose Vazquez Perez
  2017-05-11  4:57   ` Philip Yang
  2017-05-11 11:14 ` Martin Wilck
  1 sibling, 1 reply; 19+ messages in thread
From: Xose Vazquez Perez @ 2017-05-10 22:36 UTC (permalink / raw)
  To: Yang Feng, dm-devel, Christophe Varoqui
  Cc: guanjunxiong, hege09, zouming.zouming, shenhong09

On 05/08/2017 05:58 AM, Yang Feng wrote:

> Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a time-delay algorithm. And the
> time-delay algorithm is dependent on the following arguments(delay_interval,
> cons_num).
This new feature should be documented in multipath/multipath.conf.5

> diff --git a/libmultipath/checkers/Makefile b/libmultipath/checkers/Makefile
> index 4970fc0..7e433ca 100644
> --- a/libmultipath/checkers/Makefile
> +++ b/libmultipath/checkers/Makefile
> @@ -14,19 +14,16 @@ LIBS= \
>  	libcheckemc_clariion.so \
>  	libcheckhp_sw.so \
>  	libcheckrdac.so
> -ifneq ($(ENABLE_RADOS),0)
> -LIBS += libcheckrbd.so
> -endif

Is it right?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-10 22:36 ` Xose Vazquez Perez
@ 2017-05-11  4:57   ` Philip Yang
  0 siblings, 0 replies; 19+ messages in thread
From: Philip Yang @ 2017-05-11  4:57 UTC (permalink / raw)
  To: Xose Vazquez Perez, Christophe Varoqui
  Cc: dm-devel, hege09, zouming.zouming, guanjunxiong, shenhong09

Hi Xose and Christophe,

On 2017/5/11 6:36, Xose Vazquez Perez wrote:
> On 05/08/2017 05:58 AM, Yang Feng wrote:
>
>> Prioritizer for device mapper multipath, where the corresponding priority
>> values of specific paths are provided by a time-delay algorithm. And the
>> time-delay algorithm is dependent on the following arguments(delay_interval,
>> cons_num).
> This new feature should be documented in multipath/multipath.conf.5
>
multipath/multipath.conf.5 has be documented in the following patch.

>> diff --git a/libmultipath/checkers/Makefile b/libmultipath/checkers/Makefile
>> index 4970fc0..7e433ca 100644
>> --- a/libmultipath/checkers/Makefile
>> +++ b/libmultipath/checkers/Makefile
>> @@ -14,19 +14,16 @@ LIBS= \
>>  	libcheckemc_clariion.so \
>>  	libcheckhp_sw.so \
>>  	libcheckrdac.so
>> -ifneq ($(ENABLE_RADOS),0)
>> -LIBS += libcheckrbd.so
>> -endif
>
> Is it right?
>
>
Thanks, fixed as the flollowing patch.


---
Prioritizer for device mapper multipath, where the corresponding priority values of specific paths are provided by a time-delay algorithm. And the time-delay algorithm is dependent on the following arguments(delay_interval, cons_num).
The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path
   continuously, the IOs' average delay can be calculated.
2. According to the average delay of each path and the weight value
   "delay_interval", the priority "rc" of each path can be provided.

     delay_interval  delay_interval  delay_interval       delay_interval
    |---------------|---------------|---------------|	 |---------------|
    |priority rank 1|priority rank 2|priority rank 3|... |priority rank x|
    |---------------|---------------|---------------|    |---------------|
                       Priority Rank Partitioning
---
 libmultipath/Makefile                   |   2 +-
 libmultipath/checkers/Makefile          |   4 +-
 libmultipath/checkers/emc_clariion.c    |   2 +-
 libmultipath/checkers/libsg.c           |  94 ------------
 libmultipath/checkers/libsg.h           |   9 --
 libmultipath/checkers/readsector0.c     |   2 +-
 libmultipath/libsg.c                    |  94 ++++++++++++
 libmultipath/libsg.h                    |   9 ++
 libmultipath/prioritizers/Makefile      |   6 +-
 libmultipath/prioritizers/delayedpath.c | 246 ++++++++++++++++++++++++++++++++
 libmultipath/prioritizers/delayedpath.h |  14 ++
 multipath/multipath.conf.5              |  19 +++++++++++++++++++
 12 files changed, 392 insertions(+), 109 deletions(-)  delete mode 100644 libmultipath/checkers/libsg.c  delete mode 100644 libmultipath/checkers/libsg.h  create mode 100644 libmultipath/libsg.c  create mode 100644 libmultipath/libsg.h  create mode 100644 libmultipath/prioritizers/delayedpath.c
 create mode 100644 libmultipath/prioritizers/delayedpath.h

diff --git a/libmultipath/Makefile b/libmultipath/Makefile index 1f5ec25..a4d725a 100644
--- a/libmultipath/Makefile
+++ b/libmultipath/Makefile
@@ -41,7 +41,7 @@ OBJS = memory.o parser.o vector.o devmapper.o callout.o \
 	structs.o discovery.o propsel.o dict.o \
 	pgpolicies.o debug.o defaults.o uevent.o time-util.o \
 	switchgroup.o uxsock.o print.o alias.o log_pthread.o \
-	log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
+	log.o configure.o structs_vec.o sysfs.o libsg.o prio.o checkers.o \
 	lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o

 all: $(LIBS)
diff --git a/libmultipath/checkers/Makefile b/libmultipath/checkers/Makefile index 4970fc0..7e433ca 100644
--- a/libmultipath/checkers/Makefile
+++ b/libmultipath/checkers/Makefile
@@ -14,19 +14,16 @@ LIBS= \
 	libcheckemc_clariion.so \
 	libcheckhp_sw.so \
 	libcheckrdac.so
 ifneq ($(ENABLE_RADOS),0)
 LIBS += libcheckrbd.so
 endif

 all: $(LIBS)

 libcheckrbd.so: rbd.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lrados -ludev

-libcheckdirectio.so: libsg.o directio.o
+libcheckdirectio.so: ../libsg.o directio.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -laio

-libcheck%.so: libsg.o %.o
+libcheck%.so: ../libsg.o %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

 install:
diff --git a/libmultipath/checkers/emc_clariion.c b/libmultipath/checkers/emc_clariion.c
index 9c1ffed..e4ba757 100644
--- a/libmultipath/checkers/emc_clariion.c
+++ b/libmultipath/checkers/emc_clariion.c
@@ -12,7 +12,7 @@
 #include <errno.h>

 #include "../libmultipath/sg_include.h"
-#include "libsg.h"
+#include "../libmultipath/libsg.h"
 #include "checkers.h"
 #include "debug.h"
 #include "memory.h"
diff --git a/libmultipath/checkers/libsg.c b/libmultipath/checkers/libsg.c deleted file mode 100644 index 958ea92..0000000
--- a/libmultipath/checkers/libsg.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Christophe Varoqui
- */
-#include <string.h>
-#include <sys/ioctl.h>
-#include <errno.h>
-#include <sys/stat.h>
-
-#include "checkers.h"
-#include "libsg.h"
-#include "../libmultipath/sg_include.h"
-
-int
-sg_read (int sg_fd, unsigned char * buff, int buff_len,
-	 unsigned char * sense, int sense_len, unsigned int timeout)
-{
-	/* defaults */
-	int blocks;
-	long long start_block = 0;
-	int bs = 512;
-	int cdbsz = 10;
-
-	unsigned char rdCmd[cdbsz];
-	unsigned char *sbb = sense;
-	struct sg_io_hdr io_hdr;
-	int res;
-	int rd_opcode[] = {0x8, 0x28, 0xa8, 0x88};
-	int sz_ind;
-	struct stat filestatus;
-	int retry_count = 3;
-
-	if (fstat(sg_fd, &filestatus) != 0)
-		return PATH_DOWN;
-	bs = (filestatus.st_blksize > 4096)? 4096: filestatus.st_blksize;
-	blocks = buff_len / bs;
-	memset(rdCmd, 0, cdbsz);
-	sz_ind = 1;
-	rdCmd[0] = rd_opcode[sz_ind];
-	rdCmd[2] = (unsigned char)((start_block >> 24) & 0xff);
-	rdCmd[3] = (unsigned char)((start_block >> 16) & 0xff);
-	rdCmd[4] = (unsigned char)((start_block >> 8) & 0xff);
-	rdCmd[5] = (unsigned char)(start_block & 0xff);
-	rdCmd[7] = (unsigned char)((blocks >> 8) & 0xff);
-	rdCmd[8] = (unsigned char)(blocks & 0xff);
-
-	memset(&io_hdr, 0, sizeof(struct sg_io_hdr));
-	io_hdr.interface_id = 'S';
-	io_hdr.cmd_len = cdbsz;
-	io_hdr.cmdp = rdCmd;
-	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
-	io_hdr.dxfer_len = bs * blocks;
-	io_hdr.dxferp = buff;
-	io_hdr.mx_sb_len = sense_len;
-	io_hdr.sbp = sense;
-	io_hdr.timeout = timeout * 1000;
-	io_hdr.pack_id = (int)start_block;
-
-retry:
-	memset(sense, 0, sense_len);
-	while (((res = ioctl(sg_fd, SG_IO, &io_hdr)) < 0) && (EINTR == errno));
-
-	if (res < 0) {
-		if (ENOMEM == errno) {
-			return PATH_UP;
-		}
-		return PATH_DOWN;
-	}
-
-	if ((0 == io_hdr.status) &&
-	    (0 == io_hdr.host_status) &&
-	    (0 == io_hdr.driver_status)) {
-		return PATH_UP;
-	} else {
-		int key = 0;
-
-		if (io_hdr.sb_len_wr > 3) {
-			if (sbb[0] == 0x72 || sbb[0] == 0x73)
-				key = sbb[1] & 0x0f;
-			else if (io_hdr.sb_len_wr > 13 &&
-				 ((sbb[0] & 0x7f) == 0x70 ||
-				  (sbb[0] & 0x7f) == 0x71))
-				key = sbb[2] & 0x0f;
-		}
-
-		/*
-		 * Retry if UNIT_ATTENTION check condition.
-		 */
-		if (key == 0x6) {
-			if (--retry_count)
-				goto retry;
-		}
-		return PATH_DOWN;
-	}
-}
diff --git a/libmultipath/checkers/libsg.h b/libmultipath/checkers/libsg.h deleted file mode 100644 index 3994f45..0000000
--- a/libmultipath/checkers/libsg.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _LIBSG_H
-#define _LIBSG_H
-
-#define SENSE_BUFF_LEN 32
-
-int sg_read (int sg_fd, unsigned char * buff, int buff_len,
-	     unsigned char * sense, int sense_len, unsigned int timeout);
-
-#endif /* _LIBSG_H */
diff --git a/libmultipath/checkers/readsector0.c b/libmultipath/checkers/readsector0.c
index 8fccb46..d70c5c5 100644
--- a/libmultipath/checkers/readsector0.c
+++ b/libmultipath/checkers/readsector0.c
@@ -4,7 +4,7 @@
 #include <stdio.h>

 #include "checkers.h"
-#include "libsg.h"
+#include "../libmultipath/libsg.h"

 #define MSG_READSECTOR0_UP	"readsector0 checker reports path is up"
 #define MSG_READSECTOR0_DOWN	"readsector0 checker reports path is down"
diff --git a/libmultipath/libsg.c b/libmultipath/libsg.c new file mode 100644 index 0000000..99c91a4
--- /dev/null
+++ b/libmultipath/libsg.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2004, 2005 Christophe Varoqui  */ #include <string.h>
+#include <sys/ioctl.h> #include <errno.h> #include <sys/stat.h>
+
+#include "checkers.h"
+#include "libsg.h"
+#include "sg_include.h"
+
+int
+sg_read (int sg_fd, unsigned char * buff, int buff_len,
+	 unsigned char * sense, int sense_len, unsigned int timeout) {
+	/* defaults */
+	int blocks;
+	long long start_block = 0;
+	int bs = 512;
+	int cdbsz = 10;
+
+	unsigned char rdCmd[cdbsz];
+	unsigned char *sbb = sense;
+	struct sg_io_hdr io_hdr;
+	int res;
+	int rd_opcode[] = {0x8, 0x28, 0xa8, 0x88};
+	int sz_ind;
+	struct stat filestatus;
+	int retry_count = 3;
+
+	if (fstat(sg_fd, &filestatus) != 0)
+		return PATH_DOWN;
+	bs = (filestatus.st_blksize > 4096)? 4096: filestatus.st_blksize;
+	blocks = buff_len / bs;
+	memset(rdCmd, 0, cdbsz);
+	sz_ind = 1;
+	rdCmd[0] = rd_opcode[sz_ind];
+	rdCmd[2] = (unsigned char)((start_block >> 24) & 0xff);
+	rdCmd[3] = (unsigned char)((start_block >> 16) & 0xff);
+	rdCmd[4] = (unsigned char)((start_block >> 8) & 0xff);
+	rdCmd[5] = (unsigned char)(start_block & 0xff);
+	rdCmd[7] = (unsigned char)((blocks >> 8) & 0xff);
+	rdCmd[8] = (unsigned char)(blocks & 0xff);
+
+	memset(&io_hdr, 0, sizeof(struct sg_io_hdr));
+	io_hdr.interface_id = 'S';
+	io_hdr.cmd_len = cdbsz;
+	io_hdr.cmdp = rdCmd;
+	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	io_hdr.dxfer_len = bs * blocks;
+	io_hdr.dxferp = buff;
+	io_hdr.mx_sb_len = sense_len;
+	io_hdr.sbp = sense;
+	io_hdr.timeout = timeout * 1000;
+	io_hdr.pack_id = (int)start_block;
+
+retry:
+	memset(sense, 0, sense_len);
+	while (((res = ioctl(sg_fd, SG_IO, &io_hdr)) < 0) && (EINTR ==
+errno));
+
+	if (res < 0) {
+		if (ENOMEM == errno) {
+			return PATH_UP;
+		}
+		return PATH_DOWN;
+	}
+
+	if ((0 == io_hdr.status) &&
+	    (0 == io_hdr.host_status) &&
+	    (0 == io_hdr.driver_status)) {
+		return PATH_UP;
+	} else {
+		int key = 0;
+
+		if (io_hdr.sb_len_wr > 3) {
+			if (sbb[0] == 0x72 || sbb[0] == 0x73)
+				key = sbb[1] & 0x0f;
+			else if (io_hdr.sb_len_wr > 13 &&
+				 ((sbb[0] & 0x7f) == 0x70 ||
+				  (sbb[0] & 0x7f) == 0x71))
+				key = sbb[2] & 0x0f;
+		}
+
+		/*
+		 * Retry if UNIT_ATTENTION check condition.
+		 */
+		if (key == 0x6) {
+			if (--retry_count)
+				goto retry;
+		}
+		return PATH_DOWN;
+	}
+}
diff --git a/libmultipath/libsg.h b/libmultipath/libsg.h new file mode 100644 index 0000000..3994f45
--- /dev/null
+++ b/libmultipath/libsg.h
@@ -0,0 +1,9 @@
+#ifndef _LIBSG_H
+#define _LIBSG_H
+
+#define SENSE_BUFF_LEN 32
+
+int sg_read (int sg_fd, unsigned char * buff, int buff_len,
+	     unsigned char * sense, int sense_len, unsigned int timeout);
+
+#endif /* _LIBSG_H */
diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..7e3da51 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriodelayedpath.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriodelayedpath.so: delayedpath.o  ../libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/delayedpath.c b/libmultipath/prioritizers/delayedpath.c
new file mode 100644
index 0000000..4c1cfea
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.c
@@ -0,0 +1,246 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding
+priority
+ * values of specific paths are provided by a time-delay algorithm. And
+the
+ * time-delay algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "cons_num" of read IOs to the current path
+ *    continuously, the IOs' average delay can be calculated.
+ * 2. According to the average delay of each path and the weight value
+ *    "delay_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../libmultipath/libsg.h"
+
+#include "delayedpath.h"
+
+#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
+#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
+#define DEFAULT_CONS_NUM        20
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_SEC                "SEC"
+#define CHAR_MSEC               "MSEC"
+#define CHAR_USEC               "USEC"
+
+enum interval_type {
+    INTERVAL_SEC,
+    INTERVAL_MSEC,
+    INTERVAL_USEC,
+    INTERVAL_INVALID
+};
+
+static int conversion_ratio[] = {
+	[INTERVAL_SEC]		= USEC_PER_SEC,
+	[INTERVAL_MSEC]	    = USEC_PER_MSEC,
+	[INTERVAL_USEC]		= USEC_PER_USEC,
+	[INTERVAL_INVALID]	= 0,
+};
+
+
+static int do_readsector0(int fd, unsigned int timeout) {
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+static int get_interval_type(char *source, char *type) {
+    /*is USEC*/
+    if ((strstr(source, CHAR_USEC) != NULL)
+        && (strstr(source, CHAR_USEC)[4] == '|'))
+    {
+        memcpy(type, CHAR_USEC, strlen(CHAR_USEC)+1);
+        return INTERVAL_USEC;
+    }
+
+    /*is MSEC*/
+    if ((strstr(source, CHAR_MSEC) != NULL)
+        && (strstr(source, CHAR_MSEC)[4] == '|'))
+    {
+        memcpy(type, CHAR_MSEC, strlen(CHAR_MSEC)+1);
+        return INTERVAL_MSEC;
+    }
+
+    /*is SEC*/
+	if ((strstr(source, CHAR_SEC) != NULL)
+        && (strstr(source, CHAR_SEC)[4] == '|'))
+    {
+        memcpy(type, CHAR_SEC, strlen(CHAR_SEC)+1);
+        return INTERVAL_SEC;
+    }
+
+    return INTERVAL_INVALID;
+}
+
+static int get_string_from_vertica(char *args,
+                                        char *beforestring,
+                                        char *afterstring,
+                                        int *type) {
+    char source[MAX_CHAR_SIZE];
+    char char_type[MAX_CHAR_SIZE];
+    char vertica[] = "|";
+    char *token  = NULL;
+    char *tmp = NULL;
+    char *saveptr = NULL;
+    unsigned int size = strlen(args);
+
+    if ((args == NULL) || (beforestring == NULL)
+        || (afterstring == NULL) || (type == NULL))
+        return 0;
+
+    /* int type */
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+        return 0;
+
+    memcpy(source, args, size+1);
+    if (strstr(source, vertica) == NULL)
+        return 0;
+
+    *type = get_interval_type(source, char_type);
+    if (*type == INTERVAL_INVALID)
+        return 0;
+
+    token = strtok_r(source, vertica, &saveptr);
+    token = strtok(token, char_type);
+    if ((token == NULL) || (saveptr == NULL))
+        return 0;
+
+    tmp = token;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+            return 0;
+
+    tmp = saveptr;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+            return 0;
+
+    strncpy(beforestring, token, strlen(token) + 1);
+    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
+    return 1;
+}
+
+int checkargvalid(int delay_interval, int cons_num, int type) {
+    if (type == INTERVAL_SEC)
+    {
+        if ((delay_interval < 1) || (delay_interval > 60))
+            return 0;
+    }
+    else if (type != INTERVAL_INVALID)
+    {
+        if ((delay_interval < 1) || (delay_interval >= 1000))
+            return 0;
+    }
+
+    if ((cons_num < 3) || (cons_num > 1000))
+        return 0;
+
+    return 1;
+}
+
+int get_delay_pref_arg(char *args, int *delay_interval, int *cons_num,
+int *type) {
+    char delayintervalstr[MAX_CHAR_SIZE];
+    char consnumstr[MAX_CHAR_SIZE];
+
+    if (get_string_from_vertica(args, delayintervalstr, consnumstr, type) == 0)
+        return 0;
+
+    *delay_interval = atoi(delayintervalstr);
+    *cons_num = atoi(consnumstr);
+
+    if (checkargvalid(*delay_interval, *cons_num, *type) == 0)
+        return 0;
+
+    return 1;
+}
+
+long long get_conversion_ratio(int type) {
+    return conversion_ratio[type];
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout) {
+    int rc, delay_interval, cons_num, type, temp;
+    long long delay, avgdelay, ratio;
+    long long min = THRES_USEC_VALUE;
+    long long max = 0;
+    long long toldelay = 0;
+    long long before, after;
+    struct timeval tv;
+
+    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
+    {
+        condlog(3, "%s: get delay arg fail", pp->dev);
+        delay_interval = DEFAULT_DELAY_INTERVAL;
+        cons_num = DEFAULT_CONS_NUM;
+        type = INTERVAL_MSEC;
+    }
+
+    temp = cons_num;
+    while (temp-- > 0)
+    {
+        (void)gettimeofday(&tv, NULL);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return 1;
+        }
+
+        (void)gettimeofday(&tv, NULL);
+        after = timeval_to_us(&tv);
+
+        delay = after - before;
+        if (delay < 0)
+        {
+            condlog(0, "%s: delay calc error", pp->dev);
+            return 1;
+        }
+    	
+        min = (min <= delay) ? min : delay;
+        max = (max >= delay) ? max : delay;
+
+        toldelay += delay;
+    }
+
+    toldelay -= min + max;
+    avgdelay = toldelay/(long long)(cons_num - 2);
+    if (avgdelay > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
+        return 1;
+    }
+
+	ratio = get_conversion_ratio(type);
+	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long long)delay_interval) *
+ratio)));
+
+    return rc;
+}
diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
new file mode 100644
index 0000000..ca89702
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.h
@@ -0,0 +1,14 @@
+#ifndef _DELAYEDPATH_H
+#define _DELAYEDPATH_H
+
+#define PRIO_DELAYED_PATH "delayedpath"
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static inline long long timeval_to_us(const struct timeval *tv) {
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + tv->tv_usec; }
+
+#endif
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..b787634 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I delayedpath
+Generate the path priority based on a time-delay algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,21 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I delayed
+Needs a value of the form
+\fI"<delay_interval|cons_num>"\fR
+.RS
+.TP 8
+.I delay_interval
+The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXSEC, or XXXUSEC, or XXXMSEC. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, SEC [1, 60], USEC [1, 1000), MSEC [1, 1000),
+For example: 10SEC, or 100USEC, or 100MSEC. The default is: 10MSEC.
+.TP
+.I cons_num
+The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
+For example: 30. The default is: 20.
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
-- 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-08  3:58 [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Yang Feng
  2017-05-10 22:36 ` Xose Vazquez Perez
@ 2017-05-11 11:14 ` Martin Wilck
  2017-05-15 10:44   ` Yang Feng
  1 sibling, 1 reply; 19+ messages in thread
From: Martin Wilck @ 2017-05-11 11:14 UTC (permalink / raw)
  To: Yang Feng, dm-devel; +Cc: guanjunxiong, hege09, zouming.zouming, shenhong09

Hello Yang,

thank you for your work. Please find my remarks below.

On Mon, 2017-05-08 at 11:58 +0800, Yang Feng wrote:
> Prioritizer for device mapper multipath, where the corresponding
> priority
> values of specific paths are provided by a time-delay algorithm. And
> the
> time-delay algorithm is dependent on the following
> arguments(delay_interval,
> cons_num).
> The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current
> path
>    continuously, the IOs' average delay can be calculated.
> 2. According to the average delay of each path and the weight value
>    "delay_interval", the priority "rc" of each path can be provided.
> 
>      delay_interval  delay_interval  delay_interval       delay_inter

How does this algorithm behave under load? Can we be sure that
priorities don't start to fluctuate wildly because busy paths will
usually have longer latencies than idle ones?


> val
>     |---------------|---------------|---------------|	 |----
> -----------|
>     |priority rank1 |priority rank2 |priority rank3 |... |priority
> rank4 |
>     |---------------|---------------|---------------|    |-----------
> ----|
>                        Priority Rank Partitioning
> ---
>  libmultipath/Makefile                   |   2 +-
>  libmultipath/checkers/Makefile          |   7 +-
>  libmultipath/checkers/emc_clariion.c    |   2 +-
>  libmultipath/checkers/libsg.c           |  94 ------------
>  libmultipath/checkers/libsg.h           |   9 --
>  libmultipath/checkers/readsector0.c     |   2 +-
>  libmultipath/libsg.c                    |  94 ++++++++++++
>  libmultipath/libsg.h                    |   9 ++
>  libmultipath/prioritizers/Makefile      |   6 +-
>  libmultipath/prioritizers/delayedpath.c | 246 

Why do you have to move libsg for this? It's already used by various
checkers, why can't your checker do the same? If you really need to do
it, you should at least separate that part of the patch from the added
code.

> diff --git a/libmultipath/prioritizers/delayedpath.c
> b/libmultipath/prioritizers/delayedpath.c
> new file mode 100644
> index 0000000..4c1cfea
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.c
> @@ -0,0 +1,246 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights
> Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding
> priority 
> + * values of specific paths are provided by a time-delay algorithm.
> And the
> + * time-delay algorithm is dependent on arguments.
> + * 
> + * The principle of the algorithm as follows: 
> + * 1. By sending a certain number "cons_num" of read IOs to the
> current path 
> + *    continuously, the IOs' average delay can be calculated. 
> + * 2. According to the average delay of each path and the weight
> value 
> + *    "delay_interval", the priority "rc" of each path can be
> provided. 
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <ctype.h>
> +#include <sys/time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../libmultipath/libsg.h"
> +
> +#include "delayedpath.h"
> +
> +#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
> +#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
> +#define DEFAULT_CONS_NUM        20    
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_SEC                "SEC"
> +#define CHAR_MSEC               "MSEC"
> +#define CHAR_USEC               "USEC"

I suggest to use "s", "ms", and "us" here instead.

If you create an array of "const char*" instead like you did for
conversion_ratio below, you could implement get_interval_type() more
elegantly using a loop over that array.

> +
> +enum interval_type {
> +    INTERVAL_SEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_USEC,
> +    INTERVAL_INVALID
> +};
> +
> +static int conversion_ratio[] = {
> +	[INTERVAL_SEC]		= USEC_PER_SEC,
> +	[INTERVAL_MSEC]	    = USEC_PER_MSEC,
> +	[INTERVAL_USEC]		= USEC_PER_USEC,
> +	[INTERVAL_INVALID]	= 0,
> +};
> +
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +    
> +	return ret;
> +}
> +
> +static int get_interval_type(char *source, char *type)
> +{  
> +    /*is USEC*/
> +    if ((strstr(source, CHAR_USEC) != NULL)
> +        && (strstr(source, CHAR_USEC)[4] == '_'))

Please avoid these double strstr() invocation. The compiler may
optimize it away, but it just looks strange. The following would 
look better to me, and I find it actually more readable:

        if (((p = strstr(source, CHAR_USEC)) != NULL) && p[4] == '_')

> +    {
> +        memcpy(type, CHAR_USEC, strlen(CHAR_USEC)+1);        
> +        return INTERVAL_USEC;
> +    }
> +
> +    /*is MSEC*/
> +    if ((strstr(source, CHAR_MSEC) != NULL) 
> +        && (strstr(source, CHAR_MSEC)[4] == '_'))
> +    {
> +        memcpy(type, CHAR_MSEC, strlen(CHAR_MSEC)+1);
> +        return INTERVAL_MSEC;
> +    }
> +
> +    /*is SEC*/
> +	if ((strstr(source, CHAR_SEC) != NULL)
> +        && (strstr(source, CHAR_SEC)[4] == '_'))
> +    {
> +        memcpy(type, CHAR_SEC, strlen(CHAR_SEC)+1);
> +        return INTERVAL_SEC;
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +static int get_string_from_under(char *args,
> +                                        char *beforestring,
> +                                        char *afterstring,
> +                                        int *type)

Maybe you could figure out a more descriptive name for this function?

A comment in the code showing how the string to be parsed typically
looks like would be helpful for the reader.

> +{
> +    char source[MAX_CHAR_SIZE];
> +    char char_type[MAX_CHAR_SIZE];
> +    char under[] = "_";
> +    char *token  = NULL;
> +    char *tmp = NULL;
> +    char *saveptr = NULL;
> +    unsigned int size = strlen(args);
> +
> +    if ((args == NULL) || (beforestring == NULL) 
> +        || (afterstring == NULL) || (type == NULL))
> +        return 0;
> +
> +    /* int type */
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +        return 0;
> +    
> +    memcpy(source, args, size+1);
> +    if (strstr(source, under) == NULL)
> +        return 0;
> +
> +    *type = get_interval_type(source, char_type);
> +    if (*type == INTERVAL_INVALID)
> +        return 0;
> +
> +    token = strtok_r(source, under, &saveptr);
> +    token = strtok(token, char_type);

I'm pretty sure this is is not what you intended to write. If char_type
is "usec", this would split the string at the possible delimiters 'u',
's', 'e', and 'c' (the 2nd argument of strtok(3) is not a sequence, but
a 'set' of bytes). It might accidentally work with the input strings
you are using (in particular because you only look at the first token),
but nevertheless it's wrong.

> +    if ((token == NULL) || (saveptr == NULL))
> +        return 0;
> +
> +    tmp = token;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +            return 0;
> +
> +    tmp = saveptr;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +            return 0;
> +
> +    strncpy(beforestring, token, strlen(token) + 1);
> +    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
> +    return 1;
> +}

I don't think it's safe to use saveptr the way you do it. The strtok_r
man page says this parameter is for "internal use". While it makes
sense to assume that it points to the next token, I'm not sure if
that's guaranteed. You would be safe by calling 

    somevar = strtok_r(NULL, under, &saveptr)

and use "somevar".

In general, this whole parsing code is odd. IIUC this parses input
looking like ([0-9]+)(SEC|MSEC|USEC)_([0-9]+) and sets beforestring,
type, and afterstring to the regex matches \1, \2, and \3,
respectively.

Why don't you start parsing from the beginning of the input, e.g. with
strtoul(), and look at the rest later?

> +
> +int checkargvalid(int delay_interval, int cons_num, int type)
> +{
> +    if (type == INTERVAL_SEC)
> +    {
> +        if ((delay_interval < 1) || (delay_interval > 60))
> +            return 0;
> +    }
> +    else if (type != INTERVAL_INVALID)
> +    {
> +        if ((delay_interval < 1) || (delay_interval >= 1000))
> +            return 0;
> +    }

You could be more forgiving here. 15000MSEC could be a legal value.

> +    
> +    if ((cons_num < 3) || (cons_num > 1000))
> +        return 0;
> +
> +    return 1;
> +}
> +
> +int get_delay_pref_arg(char *args, int *delay_interval, int
> *cons_num, int *type)
> +{
> +    char delayintervalstr[MAX_CHAR_SIZE];
> +    char consnumstr[MAX_CHAR_SIZE];
> +
> +    if (get_string_from_under(args, delayintervalstr, consnumstr,
> type) == 0)
> +        return 0;

It might be good to write the parser so that the consnum part can be
left out by the user, and assume a reasonable default in that case.

> +
> +    *delay_interval = atoi(delayintervalstr);
> +    *cons_num = atoi(consnumstr);
> +
> +    if (checkargvalid(*delay_interval, *cons_num, *type) == 0)
> +        return 0;
> +    
> +    return 1;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +    
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, delay_interval, cons_num, type, temp;
> +    long long delay, avgdelay, ratio;
> +    long long min = THRES_USEC_VALUE;
> +    long long max = 0;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timeval tv;
> +
> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type)
> == 0)
> +    {
> +        condlog(3, "%s: get delay arg fail", pp->dev);
> +        delay_interval = DEFAULT_DELAY_INTERVAL;
> +        cons_num = DEFAULT_CONS_NUM;
> +        type = INTERVAL_MSEC;
> +    }
> +
> +    temp = cons_num;
> +    while (temp-- > 0)
> +    {
> +        (void)gettimeofday(&tv, NULL);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return 1;
> +        }
> +        
> +        (void)gettimeofday(&tv, NULL);

It's better to use clock_gettime(CLOCK_MONOTONIC, ...) here. Then you
can throw away the delay < 0 check below.

> +        after = timeval_to_us(&tv);
> +
> +        delay = after - before;
> +        if (delay < 0)
> +        {
> +            condlog(0, "%s: delay calc error", pp->dev);
> +            return 1;
> +        }
> +    	
> +        min = (min <= delay) ? min : delay;
> +        max = (max >= delay) ? max : delay;
> +                
> +        toldelay += delay;
> +    }
> +
> +    toldelay -= min + max;
> +    avgdelay = toldelay/(long long)(cons_num - 2);
> +    if (avgdelay > THRES_USEC_VALUE) 
> +    {           
> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
> +        return 1;
> +    }
> +    
> +	ratio = get_conversion_ratio(type);
> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long
> long)delay_interval) * ratio)));
> +
> +    return rc;
> +}

Is it reasonable to do these interval calculations synchronously in
getprio()? cons_num is limited to 1000, so this routine could issue
1000 reads on the device before returning. In particular if the device
is under IO load and the delay is high, execution if this routine could
be really slow.

It would make more sense to me to have a separate thread that
calculates some sort of "running average" for the delay of the
different paths, and have getprio() just fetch the current value of
that variable.

Regards
Martin

-- 
Dr. Martin Wilck <mwilck@suse.com>, Tel. +49 (0)911 74053 2107
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-11 11:14 ` Martin Wilck
@ 2017-05-15 10:44   ` Yang Feng
  2017-05-16 14:53     ` Yang Feng
                       ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-15 10:44 UTC (permalink / raw)
  To: Martin Wilck
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09

Hello Martin,

Thank you very much for your remarks. I am sorry for late reply, Please find my answer and the updated patch.

> Hello Yang,
> 
> thank you for your work. Please find my remarks below.
> 
> On Mon, 2017-05-08 at 11:58 +0800, Yang Feng wrote:
>> Prioritizer for device mapper multipath, where the corresponding
>> priority
>> values of specific paths are provided by a time-delay algorithm. And
>> the
>> time-delay algorithm is dependent on the following
>> arguments(delay_interval,
>> cons_num).
>> The principle of the algorithm is illustrated as follows:
>> 1. By sending a certain number "cons_num" of read IOs to the current
>> path
>>    continuously, the IOs' average delay can be calculated.
>> 2. According to the average delay of each path and the weight value
>>    "delay_interval", the priority "rc" of each path can be provided.
>>
>>      delay_interval  delay_interval  delay_interval       delay_inter
> 
> How does this algorithm behave under load? Can we be sure that
> priorities don't start to fluctuate wildly because busy paths will
> usually have longer latencies than idle ones?
I have a lot of test under load. When the appropriate value of argument "delay_interval" is set,
this algorithm behave well and can separate the paths who's average delay is more than others.
When add a new path or the path's state change from down to up, getprio() of the prioritizer is triggered, and
the current path is not under IOs.

>>  libmultipath/Makefile                   |   2 +-
>>  libmultipath/checkers/Makefile          |   7 +-
>>  libmultipath/checkers/emc_clariion.c    |   2 +-
>>  libmultipath/checkers/libsg.c           |  94 ------------
>>  libmultipath/checkers/libsg.h           |   9 --
>>  libmultipath/checkers/readsector0.c     |   2 +-
>>  libmultipath/libsg.c                    |  94 ++++++++++++
>>  libmultipath/libsg.h                    |   9 ++
>>  libmultipath/prioritizers/Makefile      |   6 +-
>>  libmultipath/prioritizers/delayedpath.c | 246 
> 
> Why do you have to move libsg for this? It's already used by various
> checkers, why can't your checker do the same? If you really need to do
> it, you should at least separate that part of the patch from the added
> code.
OK, this time, libsg will not be moved.

>> +
>> +#define CHAR_SEC                "SEC"
>> +#define CHAR_MSEC               "MSEC"
>> +#define CHAR_USEC               "USEC"
> 
> I suggest to use "s", "ms", and "us" here instead.
OK, as the following patch.

> If you create an array of "const char*" instead like you did for
> conversion_ratio below, you could implement get_interval_type() more
> elegantly using a loop over that array.
OK, as the following patch.

>> +static int get_interval_type(char *source, char *type)
>> +{  
>> +    /*is USEC*/
>> +    if ((strstr(source, CHAR_USEC) != NULL)
>> +        && (strstr(source, CHAR_USEC)[4] == '_'))
> 
> Please avoid these double strstr() invocation. The compiler may
> optimize it away, but it just looks strange. The following would 
> look better to me, and I find it actually more readable:
> 
>         if (((p = strstr(source, CHAR_USEC)) != NULL) && p[4] == '_')
OK, as the following patch.

>> +static int get_string_from_under(char *args,
>> +                                        char *beforestring,
>> +                                        char *afterstring,
>> +                                        int *type)
> 
> Maybe you could figure out a more descriptive name for this function?
> 
> A comment in the code showing how the string to be parsed typically
> looks like would be helpful for the reader.
OK, as the following patch.

>> +    token = strtok_r(source, under, &saveptr);
>> +    token = strtok(token, char_type);
> 
> I'm pretty sure this is is not what you intended to write. If char_type
> is "usec", this would split the string at the possible delimiters 'u',
> 's', 'e', and 'c' (the 2nd argument of strtok(3) is not a sequence, but
> a 'set' of bytes). It might accidentally work with the input strings
> you are using (in particular because you only look at the first token),
> but nevertheless it's wrong.
OK, as the following patch.

>> +    if ((token == NULL) || (saveptr == NULL))
>> +        return 0;
>> +
>> +    tmp = token;
>> +    while (*tmp != '\0')
>> +        if (!isdigit(*tmp++))
>> +            return 0;
>> +
>> +    tmp = saveptr;
>> +    while (*tmp != '\0')
>> +        if (!isdigit(*tmp++))
>> +            return 0;
>> +
>> +    strncpy(beforestring, token, strlen(token) + 1);
>> +    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
>> +    return 1;
>> +}
> 
> I don't think it's safe to use saveptr the way you do it. The strtok_r
> man page says this parameter is for "internal use". While it makes
> sense to assume that it points to the next token, I'm not sure if
> that's guaranteed. You would be safe by calling 
> 
>     somevar = strtok_r(NULL, under, &saveptr)
> 
> and use "somevar".
OK, as the following patch.

> 
> In general, this whole parsing code is odd. IIUC this parses input
> looking like ([0-9]+)(SEC|MSEC|USEC)_([0-9]+) and sets beforestring,
> type, and afterstring to the regex matches \1, \2, and \3,
> respectively.
> 
> Why don't you start parsing from the beginning of the input, e.g. with
> strtoul(), and look at the rest later?
OK, as the following patch.

>> +
>> +int checkargvalid(int delay_interval, int cons_num, int type)
>> +{
>> +    if (type == INTERVAL_SEC)
>> +    {
>> +        if ((delay_interval < 1) || (delay_interval > 60))
>> +            return 0;
>> +    }
>> +    else if (type != INTERVAL_INVALID)
>> +    {
>> +        if ((delay_interval < 1) || (delay_interval >= 1000))
>> +            return 0;
>> +    }
> 
> You could be more forgiving here. 15000MSEC could be a legal value.
Because this value is more than 1 second, you can use the unit second.

> 
>> +    
>> +    if ((cons_num < 3) || (cons_num > 1000))
>> +        return 0;
>> +
>> +    return 1;
>> +}
>> +
>> +int get_delay_pref_arg(char *args, int *delay_interval, int
>> *cons_num, int *type)
>> +{
>> +    char delayintervalstr[MAX_CHAR_SIZE];
>> +    char consnumstr[MAX_CHAR_SIZE];
>> +
>> +    if (get_string_from_under(args, delayintervalstr, consnumstr,
>> type) == 0)
>> +        return 0;
> 
> It might be good to write the parser so that the consnum part can be
> left out by the user, and assume a reasonable default in that case.
OK, as the following patch.

>> +    while (temp-- > 0)
>> +    {
>> +        (void)gettimeofday(&tv, NULL);
>> +        before = timeval_to_us(&tv);		
>> +
>> +        if (do_readsector0(pp->fd, timeout) == 2)
>> +        {
>> +            condlog(0, "%s: path down", pp->dev);
>> +            return 1;
>> +        }
>> +        
>> +        (void)gettimeofday(&tv, NULL);
> 
> It's better to use clock_gettime(CLOCK_MONOTONIC, ...) here. Then you
> can throw away the delay < 0 check below.
OK, as the following patch.

>> +    toldelay -= min + max;
>> +    avgdelay = toldelay/(long long)(cons_num - 2);
>> +    if (avgdelay > THRES_USEC_VALUE) 
>> +    {           
>> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
>> +        return 1;
>> +    }
>> +    
>> +	ratio = get_conversion_ratio(type);
>> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long
>> long)delay_interval) * ratio)));
>> +
>> +    return rc;
>> +}
> 
> Is it reasonable to do these interval calculations synchronously in
> getprio()? cons_num is limited to 1000, so this routine could issue
> 1000 reads on the device before returning. In particular if the device
> is under IO load and the delay is high, execution if this routine could
> be really slow.
> 
> It would make more sense to me to have a separate thread that
> calculates some sort of "running average" for the delay of the
> different paths, and have getprio() just fetch the current value of
> that variable.
> 
> Regards
> Martin
> 
When add a new path or the path's state change from down to up, getprio() of the prioritizer is triggered, and
the current path is not under IOs.
Usually, the 1000 reads will be finish fastly.
And have getprio() need to get the up-to-date average delay of the current path, these interval calculations synchronously
is essential because of unreasonable asynchronously process's executing time D-value, particulayly when add a new path or
the path's state change down to up.

Please find the up-to-date patch below:

---
From 1a9426dfbad00b5dbefc7020603e40e8896e4869 Mon Sep 17 00:00:00 2001
From: Yang Feng <philip.yang@huawei.com>
Date: Mon, 15 May 2017 18:33:29 +0800
Subject: [PATCH] [dm-devel] [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
 Prioritizer for device mapper multipath, where the corresponding priority values of specific paths are provided by a time-delay
 algorithm. And the time-delay algorithm is dependent on the following arguments(delay_interval, cons_num). The principle of the
 algorithm is illustrated as follows:
 1. By sending a certain number "cons_num" of read IOs to the current path    continuously, the IOs' average delay can be calculated.
 2. According to the average delay of each path and the weight value "delay_interval", the priority "rc" of each path can be provided.

           delay_interval  delay_interval  delay_interval      delay_interval
	 |---------------|---------------|---------------|   |---------------|
	 |priority rank 1|priority rank 2|priority rank 3|...|priority rank x|
         |---------------|---------------|---------------|   |---------------|
		               Priority Rank Partitioning
---
 libmultipath/prioritizers/Makefile      |   6 +-
 libmultipath/prioritizers/delayedpath.c | 261 ++++++++++++++++++++++++++++++++
 libmultipath/prioritizers/delayedpath.h |  17 +++
 multipath/multipath.conf.5              |  19 +++
 4 files changed, 302 insertions(+), 1 deletion(-)
 create mode 100644 libmultipath/prioritizers/delayedpath.c
 create mode 100644 libmultipath/prioritizers/delayedpath.h

diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..8df5234 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriodelayedpath.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriodelayedpath.so: delayedpath.o  ../checkers/libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/delayedpath.c b/libmultipath/prioritizers/delayedpath.c
new file mode 100644
index 0000000..0490e8d
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.c
@@ -0,0 +1,261 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority
+ * values of specific paths are provided by a time-delay algorithm. And the
+ * time-delay algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "cons_num" of read IOs to the current path
+ *    continuously, the IOs' average delay can be calculated.
+ * 2. According to the average delay of each path and the weight value
+ *    "delay_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../checkers/libsg.h"
+
+#include "delayedpath.h"
+
+#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
+#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
+#define DEFAULT_CONS_NUM        20
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_SEC                "s"
+#define CHAR_MSEC               "ms"
+#define CHAR_USEC               "us"
+
+enum interval_type {
+    INTERVAL_SEC,
+    INTERVAL_MSEC,
+    INTERVAL_USEC,
+    INTERVAL_INVALID
+};
+
+/* interval_unit_str and interval_unit_type keep the same assignment sequence */
+static const char interval_unit_str[][MAX_CHAR_SIZE] = {
+    CHAR_USEC, CHAR_MSEC, CHAR_SEC
+};
+static const int interval_unit_type[] = {
+    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
+};
+
+static const int conversion_ratio[] = {
+	[INTERVAL_SEC]		= USEC_PER_SEC,
+	[INTERVAL_MSEC]	        = USEC_PER_MSEC,
+	[INTERVAL_USEC]		= USEC_PER_USEC,
+	[INTERVAL_INVALID]	= 0
+};
+
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+static int get_interval_type(char *source, char *type)
+{
+    char *p;
+    int size;
+    int i;
+
+    for (i = 0; i < sizeof(interval_unit_str)/MAX_CHAR_SIZE; i++)
+    {
+        size = strlen(interval_unit_str[i]);
+        p = strstr(source, interval_unit_str[i]);
+        if (p != NULL && p[size] == '|')
+        {
+            memcpy(type, interval_unit_str[i], size+1);
+            return interval_unit_type[i];
+        }
+    }
+
+    return INTERVAL_INVALID;
+}
+
+/* In multipath.conf, args form: delay_interval|cons_num. For example,
+*  args is "10ms|20", this function can get 10, ms, and 20.
+*/
+static int get_digit_and_type(char *args,
+                              int *interval,
+                              int *consnum,
+                              int *type)
+{
+    char typestr[MAX_CHAR_SIZE];
+    char source[MAX_CHAR_SIZE];
+    char vertica[] = "|";
+    char *tokenbefore = NULL;
+    char *tokenafter = NULL;
+    char *tmp = NULL;
+    unsigned int size = strlen(args);
+
+    if ((args == NULL) || (interval == NULL)
+        || (consnum == NULL) || (type == NULL))
+        return 0;
+
+    /* int type */
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+        return 0;
+
+    memcpy(source, args, size+1);
+    if (strstr(source, vertica) == NULL)
+        return 0;
+
+    *type = get_interval_type(source, typestr);
+    if (*type == INTERVAL_INVALID)
+    {
+        condlog(0, "delay_interval type is invalid");
+        return 0;
+    }
+
+    tokenbefore = strtok(source, vertica);
+    tokenafter = strtok(NULL, vertica);
+    typestr[1] = '\0';
+    tokenbefore = strtok(tokenbefore, typestr);
+    if ((tokenbefore == NULL) || (tokenafter == NULL))
+        return 0;
+
+    tmp = tokenbefore;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+        {
+            condlog(0, "delay_interval string include invalid char");
+            return 0;
+        }
+
+    tmp = tokenafter;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+        {
+            condlog(0, "cons_num string include invalid char");
+            return 0;
+        }
+
+    *interval = atoi(tokenbefore);
+    *consnum = atoi(tokenafter);
+
+    return 1;
+}
+
+int check_args_valid(int delay_interval, int cons_num, int type)
+{
+    if (type == INTERVAL_SEC)
+    {
+        if ((delay_interval < 1) || (delay_interval > 60))
+        {
+            condlog(0, "delay_interval values is invalid");
+            return 0;
+        }
+    }
+    else if (type != INTERVAL_INVALID)
+    {
+        if ((delay_interval < 1) || (delay_interval >= 1000))
+        {
+            condlog(0, "delay_interval values is invalid");
+            return 0;
+        }
+    }
+
+    if ((cons_num < 3) || (cons_num > 1000))
+    {
+        condlog(0, "cons_num values is invalid");
+        return 0;
+    }
+
+    return 1;
+}
+
+int get_delay_pref_arg(char *args, int *delay_interval, int *cons_num, int *interval_type)
+{
+    if (get_digit_and_type(args, delay_interval, cons_num, interval_type) == 0)
+        return 0;
+
+    if (check_args_valid(*delay_interval, *cons_num, *interval_type) == 0)
+        return 0;
+
+    return 1;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, delay_interval, cons_num, type, temp;
+    long long delay, avgdelay, ratio;
+    long long min = THRES_USEC_VALUE;
+    long long max = 0;
+    long long toldelay = 0;
+    long long before, after;
+    struct timespec tv;
+
+	if (pp->fd < 0)
+	    return -PRIO_NO_INFORMATION;
+
+    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
+    {
+        condlog(3, "%s: get delay arg fail", pp->dev);
+        delay_interval = DEFAULT_DELAY_INTERVAL;
+        cons_num = DEFAULT_CONS_NUM;
+        type = INTERVAL_MSEC;
+    }
+
+    temp = cons_num;
+    while (temp-- > 0)
+    {
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return -1;
+        }
+
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        after = timeval_to_us(&tv);
+
+        delay = after - before;
+    	
+        min = (min <= delay) ? min : delay;
+        max = (max >= delay) ? max : delay;
+
+        toldelay += delay;
+    }
+
+    toldelay -= min + max;
+    avgdelay = toldelay/(long long)(cons_num - 2);
+    if (avgdelay > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
+        return 1;
+    }
+
+	ratio = get_conversion_ratio(type);
+	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long long)delay_interval) * ratio)));
+
+    return rc;
+}
diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
new file mode 100644
index 0000000..d8213e9
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.h
@@ -0,0 +1,17 @@
+#ifndef _DELAYEDPATH_H
+#define _DELAYEDPATH_H
+
+#define PRIO_DELAYED_PATH "delayedpath"
+
+#define PRIO_NO_INFORMATION 5
+
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static inline long long timeval_to_us(const struct timespec *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
+}
+
+#endif
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..f1e126e 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I delayedpath
+Generate the path priority based on a time-delay algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,21 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I delayed
+Needs a value of the form
+\fI"<delay_interval|cons_num>"\fR
+.RS
+.TP 8
+.I delay_interval
+The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
+For example: 10s, or 100us, or 100ms. The default is: 10ms.
+.TP
+.I cons_num
+The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
+For example: 30. The default is: 20.
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
-- 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-15 10:44   ` Yang Feng
@ 2017-05-16 14:53     ` Yang Feng
  2017-05-16 18:54     ` Martin Wilck
  2017-05-16 21:38     ` [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Benjamin Marzinski
  2 siblings, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-16 14:53 UTC (permalink / raw)
  To: Martin Wilck
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09

Hello Martin,

How about this patch?
Thanks a lot,
Best regards.


On 2017/5/15 18:44, Yang Feng wrote:
> Hello Martin,
> 
> Thank you very much for your remarks. I am sorry for late reply, Please find my answer and the updated patch.
> 
>> Hello Yang,
>>
>> thank you for your work. Please find my remarks below.
>>
>> On Mon, 2017-05-08 at 11:58 +0800, Yang Feng wrote:
>>> Prioritizer for device mapper multipath, where the corresponding
>>> priority
>>> values of specific paths are provided by a time-delay algorithm. And
>>> the
>>> time-delay algorithm is dependent on the following
>>> arguments(delay_interval,
>>> cons_num).
>>> The principle of the algorithm is illustrated as follows:
>>> 1. By sending a certain number "cons_num" of read IOs to the current
>>> path
>>>    continuously, the IOs' average delay can be calculated.
>>> 2. According to the average delay of each path and the weight value
>>>    "delay_interval", the priority "rc" of each path can be provided.
>>>
>>>      delay_interval  delay_interval  delay_interval       delay_inter
>>
>> How does this algorithm behave under load? Can we be sure that
>> priorities don't start to fluctuate wildly because busy paths will
>> usually have longer latencies than idle ones?
> I have a lot of test under load. When the appropriate value of argument "delay_interval" is set,
> this algorithm behave well and can separate the paths who's average delay is more than others.
> When add a new path or the path's state change from down to up, getprio() of the prioritizer is triggered, and
> the current path is not under IOs.
> 
>>>  libmultipath/Makefile                   |   2 +-
>>>  libmultipath/checkers/Makefile          |   7 +-
>>>  libmultipath/checkers/emc_clariion.c    |   2 +-
>>>  libmultipath/checkers/libsg.c           |  94 ------------
>>>  libmultipath/checkers/libsg.h           |   9 --
>>>  libmultipath/checkers/readsector0.c     |   2 +-
>>>  libmultipath/libsg.c                    |  94 ++++++++++++
>>>  libmultipath/libsg.h                    |   9 ++
>>>  libmultipath/prioritizers/Makefile      |   6 +-
>>>  libmultipath/prioritizers/delayedpath.c | 246 
>>
>> Why do you have to move libsg for this? It's already used by various
>> checkers, why can't your checker do the same? If you really need to do
>> it, you should at least separate that part of the patch from the added
>> code.
> OK, this time, libsg will not be moved.
> 
>>> +
>>> +#define CHAR_SEC                "SEC"
>>> +#define CHAR_MSEC               "MSEC"
>>> +#define CHAR_USEC               "USEC"
>>
>> I suggest to use "s", "ms", and "us" here instead.
> OK, as the following patch.
> 
>> If you create an array of "const char*" instead like you did for
>> conversion_ratio below, you could implement get_interval_type() more
>> elegantly using a loop over that array.
> OK, as the following patch.
> 
>>> +static int get_interval_type(char *source, char *type)
>>> +{  
>>> +    /*is USEC*/
>>> +    if ((strstr(source, CHAR_USEC) != NULL)
>>> +        && (strstr(source, CHAR_USEC)[4] == '_'))
>>
>> Please avoid these double strstr() invocation. The compiler may
>> optimize it away, but it just looks strange. The following would 
>> look better to me, and I find it actually more readable:
>>
>>         if (((p = strstr(source, CHAR_USEC)) != NULL) && p[4] == '_')
> OK, as the following patch.
> 
>>> +static int get_string_from_under(char *args,
>>> +                                        char *beforestring,
>>> +                                        char *afterstring,
>>> +                                        int *type)
>>
>> Maybe you could figure out a more descriptive name for this function?
>>
>> A comment in the code showing how the string to be parsed typically
>> looks like would be helpful for the reader.
> OK, as the following patch.
> 
>>> +    token = strtok_r(source, under, &saveptr);
>>> +    token = strtok(token, char_type);
>>
>> I'm pretty sure this is is not what you intended to write. If char_type
>> is "usec", this would split the string at the possible delimiters 'u',
>> 's', 'e', and 'c' (the 2nd argument of strtok(3) is not a sequence, but
>> a 'set' of bytes). It might accidentally work with the input strings
>> you are using (in particular because you only look at the first token),
>> but nevertheless it's wrong.
> OK, as the following patch.
> 
>>> +    if ((token == NULL) || (saveptr == NULL))
>>> +        return 0;
>>> +
>>> +    tmp = token;
>>> +    while (*tmp != '\0')
>>> +        if (!isdigit(*tmp++))
>>> +            return 0;
>>> +
>>> +    tmp = saveptr;
>>> +    while (*tmp != '\0')
>>> +        if (!isdigit(*tmp++))
>>> +            return 0;
>>> +
>>> +    strncpy(beforestring, token, strlen(token) + 1);
>>> +    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
>>> +    return 1;
>>> +}
>>
>> I don't think it's safe to use saveptr the way you do it. The strtok_r
>> man page says this parameter is for "internal use". While it makes
>> sense to assume that it points to the next token, I'm not sure if
>> that's guaranteed. You would be safe by calling 
>>
>>     somevar = strtok_r(NULL, under, &saveptr)
>>
>> and use "somevar".
> OK, as the following patch.
> 
>>
>> In general, this whole parsing code is odd. IIUC this parses input
>> looking like ([0-9]+)(SEC|MSEC|USEC)_([0-9]+) and sets beforestring,
>> type, and afterstring to the regex matches \1, \2, and \3,
>> respectively.
>>
>> Why don't you start parsing from the beginning of the input, e.g. with
>> strtoul(), and look at the rest later?
> OK, as the following patch.
> 
>>> +
>>> +int checkargvalid(int delay_interval, int cons_num, int type)
>>> +{
>>> +    if (type == INTERVAL_SEC)
>>> +    {
>>> +        if ((delay_interval < 1) || (delay_interval > 60))
>>> +            return 0;
>>> +    }
>>> +    else if (type != INTERVAL_INVALID)
>>> +    {
>>> +        if ((delay_interval < 1) || (delay_interval >= 1000))
>>> +            return 0;
>>> +    }
>>
>> You could be more forgiving here. 15000MSEC could be a legal value.
> Because this value is more than 1 second, you can use the unit second.
> 
>>
>>> +    
>>> +    if ((cons_num < 3) || (cons_num > 1000))
>>> +        return 0;
>>> +
>>> +    return 1;
>>> +}
>>> +
>>> +int get_delay_pref_arg(char *args, int *delay_interval, int
>>> *cons_num, int *type)
>>> +{
>>> +    char delayintervalstr[MAX_CHAR_SIZE];
>>> +    char consnumstr[MAX_CHAR_SIZE];
>>> +
>>> +    if (get_string_from_under(args, delayintervalstr, consnumstr,
>>> type) == 0)
>>> +        return 0;
>>
>> It might be good to write the parser so that the consnum part can be
>> left out by the user, and assume a reasonable default in that case.
> OK, as the following patch.
> 
>>> +    while (temp-- > 0)
>>> +    {
>>> +        (void)gettimeofday(&tv, NULL);
>>> +        before = timeval_to_us(&tv);		
>>> +
>>> +        if (do_readsector0(pp->fd, timeout) == 2)
>>> +        {
>>> +            condlog(0, "%s: path down", pp->dev);
>>> +            return 1;
>>> +        }
>>> +        
>>> +        (void)gettimeofday(&tv, NULL);
>>
>> It's better to use clock_gettime(CLOCK_MONOTONIC, ...) here. Then you
>> can throw away the delay < 0 check below.
> OK, as the following patch.
> 
>>> +    toldelay -= min + max;
>>> +    avgdelay = toldelay/(long long)(cons_num - 2);
>>> +    if (avgdelay > THRES_USEC_VALUE) 
>>> +    {           
>>> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
>>> +        return 1;
>>> +    }
>>> +    
>>> +	ratio = get_conversion_ratio(type);
>>> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long
>>> long)delay_interval) * ratio)));
>>> +
>>> +    return rc;
>>> +}
>>
>> Is it reasonable to do these interval calculations synchronously in
>> getprio()? cons_num is limited to 1000, so this routine could issue
>> 1000 reads on the device before returning. In particular if the device
>> is under IO load and the delay is high, execution if this routine could
>> be really slow.
>>
>> It would make more sense to me to have a separate thread that
>> calculates some sort of "running average" for the delay of the
>> different paths, and have getprio() just fetch the current value of
>> that variable.
>>
>> Regards
>> Martin
>>
> When add a new path or the path's state change from down to up, getprio() of the prioritizer is triggered, and
> the current path is not under IOs.
> Usually, the 1000 reads will be finish fastly.
> And have getprio() need to get the up-to-date average delay of the current path, these interval calculations synchronously
> is essential because of unreasonable asynchronously process's executing time D-value, particulayly when add a new path or
> the path's state change down to up.
> 
> Please find the up-to-date patch below:
> 
> ---
>>From 1a9426dfbad00b5dbefc7020603e40e8896e4869 Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Mon, 15 May 2017 18:33:29 +0800
> Subject: [PATCH] [dm-devel] [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
>  Prioritizer for device mapper multipath, where the corresponding priority values of specific paths are provided by a time-delay
>  algorithm. And the time-delay algorithm is dependent on the following arguments(delay_interval, cons_num). The principle of the
>  algorithm is illustrated as follows:
>  1. By sending a certain number "cons_num" of read IOs to the current path    continuously, the IOs' average delay can be calculated.
>  2. According to the average delay of each path and the weight value "delay_interval", the priority "rc" of each path can be provided.
> 
>            delay_interval  delay_interval  delay_interval      delay_interval
> 	 |---------------|---------------|---------------|   |---------------|
> 	 |priority rank 1|priority rank 2|priority rank 3|...|priority rank x|
>          |---------------|---------------|---------------|   |---------------|
> 		               Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile      |   6 +-
>  libmultipath/prioritizers/delayedpath.c | 261 ++++++++++++++++++++++++++++++++
>  libmultipath/prioritizers/delayedpath.h |  17 +++
>  multipath/multipath.conf.5              |  19 +++
>  4 files changed, 302 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/delayedpath.c
>  create mode 100644 libmultipath/prioritizers/delayedpath.h
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..8df5234 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriodelayedpath.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriodelayedpath.so: delayedpath.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/delayedpath.c b/libmultipath/prioritizers/delayedpath.c
> new file mode 100644
> index 0000000..0490e8d
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.c
> @@ -0,0 +1,261 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a time-delay algorithm. And the
> + * time-delay algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "cons_num" of read IOs to the current path
> + *    continuously, the IOs' average delay can be calculated.
> + * 2. According to the average delay of each path and the weight value
> + *    "delay_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#include "delayedpath.h"
> +
> +#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
> +#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
> +#define DEFAULT_CONS_NUM        20
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_SEC                "s"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_USEC               "us"
> +
> +enum interval_type {
> +    INTERVAL_SEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_USEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +static const int conversion_ratio[] = {
> +	[INTERVAL_SEC]		= USEC_PER_SEC,
> +	[INTERVAL_MSEC]	        = USEC_PER_MSEC,
> +	[INTERVAL_USEC]		= USEC_PER_USEC,
> +	[INTERVAL_INVALID]	= 0
> +};
> +
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +static int get_interval_type(char *source, char *type)
> +{
> +    char *p;
> +    int size;
> +    int i;
> +
> +    for (i = 0; i < sizeof(interval_unit_str)/MAX_CHAR_SIZE; i++)
> +    {
> +        size = strlen(interval_unit_str[i]);
> +        p = strstr(source, interval_unit_str[i]);
> +        if (p != NULL && p[size] == '|')
> +        {
> +            memcpy(type, interval_unit_str[i], size+1);
> +            return interval_unit_type[i];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +/* In multipath.conf, args form: delay_interval|cons_num. For example,
> +*  args is "10ms|20", this function can get 10, ms, and 20.
> +*/
> +static int get_digit_and_type(char *args,
> +                              int *interval,
> +                              int *consnum,
> +                              int *type)
> +{
> +    char typestr[MAX_CHAR_SIZE];
> +    char source[MAX_CHAR_SIZE];
> +    char vertica[] = "|";
> +    char *tokenbefore = NULL;
> +    char *tokenafter = NULL;
> +    char *tmp = NULL;
> +    unsigned int size = strlen(args);
> +
> +    if ((args == NULL) || (interval == NULL)
> +        || (consnum == NULL) || (type == NULL))
> +        return 0;
> +
> +    /* int type */
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +        return 0;
> +
> +    memcpy(source, args, size+1);
> +    if (strstr(source, vertica) == NULL)
> +        return 0;
> +
> +    *type = get_interval_type(source, typestr);
> +    if (*type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "delay_interval type is invalid");
> +        return 0;
> +    }
> +
> +    tokenbefore = strtok(source, vertica);
> +    tokenafter = strtok(NULL, vertica);
> +    typestr[1] = '\0';
> +    tokenbefore = strtok(tokenbefore, typestr);
> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
> +        return 0;
> +
> +    tmp = tokenbefore;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +        {
> +            condlog(0, "delay_interval string include invalid char");
> +            return 0;
> +        }
> +
> +    tmp = tokenafter;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +        {
> +            condlog(0, "cons_num string include invalid char");
> +            return 0;
> +        }
> +
> +    *interval = atoi(tokenbefore);
> +    *consnum = atoi(tokenafter);
> +
> +    return 1;
> +}
> +
> +int check_args_valid(int delay_interval, int cons_num, int type)
> +{
> +    if (type == INTERVAL_SEC)
> +    {
> +        if ((delay_interval < 1) || (delay_interval > 60))
> +        {
> +            condlog(0, "delay_interval values is invalid");
> +            return 0;
> +        }
> +    }
> +    else if (type != INTERVAL_INVALID)
> +    {
> +        if ((delay_interval < 1) || (delay_interval >= 1000))
> +        {
> +            condlog(0, "delay_interval values is invalid");
> +            return 0;
> +        }
> +    }
> +
> +    if ((cons_num < 3) || (cons_num > 1000))
> +    {
> +        condlog(0, "cons_num values is invalid");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +int get_delay_pref_arg(char *args, int *delay_interval, int *cons_num, int *interval_type)
> +{
> +    if (get_digit_and_type(args, delay_interval, cons_num, interval_type) == 0)
> +        return 0;
> +
> +    if (check_args_valid(*delay_interval, *cons_num, *interval_type) == 0)
> +        return 0;
> +
> +    return 1;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, delay_interval, cons_num, type, temp;
> +    long long delay, avgdelay, ratio;
> +    long long min = THRES_USEC_VALUE;
> +    long long max = 0;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +	    return -PRIO_NO_INFORMATION;
> +
> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
> +    {
> +        condlog(3, "%s: get delay arg fail", pp->dev);
> +        delay_interval = DEFAULT_DELAY_INTERVAL;
> +        cons_num = DEFAULT_CONS_NUM;
> +        type = INTERVAL_MSEC;
> +    }
> +
> +    temp = cons_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        delay = after - before;
> +    	
> +        min = (min <= delay) ? min : delay;
> +        max = (max >= delay) ? max : delay;
> +
> +        toldelay += delay;
> +    }
> +
> +    toldelay -= min + max;
> +    avgdelay = toldelay/(long long)(cons_num - 2);
> +    if (avgdelay > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
> +        return 1;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long long)delay_interval) * ratio)));
> +
> +    return rc;
> +}
> diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
> new file mode 100644
> index 0000000..d8213e9
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.h
> @@ -0,0 +1,17 @@
> +#ifndef _DELAYEDPATH_H
> +#define _DELAYEDPATH_H
> +
> +#define PRIO_DELAYED_PATH "delayedpath"
> +
> +#define PRIO_NO_INFORMATION 5
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}
> +
> +#endif
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..f1e126e 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I delayedpath
> +Generate the path priority based on a time-delay algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,21 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I delayed
> +Needs a value of the form
> +\fI"<delay_interval|cons_num>"\fR
> +.RS
> +.TP 8
> +.I delay_interval
> +The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
> +.TP
> +.I cons_num
> +The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
> +For example: 30. The default is: 20.
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-15 10:44   ` Yang Feng
  2017-05-16 14:53     ` Yang Feng
@ 2017-05-16 18:54     ` Martin Wilck
  2017-05-19  8:43       ` Yang Feng
  2017-05-16 21:38     ` [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Benjamin Marzinski
  2 siblings, 1 reply; 19+ messages in thread
From: Martin Wilck @ 2017-05-16 18:54 UTC (permalink / raw)
  To: Yang Feng
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09

Hi Yang,

Thanks for improving your patch. Please find my comments below.

On Mon, 2017-05-15 at 18:44 +0800, Yang Feng wrote:
> Hello Martin,
> 
> Thank you very much for your remarks. I am sorry for late reply,
> Please find my answer and the updated patch.
> 
> > Hello Yang,
> > 
> > thank you for your work. Please find my remarks below.
> > 
> > On Mon, 2017-05-08 at 11:58 +0800, Yang Feng wrote:
> > > Prioritizer for device mapper multipath, where the corresponding
> > > priority
> > > values of specific paths are provided by a time-delay algorithm.
> > > And
> > > the
> > > time-delay algorithm is dependent on the following
> > > arguments(delay_interval,
> > > cons_num).
> > > The principle of the algorithm is illustrated as follows:
> > > 1. By sending a certain number "cons_num" of read IOs to the
> > > current
> > > path
> > >    continuously, the IOs' average delay can be calculated.
> > > 2. According to the average delay of each path and the weight
> > > value
> > >    "delay_interval", the priority "rc" of each path can be
> > > provided.
> > > 
> > >      delay_interval  delay_interval  delay_interval       delay_i
> > > nter
> > 
> > How does this algorithm behave under load? Can we be sure that
> > priorities don't start to fluctuate wildly because busy paths will
> > usually have longer latencies than idle ones?
> 
> I have a lot of test under load. When the appropriate value of
> argument "delay_interval" is set,
> this algorithm behave well and can separate the paths who's average
> delay is more than others.
> When add a new path or the path's state change from down to up,
> getprio() of the prioritizer is triggered, and
> the current path is not under IOs.

> > >  libmultipath/Makefile                   |   2 +-
> > >  libmultipath/checkers/Makefile          |   7 +-
> > >  libmultipath/checkers/emc_clariion.c    |   2 +-
> > >  libmultipath/checkers/libsg.c           |  94 ------------
> > >  libmultipath/checkers/libsg.h           |   9 --
> > >  libmultipath/checkers/readsector0.c     |   2 +-
> > >  libmultipath/libsg.c                    |  94 ++++++++++++
> > >  libmultipath/libsg.h                    |   9 ++
> > >  libmultipath/prioritizers/Makefile      |   6 +-
> > >  libmultipath/prioritizers/delayedpath.c | 246 

Please think about the name once again. Maybe you should call it
"io_latency" or "path_latency" instead of "delayedpath"?

> > 
> > Why do you have to move libsg for this? It's already used by
> > various
> > checkers, why can't your checker do the same? If you really need to
> > do
> > it, you should at least separate that part of the patch from the
> > added
> > code.
> 
> OK, this time, libsg will not be moved.
> 
> > > +
> > > +#define CHAR_SEC                "SEC"
> > > +#define CHAR_MSEC               "MSEC"
> > > +#define CHAR_USEC               "USEC"
> > 
> > I suggest to use "s", "ms", and "us" here instead.
> 
> OK, as the following patch.
> 
> > If you create an array of "const char*" instead like you did for
> > conversion_ratio below, you could implement get_interval_type()
> > more
> > elegantly using a loop over that array.
> 
> OK, as the following patch.
> 
> > > +static int get_interval_type(char *source, char *type)
> > > +{  
> > > +    /*is USEC*/
> > > +    if ((strstr(source, CHAR_USEC) != NULL)
> > > +        && (strstr(source, CHAR_USEC)[4] == '_'))
> > 
> > Please avoid these double strstr() invocation. The compiler may
> > optimize it away, but it just looks strange. The following would 
> > look better to me, and I find it actually more readable:
> > 
> >         if (((p = strstr(source, CHAR_USEC)) != NULL) && p[4] ==
> > '_')
> 
> OK, as the following patch.
> 
> > > +static int get_string_from_under(char *args,
> > > +                                        char *beforestring,
> > > +                                        char *afterstring,
> > > +                                        int *type)
> > 
> > Maybe you could figure out a more descriptive name for this
> > function?
> > 
> > A comment in the code showing how the string to be parsed typically
> > looks like would be helpful for the reader.
> 
> OK, as the following patch.
> 
> > > +    token = strtok_r(source, under, &saveptr);
> > > +    token = strtok(token, char_type);
> > 
> > I'm pretty sure this is is not what you intended to write. If
> > char_type
> > is "usec", this would split the string at the possible delimiters
> > 'u',
> > 's', 'e', and 'c' (the 2nd argument of strtok(3) is not a sequence,
> > but
> > a 'set' of bytes). It might accidentally work with the input
> > strings
> > you are using (in particular because you only look at the first
> > token),
> > but nevertheless it's wrong.
> 
> OK, as the following patch.
> 
> > > +    if ((token == NULL) || (saveptr == NULL))
> > > +        return 0;
> > > +
> > > +    tmp = token;
> > > +    while (*tmp != '\0')
> > > +        if (!isdigit(*tmp++))
> > > +            return 0;
> > > +
> > > +    tmp = saveptr;
> > > +    while (*tmp != '\0')
> > > +        if (!isdigit(*tmp++))
> > > +            return 0;
> > > +
> > > +    strncpy(beforestring, token, strlen(token) + 1);
> > > +    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
> > > +    return 1;
> > > +}
> > 
> > I don't think it's safe to use saveptr the way you do it. The
> > strtok_r
> > man page says this parameter is for "internal use". While it makes
> > sense to assume that it points to the next token, I'm not sure if
> > that's guaranteed. You would be safe by calling 
> > 
> >     somevar = strtok_r(NULL, under, &saveptr)
> > 
> > and use "somevar".
> 
> OK, as the following patch.
> 
> > 
> > In general, this whole parsing code is odd. IIUC this parses input
> > looking like ([0-9]+)(SEC|MSEC|USEC)_([0-9]+) and sets
> > beforestring,
> > type, and afterstring to the regex matches \1, \2, and \3,
> > respectively.
> > 
> > Why don't you start parsing from the beginning of the input, e.g.
> > with
> > strtoul(), and look at the rest later?
> 
> OK, as the following patch.

Hm, I can't see a lot of difference in the parsing code wrt the
previous version. IMO it's still non-straightforward and hard to
comprehend. Maybe I didn't express myself clearly enough. Here is how
I'd code this:

 1. Verify that the string starts with a digit. Error if it does not.
 2. Parse the delay interval using strtoul().
 3. The "end" pointer of strtoul() points to the unit, which has to be
"s", "ms" or "us". Verify, and set the unit accordingly.
 4. Verify that the next character is '|', and that it's followed by a
digit.
 5. Parse the number with strtoul()
 6. Verify that there's no garbage at the end of the string.

> > > +
> > > +int checkargvalid(int delay_interval, int cons_num, int type)
> > > +{
> > > +    if (type == INTERVAL_SEC)
> > > +    {
> > > +        if ((delay_interval < 1) || (delay_interval > 60))
> > > +            return 0;
> > > +    }
> > > +    else if (type != INTERVAL_INVALID)
> > > +    {
> > > +        if ((delay_interval < 1) || (delay_interval >= 1000))
> > > +            return 0;
> > > +    }
> > 
> > You could be more forgiving here. 15000MSEC could be a legal value.
> 
> Because this value is more than 1 second, you can use the unit
> second.

Please follow the https://en.wikipedia.org/wiki/Robustness_principle.
If a user enters "1500ms" here, the parsing will silently fail, and
with it the whole prio algorithm. This will cause user confusion.
Please don't do this.

> 
> > 
> > > +    
> > > +    if ((cons_num < 3) || (cons_num > 1000))
> > > +        return 0;
> > > +
> > > +    return 1;
> > > +}
> > > +
> > > +int get_delay_pref_arg(char *args, int *delay_interval, int
> > > *cons_num, int *type)
> > > +{
> > > +    char delayintervalstr[MAX_CHAR_SIZE];
> > > +    char consnumstr[MAX_CHAR_SIZE];
> > > +
> > > +    if (get_string_from_under(args, delayintervalstr,
> > > consnumstr,
> > > type) == 0)
> > > +        return 0;
> > 
> > It might be good to write the parser so that the consnum part can
> > be
> > left out by the user, and assume a reasonable default in that case.
> 
> OK, as the following patch.
> 
> > > +    while (temp-- > 0)
> > > +    {
> > > +        (void)gettimeofday(&tv, NULL);
> > > +        before = timeval_to_us(&tv);		
> > > +
> > > +        if (do_readsector0(pp->fd, timeout) == 2)
> > > +        {
> > > +            condlog(0, "%s: path down", pp->dev);
> > > +            return 1;
> > > +        }
> > > +        
> > > +        (void)gettimeofday(&tv, NULL);
> > 
> > It's better to use clock_gettime(CLOCK_MONOTONIC, ...) here. Then
> > you
> > can throw away the delay < 0 check below.
> 
> OK, as the following patch.
> 
> > > +    toldelay -= min + max;
> > > +    avgdelay = toldelay/(long long)(cons_num - 2);
> > > +    if (avgdelay > THRES_USEC_VALUE) 
> > > +    {           
> > > +        condlog(0, "%s: avgdelay is more than thresold", pp-
> > > >dev);
> > > +        return 1;
> > > +    }
> > > +    
> > > +	ratio = get_conversion_ratio(type);
> > > +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long
> > > long)delay_interval) * ratio)));
> > > +
> > > +    return rc;
> > > +}
> > 
> > Is it reasonable to do these interval calculations synchronously in
> > getprio()? cons_num is limited to 1000, so this routine could issue
> > 1000 reads on the device before returning. In particular if the
> > device
> > is under IO load and the delay is high, execution if this routine
> > could
> > be really slow.
> > 
> > It would make more sense to me to have a separate thread that
> > calculates some sort of "running average" for the delay of the
> > different paths, and have getprio() just fetch the current value of
> > that variable.
> > 
> > Regards
> > Martin
> > 
> 
> When add a new path or the path's state change from down to up,
> getprio() of the prioritizer is triggered, and
> the current path is not under IOs.
> Usually, the 1000 reads will be finish fastly.
> And have getprio() need to get the up-to-date average delay of the
> current path, these interval calculations synchronously
> is essential because of unreasonable asynchronously process's
> executing time D-value, particulayly when add a new path or
> the path's state change down to up.

I am sorry, I was on the wrong track here. Doing this asynchronously is
really not going to work. Sorry about that. The default "cons" value is
20, so that should be acceptable. 

However please consider lowering the upper bound, I kind of doubt that
1000 IOs will finish quickly. More often than not, a lot of paths will
appear at the same time (e.g. if a port of a storage array is enabled)
and we'll have to send 1000 IOs to each one.

> 
> Please find the up-to-date patch below:
> 
> ---
> From 1a9426dfbad00b5dbefc7020603e40e8896e4869 Mon Sep 17 00:00:00
> 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Mon, 15 May 2017 18:33:29 +0800
> Subject: [PATCH] [dm-devel] [PATCH] multipath-tools:Prioritizer based
> on a time-delay algorithm
>  Prioritizer for device mapper multipath, where the corresponding
> priority values of specific paths are provided by a time-delay
>  algorithm. And the time-delay algorithm is dependent on the
> following arguments(delay_interval, cons_num). The principle of the
>  algorithm is illustrated as follows:
>  1. By sending a certain number "cons_num" of read IOs to the current
> path    continuously, the IOs' average delay can be calculated.
>  2. According to the average delay of each path and the weight value
> "delay_interval", the priority "rc" of each path can be provided.
> 
>            delay_interval  delay_interval  delay_interval      delay_
> interval
> 	 |---------------|---------------|---------------|   |---------
> ------|
> 	 |priority rank 1|priority rank 2|priority rank 3|...|priority
> rank x|
>          |---------------|---------------|---------------|   |-------
> --------|
> 		               Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile      |   6 +-
>  libmultipath/prioritizers/delayedpath.c | 261
> ++++++++++++++++++++++++++++++++
>  libmultipath/prioritizers/delayedpath.h |  17 +++
>  multipath/multipath.conf.5              |  19 +++
>  4 files changed, 302 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/delayedpath.c
>  create mode 100644 libmultipath/prioritizers/delayedpath.h
> 
> diff --git a/libmultipath/prioritizers/Makefile
> b/libmultipath/prioritizers/Makefile
> index 36b42e4..8df5234 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriodelayedpath.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriodelayedpath.so: delayedpath.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/delayedpath.c
> b/libmultipath/prioritizers/delayedpath.c
> new file mode 100644
> index 0000000..0490e8d
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.c
> @@ -0,0 +1,261 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights
> Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding
> priority
> + * values of specific paths are provided by a time-delay algorithm.
> And the
> + * time-delay algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "cons_num" of read IOs to the
> current path
> + *    continuously, the IOs' average delay can be calculated.
> + * 2. According to the average delay of each path and the weight
> value
> + *    "delay_interval", the priority "rc" of each path can be
> provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#include "delayedpath.h"
> +
> +#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
> +#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
> +#define DEFAULT_CONS_NUM        20
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_SEC                "s"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_USEC               "us"
> +
> +enum interval_type {
> +    INTERVAL_SEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_USEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment
> sequence */
> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};

It's irritiating that you use a different ordering here than in the
interval_type enum.

> +
> +static const int conversion_ratio[] = {
> +	[INTERVAL_SEC]		= USEC_PER_SEC,
> +	[INTERVAL_MSEC]	        = USEC_PER_MSEC,
> +	[INTERVAL_USEC]		= USEC_PER_USEC,
> +	[INTERVAL_INVALID]	= 0
> +};
> +
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +static int get_interval_type(char *source, char *type)
> +{
> +    char *p;
> +    int size;
> +    int i;
> +
> +    for (i = 0; i < sizeof(interval_unit_str)/MAX_CHAR_SIZE; i++)
> +    {
> +        size = strlen(interval_unit_str[i]);
> +        p = strstr(source, interval_unit_str[i]);
> +        if (p != NULL && p[size] == '|')
> +        {
> +            memcpy(type, interval_unit_str[i], size+1);
> +            return interval_unit_type[i];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +/* In multipath.conf, args form: delay_interval|cons_num. For
> example,
> +*  args is "10ms|20", this function can get 10, ms, and 20.
> +*/
> +static int get_digit_and_type(char *args,
> +                              int *interval,
> +                              int *consnum,
> +                              int *type)
> +{
> +    char typestr[MAX_CHAR_SIZE];
> +    char source[MAX_CHAR_SIZE];
> +    char vertica[] = "|";
> +    char *tokenbefore = NULL;
> +    char *tokenafter = NULL;
> +    char *tmp = NULL;
> +    unsigned int size = strlen(args);
> +
> +    if ((args == NULL) || (interval == NULL)
> +        || (consnum == NULL) || (type == NULL))
> +        return 0;
> +
> +    /* int type */
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +        return 0;
> +
> +    memcpy(source, args, size+1);
> +    if (strstr(source, vertica) == NULL)
> +        return 0;
> +
> +    *type = get_interval_type(source, typestr);
> +    if (*type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "delay_interval type is invalid");
> +        return 0;
> +    }
> +
> +    tokenbefore = strtok(source, vertica);
> +    tokenafter = strtok(NULL, vertica);
> +    typestr[1] = '\0';
> +    tokenbefore = strtok(tokenbefore, typestr);
> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
> +        return 0;
> +
> +    tmp = tokenbefore;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +        {
> +            condlog(0, "delay_interval string include invalid
> char");
> +            return 0;
> +        }
> +
> +    tmp = tokenafter;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +        {
> +            condlog(0, "cons_num string include invalid char");
> +            return 0;
> +        }
> +
> +    *interval = atoi(tokenbefore);
> +    *consnum = atoi(tokenafter);
> +
> +    return 1;
> +}

see above.

> +
> +int check_args_valid(int delay_interval, int cons_num, int type)
> +{
> +    if (type == INTERVAL_SEC)
> +    {
> +        if ((delay_interval < 1) || (delay_interval > 60))
> +        {
> +            condlog(0, "delay_interval values is invalid");
> +            return 0;
> +        }
> +    }
> +    else if (type != INTERVAL_INVALID)
> +    {
> +        if ((delay_interval < 1) || (delay_interval >= 1000))
> +        {
> +            condlog(0, "delay_interval values is invalid");
> +            return 0;
> +        }
> +    }
> +
> +    if ((cons_num < 3) || (cons_num > 1000))
> +    {
> +        condlog(0, "cons_num values is invalid");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +

see above.

> +int get_delay_pref_arg(char *args, int *delay_interval, int
> *cons_num, int *interval_type)
> +{
> +    if (get_digit_and_type(args, delay_interval, cons_num,
> interval_type) == 0)
> +        return 0;
> +
> +    if (check_args_valid(*delay_interval, *cons_num, *interval_type)
> == 0)
> +        return 0;
> +
> +    return 1;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, delay_interval, cons_num, type, temp;
> +    long long delay, avgdelay, ratio;
> +    long long min = THRES_USEC_VALUE;
> +    long long max = 0;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +	    return -PRIO_NO_INFORMATION;
> +
> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type)
> == 0)
> +    {
> +        condlog(3, "%s: get delay arg fail", pp->dev);
> +        delay_interval = DEFAULT_DELAY_INTERVAL;
> +        cons_num = DEFAULT_CONS_NUM;
> +        type = INTERVAL_MSEC;
> +    }
> +
> +    temp = cons_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        delay = after - before;
> +    	
> +        min = (min <= delay) ? min : delay;
> +        max = (max >= delay) ? max : delay;
> +
> +        toldelay += delay;
> +    }
> +
> +    toldelay -= min + max;

Why are you doing this? If you want to discard "extreme" values, this
is probably not sufficient. If cons == 3, this will have the effect to
use a single measurement rather than an average, is that intended?

Btw, as you are doing statistics here anyway, you may want to calculate
the estimate of the standard deviation and warn the user if the
delay_interval is smaller than, say, 2 * standard deviation.

Please consider printing a message with the measured value at debug
level 3 or higher.

> +    avgdelay = toldelay/(long long)(cons_num - 2);
> +    if (avgdelay > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
> +        return 1;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long
> long)delay_interval) * ratio)));
> +
> +    return rc;
> +}
> diff --git a/libmultipath/prioritizers/delayedpath.h
> b/libmultipath/prioritizers/delayedpath.h
> new file mode 100644
> index 0000000..d8213e9
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.h
> @@ -0,0 +1,17 @@
> +#ifndef _DELAYEDPATH_H
> +#define _DELAYEDPATH_H
> +
> +#define PRIO_DELAYED_PATH "delayedpath"
> +
> +#define PRIO_NO_INFORMATION 5
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv-
> >tv_nsec >> 10);
> +}
> +
> +#endif
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..f1e126e 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I delayedpath
> +Generate the path priority based on a time-delay algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,21 @@ these values can be looked up through sysfs or
> by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I delayed

should be "delayedpath" here?

> +Needs a value of the form
> +\fI"<delay_interval|cons_num>"\fR
> +.RS
> +.TP 8
> +.I delay_interval
> +The interval values of average IO-time-delay between two different
> neighbour ranks of path priority, used to partition different
> priority ranks.

It might be good to give an example here, like this:

"If delay_interval=10ms, the paths will be grouped in priority groups
with path latency 0-10ms, 10-20ms, 20-30ms, etc." 

> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or
> Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1,
> 1000),
> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
> +.TP
> +.I cons_num
> +The number of read IOs sent to the current path continuously, used
> to calculate the average IO-time-delay. Valid Values: Integer, [3,
> 1000].
> +For example: 30. The default is: 20.
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred
> path\fR bit
>  set will always be in their own path group.

-- 
Dr. Martin Wilck <mwilck@suse.com>, Tel. +49 (0)911 74053 2107
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-15 10:44   ` Yang Feng
  2017-05-16 14:53     ` Yang Feng
  2017-05-16 18:54     ` Martin Wilck
@ 2017-05-16 21:38     ` Benjamin Marzinski
  2017-05-19  9:45       ` Yang Feng
  2 siblings, 1 reply; 19+ messages in thread
From: Benjamin Marzinski @ 2017-05-16 21:38 UTC (permalink / raw)
  To: Yang Feng
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09, Martin Wilck

On Mon, May 15, 2017 at 06:44:47PM +0800, Yang Feng wrote:
> Please find the up-to-date patch below:
> 

First, one overall question. We have dynamic path selectors available to
deal with paths that are just simply slower that other paths, but can
still be used together.  Is there specific hardware or a specific setup
where this isn't good enough and we really need to seperate these paths
into different pathgroups, but we can't find out deterministically how
the groups should be set up?  It just seems like there could be a less
hacky solution to this problem, but perhaps there are some situations
where this is truly the best option. I'm just wondering what those are.

other commenets inlined

> ---
> >From 1a9426dfbad00b5dbefc7020603e40e8896e4869 Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Mon, 15 May 2017 18:33:29 +0800
> Subject: [PATCH] [dm-devel] [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
>  Prioritizer for device mapper multipath, where the corresponding priority values of specific paths are provided by a time-delay
>  algorithm. And the time-delay algorithm is dependent on the following arguments(delay_interval, cons_num). The principle of the
>  algorithm is illustrated as follows:
>  1. By sending a certain number "cons_num" of read IOs to the current path    continuously, the IOs' average delay can be calculated.
>  2. According to the average delay of each path and the weight value "delay_interval", the priority "rc" of each path can be provided.
> 
>            delay_interval  delay_interval  delay_interval      delay_interval
> 	 |---------------|---------------|---------------|   |---------------|
> 	 |priority rank 1|priority rank 2|priority rank 3|...|priority rank x|
>          |---------------|---------------|---------------|   |---------------|
> 		               Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile      |   6 +-
>  libmultipath/prioritizers/delayedpath.c | 261 ++++++++++++++++++++++++++++++++
>  libmultipath/prioritizers/delayedpath.h |  17 +++
>  multipath/multipath.conf.5              |  19 +++
>  4 files changed, 302 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/delayedpath.c
>  create mode 100644 libmultipath/prioritizers/delayedpath.h
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..8df5234 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriodelayedpath.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriodelayedpath.so: delayedpath.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/delayedpath.c b/libmultipath/prioritizers/delayedpath.c
> new file mode 100644
> index 0000000..0490e8d
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.c
> @@ -0,0 +1,261 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a time-delay algorithm. And the
> + * time-delay algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "cons_num" of read IOs to the current path
> + *    continuously, the IOs' average delay can be calculated.
> + * 2. According to the average delay of each path and the weight value
> + *    "delay_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#include "delayedpath.h"
> +
> +#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
> +#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
> +#define DEFAULT_CONS_NUM        20
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_SEC                "s"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_USEC               "us"
> +
> +enum interval_type {
> +    INTERVAL_SEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_USEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC

This is a nit, but for constant strings, could you please use "char
*var" instead of "char var[]", to be consistent with the rest of the
multipath code.

> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +static const int conversion_ratio[] = {
> +	[INTERVAL_SEC]		= USEC_PER_SEC,
> +	[INTERVAL_MSEC]	        = USEC_PER_MSEC,
> +	[INTERVAL_USEC]		= USEC_PER_USEC,
> +	[INTERVAL_INVALID]	= 0
> +};
> +
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +static int get_interval_type(char *source, char *type)
> +{
> +    char *p;
> +    int size;
> +    int i;
> +
> +    for (i = 0; i < sizeof(interval_unit_str)/MAX_CHAR_SIZE; i++)
> +    {
> +        size = strlen(interval_unit_str[i]);
> +        p = strstr(source, interval_unit_str[i]);
> +        if (p != NULL && p[size] == '|')
> +        {
> +            memcpy(type, interval_unit_str[i], size+1);
> +            return interval_unit_type[i];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +/* In multipath.conf, args form: delay_interval|cons_num. For example,
> +*  args is "10ms|20", this function can get 10, ms, and 20.
> +*/
> +static int get_digit_and_type(char *args,
> +                              int *interval,
> +                              int *consnum,
> +                              int *type)
> +{
> +    char typestr[MAX_CHAR_SIZE];
> +    char source[MAX_CHAR_SIZE];
> +    char vertica[] = "|";
> +    char *tokenbefore = NULL;
> +    char *tokenafter = NULL;
> +    char *tmp = NULL;
> +    unsigned int size = strlen(args);
> +
> +    if ((args == NULL) || (interval == NULL)
> +        || (consnum == NULL) || (type == NULL))
> +        return 0;
> +
> +    /* int type */
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +        return 0;

You should probably have log messages for these error returns.

> +
> +    memcpy(source, args, size+1);
> +    if (strstr(source, vertica) == NULL)
> +        return 0;
> +
> +    *type = get_interval_type(source, typestr);
> +    if (*type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "delay_interval type is invalid");
> +        return 0;
> +    }

I'm confused here. How do you get to use the default interval. Shouldn't
you accept "20s|" and "|30" and as valid inputs that use the defaults
for the part they don't specify. 

> +    tokenbefore = strtok(source, vertica);
> +    tokenafter = strtok(NULL, vertica);
> +    typestr[1] = '\0';
> +    tokenbefore = strtok(tokenbefore, typestr);
> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
> +        return 0;
> +
> +    tmp = tokenbefore;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +        {
> +            condlog(0, "delay_interval string include invalid char");
> +            return 0;
> +        }
> +
> +    tmp = tokenafter;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +        {
> +            condlog(0, "cons_num string include invalid char");
> +            return 0;
> +        }
> +
> +    *interval = atoi(tokenbefore);

Why do you keep track of the type and the interval seperately? Can't you
just find out the type, and use that to multiply the interval once you
read it, and then just use that value, instead of keeping track of two
values across multiple functions?

> +    *consnum = atoi(tokenafter);
> +
> +    return 1;
> +}
> +
> +int check_args_valid(int delay_interval, int cons_num, int type)
> +{
> +    if (type == INTERVAL_SEC)
> +    {
> +        if ((delay_interval < 1) || (delay_interval > 60))
> +        {
> +            condlog(0, "delay_interval values is invalid");
> +            return 0;
> +        }
> +    }
> +    else if (type != INTERVAL_INVALID)
> +    {
> +        if ((delay_interval < 1) || (delay_interval >= 1000))
> +        {
> +            condlog(0, "delay_interval values is invalid");
> +            return 0;
> +        }
> +    }
> +
> +    if ((cons_num < 3) || (cons_num > 1000))
> +    {
> +        condlog(0, "cons_num values is invalid");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +int get_delay_pref_arg(char *args, int *delay_interval, int *cons_num, int *interval_type)
> +{
> +    if (get_digit_and_type(args, delay_interval, cons_num, interval_type) == 0)
> +        return 0;
> +
> +    if (check_args_valid(*delay_interval, *cons_num, *interval_type) == 0)
> +        return 0;
> +
> +    return 1;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, delay_interval, cons_num, type, temp;
> +    long long delay, avgdelay, ratio;
> +    long long min = THRES_USEC_VALUE;
> +    long long max = 0;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +	    return -PRIO_NO_INFORMATION;
> +
> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
> +    {
> +        condlog(3, "%s: get delay arg fail", pp->dev);

Why use the word "fail" in this message? Not setting prio_args to get
the defaults seems like a perfectly valid choice.

> +        delay_interval = DEFAULT_DELAY_INTERVAL;
> +        cons_num = DEFAULT_CONS_NUM;
> +        type = INTERVAL_MSEC;
> +    }
> +
> +    temp = cons_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        delay = after - before;
> +    	
> +        min = (min <= delay) ? min : delay;
> +        max = (max >= delay) ? max : delay;
> +
> +        toldelay += delay;
> +    }
> +
> +    toldelay -= min + max;
> +    avgdelay = toldelay/(long long)(cons_num - 2);
> +    if (avgdelay > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
> +        return 1;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long long)delay_interval) * ratio)));
> +
> +    return rc;
> +}
> diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
> new file mode 100644
> index 0000000..d8213e9
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.h
> @@ -0,0 +1,17 @@
> +#ifndef _DELAYEDPATH_H
> +#define _DELAYEDPATH_H
> +
> +#define PRIO_DELAYED_PATH "delayedpath"

In order for the rest of the code to refer to this prioritizer, this
define should be in prio.h with the other prioritizer names, and as long
as delayedpath.c includes prio.h, there's no need to put it in
delayedpath.h.

> +
> +#define PRIO_NO_INFORMATION 5

The rest of the multipath code only cares if getprio returns a negative
number of not. It doesn't check what the specific negative number is.  I
realize the the alua prioritizer returns a set of error codes, but they
aren't used, or even usable in their present form. If we wanted to have
better error reporting, we should set up a universal set of error codes
in prio.h, and have all prioritizers use them, instead of having each
prioritizer define its own error codes. There's no reason why your
prioritizer needs to return this error code instead of -1.

> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}

No other file besides delayedpath.c will likely be including this .h
file, so I don't see any purpose for these being defined here.  In fact,
I don't see why you can't just have a .c file without a .h file like the
majority of prioritizers.  I'm pretty sure that none of the prioritizers
really need their own .h file.

> +#endif
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..f1e126e 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I delayedpath
> +Generate the path priority based on a time-delay algorithm.
> +Requires prio_args keyword.

Really it doesn't require prio_args if you want to use the default
values, and should probably say so.

> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,21 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I delayed
> +Needs a value of the form
> +\fI"<delay_interval|cons_num>"\fR
> +.RS
> +.TP 8
> +.I delay_interval
> +The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
> +.TP
> +.I cons_num
> +The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
> +For example: 30. The default is: 20.
> +.RE
> +.TP 12

Looking at the "weighted" prio_args definition just above your "delayed"
definition, the pipe character "|" is being used to say that any of a
set of options is allowed.  Your definition has it being a literal
character, but it's still inside the angle brackets that usually
delineate a variable.  perhaps "<delay_interval>|<io_num>" would be
easier to understand, or even "[delayed_interval]|[io_num]" if you can
omit these to use the defaults.

-Ben

>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> -- 
> 
> 
> 
> 
> 
> 
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-16 18:54     ` Martin Wilck
@ 2017-05-19  8:43       ` Yang Feng
  2017-05-22  8:01         ` Yang Feng
  2017-05-24  1:58         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
  0 siblings, 2 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-19  8:43 UTC (permalink / raw)
  To: Martin Wilck
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09

Hello Martin,

Firstly, thank you very much for your comments.
And find my replys and the up-to-date patch.

Best regards!


> Please think about the name once again. Maybe you should call it
> "io_latency" or "path_latency" instead of "delayedpath"?
OK, as the following patch.

> 
> Hm, I can't see a lot of difference in the parsing code wrt the
> previous version. IMO it's still non-straightforward and hard to
> comprehend. Maybe I didn't express myself clearly enough. Here is how
> I'd code this:
> 
>  1. Verify that the string starts with a digit. Error if it does not.
>  2. Parse the delay interval using strtoul().
>  3. The "end" pointer of strtoul() points to the unit, which has to be
> "s", "ms" or "us". Verify, and set the unit accordingly.
>  4. Verify that the next character is '|', and that it's followed by a
> digit.
>  5. Parse the number with strtoul()
>  6. Verify that there's no garbage at the end of the string. 
Thank you , as the following patch.

> 
> Please follow the https://en.wikipedia.org/wiki/Robustness_principle.
> If a user enters "1500ms" here, the parsing will silently fail, and
> with it the whole prio algorithm. This will cause user confusion.
> Please don't do this
Thank you , as the following patch.

> 
> However please consider lowering the upper bound, I kind of doubt that
> 1000 IOs will finish quickly. More often than not, a lot of paths will
> appear at the same time (e.g. if a port of a storage array is enabled)
> and we'll have to send 1000 IOs to each one.
> 
OK, the upper bound lower to 200, as the following patch.

>> +    while (temp-- > 0)
>> +    {
>> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
>> +        before = timeval_to_us(&tv);		
>> +
>> +        if (do_readsector0(pp->fd, timeout) == 2)
>> +        {
>> +            condlog(0, "%s: path down", pp->dev);
>> +            return -1;
>> +        }
>> +
>> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
>> +        after = timeval_to_us(&tv);
>> +
>> +        delay = after - before;
>> +    	
>> +        min = (min <= delay) ? min : delay;
>> +        max = (max >= delay) ? max : delay;
>> +
>> +        toldelay += delay;
>> +    }
>> +
>> +    toldelay -= min + max;
> 
> Why are you doing this? If you want to discard "extreme" values, this
> is probably not sufficient. If cons == 3, this will have the effect to
> use a single measurement rather than an average, is that intended?
> 
> Btw, as you are doing statistics here anyway, you may want to calculate
> the estimate of the standard deviation and warn the user if the
> delay_interval is smaller than, say, 2 * standard deviation.
> 
> Please consider printing a message with the measured value at debug
> level 3 or higher.
OK, as the following patch.

>>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>>  .RE
>>  .TP 12
>> +.I delayed
> 
> should be "delayedpath" here?
OK, as the following patch.
> 
>> +Needs a value of the form
>> +\fI"<delay_interval|cons_num>"\fR
>> +.RS
>> +.TP 8
>> +.I delay_interval
>> +The interval values of average IO-time-delay between two different
>> neighbour ranks of path priority, used to partition different
>> priority ranks.
> 
> It might be good to give an example here, like this:
> 
> "If delay_interval=10ms, the paths will be grouped in priority groups
> with path latency 0-10ms, 10-20ms, 20-30ms, etc." 
OK, as the following patch.>

---
From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
From: Yang Feng <philip.yang@huawei.com>
Date: Fri, 19 May 2017 16:09:07 +0800
Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.

                   latency_interval   latency_interval   latency_interval       latency_interval
         	 |------------------|------------------|------------------|...|------------------|
		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
		 |------------------|------------------|------------------|...|------------------|
				          Priority Rank Partitioning
---
 libmultipath/prioritizers/Makefile       |   6 +-
 libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
 multipath/multipath.conf.5               |  18 ++
 3 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 libmultipath/prioritizers/path_latency.c

diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..d2f20f6 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriopath_latency.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
new file mode 100644
index 0000000..a666b6c
--- /dev/null
+++ b/libmultipath/prioritizers/path_latency.c
@@ -0,0 +1,271 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority
+ * values of specific paths are provided by a latency algorithm. And the
+ * latency algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "io_num" of read IOs to the current path
+ *    continuously, the IOs' average latency can be calculated.
+ * 2. According to the average latency of each path and the weight value
+ *    "latency_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <math.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../checkers/libsg.h"
+
+#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
+
+#define MAX_IO_NUM              200
+#define MIN_IO_NUM              10
+
+#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
+#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
+
+#define DEFAULT_PRIORITY        0
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_USEC               "us"
+#define CHAR_MSEC               "ms"
+#define CHAR_SEC                "s"
+
+enum interval_type {
+    INTERVAL_USEC,
+    INTERVAL_MSEC,
+    INTERVAL_SEC,
+    INTERVAL_INVALID
+};
+
+/* interval_unit_str and interval_unit_type keep the same assignment sequence */
+static const char *interval_unit_str[MAX_CHAR_SIZE] = {
+    CHAR_USEC, CHAR_MSEC, CHAR_SEC
+};
+static const int interval_unit_type[] = {
+    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
+};
+
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static const int conversion_ratio[] = {
+    [INTERVAL_USEC]		= USEC_PER_USEC,
+    [INTERVAL_MSEC]     = USEC_PER_MSEC,
+    [INTERVAL_SEC]		= USEC_PER_SEC,
+    [INTERVAL_INVALID]	= 0
+};
+
+static long long path_latency[MAX_IO_NUM];
+
+static inline long long timeval_to_us(const struct timespec *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
+}
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+int check_args_valid(int io_num, long long latency_interval, int type)
+{
+    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
+    {
+        condlog(0, "args io_num is more than the valid values range");
+        return 0;
+    }
+
+    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
+    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
+    {
+        condlog(0, "args latency_interval is more than the valid values range");
+        return 0;
+    }
+
+    return 1;
+}
+
+static int get_interval_type(char *type)
+{
+    int index;
+
+    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
+    {
+        if (strcmp(type, interval_unit_str[index]) == 0)
+        {
+            return interval_unit_type[index];
+        }
+    }
+
+    return INTERVAL_INVALID;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+
+/* In multipath.conf, args form: io_num|latency_interval. For example,
+*  args is "20|10ms", this function can get 20, 10.
+*/
+static int get_interval_and_ionum(char *args,
+                                        int *ionum,
+                                        long long *interval)
+{
+    char source[MAX_CHAR_SIZE];
+    char vertica = '|';
+    char *endstrbefore = NULL;
+    char *endstrafter = NULL;
+    int type;
+    unsigned int size = strlen(args);
+    long long ratio;
+
+    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
+    {
+        condlog(0, "args string is NULL");
+        return 0;
+    }
+
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+    {
+        condlog(0, "args string's size is too long");
+        return 0;
+    }
+
+    memcpy(source, args, size+1);
+
+    if (!isdigit(source[0]))
+    {
+        condlog(0, "args io_num string's first char is not digit");
+        return 0;
+    }
+
+    *ionum = (int)strtoul(source, &endstrbefore, 10);
+    if (endstrbefore[0] != vertica)
+    {
+        condlog(0, "segmentation char is invalid");
+        return 0;
+    }
+
+    if (!isdigit(endstrbefore[1]))
+    {
+        condlog(0, "args latency_interval string's first char is not digit");
+        return 0;
+    }
+
+    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
+    type = get_interval_type(endstrafter);
+    if (type == INTERVAL_INVALID)
+    {
+        condlog(0, "args latency_interval type is invalid");
+        return 0;
+    }
+
+    if (check_args_valid(*ionum, *interval, type) == 0)
+    {
+        return 0;
+    }
+
+	ratio = get_conversion_ratio(type);
+    *interval *= (long long)ratio;
+
+    return 1;
+}
+
+long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
+{
+    int index;
+    long long total = 0;
+
+    for (index = 0; index < size; index++)
+    {
+        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
+    }
+
+    total /= (size-1);
+
+    return (long long)sqrt((double)total);
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, temp;
+    int index = 0;
+    int io_num;
+    long long latency_interval;
+    long long avglatency;
+    long long standard_deviation;
+    long long toldelay = 0;
+    long long before, after;
+    struct timespec tv;
+
+	if (pp->fd < 0)
+		return -1;
+
+    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
+    {
+        condlog(0, "%s: get path_latency args fail", pp->dev);
+        return DEFAULT_PRIORITY;
+    }
+
+    memset(path_latency, 0, sizeof(path_latency));
+
+    temp = io_num;
+    while (temp-- > 0)
+    {
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return -1;
+        }
+
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        after = timeval_to_us(&tv);
+
+        path_latency[index] = after - before;
+        toldelay += path_latency[index++];
+    }
+
+    avglatency = toldelay/(long long)io_num;
+    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
+
+    if (avglatency > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
+        return DEFAULT_PRIORITY;
+    }
+
+    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
+    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
+    if (latency_interval <= (2 * standard_deviation))
+        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
+            pp->dev, standard_deviation);
+
+	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
+    return rc;
+}
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..3dd0d77 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I path_latency
+Generate the path priority based on a latency algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I path_latency
+Needs a value of the form
+\fI"<latency_interval>|<io_num>"\fR
+.RS
+.TP 8
+.I latency_interval
+The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
+For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
+.TP
+.I io_num
+The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
-- 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-16 21:38     ` [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Benjamin Marzinski
@ 2017-05-19  9:45       ` Yang Feng
  2017-05-22  8:02         ` Yang Feng
                           ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-19  9:45 UTC (permalink / raw)
  To: Benjamin Marzinski
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09, Martin Wilck

Hi Benjamin,

Thank you very much for your comments.
Please find my replys and the up-to-date patch.
Best regards!

> 
> First, one overall question. We have dynamic path selectors available to
> deal with paths that are just simply slower that other paths, but can
> still be used together.  Is there specific hardware or a specific setup
> where this isn't good enough and we really need to seperate these paths
> into different pathgroups, but we can't find out deterministically how
> the groups should be set up?  It just seems like there could be a less
> hacky solution to this problem, but perhaps there are some situations
> where this is truly the best option. I'm just wondering what those are.1. In the Storage-Backup environment of HyperCluster,includs one storage array near
to the host and one remote storage array, and the two storage arrays have the same hardware.
The same LUN is writed or readed by the two storage arrays.
However, usually, the average latency of the paths of the remote storage array is much higher than the
near storage array's.
apparently, the prioritizer can be a good automatic solution.
And the current selectors don't solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.
2. In the environment of single storage array, the prioritizer can automatically separate the paths who's latency is much higher,
IOs will not send to this paths.
But the current selectors don't solve this problem, IOPS will be influenced unavoidably.

>> +
>> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
>> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
>> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> 
> This is a nit, but for constant strings, could you please use "char
> *var" instead of "char var[]", to be consistent with the rest of the
> multipath code.
Thanks, as the following patch.

>> +    if ((args == NULL) || (interval == NULL)
>> +        || (consnum == NULL) || (type == NULL))
>> +        return 0;
>> +
>> +    /* int type */
>> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
>> +        return 0;
> 
> You should probably have log messages for these error returns.
Thanks, as the following patch.

>> +
>> +    memcpy(source, args, size+1);
>> +    if (strstr(source, vertica) == NULL)
>> +        return 0;
>> +
>> +    *type = get_interval_type(source, typestr);
>> +    if (*type == INTERVAL_INVALID)
>> +    {
>> +        condlog(0, "delay_interval type is invalid");
>> +        return 0;
>> +    }
> 
> I'm confused here. How do you get to use the default interval. Shouldn't
> you accept "20s|" and "|30" and as valid inputs that use the defaults
> for the part they don't specify. 
OK,the default arguments value is removed. If get inputs failed, return default priority "0".
As the following patch.
> 
>> +    tokenbefore = strtok(source, vertica);
>> +    tokenafter = strtok(NULL, vertica);
>> +    typestr[1] = '\0';
>> +    tokenbefore = strtok(tokenbefore, typestr);
>> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
>> +        return 0;
>> +
>> +    tmp = tokenbefore;
>> +    while (*tmp != '\0')
>> +        if (!isdigit(*tmp++))
>> +        {
>> +            condlog(0, "delay_interval string include invalid char");
>> +            return 0;
>> +        }
>> +
>> +    tmp = tokenafter;
>> +    while (*tmp != '\0')
>> +        if (!isdigit(*tmp++))
>> +        {
>> +            condlog(0, "cons_num string include invalid char");
>> +            return 0;
>> +        }
>> +
>> +    *interval = atoi(tokenbefore);
> 
> Why do you keep track of the type and the interval seperately? Can't you
> just find out the type, and use that to multiply the interval once you
> read it, and then just use that value, instead of keeping track of two
> values across multiple functions?
Thanks, as the following patch.

>> +
>> +	if (pp->fd < 0)
>> +	    return -PRIO_NO_INFORMATION;
>> +
>> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
>> +    {
>> +        condlog(3, "%s: get delay arg fail", pp->dev);
> 
> Why use the word "fail" in this message? Not setting prio_args to get
> the defaults seems like a perfectly valid choice.
The defaults are not used. Insteadly, return default priority "0". See below.

>> diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
>> new file mode 100644
>> index 0000000..d8213e9
>> --- /dev/null
>> +++ b/libmultipath/prioritizers/delayedpath.h
>> @@ -0,0 +1,17 @@
>> +#ifndef _DELAYEDPATH_H
>> +#define _DELAYEDPATH_H
>> +
>> +#define PRIO_DELAYED_PATH "delayedpath"
> 
> In order for the rest of the code to refer to this prioritizer, this
> define should be in prio.h with the other prioritizer names, and as long
> as delayedpath.c includes prio.h, there's no need to put it in
> delayedpath.h.
OK, as the following patch.
> 
>> +
>> +#define PRIO_NO_INFORMATION 5
> 
> The rest of the multipath code only cares if getprio returns a negative
> number of not. It doesn't check what the specific negative number is.  I
> realize the the alua prioritizer returns a set of error codes, but they
> aren't used, or even usable in their present form. If we wanted to have
> better error reporting, we should set up a universal set of error codes
> in prio.h, and have all prioritizers use them, instead of having each
> prioritizer define its own error codes. There's no reason why your
> prioritizer needs to return this error code instead of -1.
OK, as the following patch.
> 
>> +
>> +#define USEC_PER_SEC      1000000LL
>> +#define USEC_PER_MSEC     1000LL
>> +#define USEC_PER_USEC     1LL
>> +
>> +static inline long long timeval_to_us(const struct timespec *tv)
>> +{
>> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
>> +}
> 
> No other file besides delayedpath.c will likely be including this .h
> file, so I don't see any purpose for these being defined here.  In fact,
> I don't see why you can't just have a .c file without a .h file like the
> majority of prioritizers.  I'm pretty sure that none of the prioritizers
> really need their own .h file.
OK, as the following patch.
> 
>> +#endif
>> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
>> index 5939688..f1e126e 100644
>> --- a/multipath/multipath.conf.5
>> +++ b/multipath/multipath.conf.5
>> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>>  Generate the path priority based on the regular expression and the
>>  priority provided as argument. Requires prio_args keyword.
>>  .TP
>> +.I delayedpath
>> +Generate the path priority based on a time-delay algorithm.
>> +Requires prio_args keyword.
> 
> Really it doesn't require prio_args if you want to use the default
> values, and should probably say so.
The default args is discarded, as the following patch.

>> +.I delayed
>> +Needs a value of the form
>> +\fI"<delay_interval|cons_num>"\fR
>> +.RS
>> +.TP 8
>> +.I delay_interval
>> +The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
>> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
>> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
>> +.TP
>> +.I cons_num
>> +The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
>> +For example: 30. The default is: 20.
>> +.RE
>> +.TP 12
> 
> Looking at the "weighted" prio_args definition just above your "delayed"
> definition, the pipe character "|" is being used to say that any of a
> set of options is allowed.  Your definition has it being a literal
> character, but it's still inside the angle brackets that usually
> delineate a variable.  perhaps "<delay_interval>|<io_num>" would be
> easier to understand, or even "[delayed_interval]|[io_num]" if you can
> omit these to use the defaults.
OK, as the following patch.
The up-to-date patch as follows:

---
>From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
From: Yang Feng <philip.yang@huawei.com>
Date: Fri, 19 May 2017 16:09:07 +0800
Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.

                   latency_interval   latency_interval   latency_interval       latency_interval
         	 |------------------|------------------|------------------|...|------------------|
		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
		 |------------------|------------------|------------------|...|------------------|
				          Priority Rank Partitioning
---
 libmultipath/prioritizers/Makefile       |   6 +-
 libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
 multipath/multipath.conf.5               |  18 ++
 libmultipath/prio.h 			  |   1 +
 4 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 libmultipath/prioritizers/path_latency.c

diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..d2f20f6 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriopath_latency.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
new file mode 100644
index 0000000..a666b6c
--- /dev/null
+++ b/libmultipath/prioritizers/path_latency.c
@@ -0,0 +1,271 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority
+ * values of specific paths are provided by a latency algorithm. And the
+ * latency algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "io_num" of read IOs to the current path
+ *    continuously, the IOs' average latency can be calculated.
+ * 2. According to the average latency of each path and the weight value
+ *    "latency_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <math.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../checkers/libsg.h"
+
+#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
+
+#define MAX_IO_NUM              200
+#define MIN_IO_NUM              10
+
+#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
+#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
+
+#define DEFAULT_PRIORITY        0
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_USEC               "us"
+#define CHAR_MSEC               "ms"
+#define CHAR_SEC                "s"
+
+enum interval_type {
+    INTERVAL_USEC,
+    INTERVAL_MSEC,
+    INTERVAL_SEC,
+    INTERVAL_INVALID
+};
+
+/* interval_unit_str and interval_unit_type keep the same assignment sequence */
+static const char *interval_unit_str[MAX_CHAR_SIZE] = {
+    CHAR_USEC, CHAR_MSEC, CHAR_SEC
+};
+static const int interval_unit_type[] = {
+    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
+};
+
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static const int conversion_ratio[] = {
+    [INTERVAL_USEC]		= USEC_PER_USEC,
+    [INTERVAL_MSEC]     = USEC_PER_MSEC,
+    [INTERVAL_SEC]		= USEC_PER_SEC,
+    [INTERVAL_INVALID]	= 0
+};
+
+static long long path_latency[MAX_IO_NUM];
+
+static inline long long timeval_to_us(const struct timespec *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
+}
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+int check_args_valid(int io_num, long long latency_interval, int type)
+{
+    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
+    {
+        condlog(0, "args io_num is more than the valid values range");
+        return 0;
+    }
+
+    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
+    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
+    {
+        condlog(0, "args latency_interval is more than the valid values range");
+        return 0;
+    }
+
+    return 1;
+}
+
+static int get_interval_type(char *type)
+{
+    int index;
+
+    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
+    {
+        if (strcmp(type, interval_unit_str[index]) == 0)
+        {
+            return interval_unit_type[index];
+        }
+    }
+
+    return INTERVAL_INVALID;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+
+/* In multipath.conf, args form: io_num|latency_interval. For example,
+*  args is "20|10ms", this function can get 20, 10.
+*/
+static int get_interval_and_ionum(char *args,
+                                        int *ionum,
+                                        long long *interval)
+{
+    char source[MAX_CHAR_SIZE];
+    char vertica = '|';
+    char *endstrbefore = NULL;
+    char *endstrafter = NULL;
+    int type;
+    unsigned int size = strlen(args);
+    long long ratio;
+
+    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
+    {
+        condlog(0, "args string is NULL");
+        return 0;
+    }
+
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+    {
+        condlog(0, "args string's size is too long");
+        return 0;
+    }
+
+    memcpy(source, args, size+1);
+
+    if (!isdigit(source[0]))
+    {
+        condlog(0, "args io_num string's first char is not digit");
+        return 0;
+    }
+
+    *ionum = (int)strtoul(source, &endstrbefore, 10);
+    if (endstrbefore[0] != vertica)
+    {
+        condlog(0, "segmentation char is invalid");
+        return 0;
+    }
+
+    if (!isdigit(endstrbefore[1]))
+    {
+        condlog(0, "args latency_interval string's first char is not digit");
+        return 0;
+    }
+
+    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
+    type = get_interval_type(endstrafter);
+    if (type == INTERVAL_INVALID)
+    {
+        condlog(0, "args latency_interval type is invalid");
+        return 0;
+    }
+
+    if (check_args_valid(*ionum, *interval, type) == 0)
+    {
+        return 0;
+    }
+
+	ratio = get_conversion_ratio(type);
+    *interval *= (long long)ratio;
+
+    return 1;
+}
+
+long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
+{
+    int index;
+    long long total = 0;
+
+    for (index = 0; index < size; index++)
+    {
+        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
+    }
+
+    total /= (size-1);
+
+    return (long long)sqrt((double)total);
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, temp;
+    int index = 0;
+    int io_num;
+    long long latency_interval;
+    long long avglatency;
+    long long standard_deviation;
+    long long toldelay = 0;
+    long long before, after;
+    struct timespec tv;
+
+	if (pp->fd < 0)
+		return -1;
+
+    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
+    {
+        condlog(0, "%s: get path_latency args fail", pp->dev);
+        return DEFAULT_PRIORITY;
+    }
+
+    memset(path_latency, 0, sizeof(path_latency));
+
+    temp = io_num;
+    while (temp-- > 0)
+    {
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return -1;
+        }
+
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        after = timeval_to_us(&tv);
+
+        path_latency[index] = after - before;
+        toldelay += path_latency[index++];
+    }
+
+    avglatency = toldelay/(long long)io_num;
+    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
+
+    if (avglatency > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
+        return DEFAULT_PRIORITY;
+    }
+
+    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
+    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
+    if (latency_interval <= (2 * standard_deviation))
+        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
+            pp->dev, standard_deviation);
+
+	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
+    return rc;
+}
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..3dd0d77 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I path_latency
+Generate the path priority based on a latency algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I path_latency
+Needs a value of the form
+\fI"<latency_interval>|<io_num>"\fR
+.RS
+.TP 8
+.I latency_interval
+The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
+For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
+.TP
+.I io_num
+The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
diff --git a/libmultipath/prio.h b/libmultipath/prio.h
index 0193c52..c97fe39 100644
--- a/libmultipath/prio.h
+++ b/libmultipath/prio.h
@@ -29,6 +29,7 @@ struct path;
 #define PRIO_RDAC		"rdac"
 #define PRIO_WEIGHTED_PATH	"weightedpath"
 #define PRIO_SYSFS		"sysfs"
+#define PRIO_PATH_LATENCY	"path_latency"

 /*
  * Value used to mark the fact prio was not defined
-- 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-19  8:43       ` Yang Feng
@ 2017-05-22  8:01         ` Yang Feng
  2017-05-24  1:58         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
  1 sibling, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-22  8:01 UTC (permalink / raw)
  To: Martin Wilck, Christophe Varoqui
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09, qiuxin

Hello Martin and Christophe,

How about this patch?
Thanks a lot.
Best.


On 2017/5/19 16:43, Yang Feng wrote:
> Hello Martin,
> 
> Firstly, thank you very much for your comments.
> And find my replys and the up-to-date patch.
> 
> Best regards!
> 
> 
>> Please think about the name once again. Maybe you should call it
>> "io_latency" or "path_latency" instead of "delayedpath"?
> OK, as the following patch.
> 
>>
>> Hm, I can't see a lot of difference in the parsing code wrt the
>> previous version. IMO it's still non-straightforward and hard to
>> comprehend. Maybe I didn't express myself clearly enough. Here is how
>> I'd code this:
>>
>>  1. Verify that the string starts with a digit. Error if it does not.
>>  2. Parse the delay interval using strtoul().
>>  3. The "end" pointer of strtoul() points to the unit, which has to be
>> "s", "ms" or "us". Verify, and set the unit accordingly.
>>  4. Verify that the next character is '|', and that it's followed by a
>> digit.
>>  5. Parse the number with strtoul()
>>  6. Verify that there's no garbage at the end of the string. 
> Thank you , as the following patch.
> 
>>
>> Please follow the https://en.wikipedia.org/wiki/Robustness_principle.
>> If a user enters "1500ms" here, the parsing will silently fail, and
>> with it the whole prio algorithm. This will cause user confusion.
>> Please don't do this
> Thank you , as the following patch.
> 
>>
>> However please consider lowering the upper bound, I kind of doubt that
>> 1000 IOs will finish quickly. More often than not, a lot of paths will
>> appear at the same time (e.g. if a port of a storage array is enabled)
>> and we'll have to send 1000 IOs to each one.
>>
> OK, the upper bound lower to 200, as the following patch.
> 
>>> +    while (temp-- > 0)
>>> +    {
>>> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
>>> +        before = timeval_to_us(&tv);		
>>> +
>>> +        if (do_readsector0(pp->fd, timeout) == 2)
>>> +        {
>>> +            condlog(0, "%s: path down", pp->dev);
>>> +            return -1;
>>> +        }
>>> +
>>> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
>>> +        after = timeval_to_us(&tv);
>>> +
>>> +        delay = after - before;
>>> +    	
>>> +        min = (min <= delay) ? min : delay;
>>> +        max = (max >= delay) ? max : delay;
>>> +
>>> +        toldelay += delay;
>>> +    }
>>> +
>>> +    toldelay -= min + max;
>>
>> Why are you doing this? If you want to discard "extreme" values, this
>> is probably not sufficient. If cons == 3, this will have the effect to
>> use a single measurement rather than an average, is that intended?
>>
>> Btw, as you are doing statistics here anyway, you may want to calculate
>> the estimate of the standard deviation and warn the user if the
>> delay_interval is smaller than, say, 2 * standard deviation.
>>
>> Please consider printing a message with the measured value at debug
>> level 3 or higher.
> OK, as the following patch.
> 
>>>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>>>  .RE
>>>  .TP 12
>>> +.I delayed
>>
>> should be "delayedpath" here?
> OK, as the following patch.
>>
>>> +Needs a value of the form
>>> +\fI"<delay_interval|cons_num>"\fR
>>> +.RS
>>> +.TP 8
>>> +.I delay_interval
>>> +The interval values of average IO-time-delay between two different
>>> neighbour ranks of path priority, used to partition different
>>> priority ranks.
>>
>> It might be good to give an example here, like this:
>>
>> "If delay_interval=10ms, the paths will be grouped in priority groups
>> with path latency 0-10ms, 10-20ms, 20-30ms, etc." 
> OK, as the following patch.>
> 
> ---
>>From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Fri, 19 May 2017 16:09:07 +0800
> Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
> arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
> 2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.
> 
>                    latency_interval   latency_interval   latency_interval       latency_interval
>          	 |------------------|------------------|------------------|...|------------------|
> 		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
> 		 |------------------|------------------|------------------|...|------------------|
> 				          Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile       |   6 +-
>  libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
>  multipath/multipath.conf.5               |  18 ++
>  3 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/path_latency.c
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..d2f20f6 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriopath_latency.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
> new file mode 100644
> index 0000000..a666b6c
> --- /dev/null
> +++ b/libmultipath/prioritizers/path_latency.c
> @@ -0,0 +1,271 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a latency algorithm. And the
> + * latency algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "io_num" of read IOs to the current path
> + *    continuously, the IOs' average latency can be calculated.
> + * 2. According to the average latency of each path and the weight value
> + *    "latency_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <math.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
> +
> +#define MAX_IO_NUM              200
> +#define MIN_IO_NUM              10
> +
> +#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
> +#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
> +
> +#define DEFAULT_PRIORITY        0
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_USEC               "us"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_SEC                "s"
> +
> +enum interval_type {
> +    INTERVAL_USEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_SEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char *interval_unit_str[MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static const int conversion_ratio[] = {
> +    [INTERVAL_USEC]		= USEC_PER_USEC,
> +    [INTERVAL_MSEC]     = USEC_PER_MSEC,
> +    [INTERVAL_SEC]		= USEC_PER_SEC,
> +    [INTERVAL_INVALID]	= 0
> +};
> +
> +static long long path_latency[MAX_IO_NUM];
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +int check_args_valid(int io_num, long long latency_interval, int type)
> +{
> +    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
> +    {
> +        condlog(0, "args io_num is more than the valid values range");
> +        return 0;
> +    }
> +
> +    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
> +    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
> +    {
> +        condlog(0, "args latency_interval is more than the valid values range");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static int get_interval_type(char *type)
> +{
> +    int index;
> +
> +    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
> +    {
> +        if (strcmp(type, interval_unit_str[index]) == 0)
> +        {
> +            return interval_unit_type[index];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +/* In multipath.conf, args form: io_num|latency_interval. For example,
> +*  args is "20|10ms", this function can get 20, 10.
> +*/
> +static int get_interval_and_ionum(char *args,
> +                                        int *ionum,
> +                                        long long *interval)
> +{
> +    char source[MAX_CHAR_SIZE];
> +    char vertica = '|';
> +    char *endstrbefore = NULL;
> +    char *endstrafter = NULL;
> +    int type;
> +    unsigned int size = strlen(args);
> +    long long ratio;
> +
> +    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
> +    {
> +        condlog(0, "args string is NULL");
> +        return 0;
> +    }
> +
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +    {
> +        condlog(0, "args string's size is too long");
> +        return 0;
> +    }
> +
> +    memcpy(source, args, size+1);
> +
> +    if (!isdigit(source[0]))
> +    {
> +        condlog(0, "args io_num string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *ionum = (int)strtoul(source, &endstrbefore, 10);
> +    if (endstrbefore[0] != vertica)
> +    {
> +        condlog(0, "segmentation char is invalid");
> +        return 0;
> +    }
> +
> +    if (!isdigit(endstrbefore[1]))
> +    {
> +        condlog(0, "args latency_interval string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
> +    type = get_interval_type(endstrafter);
> +    if (type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "args latency_interval type is invalid");
> +        return 0;
> +    }
> +
> +    if (check_args_valid(*ionum, *interval, type) == 0)
> +    {
> +        return 0;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +    *interval *= (long long)ratio;
> +
> +    return 1;
> +}
> +
> +long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
> +{
> +    int index;
> +    long long total = 0;
> +
> +    for (index = 0; index < size; index++)
> +    {
> +        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
> +    }
> +
> +    total /= (size-1);
> +
> +    return (long long)sqrt((double)total);
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, temp;
> +    int index = 0;
> +    int io_num;
> +    long long latency_interval;
> +    long long avglatency;
> +    long long standard_deviation;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +		return -1;
> +
> +    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
> +    {
> +        condlog(0, "%s: get path_latency args fail", pp->dev);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    memset(path_latency, 0, sizeof(path_latency));
> +
> +    temp = io_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        path_latency[index] = after - before;
> +        toldelay += path_latency[index++];
> +    }
> +
> +    avglatency = toldelay/(long long)io_num;
> +    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
> +
> +    if (avglatency > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
> +    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
> +    if (latency_interval <= (2 * standard_deviation))
> +        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
> +            pp->dev, standard_deviation);
> +
> +	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
> +    return rc;
> +}
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..3dd0d77 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I path_latency
> +Generate the path priority based on a latency algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I path_latency
> +Needs a value of the form
> +\fI"<latency_interval>|<io_num>"\fR
> +.RS
> +.TP 8
> +.I latency_interval
> +The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
> +For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
> +.TP
> +.I io_num
> +The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-19  9:45       ` Yang Feng
@ 2017-05-22  8:02         ` Yang Feng
  2017-05-24  1:59         ` Yang Feng
  2017-05-24  2:22         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
  2 siblings, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-22  8:02 UTC (permalink / raw)
  To: Benjamin Marzinski, Christophe Varoqui
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09, Martin Wilck

Hello Benjamin and Christophe,

How about this patch?
Thanks a lot.
Best.



On 2017/5/19 17:45, Yang Feng wrote:
> Hi Benjamin,
> 
> Thank you very much for your comments.
> Please find my replys and the up-to-date patch.
> Best regards!
> 
>>
>> First, one overall question. We have dynamic path selectors available to
>> deal with paths that are just simply slower that other paths, but can
>> still be used together.  Is there specific hardware or a specific setup
>> where this isn't good enough and we really need to seperate these paths
>> into different pathgroups, but we can't find out deterministically how
>> the groups should be set up?  It just seems like there could be a less
>> hacky solution to this problem, but perhaps there are some situations
>> where this is truly the best option. I'm just wondering what those are.1. In the Storage-Backup environment of HyperCluster,includs one storage array near
> to the host and one remote storage array, and the two storage arrays have the same hardware.
> The same LUN is writed or readed by the two storage arrays.
> However, usually, the average latency of the paths of the remote storage array is much higher than the
> near storage array's.
> apparently, the prioritizer can be a good automatic solution.
> And the current selectors don't solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.
> 2. In the environment of single storage array, the prioritizer can automatically separate the paths who's latency is much higher,
> IOs will not send to this paths.
> But the current selectors don't solve this problem, IOPS will be influenced unavoidably.
> 
>>> +
>>> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
>>> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
>>> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
>>
>> This is a nit, but for constant strings, could you please use "char
>> *var" instead of "char var[]", to be consistent with the rest of the
>> multipath code.
> Thanks, as the following patch.
> 
>>> +    if ((args == NULL) || (interval == NULL)
>>> +        || (consnum == NULL) || (type == NULL))
>>> +        return 0;
>>> +
>>> +    /* int type */
>>> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
>>> +        return 0;
>>
>> You should probably have log messages for these error returns.
> Thanks, as the following patch.
> 
>>> +
>>> +    memcpy(source, args, size+1);
>>> +    if (strstr(source, vertica) == NULL)
>>> +        return 0;
>>> +
>>> +    *type = get_interval_type(source, typestr);
>>> +    if (*type == INTERVAL_INVALID)
>>> +    {
>>> +        condlog(0, "delay_interval type is invalid");
>>> +        return 0;
>>> +    }
>>
>> I'm confused here. How do you get to use the default interval. Shouldn't
>> you accept "20s|" and "|30" and as valid inputs that use the defaults
>> for the part they don't specify. 
> OK,the default arguments value is removed. If get inputs failed, return default priority "0".
> As the following patch.
>>
>>> +    tokenbefore = strtok(source, vertica);
>>> +    tokenafter = strtok(NULL, vertica);
>>> +    typestr[1] = '\0';
>>> +    tokenbefore = strtok(tokenbefore, typestr);
>>> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
>>> +        return 0;
>>> +
>>> +    tmp = tokenbefore;
>>> +    while (*tmp != '\0')
>>> +        if (!isdigit(*tmp++))
>>> +        {
>>> +            condlog(0, "delay_interval string include invalid char");
>>> +            return 0;
>>> +        }
>>> +
>>> +    tmp = tokenafter;
>>> +    while (*tmp != '\0')
>>> +        if (!isdigit(*tmp++))
>>> +        {
>>> +            condlog(0, "cons_num string include invalid char");
>>> +            return 0;
>>> +        }
>>> +
>>> +    *interval = atoi(tokenbefore);
>>
>> Why do you keep track of the type and the interval seperately? Can't you
>> just find out the type, and use that to multiply the interval once you
>> read it, and then just use that value, instead of keeping track of two
>> values across multiple functions?
> Thanks, as the following patch.
> 
>>> +
>>> +	if (pp->fd < 0)
>>> +	    return -PRIO_NO_INFORMATION;
>>> +
>>> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
>>> +    {
>>> +        condlog(3, "%s: get delay arg fail", pp->dev);
>>
>> Why use the word "fail" in this message? Not setting prio_args to get
>> the defaults seems like a perfectly valid choice.
> The defaults are not used. Insteadly, return default priority "0". See below.
> 
>>> diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
>>> new file mode 100644
>>> index 0000000..d8213e9
>>> --- /dev/null
>>> +++ b/libmultipath/prioritizers/delayedpath.h
>>> @@ -0,0 +1,17 @@
>>> +#ifndef _DELAYEDPATH_H
>>> +#define _DELAYEDPATH_H
>>> +
>>> +#define PRIO_DELAYED_PATH "delayedpath"
>>
>> In order for the rest of the code to refer to this prioritizer, this
>> define should be in prio.h with the other prioritizer names, and as long
>> as delayedpath.c includes prio.h, there's no need to put it in
>> delayedpath.h.
> OK, as the following patch.
>>
>>> +
>>> +#define PRIO_NO_INFORMATION 5
>>
>> The rest of the multipath code only cares if getprio returns a negative
>> number of not. It doesn't check what the specific negative number is.  I
>> realize the the alua prioritizer returns a set of error codes, but they
>> aren't used, or even usable in their present form. If we wanted to have
>> better error reporting, we should set up a universal set of error codes
>> in prio.h, and have all prioritizers use them, instead of having each
>> prioritizer define its own error codes. There's no reason why your
>> prioritizer needs to return this error code instead of -1.
> OK, as the following patch.
>>
>>> +
>>> +#define USEC_PER_SEC      1000000LL
>>> +#define USEC_PER_MSEC     1000LL
>>> +#define USEC_PER_USEC     1LL
>>> +
>>> +static inline long long timeval_to_us(const struct timespec *tv)
>>> +{
>>> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
>>> +}
>>
>> No other file besides delayedpath.c will likely be including this .h
>> file, so I don't see any purpose for these being defined here.  In fact,
>> I don't see why you can't just have a .c file without a .h file like the
>> majority of prioritizers.  I'm pretty sure that none of the prioritizers
>> really need their own .h file.
> OK, as the following patch.
>>
>>> +#endif
>>> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
>>> index 5939688..f1e126e 100644
>>> --- a/multipath/multipath.conf.5
>>> +++ b/multipath/multipath.conf.5
>>> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>>>  Generate the path priority based on the regular expression and the
>>>  priority provided as argument. Requires prio_args keyword.
>>>  .TP
>>> +.I delayedpath
>>> +Generate the path priority based on a time-delay algorithm.
>>> +Requires prio_args keyword.
>>
>> Really it doesn't require prio_args if you want to use the default
>> values, and should probably say so.
> The default args is discarded, as the following patch.
> 
>>> +.I delayed
>>> +Needs a value of the form
>>> +\fI"<delay_interval|cons_num>"\fR
>>> +.RS
>>> +.TP 8
>>> +.I delay_interval
>>> +The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
>>> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
>>> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
>>> +.TP
>>> +.I cons_num
>>> +The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
>>> +For example: 30. The default is: 20.
>>> +.RE
>>> +.TP 12
>>
>> Looking at the "weighted" prio_args definition just above your "delayed"
>> definition, the pipe character "|" is being used to say that any of a
>> set of options is allowed.  Your definition has it being a literal
>> character, but it's still inside the angle brackets that usually
>> delineate a variable.  perhaps "<delay_interval>|<io_num>" would be
>> easier to understand, or even "[delayed_interval]|[io_num]" if you can
>> omit these to use the defaults.
> OK, as the following patch.
> The up-to-date patch as follows:
> 
> ---
>>From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Fri, 19 May 2017 16:09:07 +0800
> Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
> arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
> 2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.
> 
>                    latency_interval   latency_interval   latency_interval       latency_interval
>          	 |------------------|------------------|------------------|...|------------------|
> 		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
> 		 |------------------|------------------|------------------|...|------------------|
> 				          Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile       |   6 +-
>  libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
>  multipath/multipath.conf.5               |  18 ++
>  libmultipath/prio.h 			  |   1 +
>  4 files changed, 295 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/path_latency.c
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..d2f20f6 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriopath_latency.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
> new file mode 100644
> index 0000000..a666b6c
> --- /dev/null
> +++ b/libmultipath/prioritizers/path_latency.c
> @@ -0,0 +1,271 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a latency algorithm. And the
> + * latency algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "io_num" of read IOs to the current path
> + *    continuously, the IOs' average latency can be calculated.
> + * 2. According to the average latency of each path and the weight value
> + *    "latency_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <math.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
> +
> +#define MAX_IO_NUM              200
> +#define MIN_IO_NUM              10
> +
> +#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
> +#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
> +
> +#define DEFAULT_PRIORITY        0
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_USEC               "us"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_SEC                "s"
> +
> +enum interval_type {
> +    INTERVAL_USEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_SEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char *interval_unit_str[MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static const int conversion_ratio[] = {
> +    [INTERVAL_USEC]		= USEC_PER_USEC,
> +    [INTERVAL_MSEC]     = USEC_PER_MSEC,
> +    [INTERVAL_SEC]		= USEC_PER_SEC,
> +    [INTERVAL_INVALID]	= 0
> +};
> +
> +static long long path_latency[MAX_IO_NUM];
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +int check_args_valid(int io_num, long long latency_interval, int type)
> +{
> +    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
> +    {
> +        condlog(0, "args io_num is more than the valid values range");
> +        return 0;
> +    }
> +
> +    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
> +    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
> +    {
> +        condlog(0, "args latency_interval is more than the valid values range");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static int get_interval_type(char *type)
> +{
> +    int index;
> +
> +    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
> +    {
> +        if (strcmp(type, interval_unit_str[index]) == 0)
> +        {
> +            return interval_unit_type[index];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +/* In multipath.conf, args form: io_num|latency_interval. For example,
> +*  args is "20|10ms", this function can get 20, 10.
> +*/
> +static int get_interval_and_ionum(char *args,
> +                                        int *ionum,
> +                                        long long *interval)
> +{
> +    char source[MAX_CHAR_SIZE];
> +    char vertica = '|';
> +    char *endstrbefore = NULL;
> +    char *endstrafter = NULL;
> +    int type;
> +    unsigned int size = strlen(args);
> +    long long ratio;
> +
> +    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
> +    {
> +        condlog(0, "args string is NULL");
> +        return 0;
> +    }
> +
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +    {
> +        condlog(0, "args string's size is too long");
> +        return 0;
> +    }
> +
> +    memcpy(source, args, size+1);
> +
> +    if (!isdigit(source[0]))
> +    {
> +        condlog(0, "args io_num string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *ionum = (int)strtoul(source, &endstrbefore, 10);
> +    if (endstrbefore[0] != vertica)
> +    {
> +        condlog(0, "segmentation char is invalid");
> +        return 0;
> +    }
> +
> +    if (!isdigit(endstrbefore[1]))
> +    {
> +        condlog(0, "args latency_interval string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
> +    type = get_interval_type(endstrafter);
> +    if (type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "args latency_interval type is invalid");
> +        return 0;
> +    }
> +
> +    if (check_args_valid(*ionum, *interval, type) == 0)
> +    {
> +        return 0;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +    *interval *= (long long)ratio;
> +
> +    return 1;
> +}
> +
> +long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
> +{
> +    int index;
> +    long long total = 0;
> +
> +    for (index = 0; index < size; index++)
> +    {
> +        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
> +    }
> +
> +    total /= (size-1);
> +
> +    return (long long)sqrt((double)total);
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, temp;
> +    int index = 0;
> +    int io_num;
> +    long long latency_interval;
> +    long long avglatency;
> +    long long standard_deviation;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +		return -1;
> +
> +    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
> +    {
> +        condlog(0, "%s: get path_latency args fail", pp->dev);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    memset(path_latency, 0, sizeof(path_latency));
> +
> +    temp = io_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        path_latency[index] = after - before;
> +        toldelay += path_latency[index++];
> +    }
> +
> +    avglatency = toldelay/(long long)io_num;
> +    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
> +
> +    if (avglatency > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
> +    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
> +    if (latency_interval <= (2 * standard_deviation))
> +        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
> +            pp->dev, standard_deviation);
> +
> +	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
> +    return rc;
> +}
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..3dd0d77 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I path_latency
> +Generate the path priority based on a latency algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I path_latency
> +Needs a value of the form
> +\fI"<latency_interval>|<io_num>"\fR
> +.RS
> +.TP 8
> +.I latency_interval
> +The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
> +For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
> +.TP
> +.I io_num
> +The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> diff --git a/libmultipath/prio.h b/libmultipath/prio.h
> index 0193c52..c97fe39 100644
> --- a/libmultipath/prio.h
> +++ b/libmultipath/prio.h
> @@ -29,6 +29,7 @@ struct path;
>  #define PRIO_RDAC		"rdac"
>  #define PRIO_WEIGHTED_PATH	"weightedpath"
>  #define PRIO_SYSFS		"sysfs"
> +#define PRIO_PATH_LATENCY	"path_latency"
> 
>  /*
>   * Value used to mark the fact prio was not defined
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a latency algorithm
  2017-05-19  8:43       ` Yang Feng
  2017-05-22  8:01         ` Yang Feng
@ 2017-05-24  1:58         ` Yang Feng
  1 sibling, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-24  1:58 UTC (permalink / raw)
  To: Martin Wilck
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09

Hello Martin,

How about this patch?
Look forward your replies.
Thank you very much.
Best regards!




On 2017/5/19 16:43, Yang Feng wrote:
> Hello Martin,
> 
> Firstly, thank you very much for your comments.
> And find my replys and the up-to-date patch.
> 
> Best regards!
> 
> 
>> Please think about the name once again. Maybe you should call it
>> "io_latency" or "path_latency" instead of "delayedpath"?
> OK, as the following patch.
> 
>>
>> Hm, I can't see a lot of difference in the parsing code wrt the
>> previous version. IMO it's still non-straightforward and hard to
>> comprehend. Maybe I didn't express myself clearly enough. Here is how
>> I'd code this:
>>
>>  1. Verify that the string starts with a digit. Error if it does not.
>>  2. Parse the delay interval using strtoul().
>>  3. The "end" pointer of strtoul() points to the unit, which has to be
>> "s", "ms" or "us". Verify, and set the unit accordingly.
>>  4. Verify that the next character is '|', and that it's followed by a
>> digit.
>>  5. Parse the number with strtoul()
>>  6. Verify that there's no garbage at the end of the string. 
> Thank you , as the following patch.
> 
>>
>> Please follow the https://en.wikipedia.org/wiki/Robustness_principle.
>> If a user enters "1500ms" here, the parsing will silently fail, and
>> with it the whole prio algorithm. This will cause user confusion.
>> Please don't do this
> Thank you , as the following patch.
> 
>>
>> However please consider lowering the upper bound, I kind of doubt that
>> 1000 IOs will finish quickly. More often than not, a lot of paths will
>> appear at the same time (e.g. if a port of a storage array is enabled)
>> and we'll have to send 1000 IOs to each one.
>>
> OK, the upper bound lower to 200, as the following patch.
> 
>>> +    while (temp-- > 0)
>>> +    {
>>> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
>>> +        before = timeval_to_us(&tv);		
>>> +
>>> +        if (do_readsector0(pp->fd, timeout) == 2)
>>> +        {
>>> +            condlog(0, "%s: path down", pp->dev);
>>> +            return -1;
>>> +        }
>>> +
>>> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
>>> +        after = timeval_to_us(&tv);
>>> +
>>> +        delay = after - before;
>>> +    	
>>> +        min = (min <= delay) ? min : delay;
>>> +        max = (max >= delay) ? max : delay;
>>> +
>>> +        toldelay += delay;
>>> +    }
>>> +
>>> +    toldelay -= min + max;
>>
>> Why are you doing this? If you want to discard "extreme" values, this
>> is probably not sufficient. If cons == 3, this will have the effect to
>> use a single measurement rather than an average, is that intended?
>>
>> Btw, as you are doing statistics here anyway, you may want to calculate
>> the estimate of the standard deviation and warn the user if the
>> delay_interval is smaller than, say, 2 * standard deviation.
>>
>> Please consider printing a message with the measured value at debug
>> level 3 or higher.
> OK, as the following patch.
> 
>>>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>>>  .RE
>>>  .TP 12
>>> +.I delayed
>>
>> should be "delayedpath" here?
> OK, as the following patch.
>>
>>> +Needs a value of the form
>>> +\fI"<delay_interval|cons_num>"\fR
>>> +.RS
>>> +.TP 8
>>> +.I delay_interval
>>> +The interval values of average IO-time-delay between two different
>>> neighbour ranks of path priority, used to partition different
>>> priority ranks.
>>
>> It might be good to give an example here, like this:
>>
>> "If delay_interval=10ms, the paths will be grouped in priority groups
>> with path latency 0-10ms, 10-20ms, 20-30ms, etc." 
> OK, as the following patch.>
> 
> ---
>>From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Fri, 19 May 2017 16:09:07 +0800
> Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
> arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
> 2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.
> 
>                    latency_interval   latency_interval   latency_interval       latency_interval
>          	 |------------------|------------------|------------------|...|------------------|
> 		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
> 		 |------------------|------------------|------------------|...|------------------|
> 				          Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile       |   6 +-
>  libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
>  multipath/multipath.conf.5               |  18 ++
>  3 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/path_latency.c
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..d2f20f6 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriopath_latency.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
> new file mode 100644
> index 0000000..a666b6c
> --- /dev/null
> +++ b/libmultipath/prioritizers/path_latency.c
> @@ -0,0 +1,271 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a latency algorithm. And the
> + * latency algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "io_num" of read IOs to the current path
> + *    continuously, the IOs' average latency can be calculated.
> + * 2. According to the average latency of each path and the weight value
> + *    "latency_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <math.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
> +
> +#define MAX_IO_NUM              200
> +#define MIN_IO_NUM              10
> +
> +#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
> +#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
> +
> +#define DEFAULT_PRIORITY        0
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_USEC               "us"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_SEC                "s"
> +
> +enum interval_type {
> +    INTERVAL_USEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_SEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char *interval_unit_str[MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static const int conversion_ratio[] = {
> +    [INTERVAL_USEC]		= USEC_PER_USEC,
> +    [INTERVAL_MSEC]     = USEC_PER_MSEC,
> +    [INTERVAL_SEC]		= USEC_PER_SEC,
> +    [INTERVAL_INVALID]	= 0
> +};
> +
> +static long long path_latency[MAX_IO_NUM];
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +int check_args_valid(int io_num, long long latency_interval, int type)
> +{
> +    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
> +    {
> +        condlog(0, "args io_num is more than the valid values range");
> +        return 0;
> +    }
> +
> +    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
> +    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
> +    {
> +        condlog(0, "args latency_interval is more than the valid values range");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static int get_interval_type(char *type)
> +{
> +    int index;
> +
> +    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
> +    {
> +        if (strcmp(type, interval_unit_str[index]) == 0)
> +        {
> +            return interval_unit_type[index];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +/* In multipath.conf, args form: io_num|latency_interval. For example,
> +*  args is "20|10ms", this function can get 20, 10.
> +*/
> +static int get_interval_and_ionum(char *args,
> +                                        int *ionum,
> +                                        long long *interval)
> +{
> +    char source[MAX_CHAR_SIZE];
> +    char vertica = '|';
> +    char *endstrbefore = NULL;
> +    char *endstrafter = NULL;
> +    int type;
> +    unsigned int size = strlen(args);
> +    long long ratio;
> +
> +    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
> +    {
> +        condlog(0, "args string is NULL");
> +        return 0;
> +    }
> +
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +    {
> +        condlog(0, "args string's size is too long");
> +        return 0;
> +    }
> +
> +    memcpy(source, args, size+1);
> +
> +    if (!isdigit(source[0]))
> +    {
> +        condlog(0, "args io_num string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *ionum = (int)strtoul(source, &endstrbefore, 10);
> +    if (endstrbefore[0] != vertica)
> +    {
> +        condlog(0, "segmentation char is invalid");
> +        return 0;
> +    }
> +
> +    if (!isdigit(endstrbefore[1]))
> +    {
> +        condlog(0, "args latency_interval string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
> +    type = get_interval_type(endstrafter);
> +    if (type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "args latency_interval type is invalid");
> +        return 0;
> +    }
> +
> +    if (check_args_valid(*ionum, *interval, type) == 0)
> +    {
> +        return 0;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +    *interval *= (long long)ratio;
> +
> +    return 1;
> +}
> +
> +long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
> +{
> +    int index;
> +    long long total = 0;
> +
> +    for (index = 0; index < size; index++)
> +    {
> +        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
> +    }
> +
> +    total /= (size-1);
> +
> +    return (long long)sqrt((double)total);
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, temp;
> +    int index = 0;
> +    int io_num;
> +    long long latency_interval;
> +    long long avglatency;
> +    long long standard_deviation;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +		return -1;
> +
> +    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
> +    {
> +        condlog(0, "%s: get path_latency args fail", pp->dev);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    memset(path_latency, 0, sizeof(path_latency));
> +
> +    temp = io_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        path_latency[index] = after - before;
> +        toldelay += path_latency[index++];
> +    }
> +
> +    avglatency = toldelay/(long long)io_num;
> +    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
> +
> +    if (avglatency > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
> +    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
> +    if (latency_interval <= (2 * standard_deviation))
> +        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
> +            pp->dev, standard_deviation);
> +
> +	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
> +    return rc;
> +}
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..3dd0d77 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I path_latency
> +Generate the path priority based on a latency algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I path_latency
> +Needs a value of the form
> +\fI"<latency_interval>|<io_num>"\fR
> +.RS
> +.TP 8
> +.I latency_interval
> +The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
> +For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
> +.TP
> +.I io_num
> +The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm
  2017-05-19  9:45       ` Yang Feng
  2017-05-22  8:02         ` Yang Feng
@ 2017-05-24  1:59         ` Yang Feng
  2017-05-24  2:22         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
  2 siblings, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-24  1:59 UTC (permalink / raw)
  To: Benjamin Marzinski
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong, shenhong09,
	dm-devel, hege09, Martin Wilck

Hello Benjamin,

How about this patch?
Look forward your replies.
Thank you very much.
Best regards!





On 2017/5/19 17:45, Yang Feng wrote:
> Hi Benjamin,
> 
> Thank you very much for your comments.
> Please find my replys and the up-to-date patch.
> Best regards!
> 
>>
>> First, one overall question. We have dynamic path selectors available to
>> deal with paths that are just simply slower that other paths, but can
>> still be used together.  Is there specific hardware or a specific setup
>> where this isn't good enough and we really need to seperate these paths
>> into different pathgroups, but we can't find out deterministically how
>> the groups should be set up?  It just seems like there could be a less
>> hacky solution to this problem, but perhaps there are some situations
>> where this is truly the best option. I'm just wondering what those are.1. In the Storage-Backup environment of HyperCluster,includs one storage array near
> to the host and one remote storage array, and the two storage arrays have the same hardware.
> The same LUN is writed or readed by the two storage arrays.
> However, usually, the average latency of the paths of the remote storage array is much higher than the
> near storage array's.
> apparently, the prioritizer can be a good automatic solution.
> And the current selectors don't solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.
> 2. In the environment of single storage array, the prioritizer can automatically separate the paths who's latency is much higher,
> IOs will not send to this paths.
> But the current selectors don't solve this problem, IOPS will be influenced unavoidably.
> 
>>> +
>>> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
>>> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
>>> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
>>
>> This is a nit, but for constant strings, could you please use "char
>> *var" instead of "char var[]", to be consistent with the rest of the
>> multipath code.
> Thanks, as the following patch.
> 
>>> +    if ((args == NULL) || (interval == NULL)
>>> +        || (consnum == NULL) || (type == NULL))
>>> +        return 0;
>>> +
>>> +    /* int type */
>>> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
>>> +        return 0;
>>
>> You should probably have log messages for these error returns.
> Thanks, as the following patch.
> 
>>> +
>>> +    memcpy(source, args, size+1);
>>> +    if (strstr(source, vertica) == NULL)
>>> +        return 0;
>>> +
>>> +    *type = get_interval_type(source, typestr);
>>> +    if (*type == INTERVAL_INVALID)
>>> +    {
>>> +        condlog(0, "delay_interval type is invalid");
>>> +        return 0;
>>> +    }
>>
>> I'm confused here. How do you get to use the default interval. Shouldn't
>> you accept "20s|" and "|30" and as valid inputs that use the defaults
>> for the part they don't specify. 
> OK,the default arguments value is removed. If get inputs failed, return default priority "0".
> As the following patch.
>>
>>> +    tokenbefore = strtok(source, vertica);
>>> +    tokenafter = strtok(NULL, vertica);
>>> +    typestr[1] = '\0';
>>> +    tokenbefore = strtok(tokenbefore, typestr);
>>> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
>>> +        return 0;
>>> +
>>> +    tmp = tokenbefore;
>>> +    while (*tmp != '\0')
>>> +        if (!isdigit(*tmp++))
>>> +        {
>>> +            condlog(0, "delay_interval string include invalid char");
>>> +            return 0;
>>> +        }
>>> +
>>> +    tmp = tokenafter;
>>> +    while (*tmp != '\0')
>>> +        if (!isdigit(*tmp++))
>>> +        {
>>> +            condlog(0, "cons_num string include invalid char");
>>> +            return 0;
>>> +        }
>>> +
>>> +    *interval = atoi(tokenbefore);
>>
>> Why do you keep track of the type and the interval seperately? Can't you
>> just find out the type, and use that to multiply the interval once you
>> read it, and then just use that value, instead of keeping track of two
>> values across multiple functions?
> Thanks, as the following patch.
> 
>>> +
>>> +	if (pp->fd < 0)
>>> +	    return -PRIO_NO_INFORMATION;
>>> +
>>> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
>>> +    {
>>> +        condlog(3, "%s: get delay arg fail", pp->dev);
>>
>> Why use the word "fail" in this message? Not setting prio_args to get
>> the defaults seems like a perfectly valid choice.
> The defaults are not used. Insteadly, return default priority "0". See below.
> 
>>> diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
>>> new file mode 100644
>>> index 0000000..d8213e9
>>> --- /dev/null
>>> +++ b/libmultipath/prioritizers/delayedpath.h
>>> @@ -0,0 +1,17 @@
>>> +#ifndef _DELAYEDPATH_H
>>> +#define _DELAYEDPATH_H
>>> +
>>> +#define PRIO_DELAYED_PATH "delayedpath"
>>
>> In order for the rest of the code to refer to this prioritizer, this
>> define should be in prio.h with the other prioritizer names, and as long
>> as delayedpath.c includes prio.h, there's no need to put it in
>> delayedpath.h.
> OK, as the following patch.
>>
>>> +
>>> +#define PRIO_NO_INFORMATION 5
>>
>> The rest of the multipath code only cares if getprio returns a negative
>> number of not. It doesn't check what the specific negative number is.  I
>> realize the the alua prioritizer returns a set of error codes, but they
>> aren't used, or even usable in their present form. If we wanted to have
>> better error reporting, we should set up a universal set of error codes
>> in prio.h, and have all prioritizers use them, instead of having each
>> prioritizer define its own error codes. There's no reason why your
>> prioritizer needs to return this error code instead of -1.
> OK, as the following patch.
>>
>>> +
>>> +#define USEC_PER_SEC      1000000LL
>>> +#define USEC_PER_MSEC     1000LL
>>> +#define USEC_PER_USEC     1LL
>>> +
>>> +static inline long long timeval_to_us(const struct timespec *tv)
>>> +{
>>> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
>>> +}
>>
>> No other file besides delayedpath.c will likely be including this .h
>> file, so I don't see any purpose for these being defined here.  In fact,
>> I don't see why you can't just have a .c file without a .h file like the
>> majority of prioritizers.  I'm pretty sure that none of the prioritizers
>> really need their own .h file.
> OK, as the following patch.
>>
>>> +#endif
>>> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
>>> index 5939688..f1e126e 100644
>>> --- a/multipath/multipath.conf.5
>>> +++ b/multipath/multipath.conf.5
>>> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>>>  Generate the path priority based on the regular expression and the
>>>  priority provided as argument. Requires prio_args keyword.
>>>  .TP
>>> +.I delayedpath
>>> +Generate the path priority based on a time-delay algorithm.
>>> +Requires prio_args keyword.
>>
>> Really it doesn't require prio_args if you want to use the default
>> values, and should probably say so.
> The default args is discarded, as the following patch.
> 
>>> +.I delayed
>>> +Needs a value of the form
>>> +\fI"<delay_interval|cons_num>"\fR
>>> +.RS
>>> +.TP 8
>>> +.I delay_interval
>>> +The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
>>> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
>>> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
>>> +.TP
>>> +.I cons_num
>>> +The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
>>> +For example: 30. The default is: 20.
>>> +.RE
>>> +.TP 12
>>
>> Looking at the "weighted" prio_args definition just above your "delayed"
>> definition, the pipe character "|" is being used to say that any of a
>> set of options is allowed.  Your definition has it being a literal
>> character, but it's still inside the angle brackets that usually
>> delineate a variable.  perhaps "<delay_interval>|<io_num>" would be
>> easier to understand, or even "[delayed_interval]|[io_num]" if you can
>> omit these to use the defaults.
> OK, as the following patch.
> The up-to-date patch as follows:
> 
> ---
>>From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Fri, 19 May 2017 16:09:07 +0800
> Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
> arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
> 2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.
> 
>                    latency_interval   latency_interval   latency_interval       latency_interval
>          	 |------------------|------------------|------------------|...|------------------|
> 		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
> 		 |------------------|------------------|------------------|...|------------------|
> 				          Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile       |   6 +-
>  libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
>  multipath/multipath.conf.5               |  18 ++
>  libmultipath/prio.h 			  |   1 +
>  4 files changed, 295 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/path_latency.c
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..d2f20f6 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriopath_latency.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
> new file mode 100644
> index 0000000..a666b6c
> --- /dev/null
> +++ b/libmultipath/prioritizers/path_latency.c
> @@ -0,0 +1,271 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a latency algorithm. And the
> + * latency algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "io_num" of read IOs to the current path
> + *    continuously, the IOs' average latency can be calculated.
> + * 2. According to the average latency of each path and the weight value
> + *    "latency_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <math.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
> +
> +#define MAX_IO_NUM              200
> +#define MIN_IO_NUM              10
> +
> +#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
> +#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
> +
> +#define DEFAULT_PRIORITY        0
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_USEC               "us"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_SEC                "s"
> +
> +enum interval_type {
> +    INTERVAL_USEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_SEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char *interval_unit_str[MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static const int conversion_ratio[] = {
> +    [INTERVAL_USEC]		= USEC_PER_USEC,
> +    [INTERVAL_MSEC]     = USEC_PER_MSEC,
> +    [INTERVAL_SEC]		= USEC_PER_SEC,
> +    [INTERVAL_INVALID]	= 0
> +};
> +
> +static long long path_latency[MAX_IO_NUM];
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +int check_args_valid(int io_num, long long latency_interval, int type)
> +{
> +    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
> +    {
> +        condlog(0, "args io_num is more than the valid values range");
> +        return 0;
> +    }
> +
> +    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
> +    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
> +    {
> +        condlog(0, "args latency_interval is more than the valid values range");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static int get_interval_type(char *type)
> +{
> +    int index;
> +
> +    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
> +    {
> +        if (strcmp(type, interval_unit_str[index]) == 0)
> +        {
> +            return interval_unit_type[index];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +/* In multipath.conf, args form: io_num|latency_interval. For example,
> +*  args is "20|10ms", this function can get 20, 10.
> +*/
> +static int get_interval_and_ionum(char *args,
> +                                        int *ionum,
> +                                        long long *interval)
> +{
> +    char source[MAX_CHAR_SIZE];
> +    char vertica = '|';
> +    char *endstrbefore = NULL;
> +    char *endstrafter = NULL;
> +    int type;
> +    unsigned int size = strlen(args);
> +    long long ratio;
> +
> +    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
> +    {
> +        condlog(0, "args string is NULL");
> +        return 0;
> +    }
> +
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +    {
> +        condlog(0, "args string's size is too long");
> +        return 0;
> +    }
> +
> +    memcpy(source, args, size+1);
> +
> +    if (!isdigit(source[0]))
> +    {
> +        condlog(0, "args io_num string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *ionum = (int)strtoul(source, &endstrbefore, 10);
> +    if (endstrbefore[0] != vertica)
> +    {
> +        condlog(0, "segmentation char is invalid");
> +        return 0;
> +    }
> +
> +    if (!isdigit(endstrbefore[1]))
> +    {
> +        condlog(0, "args latency_interval string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
> +    type = get_interval_type(endstrafter);
> +    if (type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "args latency_interval type is invalid");
> +        return 0;
> +    }
> +
> +    if (check_args_valid(*ionum, *interval, type) == 0)
> +    {
> +        return 0;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +    *interval *= (long long)ratio;
> +
> +    return 1;
> +}
> +
> +long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
> +{
> +    int index;
> +    long long total = 0;
> +
> +    for (index = 0; index < size; index++)
> +    {
> +        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
> +    }
> +
> +    total /= (size-1);
> +
> +    return (long long)sqrt((double)total);
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, temp;
> +    int index = 0;
> +    int io_num;
> +    long long latency_interval;
> +    long long avglatency;
> +    long long standard_deviation;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +		return -1;
> +
> +    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
> +    {
> +        condlog(0, "%s: get path_latency args fail", pp->dev);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    memset(path_latency, 0, sizeof(path_latency));
> +
> +    temp = io_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        path_latency[index] = after - before;
> +        toldelay += path_latency[index++];
> +    }
> +
> +    avglatency = toldelay/(long long)io_num;
> +    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
> +
> +    if (avglatency > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
> +    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
> +    if (latency_interval <= (2 * standard_deviation))
> +        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
> +            pp->dev, standard_deviation);
> +
> +	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
> +    return rc;
> +}
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..3dd0d77 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I path_latency
> +Generate the path priority based on a latency algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I path_latency
> +Needs a value of the form
> +\fI"<latency_interval>|<io_num>"\fR
> +.RS
> +.TP 8
> +.I latency_interval
> +The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
> +For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
> +.TP
> +.I io_num
> +The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> diff --git a/libmultipath/prio.h b/libmultipath/prio.h
> index 0193c52..c97fe39 100644
> --- a/libmultipath/prio.h
> +++ b/libmultipath/prio.h
> @@ -29,6 +29,7 @@ struct path;
>  #define PRIO_RDAC		"rdac"
>  #define PRIO_WEIGHTED_PATH	"weightedpath"
>  #define PRIO_SYSFS		"sysfs"
> +#define PRIO_PATH_LATENCY	"path_latency"
> 
>  /*
>   * Value used to mark the fact prio was not defined
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a latency algorithm
  2017-05-19  9:45       ` Yang Feng
  2017-05-22  8:02         ` Yang Feng
  2017-05-24  1:59         ` Yang Feng
@ 2017-05-24  2:22         ` Yang Feng
  2017-05-24  2:50           ` Yang Feng
  2 siblings, 1 reply; 19+ messages in thread
From: Yang Feng @ 2017-05-24  2:22 UTC (permalink / raw)
  To: Benjamin Marzinski
  Cc: zouming.zouming, guanjunxiong, shenhong09, dm-devel, hege09

Hello Benjamin,

Sorry for reply's word format error, and fixed now.
Thank you very much for your comments.
Please find my replys and the up-to-date patch.

Look forward your reply.
Best regards!

> 
> First, one overall question. We have dynamic path selectors available to
> deal with paths that are just simply slower that other paths, but can
> still be used together.  Is there specific hardware or a specific setup
> where this isn't good enough and we really need to seperate these paths
> into different pathgroups, but we can't find out deterministically how
> the groups should be set up?  It just seems like there could be a less
> hacky solution to this problem, but perhaps there are some situations
> where this is truly the best option. I'm just wondering what those are.

1. In the Storage-Backup environment of HyperCluster, includes one storage array near
to the host and one remote storage array, and the two storage arrays have the same hardware.
The same LUN is writed or readed by the two storage arrays.
However, usually, the average latency of the paths of the remote storage array is much higher than the
near storage array's.
apparently, the prioritizer can be a good automatic solution.
And the current selectors don't solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.
2. In the environment of single storage array, the prioritizer can automatically separate the paths who's latency is much higher,
IOs will not send to this paths.
But the current selectors don't solve this problem, IOPS will be influenced unavoidably.

>> +
>> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
>> +static const char interval_unit_str[][MAX_CHAR_SIZE] = {
>> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> 
> This is a nit, but for constant strings, could you please use "char
> *var" instead of "char var[]", to be consistent with the rest of the
> multipath code.
Thanks, as the following patch.

>> +    if ((args == NULL) || (interval == NULL)
>> +        || (consnum == NULL) || (type == NULL))
>> +        return 0;
>> +
>> +    /* int type */
>> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
>> +        return 0;
> 
> You should probably have log messages for these error returns.
Thanks, as the following patch.

>> +
>> +    memcpy(source, args, size+1);
>> +    if (strstr(source, vertica) == NULL)
>> +        return 0;
>> +
>> +    *type = get_interval_type(source, typestr);
>> +    if (*type == INTERVAL_INVALID)
>> +    {
>> +        condlog(0, "delay_interval type is invalid");
>> +        return 0;
>> +    }
> 
> I'm confused here. How do you get to use the default interval. Shouldn't
> you accept "20s|" and "|30" and as valid inputs that use the defaults
> for the part they don't specify. 
OK,the default arguments value is removed. If get inputs failed, return default priority "0".
As the following patch.
> 
>> +    tokenbefore = strtok(source, vertica);
>> +    tokenafter = strtok(NULL, vertica);
>> +    typestr[1] = '\0';
>> +    tokenbefore = strtok(tokenbefore, typestr);
>> +    if ((tokenbefore == NULL) || (tokenafter == NULL))
>> +        return 0;
>> +
>> +    tmp = tokenbefore;
>> +    while (*tmp != '\0')
>> +        if (!isdigit(*tmp++))
>> +        {
>> +            condlog(0, "delay_interval string include invalid char");
>> +            return 0;
>> +        }
>> +
>> +    tmp = tokenafter;
>> +    while (*tmp != '\0')
>> +        if (!isdigit(*tmp++))
>> +        {
>> +            condlog(0, "cons_num string include invalid char");
>> +            return 0;
>> +        }
>> +
>> +    *interval = atoi(tokenbefore);
> 
> Why do you keep track of the type and the interval seperately? Can't you
> just find out the type, and use that to multiply the interval once you
> read it, and then just use that value, instead of keeping track of two
> values across multiple functions?
Thanks, as the following patch.

>> +
>> +	if (pp->fd < 0)
>> +	    return -PRIO_NO_INFORMATION;
>> +
>> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
>> +    {
>> +        condlog(3, "%s: get delay arg fail", pp->dev);
> 
> Why use the word "fail" in this message? Not setting prio_args to get
> the defaults seems like a perfectly valid choice.
The defaults are not used. Insteadly, return default priority "0". See below.

>> diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
>> new file mode 100644
>> index 0000000..d8213e9
>> --- /dev/null
>> +++ b/libmultipath/prioritizers/delayedpath.h
>> @@ -0,0 +1,17 @@
>> +#ifndef _DELAYEDPATH_H
>> +#define _DELAYEDPATH_H
>> +
>> +#define PRIO_DELAYED_PATH "delayedpath"
> 
> In order for the rest of the code to refer to this prioritizer, this
> define should be in prio.h with the other prioritizer names, and as long
> as delayedpath.c includes prio.h, there's no need to put it in
> delayedpath.h.
OK, as the following patch.
> 
>> +
>> +#define PRIO_NO_INFORMATION 5
> 
> The rest of the multipath code only cares if getprio returns a negative
> number of not. It doesn't check what the specific negative number is.  I
> realize the the alua prioritizer returns a set of error codes, but they
> aren't used, or even usable in their present form. If we wanted to have
> better error reporting, we should set up a universal set of error codes
> in prio.h, and have all prioritizers use them, instead of having each
> prioritizer define its own error codes. There's no reason why your
> prioritizer needs to return this error code instead of -1.
OK, as the following patch.
> 
>> +
>> +#define USEC_PER_SEC      1000000LL
>> +#define USEC_PER_MSEC     1000LL
>> +#define USEC_PER_USEC     1LL
>> +
>> +static inline long long timeval_to_us(const struct timespec *tv)
>> +{
>> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
>> +}
> 
> No other file besides delayedpath.c will likely be including this .h
> file, so I don't see any purpose for these being defined here.  In fact,
> I don't see why you can't just have a .c file without a .h file like the
> majority of prioritizers.  I'm pretty sure that none of the prioritizers
> really need their own .h file.
OK, as the following patch.
> 
>> +#endif
>> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
>> index 5939688..f1e126e 100644
>> --- a/multipath/multipath.conf.5
>> +++ b/multipath/multipath.conf.5
>> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>>  Generate the path priority based on the regular expression and the
>>  priority provided as argument. Requires prio_args keyword.
>>  .TP
>> +.I delayedpath
>> +Generate the path priority based on a time-delay algorithm.
>> +Requires prio_args keyword.
> 
> Really it doesn't require prio_args if you want to use the default
> values, and should probably say so.
The default args is discarded, as the following patch.

>> +.I delayed
>> +Needs a value of the form
>> +\fI"<delay_interval|cons_num>"\fR
>> +.RS
>> +.TP 8
>> +.I delay_interval
>> +The interval values of average IO-time-delay between two different neighbour ranks of path priority, used to partition different priority ranks.
>> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 1000), us [1, 1000),
>> +For example: 10s, or 100us, or 100ms. The default is: 10ms.
>> +.TP
>> +.I cons_num
>> +The number of read IOs sent to the current path continuously, used to calculate the average IO-time-delay. Valid Values: Integer, [3, 1000].
>> +For example: 30. The default is: 20.
>> +.RE
>> +.TP 12
> 
> Looking at the "weighted" prio_args definition just above your "delayed"
> definition, the pipe character "|" is being used to say that any of a
> set of options is allowed.  Your definition has it being a literal
> character, but it's still inside the angle brackets that usually
> delineate a variable.  perhaps "<delay_interval>|<io_num>" would be
> easier to understand, or even "[delayed_interval]|[io_num]" if you can
> omit these to use the defaults.
OK, as the following patch.
The up-to-date patch as follows:

---
>From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
From: Yang Feng <philip.yang@huawei.com>
Date: Fri, 19 May 2017 16:09:07 +0800
Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.

                   latency_interval   latency_interval   latency_interval       latency_interval
         	 |------------------|------------------|------------------|...|------------------|
		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
		 |------------------|------------------|------------------|...|------------------|
				          Priority Rank Partitioning
---
 libmultipath/prioritizers/Makefile       |   6 +-
 libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
 multipath/multipath.conf.5               |  18 ++
 libmultipath/prio.h 			  |   1 +
 4 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 libmultipath/prioritizers/path_latency.c

diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..d2f20f6 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriopath_latency.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
new file mode 100644
index 0000000..a666b6c
--- /dev/null
+++ b/libmultipath/prioritizers/path_latency.c
@@ -0,0 +1,271 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority
+ * values of specific paths are provided by a latency algorithm. And the
+ * latency algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "io_num" of read IOs to the current path
+ *    continuously, the IOs' average latency can be calculated.
+ * 2. According to the average latency of each path and the weight value
+ *    "latency_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <math.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../checkers/libsg.h"
+
+#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
+
+#define MAX_IO_NUM              200
+#define MIN_IO_NUM              10
+
+#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
+#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
+
+#define DEFAULT_PRIORITY        0
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_USEC               "us"
+#define CHAR_MSEC               "ms"
+#define CHAR_SEC                "s"
+
+enum interval_type {
+    INTERVAL_USEC,
+    INTERVAL_MSEC,
+    INTERVAL_SEC,
+    INTERVAL_INVALID
+};
+
+/* interval_unit_str and interval_unit_type keep the same assignment sequence */
+static const char *interval_unit_str[MAX_CHAR_SIZE] = {
+    CHAR_USEC, CHAR_MSEC, CHAR_SEC
+};
+static const int interval_unit_type[] = {
+    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
+};
+
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static const int conversion_ratio[] = {
+    [INTERVAL_USEC]		= USEC_PER_USEC,
+    [INTERVAL_MSEC]     = USEC_PER_MSEC,
+    [INTERVAL_SEC]		= USEC_PER_SEC,
+    [INTERVAL_INVALID]	= 0
+};
+
+static long long path_latency[MAX_IO_NUM];
+
+static inline long long timeval_to_us(const struct timespec *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
+}
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+int check_args_valid(int io_num, long long latency_interval, int type)
+{
+    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
+    {
+        condlog(0, "args io_num is more than the valid values range");
+        return 0;
+    }
+
+    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
+    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
+    {
+        condlog(0, "args latency_interval is more than the valid values range");
+        return 0;
+    }
+
+    return 1;
+}
+
+static int get_interval_type(char *type)
+{
+    int index;
+
+    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
+    {
+        if (strcmp(type, interval_unit_str[index]) == 0)
+        {
+            return interval_unit_type[index];
+        }
+    }
+
+    return INTERVAL_INVALID;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+
+/* In multipath.conf, args form: io_num|latency_interval. For example,
+*  args is "20|10ms", this function can get 20, 10.
+*/
+static int get_interval_and_ionum(char *args,
+                                        int *ionum,
+                                        long long *interval)
+{
+    char source[MAX_CHAR_SIZE];
+    char vertica = '|';
+    char *endstrbefore = NULL;
+    char *endstrafter = NULL;
+    int type;
+    unsigned int size = strlen(args);
+    long long ratio;
+
+    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
+    {
+        condlog(0, "args string is NULL");
+        return 0;
+    }
+
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+    {
+        condlog(0, "args string's size is too long");
+        return 0;
+    }
+
+    memcpy(source, args, size+1);
+
+    if (!isdigit(source[0]))
+    {
+        condlog(0, "args io_num string's first char is not digit");
+        return 0;
+    }
+
+    *ionum = (int)strtoul(source, &endstrbefore, 10);
+    if (endstrbefore[0] != vertica)
+    {
+        condlog(0, "segmentation char is invalid");
+        return 0;
+    }
+
+    if (!isdigit(endstrbefore[1]))
+    {
+        condlog(0, "args latency_interval string's first char is not digit");
+        return 0;
+    }
+
+    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
+    type = get_interval_type(endstrafter);
+    if (type == INTERVAL_INVALID)
+    {
+        condlog(0, "args latency_interval type is invalid");
+        return 0;
+    }
+
+    if (check_args_valid(*ionum, *interval, type) == 0)
+    {
+        return 0;
+    }
+
+	ratio = get_conversion_ratio(type);
+    *interval *= (long long)ratio;
+
+    return 1;
+}
+
+long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
+{
+    int index;
+    long long total = 0;
+
+    for (index = 0; index < size; index++)
+    {
+        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
+    }
+
+    total /= (size-1);
+
+    return (long long)sqrt((double)total);
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, temp;
+    int index = 0;
+    int io_num;
+    long long latency_interval;
+    long long avglatency;
+    long long standard_deviation;
+    long long toldelay = 0;
+    long long before, after;
+    struct timespec tv;
+
+	if (pp->fd < 0)
+		return -1;
+
+    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
+    {
+        condlog(0, "%s: get path_latency args fail", pp->dev);
+        return DEFAULT_PRIORITY;
+    }
+
+    memset(path_latency, 0, sizeof(path_latency));
+
+    temp = io_num;
+    while (temp-- > 0)
+    {
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return -1;
+        }
+
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        after = timeval_to_us(&tv);
+
+        path_latency[index] = after - before;
+        toldelay += path_latency[index++];
+    }
+
+    avglatency = toldelay/(long long)io_num;
+    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
+
+    if (avglatency > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
+        return DEFAULT_PRIORITY;
+    }
+
+    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
+    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
+    if (latency_interval <= (2 * standard_deviation))
+        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
+            pp->dev, standard_deviation);
+
+	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
+    return rc;
+}
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..3dd0d77 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I path_latency
+Generate the path priority based on a latency algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I path_latency
+Needs a value of the form
+\fI"<latency_interval>|<io_num>"\fR
+.RS
+.TP 8
+.I latency_interval
+The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
+For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
+.TP
+.I io_num
+The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
diff --git a/libmultipath/prio.h b/libmultipath/prio.h
index 0193c52..c97fe39 100644
--- a/libmultipath/prio.h
+++ b/libmultipath/prio.h
@@ -29,6 +29,7 @@ struct path;
 #define PRIO_RDAC		"rdac"
 #define PRIO_WEIGHTED_PATH	"weightedpath"
 #define PRIO_SYSFS		"sysfs"
+#define PRIO_PATH_LATENCY	"path_latency"

 /*
  * Value used to mark the fact prio was not defined
-- 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a latency algorithm
  2017-05-24  2:22         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
@ 2017-05-24  2:50           ` Yang Feng
  2017-05-24 19:57             ` Benjamin Marzinski
  2017-06-01  1:50             ` [PATCH v2] multipath-tools: Prioritizer " Yang Feng
  0 siblings, 2 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-24  2:50 UTC (permalink / raw)
  To: Christophe Varoqui
  Cc: zouming.zouming, guanjunxiong, Chengjike (ISSP),
	shenhong09, dm-devel, hege09

Hello Christophe,

Our technology team is working on a open source contribution for multipath.
We think that this patch is essential to isolate the high latency paths and
avoid high fluctuation of io performance, particularly, for the Storage-Backup
environment of HyperCluster.

Look forward your reply very much.
Respects and regards!


1. In the Storage-Backup environment of HyperCluster, includes one storage array near
to the host and one remote storage array, and the two storage arrays have the same hardware.
The same LUN is writed or readed by the two storage arrays.However, usually, the average
latency of the paths of the remote storage array is much higher than the near storage array's.
Apparently, the prioritizer can be a good automatic solution. And the current selectors don't
solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.
2. In the environment of single storage array, the prioritizer can automatically separate the paths
who's latency is much higher, IOs will not send to this paths. But the current selectors don't solve
this problem, io performance will be influenced unavoidably.


The up-to-date patch as follows:
---
From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
From: Yang Feng <philip.yang@huawei.com>
Date: Fri, 19 May 2017 16:09:07 +0800
Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.

                   latency_interval   latency_interval   latency_interval       latency_interval
         	 |------------------|------------------|------------------|...|------------------|
		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
		 |------------------|------------------|------------------|...|------------------|
				          Priority Rank Partitioning
---
 libmultipath/prioritizers/Makefile       |   6 +-
 libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
 multipath/multipath.conf.5               |  18 ++
 libmultipath/prio.h 			  |   1 +
 4 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 libmultipath/prioritizers/path_latency.c

diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..d2f20f6 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriopath_latency.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
new file mode 100644
index 0000000..a666b6c
--- /dev/null
+++ b/libmultipath/prioritizers/path_latency.c
@@ -0,0 +1,271 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority
+ * values of specific paths are provided by a latency algorithm. And the
+ * latency algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "io_num" of read IOs to the current path
+ *    continuously, the IOs' average latency can be calculated.
+ * 2. According to the average latency of each path and the weight value
+ *    "latency_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <math.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../checkers/libsg.h"
+
+#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
+
+#define MAX_IO_NUM              200
+#define MIN_IO_NUM              10
+
+#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
+#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
+
+#define DEFAULT_PRIORITY        0
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_USEC               "us"
+#define CHAR_MSEC               "ms"
+#define CHAR_SEC                "s"
+
+enum interval_type {
+    INTERVAL_USEC,
+    INTERVAL_MSEC,
+    INTERVAL_SEC,
+    INTERVAL_INVALID
+};
+
+/* interval_unit_str and interval_unit_type keep the same assignment sequence */
+static const char *interval_unit_str[MAX_CHAR_SIZE] = {
+    CHAR_USEC, CHAR_MSEC, CHAR_SEC
+};
+static const int interval_unit_type[] = {
+    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
+};
+
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static const int conversion_ratio[] = {
+    [INTERVAL_USEC]		= USEC_PER_USEC,
+    [INTERVAL_MSEC]     = USEC_PER_MSEC,
+    [INTERVAL_SEC]		= USEC_PER_SEC,
+    [INTERVAL_INVALID]	= 0
+};
+
+static long long path_latency[MAX_IO_NUM];
+
+static inline long long timeval_to_us(const struct timespec *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
+}
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+int check_args_valid(int io_num, long long latency_interval, int type)
+{
+    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
+    {
+        condlog(0, "args io_num is more than the valid values range");
+        return 0;
+    }
+
+    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
+    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
+    {
+        condlog(0, "args latency_interval is more than the valid values range");
+        return 0;
+    }
+
+    return 1;
+}
+
+static int get_interval_type(char *type)
+{
+    int index;
+
+    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
+    {
+        if (strcmp(type, interval_unit_str[index]) == 0)
+        {
+            return interval_unit_type[index];
+        }
+    }
+
+    return INTERVAL_INVALID;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+
+/* In multipath.conf, args form: io_num|latency_interval. For example,
+*  args is "20|10ms", this function can get 20, 10.
+*/
+static int get_interval_and_ionum(char *args,
+                                        int *ionum,
+                                        long long *interval)
+{
+    char source[MAX_CHAR_SIZE];
+    char vertica = '|';
+    char *endstrbefore = NULL;
+    char *endstrafter = NULL;
+    int type;
+    unsigned int size = strlen(args);
+    long long ratio;
+
+    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
+    {
+        condlog(0, "args string is NULL");
+        return 0;
+    }
+
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+    {
+        condlog(0, "args string's size is too long");
+        return 0;
+    }
+
+    memcpy(source, args, size+1);
+
+    if (!isdigit(source[0]))
+    {
+        condlog(0, "args io_num string's first char is not digit");
+        return 0;
+    }
+
+    *ionum = (int)strtoul(source, &endstrbefore, 10);
+    if (endstrbefore[0] != vertica)
+    {
+        condlog(0, "segmentation char is invalid");
+        return 0;
+    }
+
+    if (!isdigit(endstrbefore[1]))
+    {
+        condlog(0, "args latency_interval string's first char is not digit");
+        return 0;
+    }
+
+    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
+    type = get_interval_type(endstrafter);
+    if (type == INTERVAL_INVALID)
+    {
+        condlog(0, "args latency_interval type is invalid");
+        return 0;
+    }
+
+    if (check_args_valid(*ionum, *interval, type) == 0)
+    {
+        return 0;
+    }
+
+	ratio = get_conversion_ratio(type);
+    *interval *= (long long)ratio;
+
+    return 1;
+}
+
+long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
+{
+    int index;
+    long long total = 0;
+
+    for (index = 0; index < size; index++)
+    {
+        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
+    }
+
+    total /= (size-1);
+
+    return (long long)sqrt((double)total);
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, temp;
+    int index = 0;
+    int io_num;
+    long long latency_interval;
+    long long avglatency;
+    long long standard_deviation;
+    long long toldelay = 0;
+    long long before, after;
+    struct timespec tv;
+
+	if (pp->fd < 0)
+		return -1;
+
+    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
+    {
+        condlog(0, "%s: get path_latency args fail", pp->dev);
+        return DEFAULT_PRIORITY;
+    }
+
+    memset(path_latency, 0, sizeof(path_latency));
+
+    temp = io_num;
+    while (temp-- > 0)
+    {
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return -1;
+        }
+
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        after = timeval_to_us(&tv);
+
+        path_latency[index] = after - before;
+        toldelay += path_latency[index++];
+    }
+
+    avglatency = toldelay/(long long)io_num;
+    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
+
+    if (avglatency > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
+        return DEFAULT_PRIORITY;
+    }
+
+    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
+    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
+    if (latency_interval <= (2 * standard_deviation))
+        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
+            pp->dev, standard_deviation);
+
+	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
+    return rc;
+}
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..3dd0d77 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I path_latency
+Generate the path priority based on a latency algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I path_latency
+Needs a value of the form
+\fI"<latency_interval>|<io_num>"\fR
+.RS
+.TP 8
+.I latency_interval
+The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
+For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
+.TP
+.I io_num
+The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
diff --git a/libmultipath/prio.h b/libmultipath/prio.h
index 0193c52..c97fe39 100644
--- a/libmultipath/prio.h
+++ b/libmultipath/prio.h
@@ -29,6 +29,7 @@ struct path;
 #define PRIO_RDAC		"rdac"
 #define PRIO_WEIGHTED_PATH	"weightedpath"
 #define PRIO_SYSFS		"sysfs"
+#define PRIO_PATH_LATENCY	"path_latency"

 /*
  * Value used to mark the fact prio was not defined
-- 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a latency algorithm
  2017-05-24  2:50           ` Yang Feng
@ 2017-05-24 19:57             ` Benjamin Marzinski
  2017-05-25  1:55               ` Yang Feng
  2017-06-01  1:50             ` [PATCH v2] multipath-tools: Prioritizer " Yang Feng
  1 sibling, 1 reply; 19+ messages in thread
From: Benjamin Marzinski @ 2017-05-24 19:57 UTC (permalink / raw)
  To: Yang Feng
  Cc: zouming.zouming, guanjunxiong, Chengjike (ISSP),
	shenhong09, dm-devel, hege09

On Wed, May 24, 2017 at 10:50:18AM +0800, Yang Feng wrote:
> Hello Christophe,

ACK

-Ben
 
> Our technology team is working on a open source contribution for multipath.
> We think that this patch is essential to isolate the high latency paths and
> avoid high fluctuation of io performance, particularly, for the Storage-Backup
> environment of HyperCluster.
> 
> Look forward your reply very much.
> Respects and regards!
> 
> 
> 1. In the Storage-Backup environment of HyperCluster, includes one storage array near
> to the host and one remote storage array, and the two storage arrays have the same hardware.
> The same LUN is writed or readed by the two storage arrays.However, usually, the average
> latency of the paths of the remote storage array is much higher than the near storage array's.
> Apparently, the prioritizer can be a good automatic solution. And the current selectors don't
> solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.
> 2. In the environment of single storage array, the prioritizer can automatically separate the paths
> who's latency is much higher, IOs will not send to this paths. But the current selectors don't solve
> this problem, io performance will be influenced unavoidably.
> 
> 
> The up-to-date patch as follows:
> ---
> >From 58d718fdd34550bd9c4a32c6e9a87099c1e45a9f Mon Sep 17 00:00:00 2001
> From: Yang Feng <philip.yang@huawei.com>
> Date: Fri, 19 May 2017 16:09:07 +0800
> Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
> arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
> 2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.
> 
>                    latency_interval   latency_interval   latency_interval       latency_interval
>          	 |------------------|------------------|------------------|...|------------------|
> 		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
> 		 |------------------|------------------|------------------|...|------------------|
> 				          Priority Rank Partitioning
> ---
>  libmultipath/prioritizers/Makefile       |   6 +-
>  libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
>  multipath/multipath.conf.5               |  18 ++
>  libmultipath/prio.h 			  |   1 +
>  4 files changed, 295 insertions(+), 1 deletion(-)
>  create mode 100644 libmultipath/prioritizers/path_latency.c
> 
> diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
> index 36b42e4..d2f20f6 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -18,13 +18,17 @@ LIBS = \
>  	libpriorandom.so \
>  	libpriordac.so \
>  	libprioweightedpath.so \
> -	libpriosysfs.so
> +	libpriopath_latency.so \
> +	libpriosysfs.so
> 
>  all: $(LIBS)
> 
>  libprioalua.so: alua.o alua_rtpg.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> +libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
> +	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
> +
>  libprio%.so: %.o
>  	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
> 
> diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
> new file mode 100644
> index 0000000..a666b6c
> --- /dev/null
> +++ b/libmultipath/prioritizers/path_latency.c
> @@ -0,0 +1,271 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding priority
> + * values of specific paths are provided by a latency algorithm. And the
> + * latency algorithm is dependent on arguments.
> + *
> + * The principle of the algorithm as follows:
> + * 1. By sending a certain number "io_num" of read IOs to the current path
> + *    continuously, the IOs' average latency can be calculated.
> + * 2. According to the average latency of each path and the weight value
> + *    "latency_interval", the priority "rc" of each path can be provided.
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <math.h>
> +#include <ctype.h>
> +#include <time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../checkers/libsg.h"
> +
> +#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
> +
> +#define MAX_IO_NUM              200
> +#define MIN_IO_NUM              10
> +
> +#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
> +#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
> +
> +#define DEFAULT_PRIORITY        0
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_USEC               "us"
> +#define CHAR_MSEC               "ms"
> +#define CHAR_SEC                "s"
> +
> +enum interval_type {
> +    INTERVAL_USEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_SEC,
> +    INTERVAL_INVALID
> +};
> +
> +/* interval_unit_str and interval_unit_type keep the same assignment sequence */
> +static const char *interval_unit_str[MAX_CHAR_SIZE] = {
> +    CHAR_USEC, CHAR_MSEC, CHAR_SEC
> +};
> +static const int interval_unit_type[] = {
> +    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
> +};
> +
> +#define USEC_PER_SEC      1000000LL
> +#define USEC_PER_MSEC     1000LL
> +#define USEC_PER_USEC     1LL
> +
> +static const int conversion_ratio[] = {
> +    [INTERVAL_USEC]		= USEC_PER_USEC,
> +    [INTERVAL_MSEC]     = USEC_PER_MSEC,
> +    [INTERVAL_SEC]		= USEC_PER_SEC,
> +    [INTERVAL_INVALID]	= 0
> +};
> +
> +static long long path_latency[MAX_IO_NUM];
> +
> +static inline long long timeval_to_us(const struct timespec *tv)
> +{
> +	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
> +}
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +
> +	return ret;
> +}
> +
> +int check_args_valid(int io_num, long long latency_interval, int type)
> +{
> +    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
> +    {
> +        condlog(0, "args io_num is more than the valid values range");
> +        return 0;
> +    }
> +
> +    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
> +    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
> +    {
> +        condlog(0, "args latency_interval is more than the valid values range");
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static int get_interval_type(char *type)
> +{
> +    int index;
> +
> +    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
> +    {
> +        if (strcmp(type, interval_unit_str[index]) == 0)
> +        {
> +            return interval_unit_type[index];
> +        }
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +
> +/* In multipath.conf, args form: io_num|latency_interval. For example,
> +*  args is "20|10ms", this function can get 20, 10.
> +*/
> +static int get_interval_and_ionum(char *args,
> +                                        int *ionum,
> +                                        long long *interval)
> +{
> +    char source[MAX_CHAR_SIZE];
> +    char vertica = '|';
> +    char *endstrbefore = NULL;
> +    char *endstrafter = NULL;
> +    int type;
> +    unsigned int size = strlen(args);
> +    long long ratio;
> +
> +    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
> +    {
> +        condlog(0, "args string is NULL");
> +        return 0;
> +    }
> +
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +    {
> +        condlog(0, "args string's size is too long");
> +        return 0;
> +    }
> +
> +    memcpy(source, args, size+1);
> +
> +    if (!isdigit(source[0]))
> +    {
> +        condlog(0, "args io_num string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *ionum = (int)strtoul(source, &endstrbefore, 10);
> +    if (endstrbefore[0] != vertica)
> +    {
> +        condlog(0, "segmentation char is invalid");
> +        return 0;
> +    }
> +
> +    if (!isdigit(endstrbefore[1]))
> +    {
> +        condlog(0, "args latency_interval string's first char is not digit");
> +        return 0;
> +    }
> +
> +    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
> +    type = get_interval_type(endstrafter);
> +    if (type == INTERVAL_INVALID)
> +    {
> +        condlog(0, "args latency_interval type is invalid");
> +        return 0;
> +    }
> +
> +    if (check_args_valid(*ionum, *interval, type) == 0)
> +    {
> +        return 0;
> +    }
> +
> +	ratio = get_conversion_ratio(type);
> +    *interval *= (long long)ratio;
> +
> +    return 1;
> +}
> +
> +long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
> +{
> +    int index;
> +    long long total = 0;
> +
> +    for (index = 0; index < size; index++)
> +    {
> +        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
> +    }
> +
> +    total /= (size-1);
> +
> +    return (long long)sqrt((double)total);
> +}
> +
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, temp;
> +    int index = 0;
> +    int io_num;
> +    long long latency_interval;
> +    long long avglatency;
> +    long long standard_deviation;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timespec tv;
> +
> +	if (pp->fd < 0)
> +		return -1;
> +
> +    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
> +    {
> +        condlog(0, "%s: get path_latency args fail", pp->dev);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    memset(path_latency, 0, sizeof(path_latency));
> +
> +    temp = io_num;
> +    while (temp-- > 0)
> +    {
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return -1;
> +        }
> +
> +        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
> +        after = timeval_to_us(&tv);
> +
> +        path_latency[index] = after - before;
> +        toldelay += path_latency[index++];
> +    }
> +
> +    avglatency = toldelay/(long long)io_num;
> +    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
> +
> +    if (avglatency > THRES_USEC_VALUE)
> +    {
> +        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
> +        return DEFAULT_PRIORITY;
> +    }
> +
> +    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
> +    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
> +    if (latency_interval <= (2 * standard_deviation))
> +        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
> +            pp->dev, standard_deviation);
> +
> +	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
> +    return rc;
> +}
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 5939688..3dd0d77 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
>  Generate the path priority based on the regular expression and the
>  priority provided as argument. Requires prio_args keyword.
>  .TP
> +.I path_latency
> +Generate the path priority based on a latency algorithm.
> +Requires prio_args keyword.
> +.TP
>  .I datacore
>  .\" XXX
>  ???. Requires prio_args keyword.
> @@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
>  "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
>  .RE
>  .TP 12
> +.I path_latency
> +Needs a value of the form
> +\fI"<latency_interval>|<io_num>"\fR
> +.RS
> +.TP 8
> +.I latency_interval
> +The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
> +Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
> +For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
> +.TP
> +.I io_num
> +The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [10, 200].
> +.RE
> +.TP 12
>  .I alua
>  If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
>  set will always be in their own path group.
> diff --git a/libmultipath/prio.h b/libmultipath/prio.h
> index 0193c52..c97fe39 100644
> --- a/libmultipath/prio.h
> +++ b/libmultipath/prio.h
> @@ -29,6 +29,7 @@ struct path;
>  #define PRIO_RDAC		"rdac"
>  #define PRIO_WEIGHTED_PATH	"weightedpath"
>  #define PRIO_SYSFS		"sysfs"
> +#define PRIO_PATH_LATENCY	"path_latency"
> 
>  /*
>   * Value used to mark the fact prio was not defined
> -- 
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] multipath-tools:Prioritizer based on a latency algorithm
  2017-05-24 19:57             ` Benjamin Marzinski
@ 2017-05-25  1:55               ` Yang Feng
  0 siblings, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-05-25  1:55 UTC (permalink / raw)
  To: Christophe Varoqui
  Cc: zouming.zouming, guanjunxiong, Chengjike (ISSP),
	shenhong09, dm-devel, hege09



On 2017/5/25 3:57, Benjamin Marzinski wrote:
> On Wed, May 24, 2017 at 10:50:18AM +0800, Yang Feng wrote:
>> Hello Christophe,
> ACK
> 
> -Ben
>  

Hello Christophe,

Any advance for this patch?
Thanks,Best!

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2] multipath-tools: Prioritizer based on a latency algorithm
  2017-05-24  2:50           ` Yang Feng
  2017-05-24 19:57             ` Benjamin Marzinski
@ 2017-06-01  1:50             ` Yang Feng
  1 sibling, 0 replies; 19+ messages in thread
From: Yang Feng @ 2017-06-01  1:50 UTC (permalink / raw)
  To: dm-devel, Christophe Varoqui
  Cc: zouming.zouming, Xose Vazquez Perez, guanjunxiong,
	Chengjike (ISSP), shenhong09, Hege (A),
	Martin Wilck, qiuxin

1. In the Storage-Backup environment of HyperCluster, includes one storage array near
to the host and one remote storage array, and the two storage arrays have the same hardware.
The same LUN is writed or readed by the two storage arrays. However, usually, the average
latency of the paths of the remote storage array is much higher than the near storage array's.
Apparently, the prioritizer can be a good automatic solution. And the current selectors don't
solve it, IOs will send to the paths of the remote storage array, IOPS will be influenced unavoidably.

2. In the environment of single storage array, the prioritizer can automatically separate the paths
who's latency is much higher, IOs will not send to this paths. But the current selectors don't solve
this problem, IOPS will be influenced unavoidably.

The value of "MIN_IO_NUM" is set 2 from 10, the patch v2 as follows:
---
Subject: [PATCH] libmultipath/prioritizers: Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a latency algorithm. And the latency algorithm is dependent on the following
arguments(latency_interval and io_num). The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path continuously, the IOs' average latency can be calculated.
2. According to the average latency of each path and the weight value "latency_interval", the priority "rc" of each path can be provided.

                   latency_interval   latency_interval   latency_interval       latency_interval
         	 |------------------|------------------|------------------|...|------------------|
		 |  priority rank 1 |  priority rank 2 |  priority rank 3 |...|  priority rank x |
		 |------------------|------------------|------------------|...|------------------|
				          Priority Rank Partitioning
---
 libmultipath/prioritizers/Makefile       |   6 +-
 libmultipath/prioritizers/path_latency.c | 271 +++++++++++++++++++++++++++++++
 multipath/multipath.conf.5               |  18 ++
 libmultipath/prio.h 			  |   1 +
 4 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 libmultipath/prioritizers/path_latency.c

diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..d2f20f6 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@ LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriopath_latency.so \
+	libpriosysfs.so

 all: $(LIBS)

 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

+libpriopath_latency.so: path_latency.o  ../checkers/libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lm
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^

diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c
new file mode 100644
index 0000000..a666b6c
--- /dev/null
+++ b/libmultipath/prioritizers/path_latency.c
@@ -0,0 +1,271 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority
+ * values of specific paths are provided by a latency algorithm. And the
+ * latency algorithm is dependent on arguments.
+ *
+ * The principle of the algorithm as follows:
+ * 1. By sending a certain number "io_num" of read IOs to the current path
+ *    continuously, the IOs' average latency can be calculated.
+ * 2. According to the average latency of each path and the weight value
+ *    "latency_interval", the priority "rc" of each path can be provided.
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <math.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../checkers/libsg.h"
+
+#define THRES_USEC_VALUE        120000000LL    /*unit: us, =120s*/
+
+#define MAX_IO_NUM              200
+#define MIN_IO_NUM              2
+
+#define MAX_LATENCY_INTERVAL    60            /*unit: s*/
+#define MIN_LATENCY_INTERVAL    1             /*unit: us, or ms, or s*/
+
+#define DEFAULT_PRIORITY        0
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_USEC               "us"
+#define CHAR_MSEC               "ms"
+#define CHAR_SEC                "s"
+
+enum interval_type {
+    INTERVAL_USEC,
+    INTERVAL_MSEC,
+    INTERVAL_SEC,
+    INTERVAL_INVALID
+};
+
+/* interval_unit_str and interval_unit_type keep the same assignment sequence */
+static const char *interval_unit_str[MAX_CHAR_SIZE] = {
+    CHAR_USEC, CHAR_MSEC, CHAR_SEC
+};
+static const int interval_unit_type[] = {
+    INTERVAL_USEC, INTERVAL_MSEC, INTERVAL_SEC
+};
+
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static const int conversion_ratio[] = {
+    [INTERVAL_USEC]		= USEC_PER_USEC,
+    [INTERVAL_MSEC]     = USEC_PER_MSEC,
+    [INTERVAL_SEC]		= USEC_PER_SEC,
+    [INTERVAL_INVALID]	= 0
+};
+
+static long long path_latency[MAX_IO_NUM];
+
+static inline long long timeval_to_us(const struct timespec *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + (tv->tv_nsec >> 10);
+}
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+
+	return ret;
+}
+
+int check_args_valid(int io_num, long long latency_interval, int type)
+{
+    if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM))
+    {
+        condlog(0, "args io_num is more than the valid values range");
+        return 0;
+    }
+
+    /* s:[1, 60], ms:[1, 60000], us:[1, 60000000] */
+    if ((latency_interval < MIN_LATENCY_INTERVAL) || (latency_interval > (MAX_LATENCY_INTERVAL * USEC_PER_SEC / conversion_ratio[type])))
+    {
+        condlog(0, "args latency_interval is more than the valid values range");
+        return 0;
+    }
+
+    return 1;
+}
+
+static int get_interval_type(char *type)
+{
+    int index;
+
+    for (index = 0; index < sizeof(interval_unit_str)/MAX_CHAR_SIZE; index++)
+    {
+        if (strcmp(type, interval_unit_str[index]) == 0)
+        {
+            return interval_unit_type[index];
+        }
+    }
+
+    return INTERVAL_INVALID;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+
+/* In multipath.conf, args form: io_num|latency_interval. For example,
+*  args is "20|10ms", this function can get 20, 10.
+*/
+static int get_interval_and_ionum(char *args,
+                                        int *ionum,
+                                        long long *interval)
+{
+    char source[MAX_CHAR_SIZE];
+    char vertica = '|';
+    char *endstrbefore = NULL;
+    char *endstrafter = NULL;
+    int type;
+    unsigned int size = strlen(args);
+    long long ratio;
+
+    if ((args == NULL) || (ionum == NULL) || (interval == NULL))
+    {
+        condlog(0, "args string is NULL");
+        return 0;
+    }
+
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+    {
+        condlog(0, "args string's size is too long");
+        return 0;
+    }
+
+    memcpy(source, args, size+1);
+
+    if (!isdigit(source[0]))
+    {
+        condlog(0, "args io_num string's first char is not digit");
+        return 0;
+    }
+
+    *ionum = (int)strtoul(source, &endstrbefore, 10);
+    if (endstrbefore[0] != vertica)
+    {
+        condlog(0, "segmentation char is invalid");
+        return 0;
+    }
+
+    if (!isdigit(endstrbefore[1]))
+    {
+        condlog(0, "args latency_interval string's first char is not digit");
+        return 0;
+    }
+
+    *interval = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
+    type = get_interval_type(endstrafter);
+    if (type == INTERVAL_INVALID)
+    {
+        condlog(0, "args latency_interval type is invalid");
+        return 0;
+    }
+
+    if (check_args_valid(*ionum, *interval, type) == 0)
+    {
+        return 0;
+    }
+
+	ratio = get_conversion_ratio(type);
+    *interval *= (long long)ratio;
+
+    return 1;
+}
+
+long long calc_standard_deviation(long long *path_latency, int size, long long avglatency)
+{
+    int index;
+    long long total = 0;
+
+    for (index = 0; index < size; index++)
+    {
+        total += (path_latency[index] - avglatency) * (path_latency[index] - avglatency);
+    }
+
+    total /= (size-1);
+
+    return (long long)sqrt((double)total);
+}
+
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, temp;
+    int index = 0;
+    int io_num;
+    long long latency_interval;
+    long long avglatency;
+    long long standard_deviation;
+    long long toldelay = 0;
+    long long before, after;
+    struct timespec tv;
+
+	if (pp->fd < 0)
+		return -1;
+
+    if (get_interval_and_ionum(args, &io_num, &latency_interval) == 0)
+    {
+        condlog(0, "%s: get path_latency args fail", pp->dev);
+        return DEFAULT_PRIORITY;
+    }
+
+    memset(path_latency, 0, sizeof(path_latency));
+
+    temp = io_num;
+    while (temp-- > 0)
+    {
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return -1;
+        }
+
+        (void)clock_gettime(CLOCK_MONOTONIC, &tv);
+        after = timeval_to_us(&tv);
+
+        path_latency[index] = after - before;
+        toldelay += path_latency[index++];
+    }
+
+    avglatency = toldelay/(long long)io_num;
+    condlog(4, "%s: average latency is (%lld)", pp->dev, avglatency);
+
+    if (avglatency > THRES_USEC_VALUE)
+    {
+        condlog(0, "%s: average latency (%lld) is more than thresold", pp->dev, avglatency);
+        return DEFAULT_PRIORITY;
+    }
+
+    /* warn the user if the latency_interval set is smaller than (2 * standard deviation), or equal */
+    standard_deviation = calc_standard_deviation(path_latency, index, avglatency);
+    if (latency_interval <= (2 * standard_deviation))
+        condlog(3, "%s: args latency_interval set is smaller than 2 * standard deviation (%lld us), or equal",
+            pp->dev, standard_deviation);
+
+	rc = (int)(THRES_USEC_VALUE - (avglatency/(long long)latency_interval));
+    return rc;
+}
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 5939688..3dd0d77 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -293,6 +293,10 @@ Generate a random priority between 1 and 10.
 Generate the path priority based on the regular expression and the
 priority provided as argument. Requires prio_args keyword.
 .TP
+.I path_latency
+Generate the path priority based on a latency algorithm.
+Requires prio_args keyword.
+.TP
 .I datacore
 .\" XXX
 ???. Requires prio_args keyword.
@@ -333,6 +337,20 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 "%N:%R:%n:%r"\fR. For example: 0x200100e08ba0aea0:0x210100e08ba0aea0:.*:.* , .*:.*:iqn.2009-10.com.redhat.msp.lab.ask-06:.*
 .RE
 .TP 12
+.I path_latency
+Needs a value of the form
+\fI"<latency_interval>|<io_num>"\fR
+.RS
+.TP 8
+.I latency_interval
+The interval values of average latency between two different neighbour ranks of path priority, used to partition different priority ranks.
+Form: XXs, or XXXus, or XXXms. Unit: Second, or Microsecond, or Millisecond. Valid Values: Integer, s [1, 60], ms [1, 60000], us [1, 60000000],
+For example: If latency_interval=10ms, the paths will be grouped in priority groups with path latency 0-10ms, 10-20ms, 20-30ms, etc..
+.TP
+.I io_num
+The number of read IOs sent to the current path continuously, used to calculate the average path latency. Valid Values: Integer, [2, 200].
+.RE
+.TP 12
 .I alua
 If \fIexclusive_pref_bit\fR is set, paths with the \fIpreferred path\fR bit
 set will always be in their own path group.
diff --git a/libmultipath/prio.h b/libmultipath/prio.h
index 0193c52..c97fe39 100644
--- a/libmultipath/prio.h
+++ b/libmultipath/prio.h
@@ -29,6 +29,7 @@ struct path;
 #define PRIO_RDAC		"rdac"
 #define PRIO_WEIGHTED_PATH	"weightedpath"
 #define PRIO_SYSFS		"sysfs"
+#define PRIO_PATH_LATENCY	"path_latency"

 /*
  * Value used to mark the fact prio was not defined
-- 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2017-06-01  1:50 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-08  3:58 [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Yang Feng
2017-05-10 22:36 ` Xose Vazquez Perez
2017-05-11  4:57   ` Philip Yang
2017-05-11 11:14 ` Martin Wilck
2017-05-15 10:44   ` Yang Feng
2017-05-16 14:53     ` Yang Feng
2017-05-16 18:54     ` Martin Wilck
2017-05-19  8:43       ` Yang Feng
2017-05-22  8:01         ` Yang Feng
2017-05-24  1:58         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
2017-05-16 21:38     ` [PATCH] multipath-tools:Prioritizer based on a time-delay algorithm Benjamin Marzinski
2017-05-19  9:45       ` Yang Feng
2017-05-22  8:02         ` Yang Feng
2017-05-24  1:59         ` Yang Feng
2017-05-24  2:22         ` [PATCH] multipath-tools:Prioritizer based on a latency algorithm Yang Feng
2017-05-24  2:50           ` Yang Feng
2017-05-24 19:57             ` Benjamin Marzinski
2017-05-25  1:55               ` Yang Feng
2017-06-01  1:50             ` [PATCH v2] multipath-tools: Prioritizer " Yang Feng

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.