From mboxrd@z Thu Jan 1 00:00:00 1970 From: Alexander Aring Date: Thu, 23 Jul 2020 10:49:08 -0400 Subject: [Cluster-devel] [PATCH dlm-next 4/4] fs: dlm: implement tcp graceful shutdown In-Reply-To: <20200723144908.271110-1-aahringo@redhat.com> References: <20200723144908.271110-1-aahringo@redhat.com> Message-ID: <20200723144908.271110-5-aahringo@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit During my code inspection I saw there is no implementation of a graceful shutdown for tcp. This patch will introduce a graceful shutdown for tcp connections. The shutdown is implemented synchronized as dlm_lowcomms_stop() is called to end all dlm communication. After shutdown is done, a lot of flush and closing functionality will be called. However I don't see a problem with that. The waitqueue for synchronize the shutdown has a timeout of 10 seconds, if timeout a force close will be exectued. Signed-off-by: Alexander Aring --- fs/dlm/lowcomms.c | 77 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index f5ac40db5e7c1..359d5e003f749 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,6 +63,7 @@ /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) struct cbuf { unsigned int base; @@ -110,10 +111,12 @@ struct connection { #define CF_CLOSE 6 #define CF_APP_LIMITED 7 #define CF_CLOSING 8 +#define CF_SHUTDOWN 9 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ + void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ struct page *rx_page; struct cbuf cb; int retries; @@ -122,6 +125,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ + wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -218,6 +222,7 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) spin_lock_init(&con->writequeue_lock); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); + init_waitqueue_head(&con->shutdown_wait); /* Setup action pointers for child sockets */ if (con->nodeid) { @@ -619,6 +624,54 @@ static void close_connection(struct connection *con, bool and_other, clear_bit(CF_CLOSING, &con->flags); } +static void shutdown_connection(struct connection *con) +{ + int ret; + + if (cancel_work_sync(&con->swork)) { + log_print("canceled swork for node %d", con->nodeid); + clear_bit(CF_WRITE_PENDING, &con->flags); + } + + mutex_lock(&con->sock_mutex); + /* nothing to shutdown */ + if (!con->sock) { + mutex_unlock(&con->sock_mutex); + return; + } + + set_bit(CF_SHUTDOWN, &con->flags); + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + mutex_unlock(&con->sock_mutex); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, + !test_bit(CF_SHUTDOWN, &con->flags), + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + clear_bit(CF_SHUTDOWN, &con->flags); + close_connection(con, false, true, true); +} + +static void dlm_tcp_shutdown(struct connection *con) +{ + if (con->othercon) + shutdown_connection(con->othercon); + shutdown_connection(con); +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { @@ -711,13 +764,18 @@ static int receive_from_sock(struct connection *con) out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false, true, false); /* Reconnect when there is something to send */ + close_connection(con, false, true, false); + if (ret == 0) { + log_print("connection %p got EOF from %d", + con, con->nodeid); + /* handling for tcp shutdown */ + clear_bit(CF_SHUTDOWN, &con->flags); + wake_up(&con->shutdown_wait); + /* signal to breaking receive worker */ + ret = -1; + } } - /* Don't return success if we really got EOF */ - if (ret == 0) - ret = -EAGAIN; - return ret; } @@ -801,6 +859,7 @@ static int accept_from_sock(struct connection *con) spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); + init_waitqueue_head(&othercon->shutdown_wait); set_bit(CF_IS_OTHERCON, &othercon->flags); } else { /* close other sock con if we have something new */ @@ -1045,6 +1104,7 @@ static void tcp_connect_to_sock(struct connection *con) con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; + con->shutdown_action = dlm_tcp_shutdown; add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid @@ -1540,6 +1600,12 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } +static void shutdown_conn(struct connection *con) +{ + if (con->shutdown_action) + con->shutdown_action(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); @@ -1591,6 +1657,7 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; mutex_unlock(&connections_lock); + foreach_conn(shutdown_conn); work_flush(); clean_writequeues(); foreach_conn(free_conn); -- 2.26.2