All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Yan, Zheng" <zheng.z.yan@intel.com>
To: ceph-devel@vger.kernel.org, sage@inktank.com, greg@inktank.com
Cc: "Yan, Zheng" <zheng.z.yan@intel.com>
Subject: [PATCH] mds: avoid sending duplicated table prepare/commit
Date: Mon,  1 Apr 2013 16:51:21 +0800	[thread overview]
Message-ID: <1364806282-15071-1-git-send-email-zheng.z.yan@intel.com> (raw)
In-Reply-To: <1363531902-24909-1-git-send-email-zheng.z.yan@intel.com>

From: "Yan, Zheng" <zheng.z.yan@intel.com>

This patch makes table client defer sending table prepare/commit messages
until receiving table server's 'ready' message. This avoid duplicated table
prepare/commit messages.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/AnchorClient.cc   |  9 +++++--
 src/mds/MDS.cc            | 14 +++++++++--
 src/mds/MDS.h             |  4 +++-
 src/mds/MDSTableClient.cc | 60 +++++++++++++++++++++++++----------------------
 src/mds/MDSTableClient.h  |  7 ++++--
 5 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
index 455e97f..bcc8710 100644
--- a/src/mds/AnchorClient.cc
+++ b/src/mds/AnchorClient.cc
@@ -41,7 +41,9 @@ void AnchorClient::handle_query_result(class MMDSTableRequest *m)
   ::decode(ino, p);
   ::decode(trace, p);
 
-  assert(pending_lookup.count(ino));
+  if (!pending_lookup.count(ino))
+    return;
+
   list<_pending_lookup> ls;
   ls.swap(pending_lookup[ino]);
   pending_lookup.erase(ino);
@@ -80,9 +82,12 @@ void AnchorClient::lookup(inodeno_t ino, vector<Anchor>& trace, Context *onfinis
 
 void AnchorClient::_lookup(inodeno_t ino)
 {
+  int ts = mds->mdsmap->get_tableserver();
+  if (mds->mdsmap->get_state(ts) < MDSMap::STATE_REJOIN)
+    return;
   MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_QUERY, 0, 0);
   ::encode(ino, req->bl);
-  mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  mds->send_message_mds(req, ts);
 }
 
 
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 32bb064..2d48815 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1050,7 +1050,7 @@ void MDS::handle_mds_map(MMDSMap *m)
     for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p)
       if (oldfailed.count(*p) == 0) {
 	messenger->mark_down(oldmap->get_inst(*p).addr);
-	mdcache->handle_mds_failure(*p);
+	handle_mds_failure(*p);
       }
     
     // or down then up?
@@ -1061,7 +1061,7 @@ void MDS::handle_mds_map(MMDSMap *m)
       if (oldmap->have_inst(*p) &&
 	  oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
 	messenger->mark_down(oldmap->get_inst(*p).addr);
-	mdcache->handle_mds_failure(*p);
+	handle_mds_failure(*p);
       }
   }
   if (is_clientreplay() || is_active() || is_stopping()) {
@@ -1548,6 +1548,16 @@ void MDS::handle_mds_recovery(int who)
   waiting_for_active_peer.erase(who);
 }
 
+void MDS::handle_mds_failure(int who)
+{
+  dout(5) << "handle_mds_failure mds." << who << dendl;
+
+  mdcache->handle_mds_failure(who);
+
+  anchorclient->handle_mds_failure(who);
+  snapclient->handle_mds_failure(who);
+}
+
 void MDS::stopping_start()
 {
   dout(2) << "stopping_start" << dendl;
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 42e8516..6658cf0 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -378,13 +378,15 @@ class MDS : public Dispatcher {
   void rejoin_joint_start();
   void rejoin_done();
   void recovery_done();
-  void handle_mds_recovery(int who);
   void clientreplay_start();
   void clientreplay_done();
   void active_start();
   void stopping_start();
   void stopping_done();
 
+  void handle_mds_recovery(int who);
+  void handle_mds_failure(int who);
+
   void suicide();
   void respawn();
 
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index 12331f9..2ce3286 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -65,18 +65,15 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
       }
     } 
     else if (pending_commit.count(tid)) {
-      dout(10) << "stray agree on " << reqid
-	       << " tid " << tid
-	       << ", already committing, resending COMMIT"
-	       << dendl;      
-      MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-      mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+	       << ", already committing, will resend COMMIT" << dendl;
+      assert(!server_ready);
+      // will re-send commit when receiving the server ready message
     }
     else {
-      dout(10) << "stray agree on " << reqid
-	       << " tid " << tid
-	       << ", sending ROLLBACK"
-	       << dendl;      
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+	       << ", sending ROLLBACK" << dendl;
+      assert(!server_ready);
       MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_ROLLBACK, 0, tid);
       mds->send_message_mds(req, mds->mdsmap->get_tableserver());
     }
@@ -102,6 +99,9 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
     break;
 
   case TABLESERVER_OP_SERVER_READY:
+    assert(!server_ready);
+    server_ready = true;
+
     if (last_reqid == ~0ULL)
       last_reqid = reqid;
 
@@ -144,26 +144,18 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
   uint64_t reqid = ++last_reqid;
   dout(10) << "_prepare " << reqid << dendl;
 
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
-  req->bl = mutation;
-
   pending_prepare[reqid].mutation = mutation;
   pending_prepare[reqid].ptid = ptid;
   pending_prepare[reqid].pbl = pbl;
   pending_prepare[reqid].onfinish = onfinish;
 
-  send_to_tableserver(req);
-}
-
-void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
-{
-  int ts = mds->mdsmap->get_tableserver();
-  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
-    mds->send_message_mds(req, ts);
-  else {
-    dout(10) << " deferring request to not-yet-active tableserver mds." << ts << dendl;
-  }
+  if (server_ready) {
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = mutation;
+    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
 
 void MDSTableClient::commit(version_t tid, LogSegment *ls)
@@ -176,9 +168,12 @@ void MDSTableClient::commit(version_t tid, LogSegment *ls)
 
   assert(g_conf->mds_kill_mdstable_at != 4);
 
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-  send_to_tableserver(req);
+  if (server_ready) {
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
+    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
 
 
@@ -228,3 +223,12 @@ void MDSTableClient::resend_prepares()
     mds->send_message_mds(req, mds->mdsmap->get_tableserver());
   }
 }
+
+void MDSTableClient::handle_mds_failure(int who)
+{
+  if (who != mds->mdsmap->get_tableserver())
+    return; // do nothing.
+
+  dout(7) << "tableserver mds." << who << " fails" << dendl;
+  server_ready = false;
+}
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index 934f5fe..f8a84eb 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -30,6 +30,8 @@ protected:
 
   uint64_t last_reqid;
 
+  bool server_ready;
+
   // prepares
   struct _pending_prepare {
     Context *onfinish;
@@ -63,7 +65,8 @@ protected:
   void _logged_ack(version_t tid);
 
 public:
-  MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(~0ULL) {}
+  MDSTableClient(MDS *m, int tab) :
+    mds(m), table(tab), last_reqid(~0ULL), server_ready(false) {}
   virtual ~MDSTableClient() {}
 
   void handle_request(MMDSTableRequest *m);
@@ -85,7 +88,7 @@ public:
     ack_waiters[tid].push_back(c);
   }
 
-  void send_to_tableserver(MMDSTableRequest *req);
+  void handle_mds_failure(int mds);
 
   // child must implement
   virtual void resend_queries() = 0;
-- 
1.7.11.7


  parent reply	other threads:[~2013-04-01  8:51 UTC|newest]

Thread overview: 117+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-03-17 14:51 ` [PATCH 01/39] mds: preserve subtree bounds until slave commit Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 02/39] mds: process finished contexts in batch Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth() Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 04/39] mds: make sure table request id unique Yan, Zheng
2013-03-19 23:09   ` Greg Farnum
2013-03-20  5:53     ` Yan, Zheng
2013-03-20  6:15       ` Sage Weil
2013-03-20  6:24         ` Yan, Zheng
2013-03-20  6:49         ` Yan, Zheng
2013-03-20 18:31           ` Greg Farnum
2013-03-21  8:07             ` Yan, Zheng
2013-03-21 22:03               ` Gregory Farnum
2013-03-25 11:30                 ` Yan, Zheng
2013-03-29 22:12                   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 05/39] mds: send table request when peer is in proper state Yan, Zheng
2013-03-20 18:34   ` Greg Farnum
2013-03-29 21:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 06/39] mds: make table client/server tolerate duplicated message Yan, Zheng
2013-03-29 22:00   ` Gregory Farnum
2013-03-31 13:21     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 07/39] mds: mark connection down when MDS fails Yan, Zheng
2013-03-20 18:37   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state Yan, Zheng
2013-03-20 18:40   ` Greg Farnum
2013-03-21  2:22     ` Yan, Zheng
2013-03-21 21:43       ` Gregory Farnum
2013-03-20 19:09   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 09/39] mds: defer eval gather locks when removing replica Yan, Zheng
2013-03-20 19:36   ` Greg Farnum
2013-03-21  2:29     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 10/39] mds: unify slave request waiting Yan, Zheng
2013-03-20 22:52   ` Sage Weil
2013-03-17 14:51 ` [PATCH 11/39] mds: don't delay processing replica buffer in slave request Yan, Zheng
2013-03-20 21:19   ` Greg Farnum
2013-03-21  2:38     ` Yan, Zheng
2013-03-21  4:15       ` Sage Weil
2013-03-21 21:48         ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 12/39] mds: compose and send resolve messages in batch Yan, Zheng
2013-03-20 21:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 13/39] mds: don't send resolve message between active MDS Yan, Zheng
2013-03-20 21:56   ` Gregory Farnum
2013-03-21  2:55     ` Yan, Zheng
2013-03-21 21:55       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
2013-03-20 22:09   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
2013-03-20 22:17   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
2013-03-20 22:57   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
2013-03-20 22:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
2013-03-21 17:59   ` Gregory Farnum
2013-03-22  3:04     ` Yan, Zheng
2013-03-29 22:02       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
2013-03-20 22:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
2013-03-20 23:26   ` Gregory Farnum
2013-03-20 23:36     ` Sage Weil
2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
2013-03-20 23:33   ` Gregory Farnum
2013-03-20 23:40     ` Gregory Farnum
2013-03-21  6:41     ` Yan, Zheng
2013-03-21 21:58       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
2013-03-21 21:23   ` Gregory Farnum
2013-03-22  3:05     ` Yan, Zheng
2013-03-25 16:14       ` Gregory Farnum
2013-03-26  7:21     ` Yan, Zheng
2013-03-29 22:09       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
2013-03-20 23:59   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
2013-03-21  0:37   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
2013-03-21  0:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
2013-03-21  0:52   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
2013-03-21  3:12   ` Gregory Farnum
2013-03-21  3:20     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
2013-03-21  3:14   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
2013-03-21  3:20   ` Gregory Farnum
2013-03-21  3:33     ` Yan, Zheng
2013-03-21  4:20       ` Sage Weil
2013-03-21 21:58     ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
2013-03-21  3:24   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
2013-03-21  3:27   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
2013-03-21  3:31   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
2013-03-21  3:34   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
2013-03-21  3:38   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
2013-03-21  3:40   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
2013-03-21  3:44   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
2013-03-21  3:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
2013-03-21  3:46   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
2013-03-21  3:49   ` Gregory Farnum
2013-04-01  8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-04-01 17:00   ` Gregory Farnum
2013-04-01  8:51 ` Yan, Zheng [this message]
2013-04-01  8:51   ` [PATCH] mds: don't roll back prepared table updates Yan, Zheng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1364806282-15071-1-git-send-email-zheng.z.yan@intel.com \
    --to=zheng.z.yan@intel.com \
    --cc=ceph-devel@vger.kernel.org \
    --cc=greg@inktank.com \
    --cc=sage@inktank.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.